mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
commit 0a27afbf86
@@ -35,21 +35,24 @@ def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
     texts, _ = zip(*data[-limit:])
     print("Processing texts...")
     partitions = partition_all(batch_size, texts)
-    items = ((i, [nlp(text) for text in texts], output_dir) for i, texts
-             in enumerate(partitions))
-    Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items)
+    executor = Parallel(n_jobs=n_jobs)
+    do = delayed(transform_texts)
+    tasks = (do(nlp, i, batch, output_dir)
+             for i, batch in enumerate(partitions))
+    executor(tasks)
 
 
-def transform_texts(batch_id, docs, output_dir):
+def transform_texts(nlp, batch_id, texts, output_dir):
+    print(nlp.pipe_names)
     out_path = Path(output_dir) / ('%d.txt' % batch_id)
     if out_path.exists():  # return None in case same batch is called again
         return None
     print('Processing batch', batch_id)
     with out_path.open('w', encoding='utf8') as f:
-        for doc in docs:
+        for doc in nlp.pipe(texts):
             f.write(' '.join(represent_word(w) for w in doc if not w.is_space))
             f.write('\n')
-    print('Saved {} texts to {}.txt'.format(len(docs), batch_id))
+    print('Saved {} texts to {}.txt'.format(len(texts), batch_id))
 
 
 def represent_word(word):
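The change above moves the call to the pipeline into each worker: instead of the parent process parsing every text and shipping Doc objects to transform_texts, each job now receives the nlp object plus a raw batch of texts and runs nlp.pipe itself. A minimal, self-contained sketch of that joblib batching pattern, assuming joblib and toolz are installed; process_batch is a hypothetical stand-in for transform_texts and does no real NLP work:

# Sketch only: Parallel/delayed over enumerated batches, mirroring the
# executor / do / tasks structure in the hunk above.
from toolz import partition_all
from joblib import Parallel, delayed


def process_batch(batch_id, batch):
    # Hypothetical worker; the real script runs nlp.pipe(batch) here and
    # writes one output file per batch_id.
    return batch_id, len(batch)


texts = ['text number %d' % i for i in range(10)]
executor = Parallel(n_jobs=2)
do = delayed(process_batch)
tasks = (do(i, batch) for i, batch in enumerate(partition_all(4, texts)))
print(executor(tasks))  # e.g. [(0, 4), (1, 4), (2, 2)]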
@@ -135,10 +135,6 @@ class Language(object):
         self.pipeline = []
         self._optimizer = None
 
-    def __reduce__(self):
-        bytes_data = self.to_bytes(vocab=False)
-        return (unpickle_language, (self.vocab, self.meta, bytes_data))
-
     @property
     def path(self):
         return self._path
@@ -724,12 +720,6 @@ class DisabledPipes(list):
         self[:] = []
 
 
-def unpickle_language(vocab, meta, bytes_data):
-    lang = Language(vocab=vocab)
-    lang.from_bytes(bytes_data)
-    return lang
-
-
 def _pipe(func, docs):
     for doc in docs:
         func(doc)
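For context on what the two hunks above remove: __reduce__ and unpickle_language together implemented the standard pickle round-trip for Language (serialize the object to bytes, rebuild it on load via a module-level function). A generic sketch of that pattern, using a hypothetical Thing class rather than spaCy's actual API:

# Hypothetical illustration of the __reduce__/unpickle pattern removed above;
# Thing and unpickle_thing are made-up names, not spaCy code.
import pickle


def unpickle_thing(meta, data):
    # Module-level rebuild function, analogous to unpickle_language.
    return Thing(meta, data)


class Thing(object):
    def __init__(self, meta, data=b''):
        self.meta = meta
        self.data = data

    def __reduce__(self):
        # pickle stores the callable and its args, then calls
        # unpickle_thing(meta, data) on load to rebuild the object.
        return (unpickle_thing, (self.meta, self.data))


restored = pickle.loads(pickle.dumps(Thing({'lang': 'en'}, b'serialized state')))
print(restored.meta, restored.data)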