From d185927998c72fb8163a2456520826fd15907059 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 4 Nov 2017 23:07:03 +0100
Subject: [PATCH 1/2] Undo harmful pickling hacks on Language class

---
 spacy/language.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index bcdb93ef2..c7de79424 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -135,10 +135,6 @@ class Language(object):
         self.pipeline = []
         self._optimizer = None
 
-    def __reduce__(self):
-        bytes_data = self.to_bytes(vocab=False)
-        return (unpickle_language, (self.vocab, self.meta, bytes_data))
-
     @property
     def path(self):
         return self._path
@@ -724,12 +720,6 @@ class DisabledPipes(list):
         self[:] = []
 
 
-def unpickle_language(vocab, meta, bytes_data):
-    lang = Language(vocab=vocab)
-    lang.from_bytes(bytes_data)
-    return lang
-
-
 def _pipe(func, docs):
     for doc in docs:
         func(doc)

From ba0201de072c0be4e353586e4faef387c2d49be0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 4 Nov 2017 23:07:57 +0100
Subject: [PATCH 2/2] Update multiprocessing example

---
 examples/pipeline/multi_processing.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py
index 99bb9c53f..0efb00099 100644
--- a/examples/pipeline/multi_processing.py
+++ b/examples/pipeline/multi_processing.py
@@ -34,21 +34,24 @@ def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
     data, _ = thinc.extra.datasets.imdb()
     texts, _ = zip(*data[-limit:])
     partitions = partition_all(batch_size, texts)
-    items = ((i, [nlp(text) for text in texts], output_dir) for i, texts
-             in enumerate(partitions))
-    Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items)
+    executor = Parallel(n_jobs=n_jobs)
+    do = delayed(transform_texts)
+    tasks = (do(nlp, i, batch, output_dir)
+             for i, batch in enumerate(partitions))
+    executor(tasks)
 
 
-def transform_texts(batch_id, docs, output_dir):
+def transform_texts(nlp, batch_id, texts, output_dir):
+    print(nlp.pipe_names)
     out_path = Path(output_dir) / ('%d.txt' % batch_id)
     if out_path.exists():  # return None in case same batch is called again
         return None
     print('Processing batch', batch_id)
     with out_path.open('w', encoding='utf8') as f:
-        for doc in docs:
+        for doc in nlp.pipe(texts):
             f.write(' '.join(represent_word(w) for w in doc if not w.is_space))
             f.write('\n')
-    print('Saved {} texts to {}.txt'.format(len(docs), batch_id))
+    print('Saved {} texts to {}.txt'.format(len(texts), batch_id))
 
 
 def represent_word(word):
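
Note (not part of the patches above): a minimal, self-contained sketch of the pattern PATCH 2/2 moves the example to, where each joblib worker receives the raw texts plus the nlp object and runs nlp.pipe() itself, instead of pickling pre-processed Doc objects across the process boundary (which the removal of __reduce__ in PATCH 1/2 no longer supports). The model name, the 'outputs' directory, the tiny text list, and the use of plain token text in place of the example's represent_word() helper are all illustrative assumptions, not part of the patched code.

# Sketch under stated assumptions: spaCy 2.x, joblib, and toolz installed.
from pathlib import Path

import spacy
from joblib import Parallel, delayed
from toolz import partition_all


def transform_texts(nlp, batch_id, texts, output_dir):
    # The worker runs the pipeline itself via nlp.pipe(), so no Doc
    # objects ever cross the process boundary.
    out_path = Path(output_dir) / ('%d.txt' % batch_id)
    if out_path.exists():  # skip a batch that was already written
        return
    with out_path.open('w', encoding='utf8') as f:
        for doc in nlp.pipe(texts):
            # The patched example uses represent_word(); plain token
            # text keeps this sketch short.
            f.write(' '.join(w.text for w in doc if not w.is_space))
            f.write('\n')


if __name__ == '__main__':
    nlp = spacy.load('en_core_web_sm')  # assumed model name
    texts = ['An example review.', 'Another example review.']
    Path('outputs').mkdir(exist_ok=True)  # assumed output directory
    executor = Parallel(n_jobs=2)
    do = delayed(transform_texts)
    executor(do(nlp, i, batch, 'outputs')
             for i, batch in enumerate(partition_all(1, texts)))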