diff --git a/examples/pipeline/multi_processing.py b/examples/pipeline/multi_processing.py
index d6e98752e..e82654446 100644
--- a/examples/pipeline/multi_processing.py
+++ b/examples/pipeline/multi_processing.py
@@ -35,21 +35,24 @@ def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
     texts, _ = zip(*data[-limit:])
     print("Processing texts...")
     partitions = partition_all(batch_size, texts)
-    items = ((i, [nlp(text) for text in texts], output_dir) for i, texts
-             in enumerate(partitions))
-    Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items)
+    executor = Parallel(n_jobs=n_jobs)
+    do = delayed(transform_texts)
+    tasks = (do(nlp, i, batch, output_dir)
+             for i, batch in enumerate(partitions))
+    executor(tasks)
 
 
-def transform_texts(batch_id, docs, output_dir):
+def transform_texts(nlp, batch_id, texts, output_dir):
+    print(nlp.pipe_names)
     out_path = Path(output_dir) / ('%d.txt' % batch_id)
     if out_path.exists():  # return None in case same batch is called again
         return None
     print('Processing batch', batch_id)
     with out_path.open('w', encoding='utf8') as f:
-        for doc in docs:
+        for doc in nlp.pipe(texts):
             f.write(' '.join(represent_word(w) for w in doc if not w.is_space))
             f.write('\n')
-    print('Saved {} texts to {}.txt'.format(len(docs), batch_id))
+    print('Saved {} texts to {}.txt'.format(len(texts), batch_id))
 
 
 def represent_word(word):
diff --git a/spacy/language.py b/spacy/language.py
index bcdb93ef2..c7de79424 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -135,10 +135,6 @@ class Language(object):
         self.pipeline = []
         self._optimizer = None
 
-    def __reduce__(self):
-        bytes_data = self.to_bytes(vocab=False)
-        return (unpickle_language, (self.vocab, self.meta, bytes_data))
-
     @property
     def path(self):
         return self._path
@@ -724,12 +720,6 @@ class DisabledPipes(list):
         self[:] = []
 
 
-def unpickle_language(vocab, meta, bytes_data):
-    lang = Language(vocab=vocab)
-    lang.from_bytes(bytes_data)
-    return lang
-
-
 def _pipe(func, docs):
     for doc in docs:
         func(doc)