From ed69bd69f4cb7dcc8ba9f70cdc2e4de197520869 Mon Sep 17 00:00:00 2001
From: ines
Date: Fri, 27 Oct 2017 01:48:52 +0200
Subject: [PATCH] Update parallel tagging example

---
 examples/parallel_tag.py | 71 +++++++++++++++++++++++++++++++
 examples/pos_tag.py      | 90 ----------------------------------------
 2 files changed, 71 insertions(+), 90 deletions(-)
 create mode 100644 examples/parallel_tag.py
 delete mode 100644 examples/pos_tag.py

diff --git a/examples/parallel_tag.py b/examples/parallel_tag.py
new file mode 100644
index 000000000..a6571a2ac
--- /dev/null
+++ b/examples/parallel_tag.py
@@ -0,0 +1,71 @@
+"""
+Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
+text, with each "sentence" on a newline, and spaces between tokens. Supports
+multi-processing.
+
+Last updated for: spaCy 2.0.0a18
+"""
+from __future__ import print_function, unicode_literals, division
+from toolz import partition_all
+from pathlib import Path
+from joblib import Parallel, delayed
+import thinc.extra.datasets
+import plac
+import spacy
+
+
+@plac.annotations(
+    output_dir=("Output directory", "positional", None, Path),
+    model=("Model name (needs tagger)", "positional", None, str),
+    n_jobs=("Number of workers", "option", "n", int),
+    batch_size=("Batch-size for each process", "option", "b", int),
+    limit=("Limit of entries from the dataset", "option", "l", int))
+def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
+         limit=10000):
+    nlp = spacy.load(model)  # load spaCy model
+    print("Loaded model '%s'" % model)
+    if not output_dir.exists():
+        output_dir.mkdir()
+    # load and pre-process the IMDB dataset
+    print("Loading IMDB data...")
+    data, _ = thinc.extra.datasets.imdb()
+    texts, _ = zip(*data[-limit:])
+    partitions = partition_all(batch_size, texts)
+    items = ((i, [nlp(text) for text in texts], output_dir) for i, texts
+             in enumerate(partitions))
+    Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items)
+
+
+def transform_texts(batch_id, docs, output_dir):
+    out_path = Path(output_dir) / ('%d.txt' % batch_id)
+    if out_path.exists():  # return None in case same batch is called again
+        return None
+    print('Processing batch', batch_id)
+    with out_path.open('w', encoding='utf8') as f:
+        for doc in docs:
+            f.write(' '.join(represent_word(w) for w in doc if not w.is_space))
+            f.write('\n')
+    print('Saved {} texts to {}.txt'.format(len(docs), batch_id))
+
+
+def represent_word(word):
+    text = word.text
+    # True-case, i.e. try to normalize sentence-initial capitals.
+    # Only do this if the lower-cased form is more probable.
+    if text.istitle() and is_sent_begin(word) \
+            and word.prob < word.doc.vocab[text.lower()].prob:
+        text = text.lower()
+    return text + '|' + word.tag_
+
+
+def is_sent_begin(word):
+    if word.i == 0:
+        return True
+    elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'):
+        return True
+    else:
+        return False
+
+
+if __name__ == '__main__':
+    plac.call(main)
diff --git a/examples/pos_tag.py b/examples/pos_tag.py
deleted file mode 100644
index 1dd6add0f..000000000
--- a/examples/pos_tag.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""
-Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
-text, with each "sentence" on a newline, and spaces between tokens. Supports
-multi-processing.
-""" -from __future__ import print_function, unicode_literals, division -import io -import bz2 -import logging -from toolz import partition -from os import path - -import spacy.en - -from joblib import Parallel, delayed -import plac -import ujson - - -def parallelize(func, iterator, n_jobs, extra): - extra = tuple(extra) - return Parallel(n_jobs=n_jobs)(delayed(func)(*(item + extra)) for item in iterator) - - -def iter_texts_from_json_bz2(loc): - """ - Iterator of unicode strings, one per document (here, a comment). - - Expects a a path to a BZ2 file, which should be new-line delimited JSON. The - document text should be in a string field titled 'body'. - - This is the data format of the Reddit comments corpus. - """ - with bz2.BZ2File(loc) as file_: - for i, line in enumerate(file_): - yield ujson.loads(line)['body'] - - -def transform_texts(batch_id, input_, out_dir): - out_loc = path.join(out_dir, '%d.txt' % batch_id) - if path.exists(out_loc): - return None - print('Batch', batch_id) - nlp = spacy.en.English(parser=False, entity=False) - with io.open(out_loc, 'w', encoding='utf8') as file_: - for text in input_: - doc = nlp(text) - file_.write(' '.join(represent_word(w) for w in doc if not w.is_space)) - file_.write('\n') - - -def represent_word(word): - text = word.text - # True-case, i.e. try to normalize sentence-initial capitals. - # Only do this if the lower-cased form is more probable. - if text.istitle() \ - and is_sent_begin(word) \ - and word.prob < word.doc.vocab[text.lower()].prob: - text = text.lower() - return text + '|' + word.tag_ - - -def is_sent_begin(word): - # It'd be nice to have some heuristics like these in the library, for these - # times where we don't care so much about accuracy of SBD, and we don't want - # to parse - if word.i == 0: - return True - elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'): - return True - else: - return False - - -@plac.annotations( - in_loc=("Location of input file"), - out_dir=("Location of input file"), - n_workers=("Number of workers", "option", "n", int), - batch_size=("Batch-size for each process", "option", "b", int) -) -def main(in_loc, out_dir, n_workers=4, batch_size=100000): - if not path.exists(out_dir): - path.join(out_dir) - texts = partition(batch_size, iter_texts_from_json_bz2(in_loc)) - parallelize(transform_texts, enumerate(texts), n_workers, [out_dir]) - - -if __name__ == '__main__': - plac.call(main) -