Update parallel tagging example

2017-10-27 01:48:52 +02:00 · 2017-10-27 01:48:52 +02:00 · ed69bd69f4
parent 096a80170d
commit ed69bd69f4
2 changed files with 71 additions and 90 deletions
--- a/examples/parallel_tag.py
+++ b/examples/parallel_tag.py
@ -0,0 +1,71 @@
+"""
+Write part-of-speech tagged, true-cased text to one file per batch, with each
+document on its own line and spaces between tokens. Sentence starts are
+detected (very roughly) for true-casing. Supports multi-processing.
+
+Last updated for: spaCy 2.0.0a18
+"""
+from __future__ import print_function, unicode_literals, division
+from toolz import partition_all
+from pathlib import Path
+from joblib import Parallel, delayed
+import thinc.extra.datasets
+import plac
+import spacy
+
+
+@plac.annotations(
+    output_dir=("Output directory", "positional", None, Path),
+    model=("Model name (needs tagger)", "positional", None, str),
+    n_jobs=("Number of workers", "option", "n", int),
+    batch_size=("Batch-size for each process", "option", "b", int),
+    limit=("Limit of entries from the dataset", "option", "l", int))
+def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
+         limit=10000):
+    """Tag IMDB texts with a spaCy model and write one file per batch."""
+    nlp = spacy.load(model)  # load spaCy model
+    print("Loaded model '%s'" % model)
+    if not output_dir.exists():
+        # parents=True so that a nested output path works on the first run
+        output_dir.mkdir(parents=True)
+    # load and pre-process the IMDB dataset
+    print("Loading IMDB data...")
+    data, _ = thinc.extra.datasets.imdb()
+    # Keep only the last `limit` entries and drop the second element of each
+    # pair. NOTE: limit=0 selects the whole dataset (data[-0:] == data[0:]).
+    texts, _ = zip(*data[-limit:])
+    partitions = partition_all(batch_size, texts)
+    # nlp.pipe processes a whole batch as a stream instead of one call per
+    # text. The result is materialised as a list because transform_texts
+    # calls len() on it.
+    items = ((i, list(nlp.pipe(batch)), output_dir)
+             for i, batch in enumerate(partitions))
+    Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items)
+
+
+def transform_texts(batch_id, docs, output_dir):
+    """Write one batch of processed docs to '<batch_id>.txt' in output_dir.
+
+    Each doc becomes one line: its non-space tokens, rendered by
+    represent_word and joined with single spaces. If the target file is
+    already present, nothing is written and None is returned.
+
+    batch_id (int): Number of the batch, used as the file name.
+    docs: Sized collection of processed docs to write.
+    output_dir: Directory the batch file is created in.
+    """
+    target = Path(output_dir) / ('%d.txt' % batch_id)
+    # An existing file means this batch was handled before; leave it alone.
+    if target.exists():
+        return None
+    print('Processing batch', batch_id)
+    with target.open('w', encoding='utf8') as handle:
+        for parsed in docs:
+            tagged = [represent_word(tok) for tok in parsed
+                      if not tok.is_space]
+            handle.write(' '.join(tagged))
+            handle.write('\n')
+    print('Saved {} texts to {}.txt'.format(len(docs), batch_id))
+
+
+def represent_word(word):
+    """Return the token as 'text|tag', true-casing sentence-initial capitals.
+
+    A title-cased token at a sentence start (as judged by is_sent_begin) is
+    lower-cased, but only when the lower-cased form is the more probable one
+    according to the vocabulary's `prob` values.
+    """
+    token_text = word.text
+    if token_text.istitle() and is_sent_begin(word):
+        lowered = token_text.lower()
+        # Only normalise when the lower-cased entry has the higher prob.
+        if word.prob < word.doc.vocab[lowered].prob:
+            token_text = lowered
+    return token_text + '|' + word.tag_
+
+
+def is_sent_begin(word):
+    """Roughly guess whether the token starts a sentence.
+
+    True for the first token of the doc, and for any token at index 2 or
+    later whose preceding token is one of '.', '!', '?' or '...'.
+    """
+    position = word.i
+    if position == 0:
+        return True
+    return position >= 2 and word.nbor(-1).text in ('.', '!', '?', '...')
+
+
+# Script entry point: plac builds the command-line interface from the
+# annotations on main and calls it with the parsed arguments.
+if __name__ == '__main__':
+    plac.call(main)
--- a/examples/pos_tag.py
+++ b/examples/pos_tag.py
@ -1,90 +0,0 @@
-"""
-Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
-text, with each "sentence" on a newline, and spaces between tokens. Supports
-multi-processing.
-"""
-from __future__ import print_function, unicode_literals, division
-import io
-import bz2
-import logging
-from toolz import partition
-from os import path
-
-import spacy.en
-
-from joblib import Parallel, delayed
-import plac
-import ujson
-
-
-def parallelize(func, iterator, n_jobs, extra):
-    extra = tuple(extra)
-    return Parallel(n_jobs=n_jobs)(delayed(func)(*(item + extra)) for item in iterator)
-
-
-def iter_texts_from_json_bz2(loc):
-    """
-    Iterator of unicode strings, one per document (here, a comment).
-    
-    Expects a a path to a BZ2 file, which should be new-line delimited JSON. The
-    document text should be in a string field titled 'body'.
-
-    This is the data format of the Reddit comments corpus.
-    """
-    with bz2.BZ2File(loc) as file_:
-        for i, line in enumerate(file_):
-            yield ujson.loads(line)['body']
-
-
-def transform_texts(batch_id, input_, out_dir):
-    out_loc = path.join(out_dir, '%d.txt' % batch_id)
-    if path.exists(out_loc):
-        return None
-    print('Batch', batch_id)
-    nlp = spacy.en.English(parser=False, entity=False)
-    with io.open(out_loc, 'w', encoding='utf8') as file_:
-        for text in input_:
-            doc = nlp(text)
-            file_.write(' '.join(represent_word(w) for w in doc if not w.is_space))
-            file_.write('\n')
-
-
-def represent_word(word):
-    text = word.text
-    # True-case, i.e. try to normalize sentence-initial capitals.
-    # Only do this if the lower-cased form is more probable.
-    if text.istitle() \
-    and is_sent_begin(word) \
-    and word.prob < word.doc.vocab[text.lower()].prob:
-        text = text.lower()
-    return text + '|' + word.tag_
-
-
-def is_sent_begin(word):
-    # It'd be nice to have some heuristics like these in the library, for these
-    # times where we don't care so much about accuracy of SBD, and we don't want
-    # to parse
-    if word.i == 0:
-        return True
-    elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'):
-        return True
-    else:
-        return False
-
-
-@plac.annotations(
-    in_loc=("Location of input file"),
-    out_dir=("Location of input file"),
-    n_workers=("Number of workers", "option", "n", int),
-    batch_size=("Batch-size for each process", "option", "b", int)
-)
-def main(in_loc, out_dir, n_workers=4, batch_size=100000):
-    if not path.exists(out_dir):
-        path.join(out_dir)
-    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
-    parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
- 
-
-if __name__ == '__main__':
-    plac.call(main)
-