Merge branch 'master' of ssh://github.com/honnibal/spaCy

2016-02-08 14:00:30 +01:00 · 2016-02-08 14:00:30 +01:00 · ad5fa3f335
parent 9bbc41c1e3 5d96b3ef4f
commit ad5fa3f335
2 changed files with 76 additions and 2 deletions
--- a/examples/parallel_parse.py
+++ b/examples/parallel_parse.py
@ -0,0 +1,74 @@
 from __future__ import print_function, unicode_literals, division
 import io
 import bz2
 import logging
 from toolz import partition
 from os import path
 import re
 import spacy.en
 from spacy.tokens import Doc
 from joblib import Parallel, delayed
 import plac
 import ujson
 def parallelize(func, iterator, n_jobs, extra, backend='multiprocessing'):
    extra = tuple(extra)
    return Parallel(n_jobs=n_jobs, backend=backend)(delayed(func)(*(item + extra))
                    for item in iterator)
 def iter_comments(loc):
    with bz2.BZ2File(loc) as file_:
        for i, line in enumerate(file_):
            yield ujson.loads(line)['body']
 pre_format_re = re.compile(r'^[\`\*\~]')
 post_format_re = re.compile(r'[\`\*\~]$')
 url_re = re.compile(r'\[([^]]+)\]\(%%URL\)')
 link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)')
 def strip_meta(text):
    text = link_re.sub(r'\1', text)
    text = text.replace('&gt;', '>').replace('&lt;', '<')
    text = pre_format_re.sub('', text)
    text = post_format_re.sub('', text)
    return text.strip()
 def save_parses(batch_id, input_, out_dir, n_threads, batch_size):
    out_loc = path.join(out_dir, '%d.bin' % batch_id)
    if path.exists(out_loc):
        return None
    print('Batch', batch_id)
    nlp = spacy.en.English()
    nlp.matcher = None
    with open(out_loc, 'wb') as file_:
        texts = (strip_meta(text) for text in input_)
        texts = (text for text in texts if text.strip())
        for doc in nlp.pipe(texts, batch_size=batch_size, n_threads=n_threads):
            file_.write(doc.to_bytes())
@plac.annotations(
    in_loc=("Location of input file"),
    out_dir=("Location of input file"),
    n_process=("Number of processes", "option", "p", int),
    n_thread=("Number of threads per process", "option", "t", int),
    batch_size=("Number of texts to accumulate in a buffer", "option", "b", int)
 )
 def main(in_loc, out_dir, n_process=1, n_thread=4):
    if not path.exists(out_dir):
        path.join(out_dir)
    if n_process >= 2:
        texts = partition(200000, iter_comments(in_loc))
        parallelize(save_parses, enumerate(texts), n_process, [out_dir, n_thread, batch_size],
                   backend='multiprocessing')
    else:
        save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size)
 if __name__ == '__main__':
    plac.call(main)
--- a/spacy/about.py
+++ b/spacy/about.py
@ -4,11 +4,11 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 __name__ = 'spacy'
-__version__ = '0.100.3'
+__version__ = '0.100.5'
 __summary__ = 'Industrial-strength NLP'
 __uri__ = 'https://spacy.io'
 __author__ = 'Matthew Honnibal'
 __email__ = 'matt@spacy.io'
 __license__ = 'MIT'
-__release__ = False
+__release__ = True
 __default_model__ = 'en_default'