mirror of https://github.com/explosion/spaCy.git
Update parallel tagging example
This commit is contained in:
parent 096a80170d
commit ed69bd69f4
@@ -0,0 +1,71 @@
"""
Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
text, with each "sentence" on a newline, and spaces between tokens. Supports
multi-processing.

Last updated for: spaCy 2.0.0a18
"""
from __future__ import print_function, unicode_literals, division

from toolz import partition_all
from pathlib import Path
from joblib import Parallel, delayed
import thinc.extra.datasets
import plac
import spacy


@plac.annotations(
    output_dir=("Output directory", "positional", None, Path),
    model=("Model name (needs tagger)", "positional", None, str),
    n_jobs=("Number of workers", "option", "n", int),
    batch_size=("Batch-size for each process", "option", "b", int),
    limit=("Limit of entries from the dataset", "option", "l", int))
def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
         limit=10000):
    nlp = spacy.load(model)  # load spaCy model
    print("Loaded model '%s'" % model)
    if not output_dir.exists():
        output_dir.mkdir()
    # load and pre-process the IMDB dataset
    print("Loading IMDB data...")
    data, _ = thinc.extra.datasets.imdb()
    texts, _ = zip(*data[-limit:])
    partitions = partition_all(batch_size, texts)
    # Hand each worker a batch of raw texts and parse inside the worker;
    # calling nlp() here in the parent process would serialise all the NLP
    # work and defeat the parallelism.
    items = ((nlp, i, batch, output_dir) for i, batch in enumerate(partitions))
    Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items)

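# Note: toolz.partition_all chunks an iterable into tuples of at most
# batch_size items and keeps the final short batch. An illustrative sketch
# of its behaviour:
#
#     >>> list(partition_all(2, [1, 2, 3, 4, 5]))
#     [(1, 2), (3, 4), (5,)]
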
def transform_texts(nlp, batch_id, texts, output_dir):
    out_path = Path(output_dir) / ('%d.txt' % batch_id)
    if out_path.exists():  # return None in case same batch is called again
        return None
    print('Processing batch', batch_id)
    with out_path.open('w', encoding='utf8') as f:
        for doc in nlp.pipe(texts):
            f.write(' '.join(represent_word(w) for w in doc if not w.is_space))
            f.write('\n')
    print('Saved {} texts to {}.txt'.format(len(texts), batch_id))


def represent_word(word):
    text = word.text
    # True-case, i.e. try to normalize sentence-initial capitals. Only do
    # this if the lower-cased form is more probable, so e.g. sentence-initial
    # "The" becomes "the" while a name like "London" keeps its capital.
    if text.istitle() and is_sent_begin(word) \
            and word.prob < word.doc.vocab[text.lower()].prob:
        text = text.lower()
    return text + '|' + word.tag_


def is_sent_begin(word):
    # Crude check for a sentence start: the first token of the doc, or any
    # token directly following sentence-final punctuation.
    if word.i == 0:
        return True
    elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'):
        return True
    else:
        return False


if __name__ == '__main__':
    plac.call(main)
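For reference, a hypothetical invocation of the updated script (the file name
parallel_tag.py and the output path are illustrative, not part of the commit):

    python parallel_tag.py /tmp/imdb_tagged en_core_web_sm -n 4 -b 1000 -l 10000

Each batch is written to a <batch_id>.txt file of true-cased, tagged tokens,
one rough "sentence" per line, along the lines of this made-up sample:

    this|DT movie|NN was|VBD great|JJ .|.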
@@ -1,90 +0,0 @@
"""
Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
text, with each "sentence" on a newline, and spaces between tokens. Supports
multi-processing.
"""
from __future__ import print_function, unicode_literals, division
import io
import bz2
import logging
import os
from toolz import partition
from os import path

import spacy.en

from joblib import Parallel, delayed
import plac
import ujson


def parallelize(func, iterator, n_jobs, extra):
    extra = tuple(extra)
    return Parallel(n_jobs=n_jobs)(delayed(func)(*(item + extra)) for item in iterator)


def iter_texts_from_json_bz2(loc):
    """
    Iterator of unicode strings, one per document (here, a comment).

    Expects a path to a BZ2 file, which should be newline-delimited JSON. The
    document text should be in a string field titled 'body'.

    This is the data format of the Reddit comments corpus.
    """
    with bz2.BZ2File(loc) as file_:
        for i, line in enumerate(file_):
            yield ujson.loads(line)['body']

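# For reference, each input line is one standalone JSON object and only its
# 'body' field is read. A made-up sample line (any extra fields are ignored):
#
#     {"body": "This is a comment.", "author": "someone"}
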
def transform_texts(batch_id, input_, out_dir):
    out_loc = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(out_loc):
        return None
    print('Batch', batch_id)
    nlp = spacy.en.English(parser=False, entity=False)
    with io.open(out_loc, 'w', encoding='utf8') as file_:
        for text in input_:
            doc = nlp(text)
            file_.write(' '.join(represent_word(w) for w in doc if not w.is_space))
            file_.write('\n')


def represent_word(word):
    text = word.text
    # True-case, i.e. try to normalize sentence-initial capitals.
    # Only do this if the lower-cased form is more probable.
    if text.istitle() \
            and is_sent_begin(word) \
            and word.prob < word.doc.vocab[text.lower()].prob:
        text = text.lower()
    return text + '|' + word.tag_


def is_sent_begin(word):
    # It'd be nice to have heuristics like these in the library, for those
    # times when we don't care much about the accuracy of sentence boundary
    # detection (SBD) and don't want to run the parser.
    if word.i == 0:
        return True
    elif word.i >= 2 and word.nbor(-1).text in ('.', '!', '?', '...'):
        return True
    else:
        return False


@plac.annotations(
    in_loc=("Location of input file"),
    out_dir=("Location of output directory"),
    n_workers=("Number of workers", "option", "n", int),
    batch_size=("Batch-size for each process", "option", "b", int)
)
def main(in_loc, out_dir, n_workers=4, batch_size=100000):
    if not path.exists(out_dir):
        os.makedirs(out_dir)
    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
    parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])


if __name__ == '__main__':
    plac.call(main)
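Likewise, a hypothetical invocation of the removed script (both file names
here are illustrative):

    python tag_reddit_comments.py RC_2015-01.bz2 /tmp/reddit_tagged -n 8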