From 40b57ea4acdec97534e2cbd74aeeee66456db9bd Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 2 Dec 2018 04:28:34 +0100
Subject: [PATCH] Format example

---
 .../information_extraction/phrase_matcher.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py
index 28266bbd1..b49cb88e8 100644
--- a/examples/information_extraction/phrase_matcher.py
+++ b/examples/information_extraction/phrase_matcher.py
@@ -55,15 +55,15 @@ import spacy
     patterns_loc=("Path to gazetteer", "positional", None, str),
     text_loc=("Path to Reddit corpus file", "positional", None, str),
     n=("Number of texts to read", "option", "n", int),
-    lang=("Language class to initialise", "option", "l", str))
-def main(patterns_loc, text_loc, n=10000, lang='en'):
-    nlp = spacy.blank('en')
+    lang=("Language class to initialise", "option", "l", str),
+)
+def main(patterns_loc, text_loc, n=10000, lang="en"):
+    nlp = spacy.blank("en")
     nlp.vocab.lex_attr_getters = {}
     phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
     count = 0
     t1 = time.time()
-    for ent_id, text in get_matches(nlp.tokenizer, phrases,
-                                    read_text(text_loc, n=n)):
+    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
         count += 1
     t2 = time.time()
     print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
@@ -72,7 +72,7 @@ def main(patterns_loc, text_loc, n=10000, lang='en'):
 def read_gazetteer(tokenizer, loc, n=-1):
     for i, line in enumerate(open(loc)):
         data = ujson.loads(line.strip())
-        phrase = tokenizer(data['text'])
+        phrase = tokenizer(data["text"])
         for w in phrase:
             _ = tokenizer.vocab[w.text]
         if len(phrase) >= 2:
@@ -83,14 +83,14 @@ def read_text(bz2_loc, n=10000):
     with BZ2File(bz2_loc) as file_:
         for i, line in enumerate(file_):
             data = ujson.loads(line)
-            yield data['body']
+            yield data["body"]
             if i >= n:
                 break


 def get_matches(tokenizer, phrases, texts, max_length=6):
     matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
-    matcher.add('Phrase', None, *phrases)
+    matcher.add("Phrase", None, *phrases)
     for text in texts:
         doc = tokenizer(text)
         for w in doc:
@@ -100,10 +100,11 @@ def get_matches(tokenizer, phrases, texts, max_length=6):
             yield (ent_id, doc[start:end].text)


-if __name__ == '__main__':
+if __name__ == "__main__":
     if False:
         import cProfile
         import pstats
+
         cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
         s = pstats.Stats("Profile.prof")
         s.strip_dirs().sort_stats("time").print_stats()