From 40b57ea4acdec97534e2cbd74aeeee66456db9bd Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sun, 2 Dec 2018 04:28:34 +0100
Subject: [PATCH] Format example

---
 .../information_extraction/phrase_matcher.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/examples/information_extraction/phrase_matcher.py b/examples/information_extraction/phrase_matcher.py
index 28266bbd1..b49cb88e8 100644
--- a/examples/information_extraction/phrase_matcher.py
+++ b/examples/information_extraction/phrase_matcher.py
@@ -55,15 +55,15 @@ import spacy
     patterns_loc=("Path to gazetteer", "positional", None, str),
     text_loc=("Path to Reddit corpus file", "positional", None, str),
     n=("Number of texts to read", "option", "n", int),
-    lang=("Language class to initialise", "option", "l", str))
-def main(patterns_loc, text_loc, n=10000, lang='en'):
-    nlp = spacy.blank('en')
+    lang=("Language class to initialise", "option", "l", str),
+)
+def main(patterns_loc, text_loc, n=10000, lang="en"):
+    nlp = spacy.blank("en")
     nlp.vocab.lex_attr_getters = {}
     phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
     count = 0
     t1 = time.time()
-    for ent_id, text in get_matches(nlp.tokenizer, phrases,
-                                    read_text(text_loc, n=n)):
+    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
         count += 1
     t2 = time.time()
     print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
@@ -72,7 +72,7 @@ def main(patterns_loc, text_loc, n=10000, lang='en'):
 def read_gazetteer(tokenizer, loc, n=-1):
     for i, line in enumerate(open(loc)):
         data = ujson.loads(line.strip())
-        phrase = tokenizer(data['text'])
+        phrase = tokenizer(data["text"])
         for w in phrase:
             _ = tokenizer.vocab[w.text]
         if len(phrase) >= 2:
@@ -83,14 +83,14 @@ def read_text(bz2_loc, n=10000):
     with BZ2File(bz2_loc) as file_:
         for i, line in enumerate(file_):
             data = ujson.loads(line)
-            yield data['body']
+            yield data["body"]
             if i >= n:
                 break


 def get_matches(tokenizer, phrases, texts, max_length=6):
     matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
-    matcher.add('Phrase', None, *phrases)
+    matcher.add("Phrase", None, *phrases)
     for text in texts:
         doc = tokenizer(text)
         for w in doc:
@@ -100,10 +100,11 @@ def get_matches(tokenizer, phrases, texts, max_length=6):
             yield (ent_id, doc[start:end].text)


-if __name__ == '__main__':
+if __name__ == "__main__":
     if False:
         import cProfile
         import pstats
+
         cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
         s = pstats.Stats("Profile.prof")
         s.strip_dirs().sort_stats("time").print_stats()