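Improve profiling: default to IMDB texts when no inputs are given, sort stats by cumulative time, and use batch size 16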

2017-11-15 13:51:25 +01:00 · 2017-11-15 13:51:25 +01:00 · 8d692771f6
parent b797dca977
commit 8d692771f6
1 changed files with 7 additions and 2 deletions
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -11,6 +11,7 @@ import spacy
 import sys
 import tqdm
 import cytoolz
+import thinc.extra.datasets


 def read_inputs(loc):
@@ -32,14 +33,18 @@ def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    """
+    if inputs is None:
+        imdb_train, _ = thinc.extra.datasets.imdb()
+        inputs, _ = zip(*imdb_train)
+        inputs = inputs[:2000]
    nlp = spacy.load(lang)
    texts = list(cytoolz.take(10000, inputs))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                    "Profile.prof")
    s = pstats.Stats("Profile.prof")
-    s.strip_dirs().sort_stats("time").print_stats()
+    s.strip_dirs().sort_stats("cumtime").print_stats()


 def parse_texts(nlp, texts):
-    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
+    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
        pass