From 8d692771f68f0b510c5c4f4f58ba9ddb3c677c81 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 15 Nov 2017 13:51:25 +0100 Subject: [PATCH] Improve profiling --- spacy/cli/profile.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py index a394989d0..5c0ed521b 100644 --- a/spacy/cli/profile.py +++ b/spacy/cli/profile.py @@ -11,6 +11,7 @@ import spacy import sys import tqdm import cytoolz +import thinc.extra.datasets def read_inputs(loc): @@ -32,14 +33,18 @@ def profile(cmd, lang, inputs=None): """ Profile a spaCy pipeline, to find out which functions take the most time. """ + if inputs is None: + imdb_train, _ = thinc.extra.datasets.imdb() + inputs, _ = zip(*imdb_train) + inputs = inputs[:2000] nlp = spacy.load(lang) texts = list(cytoolz.take(10000, inputs)) cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof") s = pstats.Stats("Profile.prof") - s.strip_dirs().sort_stats("time").print_stats() + s.strip_dirs().sort_stats("cumtime").print_stats() def parse_texts(nlp, texts): - for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128): + for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16): pass