diff --git a/spacy/__main__.py b/spacy/__main__.py
index 2b15e4374..d02242d68 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -7,6 +7,7 @@ if __name__ == '__main__':
     import plac
     import sys
     from spacy.cli import download, link, info, package, train, convert, model
+    from spacy.cli import profile
     from spacy.util import prints

     commands = {
@@ -16,7 +17,8 @@ if __name__ == '__main__':
         'train': train,
         'convert': convert,
         'package': package,
-        'model': model
+        'model': model,
+        'profile': profile,
     }
     if len(sys.argv) == 1:
         prints(', '.join(commands), title="Available commands", exits=1)
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 5ab430684..e2e0d070b 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -218,7 +218,10 @@ def drop_layer(layer, factor=2.):
             return layer.begin_update(X, drop=drop)
         else:
             return X, lambda dX, sgd=None: dX
-    return wrap(drop_layer_fwd, layer)
+
+    model = wrap(drop_layer_fwd, layer)
+    model.predict = layer
+    return model


 def Tok2Vec(width, embed_size, preprocess=None):
@@ -382,10 +385,18 @@ def fine_tune(embedding, combine=None):
                 sgd(model._mem.weights, model._mem.gradient, key=model.id)
             return [d_o * model.mix[0] for d_o in d_output]
         return output, fine_tune_bwd
+
+    def fine_tune_predict(docs_tokvecs):
+        docs, tokvecs = docs_tokvecs
+        vecs = embedding(docs)
+        return [model.mix[0]*tv+model.mix[1]*v
+                for tv, v in zip(tokvecs, vecs)]
+
     model = wrap(fine_tune_fwd, embedding)
     model.mix = model._mem.add((model.id, 'mix'), (2,))
     model.mix.fill(0.5)
     model.d_mix = model._mem.add_gradient((model.id, 'd_mix'), (model.id, 'mix'))
+    model.predict = fine_tune_predict
     return model


diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 480b27a23..e58c94642 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -2,6 +2,7 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
+from .profile import profile
 from .train import train
 from .convert import convert
 from .model import model
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
new file mode 100644
index 000000000..db6fc5b41
--- /dev/null
+++ b/spacy/cli/profile.py
@@ -0,0 +1,45 @@
+# coding: utf8
+from __future__ import unicode_literals, division, print_function
+
+import plac
+from pathlib import Path
+import ujson
+import cProfile
+import pstats
+
+import spacy
+import sys
+import tqdm
+import cytoolz
+
+
+def read_inputs(loc):
+    if loc is None:
+        file_ = sys.stdin
+        file_ = (line.encode('utf8') for line in file_)
+    else:
+        file_ = Path(loc).open()
+    for line in file_:
+        data = ujson.loads(line)
+        text = data['text']
+        yield text
+
+
+@plac.annotations(
+    lang=("model/language", "positional", None, str),
+    inputs=("Location of input file", "positional", None, read_inputs)
+)
+def profile(cmd, lang, inputs=None):
+    """
+    Profile a spaCy pipeline, to find out which functions take the most time.
+    """
+    nlp = spacy.load(lang)
+    texts = list(cytoolz.take(10000, inputs))
+    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
+    s = pstats.Stats("Profile.prof")
+    s.strip_dirs().sort_stats("time").print_stats()
+
+
+def parse_texts(nlp, texts):
+    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
+        pass
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index dd52c4cbf..3b1f38b68 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -303,8 +303,14 @@ cdef class Doc:
                 return self.user_hooks['vector'](self)
             if self._vector is not None:
                 return self._vector
-            elif self.has_vector and len(self):
-                self._vector = sum(t.vector for t in self) / len(self)
+            elif not len(self):
+                self._vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+                return self._vector
+            elif self.has_vector:
+                vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
+                for token in self.c[:self.length]:
+                    vector += self.vocab.get_vector(token.lex.orth)
+                self._vector = vector / len(self)
                 return self._vector
             elif self.tensor is not None:
                 self._vector = self.tensor.mean(axis=0)
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index dc141552d..bf7fb6903 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import bz2
 import ujson
 import re
+import numpy

 from libc.string cimport memset, memcpy
 from libc.stdint cimport int32_t
@@ -244,7 +245,7 @@ cdef class Vocab:

     @property
     def vectors_length(self):
-        return len(self.vectors)
+        return self.vectors.data.shape[1]

     def clear_vectors(self, new_dim=None):
         """Drop the current vector table. Because all vectors must be the same
@@ -268,7 +269,10 @@
         """
         if isinstance(orth, basestring_):
             orth = self.strings.add(orth)
-        return self.vectors[orth]
+        if orth in self.vectors.key2row:
+            return self.vectors[orth]
+        else:
+            return numpy.zeros((self.vectors_length,), dtype='f')

     def set_vector(self, orth, vector):
         """Set a vector for a word in the vocabulary.