diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index b669e95ec..4cbb666c0 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -33,7 +33,7 @@ from .morphology cimport Morphology from .vocab cimport Vocab from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS -from ._ml import Tok2Vec, flatten, get_col, doc2feats +from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats from .parts_of_speech import X @@ -57,18 +57,12 @@ class TokenVectorEncoder(object): docs = [docs] tokvecs = self.predict(docs) self.set_annotations(docs, tokvecs) - state = {} if state is None else state - state['tokvecs'] = tokvecs - return state def pipe(self, stream, batch_size=128, n_threads=-1): - for batch in cytoolz.partition_all(batch_size, stream): - docs, states = zip(*batch) + for docs in cytoolz.partition_all(batch_size, stream): tokvecs = self.predict(docs) self.set_annotations(docs, tokvecs) - for state in states: - state['tokvecs'] = tokvecs - yield from zip(docs, states) + yield from docs def predict(self, docs): feats = self.doc2feats(docs) @@ -81,18 +75,12 @@ class TokenVectorEncoder(object): doc.tensor = tokvecs[start : start + len(doc)] start += len(doc) - def update(self, docs, golds, state=None, - drop=0., sgd=None): + def begin_update(self, docs, drop=0.): if isinstance(docs, Doc): docs = [docs] - golds = [golds] - state = {} if state is None else state feats = self.doc2feats(docs) tokvecs, bp_tokvecs = self.model.begin_update(feats, drop=drop) - state['feats'] = feats - state['tokvecs'] = tokvecs - state['bp_tokvecs'] = bp_tokvecs - return state + return tokvecs, bp_tokvecs def get_loss(self, docs, golds, scores): raise NotImplementedError @@ -113,22 +101,16 @@ class NeuralTagger(object): self.vocab = vocab self.model = model - def __call__(self, doc, state=None): - assert state is not None - assert 'tokvecs' in state - tokvecs = state['tokvecs'] - tags = self.predict(tokvecs) + def __call__(self, doc): + tags = self.predict(doc.tensor) self.set_annotations([doc], tags) - return state def pipe(self, stream, batch_size=128, n_threads=-1): - for batch in cytoolz.partition_all(batch_size, stream): - docs, states = zip(*batch) - tag_ids = self.predict(states[0]['tokvecs']) + for docs in cytoolz.partition_all(batch_size, stream): + tokvecs = self.model.ops.flatten([d.tensor for d in docs]) + tag_ids = self.predict(tokvecs) self.set_annotations(docs, tag_ids) - for state in states: - state['tag_ids'] = tag_ids - yield from zip(docs, states) + yield from docs def predict(self, tokvecs): scores = self.model(tokvecs) @@ -150,11 +132,9 @@ class NeuralTagger(object): vocab.morphology.assign_tag_id(&doc.c[j], tag_id) idx += 1 - def update(self, docs, golds, state=None, drop=0., sgd=None): - state = {} if state is None else state + def update(self, docs_tokvecs, golds, drop=0., sgd=None): + docs, tokvecs = docs_tokvecs - tokvecs = state['tokvecs'] - bp_tokvecs = state['bp_tokvecs'] if self.model.nI is None: self.model.nI = tokvecs.shape[1] @@ -163,20 +143,20 @@ class NeuralTagger(object): d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) - bp_tokvecs(d_tokvecs, sgd=sgd) - - state['tag_scores'] = tag_scores - state['tag_loss'] = loss - return state + return d_tokvecs def get_loss(self, docs, golds, scores): tag_index = {tag: i for i, tag in enumerate(self.vocab.morphology.tag_names)} cdef int idx = 0 correct = numpy.zeros((scores.shape[0],), dtype='i') + guesses = scores.argmax(axis=1) for gold in golds: for tag in gold.tags: - correct[idx] = tag_index[tag] + if tag is None: + correct[idx] = guesses[idx] + else: + correct[idx] = tag_index[tag] idx += 1 correct = self.model.ops.xp.array(correct, dtype='i') d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) @@ -198,15 +178,16 @@ class NeuralTagger(object): cdef Vocab vocab = self.vocab vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer) - self.model = Softmax(self.vocab.morphology.n_tags) - print("Tagging", self.model.nO, "tags") + token_vector_width = pipeline[0].model.nO + self.model = rebatch(1024, Softmax(self.vocab.morphology.n_tags, + token_vector_width)) + #self.model = Softmax(self.vocab.morphology.n_tags) def use_params(self, params): with self.model.use_params(params): yield - cdef class EntityRecognizer(LinearParser): """ Annotate named entities on Doc objects. @@ -275,8 +256,6 @@ cdef class NeuralEntityRecognizer(NeuralParser): return ids - - cdef class BeamDependencyParser(BeamParser): TransitionSystem = ArcEager diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 2e6687730..52ebe4362 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -35,12 +35,12 @@ from preshed.maps cimport map_get from thinc.api import layerize, chain from thinc.neural import Model, Affine, ELU, ReLu, Maxout -from thinc.neural.ops import NumpyOps +from thinc.neural.ops import NumpyOps, CupyOps from .. import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts -from .._ml import Tok2Vec, doc2feats +from .._ml import Tok2Vec, doc2feats, rebatch from . import _parse_features from ._parse_features cimport CONTEXT_SIZE @@ -229,6 +229,8 @@ cdef class Parser: nI=token_vector_width, pieces=maxout_pieces) + lower = rebatch(1024, lower) + with Model.use_device('cpu'): upper = chain( Maxout(hidden_width), @@ -274,7 +276,7 @@ cdef class Parser: def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) - def __call__(self, Doc tokens, state=None): + def __call__(self, Doc doc): """ Apply the parser or entity recognizer, setting the annotations onto the Doc object. @@ -283,10 +285,9 @@ cdef class Parser: Returns: None """ - self.parse_batch([tokens], state['tokvecs']) - return state + self.parse_batch([doc], doc.tensor) - def pipe(self, stream, int batch_size=1000, int n_threads=2): + def pipe(self, docs, int batch_size=1000, int n_threads=2): """ Process a stream of documents. @@ -301,12 +302,11 @@ cdef class Parser: cdef StateClass parse_state cdef Doc doc queue = [] - for batch in cytoolz.partition_all(batch_size, stream): - batch = list(batch) - docs, states = zip(*batch) - parse_states = self.parse_batch(docs, states[0]['tokvecs']) + for docs in cytoolz.partition_all(batch_size, docs): + tokvecs = self.model[0].ops.flatten([d.tensor for d in docs]) + parse_states = self.parse_batch(docs, tokvecs) self.set_annotations(docs, parse_states) - yield from zip(docs, states) + yield from docs def parse_batch(self, docs, tokvecs): cuda_stream = get_cuda_stream() @@ -324,10 +324,8 @@ cdef class Parser: todo = [st for st in states if not st.is_final()] return states - def update(self, docs, golds, state=None, drop=0., sgd=None): - assert state is not None - assert 'tokvecs' in state - assert 'bp_tokvecs' in state + def update(self, docs_tokvecs, golds, drop=0., sgd=None): + docs, tokvecs = docs_tokvecs if isinstance(docs, Doc) and isinstance(golds, GoldParse): docs = [docs] golds = [golds] @@ -336,9 +334,6 @@ cdef class Parser: for gold in golds: self.moves.preprocess_gold(gold) - tokvecs = state['tokvecs'] - bp_tokvecs = state['bp_tokvecs'] - states = self.moves.init_batch(docs) state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, drop) @@ -357,17 +352,17 @@ cdef class Parser: d_scores = self.get_batch_loss(states, golds, scores) d_vector = bp_scores(d_scores, sgd=sgd) - loss += (d_scores**2).sum() - if not isinstance(tokvecs, state2vec.ops.xp.ndarray): - backprops.append((token_ids, d_vector, bp_vector)) - else: + if isinstance(self.model[0].ops, CupyOps) \ + and not isinstance(token_ids, state2vec.ops.xp.ndarray): # Move token_ids and d_vector to CPU, asynchronously backprops.append(( get_async(cuda_stream, token_ids), get_async(cuda_stream, d_vector), bp_vector )) + else: + backprops.append((token_ids, d_vector, bp_vector)) self.transition_batch(states, scores) todo = [st for st in todo if not st[0].is_final()] # Tells CUDA to block, so our async copies complete. @@ -385,9 +380,7 @@ cdef class Parser: else: xp.add.at(d_tokvecs, token_ids, d_state_features * active_feats) - bp_tokvecs(d_tokvecs, sgd) - state['parser_loss'] = loss - return state + return d_tokvecs def get_batch_model(self, batch_size, tokvecs, stream, dropout): lower, upper = self.model @@ -445,7 +438,6 @@ cdef class Parser: self.moves.finalize_doc(doc) def add_label(self, label): - # Doesn't set label into serializer -- subclasses override it to do that. for action in self.moves.action_types: added = self.moves.add_action(action, label) if added: