diff --git a/spacy/language.py b/spacy/language.py index 86548d42e..228225404 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -220,13 +220,19 @@ class Language(object): @contextmanager def use_params(self, params, **cfg): - contexts = [pipe.model.use_params(params) for pipe - in self.pipeline if hasattr(pipe, 'model') - and hasattr(pipe.model, 'use_params')] + contexts = [pipe.use_params(params) for pipe + in self.pipeline if hasattr(pipe, 'use_params')] + # TODO: Having trouble with contextlib + # Workaround: these aren't actually context managers atm. + for context in contexts: + try: + next(context) + except StopIteration: + pass yield for context in contexts: try: - next(context.gen) + next(context) except StopIteration: pass @@ -242,7 +248,8 @@ class Language(object): parse (bool) entity (bool) """ - stream = ((self.make_doc(text), None) for text in texts) + #stream = ((self.make_doc(text), None) for text in texts) + stream = ((doc, {}) for doc in texts) for proc in self.pipeline: name = getattr(proc, 'name', None) if name in disabled and not disabled[name]: diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 3e68966fa..b669e95ec 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -61,8 +61,14 @@ class TokenVectorEncoder(object): state['tokvecs'] = tokvecs return state - def pipe(self, docs, **kwargs): - raise NotImplementedError + def pipe(self, stream, batch_size=128, n_threads=-1): + for batch in cytoolz.partition_all(batch_size, stream): + docs, states = zip(*batch) + tokvecs = self.predict(docs) + self.set_annotations(docs, tokvecs) + for state in states: + state['tokvecs'] = tokvecs + yield from zip(docs, states) def predict(self, docs): feats = self.doc2feats(docs) @@ -96,6 +102,10 @@ class TokenVectorEncoder(object): if self.model is True: self.model = self.Model() + def use_params(self, params): + with self.model.use_params(params): + yield + class NeuralTagger(object): name = 'nn_tagger' @@ -112,11 +122,13 @@ class NeuralTagger(object): return state def pipe(self, stream, batch_size=128, n_threads=-1): - for batch in cytoolz.partition_all(batch_size, batch): - docs, tokvecs = zip(*batch) - tag_ids = self.predict(docs, tokvecs) + for batch in cytoolz.partition_all(batch_size, stream): + docs, states = zip(*batch) + tag_ids = self.predict(states[0]['tokvecs']) self.set_annotations(docs, tag_ids) - yield from docs + for state in states: + state['tag_ids'] = tag_ids + yield from zip(docs, states) def predict(self, tokvecs): scores = self.model(tokvecs) @@ -130,7 +142,7 @@ class NeuralTagger(object): docs = [docs] cdef Doc doc cdef int idx = 0 - cdef int i, j + cdef int i, j, tag_id cdef Vocab vocab = self.vocab for i, doc in enumerate(docs): doc_tag_ids = batch_tag_ids[idx:idx+len(doc)] @@ -147,7 +159,6 @@ class NeuralTagger(object): self.model.nI = tokvecs.shape[1] tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop) - loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd) @@ -167,24 +178,33 @@ class NeuralTagger(object): for tag in gold.tags: correct[idx] = tag_index[tag] idx += 1 - correct = self.model.ops.xp.array(correct) + correct = self.model.ops.xp.array(correct, dtype='i') d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) loss = (d_scores**2).sum() - d_scores = self.model.ops.asarray(d_scores) - return loss, d_scores + d_scores = self.model.ops.asarray(d_scores, dtype='f') + return float(loss), d_scores def begin_training(self, gold_tuples, pipeline=None): - tag_map = dict(self.vocab.morphology.tag_map) + orig_tag_map = dict(self.vocab.morphology.tag_map) + new_tag_map = {} for raw_text, annots_brackets in gold_tuples: for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots for tag in tags: - if tag not in tag_map: - tag_map[tag] = {POS: X} + if tag in orig_tag_map: + new_tag_map[tag] = orig_tag_map[tag] + else: + new_tag_map[tag] = {POS: X} cdef Vocab vocab = self.vocab - vocab.morphology = Morphology(vocab.strings, tag_map, + vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology.lemmatizer) self.model = Softmax(self.vocab.morphology.n_tags) + print("Tagging", self.model.nO, "tags") + + def use_params(self, params): + with self.model.use_params(params): + yield + cdef class EntityRecognizer(LinearParser): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 8c04a327a..2e6687730 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function from collections import Counter import ujson +import contextlib from libc.math cimport exp cimport cython @@ -297,18 +298,15 @@ cdef class Parser: The number of threads with which to work on the buffer in parallel. Yields (Doc): Documents, in order. """ - cdef StateClass state + cdef StateClass parse_state cdef Doc doc queue = [] for batch in cytoolz.partition_all(batch_size, stream): - docs, tokvecs = zip(*batch) - states = self.parse_batch(docs, tokvecs) - for doc, state in zip(docs, states): - self.moves.finalize_state(state.c) - for i in range(doc.length): - doc.c[i] = state.c._sent[i] - self.moves.finalize_doc(doc) - yield doc + batch = list(batch) + docs, states = zip(*batch) + parse_states = self.parse_batch(docs, states[0]['tokvecs']) + self.set_annotations(docs, parse_states) + yield from zip(docs, states) def parse_batch(self, docs, tokvecs): cuda_stream = get_cuda_stream() @@ -324,7 +322,7 @@ cdef class Parser: scores = vec2scores(vectors) self.transition_batch(states, scores) todo = [st for st in states if not st.is_final()] - self.finish_batch(states, docs) + return states def update(self, docs, golds, state=None, drop=0., sgd=None): assert state is not None @@ -437,7 +435,7 @@ cdef class Parser: c_d_scores += d_scores.shape[1] return d_scores - def finish_batch(self, states, docs): + def set_annotations(self, docs, states): cdef StateClass state cdef Doc doc for state, doc in zip(states, docs): @@ -465,6 +463,12 @@ cdef class Parser: if self.model is True: self.model = self.Model(self.moves.n_moves, **cfg) + def use_params(self, params): + # Can't decorate cdef class :(. Workaround. + with self.model[0].use_params(params): + with self.model[1].use_params(params): + yield + def to_disk(self, path): path = util.ensure_path(path) with (path / 'model.bin').open('wb') as file_: