mirror of https://github.com/explosion/spaCy.git
Fix use_params and pipe methods
parent ca70b08661
commit c2c825127a
spacy/language.py

@@ -220,13 +220,19 @@ class Language(object):
     @contextmanager
     def use_params(self, params, **cfg):
-        contexts = [pipe.model.use_params(params) for pipe
-                    in self.pipeline if hasattr(pipe, 'model')
-                    and hasattr(pipe.model, 'use_params')]
+        contexts = [pipe.use_params(params) for pipe
+                    in self.pipeline if hasattr(pipe, 'use_params')]
+        # TODO: Having trouble with contextlib
+        # Workaround: these aren't actually context managers atm.
+        for context in contexts:
+            try:
+                next(context)
+            except StopIteration:
+                pass
         yield
         for context in contexts:
             try:
-                next(context.gen)
+                next(context)
             except StopIteration:
                 pass
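A sketch of the intended call site (hedged: `nlp` and `optimizer` are illustrative, mirroring spaCy's training loop, where the optimizer keeps an `averages` dict of exponentially averaged weights): temporarily swap the averaged weights into every component, do something such as evaluate or save, then restore the originals on exit.

    # Hypothetical usage; `optimizer.averages` is assumed to map parameter
    # ids to averaged weight arrays.
    with nlp.use_params(optimizer.averages):
        nlp.to_disk('/tmp/model-averaged')   # illustrative path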
@@ -242,7 +248,8 @@ class Language(object):
             parse (bool)
             entity (bool)
         """
-        stream = ((self.make_doc(text), None) for text in texts)
+        #stream = ((self.make_doc(text), None) for text in texts)
+        stream = ((doc, {}) for doc in texts)
         for proc in self.pipeline:
             name = getattr(proc, 'name', None)
             if name in disabled and not disabled[name]:
spacy/pipeline.pyx

@@ -61,8 +61,14 @@ class TokenVectorEncoder(object):
             state['tokvecs'] = tokvecs
         return state
 
-    def pipe(self, docs, **kwargs):
-        raise NotImplementedError
+    def pipe(self, stream, batch_size=128, n_threads=-1):
+        for batch in cytoolz.partition_all(batch_size, stream):
+            docs, states = zip(*batch)
+            tokvecs = self.predict(docs)
+            self.set_annotations(docs, tokvecs)
+            for state in states:
+                state['tokvecs'] = tokvecs
+            yield from zip(docs, states)
 
     def predict(self, docs):
         feats = self.doc2feats(docs)
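The convention these pipe() methods now share, sketched as a standalone driver (a hypothetical helper, not spaCy API): every item flowing between components is a (doc, state) pair, and the mutable state dict carries intermediate results such as 'tokvecs' downstream to later components.

    def pipe_through(components, docs, batch_size=128):
        # Pair each doc with a mutable state dict, as Language.pipe now does.
        stream = ((doc, {}) for doc in docs)
        # Each component's pipe() consumes and yields (doc, state) pairs,
        # batching internally with cytoolz.partition_all.
        for proc in components:
            stream = proc.pipe(stream, batch_size=batch_size)
        for doc, state in stream:
            yield doc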
@@ -96,6 +102,10 @@ class TokenVectorEncoder(object):
         if self.model is True:
             self.model = self.Model()
 
+    def use_params(self, params):
+        with self.model.use_params(params):
+            yield
+
 
 class NeuralTagger(object):
     name = 'nn_tagger'
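Because these use_params() methods contain a yield but no @contextmanager decorator, calling them returns a plain generator, which is why Language.use_params above drives them by hand with next(). A minimal, self-contained illustration of that protocol (the toy class is hypothetical):

    # Toy component with the same undecorated-generator use_params shape.
    class Component(object):
        def use_params(self, params):
            print('swap in', params)
            yield
            print('restore originals')

    gen = Component().use_params({'W': 'averaged'})  # a generator, not a context manager
    next(gen)          # run up to `yield`: parameters are now swapped in
    try:
        next(gen)      # resume past `yield`: restore runs, then StopIteration
    except StopIteration:
        pass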
@@ -112,11 +122,13 @@ class NeuralTagger(object):
         return state
 
     def pipe(self, stream, batch_size=128, n_threads=-1):
-        for batch in cytoolz.partition_all(batch_size, batch):
-            docs, tokvecs = zip(*batch)
-            tag_ids = self.predict(docs, tokvecs)
+        for batch in cytoolz.partition_all(batch_size, stream):
+            docs, states = zip(*batch)
+            tag_ids = self.predict(states[0]['tokvecs'])
             self.set_annotations(docs, tag_ids)
-            yield from docs
+            for state in states:
+                state['tag_ids'] = tag_ids
+            yield from zip(docs, states)
 
     def predict(self, tokvecs):
         scores = self.model(tokvecs)
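Reading states[0]['tokvecs'] relies on TokenVectorEncoder.pipe (above) having stored the same batch-wide array under every state in the batch, so any single state can supply it. A toy illustration of that invariant:

    tokvecs = object()  # stand-in for the batch's vector array
    states = [{'tokvecs': tokvecs} for _ in range(4)]
    assert states[0]['tokvecs'] is states[3]['tokvecs']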
@@ -130,7 +142,7 @@ class NeuralTagger(object):
             docs = [docs]
         cdef Doc doc
         cdef int idx = 0
-        cdef int i, j
+        cdef int i, j, tag_id
         cdef Vocab vocab = self.vocab
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[idx:idx+len(doc)]
@@ -147,7 +159,6 @@ class NeuralTagger(object):
         self.model.nI = tokvecs.shape[1]
 
         tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
-
         loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
 
         d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)
@@ -167,24 +178,33 @@ class NeuralTagger(object):
                 for tag in gold.tags:
                     correct[idx] = tag_index[tag]
                     idx += 1
-        correct = self.model.ops.xp.array(correct)
+        correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
         loss = (d_scores**2).sum()
-        d_scores = self.model.ops.asarray(d_scores)
-        return loss, d_scores
+        d_scores = self.model.ops.asarray(d_scores, dtype='f')
+        return float(loss), d_scores
 
     def begin_training(self, gold_tuples, pipeline=None):
-        tag_map = dict(self.vocab.morphology.tag_map)
+        orig_tag_map = dict(self.vocab.morphology.tag_map)
+        new_tag_map = {}
         for raw_text, annots_brackets in gold_tuples:
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
                 for tag in tags:
-                    if tag not in tag_map:
-                        tag_map[tag] = {POS: X}
+                    if tag in orig_tag_map:
+                        new_tag_map[tag] = orig_tag_map[tag]
+                    else:
+                        new_tag_map[tag] = {POS: X}
         cdef Vocab vocab = self.vocab
-        vocab.morphology = Morphology(vocab.strings, tag_map,
+        vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                       vocab.morphology.lemmatizer)
         self.model = Softmax(self.vocab.morphology.n_tags)
         print("Tagging", self.model.nO, "tags")
 
+    def use_params(self, params):
+        with self.model.use_params(params):
+            yield
+
 
 cdef class EntityRecognizer(LinearParser):
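What get_loss computes, sketched in plain numpy (assumed shapes; 'i' and 'f' are the int and float dtypes used above): the gradient passed back is probs minus one-hot truth, exactly scores - to_categorical(correct), which matches the usual softmax-plus-cross-entropy gradient shortcut; the squared sum is only reported as the loss figure.

    import numpy

    scores = numpy.asarray([[0.7, 0.2, 0.1],
                            [0.1, 0.8, 0.1]], dtype='f')  # softmax outputs
    correct = numpy.asarray([0, 2], dtype='i')            # gold tag ids
    one_hot = numpy.eye(scores.shape[1], dtype='f')[correct]
    d_scores = scores - one_hot   # gradient handed to the backprop callback
    loss = (d_scores ** 2).sum()  # scalar reported for monitoring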
spacy/syntax/nn_parser.pyx

@@ -7,6 +7,7 @@ from __future__ import unicode_literals, print_function
 
 from collections import Counter
 import ujson
+import contextlib
 
 from libc.math cimport exp
 cimport cython
@@ -297,18 +298,15 @@ cdef class Parser:
             The number of threads with which to work on the buffer in parallel.
         Yields (Doc): Documents, in order.
         """
-        cdef StateClass state
+        cdef StateClass parse_state
         cdef Doc doc
         queue = []
         for batch in cytoolz.partition_all(batch_size, stream):
-            docs, tokvecs = zip(*batch)
-            states = self.parse_batch(docs, tokvecs)
-            for doc, state in zip(docs, states):
-                self.moves.finalize_state(state.c)
-                for i in range(doc.length):
-                    doc.c[i] = state.c._sent[i]
-                self.moves.finalize_doc(doc)
-                yield doc
+            batch = list(batch)
+            docs, states = zip(*batch)
+            parse_states = self.parse_batch(docs, states[0]['tokvecs'])
+            self.set_annotations(docs, parse_states)
+            yield from zip(docs, states)
 
     def parse_batch(self, docs, tokvecs):
         cuda_stream = get_cuda_stream()
@@ -324,7 +322,7 @@ cdef class Parser:
             scores = vec2scores(vectors)
             self.transition_batch(states, scores)
             todo = [st for st in states if not st.is_final()]
-        self.finish_batch(states, docs)
+        self.set_annotations(docs, states)
         return states
 
     def update(self, docs, golds, state=None, drop=0., sgd=None):
         assert state is not None
@@ -437,7 +435,7 @@ cdef class Parser:
             c_d_scores += d_scores.shape[1]
         return d_scores
 
-    def finish_batch(self, states, docs):
+    def set_annotations(self, docs, states):
         cdef StateClass state
         cdef Doc doc
         for state, doc in zip(states, docs):
@@ -465,6 +463,12 @@ cdef class Parser:
         if self.model is True:
             self.model = self.Model(self.moves.n_moves, **cfg)
 
+    def use_params(self, params):
+        # Can't decorate cdef class :(. Workaround.
+        with self.model[0].use_params(params):
+            with self.model[1].use_params(params):
+                yield
+
     def to_disk(self, path):
         path = util.ensure_path(path)
         with (path / 'model.bin').open('wb') as file_:
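Since @contextmanager can't decorate methods of a cdef class, the parser nests plain with-blocks over what appears to be a pair of sub-models in self.model. A hedged sketch of the same idea for any number of sub-models, using contextlib.ExitStack (the helper name is illustrative; thinc's Model.use_params is a real context manager, so enter_context applies):

    from contextlib import ExitStack

    def use_params_all(models, params):
        # Enter each sub-model's use_params context; ExitStack unwinds them
        # all, in reverse order, once the caller advances the generator
        # past the yield.
        with ExitStack() as stack:
            for model in models:
                stack.enter_context(model.use_params(params))
            yield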