Fix tagging model

2017-08-06 01:50:08 +02:00 · 2017-08-06 01:50:08 +02:00 · e9ab800e15
parent 468c138ab3
commit e9ab800e15
2 changed files with 16 additions and 23 deletions
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -346,16 +346,16 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):


 def fine_tune(model1, combine=None):
-    def fine_tune_fwd(docs, drop=0.):
+    def fine_tune_fwd(docs_tokvecs, drop=0.):
+        docs, tokvecs = docs_tokvecs
+        lengths = model.ops.asarray([len(doc) for doc in docs], dtype='i')
        X1, bp_X1 = model1.begin_update(docs)
-        lengths = [len(doc) for doc in docs]
-        X2 = model1.ops.flatten(X1)

        def fine_tune_bwd(d_output, sgd=None):
-            bp_X1(d_output, sgd=sgd)
+            bp_X1(model1.ops.flatten(d_output), sgd=sgd)
            return d_output

-        return (X1+X2, lengths), fine_tune_bwd
+        return model1.ops.unflatten(X1+X2, lengths), fine_tune_bwd
    model = wrap(fine_tune_fwd)
    return model

@@ -410,30 +410,21 @@ def preprocess_doc(docs, drop=0.):
 def build_tagger_model(nr_class, token_vector_width, **cfg):
    with Model.define_operators({'>>': chain, '+': add}):
        # Input: (doc, tensor) tuples
-        embed_docs = with_getitem(0, 
+        embed_docs = ( 
            FeatureExtracter([NORM])
+            >> flatten
            >> HashEmbed(token_vector_width, 1000)
-            >> flatten_add_lengths
        )
 
        model = ( 
            fine_tune(embed_docs)
-            >> 
-            with_getitem(0, 
-                FeatureExtracter([NORM])
-                >> HashEmbed(token_vector_width, 1000)
-                >> flatten_add_lengths
-            )
-            >> with_getitem(1,
-                flatten_add_lengths) 
-            >> add_tuples
            >> with_flatten(
                Maxout(token_vector_width, token_vector_width)
                >> Softmax(nr_class, token_vector_width)
            )
        )
-        return model
-
+    model.nI = None
+    return model


 def build_text_classifier(nr_class, width=64, **cfg):
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -253,23 +253,25 @@ class NeuralTagger(BaseThincComponent):
        self.cfg = dict(cfg)

    def __call__(self, doc):
-        tags = self.predict([doc.tensor])
+        tags = self.predict(([doc], [doc.tensor]))
        self.set_annotations([doc], tags)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        for docs in cytoolz.partition_all(batch_size, stream):
+            docs = list(docs)
            tokvecs = [d.tensor for d in docs]
-            tag_ids = self.predict(tokvecs)
+            tag_ids = self.predict((docs, tokvecs))
            self.set_annotations(docs, tag_ids)
            yield from docs

-    def predict(self, tokvecs):
-        scores = self.model(tokvecs)
+    def predict(self, docs_tokvecs):
+        scores = self.model(docs_tokvecs)
        scores = self.model.ops.flatten(scores)
        guesses = scores.argmax(axis=1)
        if not isinstance(guesses, numpy.ndarray):
            guesses = guesses.get()
+        tokvecs = docs_tokvecs[1]
        guesses = self.model.ops.unflatten(guesses,
                    [tv.shape[0] for tv in tokvecs])
        return guesses
@@ -295,7 +297,7 @@ class NeuralTagger(BaseThincComponent):
        if self.model.nI is None:
            self.model.nI = tokvecs[0].shape[1]

-        tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop)
+        tag_scores, bp_tag_scores = self.model.begin_update(docs_tokvecs, drop=drop)
        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)

        d_tokvecs = bp_tag_scores(d_tag_scores, sgd=sgd)