From e420e5a809657b25a25fd9885213e679b3cf082f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 7 May 2017 07:31:09 -0500
Subject: [PATCH] Tmp

---
 bin/parser/train_ud.py  | 25 +++++++++++++++++--------
 spacy/syntax/parser.pyx | 12 ++++++++++--
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py
index 79fba2b42..be471d196 100644
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -16,6 +16,15 @@ from spacy.scorer import Scorer
 from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
 import spacy.attrs
 import io
+from thinc.neural.ops import CupyOps
+from thinc.neural import Model
+
+try:
+    import cupy
+    print("Using GPU")
+    Model.ops = CupyOps()
+except ImportError:
+    pass
 
 
 def read_conllx(loc, n=0):
@@ -137,10 +146,10 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
     Xs, ys = organize_data(vocab, train_sents)
     dev_Xs, dev_ys = organize_data(vocab, dev_sents)
 
-    Xs = Xs[:500]
-    ys = ys[:500]
-    dev_Xs = dev_Xs[:100]
-    dev_ys = dev_ys[:100]
+    Xs = Xs
+    ys = ys
+    dev_Xs = dev_Xs[:1000]
+    dev_ys = dev_ys[:1000]
     with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
         docs = list(Xs)
         for doc in docs:
@@ -154,9 +163,9 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
             print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
             nn_loss.append(0.)
         trainer.each_epoch.append(track_progress)
-        trainer.batch_size = 6
-        trainer.nb_epoch = 10000
-        for docs, golds in trainer.iterate(Xs, ys, progress_bar=False):
+        trainer.batch_size = 12
+        trainer.nb_epoch = 10
+        for docs, golds in trainer.iterate(Xs, ys):
             docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
             tokvecs, upd_tokvecs = encoder.begin_update(docs)
             for doc, tokvec in zip(docs, tokvecs):
@@ -165,7 +174,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
                 tagger.update(doc, gold)
             d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)
             upd_tokvecs(d_tokvecs, sgd=optimizer)
-            encoder.update(docs, golds, optimizer)
+            encoder.update(docs, golds, sgd=optimizer)
             nn_loss[-1] += loss
     nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 77d2a0ebc..a8ff384e3 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -132,7 +132,7 @@ cdef class Parser:
         """
         self.parse_batch([tokens])
         self.moves.finalize_doc(tokens)
-    
+
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         """
         Process a stream of documents.
@@ -175,6 +175,7 @@ cdef class Parser:
         tokvecs = [d.tensor for d in docs]
         all_states = list(states)
         todo = zip(states, tokvecs)
+        i = 0
         while todo:
             states, tokvecs = zip(*todo)
             scores, _ = self._begin_update(states, tokvecs)
@@ -182,6 +183,9 @@ cdef class Parser:
                 action = self.moves.c[guess]
                 action.do(state.c, action.label)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         for state, doc in zip(all_states, docs):
             self.moves.finalize_state(state.c)
             for i in range(doc.length):
@@ -218,6 +222,7 @@ cdef class Parser:
         todo = zip(states, tokvecs, golds, d_tokens)
         assert len(states) == len(todo)
         losses = []
+        i = 0
         while todo:
             states, tokvecs, golds, d_tokens = zip(*todo)
             scores, finish_update = self._begin_update(states, tokvecs)
@@ -232,6 +237,9 @@
 
             # Get unfinished states (and their matching gold and token gradients)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         return output, sum(losses)
 
     def _begin_update(self, states, tokvecs, drop=0.):
@@ -284,7 +292,7 @@ cdef class Parser:
             state.set_attributes(features[i], tokens[i], attr_names)
             state.set_token_vectors(tokvecs[i], all_tokvecs[i], tokens[i])
         return (tokens, features, tokvecs)
-    
+
     def _validate_batch(self, int[:, ::1] is_valid, states):
         cdef StateClass state
         cdef int i
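
Note on the GPU switch added at the top of bin/parser/train_ud.py: the
try/except block is the entire opt-in. When cupy imports cleanly, thinc's
global allocator is replaced with CupyOps, so models built afterwards run
their array operations on the GPU; when cupy is absent, the ImportError is
swallowed and training stays on CPU. A minimal standalone sketch of the same
pattern, assuming the thinc 6.x API this patch uses (thinc.neural.Model,
thinc.neural.ops.CupyOps):

    from thinc.neural import Model

    try:
        # cupy only imports successfully when a CUDA runtime is available.
        import cupy
        from thinc.neural.ops import CupyOps
        Model.ops = CupyOps()  # route thinc's array ops through cupy
        print("Using GPU")
    except ImportError:
        # No cupy: Model.ops keeps its default CPU (numpy) backend.
        pass

Since Model.ops is read when models are set up, the swap has to happen
before any model is constructed, which is presumably why the patch performs
it at import time alongside the other top-level imports.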