From e420e5a809657b25a25fd9885213e679b3cf082f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 7 May 2017 07:31:09 -0500
Subject: [PATCH] Tmp

---
 bin/parser/train_ud.py  | 25 +++++++++++++++++--------
 spacy/syntax/parser.pyx | 12 ++++++++++--
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py
index 79fba2b42..be471d196 100644
--- a/bin/parser/train_ud.py
+++ b/bin/parser/train_ud.py
@@ -16,6 +16,15 @@ from spacy.scorer import Scorer
 from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
 import spacy.attrs
 import io
+from thinc.neural.ops import CupyOps
+from thinc.neural import Model
+
+try:
+    import cupy
+    print("Using GPU")
+    Model.ops = CupyOps()
+except ImportError:
+    pass
 
 
 def read_conllx(loc, n=0):
@@ -137,10 +146,10 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
     Xs, ys = organize_data(vocab, train_sents)
     dev_Xs, dev_ys = organize_data(vocab, dev_sents)
 
-    Xs = Xs[:500]
-    ys = ys[:500]
-    dev_Xs = dev_Xs[:100]
-    dev_ys = dev_ys[:100]
+    Xs = Xs
+    ys = ys
+    dev_Xs = dev_Xs[:1000]
+    dev_ys = dev_ys[:1000]
     with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
         docs = list(Xs)
         for doc in docs:
@@ -154,9 +163,9 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
             print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
             nn_loss.append(0.)
         trainer.each_epoch.append(track_progress)
-        trainer.batch_size = 6
-        trainer.nb_epoch = 10000
-        for docs, golds in trainer.iterate(Xs, ys, progress_bar=False):
+        trainer.batch_size = 12
+        trainer.nb_epoch = 10
+        for docs, golds in trainer.iterate(Xs, ys):
             docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
             tokvecs, upd_tokvecs = encoder.begin_update(docs)
             for doc, tokvec in zip(docs, tokvecs):
@@ -165,7 +174,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
                 tagger.update(doc, gold)
             d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)
             upd_tokvecs(d_tokvecs, sgd=optimizer)
-            encoder.update(docs, golds, optimizer)
+            encoder.update(docs, golds, sgd=optimizer)
             nn_loss[-1] += loss
     nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 77d2a0ebc..a8ff384e3 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -132,7 +132,7 @@ cdef class Parser:
         """
         self.parse_batch([tokens])
         self.moves.finalize_doc(tokens)
-    
+
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         """
         Process a stream of documents.
@@ -175,6 +175,7 @@ cdef class Parser:
         tokvecs = [d.tensor for d in docs]
         all_states = list(states)
         todo = zip(states, tokvecs)
+        i = 0
         while todo:
             states, tokvecs = zip(*todo)
             scores, _ = self._begin_update(states, tokvecs)
@@ -182,6 +183,9 @@ cdef class Parser:
                 action = self.moves.c[guess]
                 action.do(state.c, action.label)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         for state, doc in zip(all_states, docs):
             self.moves.finalize_state(state.c)
             for i in range(doc.length):
@@ -218,6 +222,7 @@ cdef class Parser:
         todo = zip(states, tokvecs, golds, d_tokens)
         assert len(states) == len(todo)
         losses = []
+        i = 0
         while todo:
             states, tokvecs, golds, d_tokens = zip(*todo)
             scores, finish_update = self._begin_update(states, tokvecs)
@@ -232,6 +237,9 @@
 
             # Get unfinished states (and their matching gold and token gradients)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         return output, sum(losses)
 
     def _begin_update(self, states, tokvecs, drop=0.):
@@ -284,7 +292,7 @@ cdef class Parser:
             state.set_attributes(features[i], tokens[i], attr_names)
             state.set_token_vectors(tokvecs[i], all_tokvecs[i], tokens[i])
         return (tokens, features, tokvecs)
-    
+
     def _validate_batch(self, int[:, ::1] is_valid, states):
         cdef StateClass state
         cdef int i
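
Note on the GPU switch added at the top of bin/parser/train_ud.py: the
try/except block is the entire opt-in. When cupy imports cleanly, thinc's
global allocator is replaced with CupyOps, so models built afterwards run
their array operations on the GPU; when cupy is absent, the ImportError is
swallowed and training stays on CPU. A minimal standalone sketch of the same
pattern, assuming the thinc 6.x API this patch uses (thinc.neural.Model,
thinc.neural.ops.CupyOps):

    from thinc.neural import Model

    try:
        # cupy only imports successfully when a CUDA runtime is available.
        import cupy
        from thinc.neural.ops import CupyOps
        Model.ops = CupyOps()  # route thinc's array ops through cupy
        print("Using GPU")
    except ImportError:
        # No cupy: Model.ops keeps its default CPU (numpy) backend.
        pass

Since Model.ops is read when models are set up, the swap has to happen
before any model is constructed, which is presumably why the patch performs
it at import time alongside the other top-level imports.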