Matthew Honnibal 2017-05-07 07:31:09 -05:00
parent 12039e80ca
commit e420e5a809
2 changed files with 27 additions and 10 deletions

View File

@@ -16,6 +16,15 @@ from spacy.scorer import Scorer
 from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
 import spacy.attrs
 import io
+from thinc.neural.ops import CupyOps
+from thinc.neural import Model
+try:
+    import cupy
+    print("Using GPU")
+    Model.ops = CupyOps()
+except ImportError:
+    pass
+


 def read_conllx(loc, n=0):
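
Note on the new import block: this is thinc 6.x's opt-in GPU switch. Setting Model.ops to CupyOps() before any models are built makes subsequent layers allocate and compute with cupy instead of numpy; if cupy isn't installed, the ImportError is swallowed and the script stays on CPU. A minimal standalone sketch of the same fallback pattern (the explicit NumpyOps default is illustrative, not part of this commit):

    from thinc.neural import Model
    from thinc.neural.ops import NumpyOps

    try:
        import cupy  # present only when a CUDA-enabled build is installed
        from thinc.neural.ops import CupyOps
        Model.ops = CupyOps()  # new layers now allocate on the GPU
        print("Using GPU")
    except ImportError:
        Model.ops = NumpyOps()  # explicit CPU fallback (illustrative)
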
@@ -137,10 +146,10 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
     Xs, ys = organize_data(vocab, train_sents)
     dev_Xs, dev_ys = organize_data(vocab, dev_sents)
-    Xs = Xs[:500]
-    ys = ys[:500]
-    dev_Xs = dev_Xs[:100]
-    dev_ys = dev_ys[:100]
+    Xs = Xs
+    ys = ys
+    dev_Xs = dev_Xs[:1000]
+    dev_ys = dev_ys[:1000]
     with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
         docs = list(Xs)
         for doc in docs:
@@ -154,9 +163,9 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
             print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
             nn_loss.append(0.)
         trainer.each_epoch.append(track_progress)
-        trainer.batch_size = 6
-        trainer.nb_epoch = 10000
-        for docs, golds in trainer.iterate(Xs, ys, progress_bar=False):
+        trainer.batch_size = 12
+        trainer.nb_epoch = 10
+        for docs, golds in trainer.iterate(Xs, ys):
             docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
             tokvecs, upd_tokvecs = encoder.begin_update(docs)
             for doc, tokvec in zip(docs, tokvecs):
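
For context on the hyperparameter swap (batch_size 6 -> 12, nb_epoch 10000 -> 10, progress_bar keyword dropped): trainer.iterate drives the outer training loop, yielding shuffled minibatches for nb_epoch full passes over the data. A rough, hypothetical sketch of the loop it stands in for, assuming thinc's Trainer semantics (shuffling and batching details simplified):

    import random

    def iterate(Xs, ys, batch_size=12, nb_epoch=10):
        # Roughly what trainer.iterate provides: nb_epoch shuffled
        # passes over the data, yielded in batch_size minibatches.
        indices = list(range(len(Xs)))
        for epoch in range(nb_epoch):
            random.shuffle(indices)
            for start in range(0, len(indices), batch_size):
                batch = indices[start:start + batch_size]
                yield [Xs[i] for i in batch], [ys[i] for i in batch]
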
@@ -165,7 +174,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
                 tagger.update(doc, gold)
             d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)
             upd_tokvecs(d_tokvecs, sgd=optimizer)
-            encoder.update(docs, golds, optimizer)
+            encoder.update(docs, golds, sgd=optimizer)
             nn_loss[-1] += loss
     nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
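
The single-token change to the encoder.update call is the substantive fix in this hunk. Assuming the method follows the same keyword convention as parser.update and upd_tokvecs directly above (something like update(self, docs, golds, drop=0., sgd=None); a hypothetical signature, for illustration only), passing the optimizer positionally binds it to the dropout parameter and no weight update ever runs:

    def update(self, docs, golds, drop=0., sgd=None):
        # update(docs, golds, optimizer) would bind the optimizer to
        # `drop` and leave sgd=None, silently skipping the gradient
        # step; update(docs, golds, sgd=optimizer) applies it.
        ...
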

View File

@@ -175,6 +175,7 @@ cdef class Parser:
         tokvecs = [d.tensor for d in docs]
         all_states = list(states)
         todo = zip(states, tokvecs)
+        i = 0
         while todo:
             states, tokvecs = zip(*todo)
             scores, _ = self._begin_update(states, tokvecs)
@@ -182,6 +183,9 @@ cdef class Parser:
                 action = self.moves.c[guess]
                 action.do(state.c, action.label)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         for state, doc in zip(all_states, docs):
             self.moves.finalize_state(state.c)
             for i in range(doc.length):
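
The i counter added here turns the greedy transition loop into a bounded one: a state that never reaches a final configuration (for instance, a transition system that keeps proposing actions that don't consume the buffer) would otherwise spin forever. A generic sketch of the guard pattern, with all names invented for illustration:

    MAX_STEPS = 10000  # same cap as the commit; far beyond any real sentence

    def run_bounded(states, step, is_final):
        # Advance every state until it is final, but never loop forever:
        # give up after MAX_STEPS sweeps even if some states remain.
        todo = list(states)
        for _ in range(MAX_STEPS):
            if not todo:
                break
            for state in todo:
                step(state)
            todo = [s for s in todo if not is_final(s)]
        return states
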
@@ -218,6 +222,7 @@ cdef class Parser:
         todo = zip(states, tokvecs, golds, d_tokens)
         assert len(states) == len(todo)
         losses = []
+        i = 0
         while todo:
             states, tokvecs, golds, d_tokens = zip(*todo)
             scores, finish_update = self._begin_update(states, tokvecs)
@@ -232,6 +237,9 @@ cdef class Parser:
             # Get unfinished states (and their matching gold and token gradients)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         return output, sum(losses)

     def _begin_update(self, states, tokvecs, drop=0.):
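
One caveat about the loops these caps guard: the assert len(states) == len(todo) above only works where zip and filter return lists, i.e. under Python 2. On Python 3 both return lazy iterators, and an exhausted filter object is still truthy, so while todo: would never exit on its own and the new i >= 10000 cap would be the only terminator. A small runnable demonstration of the pitfall:

    # Python 3: filter() returns a lazy iterator, which is always truthy.
    todo = filter(lambda sp: False, [1, 2, 3])
    print(bool(todo))   # True, even though it will yield nothing
    print(list(todo))   # []
    # Materializing with todo = list(filter(...)) restores the
    # Python 2 truthiness that `while todo:` relies on.
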