Matthew Honnibal 2017-05-07 07:31:09 -05:00
parent 12039e80ca
commit e420e5a809
2 changed files with 27 additions and 10 deletions

View File

@@ -16,6 +16,15 @@ from spacy.scorer import Scorer
 from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
 import spacy.attrs
 import io
+from thinc.neural.ops import CupyOps
+from thinc.neural import Model
+
+try:
+    import cupy
+    print("Using GPU")
+    Model.ops = CupyOps()
+except ImportError:
+    pass
 
 
 def read_conllx(loc, n=0):
@@ -137,10 +146,10 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
     Xs, ys = organize_data(vocab, train_sents)
     dev_Xs, dev_ys = organize_data(vocab, dev_sents)
-    Xs = Xs[:500]
-    ys = ys[:500]
-    dev_Xs = dev_Xs[:100]
-    dev_ys = dev_ys[:100]
+    Xs = Xs
+    ys = ys
+    dev_Xs = dev_Xs[:1000]
+    dev_ys = dev_ys[:1000]
 
     with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
         docs = list(Xs)
         for doc in docs:
@@ -154,9 +163,9 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
             print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
             nn_loss.append(0.)
         trainer.each_epoch.append(track_progress)
-        trainer.batch_size = 6
-        trainer.nb_epoch = 10000
-        for docs, golds in trainer.iterate(Xs, ys, progress_bar=False):
+        trainer.batch_size = 12
+        trainer.nb_epoch = 10
+        for docs, golds in trainer.iterate(Xs, ys):
             docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
             tokvecs, upd_tokvecs = encoder.begin_update(docs)
             for doc, tokvec in zip(docs, tokvecs):
@@ -165,7 +174,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
                 tagger.update(doc, gold)
             d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)
             upd_tokvecs(d_tokvecs, sgd=optimizer)
-            encoder.update(docs, golds, optimizer)
+            encoder.update(docs, golds, sgd=optimizer)
             nn_loss[-1] += loss
     nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
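
The optional-GPU hook added at the top of this script probes for cupy and, only when the import succeeds, swaps thinc's global ops so that every model created afterwards computes on the GPU. Below is a minimal standalone sketch of the same pattern, assuming the pre-1.0 thinc.neural API this commit targets (these module paths have since moved in modern thinc):

# Standalone sketch of the optional-GPU pattern used in the diff above.
# Assumes thinc's pre-1.0 layout (thinc.neural.Model, thinc.neural.ops.CupyOps).
from thinc.neural import Model
from thinc.neural.ops import CupyOps

try:
    import cupy  # probe only: succeeds when a CuPy/CUDA stack is installed
except ImportError:
    cupy = None

if cupy is not None:
    Model.ops = CupyOps()  # models created after this line use GPU kernels
    print("Using GPU")

Keeping the probe separate from the side effect means the except ImportError clause cannot silently swallow an ImportError raised inside CupyOps() itself, and "Using GPU" is only printed once the ops have actually been swapped.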

View File

@@ -132,7 +132,7 @@ cdef class Parser:
         """
         self.parse_batch([tokens])
         self.moves.finalize_doc(tokens)
-        
+
     def pipe(self, stream, int batch_size=1000, int n_threads=2):
         """
         Process a stream of documents.
@@ -175,6 +175,7 @@ cdef class Parser:
         tokvecs = [d.tensor for d in docs]
         all_states = list(states)
         todo = zip(states, tokvecs)
+        i = 0
         while todo:
             states, tokvecs = zip(*todo)
             scores, _ = self._begin_update(states, tokvecs)
@@ -182,6 +183,9 @@ cdef class Parser:
                 action = self.moves.c[guess]
                 action.do(state.c, action.label)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         for state, doc in zip(all_states, docs):
             self.moves.finalize_state(state.c)
             for i in range(doc.length):
@@ -218,6 +222,7 @@ cdef class Parser:
         todo = zip(states, tokvecs, golds, d_tokens)
         assert len(states) == len(todo)
         losses = []
+        i = 0
         while todo:
             states, tokvecs, golds, d_tokens = zip(*todo)
             scores, finish_update = self._begin_update(states, tokvecs)
@@ -232,6 +237,9 @@ cdef class Parser:
 
             # Get unfinished states (and their matching gold and token gradients)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         return output, sum(losses)
 
     def _begin_update(self, states, tokvecs, drop=0.):
@@ -284,7 +292,7 @@ cdef class Parser:
             state.set_attributes(features[i], tokens[i], attr_names)
             state.set_token_vectors(tokvecs[i], all_tokvecs[i], tokens[i])
         return (tokens, features, tokvecs)
-        
+
     def _validate_batch(self, int[:, ::1] is_valid, states):
         cdef StateClass state
         cdef int i
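
Both counters added to parser.pyx cap the transition loops at 10,000 iterations. One plausible motivation (an inference, not stated in the commit): on Python 3, filter() returns a lazy iterator, and iterator objects are truthy even when exhausted, so `while todo:` cannot detect on its own that every state is finished. A quick plain-Python illustration, independent of spaCy:

# Demonstrates why `while todo:` misbehaves once `todo` is a filter object.
items = [1, 2, 3]

todo = filter(lambda x: x < 0, items)  # matches nothing
print(bool(todo))   # True  -- a filter iterator is truthy even when empty
print(list(todo))   # []    -- yet it yields no items

todo = list(filter(lambda x: x < 0, items))
print(bool(todo))   # False -- materializing restores the emptiness test

Wrapping the filter(...) calls in list(...) would let the loops terminate naturally on emptiness; the hard cap chosen here also guards against parser states that never reach a final configuration.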