mirror of https://github.com/explosion/spaCy.git
Tmp
This commit is contained in:
parent 12039e80ca
commit e420e5a809
@@ -16,6 +16,15 @@ from spacy.scorer import Scorer
 from spacy.language_data.tag_map import TAG_MAP as DEFAULT_TAG_MAP
 import spacy.attrs
 import io
+from thinc.neural.ops import CupyOps
+from thinc.neural import Model
+
+try:
+    import cupy
+    print("Using GPU")
+    Model.ops = CupyOps()
+except ImportError:
+    pass
 
 
 def read_conllx(loc, n=0):
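The import hunk above switches thinc's compute backend to the GPU when cupy can be imported and silently stays on the CPU otherwise. A minimal standalone sketch of the same pattern; the use_gpu flag is illustrative and not part of this commit:

from thinc.neural import Model
from thinc.neural.ops import CupyOps

try:
    import cupy  # present only when CUDA and cupy are installed
    Model.ops = CupyOps()  # route thinc's array operations through cupy
    use_gpu = True
except ImportError:
    use_gpu = False  # keep the default NumPy backend
print("Using GPU" if use_gpu else "Using CPU")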
@@ -137,10 +146,10 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
 
     Xs, ys = organize_data(vocab, train_sents)
     dev_Xs, dev_ys = organize_data(vocab, dev_sents)
-    Xs = Xs[:500]
-    ys = ys[:500]
-    dev_Xs = dev_Xs[:100]
-    dev_ys = dev_ys[:100]
+    Xs = Xs
+    ys = ys
+    dev_Xs = dev_Xs[:1000]
+    dev_ys = dev_ys[:1000]
     with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
         docs = list(Xs)
         for doc in docs:
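This hunk lifts the hard 500-example cap on the training data and raises the development caps from 100 to 1000 examples. Note that Xs = Xs and ys = ys are now no-ops left in place of the old truncation. A tidier equivalent, sketched with an illustrative helper (the cap name is not from this commit):

def cap(xs, n=None):
    # Illustrative helper: keep at most n examples, or all of them when n is None.
    return xs if n is None else xs[:n]

# With this commit's limits: no cap on training data, 1000 dev examples.
# Xs, ys = cap(Xs), cap(ys)          # the no-op lines could simply be deleted
# dev_Xs, dev_ys = cap(dev_Xs, 1000), cap(dev_ys, 1000)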
@@ -154,9 +163,9 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
             print('%d:\t%.3f\t%.3f\t%.3f' % (itn, nn_loss[-1], scorer.uas, scorer.tags_acc))
             nn_loss.append(0.)
         trainer.each_epoch.append(track_progress)
-        trainer.batch_size = 6
-        trainer.nb_epoch = 10000
-        for docs, golds in trainer.iterate(Xs, ys, progress_bar=False):
+        trainer.batch_size = 12
+        trainer.nb_epoch = 10
+        for docs, golds in trainer.iterate(Xs, ys):
             docs = [Doc(vocab, words=[w.text for w in doc]) for doc in docs]
             tokvecs, upd_tokvecs = encoder.begin_update(docs)
             for doc, tokvec in zip(docs, tokvecs):
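Here the trainer moves from a debugging configuration to a short real run: the batch size doubles from 6 to 12, the epoch count drops from 10000 to 10, and removing progress_bar=False restores thinc's default progress bar. The resulting loop shape, as a fragment reusing the script's trainer, Xs, and ys:

trainer.batch_size = 12   # examples consumed per weight update
trainer.nb_epoch = 10     # full passes over Xs and ys
for docs, golds in trainer.iterate(Xs, ys):  # progress bar shown by default
    ...  # per-batch forward and backward pass, shown in the next hunk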
@@ -165,7 +174,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
                 tagger.update(doc, gold)
             d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)
             upd_tokvecs(d_tokvecs, sgd=optimizer)
-            encoder.update(docs, golds, optimizer)
+            encoder.update(docs, golds, sgd=optimizer)
             nn_loss[-1] += loss
     nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
     nlp.end_training(model_dir)
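The one functional change in this hunk is the modified line: the optimizer is now passed to encoder.update as the keyword argument sgd=optimizer, matching the parser.update and upd_tokvecs calls, rather than positionally. Collected into one place, the per-batch update wiring reads as below; the function wrapper and the for-loop header are inferred for illustration, and only the call sequence comes from the diff:

def update_batch(encoder, tagger, parser, docs, golds, optimizer):
    # Joint tagger/parser/encoder update, as wired up in this commit.
    tokvecs, upd_tokvecs = encoder.begin_update(docs)  # forward pass; returns a backprop callback
    for doc, gold in zip(docs, golds):
        tagger.update(doc, gold)  # the tagger trains on its own objective
    d_tokvecs, loss = parser.update(docs, golds, sgd=optimizer)  # parser loss and token-vector gradients
    upd_tokvecs(d_tokvecs, sgd=optimizer)  # push the gradients back into the encoder
    encoder.update(docs, golds, sgd=optimizer)  # the fix: optimizer passed by keyword here too
    return loss

The remaining hunks apply to the Cython parser, the cdef class Parser named in the hunk headers.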
@@ -175,6 +175,7 @@ cdef class Parser:
         tokvecs = [d.tensor for d in docs]
         all_states = list(states)
         todo = zip(states, tokvecs)
+        i = 0
         while todo:
             states, tokvecs = zip(*todo)
             scores, _ = self._begin_update(states, tokvecs)
@@ -182,6 +183,9 @@ cdef class Parser:
                 action = self.moves.c[guess]
                 action.do(state.c, action.label)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         for state, doc in zip(all_states, docs):
             self.moves.finalize_state(state.c)
             for i in range(doc.length):
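The counter added here caps the greedy decoding loop at 10000 transitions, so a state that never reaches a final configuration can no longer hang the parser; the two hunks below add the identical guard to the training-side loop. The pattern in isolation, with an illustrative advance callback standing in for the transition step:

def run_until_final(todo, advance, max_steps=10000):
    # Bounded work loop mirroring the guard added above.
    # advance is hypothetical: it applies one transition to every
    # unfinished state and returns the states still in progress.
    i = 0
    while todo:
        todo = advance(todo)
        i += 1
        if i >= max_steps:
            break  # give up rather than loop forever on a stuck state
    return todo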
@@ -218,6 +222,7 @@ cdef class Parser:
         todo = zip(states, tokvecs, golds, d_tokens)
         assert len(states) == len(todo)
         losses = []
+        i = 0
         while todo:
             states, tokvecs, golds, d_tokens = zip(*todo)
             scores, finish_update = self._begin_update(states, tokvecs)
@@ -232,6 +237,9 @@ cdef class Parser:
 
             # Get unfinished states (and their matching gold and token gradients)
             todo = filter(lambda sp: not sp[0].py_is_final(), todo)
+            i += 1
+            if i >= 10000:
+                break
         return output, sum(losses)
 
     def _begin_update(self, states, tokvecs, drop=0.):
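A portability note on the unchanged context around these guards: while todo: and assert len(states) == len(todo) rely on Python 2 semantics, where zip and filter return lists. Under Python 3 both return lazy iterators, which have no len and are always truthy, so a direct port would need something like this fragment, reusing the names from the hunks:

todo = list(zip(states, tokvecs, golds, d_tokens))  # zip() is lazy in Py3; materialize it
assert len(states) == len(todo)                     # len() needs a real list
while todo:
    ...  # one transition step per unfinished state
    todo = [sp for sp in todo if not sp[0].py_is_final()]  # list comprehension replaces filter(...)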
|