mirror of https://github.com/explosion/spaCy.git
Fix training with preset vectors
This commit is contained in:
parent
05596159bf
commit
e93d43a43a
|
@ -30,14 +30,14 @@ from ..compat import json_dumps
|
||||||
n_iter=("number of iterations", "option", "n", int),
|
n_iter=("number of iterations", "option", "n", int),
|
||||||
n_sents=("number of sentences", "option", "ns", int),
|
n_sents=("number of sentences", "option", "ns", int),
|
||||||
use_gpu=("Use GPU", "option", "g", int),
|
use_gpu=("Use GPU", "option", "g", int),
|
||||||
resume=("Whether to resume training", "flag", "R", bool),
|
vectors=("Model to load vectors from", "option", "v"),
|
||||||
no_tagger=("Don't train tagger", "flag", "T", bool),
|
no_tagger=("Don't train tagger", "flag", "T", bool),
|
||||||
no_parser=("Don't train parser", "flag", "P", bool),
|
no_parser=("Don't train parser", "flag", "P", bool),
|
||||||
no_entities=("Don't train NER", "flag", "N", bool),
|
no_entities=("Don't train NER", "flag", "N", bool),
|
||||||
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
gold_preproc=("Use gold preprocessing", "flag", "G", bool),
|
||||||
)
|
)
|
||||||
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||||
use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
|
use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
|
||||||
gold_preproc=False):
|
gold_preproc=False):
|
||||||
"""
|
"""
|
||||||
Train a model. Expects data in spaCy's JSON format.
|
Train a model. Expects data in spaCy's JSON format.
|
||||||
|
@ -73,25 +73,20 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||||
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
|
||||||
n_train_words = corpus.count_train()
|
n_train_words = corpus.count_train()
|
||||||
|
|
||||||
if not resume:
|
lang_class = util.get_lang_class(lang)
|
||||||
lang_class = util.get_lang_class(lang)
|
nlp = lang_class(pipeline=pipeline)
|
||||||
nlp = lang_class(pipeline=pipeline)
|
if vectors:
|
||||||
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
util.load_model(vectors, vocab=nlp.vocab)
|
||||||
else:
|
optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
|
||||||
print("Load resume")
|
|
||||||
util.use_gpu(use_gpu)
|
|
||||||
nlp = _resume_model(lang, pipeline, corpus)
|
|
||||||
optimizer = nlp.resume_training(device=use_gpu)
|
|
||||||
lang_class = nlp.__class__
|
|
||||||
|
|
||||||
nlp._optimizer = None
|
nlp._optimizer = None
|
||||||
|
|
||||||
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
|
||||||
try:
|
try:
|
||||||
|
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
|
||||||
|
gold_preproc=gold_preproc, max_length=0)
|
||||||
|
train_docs = list(train_docs)
|
||||||
for i in range(n_iter):
|
for i in range(n_iter):
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||||
train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
|
|
||||||
gold_preproc=gold_preproc, max_length=0)
|
|
||||||
losses = {}
|
losses = {}
|
||||||
for batch in minibatch(train_docs, size=batch_sizes):
|
for batch in minibatch(train_docs, size=batch_sizes):
|
||||||
docs, golds = zip(*batch)
|
docs, golds = zip(*batch)
|
||||||
|
@ -124,26 +119,6 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _resume_model(lang, pipeline, corpus):
|
|
||||||
nlp = util.load_model(lang)
|
|
||||||
pipes = {getattr(pipe, 'name', None) for pipe in nlp.pipeline}
|
|
||||||
for name in pipeline:
|
|
||||||
if name not in pipes:
|
|
||||||
factory = nlp.Defaults.factories[name]
|
|
||||||
for pipe in factory(nlp):
|
|
||||||
if hasattr(pipe, 'begin_training'):
|
|
||||||
pipe.begin_training(corpus.train_tuples,
|
|
||||||
pipeline=nlp.pipeline)
|
|
||||||
nlp.pipeline.append(pipe)
|
|
||||||
nlp.meta['pipeline'] = pipeline
|
|
||||||
if nlp.vocab.vectors.data.shape[1] >= 1:
|
|
||||||
nlp.vocab.vectors.data = Model.ops.asarray(
|
|
||||||
nlp.vocab.vectors.data)
|
|
||||||
|
|
||||||
return nlp
|
|
||||||
|
|
||||||
|
|
||||||
def _render_parses(i, to_render):
|
def _render_parses(i, to_render):
|
||||||
to_render[0].user_data['title'] = "Batch %d" % i
|
to_render[0].user_data['title'] = "Batch %d" % i
|
||||||
with Path('/tmp/entities.html').open('w') as file_:
|
with Path('/tmp/entities.html').open('w') as file_:
|
||||||
|
|
Loading…
Reference in New Issue