From e93d43a43a03ed207a7d9efe5817adb0afb0ef82 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 22 Sep 2017 20:00:40 -0500
Subject: [PATCH] Fix training with preset vectors

---
 spacy/cli/train.py | 45 ++++++++++-----------------------------------
 1 file changed, 10 insertions(+), 35 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 801706614..96233406d 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -30,14 +30,14 @@ from ..compat import json_dumps
     n_iter=("number of iterations", "option", "n", int),
     n_sents=("number of sentences", "option", "ns", int),
     use_gpu=("Use GPU", "option", "g", int),
-    resume=("Whether to resume training", "flag", "R", bool),
+    vectors=("Model to load vectors from", "option", "v"),
     no_tagger=("Don't train tagger", "flag", "T", bool),
     no_parser=("Don't train parser", "flag", "P", bool),
     no_entities=("Don't train NER", "flag", "N", bool),
     gold_preproc=("Use gold preprocessing", "flag", "G", bool),
 )
 def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
-          use_gpu=-1, resume=False, no_tagger=False, no_parser=False, no_entities=False,
+          use_gpu=-1, vectors=None, no_tagger=False, no_parser=False, no_entities=False,
           gold_preproc=False):
     """
     Train a model. Expects data in spaCy's JSON format.
@@ -73,25 +73,20 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
     corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
     n_train_words = corpus.count_train()
 
-    if not resume:
-        lang_class = util.get_lang_class(lang)
-        nlp = lang_class(pipeline=pipeline)
-        optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
-    else:
-        print("Load resume")
-        util.use_gpu(use_gpu)
-        nlp = _resume_model(lang, pipeline, corpus)
-        optimizer = nlp.resume_training(device=use_gpu)
-        lang_class = nlp.__class__
-
+    lang_class = util.get_lang_class(lang)
+    nlp = lang_class(pipeline=pipeline)
+    if vectors:
+        util.load_model(vectors, vocab=nlp.vocab)
+    optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
     nlp._optimizer = None
 
     print("Itn.\tLoss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %")
     try:
+        train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
+                                       gold_preproc=gold_preproc, max_length=0)
+        train_docs = list(train_docs)
         for i in range(n_iter):
             with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
-                train_docs = corpus.train_docs(nlp, projectivize=True, noise_level=0.0,
-                                               gold_preproc=gold_preproc, max_length=0)
                 losses = {}
                 for batch in minibatch(train_docs, size=batch_sizes):
                     docs, golds = zip(*batch)
@@ -124,26 +119,6 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0,
         except:
             pass
 
-
-def _resume_model(lang, pipeline, corpus):
-    nlp = util.load_model(lang)
-    pipes = {getattr(pipe, 'name', None) for pipe in nlp.pipeline}
-    for name in pipeline:
-        if name not in pipes:
-            factory = nlp.Defaults.factories[name]
-            for pipe in factory(nlp):
-                if hasattr(pipe, 'begin_training'):
-                    pipe.begin_training(corpus.train_tuples,
-                                        pipeline=nlp.pipeline)
-                nlp.pipeline.append(pipe)
-    nlp.meta['pipeline'] = pipeline
-    if nlp.vocab.vectors.data.shape[1] >= 1:
-        nlp.vocab.vectors.data = Model.ops.asarray(
-            nlp.vocab.vectors.data)
-
-    return nlp
-
-
 def _render_parses(i, to_render):
     to_render[0].user_data['title'] = "Batch %d" % i
     with Path('/tmp/entities.html').open('w') as file_:
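
Usage note (not part of the patch; the output directory and vectors package below are placeholders): with this change the CLI always builds a fresh pipeline, and when the new --vectors/-v option is given it merges that model's vectors into the vocab via util.load_model(vectors, vocab=nlp.vocab) before nlp.begin_training() runs, so an invocation would look roughly like:

    python -m spacy train en /tmp/en_model train.json dev.json --vectors en_core_web_md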