diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 18589a954..7fdd39932 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -17,9 +17,10 @@ from wasabi import msg from ..vectors import Vectors from ..errors import Errors, Warnings -from ..util import ensure_path, get_lang_class, OOV_RANK +from ..util import ensure_path, get_lang_class, load_model, OOV_RANK from ..lookups import Lookups + try: import ftfy except ImportError: @@ -51,6 +52,7 @@ DEFAULT_OOV_PROB = -20 ), model_name=("Optional name for the model meta", "option", "mn", str), omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool), + base_model=("Base model (for languages with custom tokenizers)", "option", "b", str), ) def init_model( lang, @@ -64,6 +66,7 @@ def init_model( vectors_name=None, model_name=None, omit_extra_lookups=False, + base_model=None, ): """ Create a new model from raw data, like word frequencies, Brown clusters @@ -95,7 +98,7 @@ def init_model( lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc) with msg.loading("Creating model..."): - nlp = create_model(lang, lex_attrs, name=model_name) + nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model) # Create empty extra lexeme tables so the data from spacy-lookups-data # isn't loaded if these features are accessed @@ -164,9 +167,16 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc): return lex_attrs -def create_model(lang, lex_attrs, name=None): - lang_class = get_lang_class(lang) - nlp = lang_class() +def create_model(lang, lex_attrs, name=None, base_model=None): + if base_model: + nlp = load_model(base_model) + # keep the tokenizer but remove any existing pipeline components due to + # potentially conflicting vectors + for pipe in nlp.pipe_names: + nlp.remove_pipe(pipe) + else: + lang_class = get_lang_class(lang) + nlp = lang_class() for lexeme in nlp.vocab: lexeme.rank = OOV_RANK for attrs in lex_attrs: diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index d5c6bf2a8..fafa492c6 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -9,7 +9,6 @@ import numpy cimport cython.parallel import numpy.random cimport numpy as np -from itertools import islice from cpython.ref cimport PyObject, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from libc.math cimport exp @@ -621,15 +620,15 @@ cdef class Parser: self.model, cfg = self.Model(self.moves.n_moves, **cfg) if sgd is None: sgd = self.create_optimizer() - doc_sample = [] - gold_sample = [] - for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): + docs = [] + golds = [] + for raw_text, annots_brackets in get_gold_tuples(): for annots, brackets in annots_brackets: ids, words, tags, heads, deps, ents = annots - doc_sample.append(Doc(self.vocab, words=words)) - gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags, - heads=heads, deps=deps, entities=ents)) - self.model.begin_training(doc_sample, gold_sample) + docs.append(Doc(self.vocab, words=words)) + golds.append(GoldParse(docs[-1], words=words, tags=tags, + heads=heads, deps=deps, entities=ents)) + self.model.begin_training(docs, golds) if pipeline is not None: self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg) link_vectors_to_models(self.vocab)