Merge branch 'master' into feature/omit-extra-lexeme-info

adrianeboyd 2020-05-21 10:26:01 +02:00 committed by GitHub
commit d45602bc11
2 changed files with 22 additions and 13 deletions


@@ -17,9 +17,10 @@ from wasabi import msg
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
-from ..util import ensure_path, get_lang_class, OOV_RANK
+from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
 from ..lookups import Lookups


 try:
     import ftfy
 except ImportError:
@@ -51,6 +52,7 @@ DEFAULT_OOV_PROB = -20
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
     omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
+    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
 )
 def init_model(
     lang,
@@ -64,6 +66,7 @@ def init_model(
     vectors_name=None,
     model_name=None,
     omit_extra_lookups=False,
+    base_model=None,
 ):
     """
     Create a new model from raw data, like word frequencies, Brown clusters
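
Taken together, these two hunks wire the new option through from the CLI annotation (short flag -b, per the plac tuple above) to the function signature. A minimal usage sketch, assuming spaCy 2.x exports init_model from spacy.cli as usual; the language code, output path, and base model name here are hypothetical:

    # Sketch: initialize a new model on top of an existing base model,
    # reusing that model's custom tokenizer. Names and paths are hypothetical.
    from spacy.cli import init_model

    init_model(
        "xx",                        # language code for the new model
        "/tmp/xx_model",             # output directory
        base_model="xx_custom_tok",  # keep this model's tokenizer
    )

The same call from the shell would use the new flag, e.g. python -m spacy init-model xx /tmp/xx_model -b xx_custom_tok.
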
@@ -95,7 +98,7 @@ def init_model(
         lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)

     with msg.loading("Creating model..."):
-        nlp = create_model(lang, lex_attrs, name=model_name)
+        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)

     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
@@ -164,7 +167,14 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
     return lex_attrs


-def create_model(lang, lex_attrs, name=None):
-    lang_class = get_lang_class(lang)
-    nlp = lang_class()
+def create_model(lang, lex_attrs, name=None, base_model=None):
+    if base_model:
+        nlp = load_model(base_model)
+        # keep the tokenizer but remove any existing pipeline components due to
+        # potentially conflicting vectors
+        for pipe in nlp.pipe_names:
+            nlp.remove_pipe(pipe)
+    else:
+        lang_class = get_lang_class(lang)
+        nlp = lang_class()
     for lexeme in nlp.vocab:
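
The branch in create_model is the core of the feature: when a base model is given, its vocab and tokenizer are kept while every pipeline component is stripped, so inherited components cannot conflict with the vectors about to be initialized. A standalone sketch of the same pattern, assuming a hypothetical installed model xx_custom_tok:

    # Sketch of the tokenizer-reuse pattern above: load a base model, then
    # remove all of its pipeline components; vocab and tokenizer survive.
    import spacy

    nlp = spacy.load("xx_custom_tok")  # hypothetical base model
    for pipe in list(nlp.pipe_names):  # snapshot the names before removing
        nlp.remove_pipe(pipe)
    assert nlp.pipe_names == []        # the custom tokenizer is still in place
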


@@ -9,7 +9,6 @@ import numpy
 cimport cython.parallel
 import numpy.random
 cimport numpy as np
-from itertools import islice
 from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.math cimport exp
@@ -621,15 +620,15 @@ cdef class Parser:
             self.model, cfg = self.Model(self.moves.n_moves, **cfg)
             if sgd is None:
                 sgd = self.create_optimizer()
-            doc_sample = []
-            gold_sample = []
-            for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
+            docs = []
+            golds = []
+            for raw_text, annots_brackets in get_gold_tuples():
                 for annots, brackets in annots_brackets:
                     ids, words, tags, heads, deps, ents = annots
-                    doc_sample.append(Doc(self.vocab, words=words))
-                    gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
-                                                 heads=heads, deps=deps, entities=ents))
-            self.model.begin_training(doc_sample, gold_sample)
+                    docs.append(Doc(self.vocab, words=words))
+                    golds.append(GoldParse(docs[-1], words=words, tags=tags,
+                                           heads=heads, deps=deps, entities=ents))
+            self.model.begin_training(docs, golds)
             if pipeline is not None:
                 self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
             link_vectors_to_models(self.vocab)
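
On the parser side, the merge drops the islice import and cap, so begin_training builds Doc/GoldParse pairs from every gold tuple rather than only the first 1000. A small sketch of the iteration difference, using a hypothetical stand-in for the real gold-tuple generator:

    # Sketch: what removing the islice(...) call changes. With islice, only
    # the first 1000 examples were materialized; without it, everything is.
    from itertools import islice

    def get_gold_tuples():  # hypothetical stand-in generator
        for i in range(5000):
            yield ("raw text %d" % i, [])

    capped = list(islice(get_gold_tuples(), 1000))  # old: first 1000 examples
    full = list(get_gold_tuples())                  # new: all examples
    assert (len(capped), len(full)) == (1000, 5000)
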