mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' into feature/omit-extra-lexeme-info
Commit d45602bc11
spacy/cli/init_model.py

@@ -17,9 +17,10 @@ from wasabi import msg
 from ..vectors import Vectors
 from ..errors import Errors, Warnings
-from ..util import ensure_path, get_lang_class, OOV_RANK
+from ..util import ensure_path, get_lang_class, load_model, OOV_RANK
 from ..lookups import Lookups
 
 
 try:
     import ftfy
 except ImportError:
@@ -51,6 +52,7 @@ DEFAULT_OOV_PROB = -20
     ),
     model_name=("Optional name for the model meta", "option", "mn", str),
     omit_extra_lookups=("Don't include extra lookups in model", "flag", "OEL", bool),
+    base_model=("Base model (for languages with custom tokenizers)", "option", "b", str),
 )
 def init_model(
     lang,
@@ -64,6 +66,7 @@ def init_model(
     vectors_name=None,
     model_name=None,
     omit_extra_lookups=False,
+    base_model=None,
 ):
     """
     Create a new model from raw data, like word frequencies, Brown clusters
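Taken together, the two hunks above thread a new base_model argument from the CLI options down into init_model. A minimal usage sketch, assuming spaCy v2.x with this branch applied; the output path and the en_core_web_sm package name are illustrative assumptions, not part of the diff:

# Hypothetical usage sketch; path and package name are illustrative.
from spacy.cli import init_model

init_model(
    "en",                         # lang
    "/tmp/my-init-model",         # output directory
    model_name="my_model",        # stored in the model meta
    base_model="en_core_web_sm",  # NEW: reuse this package's tokenizer
)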
@@ -95,7 +98,7 @@ def init_model(
         lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
 
     with msg.loading("Creating model..."):
-        nlp = create_model(lang, lex_attrs, name=model_name)
+        nlp = create_model(lang, lex_attrs, name=model_name, base_model=base_model)
 
     # Create empty extra lexeme tables so the data from spacy-lookups-data
     # isn't loaded if these features are accessed
@@ -164,9 +167,16 @@ def read_attrs_from_deprecated(freqs_loc, clusters_loc):
     return lex_attrs
 
 
-def create_model(lang, lex_attrs, name=None):
-    lang_class = get_lang_class(lang)
-    nlp = lang_class()
+def create_model(lang, lex_attrs, name=None, base_model=None):
+    if base_model:
+        nlp = load_model(base_model)
+        # keep the tokenizer but remove any existing pipeline components due to
+        # potentially conflicting vectors
+        for pipe in nlp.pipe_names:
+            nlp.remove_pipe(pipe)
+    else:
+        lang_class = get_lang_class(lang)
+        nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = OOV_RANK
     for attrs in lex_attrs:
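At the function level, the effect of the new branch is easiest to see side by side. A minimal sketch of the two create_model() paths after this merge; en_core_web_sm stands in for any installed package with a (possibly custom) tokenizer and is an illustrative assumption:

# Hypothetical sketch; "en_core_web_sm" is an illustrative installed package.
from spacy.cli.init_model import create_model

# Default path: a blank pipeline built from the language class.
nlp_blank = create_model("en", lex_attrs=[])

# base_model path: load the installed package, keep its tokenizer, and
# remove every pipeline component so nothing conflicts with the vectors
# that init-model adds afterwards.
nlp_based = create_model("en", lex_attrs=[], base_model="en_core_web_sm")
assert nlp_based.pipe_names == []  # components stripped, tokenizer kept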

spacy/syntax/nn_parser.pyx

@@ -9,7 +9,6 @@ import numpy
 cimport cython.parallel
 import numpy.random
 cimport numpy as np
-from itertools import islice
 from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.math cimport exp
@@ -621,15 +620,15 @@ cdef class Parser:
         self.model, cfg = self.Model(self.moves.n_moves, **cfg)
         if sgd is None:
             sgd = self.create_optimizer()
-        doc_sample = []
-        gold_sample = []
-        for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
+        docs = []
+        golds = []
+        for raw_text, annots_brackets in get_gold_tuples():
             for annots, brackets in annots_brackets:
                 ids, words, tags, heads, deps, ents = annots
-                doc_sample.append(Doc(self.vocab, words=words))
-                gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
-                                             heads=heads, deps=deps, entities=ents))
-        self.model.begin_training(doc_sample, gold_sample)
+                docs.append(Doc(self.vocab, words=words))
+                golds.append(GoldParse(docs[-1], words=words, tags=tags,
+                                       heads=heads, deps=deps, entities=ents))
+        self.model.begin_training(docs, golds)
         if pipeline is not None:
             self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
         link_vectors_to_models(self.vocab)
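The parser-side change travels in the opposite direction: the merge drops the islice cap, so begin_training once again materializes a Doc/GoldParse pair for every gold tuple instead of stopping after 1000. A standalone sketch of what the cap did; the get_gold_tuples stand-in below is illustrative, not spaCy's API:

# Self-contained sketch of capped vs. full iteration over gold tuples.
from itertools import islice

def get_gold_tuples():
    # Stand-in generator; spaCy's version yields (raw_text, annotations) pairs.
    for i in range(5000):
        yield ("text %d" % i, [])

# Pre-merge behaviour: initialization saw at most 1000 samples.
capped = list(islice(get_gold_tuples(), 1000))
assert len(capped) == 1000

# Behaviour restored by this merge: every gold tuple is materialized.
full = list(get_gold_tuples())
assert len(full) == 5000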