Mirror of https://github.com/explosion/spaCy.git
Improve warnings around normalization tables (#5794)
Provide more customized normalization table warnings when training a new model. Only suggest installing `spacy-lookups-data` if it's not already installed and it includes a table for this language (currently checked in a hard-coded list).
This commit is contained in:
parent
bf24f7f672
commit
038ff1a811
|
@ -127,13 +127,12 @@ class Warnings(object):
|
||||||
"this, download a newer compatible model or retrain your custom "
|
"this, download a newer compatible model or retrain your custom "
|
||||||
"model with the current spaCy version. For more details and "
|
"model with the current spaCy version. For more details and "
|
||||||
"available updates, run: python -m spacy validate")
|
"available updates, run: python -m spacy validate")
|
||||||
W033 = ("Training a new {model} using a model with no lexeme normalization "
|
W033 = ("Training a new {model} using a model with an empty lexeme "
|
||||||
"table. This may degrade the performance of the model to some "
|
"normalization table. This may degrade the performance to some "
|
||||||
"degree. If this is intentional or the language you're using "
|
"degree. If this is intentional or this language doesn't have a "
|
||||||
"doesn't have a normalization table, please ignore this warning. "
|
"normalization table, please ignore this warning.")
|
||||||
"If this is surprising, make sure you have the spacy-lookups-data "
|
W034 = ("Please install the package spacy-lookups-data in order to include "
|
||||||
"package installed. The languages with lexeme normalization tables "
|
"the default lexeme normalization table for the language '{lang}'.")
|
||||||
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
|
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -519,6 +519,12 @@ class Tagger(Pipe):
|
||||||
warnings.warn(Warnings.W022)
|
warnings.warn(Warnings.W022)
|
||||||
if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
|
if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
|
||||||
warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
|
warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
|
||||||
|
try:
|
||||||
|
import spacy_lookups_data
|
||||||
|
except ImportError:
|
||||||
|
if self.vocab.lang in ("da", "de", "el", "en", "id", "lb", "pt",
|
||||||
|
"ru", "sr", "ta", "th"):
|
||||||
|
warnings.warn(Warnings.W034.format(lang=self.vocab.lang))
|
||||||
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
orig_tag_map = dict(self.vocab.morphology.tag_map)
|
||||||
new_tag_map = OrderedDict()
|
new_tag_map = OrderedDict()
|
||||||
for raw_text, annots_brackets in get_gold_tuples():
|
for raw_text, annots_brackets in get_gold_tuples():
|
||||||
|
|
|
@ -604,6 +604,12 @@ cdef class Parser:
|
||||||
def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
|
def begin_training(self, get_gold_tuples, pipeline=None, sgd=None, **cfg):
|
||||||
if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
|
if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
|
||||||
warnings.warn(Warnings.W033.format(model="parser or NER"))
|
warnings.warn(Warnings.W033.format(model="parser or NER"))
|
||||||
|
try:
|
||||||
|
import spacy_lookups_data
|
||||||
|
except ImportError:
|
||||||
|
if self.vocab.lang in ("da", "de", "el", "en", "id", "lb", "pt",
|
||||||
|
"ru", "sr", "ta", "th"):
|
||||||
|
warnings.warn(Warnings.W034.format(lang=self.vocab.lang))
|
||||||
if 'model' in cfg:
|
if 'model' in cfg:
|
||||||
self.model = cfg['model']
|
self.model = cfg['model']
|
||||||
if not hasattr(get_gold_tuples, '__call__'):
|
if not hasattr(get_gold_tuples, '__call__'):
|
||||||
|
|
Loading…
Reference in New Issue