From 3297a19545027c8d8550b1ae793ce290567eff32 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Tue, 1 Oct 2019 15:13:55 +0200
Subject: [PATCH] Warn in Tagger.begin_training if no lemma tables are available (#4351)

---
Reviewer notes (free text in this section is not part of the commit):
- Tagger.begin_training now calls user_warning(Warnings.W022) when none of
  the tables "lemma_rules", "lemma_index", "lemma_exc" or "lemma_lookup" is
  present in self.vocab.lookups.
- The new test checks that a UserWarning is raised with an empty Lookups
  (via the tagger directly and via nlp.begin_training), and that no warning
  is recorded once a "lemma_lookup" table has been added.

 spacy/errors.py                     |  7 +++++++
 spacy/pipeline/pipes.pyx            |  5 ++++-
 spacy/tests/pipeline/test_tagger.py | 22 ++++++++++++++++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/pipeline/test_tagger.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 30c7a5f48..93d42aa4c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -88,6 +88,13 @@ class Warnings(object):
             "loaded. (Shape: {shape})")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
+    W022 = ("Training a new part-of-speech tagger using a model with no "
+            "lemmatization rules or data. This means that the trained model "
+            "may not be able to lemmatize correctly. If this is intentional "
+            "or the language you're using doesn't have lemmatization data, "
+            "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
+            "If this is surprising, make sure you have the spacy-lookups-data "
+            "package installed.")
 
 
 @add_codes
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 9ac3affc9..53fa650e2 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -30,7 +30,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer
-from ..errors import Errors, TempErrors
+from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util
 
 
@@ -501,6 +501,9 @@ class Tagger(Pipe):
 
     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
+        lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
+        if not any(table in self.vocab.lookups for table in lemma_tables):
+            user_warning(Warnings.W022)
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
         for raw_text, annots_brackets in get_gold_tuples():
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
new file mode 100644
index 000000000..e843723e1
--- /dev/null
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.lang.en import English
+from spacy.lookups import Lookups
+
+
+def test_tagger_warns_no_lemma_lookups():
+    nlp = English()
+    nlp.vocab.lookups = Lookups()
+    assert not len(nlp.vocab.lookups)
+    tagger = nlp.create_pipe("tagger")
+    with pytest.warns(UserWarning):
+        tagger.begin_training()
+    nlp.add_pipe(tagger)
+    with pytest.warns(UserWarning):
+        nlp.begin_training()
+    nlp.vocab.lookups.add_table("lemma_lookup")
+    with pytest.warns(None) as record:
+        nlp.begin_training()
+    assert not record.list