From 3297a19545027c8d8550b1ae793ce290567eff32 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 1 Oct 2019 15:13:55 +0200
Subject: [PATCH] Warn in Tagger.begin_training if no lemma tables are
 available (#4351)

---
 spacy/errors.py                     |  7 +++++++
 spacy/pipeline/pipes.pyx            |  5 ++++-
 spacy/tests/pipeline/test_tagger.py | 22 ++++++++++++++++++++++
 3 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 spacy/tests/pipeline/test_tagger.py

diff --git a/spacy/errors.py b/spacy/errors.py
index 30c7a5f48..93d42aa4c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -88,6 +88,13 @@ class Warnings(object):
             "loaded. (Shape: {shape})")
     W021 = ("Unexpected hash collision in PhraseMatcher. Matches may be "
             "incorrect. Modify PhraseMatcher._terminal_hash to fix.")
+    W022 = ("Training a new part-of-speech tagger using a model with no "
+            "lemmatization rules or data. This means that the trained model "
+            "may not be able to lemmatize correctly. If this is intentional "
+            "or the language you're using doesn't have lemmatization data, "
+            "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
+            "If this is surprising, make sure you have the spacy-lookups-data "
+            "package installed.")
 
 
 @add_codes
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 9ac3affc9..53fa650e2 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -30,7 +30,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer
-from ..errors import Errors, TempErrors
+from ..errors import Errors, TempErrors, user_warning, Warnings
 from .. import util
 
 
@@ -501,6 +501,9 @@ class Tagger(Pipe):
 
     def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                        **kwargs):
+        lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
+        if not any(table in self.vocab.lookups for table in lemma_tables):
+            user_warning(Warnings.W022)
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = OrderedDict()
         for raw_text, annots_brackets in get_gold_tuples():
diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py
new file mode 100644
index 000000000..e843723e1
--- /dev/null
+++ b/spacy/tests/pipeline/test_tagger.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.lang.en import English
+from spacy.lookups import Lookups
+
+
+def test_tagger_warns_no_lemma_lookups():  # tagger training warns only when no lemma table exists
+    nlp = English()
+    nlp.vocab.lookups = Lookups()  # replace the vocab lookups with a newly constructed container
+    assert not len(nlp.vocab.lookups)  # precondition: no tables are registered at all
+    tagger = nlp.create_pipe("tagger")
+    with pytest.warns(UserWarning):  # calling the component directly must warn
+        tagger.begin_training()
+    nlp.add_pipe(tagger)
+    with pytest.warns(UserWarning):  # the warning must also surface through nlp.begin_training
+        nlp.begin_training()
+    nlp.vocab.lookups.add_table("lemma_lookup")  # one of the table names Tagger.begin_training checks
+    with pytest.warns(None) as record:  # capture any warnings into `record`
+        nlp.begin_training()
+        assert not record.list  # nothing should be recorded once a lemma table is present