Adding case-insensitive lemmatisation test

Gregory Howard 2017-04-25 18:07:02 +02:00
parent 26e31afc18
commit ed5f094451
2 changed files with 12 additions and 1 deletion


@@ -19,6 +19,7 @@ class FrenchDefaults(BaseDefaults):
     token_match = TOKEN_MATCH
 
     @classmethod
     def create_tokenizer(cls, nlp=None):
+        cls.tokenizer_exceptions = get_tokenizer_exceptions()
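For context: the added line defers building the French tokenizer-exception table until a tokenizer is actually created, rather than at module import time. A minimal sketch of that pattern, assuming a spaCy 1.x-style Defaults class where create_tokenizer assembles the tokenizer from class attributes; the class and the names shown come from the diff, while the super() call and the surrounding body are reconstructed for illustration:

    class FrenchDefaults(BaseDefaults):
        token_match = TOKEN_MATCH

        @classmethod
        def create_tokenizer(cls, nlp=None):
            # Build the generated exception table lazily, only when a
            # tokenizer is requested, so importing the module stays cheap.
            cls.tokenizer_exceptions = get_tokenizer_exceptions()
            return super(FrenchDefaults, cls).create_tokenizer(nlp)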


@@ -28,4 +28,14 @@ def test_tokenizer_verb_noun(fr_tokenizer):
     tokens = fr_tokenizer(text)
     assert len(tokens) == 11
     assert tokens[1].lemma_ == "valider"
-    assert tokens[3].lemma_ == "notes"
+    assert tokens[3].lemma_ == "notes"
+
+
+@pytest.mark.xfail
+def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
+    # Lemmas should be found regardless of the token's casing
+    text = "Les Costaricaines et les costaricains sont jolies"
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 7
+    assert tokens[1].lemma_ == "costaricain"
+    assert tokens[4].lemma_ == "costaricain"
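The xfail marks behaviour the French lemmatiser does not provide yet: a lookup that succeeds regardless of the token's casing. A minimal sketch of such a fallback, assuming a plain dict-based lookup table keyed by lowercase forms; LOOKUP and lemmatize are hypothetical names for illustration, not spaCy's API:

    # Hypothetical excerpt of a lemma lookup table; the real entries
    # live in spaCy's French lemmatiser data.
    LOOKUP = {
        "costaricaines": "costaricain",
        "costaricains": "costaricain",
    }

    def lemmatize(string):
        # Try the surface form first, then fall back to the lowercased
        # form so "Costaricaines" still resolves to "costaricain".
        if string in LOOKUP:
            return LOOKUP[string]
        return LOOKUP.get(string.lower(), string)

With that fallback, lemmatize("Costaricaines") returns "costaricain", which is what the currently failing assertions above expect.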