diff --git a/spacy/fr/__init__.py b/spacy/fr/__init__.py index cd1c15dc2..9bb9a434d 100644 --- a/spacy/fr/__init__.py +++ b/spacy/fr/__init__.py @@ -19,6 +19,7 @@ class FrenchDefaults(BaseDefaults): token_match = TOKEN_MATCH + @classmethod def create_tokenizer(cls, nlp=None): cls.tokenizer_exceptions = get_tokenizer_exceptions() diff --git a/spacy/tests/fr/test_lemmatization.py b/spacy/tests/fr/test_lemmatization.py index e90115549..7b5779e18 100644 --- a/spacy/tests/fr/test_lemmatization.py +++ b/spacy/tests/fr/test_lemmatization.py @@ -28,4 +28,14 @@ def test_tokenizer_verb_noun(fr_tokenizer): tokens = fr_tokenizer(text) assert len(tokens) == 11 assert tokens[1].lemma_ == "valider" - assert tokens[3].lemma_ == "notes" \ No newline at end of file + assert tokens[3].lemma_ == "notes" + + +@pytest.mark.xfail +def test_tokenizer_verb_noun_insensitive(fr_tokenizer): + # Check that lemmatization handles case variants ("Costaricaines"/"costaricains") + text = "Les Costaricaines et les costaricains sont jolies" + tokens = fr_tokenizer(text) + assert len(tokens) == 11 + assert tokens[1].lemma_ == "costaricain" + assert tokens[4].lemma_ == "costaricain"