From ed5f094451fcbb043f010c613bb8287f7ba1a836 Mon Sep 17 00:00:00 2001
From: Gregory Howard
Date: Tue, 25 Apr 2017 18:07:02 +0200
Subject: [PATCH] Add case-insensitive lemmatisation test

---
 spacy/fr/__init__.py                 |  1 +
 spacy/tests/fr/test_lemmatization.py | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/spacy/fr/__init__.py b/spacy/fr/__init__.py
index cd1c15dc2..9bb9a434d 100644
--- a/spacy/fr/__init__.py
+++ b/spacy/fr/__init__.py
@@ -19,6 +19,7 @@ class FrenchDefaults(BaseDefaults):
 
     token_match = TOKEN_MATCH
 
+    @classmethod
     def create_tokenizer(cls, nlp=None):
         cls.tokenizer_exceptions = get_tokenizer_exceptions()
 
diff --git a/spacy/tests/fr/test_lemmatization.py b/spacy/tests/fr/test_lemmatization.py
index e90115549..7b5779e18 100644
--- a/spacy/tests/fr/test_lemmatization.py
+++ b/spacy/tests/fr/test_lemmatization.py
@@ -28,4 +28,14 @@ def test_tokenizer_verb_noun(fr_tokenizer):
     tokens = fr_tokenizer(text)
     assert len(tokens) == 11
     assert tokens[1].lemma_ == "valider"
-    assert tokens[3].lemma_ == "notes"
\ No newline at end of file
+    assert tokens[3].lemma_ == "notes"
+
+
+@pytest.mark.xfail
+def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
+    # Lemmatisation should be case-insensitive: both surface forms
+    # ("Costaricaines" and "costaricains") share the lemma "costaricain"
+    text = "Les Costaricaines et les costaricains sont jolies"
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 7
+    assert tokens[1].lemma_ == "costaricain"
+    assert tokens[4].lemma_ == "costaricain"
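
For context, the new test depends on a fr_tokenizer pytest fixture provided by the test suite's conftest, and the @classmethod decorator added above is what lets create_tokenizer be invoked directly on the defaults class from such a fixture. A minimal sketch of what that fixture could look like, assuming the 2017-era spacy.util.get_lang_class helper (the real fixture lives in spaCy's test conftest and may differ):

import pytest

from spacy import util


@pytest.fixture
def fr_tokenizer():
    # create_tokenizer is called on the Defaults class itself (no instance),
    # which is why the patch marks it as a @classmethod.
    return util.get_lang_class('fr').Defaults.create_tokenizer()

With such a fixture in place, the xfail marker lets the new assertions document the intended case-insensitive behaviour before the French lemmatizer actually supports it.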