Adding case-insensitive lemmatisation test

Gregory Howard 2017-04-25 18:07:02 +02:00
parent 26e31afc18
commit ed5f094451
2 changed files with 12 additions and 1 deletion


@@ -19,6 +19,7 @@ class FrenchDefaults(BaseDefaults):
     token_match = TOKEN_MATCH
 
     @classmethod
     def create_tokenizer(cls, nlp=None):
+        cls.tokenizer_exceptions = get_tokenizer_exceptions()
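For context: the added line defers building the French tokenizer-exception table until a tokenizer is actually created, rather than at module import time. A minimal sketch of that pattern, assuming a spaCy 1.x-style Defaults class where create_tokenizer assembles the tokenizer from class attributes; the class and the names shown come from the diff, while the super() call and the surrounding body are reconstructed for illustration:

    class FrenchDefaults(BaseDefaults):
        token_match = TOKEN_MATCH

        @classmethod
        def create_tokenizer(cls, nlp=None):
            # Build the generated exception table lazily, only when a
            # tokenizer is requested, so importing the module stays cheap.
            cls.tokenizer_exceptions = get_tokenizer_exceptions()
            return super(FrenchDefaults, cls).create_tokenizer(nlp)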


@@ -28,4 +28,14 @@ def test_tokenizer_verb_noun(fr_tokenizer):
     tokens = fr_tokenizer(text)
     assert len(tokens) == 11
     assert tokens[1].lemma_ == "valider"
-    assert tokens[3].lemma_ == "notes"
+    assert tokens[3].lemma_ == "notes"
+
+
+@pytest.mark.xfail
+def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
+    # Lemmas should be found regardless of the token's casing
+    text = "Les Costaricaines et les costaricains sont jolies"
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 7
+    assert tokens[1].lemma_ == "costaricain"
+    assert tokens[4].lemma_ == "costaricain"
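The xfail marks behaviour the French lemmatiser does not provide yet: a lookup that succeeds regardless of the token's casing. A minimal sketch of such a fallback, assuming a plain dict-based lookup table keyed by lowercase forms; LOOKUP and lemmatize are hypothetical names for illustration, not spaCy's API:

    # Hypothetical excerpt of a lemma lookup table; the real entries
    # live in spaCy's French lemmatiser data.
    LOOKUP = {
        "costaricaines": "costaricain",
        "costaricains": "costaricain",
    }

    def lemmatize(string):
        # Try the surface form first, then fall back to the lowercased
        # form so "Costaricaines" still resolves to "costaricain".
        if string in LOOKUP:
            return LOOKUP[string]
        return LOOKUP.get(string.lower(), string)

With that fallback, lemmatize("Costaricaines") returns "costaricain", which is what the currently failing assertions above expect.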