# coding: utf-8
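# Tests for French lemmatization. The fr_tokenizer fixture is assumed to be
# provided by the test suite's shared conftest (typically the French language
# tokenizer); the tests are marked xfail, presumably because the French
# lemmatizer does not yet handle these cases.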
from __future__ import unicode_literals

import pytest


@pytest.mark.xfail
def test_lemmatizer_verb(fr_tokenizer):
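    # The abbreviation "janv." and "prud’hommes" are each expected to remain a
    # single token, which is what gives 10 tokens for this sentence.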
    text = "Je suis allé au mois de janv. aux prud’hommes."
    tokens = fr_tokenizer(text)
    assert len(tokens) == 10
    assert tokens[2].lemma_ == "aller"


@pytest.mark.xfail
def test_tokenizer_verb_2(fr_tokenizer):
    text = "Je dois manger ce soir"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 5
    assert tokens[1].lemma_ == "devoir"


@pytest.mark.xfail
def test_tokenizer_verb_noun(fr_tokenizer):
    # This one is tricky because "notes" is a NOUN here and can also be a VERB
    text = "Nous validerons vos notes plus tard"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 6
    assert tokens[1].lemma_ == "valider"
    assert tokens[3].lemma_ == "notes"


@pytest.mark.xfail
def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
    # Tricky because the capitalized "Costaricaines" and the lowercase
    # "costaricains" should both map to the same lowercase lemma
    text = "Les Costaricaines et les costaricains sont jolies"
    tokens = fr_tokenizer(text)
    assert len(tokens) == 7
    assert tokens[1].lemma_ == "costaricain"
    assert tokens[4].lemma_ == "costaricain"