mirror of https://github.com/explosion/spaCy.git
Adding case-insensitive lemmatisation test
parent 26e31afc18
commit ed5f094451
@@ -19,6 +19,7 @@ class FrenchDefaults(BaseDefaults):
     token_match = TOKEN_MATCH

     @classmethod
     def create_tokenizer(cls, nlp=None):
         cls.tokenizer_exceptions = get_tokenizer_exceptions()
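For reference, the create_tokenizer hook shown above installs the French tokenizer exceptions before the tokenizer is built. A minimal sketch of how it is exercised, assuming the spaCy 1.x package layout (the import path and the sample sentence are illustrative assumptions, not part of the commit):

    from spacy.fr import FrenchDefaults  # assumed import path (spaCy 1.x layout)

    # create_tokenizer first assigns cls.tokenizer_exceptions, so the
    # returned tokenizer already knows about French-specific exceptions.
    tokenizer = FrenchDefaults.create_tokenizer()
    doc = tokenizer("Les propositions ont été validées aujourd'hui.")
    print([t.text for t in doc])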
@@ -28,4 +28,14 @@ def test_tokenizer_verb_noun(fr_tokenizer):
     tokens = fr_tokenizer(text)
     assert len(tokens) == 11
     assert tokens[1].lemma_ == "valider"
     assert tokens[3].lemma_ == "notes"
+
+
+@pytest.mark.xfail
+def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
+    # The title-cased and lower-cased forms should share the lemma "costaricain"
+    text = "Les Costaricaines et les costaricains sont jolies"
+    tokens = fr_tokenizer(text)
+    assert len(tokens) == 7
+    assert tokens[1].lemma_ == "costaricain"
+    assert tokens[4].lemma_ == "costaricain"
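The fr_tokenizer fixture that both tests receive comes from the test suite's conftest, which is not part of this diff. A plausible reconstruction under the same assumptions as above (the fixture name matches the tests; the body is an assumption):

    import pytest

    from spacy.fr import FrenchDefaults  # assumed import path (spaCy 1.x layout)

    @pytest.fixture
    def fr_tokenizer():
        # Hands each test a freshly built French tokenizer, with the
        # exceptions installed by create_tokenizer (see the first hunk).
        return FrenchDefaults.create_tokenizer()

The @pytest.mark.xfail marker on the new test records that case-insensitive lemma lookup does not work yet; once lemmatisation normalises case, the test should pass and the marker can be removed.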