mirror of https://github.com/explosion/spaCy.git
Adding insensitive lemmatisation test
This commit is contained in:
parent
26e31afc18
commit
ed5f094451
|
@ -19,6 +19,7 @@ class FrenchDefaults(BaseDefaults):
|
||||||
token_match = TOKEN_MATCH
|
token_match = TOKEN_MATCH
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create_tokenizer(cls, nlp=None):
|
def create_tokenizer(cls, nlp=None):
|
||||||
cls.tokenizer_exceptions = get_tokenizer_exceptions()
|
cls.tokenizer_exceptions = get_tokenizer_exceptions()
|
||||||
|
|
|
@ -28,4 +28,14 @@ def test_tokenizer_verb_noun(fr_tokenizer):
|
||||||
tokens = fr_tokenizer(text)
|
tokens = fr_tokenizer(text)
|
||||||
assert len(tokens) == 11
|
assert len(tokens) == 11
|
||||||
assert tokens[1].lemma_ == "valider"
|
assert tokens[1].lemma_ == "valider"
|
||||||
assert tokens[3].lemma_ == "notes"
|
assert tokens[3].lemma_ == "notes"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
def test_tokenizer_verb_noun_insensitive(fr_tokenizer):
    """Check that lemmatisation does not depend on the case of the word.

    The capitalised "Costaricaines" and the lower-case "costaricains" are
    both expected to yield the lemma "costaricain". The test is marked
    xfail, i.e. these assertions are not expected to pass yet.
    """
    text = "Les Costaricaines et les costaricains sont jolies"
    tokens = fr_tokenizer(text)
    # The sentence is seven whitespace-separated words with no punctuation
    # or elisions, so the indices used below (1 and 4) address the two
    # demonym forms.
    assert len(tokens) == 7
    assert tokens[1].lemma_ == "costaricain"
    assert tokens[4].lemma_ == "costaricain"
|
||||||
|
|
Loading…
Reference in New Issue