From 0967eb07bea28d84bac696de2c5ea6630424d92a Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Mon, 23 Jan 2017 21:25:46 +0100
Subject: [PATCH] Add regression test for #768

---
 spacy/tests/regression/test_issue768.py | 36 +++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue768.py

diff --git a/spacy/tests/regression/test_issue768.py b/spacy/tests/regression/test_issue768.py
new file mode 100644
index 000000000..d8c8be80b
--- /dev/null
+++ b/spacy/tests/regression/test_issue768.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ...language import Language
+from ...attrs import LANG
+from ...fr.language_data import TOKENIZER_EXCEPTIONS, STOP_WORDS
+from ...language_data.punctuation import TOKENIZER_INFIXES, ALPHA
+
+import pytest
+
+
+@pytest.fixture
+def fr_tokenizer_w_infix():
+    SPLIT_INFIX = r'(?<=[{a}]\')(?=[{a}])'.format(a=ALPHA)
+
+    # create a new Language subclass to add to the default infixes
+    class French(Language):
+        lang = 'fr'
+
+        class Defaults(Language.Defaults):
+            lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+            lex_attr_getters[LANG] = lambda text: 'fr'
+            tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+            stop_words = STOP_WORDS
+            infixes = TOKENIZER_INFIXES + [SPLIT_INFIX]
+
+    return French.Defaults.create_tokenizer()
+
+
+@pytest.mark.parametrize('text,expected_tokens', [("l'avion", ["l'", "avion"]),
+                                                  ("j'ai", ["j'", "ai"])])
+def test_issue768(fr_tokenizer_w_infix, text, expected_tokens):
+    """Allow zero-width 'infix' token during the tokenization process."""
+    tokens = fr_tokenizer_w_infix(text)
+    assert len(tokens) == 2
+    assert [t.text for t in tokens] == expected_tokens
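
Note on the pattern under test: SPLIT_INFIX matches between characters
(after a letter followed by an apostrophe, and before the next letter), so
the match itself has zero width; this is the zero-width 'infix' token the
docstring refers to. Below is a minimal standalone sketch of that behaviour
using only the standard library. The ALPHA value here is a simplified
stand-in introduced for illustration; the real character class imported
from spaCy's language_data.punctuation covers far more of Unicode.

import re

# Simplified stand-in for spaCy's ALPHA character class; an assumption for
# illustration only, the real class is much broader than Latin letters.
ALPHA = 'a-zA-Z\u00C0-\u017F'
SPLIT_INFIX = r'(?<=[{a}]\')(?=[{a}])'.format(a=ALPHA)

# The lookbehind/lookahead pair consumes no characters: the match sits at
# position 2 of "l'avion", between the apostrophe and the 'a'.
match = re.search(SPLIT_INFIX, "l'avion")
assert match is not None
assert match.start() == match.end() == 2

# Splitting on the zero-width match produces the tokens the test expects
# (re.split supports zero-width patterns from Python 3.7 onwards).
assert re.split(SPLIT_INFIX, "l'avion") == ["l'", "avion"]
assert re.split(SPLIT_INFIX, "j'ai") == ["j'", "ai"]

Roughly speaking, spaCy compiles the infix list into a single regex and
treats each match as a split point, so an empty match marks a token
boundary without consuming any text; that is what both parametrized cases
in the test assert.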