diff --git a/spacy/fr/__init__.py b/spacy/fr/__init__.py
index 81584b926..33ac0e53a 100644
--- a/spacy/fr/__init__.py
+++ b/spacy/fr/__init__.py
@@ -7,6 +7,7 @@ from ..language import Language
 from ..attrs import LANG
 
 from .language_data import *
+from .punctuation import TOKENIZER_INFIXES
 
 
 class French(Language):
@@ -18,3 +19,4 @@ class French(Language):
 
         tokenizer_exceptions = TOKENIZER_EXCEPTIONS
         stop_words = STOP_WORDS
+        infixes = tuple(TOKENIZER_INFIXES)
diff --git a/spacy/fr/punctuation.py b/spacy/fr/punctuation.py
new file mode 100644
index 000000000..ee4e5a861
--- /dev/null
+++ b/spacy/fr/punctuation.py
@@ -0,0 +1,25 @@
+# encoding: utf8
+
+from __future__ import unicode_literals
+
+from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES
+
+
+# Apostrophe variants treated as elision marks; whitespace in the raw
+# string only separates the characters and is stripped out below.
+_ELISION = " ' ’ "
+ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')
+
+# Build a fresh list instead of using `+=`: augmented assignment on a list
+# extends it in place, which would also alter the shared base list imported
+# from ..language_data.punctuation.
+TOKENIZER_INFIXES = list(TOKENIZER_INFIXES) + [
+    # Zero-width split point right after an ALPHA character followed by an
+    # elision mark, when another ALPHA character comes next.
+    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
+]
+
+
+# Export only names this module defines; listing an undefined name here
+# makes `from .punctuation import *` raise AttributeError.
+__all__ = ["TOKENIZER_INFIXES"]