From e895d1afd70555868198de1ec403728406c02e10 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 9 May 2017 00:00:54 +0200 Subject: [PATCH] Reorganise French punctuation rules --- spacy/lang/fr/punctuation.py | 38 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index 935e4c964..803afb478 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -1,29 +1,29 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import TOKENIZER_INFIXES, LIST_PUNCT LIST_ELLIPSES -from ..char_classes import LIST_QUOTES, CURRENCY, QUOTES, UNITS -from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER +from ..punctuation import TOKENIZER_INFIXES +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY +from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER ELISION = " ' ’ ".strip().replace(' ', '').replace('\n', '') -HYPHENS = r"""- – — ‐ ‑""".strip().replace(' ', '').replace('\n', '') +HYPHENS = r"- – — ‐ ‑".strip().replace(' ', '').replace('\n', '') -TOKENIZER_SUFFIXES = ( - LIST_PUNCT + - LIST_ELLIPSES + - LIST_QUOTES + - [ - r'(?<=[0-9])\+', - r'(?<=°[FfCcKk])\.', # 4°C. -> ["4°C", "."] - r'(?<=[0-9])°[FfCcKk]', # 4°C -> ["4", "°C"] - r'(?<=[0-9])%', # 4% -> ["4", "%"] - r'(?<=[0-9])(?:{c})'.format(c=CURRENCY), - r'(?<=[0-9])(?:{u})'.format(u=UNITS), - r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES), - r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER)]) +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + + [r'(?<=[0-9])\+', + r'(?<=°[FfCcKk])\.', # 4°C. -> ["4°C", "."] + r'(?<=[0-9])°[FfCcKk]', # 4°C -> ["4", "°C"] + r'(?<=[0-9])%', # 4% -> ["4", "%"] + r'(?<=[0-9])(?:{})'.format(CURRENCY), + r'(?<=[0-9])(?:{})'.format(UNITS), + r'(?<=[0-9{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES), + r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER)]) -TOKENIZER_INFIXES += [ - r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION)] +_infixes = (TOKENIZER_INFIXES + + [r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION)]) + + +TOKENIZER_SUFFIXES = _suffixes +TOKENIZER_INFIXES = _infixes