From 1faaf698ca3fc33eb4bf8fc9e8ae87d4ec582486 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?=
Date: Tue, 24 Jan 2017 09:51:29 +0100
Subject: [PATCH] Add infixes and abbreviation exceptions (fr)

---
Reviewer notes (not part of the commit message):

* ABBREVIATIONS maps eight abbreviated French month names (janv., févr.,
  avr., juill., sept., oct., nov., déc.) to a single-entry exception whose
  LEMMA is the full month name and whose ORTH is the abbreviation itself.
* INFIXES_EXCEPTIONS_BASE lists "aujourd'hui" and the "prud'hom..." word
  family. INFIXES_EXCEPTIONS is built from it by replacing the ASCII
  apostrophe with each character of ELISION, then appending the
  str.capitalize() form of every resulting string.
* Both collections are merged into TOKENIZER_EXCEPTIONS with update_exc();
  the infix list is first converted with strings_to_exc().
* LEMMA and ORTH come from the wildcard import of ..symbols.

 spacy/fr/language_data.py | 51 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/spacy/fr/language_data.py b/spacy/fr/language_data.py
index bbbeb1535..6eff8364f 100644
--- a/spacy/fr/language_data.py
+++ b/spacy/fr/language_data.py
@@ -4,6 +4,9 @@ from __future__ import unicode_literals
 from .. import language_data as base
 from ..language_data import strings_to_exc, update_exc
 
+from .punctuation import ELISION
+
+from ..symbols import *
 from .stop_words import STOP_WORDS
 
 
@@ -13,5 +16,53 @@ STOP_WORDS = set(STOP_WORDS)
 TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
+ABBREVIATIONS = {
+    "janv.": [
+        {LEMMA: "janvier", ORTH: "janv."}
+    ],
+    "févr.": [
+        {LEMMA: "février", ORTH: "févr."}
+    ],
+    "avr.": [
+        {LEMMA: "avril", ORTH: "avr."}
+    ],
+    "juill.": [
+        {LEMMA: "juillet", ORTH: "juill."}
+    ],
+    "sept.": [
+        {LEMMA: "septembre", ORTH: "sept."}
+    ],
+    "oct.": [
+        {LEMMA: "octobre", ORTH: "oct."}
+    ],
+    "nov.": [
+        {LEMMA: "novembre", ORTH: "nov."}
+    ],
+    "déc.": [
+        {LEMMA: "décembre", ORTH: "déc."}
+    ],
+}
+
+
+INFIXES_EXCEPTIONS_BASE = ["aujourd'hui",
+                           "prud'homme", "prud'hommes",
+                           "prud'homal", "prud'homaux", "prud'homale",
+                           "prud'homales",
+                           "prud'hommal", "prud'hommaux", "prud'hommale",
+                           "prud'hommales",
+                           "prud'homie", "prud'homies",
+                           "prud'hommesque", "prud'hommesques",
+                           "prud'hommesquement"]
+
+INFIXES_EXCEPTIONS = []
+for elision_char in ELISION:
+    INFIXES_EXCEPTIONS += [infix.replace("'", elision_char)
+                           for infix in INFIXES_EXCEPTIONS_BASE]
+
+INFIXES_EXCEPTIONS += [word.capitalize() for word in INFIXES_EXCEPTIONS]
+
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(INFIXES_EXCEPTIONS))
+update_exc(TOKENIZER_EXCEPTIONS, ABBREVIATIONS)
+
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]