diff --git a/spacy/fr/__init__.py b/spacy/fr/__init__.py
index d15bcdc4a..e0481187d 100644
--- a/spacy/fr/__init__.py
+++ b/spacy/fr/__init__.py
@@ -1,37 +1,34 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
-from ..language import Language, BaseDefaults
-from ..attrs import LANG
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
+from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .lemmatizer import LOOKUP
 
-from .language_data import *
-from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES
+from ..language_data import BASE_EXCEPTIONS
+from ..language import Language
 from ..lemmatizerlookup import Lemmatizer
-from .lemmatization import LOOK_UP
-
-
-class FrenchDefaults(BaseDefaults):
-    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: 'fr'
-
-    stop_words = STOP_WORDS
-    infixes = tuple(TOKENIZER_INFIXES)
-    suffixes = tuple(TOKENIZER_SUFFIXES)
-    token_match = TOKEN_MATCH
-
-    @classmethod
-    def create_tokenizer(cls, nlp=None):
-        cls.tokenizer_exceptions = get_tokenizer_exceptions()
-        return super(FrenchDefaults, cls).create_tokenizer(nlp)
-
-    @classmethod
-    def create_lemmatizer(cls, nlp=None):
-        return Lemmatizer(LOOK_UP)
+from ..attrs import LANG
+from ..util import update_exc
 
 
 class French(Language):
     lang = 'fr'
-    Defaults = FrenchDefaults
+
+    class Defaults(Language.Defaults):
+        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+        lex_attr_getters[LANG] = lambda text: 'fr'
+
+        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+        stop_words = set(STOP_WORDS)
+        infixes = tuple(TOKENIZER_INFIXES)
+        suffixes = tuple(TOKENIZER_SUFFIXES)
+        token_match = TOKEN_MATCH
+
+        @classmethod
+        def create_lemmatizer(cls, nlp=None):
+            return Lemmatizer(LOOKUP)
 
 
-EXPORT = French
\ No newline at end of file
+__all__ = ['French']
diff --git a/spacy/fr/_tokenizer_exceptions_list.py b/spacy/fr/_tokenizer_exceptions_list.py
index 48b7e30a2..3cb59204b 100644
--- a/spacy/fr/_tokenizer_exceptions_list.py
+++ b/spacy/fr/_tokenizer_exceptions_list.py
@@ -3,7 +3,7 @@
 from __future__ import unicode_literals
 
 
-BASE_EXCEPTIONS = [
+FR_BASE_EXCEPTIONS = [
 "0-day",
 "0-days",
 "1000Base-T",
diff --git a/spacy/fr/lemmatization.py b/spacy/fr/lemmatizer.py
similarity index 99%
rename from spacy/fr/lemmatization.py
rename to spacy/fr/lemmatizer.py
index 87dcc7c75..3dc6e9fe8 100644
--- a/spacy/fr/lemmatization.py
+++ b/spacy/fr/lemmatizer.py
@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-LOOK_UP = {
+
+LOOKUP = {
     "Ap.": "après",
     "Apr.": "après",
     "Auxerroises": "Auxerrois",
@@ -217118,4 +217119,4 @@ LOOK_UP = {
     "ôtée": "ôter",
     "ôtées": "ôter",
     "ôtés": "ôter"
-}
\ No newline at end of file
+}
diff --git a/spacy/fr/punctuation.py b/spacy/fr/punctuation.py
index a7a45b65d..3047eaffe 100644
--- a/spacy/fr/punctuation.py
+++ b/spacy/fr/punctuation.py
@@ -1,14 +1,12 @@
 # coding: utf8
-
 from __future__ import unicode_literals
 
-from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES, LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY,\
-    UNITS, ALPHA_LOWER, QUOTES, ALPHA_UPPER
+from ..language_data.punctuation import ALPHA, TOKENIZER_INFIXES, LIST_PUNCT
+from ..language_data.punctuation import LIST_ELLIPSES, LIST_QUOTES, CURRENCY
+from ..language_data.punctuation import UNITS, ALPHA_LOWER, QUOTES, ALPHA_UPPER
 
 
-_ELISION = " ' ’ "
-ELISION = _ELISION.strip().replace(' ', '').replace('\n', '')
-
+ELISION = " ' ’ ".strip().replace(' ', '').replace('\n', '')
 HYPHENS = r"""- – — ‐ ‑""".strip().replace(' ', '').replace('\n', '')
 
 
@@ -24,14 +22,8 @@ TOKENIZER_SUFFIXES = (
     r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
     r'(?<=[0-9])(?:{u})'.format(u=UNITS),
     r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
-    r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
-    ]
-)
+    r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER)])
 
 
 TOKENIZER_INFIXES += [
-    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION),
-]
-
-
-__all__ = ["TOKENIZER_SUFFIXES", "TOKENIZER_INFIXES"]
+    r'(?<=[{a}][{el}])(?=[{a}])'.format(a=ALPHA, el=ELISION)]
diff --git a/spacy/fr/tokenizer_exceptions.py b/spacy/fr/tokenizer_exceptions.py
index fd05cff95..0472d8734 100644
--- a/spacy/fr/tokenizer_exceptions.py
+++ b/spacy/fr/tokenizer_exceptions.py
@@ -1,219 +1,151 @@
 # coding: utf8
-
 from __future__ import unicode_literals
 
-from .. import language_data as base
-from ..language_data import strings_to_exc, update_exc
-from ..language_data.tokenizer_exceptions import _URL_PATTERN
-from ..language_data.punctuation import ALPHA_LOWER
-
-from .punctuation import ELISION, HYPHENS
-
-from ..symbols import *
-
-import os
-import io
 import regex as re
 
-
-def get_exceptions():
-    from ._tokenizer_exceptions_list import BASE_EXCEPTIONS
-    return BASE_EXCEPTIONS
+from .punctuation import ELISION, HYPHENS
+from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
+from ..symbols import ORTH, LEMMA, TAG, NORM
+from ..deprecated import PRON_LEMMA
+from ..language_data.tokenizer_exceptions import _URL_PATTERN
+from ..language_data.punctuation import ALPHA_LOWER
 
 
 def upper_first_letter(text):
     if len(text) == 0:
         return text
-
     if len(text) == 1:
         return text.upper()
-
     return text[0].upper() + text[1:]
 
 
 def lower_first_letter(text):
     if len(text) == 0:
         return text
-
     if len(text) == 1:
         return text.lower()
-
     return text[0].lower() + text[1:]
 
 
-def get_tokenizer_exceptions():
-    tokenizer_exceptions = strings_to_exc(base.EMOTICONS)
-    update_exc(tokenizer_exceptions, strings_to_exc(base.ABBREVIATIONS))
+_exc = {
+    "J.-C.": [
+        {LEMMA: "Jésus", ORTH: "J."},
+        {LEMMA: "Christ", ORTH: "-C."}]
+}
 
-    ABBREVIATIONS_1 = {
-        "av.": [
-            {LEMMA: "avant", ORTH: "av."}
-        ],
-        "janv.": [
-            {LEMMA: "janvier", ORTH: "janv."}
-        ],
-        "févr.": [
-            {LEMMA: "février", ORTH: "févr."}
-        ],
-        "avr.": [
-            {LEMMA: "avril", ORTH: "avr."}
-        ],
-        "juill.": [
-            {LEMMA: "juillet", ORTH: "juill."}
-        ],
-        "sept.": [
-            {LEMMA: "septembre", ORTH: "sept."}
-        ],
-        "oct.": [
-            {LEMMA: "octobre", ORTH: "oct."}
-        ],
-        "nov.": [
-            {LEMMA: "novembre", ORTH: "nov."}
-        ],
-        "déc.": [
-            {LEMMA: "décembre", ORTH: "déc."}
-        ],
-        "apr.": [
-            {LEMMA: "après", ORTH: "apr."}
-        ],
-        "J.-C.": [
-            {LEMMA: "Jésus", ORTH: "J."},
-            {LEMMA: "Christ", ORTH: "-C."}
-        ],
-        "Dr.": [
-            {LEMMA: "docteur", ORTH: "Dr."}
-        ],
-        "M.": [
-            {LEMMA: "monsieur", ORTH: "M."}
-        ],
-        "Mr.": [
-            {LEMMA: "monsieur", ORTH: "Mr."}
-        ],
-        "Mme.": [
-            {LEMMA: "madame", ORTH: "Mme."}
-        ],
-        "Mlle.": [
-            {LEMMA: "mademoiselle", ORTH: "Mlle."}
-        ],
-        "n°": [
-            {LEMMA: "numéro", ORTH: "n°"}
-        ],
-        "d°": [
-            {LEMMA: "degrés", ORTH: "d°"}
-        ],
-        "St.": [
-            {LEMMA: "saint", ORTH: "St."}
-        ],
-        "Ste.": [
-            {LEMMA: "sainte", ORTH: "Ste."}
-        ]
-    }
-    ABBREVIATIONS_2 = [
-        "etc.",
-    ]
+for exc_data in [
+    {LEMMA: "avant", ORTH: "av."},
+    {LEMMA: "janvier", ORTH: "janv."},
+    {LEMMA: "février", ORTH: "févr."},
+    {LEMMA: "avril", ORTH: "avr."},
+    {LEMMA: "juillet", ORTH: "juill."},
+    {LEMMA: "septembre", ORTH: "sept."},
+    {LEMMA: "octobre", ORTH: "oct."},
+    {LEMMA: "novembre", ORTH: "nov."},
+    {LEMMA: "décembre", ORTH: "déc."},
+    {LEMMA: "après", ORTH: "apr."},
+    {LEMMA: "docteur", ORTH: "Dr."},
+    {LEMMA: "monsieur", ORTH: "M."},
+    {LEMMA: "monsieur", ORTH: "Mr."},
+    {LEMMA: "madame", ORTH: "Mme."},
+    {LEMMA: "mademoiselle", ORTH: "Mlle."},
+    {LEMMA: "numéro", ORTH: "n°"},
+    {LEMMA: "degrés", ORTH: "d°"},
+    {LEMMA: "saint", ORTH: "St."},
+    {LEMMA: "sainte", ORTH: "Ste."}]:
+    _exc[exc_data[ORTH]] = [dict(exc_data)]
 
-    VERBS = {}
-    for verb, verb_lemma in (("a", "avoir"), ("est", "être"),
-                             ("semble", "sembler"), ("indique", "indiquer"),
-                             ("moque", "moquer"), ("passe", "passer")):
-        for orth in [verb,verb.title()]:
-            for pronoun in ("elle", "il", "on"):
-                token = "{}-t-{}".format(orth, pronoun)
-                VERBS[token] = [
-                    {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
-                    {LEMMA: "t", ORTH: "-t"},
-                    {LEMMA: pronoun, ORTH: "-" + pronoun}
-                ]
-    for verb, verb_lemma in [("est","être")]:
-        for orth in [verb,verb.title()]:
-            token = "{}-ce".format(orth)
-            VERBS[token] = [
+for orth in FR_BASE_EXCEPTIONS + ["etc."]:
+    _exc[orth] = [{ORTH: orth}]
+
+
+for verb, verb_lemma in [
+    ("a", "avoir"),
+    ("est", "être"),
+    ("semble", "sembler"),
+    ("indique", "indiquer"),
+    ("moque", "moquer"),
+    ("passe", "passer")]:
+    for orth in [verb, verb.title()]:
+        for pronoun in ["elle", "il", "on"]:
+            token = "{}-t-{}".format(orth, pronoun)
+            _exc[token] = [
                 {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
-                {LEMMA: 'ce', ORTH: '-ce'}
-            ]
+                {LEMMA: "t", ORTH: "-t"},
+                {LEMMA: pronoun, ORTH: "-" + pronoun}]
 
-    for pre, pre_lemma in (("qu'", "que"), ("n'", "ne")):
-        for orth in [pre,pre.title()]:
-            VERBS['{}est-ce'.format(orth)] = [
-                {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
-                {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
-                {LEMMA: 'ce', ORTH: '-ce'}
-            ]
-
-    HYPHEN = ['-', '‐']
-
-    base_exceptions = get_exceptions()
-    infixes_exceptions = []
-
-    for elision_char in ELISION:
-        for hyphen_char in HYPHEN:
-            infixes_exceptions += [infix.replace("'", elision_char).replace('-', hyphen_char)
-                                   for infix in base_exceptions]
-
-    infixes_exceptions += [upper_first_letter(word) for word in infixes_exceptions]
-
-    infixes_exceptions = list(set(infixes_exceptions))
-
-    update_exc(tokenizer_exceptions, strings_to_exc(infixes_exceptions))
-    update_exc(tokenizer_exceptions, ABBREVIATIONS_1)
-    update_exc(tokenizer_exceptions, strings_to_exc(ABBREVIATIONS_2))
-    update_exc(tokenizer_exceptions, VERBS)
-    return tokenizer_exceptions
+for verb, verb_lemma in [
+    ("est","être")]:
+    for orth in [verb, verb.title()]:
+        token = "{}-ce".format(orth)
+        _exc[token] = [
+            {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"},
+            {LEMMA: 'ce', ORTH: '-ce'}]
 
 
-HYPHEN_PREFIX = [
-    'a[ée]ro', 'abat', 'a[fg]ro', 'after', 'am[ée]ricano', 'anglo', 'anti', 'apr[èe]s', 'arabo', 'arcs?', 'archi',
-    'arrières?', 'avant', 'auto',
-    'banc', 'bas(?:ses?)?', 'bec?', 'best', 'bio?', 'bien', 'blanc', 'bo[îi]te', 'bois', 'bou(?:c|rg)', 'b[êe]ta',
-    'cache', 'cap(?:ello)?', 'champ', 'chapelle', 'ch[âa]teau', 'cha(?:ud|t)e?s?', 'chou', 'chromo', 'claire?s?',
-    'co(?:de|ca)?', 'compte', 'contre', 'cordon', 'coupe?', 'court', 'crash', 'crise', 'croche', 'cross', 'cyber',
-    'côte',
-    'demi', 'di(?:sney)?', 'd[ée]s?', 'double', 'dys',
-    'entre', 'est', 'ethno', 'extra', 'extrême', '[ée]co',
-    'fil', 'fort', 'franco?s?',
-    'gallo', 'gardes?', 'gastro', 'grande?', 'gratte', 'gr[ée]co', 'gros', 'g[ée]o',
-    'haute?s?', 'hyper',
-    'indo', 'infra', 'inter', 'intra', 'islamo', 'italo',
-    'jean',
-    'labio', 'latino', 'live', 'lot', 'louis',
-    'm[ai]cro', 'mesnil', 'mi(?:ni)?', 'mono', 'mont?s?', 'moyen', 'multi', 'm[ée]cano', 'm[ée]dico', 'm[ée]do', 'm[ée]ta',
-    'mots?',
-    'noix', 'non', 'nord', 'notre', 'n[ée]o',
-    'ouest', 'outre', 'ouvre',
-    'passe', 'perce', 'pharmaco', 'ph[oy]to', 'pique', 'poissons?', 'ponce', 'pont', 'po[rs]t',
-    'primo', 'pro(?:cès|to)?', 'pare', 'petite?', 'porte', 'pré', 'prêchi', 'pseudo', 'pêle', 'péri', 'puy',
-    'quasi',
-    'recourt', 'rythmo', 'r[ée]', 'r[ée]tro',
-    'sans', 'sainte?s?', 'semi', 'social', 'sous', 'su[bdr]', 'super',
-    'tire', 'thermo', 'tiers', 'trans', 'tr(?:i|ou)', 't[ée]l[ée]',
-    'vi[cd]e', 'vid[ée]o', 'vie(?:ux|illes?)', 'vill(?:e|eneuve|ers|ette|iers|y)',
-    'ultra',
-    'à',
-    '[ée]lectro', '[ée]qui'
-    ]
 
+for pre, pre_lemma in [
+    ("qu'", "que"),
+    ("n'", "ne")]:
+    for orth in [pre,pre.title()]:
+        _exc['%sest-ce' % orth] = [
+            {LEMMA: pre_lemma, ORTH: orth, TAG: "ADV"},
+            {LEMMA: 'être', ORTH: "est", TAG: "VERB"},
+            {LEMMA: 'ce', ORTH: '-ce'}]
 
-ELISION_PREFIX = ['entr', 'grande?s?']
-
-REGULAR_EXP = [
+_infixes_exc = []
+for elision_char in ELISION:
+    for hyphen_char in ['-', '‐']:
+        _infixes_exc += [infix.replace("'", elision_char).replace('-', hyphen_char)
+                         for infix in FR_BASE_EXCEPTIONS]
+_infixes_exc += [upper_first_letter(word) for word in _infixes_exc]
+_infixes_exc = list(set(_infixes_exc))
+
+for orth in _infixes_exc:
+    _exc[orth] = [{ORTH: orth}]
+
+
+_hyphen_prefix = [
+    'a[ée]ro', 'abat', 'a[fg]ro', 'after', 'am[ée]ricano', 'anglo', 'anti',
+    'apr[èe]s', 'arabo', 'arcs?', 'archi', 'arrières?', 'avant', 'auto',
+    'banc', 'bas(?:ses?)?', 'bec?', 'best', 'bio?', 'bien', 'blanc', 'bo[îi]te',
+    'bois', 'bou(?:c|rg)', 'b[êe]ta', 'cache', 'cap(?:ello)?', 'champ',
+    'chapelle', 'ch[âa]teau', 'cha(?:ud|t)e?s?', 'chou', 'chromo', 'claire?s?',
+    'co(?:de|ca)?', 'compte', 'contre', 'cordon', 'coupe?', 'court', 'crash',
+    'crise', 'croche', 'cross', 'cyber', 'côte', 'demi', 'di(?:sney)?',
+    'd[ée]s?', 'double', 'dys', 'entre', 'est', 'ethno', 'extra', 'extrême',
+    '[ée]co', 'fil', 'fort', 'franco?s?', 'gallo', 'gardes?', 'gastro',
+    'grande?', 'gratte', 'gr[ée]co', 'gros', 'g[ée]o', 'haute?s?', 'hyper',
+    'indo', 'infra', 'inter', 'intra', 'islamo', 'italo', 'jean', 'labio',
+    'latino', 'live', 'lot', 'louis', 'm[ai]cro', 'mesnil', 'mi(?:ni)?', 'mono',
+    'mont?s?', 'moyen', 'multi', 'm[ée]cano', 'm[ée]dico', 'm[ée]do', 'm[ée]ta',
+    'mots?', 'noix', 'non', 'nord', 'notre', 'n[ée]o', 'ouest', 'outre', 'ouvre',
+    'passe', 'perce', 'pharmaco', 'ph[oy]to', 'pique', 'poissons?', 'ponce',
+    'pont', 'po[rs]t', 'primo', 'pro(?:cès|to)?', 'pare', 'petite?', 'porte',
+    'pré', 'prêchi', 'pseudo', 'pêle', 'péri', 'puy', 'quasi', 'recourt',
+    'rythmo', 'r[ée]', 'r[ée]tro', 'sans', 'sainte?s?', 'semi', 'social',
+    'sous', 'su[bdr]', 'super', 'tire', 'thermo', 'tiers', 'trans',
+    'tr(?:i|ou)', 't[ée]l[ée]', 'vi[cd]e', 'vid[ée]o', 'vie(?:ux|illes?)',
+    'vill(?:e|eneuve|ers|ette|iers|y)', 'ultra', 'à', '[ée]lectro', '[ée]qui']
+
+_elision_prefix = ['entr', 'grande?s?']
+_other_hyphens = ''.join([h for h in HYPHENS if h != '-'])
+
+_regular_exp = [
     '^droits?[{hyphen}]de[{hyphen}]l\'homm[{alpha}]+$'.format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
     '^zig[{hyphen}]zag[{alpha}]*$'.format(hyphen=HYPHENS, alpha=ALPHA_LOWER),
-    '^prud[{elision}]homm[{alpha}]*$'.format(elision=ELISION, alpha=ALPHA_LOWER),
-]
+    '^prud[{elision}]homm[{alpha}]*$'.format(elision=ELISION, alpha=ALPHA_LOWER)]
+_regular_exp += ["^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
+    prefix=p, hyphen=HYPHENS, other_hyphen=_other_hyphens,
+    elision=ELISION, alpha=ALPHA_LOWER)
+    for p in _hyphen_prefix]
+_regular_exp += ["^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
+    prefix=p, elision=HYPHENS, hyphen=_other_hyphens, alpha=ALPHA_LOWER)
+    for p in _elision_prefix]
+_regular_exp.append(_URL_PATTERN)
 
-other_hyphens = ''.join([h for h in HYPHENS if h != '-'])
-
-REGULAR_EXP += ["^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\-]*$".format(
-    prefix=p, hyphen=HYPHENS, other_hyphen=other_hyphens, elision=ELISION, alpha=ALPHA_LOWER)
-    for p in HYPHEN_PREFIX]
-
-REGULAR_EXP += ["^{prefix}[{elision}][{alpha}][{alpha}{elision}{hyphen}\-]*$".format(
-    prefix=p, elision=HYPHENS, hyphen=other_hyphens, alpha=ALPHA_LOWER)
-    for p in ELISION_PREFIX]
-
-REGULAR_EXP.append(_URL_PATTERN)
-
-TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in REGULAR_EXP), re.IGNORECASE).match
-
-__all__ = ["get_tokenizer_exceptions", "TOKEN_MATCH"]
+TOKENIZER_EXCEPTIONS = dict(_exc)
+TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match
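
Note (not part of the patch): the sketch below shows how a TOKEN_MATCH built the same way as in the new tokenizer_exceptions.py keeps prefixed hyphenated words as single tokens. The character classes are simplified stand-ins for spaCy's ALPHA_LOWER/HYPHENS/ELISION, the two prefixes and sample words are made up for illustration, and only the third-party regex package already imported by the module is assumed.

# Minimal, self-contained sketch with simplified stand-in data.
import regex as re

ALPHA_LOWER = 'a-zàâçéèêëîïôûùüÿœæ'                      # reduced approximation
HYPHENS = r"""- – — ‐ ‑""".strip().replace(' ', '')
ELISION = "'’"
_hyphen_prefix = ['avant', 'arrière']                    # hypothetical prefixes

_other_hyphens = ''.join([h for h in HYPHENS if h != '-'])
_regular_exp = ["^{prefix}[{hyphen}][{alpha}][{alpha}{elision}{other_hyphen}\\-]*$".format(
    prefix=p, hyphen=HYPHENS, other_hyphen=_other_hyphens,
    elision=ELISION, alpha=ALPHA_LOWER)
    for p in _hyphen_prefix]

TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp),
                         re.IGNORECASE).match

for word in ('avant-hier', 'arrière-pensée', 'bonjour'):
    # TOKEN_MATCH returns a match object for words the tokenizer should not split
    print(word, TOKEN_MATCH(word) is not None)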