refactored french tokenizer

2024-03-15 08:45:23 +01:00 · 2024-03-15 08:45:23 +01:00 · 5a3928fe1e
parent 0518c36f04
commit 5a3928fe1e
2 changed files with 30 additions and 16067 deletions
--- a/spacy/lang/fr/_tokenizer_exceptions_list.py
+++ b/spacy/lang/fr/_tokenizer_exceptions_list.py
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@ -1,443 +1,36 @@
 import re
-from ...symbols import ORTH
+_hyphen = "-–—"
-from ...util import update_exc
+_apostrophe = "'`´’"
 from ..char_classes import ALPHA, ALPHA_LOWER
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from .punctuation import ELISION, HYPHENS
-# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
+# fmt: off
-# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
+_suffix_inversion = r"|".join([
-FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]
+    "je", "tu", "on", "il", "elle", "iel",
    "nous", "vous", "elles", "ils", "iels",
    # écoutons-les
    "moi", "toi", "lui", "leur",
    "eux",
    fr"en(?![{_hyphen}])", "y",
    # écoutons-les
    fr"la(?![{_hyphen}])", fr"le(?![{_hyphen}])", fr"les(?![{_hyphen}])",
    # a-t-il, pourra-t'on, dis-m'en plus
    fr"t[{_hyphen}]??[{_apostrophe}]?", fr"m[{_apostrophe}]?",
    "là", "ici",
 ])
 _prefix_elision = r"|".join([
    "n", "s", "c", "d", "j", "m", "t", "l", "qu",
    # i exclude "quelqu'un"/"quelqu'un" because it's one token (and not quelque + un, which lack the idea of 'one person').
    fr"quelqu(?![{_apostrophe}]un[ex]*\b)",  # quelque
    "jusqu", "presqu", "lorsqu", "puisqu", "quoiqu",
 ])
 # fmt: on
 _elision = rf"(?:\b(?:{_prefix_elision})[{_apostrophe}])"
 _inversion = (
    rf"(?:(?<=[^\W\d])[{_hyphen}]\b(?:{_suffix_inversion})\b)"
 )
 def upper_first_letter(text):
    if len(text) == 0:
        return text
    if len(text) == 1:
        return text.upper()
    return text[0].upper() + text[1:]
 def lower_first_letter(text):
    if len(text) == 0:
        return text
    if len(text) == 1:
        return text.lower()
    return text[0].lower() + text[1:]
 _exc = {"J.-C.": [{ORTH: "J."}, {ORTH: "-C."}]}
 for exc_data in [
    {ORTH: "av."},
    {ORTH: "janv."},
    {ORTH: "févr."},
    {ORTH: "avr."},
    {ORTH: "juill."},
    {ORTH: "sept."},
    {ORTH: "oct."},
    {ORTH: "nov."},
    {ORTH: "déc."},
    {ORTH: "apr."},
    {ORTH: "Dr."},
    {ORTH: "M."},
    {ORTH: "Mr."},
    {ORTH: "Mme."},
    {ORTH: "Mlle."},
    {ORTH: "n°"},
    {ORTH: "d°"},
    {ORTH: "St."},
    {ORTH: "Ste."},
 ]:
    _exc[exc_data[ORTH]] = [exc_data]
 for orth in [
    "après-midi",
    "au-delà",
    "au-dessus",
    "celle-ci",
    "celles-ci",
    "celui-ci",
    "cf.",
    "ci-dessous",
    "elle-même",
    "en-dessous",
    "etc.",
    "jusque-là",
    "lui-même",
    "MM.",
    "No.",
    "peut-être",
    "pp.",
    "quelques-uns",
    "rendez-vous",
    "Vol.",
 ]:
    _exc[orth] = [{ORTH: orth}]
 for verb in [
    "a",
    "est",
    "semble",
    "indique",
    "moque",
    "passe",
 ]:
    for orth in [verb, verb.title()]:
        for pronoun in ["elle", "il", "on"]:
            token = f"{orth}-t-{pronoun}"
            _exc[token] = [{ORTH: orth}, {ORTH: "-t"}, {ORTH: "-" + pronoun}]
 for verb in ["est"]:
    for orth in [verb, verb.title()]:
        _exc[f"{orth}-ce"] = [{ORTH: orth}, {ORTH: "-ce"}]
 for pre in ["qu'", "n'"]:
    for orth in [pre, pre.title()]:
        _exc[f"{orth}est-ce"] = [{ORTH: orth}, {ORTH: "est"}, {ORTH: "-ce"}]
 for verb, pronoun in [("est", "il"), ("EST", "IL")]:
    _exc[f"{verb}-{pronoun}"] = [{ORTH: verb}, {ORTH: "-" + pronoun}]
 for s, verb, pronoun in [("s", "est", "il"), ("S", "EST", "IL")]:
    _exc[f"{s}'{verb}-{pronoun}"] = [
        {ORTH: s + "'"},
        {ORTH: verb},
        {ORTH: "-" + pronoun},
    ]
 _infixes_exc = []  # type: ignore[var-annotated]
 orig_elision = "'"
 orig_hyphen = "-"
 # loop through the elison and hyphen characters, and try to substitute the ones that weren't used in the original list
 for infix in FR_BASE_EXCEPTIONS:
    variants_infix = {infix}
    for elision_char in [x for x in ELISION if x != orig_elision]:
        variants_infix.update(
            [word.replace(orig_elision, elision_char) for word in variants_infix]
        )
    for hyphen_char in [x for x in ["-", "‐"] if x != orig_hyphen]:
        variants_infix.update(
            [word.replace(orig_hyphen, hyphen_char) for word in variants_infix]
        )
    variants_infix.update([upper_first_letter(word) for word in variants_infix])
    _infixes_exc.extend(variants_infix)
 for orth in _infixes_exc:
    _exc[orth] = [{ORTH: orth}]
 _hyphen_prefix = [
    "a[ée]ro",
    "abat",
    "a[fg]ro",
    "after",
    "aigues?",
    "am[ée]ricano",
    "anglo",
    "anti",
    "apr[èe]s",
    "arabo",
    "arcs?",
    "archi",
    "arrières?",
    "audio",
    "avant",
    "avion",
    "auto",
    "banc",
    "bas(?:ses?)?",
    "bateaux?",
    "bec?",
    "belles?",
    "beau",
    "best",
    "bio?",
    "bien",
    "blanc",
    "bo[îi]te",
    "bonn?e?s?",
    "bois",
    "bou(?:c|rg)",
    "b[êe]ta",
    "cache",
    "cap(?:ello)?",
    "casse",
    "castel",
    "champ",
    "chapelle",
    "ch[âa]teau(?:neuf)?",
    "chasse",
    "cha(?:ud|t)e?s?",
    "chauffe",
    "chou",
    "chromo",
    "claire?s?",
    "co(?:de|ca)?",
    "compte",
    "contre",
    "cordon",
    "coupe?",
    "courte?s?",
    "couvre",
    "crash",
    "crise",
    "croche",
    "cross",
    "cyber",
    "côte",
    "demi",
    "di(?:sney)?",
    "dix",
    "d[ée]s?",
    "dys",
    "ex?",
    "émirato",
    "entre",
    "est",
    "ethno",
    "ex",
    "extra",
    "extrême",
    "[ée]co",
    "faux",
    "fil",
    "fort",
    "franco?s?",
    "gallo",
    "gardes?",
    "gastro",
    "grande?",
    "gratte",
    "gr[ée]co",
    "gros",
    "g[ée]o",
    "haute?s?",
    "homm?es?",
    "hors",
    "hyper",
    "indo",
    "infra",
    "inter",
    "intra",
    "islamo",
    "italo",
    "jean",
    "labio",
    "latino",
    "live",
    "lot",
    "louis",
    "m[ai]cro",
    "mal",
    "médio",
    "mesnil",
    "mi(?:ni)?",
    "mono",
    "mont?s?",
    "moyen",
    "multi",
    "m[ée]cano",
    "m[ée]dico",
    "m[ée]do",
    "m[ée]ta",
    "mots?",
    "neuro",
    "noix",
    "non",
    "nord",
    "notre",
    "n[ée]o",
    "ouest",
    "outre",
    "ouvre",
    "passe",
    "perce",
    "pharmaco",
    "ph[oy]to",
    "pieds?",
    "pique",
    "poissons?",
    "ponce",
    "pont",
    "po[rs]t",
    "pousse",
    "primo",
    "pro(?:cès|to)?",
    "pare",
    "petite?s?",
    "plessis",
    "porte",
    "pré",
    "prêchi",
    "protège",
    "pseudo",
    "pêle",
    "péri",
    "puy",
    "quasi",
    "quatre",
    "radio",
    "recourt",
    "rythmo",
    "(?:re)?doubles?",
    "r[ée]",
    "r[ée]tro",
    "requin",
    "sans?",
    "sa?inte?s?",
    "semi",
    "serre",
    "sino",
    "socio",
    "sociale?s?",
    "soixante",
    "sous",
    "su[bdrs]",
    "super",
    "taille",
    "tire",
    "thermo",
    "tiers",
    "tourne",
    "toute?s?",
    "tra[iî]ne?",
    "trans",
    "trente",
    "trois",
    "trousse",
    "tr(?:i|ou)",
    "t[ée]l[ée]",
    "utéro",
    "vaso",
    "vi[cd]e",
    "vid[ée]o",
    "vie(?:ux|i?lles?|i?l)",
    "vill(?:e|eneuve|ers|ette|iers|y)",
    "vingt",
    "voitures?",
    "wagons?",
    "ultra",
    "à",
    "[ée]lectro",
    "[ée]qui",
    "Fontaine",
    "La Chapelle",
    "Marie",
    "Le Mesnil",
    "Neuville",
    "Pierre",
    "Val",
    "Vaux",
 ]
 _regular_exp = [
    "^a[{hyphen}]sexualis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^arginine[{hyphen}]méthyl[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^binge[{hyphen}]watch[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^black[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^bouche[{hyphen}]por[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^burn[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^by[{hyphen}]pass[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^ch[{elision}]tiis[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
    "^chape[{hyphen}]chut[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^down[{hyphen}]load[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^[ée]tats[{hyphen}]uni[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^droits?[{hyphen}]de[{hyphen}]l'homm[{al}]+$".format(
        hyphen=HYPHENS, al=ALPHA_LOWER
    ),
    "^fac[{hyphen}]simil[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^fleur[{hyphen}]bleuis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^flic[{hyphen}]flaqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^fox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^google[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^hard[{hyphen}]discount[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^hip[{hyphen}]hop[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^jet[{hyphen}]set[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^knock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^lèche[{hyphen}]bott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^litho[{hyphen}]typographi[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^lock[{hyphen}]out[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^lombri[{hyphen}]compost[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^mac[{hyphen}]adamis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^marque[{hyphen}]pag[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^mouton[{hyphen}]noiris[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^new[{hyphen}]york[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^pair[{hyphen}]programm[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^people[{hyphen}]is[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^plan[{hyphen}]socialis[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^premier[{hyphen}]ministr[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^prud[{elision}]hom[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
    "^réarc[{hyphen}]bout[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^refox[{hyphen}]trott[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^remicro[{hyphen}]ond[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^repique[{hyphen}]niqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^repetit[{hyphen}]déjeun[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^rick[{hyphen}]roll[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^rond[{hyphen}]ponn[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^shift[{hyphen}]cliqu[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^soudo[{hyphen}]bras[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^stabilo[{hyphen}]boss[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^strip[{hyphen}]teas[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^terra[{hyphen}]form[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^teuf[{hyphen}]teuf[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^yo[{hyphen}]yo[{al}]+$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^zig[{hyphen}]zag[{al}]*$".format(hyphen=HYPHENS, al=ALPHA_LOWER),
    "^z[{elision}]yeut[{al}]+$".format(elision=ELISION, al=ALPHA_LOWER),
 ]
 # catching cases like faux-vampire
 _regular_exp += [
    "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
        prefix=p,
        hyphen=HYPHENS,  # putting the - first in the [] range avoids having to use a backslash
        elision=ELISION,
        al=ALPHA_LOWER,
    )
    for p in _hyphen_prefix
 ]
 # catching cases like entr'abat
 _elision_prefix = ["r?é?entr", "grande?s?", "r"]
 _regular_exp += [
    "^{prefix}[{elision}][{al}][{hyphen}{al}{elision}]*$".format(
        prefix=p, elision=ELISION, hyphen=HYPHENS, al=ALPHA_LOWER
    )
    for p in _elision_prefix
 ]
 # catching cases like saut-de-ski, pet-en-l'air
 _hyphen_combination = [
    "l[èe]s?",
    "la",
    "en",
    "des?",
    "d[eu]",
    "sur",
    "sous",
    "aux?",
    "à",
    "et",
    "près",
    "saint",
 ]
 _regular_exp += [
    "^[{a}]+[{hyphen}]{hyphen_combo}[{hyphen}](?:l[{elision}])?[{a}]+$".format(
        hyphen_combo=hc, elision=ELISION, hyphen=HYPHENS, a=ALPHA
    )
    for hc in _hyphen_combination
 ]
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 TOKEN_MATCH = re.compile(
-    "(?iu)" + "|".join("(?:{})".format(m) for m in _regular_exp)
+    r"(?iu)" + r"|".join([_inversion, _elision])
-).match
+)
 # _abbrevs = ["ste?", "mme", "mr?", "mlle", "dr", "etc", "cf"]