spaCy/spacy/lang/lb/tokenizer_exceptions.py

from ...symbols import NORM, ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS

# TODO
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)

_exc = {}

# translate / delete what is not necessary
for exc_data in [
    {ORTH: "’t", NORM: "et"},
    {ORTH: "’T", NORM: "et"},
    {ORTH: "'t", NORM: "et"},
    {ORTH: "'T", NORM: "et"},
    {ORTH: "wgl.", NORM: "wannechgelift"},
    {ORTH: "M.", NORM: "Monsieur"},
    {ORTH: "Mme.", NORM: "Madame"},
    {ORTH: "Dr.", NORM: "Dokter"},
    {ORTH: "Tel.", NORM: "Telefon"},
    {ORTH: "asw.", NORM: "an sou weider"},
    {ORTH: "etc.", NORM: "et cetera"},
    {ORTH: "bzw.", NORM: "bezéiungsweis"},
    {ORTH: "Jan.", NORM: "Januar"},
]:
    _exc[exc_data[ORTH]] = [exc_data]


# to be extended
for orth in [
    "z.B.",
    "Dipl.",
    "Dr.",
    "etc.",
    "i.e.",
    "o.k.",
    "O.K.",
    "p.a.",
    "p.s.",
    "P.S.",
    "phil.",
    "q.e.d.",
    "R.I.P.",
    "rer.",
    "sen.",
    "ë.a.",
    "U.S.",
    "U.S.A.",
]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)