spaCy/spacy/lang/lij/tokenizer_exceptions.py

from ...symbols import ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS

_exc = {}

for raw in [
    "a-e",
    "a-o",
    "a-i",
    "a-a",
    "co-a",
    "co-e",
    "co-i",
    "co-o",
    "da-a",
    "da-e",
    "da-i",
    "da-o",
    "pe-a",
    "pe-e",
    "pe-i",
    "pe-o",
]:
    for orth in [raw, raw.capitalize()]:
        _exc[orth] = [{ORTH: orth}]

# Prefix + prepositions with à (e.g. "sott'a-o")

for prep in [
    "a-a",
    "a-e",
    "a-o",
    "a-i",
]:
    for prefix in [
        "sott'",
        "sott’",
        "contr'",
        "contr’",
        "ch'",
        "ch’",
        "s'",
        "s’",
    ]:
        for prefix_orth in [prefix, prefix.capitalize()]:
            _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]

TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)