spaCy/spacy/lang/ro/tokenizer_exceptions.py

# coding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH


_exc = {}


# Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations
for orth in [
    "1-a", "2-a", "3-a", "4-a", "5-a", "6-a", "7-a", "8-a", "9-a", "10-a", "11-a", "12-a",
    "1-ul", "2-lea", "3-lea", "4-lea", "5-lea", "6-lea", "7-lea", "8-lea", "9-lea", "10-lea", "11-lea", "12-lea",
    "d-voastră", "dvs.", "ing.", "dr.", "Rom.", "str.", "nr.", "etc.", "d.p.d.v.", "dpdv", "șamd.", "ș.a.m.d."]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = _exc