spaCy/spacy/lang/pt/tokenizer_exceptions.py

# coding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH


_exc = {}

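# Abbreviations and other fixed strings that the Portuguese tokenizer should
# keep as single tokens rather than splitting at punctuation such as the
# trailing period or the slash.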
for orth in [
    "Adm.",
    "Art.",
    "art.",
    "Av.",
    "av.",
    "Cia.",
    "dom.",
    "Dr.",
    "dr.",
    "e.g.",
    "E.g.",
    "E.G.",
    "e/ou",
    "ed.",
    "eng.",
    "etc.",
    "Fund.",
    "Gen.",
    "Gov.",
    "i.e.",
    "I.e.",
    "I.E.",
    "Inc.",
    "Jr.",
    "km/h",
    "Ltd.",
    "Mr.",
    "p.m.",
    "Ph.D.",
    "Rep.",
    "Rev.",
    "S/A",
    "Sen.",
    "Sr.",
    "sr.",
    "Sra.",
    "sra.",
    "vs.",
    "tel.",
    "pág.",
    "pag.",
]:
    _exc[orth] = [{ORTH: orth}]


TOKENIZER_EXCEPTIONS = _exc
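
A minimal usage sketch (an illustration, not part of the module above), assuming spaCy is installed with its Portuguese language data: abbreviations listed in TOKENIZER_EXCEPTIONS, such as "Dr." and "Av.", survive tokenization as single tokens instead of being split at the trailing period.

import spacy

nlp = spacy.blank("pt")
doc = nlp("O Dr. Silva mora na Av. Paulista.")
print([token.text for token in doc])
# "Dr." and "Av." stay intact because they are listed as exceptions above;
# expected roughly: ['O', 'Dr.', 'Silva', 'mora', 'na', 'Av.', 'Paulista', '.']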