spaCy/spacy/lang/tr/tokenizer_exceptions.py

# coding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, NORM

# These exceptions are mostly for example purposes hoping that Turkish
# speakers can contribute in the future! Source of copy-pasted examples:
# https://en.wiktionary.org/wiki/Category:Turkish_language
_exc = {
    "sağol": [
        {ORTH: "sağ"},
        {ORTH: "ol", NORM: "olun"}]
}

for exc_data in [{ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"}]:
    _exc[exc_data[ORTH]] = [exc_data]

for orth in ["Dr."]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = _exc
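
A minimal usage sketch (not part of the file above): it assumes a spaCy install that ships this Turkish language data, uses a made-up sample sentence, and the exact token attributes can vary between spaCy versions.

from spacy.lang.tr import Turkish

# A blank Turkish pipeline; its tokenizer picks up TOKENIZER_EXCEPTIONS.
nlp = Turkish()

doc = nlp("sağol Dr. Yılmaz, A.B.D. uzak")
print([t.text for t in doc])
# "sağol" is split into "sağ" + "ol"; "Dr." and "A.B.D." stay single tokens.
print([t.norm_ for t in doc])
# NORM entries surface as token.norm_, e.g. "olun" for "ol" and
# "Amerika Birleşik Devletleri" for "A.B.D.".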