mirror of https://github.com/explosion/spaCy.git
28 lines
591 B
Python
28 lines
591 B
Python
|
# coding: utf8
|
|||
|
from __future__ import unicode_literals
|
|||
|
|
|||
|
from ...symbols import ORTH, NORM
|
|||
|
|
|||
|
|
|||
|
# These exceptions are mostly for example purposes – hoping that Turkish
|
|||
|
# speakers can contribute in the future! Source of copy-pasted examples:
|
|||
|
# https://en.wiktionary.org/wiki/Category:Turkish_language
|
|||
|
|
|||
|
# Exceptions table, keyed by the exact surface string. Each value is a list
# of attribute dicts (ORTH, optionally NORM) -- presumably one dict per
# resulting token; confirm against the tokenizer's special-case handling.
_exc = {}
_exc["sağol"] = [
    {ORTH: "sağ"},
    {ORTH: "ol", NORM: "olun"},
]
|
|||
|
|
|||
|
|
|||
|
# Entries whose attribute dict carries a NORM next to its ORTH. Each one is
# registered under its own ORTH string as a one-element list.
_normed_entries = (
    {ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"},
)
for exc_data in _normed_entries:
    _exc[exc_data[ORTH]] = [exc_data]
|
|||
|
|
|||
|
|
|||
|
# Strings registered as-is: the entry is a single attribute dict whose ORTH
# is the string itself.
_plain_entries = ("Dr.",)
for orth in _plain_entries:
    _exc[orth] = [{ORTH: orth}]
|
|||
|
|
|||
|
|
|||
|
# Public name for the exceptions table; this is the same dict object as
# `_exc`, not a copy.
TOKENIZER_EXCEPTIONS = _exc
|