From 28d06ab860414e14b99fffc6d12d8928139a892c Mon Sep 17 00:00:00 2001
From: muratjumashev
Date: Fri, 22 Jan 2021 23:08:41 +0600
Subject: [PATCH] Add tokenizer_exceptions

---
 spacy/lang/ky/__init__.py             | 31 +++++++++++++++
 spacy/lang/ky/tokenizer_exceptions.py | 55 +++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 spacy/lang/ky/__init__.py
 create mode 100644 spacy/lang/ky/tokenizer_exceptions.py

diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py
new file mode 100644
index 000000000..3655e6264
--- /dev/null
+++ b/spacy/lang/ky/__init__.py
@@ -0,0 +1,31 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...attrs import LANG
+from ...language import Language
+from ...util import update_exc
+
+
+class KyrgyzDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: "ky"
+
+    lex_attr_getters.update(LEX_ATTRS)
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    infixes = tuple(TOKENIZER_INFIXES)
+
+    stop_words = STOP_WORDS
+
+
+class Kyrgyz(Language):
+    lang = "ky"
+    Defaults = KyrgyzDefaults
+
+
+__all__ = ["Kyrgyz"]
diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py
new file mode 100644
index 000000000..be5e9530c
--- /dev/null
+++ b/spacy/lang/ky/tokenizer_exceptions.py
@@ -0,0 +1,55 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import ORTH, LEMMA, NORM
+
+_exc = {}
+
+_abbrev_exc = [
+    # Weekday abbreviations
+    {ORTH: "дүй", LEMMA: "дүйшөмбү"},
+    {ORTH: "шей", LEMMA: "шейшемби"},
+    {ORTH: "шар", LEMMA: "шаршемби"},
+    {ORTH: "бей", LEMMA: "бейшемби"},
+    {ORTH: "жум", LEMMA: "жума"},
+    {ORTH: "ишм", LEMMA: "ишемби"},
+    {ORTH: "жек", LEMMA: "жекшемби"},
+    # Month abbreviations
+    {ORTH: "янв", LEMMA: "январь"},
+    {ORTH: "фев", LEMMA: "февраль"},
+    {ORTH: "мар", LEMMA: "март"},
+    {ORTH: "апр", LEMMA: "апрель"},
+    {ORTH: "июн", LEMMA: "июнь"},
+    {ORTH: "июл", LEMMA: "июль"},
+    {ORTH: "авг", LEMMA: "август"},
+    {ORTH: "сен", LEMMA: "сентябрь"},
+    {ORTH: "окт", LEMMA: "октябрь"},
+    {ORTH: "ноя", LEMMA: "ноябрь"},
+    {ORTH: "дек", LEMMA: "декабрь"},
+    # Number abbreviations
+    {ORTH: "млрд", LEMMA: "миллиард"},
+    {ORTH: "млн", LEMMA: "миллион"},
+]
+
+for abbr in _abbrev_exc:
+    for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()):
+        _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}]
+        _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}]
+
+for exc_data in [  # "etc." abbreviations
+    {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"},
+    {ORTH: "ж.б.", NORM: "жана башка"},
+    {ORTH: "ж.", NORM: "жыл"},
+    {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"},
+    {ORTH: "б.з.", NORM: "биздин заман"},
+    {ORTH: "кк.", NORM: "кылымдар"},
+    {ORTH: "жж.", NORM: "жылдар"},
+    {ORTH: "к.", NORM: "кылым"},
+    {ORTH: "көч.", NORM: "көчөсү"},
+    {ORTH: "м-н", NORM: "менен"},
+    {ORTH: "б-ча", NORM: "боюнча"},
+]:
+    exc_data[LEMMA] = exc_data[NORM]
+    _exc[exc_data[ORTH]] = [exc_data]
+
+TOKENIZER_EXCEPTIONS = _exc
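--
A minimal usage sketch, placed below the signature cut so it is not part of
the commit. It shows how the new exceptions are expected to behave once
spacy.lang.ky is importable, assuming the sibling lex_attrs, punctuation,
and stop_words modules already exist from earlier commits in this series
(the patch targets the spaCy v2-style Language.Defaults API).

    # Sketch only; Kyrgyz is the class this patch exports.
    from spacy.lang.ky import Kyrgyz

    nlp = Kyrgyz()
    doc = nlp("апр. ж.б.")

    print([t.text for t in doc])
    # ['апр.', 'ж.б.']: each abbreviation stays a single token instead of
    # having its trailing period split off as a suffix.

    print([t.norm_ for t in doc])
    # ['апрель', 'жана башка']: NORM resolves to the expanded form. The
    # abbreviation loop also registers "Апр." and "АПР." via capitalize()
    # and upper().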