From 28d06ab860414e14b99fffc6d12d8928139a892c Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Fri, 22 Jan 2021 23:08:41 +0600
Subject: [PATCH 01/13] Add tokenizer_exceptions

---
 spacy/lang/ky/__init__.py             | 31 +++++++++++++++
 spacy/lang/ky/tokenizer_exceptions.py | 55 +++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 spacy/lang/ky/__init__.py
 create mode 100644 spacy/lang/ky/tokenizer_exceptions.py

diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py
new file mode 100644
index 000000000..3655e6264
--- /dev/null
+++ b/spacy/lang/ky/__init__.py
@@ -0,0 +1,31 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...attrs import LANG
+from ...language import Language
+from ...util import update_exc
+
+
+class TatarDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: "tt"
+
+    lex_attr_getters.update(LEX_ATTRS)
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    infixes = tuple(TOKENIZER_INFIXES)
+
+    stop_words = STOP_WORDS
+
+
+class Tatar(Language):
+    lang = "tt"
+    Defaults = TatarDefaults
+
+
+__all__ = ["Tatar"]
diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py
new file mode 100644
index 000000000..be5e9530c
--- /dev/null
+++ b/spacy/lang/ky/tokenizer_exceptions.py
@@ -0,0 +1,55 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import ORTH, LEMMA, NORM
+
+_exc = {}
+
+_abbrev_exc = [
+    # Weekdays abbreviations
+    {ORTH: "дүй", LEMMA: "дүйшөмбү"},
+    {ORTH: "шей", LEMMA: "шейшемби"},
+    {ORTH: "шар", LEMMA: "шаршемби"},
+    {ORTH: "бей", LEMMA: "бейшемби"},
+    {ORTH: "жум", LEMMA: "жума"},
+    {ORTH: "ишм", LEMMA: "ишемби"},
+    {ORTH: "жек", LEMMA: "жекшемби"},
+    # Months abbreviations
+    {ORTH: "янв", LEMMA: "январь"},
+    {ORTH: "фев", LEMMA: "февраль"},
+    {ORTH: "мар", LEMMA: "март"},
+    {ORTH: "апр", LEMMA: "апрель"},
+    {ORTH: "июн", LEMMA: "июнь"},
+    {ORTH: "июл", LEMMA: "июль"},
+    {ORTH: "авг", LEMMA: "август"},
+    {ORTH: "сен", LEMMA: "сентябрь"},
+    {ORTH: "окт", LEMMA: "октябрь"},
+    {ORTH: "ноя", LEMMA: "ноябрь"},
+    {ORTH: "дек", LEMMA: "декабрь"},
+    # Number abbreviations
+    {ORTH: "млрд", LEMMA: "миллиард"},
+    {ORTH: "млн", LEMMA: "миллион"},
+]
+
+for abbr in _abbrev_exc:
+    for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()):
+        _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}]
+        _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}]
+
+for exc_data in [  # fixed abbreviations; each NORM is reused as LEMMA below
+    {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"},
+    {ORTH: "ж.б.", NORM: "жана башка"},
+    {ORTH: "ж.", NORM: "жыл"},
+    {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"},
+    {ORTH: "б.з.", NORM: "биздин заман"},
+    {ORTH: "кк.", NORM: "кылымдар"},
+    {ORTH: "жж.", NORM: "жылдар"},
+    {ORTH: "к.", NORM: "кылым"},
+    {ORTH: "көч.", NORM: "көчөсү"},
+    {ORTH: "м-н", NORM: "менен"},
+    {ORTH: "б-ча", NORM: "боюнча"},
+]:
+    exc_data[LEMMA] = exc_data[NORM]
+    _exc[exc_data[ORTH]] = [exc_data]
+
+TOKENIZER_EXCEPTIONS = _exc

From 101d265778633f5f4cbe15013ab8c5cc3c9f3789 Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Sat, 23 Jan 2021 21:25:28 +0600
Subject: [PATCH 02/13] Add stopwords

---
 spacy/lang/ky/stop_words.py | 47 +++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 spacy/lang/ky/stop_words.py

diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py
new file mode 100644
index 000000000..1f59539fe
--- /dev/null
+++ b/spacy/lang/ky/stop_words.py
@@ -0,0 +1,47 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+# Tatar stopwords are from https://github.com/aliiae/stopwords-tt
+
+STOP_WORDS = set(
+"""
+ага адам айтты айтымында айтып ал алар
+алардын алган алуу алып анда андан аны
+анын ар
+
+бар басма баш башка башкы башчысы берген
+биз билдирген билдирди бир биринчи бирок
+бишкек болгон болот болсо болуп боюнча
+буга бул
+
+гана
+
+да дагы деген деди деп
+
+жана жатат жаткан жаңы же жогорку жок жол
+жолу
+
+кабыл калган кандай карата каршы катары
+келген керек кийин кол кылмыш кыргыз
+күнү көп
+
+маалымат мамлекеттик мен менен миң
+мурдагы мыйзам мындай мүмкүн
+
+ошол ошондой
+
+сүрөт сөз
+
+тарабынан турган тууралуу
+
+укук учурда
+
+чейин чек
+
+экенин эки эл эле эмес эми эч
+
+үч үчүн
+
+өз
+""".split()
+)

From 4418ec2eeedb0889968127ac4c0d9a1a0439723b Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Sat, 23 Jan 2021 21:31:31 +0600
Subject: [PATCH 03/13] Add punctuation

---
 spacy/lang/ky/punctuation.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 spacy/lang/ky/punctuation.py

diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py
new file mode 100644
index 000000000..9ee66a59e
--- /dev/null
+++ b/spacy/lang/ky/punctuation.py
@@ -0,0 +1,23 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+
+_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "")
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?/()]+(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}{q}])[:<>=](?=[{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash),
+        r"(?<=[0-9])-(?=[0-9])",
+    ]
+)
+
+TOKENIZER_INFIXES = _infixes

From d53724ba1d6a22b3f25fad118dce14e3495040e5 Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Sat, 23 Jan 2021 21:35:25 +0600
Subject: [PATCH 04/13] Add lex_attrs

---
 spacy/lang/ky/lex_attrs.py | 51 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 spacy/lang/ky/lex_attrs.py

diff --git a/spacy/lang/ky/lex_attrs.py b/spacy/lang/ky/lex_attrs.py
new file mode 100644
index 000000000..af926b138
--- /dev/null
+++ b/spacy/lang/ky/lex_attrs.py
@@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+_num_words = [
+    "нөл",
+    "ноль",
+    "бир",
+    "эки",
+    "үч",
+    "төрт",
+    "беш",
+    "алты",
+    "жети",
+    "сегиз",
+    "тогуз",
+    "он",
+    "жыйырма",
+    "отуз",
+    "кырк",
+    "элүү",
+    "алтымыш",
+    "жетмиш",
+    "сексен",
+    "токсон",
+    "жүз",
+    "миң",
+    "миллион",
+    "миллиард",
+    "триллион",
+    "триллиард",
+]
+
+
+def like_num(text):
+    """Return True if text is digits, a digit/digit fraction or a numeral word."""
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]  # drop one leading sign / approximation marker
+    text = text.replace(",", "").replace(".", "")  # ignore separators
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    # Case-insensitive so sentence-initial numerals ("Бир") are recognised too.
+    return text.lower() in _num_words
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}

From 2f385385a95f9a6ce22dc8489a95fcd58b853fc2 Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Sat, 23 Jan 2021 21:36:28 +0600
Subject: [PATCH 05/13] Remove comment

---
 spacy/lang/ky/stop_words.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py
index 1f59539fe..eede62767 100644
--- a/spacy/lang/ky/stop_words.py
+++ b/spacy/lang/ky/stop_words.py
@@ -1,8 +1,6 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-# Tatar stopwords are from https://github.com/aliiae/stopwords-tt
-
 STOP_WORDS = set(
 """
 ага адам айтты айтымында айтып ал алар

From e30bbf5432c86352c9ae0e7f9b5329ac6ba39620 Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Sat, 23 Jan 2021 21:49:08 +0600
Subject: [PATCH 06/13] Add examples

---
 spacy/lang/ky/examples.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 spacy/lang/ky/examples.py

diff --git a/spacy/lang/ky/examples.py b/spacy/lang/ky/examples.py
new file mode 100644
index 000000000..f1f31e3ab
--- /dev/null
+++ b/spacy/lang/ky/examples.py
@@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.ky.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+sentences = [
+    "Apple Улуу Британия стартабын $1 миллиардга сатып алууну көздөөдө.",
+    "Автоном автомобилдерди камсыздоо жоопкерчилиги өндүрүүчүлөргө артылды.",
+    "Сан-Франциско тротуар менен жүрүүчү робот-курьерлерге тыю салууну караштырууда.",
+    "Лондон - Улуу Британияда жайгашкан ири шаар.",
+    "Кайдасың?",
+    "Франциянын президенти ким?",
+    "Америка Кошмо Штаттарынын борбор калаасы кайсы шаар?",
+    "Барак Обама качан төрөлгөн?",
+]

From fe3b5b8ff596117d39a9143f8d076a601e8016db Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Sat, 23 Jan 2021 21:53:41 +0600
Subject: [PATCH 07/13] Add kyrgyz to char_classes

---
 spacy/lang/char_classes.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 3fb0fb41e..d876d375a 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -207,6 +207,10 @@ _tatar_lower = r"әөүҗңһ"
 _tatar_upper = r"ӘӨҮҖҢҺ"
 _tatar = r"әөүҗңһӘӨҮҖҢҺ"
 
+_kyrgyz_lower = r"өңү"
+_kyrgyz_upper = r"ӨҢҮ"
+_kyrgyz = r"өңүӨҢҮ"
+
 _greek_lower = r"α-ωάέίόώήύ"
 _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ"
 _greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ"
@@ -219,8 +223,8 @@ _macedonian_lower = r"ѓѕјљњќѐѝ"
 _macedonian_upper = r"ЃЅЈЉЊЌЀЍ"
 _macedonian = r"ѓѕјљњќѐѝЃЅЈЉЊЌЀЍ"
 
-_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper + _macedonian_upper
-_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower + _macedonian_lower
+_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _kyrgyz_upper + _greek_upper + _ukrainian_upper + _macedonian_upper
+_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _kyrgyz_lower + _greek_lower + _ukrainian_lower + _macedonian_lower
 
 _uncased = (
     _ethiopic
@@ -236,7 +240,7 @@ _uncased = (
     + _cjk
 )
 
-ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _macedonian + _uncased)
+ALPHA = group_chars(LATIN + _russian + _tatar + _kyrgyz + _greek + _ukrainian + _macedonian + _uncased)
 ALPHA_LOWER = group_chars(_lower + _uncased)
 ALPHA_UPPER = group_chars(_upper + _uncased)
 

From 2a2646362be11ee9122328353d46f24277a6b1b5 Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Sat, 23 Jan 2021 22:00:50 +0600
Subject: [PATCH 08/13] Fix language subclass

---
 spacy/lang/ky/__init__.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py
index 3655e6264..4656cfeb9 100644
--- a/spacy/lang/ky/__init__.py
+++ b/spacy/lang/ky/__init__.py
@@ -11,9 +11,9 @@ from ...language import Language
 from ...util import update_exc
 
 
-class TatarDefaults(Language.Defaults):
+class KyrgyzDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-    lex_attr_getters[LANG] = lambda text: "tt"
+    lex_attr_getters[LANG] = lambda text: "ky"
 
     lex_attr_getters.update(LEX_ATTRS)
 
@@ -23,9 +23,9 @@ class TatarDefaults(Language.Defaults):
     stop_words = STOP_WORDS
 
 
-class Tatar(Language):
-    lang = "tt"
-    Defaults = TatarDefaults
+class Kyrgyz(Language):
+    lang = "ky"
+    Defaults = KyrgyzDefaults
 
 
-__all__ = ["Tatar"]
+__all__ = ["Kyrgyz"]

From 53abf759ad035ad64d4cfb1f0ae3ced1a6e00522 Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Sun, 24 Jan 2021 20:54:22 +0600
Subject: [PATCH 09/13] Fix punctuation

---
 spacy/lang/ky/punctuation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py
index 9ee66a59e..22c2061ca 100644
--- a/spacy/lang/ky/punctuation.py
+++ b/spacy/lang/ky/punctuation.py
@@ -16,6 +16,7 @@ _infixes = (
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
         r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash),
+        r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA),
         r"(?<=[0-9])-(?=[0-9])",
     ]
 )

From 87168eb81f679ba17b7ddac9fb934b058c70a40c Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Sun, 24 Jan 2021 20:56:16 +0600
Subject: [PATCH 10/13] Add tests

---
 spacy/tests/conftest.py               |  5 ++
 spacy/tests/lang/ky/__init__.py       |  0
 spacy/tests/lang/ky/test_tokenizer.py | 91 +++++++++++++++++++++++++++
 3 files changed, 96 insertions(+)
 create mode 100644 spacy/tests/lang/ky/__init__.py
 create mode 100644 spacy/tests/lang/ky/test_tokenizer.py

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 90a18925b..ad545bcfd 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -262,6 +262,11 @@ def tt_tokenizer():
     return get_lang_class("tt").Defaults.create_tokenizer()
 
 
+@pytest.fixture(scope="session")
+def ky_tokenizer():
+    return get_lang_class("ky").Defaults.create_tokenizer()
+
+
 @pytest.fixture(scope="session")
 def uk_tokenizer():
     pytest.importorskip("pymorphy2")
diff --git a/spacy/tests/lang/ky/__init__.py b/spacy/tests/lang/ky/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py
new file mode 100644
index 000000000..99dab2b16
--- /dev/null
+++ b/spacy/tests/lang/ky/test_tokenizer.py
@@ -0,0 +1,91 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+
+INFIX_HYPHEN_TESTS = [
+    ("Бала-чака жакшыбы?", "Бала-чака жакшыбы ?".split()),
+    ("Кыз-келиндер кийими.", "Кыз-келиндер кийими .".split()),
+]
+
+PUNC_INSIDE_WORDS_TESTS = [
+    (
+        "Пассажир саны - 2,13 млн — киши/күнүнө (2010), 783,9 млн. киши/жылына.",
+        "Пассажир саны - 2,13 млн — киши / күнүнө ( 2010 ) ,"
+        " 783,9 млн. киши / жылына .".split(),
+    ),
+    ('То"кой', 'То " кой'.split()),
+]
+
+MIXED_ORDINAL_NUMS_TESTS = [
+    ("Эртең 22-январь...", "Эртең 22 - январь ...".split())
+]
+
+ABBREV_TESTS = [
+    ("Маселе б-ча эртең келет", "Маселе б-ча эртең келет".split()),
+    ("Ахунбаев көч. турат.", "Ахунбаев көч. турат .".split()),
+    ("«3-жылы (б.з.ч.) туулган", "« 3 - жылы ( б.з.ч. ) туулган".split()),
+    ("Жүгөрү ж.б. дандар колдонулат", "Жүгөрү ж.б. дандар колдонулат".split()),
+    ("3-4 кк. курулган.", "3 - 4 кк. курулган .".split()),
+]
+
+NAME_ABBREV_TESTS = [
+    ("М.Жумаш", "М.Жумаш".split()),
+    ("М.жумаш", "М.жумаш".split()),
+    ("м.Жумаш", "м . Жумаш".split()),
+    ("Жумаш М.Н.", "Жумаш М.Н.".split()),
+    ("Жумаш.", "Жумаш .".split()),
+]
+
+TYPOS_IN_PUNC_TESTS = [
+    ("«3-жылда , туулган", "« 3 - жылда , туулган".split()),
+    ("«3-жылда,туулган", "« 3 - жылда , туулган".split()),
+    ("«3-жылда,туулган.", "« 3 - жылда , туулган .".split()),
+    ("Ал иштейт(качан?)", "Ал иштейт ( качан ? )".split()),
+    ("Ал (качан?)иштейт", "Ал ( качан ?) иштейт".split()),  # "?)" stays one token here (not "? )")
+]
+
+LONG_TEXTS_TESTS = [
+    (
+        "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар "
+        "азыраак: ал бир топ кымбат жана логистика маселесинин айынан "
+        "кыйла татаал. Мисалы, январдагы майрамдарда Мароккого үчүнчү "
+        "категориядагы маршрутка (100 чакырымдан кем эмес) барып "
+        "келгенге аракет кылдык.",
+        "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар "
+        "азыраак : ал бир топ кымбат жана логистика маселесинин айынан "
+        "кыйла татаал . Мисалы , январдагы майрамдарда Мароккого үчүнчү "
+        "категориядагы маршрутка ( 100 чакырымдан кем эмес ) барып "
+        "келгенге аракет кылдык .".split(),
+    )
+]
+
+TESTCASES = (
+    INFIX_HYPHEN_TESTS
+    + PUNC_INSIDE_WORDS_TESTS
+    + MIXED_ORDINAL_NUMS_TESTS
+    + ABBREV_TESTS
+    + NAME_ABBREV_TESTS
+    + LONG_TEXTS_TESTS
+    + TYPOS_IN_PUNC_TESTS
+)
+
+NORM_TESTCASES = [
+    (
+        "ит, мышык ж.б.у.с. үй жаныбарлары.",
+        ["ит", ",", "мышык", "жана башка ушул сыяктуу", "үй", "жаныбарлары", "."],
+    )
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
+def test_ky_tokenizer_handles_testcases(ky_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ky_tokenizer(text) if not token.is_space]
+    assert expected_tokens == tokens
+
+
+@pytest.mark.parametrize("text,norms", NORM_TESTCASES)
+def test_ky_tokenizer_handles_norm_exceptions(ky_tokenizer, text, norms):
+    tokens = ky_tokenizer(text)
+    assert [token.norm_ for token in tokens] == norms

From 79327197d133b106d2f524d172705842043c9f0a Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Mon, 25 Jan 2021 00:34:12 +0600
Subject: [PATCH 11/13] Add contributor agreement

---
 .github/contributors/jumasheff.md | 106 ++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 .github/contributors/jumasheff.md

diff --git a/.github/contributors/jumasheff.md b/.github/contributors/jumasheff.md
new file mode 100644
index 000000000..1ce6d2341
--- /dev/null
+++ b/.github/contributors/jumasheff.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statement below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Murat Jumashev       |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 25.01.2021           |
+| GitHub username                | jumasheff            |
+| Website (optional)             |                      |

From 7d0154a36e180a6ff01059d57b62d186f2fd4458 Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Mon, 25 Jan 2021 00:42:04 +0600
Subject: [PATCH 12/13] Added language meta data

---
 website/meta/languages.json | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/website/meta/languages.json b/website/meta/languages.json
index 4975a1a1e..02a8eb123 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -151,6 +151,12 @@
         { "code": "fa", "name": "Persian", "has_examples": true },
         { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
         { "code": "tt", "name": "Tatar", "has_examples": true },
+        {
+            "code": "ky",
+            "name": "Kyrgyz",
+            "example": "Адамга эң кыйыны — күн сайын адам болуу",
+            "has_examples": true
+        },
         { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
         { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true },
         { "code": "ga", "name": "Irish" },

From 2b19ebad59c37f97a374b8ea7eec127889ef4709 Mon Sep 17 00:00:00 2001
From: muratjumashev <jumasheff@gmail.com>
Date: Mon, 25 Jan 2021 00:46:45 +0600
Subject: [PATCH 13/13] Remove Kyrgyz chars fr. char_classes since Tatar ones
 already cover

---
 spacy/lang/char_classes.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index d876d375a..3fb0fb41e 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -207,10 +207,6 @@ _tatar_lower = r"әөүҗңһ"
 _tatar_upper = r"ӘӨҮҖҢҺ"
 _tatar = r"әөүҗңһӘӨҮҖҢҺ"
 
-_kyrgyz_lower = r"өңү"
-_kyrgyz_upper = r"ӨҢҮ"
-_kyrgyz = r"өңүӨҢҮ"
-
 _greek_lower = r"α-ωάέίόώήύ"
 _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ"
 _greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ"
@@ -223,8 +219,8 @@ _macedonian_lower = r"ѓѕјљњќѐѝ"
 _macedonian_upper = r"ЃЅЈЉЊЌЀЍ"
 _macedonian = r"ѓѕјљњќѐѝЃЅЈЉЊЌЀЍ"
 
-_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _kyrgyz_upper + _greek_upper + _ukrainian_upper + _macedonian_upper
-_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _kyrgyz_lower + _greek_lower + _ukrainian_lower + _macedonian_lower
+_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper + _macedonian_upper
+_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower + _macedonian_lower
 
 _uncased = (
     _ethiopic
@@ -240,7 +236,7 @@ _uncased = (
     + _cjk
 )
 
-ALPHA = group_chars(LATIN + _russian + _tatar + _kyrgyz + _greek + _ukrainian + _macedonian + _uncased)
+ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _macedonian + _uncased)
 ALPHA_LOWER = group_chars(_lower + _uncased)
 ALPHA_UPPER = group_chars(_upper + _uncased)