From 28d06ab860414e14b99fffc6d12d8928139a892c Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Fri, 22 Jan 2021 23:08:41 +0600 Subject: [PATCH 01/13] Add tokenizer_exceptions --- spacy/lang/ky/__init__.py | 31 +++++++++++++++ spacy/lang/ky/tokenizer_exceptions.py | 55 +++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) create mode 100644 spacy/lang/ky/__init__.py create mode 100644 spacy/lang/ky/tokenizer_exceptions.py diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py new file mode 100644 index 000000000..3655e6264 --- /dev/null +++ b/spacy/lang/ky/__init__.py @@ -0,0 +1,31 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ...attrs import LANG +from ...language import Language +from ...util import update_exc + + +class TatarDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: "tt" + + lex_attr_getters.update(LEX_ATTRS) + + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + infixes = tuple(TOKENIZER_INFIXES) + + stop_words = STOP_WORDS + + +class Tatar(Language): + lang = "tt" + Defaults = TatarDefaults + + +__all__ = ["Tatar"] diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py new file mode 100644 index 000000000..be5e9530c --- /dev/null +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, LEMMA, NORM + +_exc = {} + +_abbrev_exc = [ + # Weekdays abbreviations + {ORTH: "дүй", LEMMA: "дүйшөмбү"}, + {ORTH: "шей", LEMMA: "шейшемби"}, + {ORTH: "шар", LEMMA: "шаршемби"}, + {ORTH: "бей", LEMMA: "бейшемби"}, + {ORTH: "жум", LEMMA: "жума"}, + {ORTH: "ишм", LEMMA: "ишемби"}, + 
{ORTH: "жек", LEMMA: "жекшемби"}, + # Months abbreviations + {ORTH: "янв", LEMMA: "январь"}, + {ORTH: "фев", LEMMA: "февраль"}, + {ORTH: "мар", LEMMA: "март"}, + {ORTH: "апр", LEMMA: "апрель"}, + {ORTH: "июн", LEMMA: "июнь"}, + {ORTH: "июл", LEMMA: "июль"}, + {ORTH: "авг", LEMMA: "август"}, + {ORTH: "сен", LEMMA: "сентябрь"}, + {ORTH: "окт", LEMMA: "октябрь"}, + {ORTH: "ноя", LEMMA: "ноябрь"}, + {ORTH: "дек", LEMMA: "декабрь"}, + # Number abbreviations + {ORTH: "млрд", LEMMA: "миллиард"}, + {ORTH: "млн", LEMMA: "миллион"}, +] + +for abbr in _abbrev_exc: + for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()): + _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + +for exc_data in [ # "etc." abbreviations + {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"}, + {ORTH: "ж.б.", NORM: "жана башка"}, + {ORTH: "ж.", NORM: "жыл"}, + {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"}, + {ORTH: "б.з.", NORM: "биздин заман"}, + {ORTH: "кк.", NORM: "кылымдар"}, + {ORTH: "жж.", NORM: "жылдар"}, + {ORTH: "к.", NORM: "кылым"}, + {ORTH: "көч.", NORM: "көчөсү"}, + {ORTH: "м-н", NORM: "менен"}, + {ORTH: "б-ча", NORM: "боюнча"}, +]: + exc_data[LEMMA] = exc_data[NORM] + _exc[exc_data[ORTH]] = [exc_data] + +TOKENIZER_EXCEPTIONS = _exc From 101d265778633f5f4cbe15013ab8c5cc3c9f3789 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:25:28 +0600 Subject: [PATCH 02/13] Add stopwords --- spacy/lang/ky/stop_words.py | 47 +++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 spacy/lang/ky/stop_words.py diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py new file mode 100644 index 000000000..1f59539fe --- /dev/null +++ b/spacy/lang/ky/stop_words.py @@ -0,0 +1,47 @@ +# encoding: utf8 +from __future__ import unicode_literals + +# Tatar stopwords are from https://github.com/aliiae/stopwords-tt + +STOP_WORDS = set( 
+""" +ага адам айтты айтымында айтып ал алар +алардын алган алуу алып анда андан аны +анын ар + +бар басма баш башка башкы башчысы берген +биз билдирген билдирди бир биринчи бирок +бишкек болгон болот болсо болуп боюнча +буга бул + +гана + +да дагы деген деди деп + +жана жатат жаткан жаңы же жогорку жок жол +жолу + +кабыл калган кандай карата каршы катары +келген керек кийин кол кылмыш кыргыз +күнү көп + +маалымат мамлекеттик мен менен миң +мурдагы мыйзам мындай мүмкүн + +ошол ошондой + +сүрөт сөз + +тарабынан турган тууралуу + +укук учурда + +чейин чек + +экенин эки эл эле эмес эми эч + +үч үчүн + +өз +""".split() +) From 4418ec2eeedb0889968127ac4c0d9a1a0439723b Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:31:31 +0600 Subject: [PATCH 03/13] Add punctuation --- spacy/lang/ky/punctuation.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 spacy/lang/ky/punctuation.py diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py new file mode 100644 index 000000000..9ee66a59e --- /dev/null +++ b/spacy/lang/ky/punctuation.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS + +_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "") +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?/()]+(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}{q}])[:<>=](?=[{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash), + r"(?<=[0-9])-(?=[0-9])", + ] +) + +TOKENIZER_INFIXES = _infixes From d53724ba1d6a22b3f25fad118dce14e3495040e5 Mon Sep 17 00:00:00 
2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:35:25 +0600 Subject: [PATCH 04/13] Add lex_attrs --- spacy/lang/ky/lex_attrs.py | 51 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 spacy/lang/ky/lex_attrs.py diff --git a/spacy/lang/ky/lex_attrs.py b/spacy/lang/ky/lex_attrs.py new file mode 100644 index 000000000..af926b138 --- /dev/null +++ b/spacy/lang/ky/lex_attrs.py @@ -0,0 +1,51 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +_num_words = [ + "нөл", + "ноль", + "бир", + "эки", + "үч", + "төрт", + "беш", + "алты", + "жети", + "сегиз", + "тогуз", + "он", + "жыйырма", + "отуз", + "кырк", + "элүү", + "алтымыш", + "жетмиш", + "сексен", + "токсон", + "жүз", + "миң", + "миллион", + "миллиард", + "триллион", + "триллиард", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} From 2f385385a95f9a6ce22dc8489a95fcd58b853fc2 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:36:28 +0600 Subject: [PATCH 05/13] Remove comment --- spacy/lang/ky/stop_words.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py index 1f59539fe..eede62767 100644 --- a/spacy/lang/ky/stop_words.py +++ b/spacy/lang/ky/stop_words.py @@ -1,8 +1,6 @@ # encoding: utf8 from __future__ import unicode_literals -# Tatar stopwords are from https://github.com/aliiae/stopwords-tt - STOP_WORDS = set( """ ага адам айтты айтымында айтып ал алар From e30bbf5432c86352c9ae0e7f9b5329ac6ba39620 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:49:08 +0600 Subject: [PATCH 06/13] Add examples --- 
spacy/lang/ky/examples.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 spacy/lang/ky/examples.py diff --git a/spacy/lang/ky/examples.py b/spacy/lang/ky/examples.py new file mode 100644 index 000000000..f1f31e3ab --- /dev/null +++ b/spacy/lang/ky/examples.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + +""" +Example sentences to test spaCy and its language models. +>>> from spacy.lang.ky.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "Apple Улуу Британия стартабын $1 миллиардга сатып алууну көздөөдө.", + "Автоном автомобилдерди камсыздоо жоопкерчилиги өндүрүүчүлөргө артылды.", + "Сан-Франциско тротуар менен жүрүүчү робот-курьерлерге тыю салууну караштырууда.", + "Лондон - Улуу Британияда жайгашкан ири шаар.", + "Кайдасың?", + "Франциянын президенти ким?", + "Америка Кошмо Штаттарынын борбор калаасы кайсы шаар?", + "Барак Обама качан төрөлгөн?", +] From fe3b5b8ff596117d39a9143f8d076a601e8016db Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 21:53:41 +0600 Subject: [PATCH 07/13] Add kyrgyz to char_classes --- spacy/lang/char_classes.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 3fb0fb41e..d876d375a 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -207,6 +207,10 @@ _tatar_lower = r"әөүҗңһ" _tatar_upper = r"ӘӨҮҖҢҺ" _tatar = r"әөүҗңһӘӨҮҖҢҺ" +_kyrgyz_lower = r"өңү" +_kyrgyz_upper = r"ӨҢҮ" +_kyrgyz = r"өңүӨҢҮ" + _greek_lower = r"α-ωάέίόώήύ" _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ" _greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ" @@ -219,8 +223,8 @@ _macedonian_lower = r"ѓѕјљњќѐѝ" _macedonian_upper = r"ЃЅЈЉЊЌЀЍ" _macedonian = r"ѓѕјљњќѐѝЃЅЈЉЊЌЀЍ" -_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper + _macedonian_upper -_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower + _macedonian_lower +_upper 
= LATIN_UPPER + _russian_upper + _tatar_upper + _kyrgyz_upper + _greek_upper + _ukrainian_upper + _macedonian_upper +_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _kyrgyz_lower + _greek_lower + _ukrainian_lower + _macedonian_lower _uncased = ( _ethiopic @@ -236,7 +240,7 @@ _uncased = ( + _cjk ) -ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _macedonian + _uncased) +ALPHA = group_chars(LATIN + _russian + _tatar + _kyrgyz + _greek + _ukrainian + _macedonian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased) From 2a2646362be11ee9122328353d46f24277a6b1b5 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sat, 23 Jan 2021 22:00:50 +0600 Subject: [PATCH 08/13] Fix language subclass --- spacy/lang/ky/__init__.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py index 3655e6264..4656cfeb9 100644 --- a/spacy/lang/ky/__init__.py +++ b/spacy/lang/ky/__init__.py @@ -11,9 +11,9 @@ from ...language import Language from ...util import update_exc -class TatarDefaults(Language.Defaults): +class KyrgyzDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: "tt" + lex_attr_getters[LANG] = lambda text: "ky" lex_attr_getters.update(LEX_ATTRS) @@ -23,9 +23,9 @@ class TatarDefaults(Language.Defaults): stop_words = STOP_WORDS -class Tatar(Language): - lang = "tt" - Defaults = TatarDefaults +class Kyrgyz(Language): + lang = "ky" + Defaults = KyrgyzDefaults -__all__ = ["Tatar"] +__all__ = ["Kyrgyz"] From 53abf759ad035ad64d4cfb1f0ae3ced1a6e00522 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sun, 24 Jan 2021 20:54:22 +0600 Subject: [PATCH 09/13] Fix punctuation --- spacy/lang/ky/punctuation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py index 9ee66a59e..22c2061ca 100644 --- 
a/spacy/lang/ky/punctuation.py +++ b/spacy/lang/ky/punctuation.py @@ -16,6 +16,7 @@ _infixes = ( r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash), + r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA), r"(?<=[0-9])-(?=[0-9])", ] ) From 87168eb81f679ba17b7ddac9fb934b058c70a40c Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Sun, 24 Jan 2021 20:56:16 +0600 Subject: [PATCH 10/13] Add tests --- spacy/tests/conftest.py | 5 ++ spacy/tests/lang/ky/__init__.py | 0 spacy/tests/lang/ky/test_tokenizer.py | 91 +++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) create mode 100644 spacy/tests/lang/ky/__init__.py create mode 100644 spacy/tests/lang/ky/test_tokenizer.py diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 90a18925b..ad545bcfd 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -262,6 +262,11 @@ def tt_tokenizer(): return get_lang_class("tt").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ky_tokenizer(): + return get_lang_class("ky").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def uk_tokenizer(): pytest.importorskip("pymorphy2") diff --git a/spacy/tests/lang/ky/__init__.py b/spacy/tests/lang/ky/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py new file mode 100644 index 000000000..99dab2b16 --- /dev/null +++ b/spacy/tests/lang/ky/test_tokenizer.py @@ -0,0 +1,91 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +INFIX_HYPHEN_TESTS = [ + ("Бала-чака жакшыбы?", "Бала-чака жакшыбы ?".split()), + ("Кыз-келиндер кийими.", "Кыз-келиндер кийими .".split()), +] + +PUNC_INSIDE_WORDS_TESTS = [ + ( + "Пассажир саны - 2,13 млн — киши/күнүнө (2010), 783,9 млн. киши/жылына.", + "Пассажир саны - 2,13 млн — киши / күнүнө ( 2010 ) ," + " 783,9 млн. 
киши / жылына .".split(), + ), + ('То"кой', 'То " кой'.split()), +] + +MIXED_ORDINAL_NUMS_TESTS = [ + ("Эртең 22-январь...", "Эртең 22 - январь ...".split()) +] + +ABBREV_TESTS = [ + ("Маселе б-ча эртең келет", "Маселе б-ча эртең келет".split()), + ("Ахунбаев көч. турат.", "Ахунбаев көч. турат .".split()), + ("«3-жылы (б.з.ч.) туулган", "« 3 - жылы ( б.з.ч. ) туулган".split()), + ("Жүгөрү ж.б. дандар колдонулат", "Жүгөрү ж.б. дандар колдонулат".split()), + ("3-4 кк. курулган.", "3 - 4 кк. курулган .".split()), +] + +NAME_ABBREV_TESTS = [ + ("М.Жумаш", "М.Жумаш".split()), + ("М.жумаш", "М.жумаш".split()), + ("м.Жумаш", "м . Жумаш".split()), + ("Жумаш М.Н.", "Жумаш М.Н.".split()), + ("Жумаш.", "Жумаш .".split()), +] + +TYPOS_IN_PUNC_TESTS = [ + ("«3-жылда , туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган.", "« 3 - жылда , туулган .".split()), + ("Ал иштейт(качан?)", "Ал иштейт ( качан ? )".split()), + ("Ал (качан?)иштейт", "Ал ( качан ?) иштейт".split()), # "?)" => "?)" or "? )" +] + +LONG_TEXTS_TESTS = [ + ( + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак: ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал. Мисалы, январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка (100 чакырымдан кем эмес) барып " + "келгенге аракет кылдык.", + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак : ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал . Мисалы , январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка ( 100 чакырымдан кем эмес ) барып " + "келгенге аракет кылдык .".split(), + ) +] + +TESTCASES = ( + INFIX_HYPHEN_TESTS + + PUNC_INSIDE_WORDS_TESTS + + MIXED_ORDINAL_NUMS_TESTS + + ABBREV_TESTS + + NAME_ABBREV_TESTS + + LONG_TEXTS_TESTS + + TYPOS_IN_PUNC_TESTS +) + +NORM_TESTCASES = [ + ( + "ит, мышык ж.б.у.с. 
үй жаныбарлары.", + ["ит", ",", "мышык", "жана башка ушул сыяктуу", "үй", "жаныбарлары", "."], + ) +] + + +@pytest.mark.parametrize("text,expected_tokens", TESTCASES) +def test_ky_tokenizer_handles_testcases(ky_tokenizer, text, expected_tokens): + tokens = [token.text for token in ky_tokenizer(text) if not token.is_space] + assert expected_tokens == tokens + + +@pytest.mark.parametrize("text,norms", NORM_TESTCASES) +def test_ky_tokenizer_handles_norm_exceptions(ky_tokenizer, text, norms): + tokens = ky_tokenizer(text) + assert [token.norm_ for token in tokens] == norms From 79327197d133b106d2f524d172705842043c9f0a Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Mon, 25 Jan 2021 00:34:12 +0600 Subject: [PATCH 11/13] Add contributor agreement --- .github/contributors/jumasheff.md | 106 ++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/jumasheff.md diff --git a/.github/contributors/jumasheff.md b/.github/contributors/jumasheff.md new file mode 100644 index 000000000..1ce6d2341 --- /dev/null +++ b/.github/contributors/jumasheff.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. 
For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Murat Jumashev | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 25.01.2021 | +| GitHub username | jumasheff | +| Website (optional) | | From 7d0154a36e180a6ff01059d57b62d186f2fd4458 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Mon, 25 Jan 2021 00:42:04 +0600 Subject: [PATCH 12/13] Added language meta data --- website/meta/languages.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/website/meta/languages.json b/website/meta/languages.json index 4975a1a1e..02a8eb123 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -151,6 +151,12 @@ { "code": "fa", "name": "Persian", "has_examples": true }, { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, { "code": "tt", "name": "Tatar", "has_examples": true }, + { + "code": "ky", + "name": "Kyrgyz", + "example": "Адамга эң кыйыны — күн сайын адам болуу", + "has_examples": true + }, { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true }, { "code": "ga", "name": "Irish" }, From 2b19ebad59c37f97a374b8ea7eec127889ef4709 Mon Sep 17 00:00:00 2001 From: muratjumashev Date: Mon, 25 Jan 2021 00:46:45 +0600 Subject: [PATCH 13/13] Remove Kyrgyz chars fr. 
char_classes since Tatar ones already cover --- spacy/lang/char_classes.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index d876d375a..3fb0fb41e 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -207,10 +207,6 @@ _tatar_lower = r"әөүҗңһ" _tatar_upper = r"ӘӨҮҖҢҺ" _tatar = r"әөүҗңһӘӨҮҖҢҺ" -_kyrgyz_lower = r"өңү" -_kyrgyz_upper = r"ӨҢҮ" -_kyrgyz = r"өңүӨҢҮ" - _greek_lower = r"α-ωάέίόώήύ" _greek_upper = r"Α-ΩΆΈΊΌΏΉΎ" _greek = r"α-ωάέίόώήύΑ-ΩΆΈΊΌΏΉΎ" @@ -223,8 +219,8 @@ _macedonian_lower = r"ѓѕјљњќѐѝ" _macedonian_upper = r"ЃЅЈЉЊЌЀЍ" _macedonian = r"ѓѕјљњќѐѝЃЅЈЉЊЌЀЍ" -_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _kyrgyz_upper + _greek_upper + _ukrainian_upper + _macedonian_upper -_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _kyrgyz_lower + _greek_lower + _ukrainian_lower + _macedonian_lower +_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper + _macedonian_upper +_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower + _macedonian_lower _uncased = ( _ethiopic @@ -240,7 +236,7 @@ _uncased = ( + _cjk ) -ALPHA = group_chars(LATIN + _russian + _tatar + _kyrgyz + _greek + _ukrainian + _macedonian + _uncased) +ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _macedonian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_UPPER = group_chars(_upper + _uncased)