diff --git a/.github/contributors/jumasheff.md b/.github/contributors/jumasheff.md
new file mode 100644
index 000000000..1ce6d2341
--- /dev/null
+++ b/.github/contributors/jumasheff.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Murat Jumashev       |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 25.01.2021           |
+| GitHub username                | jumasheff            |
+| Website (optional)             |                      |
diff --git a/spacy/lang/ky/__init__.py b/spacy/lang/ky/__init__.py
new file mode 100644
index 000000000..4656cfeb9
--- /dev/null
+++ b/spacy/lang/ky/__init__.py
@@ -0,0 +1,31 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...attrs import LANG
+from ...language import Language
+from ...util import update_exc
+
+
+class KyrgyzDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: "ky"
+
+    lex_attr_getters.update(LEX_ATTRS)
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    infixes = tuple(TOKENIZER_INFIXES)
+
+    stop_words = STOP_WORDS
+
+
+class Kyrgyz(Language):
+    lang = "ky"
+    Defaults = KyrgyzDefaults
+
+
+__all__ = ["Kyrgyz"]
diff --git a/spacy/lang/ky/examples.py b/spacy/lang/ky/examples.py
new file mode 100644
index 000000000..f1f31e3ab
--- /dev/null
+++ b/spacy/lang/ky/examples.py
@@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.ky.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "Apple Улуу Британия стартабын $1 миллиардга сатып алууну көздөөдө.", + "Автоном автомобилдерди камсыздоо жоопкерчилиги өндүрүүчүлөргө артылды.", + "Сан-Франциско тротуар менен жүрүүчү робот-курьерлерге тыю салууну караштырууда.", + "Лондон - Улуу Британияда жайгашкан ири шаар.", + "Кайдасың?", + "Франциянын президенти ким?", + "Америка Кошмо Штаттарынын борбор калаасы кайсы шаар?", + "Барак Обама качан төрөлгөн?", +] diff --git a/spacy/lang/ky/lex_attrs.py b/spacy/lang/ky/lex_attrs.py new file mode 100644 index 000000000..af926b138 --- /dev/null +++ b/spacy/lang/ky/lex_attrs.py @@ -0,0 +1,51 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +_num_words = [ + "нөл", + "ноль", + "бир", + "эки", + "үч", + "төрт", + "беш", + "алты", + "жети", + "сегиз", + "тогуз", + "он", + "жыйырма", + "отуз", + "кырк", + "элүү", + "алтымыш", + "жетмиш", + "сексен", + "токсон", + "жүз", + "миң", + "миллион", + "миллиард", + "триллион", + "триллиард", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/ky/punctuation.py b/spacy/lang/ky/punctuation.py new file mode 100644 index 000000000..22c2061ca --- /dev/null +++ b/spacy/lang/ky/punctuation.py @@ -0,0 +1,24 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS + +_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "") +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), + r"(?<=[{a}])[,!?/()]+(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}{q}])[:<>=](?=[{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES), + r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash), + r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA), + r"(?<=[0-9])-(?=[0-9])", + ] +) + +TOKENIZER_INFIXES = _infixes diff --git a/spacy/lang/ky/stop_words.py b/spacy/lang/ky/stop_words.py new file mode 100644 index 000000000..eede62767 --- /dev/null +++ b/spacy/lang/ky/stop_words.py @@ -0,0 +1,45 @@ +# encoding: utf8 +from __future__ import unicode_literals + +STOP_WORDS = set( +""" +ага адам айтты айтымында айтып ал алар +алардын алган алуу алып анда андан аны +анын ар + +бар басма баш башка башкы башчысы берген +биз билдирген билдирди бир биринчи бирок +бишкек болгон болот болсо болуп боюнча +буга бул + +гана + +да дагы деген деди деп + +жана жатат жаткан жаңы же жогорку жок жол +жолу + +кабыл калган кандай карата каршы катары +келген керек кийин кол кылмыш кыргыз +күнү көп + +маалымат мамлекеттик мен менен миң +мурдагы мыйзам мындай мүмкүн + +ошол ошондой + +сүрөт сөз + +тарабынан турган тууралуу + +укук учурда + +чейин чек + +экенин эки эл эле эмес эми эч + +үч үчүн + +өз +""".split() +) diff --git a/spacy/lang/ky/tokenizer_exceptions.py b/spacy/lang/ky/tokenizer_exceptions.py new file mode 100644 index 000000000..be5e9530c --- 
/dev/null +++ b/spacy/lang/ky/tokenizer_exceptions.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, LEMMA, NORM + +_exc = {} + +_abbrev_exc = [ + # Weekdays abbreviations + {ORTH: "дүй", LEMMA: "дүйшөмбү"}, + {ORTH: "шей", LEMMA: "шейшемби"}, + {ORTH: "шар", LEMMA: "шаршемби"}, + {ORTH: "бей", LEMMA: "бейшемби"}, + {ORTH: "жум", LEMMA: "жума"}, + {ORTH: "ишм", LEMMA: "ишемби"}, + {ORTH: "жек", LEMMA: "жекшемби"}, + # Months abbreviations + {ORTH: "янв", LEMMA: "январь"}, + {ORTH: "фев", LEMMA: "февраль"}, + {ORTH: "мар", LEMMA: "март"}, + {ORTH: "апр", LEMMA: "апрель"}, + {ORTH: "июн", LEMMA: "июнь"}, + {ORTH: "июл", LEMMA: "июль"}, + {ORTH: "авг", LEMMA: "август"}, + {ORTH: "сен", LEMMA: "сентябрь"}, + {ORTH: "окт", LEMMA: "октябрь"}, + {ORTH: "ноя", LEMMA: "ноябрь"}, + {ORTH: "дек", LEMMA: "декабрь"}, + # Number abbreviations + {ORTH: "млрд", LEMMA: "миллиард"}, + {ORTH: "млн", LEMMA: "миллион"}, +] + +for abbr in _abbrev_exc: + for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()): + _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}] + +for exc_data in [ # "etc." abbreviations + {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"}, + {ORTH: "ж.б.", NORM: "жана башка"}, + {ORTH: "ж.", NORM: "жыл"}, + {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"}, + {ORTH: "б.з.", NORM: "биздин заман"}, + {ORTH: "кк.", NORM: "кылымдар"}, + {ORTH: "жж.", NORM: "жылдар"}, + {ORTH: "к.", NORM: "кылым"}, + {ORTH: "көч.", NORM: "көчөсү"}, + {ORTH: "м-н", NORM: "менен"}, + {ORTH: "б-ча", NORM: "боюнча"}, +]: + exc_data[LEMMA] = exc_data[NORM] + _exc[exc_data[ORTH]] = [exc_data] + +TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 90a18925b..ad545bcfd 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -262,6 +262,11 @@ def tt_tokenizer(): return get_lang_class("tt").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ky_tokenizer(): + return get_lang_class("ky").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def uk_tokenizer(): pytest.importorskip("pymorphy2") diff --git a/spacy/tests/lang/ky/__init__.py b/spacy/tests/lang/ky/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ky/test_tokenizer.py b/spacy/tests/lang/ky/test_tokenizer.py new file mode 100644 index 000000000..99dab2b16 --- /dev/null +++ b/spacy/tests/lang/ky/test_tokenizer.py @@ -0,0 +1,91 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +INFIX_HYPHEN_TESTS = [ + ("Бала-чака жакшыбы?", "Бала-чака жакшыбы ?".split()), + ("Кыз-келиндер кийими.", "Кыз-келиндер кийими .".split()), +] + +PUNC_INSIDE_WORDS_TESTS = [ + ( + "Пассажир саны - 2,13 млн — киши/күнүнө (2010), 783,9 млн. киши/жылына.", + "Пассажир саны - 2,13 млн — киши / күнүнө ( 2010 ) ," + " 783,9 млн. киши / жылына .".split(), + ), + ('То"кой', 'То " кой'.split()), +] + +MIXED_ORDINAL_NUMS_TESTS = [ + ("Эртең 22-январь...", "Эртең 22 - январь ...".split()) +] + +ABBREV_TESTS = [ + ("Маселе б-ча эртең келет", "Маселе б-ча эртең келет".split()), + ("Ахунбаев көч. турат.", "Ахунбаев көч. турат .".split()), + ("«3-жылы (б.з.ч.) туулган", "« 3 - жылы ( б.з.ч. ) туулган".split()), + ("Жүгөрү ж.б. дандар колдонулат", "Жүгөрү ж.б. дандар колдонулат".split()), + ("3-4 кк. курулган.", "3 - 4 кк. 
курулган .".split()), +] + +NAME_ABBREV_TESTS = [ + ("М.Жумаш", "М.Жумаш".split()), + ("М.жумаш", "М.жумаш".split()), + ("м.Жумаш", "м . Жумаш".split()), + ("Жумаш М.Н.", "Жумаш М.Н.".split()), + ("Жумаш.", "Жумаш .".split()), +] + +TYPOS_IN_PUNC_TESTS = [ + ("«3-жылда , туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган", "« 3 - жылда , туулган".split()), + ("«3-жылда,туулган.", "« 3 - жылда , туулган .".split()), + ("Ал иштейт(качан?)", "Ал иштейт ( качан ? )".split()), + ("Ал (качан?)иштейт", "Ал ( качан ?) иштейт".split()), # "?)" => "?)" or "? )" +] + +LONG_TEXTS_TESTS = [ + ( + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак: ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал. Мисалы, январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка (100 чакырымдан кем эмес) барып " + "келгенге аракет кылдык.", + "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар " + "азыраак : ал бир топ кымбат жана логистика маселесинин айынан " + "кыйла татаал . Мисалы , январдагы майрамдарда Мароккого үчүнчү " + "категориядагы маршрутка ( 100 чакырымдан кем эмес ) барып " + "келгенге аракет кылдык .".split(), + ) +] + +TESTCASES = ( + INFIX_HYPHEN_TESTS + + PUNC_INSIDE_WORDS_TESTS + + MIXED_ORDINAL_NUMS_TESTS + + ABBREV_TESTS + + NAME_ABBREV_TESTS + + LONG_TEXTS_TESTS + + TYPOS_IN_PUNC_TESTS +) + +NORM_TESTCASES = [ + ( + "ит, мышык ж.б.у.с. үй жаныбарлары.", + ["ит", ",", "мышык", "жана башка ушул сыяктуу", "үй", "жаныбарлары", "."], + ) +] + + +@pytest.mark.parametrize("text,expected_tokens", TESTCASES) +def test_ky_tokenizer_handles_testcases(ky_tokenizer, text, expected_tokens): + tokens = [token.text for token in ky_tokenizer(text) if not token.is_space] + assert expected_tokens == tokens + + +@pytest.mark.parametrize("text,norms", NORM_TESTCASES) +def test_ky_tokenizer_handles_norm_exceptions(ky_tokenizer, text, norms): + tokens = ky_tokenizer(text) + assert [token.norm_ for token in tokens] == norms diff --git a/website/meta/languages.json b/website/meta/languages.json index 4975a1a1e..02a8eb123 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -151,6 +151,12 @@ { "code": "fa", "name": "Persian", "has_examples": true }, { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true }, { "code": "tt", "name": "Tatar", "has_examples": true }, + { + "code": "ky", + "name": "Kyrgyz", + "example": "Адамга эң кыйыны — күн сайын адам болуу", + "has_examples": true + }, { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true }, { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true }, { "code": "ga", "name": "Irish" },