Merge pull request #6802 from jumasheff/add-ky

2021-01-27 13:02:54 +11:00 · 2021-01-27 13:02:54 +11:00 · 560b7acece
parent 5ace559201 2b19ebad59
commit 560b7acece
11 changed files with 433 additions and 0 deletions
--- a/.github/contributors/jumasheff.md
+++ b/.github/contributors/jumasheff.md
@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+    * you hereby assign to us joint ownership, and to the extent that such
+    assignment is or becomes invalid, ineffective or unenforceable, you hereby
+    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+    royalty-free, unrestricted license to exercise all rights under those
+    copyrights. This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+    * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+    * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+    * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+    * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+    * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+    * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+    * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+    * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+    * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+    * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+    * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Murat Jumashev       |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 25.01.2021           |
+| GitHub username                | jumasheff            |
+| Website (optional)             |                      |
--- a/spacy/lang/ky/__init__.py
+++ b/spacy/lang/ky/__init__.py
@ -0,0 +1,31 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ...attrs import LANG
+from ...language import Language
+from ...util import update_exc
+
+
+class KyrgyzDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: "ky"
+
+    lex_attr_getters.update(LEX_ATTRS)
+
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    infixes = tuple(TOKENIZER_INFIXES)
+
+    stop_words = STOP_WORDS
+
+
+class Kyrgyz(Language):
+    lang = "ky"
+    Defaults = KyrgyzDefaults
+
+
+__all__ = ["Kyrgyz"]
--- a/spacy/lang/ky/examples.py
+++ b/spacy/lang/ky/examples.py
@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.ky.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+# Sample Kyrgyz sentences; import them as shown in the usage string above.
+sentences = [
+    "Apple Улуу Британия стартабын $1 миллиардга сатып алууну көздөөдө.",
+    "Автоном автомобилдерди камсыздоо жоопкерчилиги өндүрүүчүлөргө артылды.",
+    "Сан-Франциско тротуар менен жүрүүчү робот-курьерлерге тыю салууну караштырууда.",
+    "Лондон - Улуу Британияда жайгашкан ири шаар.",
+    "Кайдасың?",
+    "Франциянын президенти ким?",
+    "Америка Кошмо Штаттарынын борбор калаасы кайсы шаар?",
+    "Барак Обама качан төрөлгөн?",
+]
--- a/spacy/lang/ky/lex_attrs.py
+++ b/spacy/lang/ky/lex_attrs.py
@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+# Number words that like_num() accepts through a membership test on this list.
+# All entries are written in lowercase.
+_num_words = [
+    "нөл",
+    "ноль",
+    "бир",
+    "эки",
+    "үч",
+    "төрт",
+    "беш",
+    "алты",
+    "жети",
+    "сегиз",
+    "тогуз",
+    "он",
+    "жыйырма",
+    "отуз",
+    "кырк",
+    "элүү",
+    "алтымыш",
+    "жетмиш",
+    "сексен",
+    "токсон",
+    "жүз",
+    "миң",
+    "миллион",
+    "миллиард",
+    "триллион",
+    "триллиард",
+]
+
+
+def like_num(text):
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/ky/punctuation.py
+++ b/spacy/lang/ky/punctuation.py
@ -0,0 +1,24 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+
+# Hyphen alternation without the plain "-": remove "-" from HYPHENS, trim any
+# leading/trailing "|" and drop "||" sequences from the resulting pattern.
+_hyphens_no_dash = HYPHENS.replace("-", "").strip("|").replace("||", "")
+# Infix patterns: the shared ellipsis and icon lists plus the rules below.
+# Among the rules spelled out here, a single "-" is only an infix when it
+# follows a digit (last two patterns); between two ALPHA characters only "--"
+# and the alternatives in _hyphens_no_dash are matched.
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
+        r"(?<=[{a}])[,!?/()]+(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}{q}])[:<>=](?=[{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
+        r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=_hyphens_no_dash),
+        r"(?<=[0-9])-(?=[{a}])".format(a=ALPHA),
+        r"(?<=[0-9])-(?=[0-9])",
+    ]
+)
+
+# Public name under which the infix patterns are exported.
+TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/ky/stop_words.py
+++ b/spacy/lang/ky/stop_words.py
@ -0,0 +1,45 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+# Kyrgyz stop words. The string is split on whitespace, so the line breaks and
+# blank lines (one group per initial letter) are for readability only.
+STOP_WORDS = set(
+"""
+ага адам айтты айтымында айтып ал алар
+алардын алган алуу алып анда андан аны
+анын ар
+
+бар басма баш башка башкы башчысы берген
+биз билдирген билдирди бир биринчи бирок
+бишкек болгон болот болсо болуп боюнча
+буга бул
+
+гана
+
+да дагы деген деди деп
+
+жана жатат жаткан жаңы же жогорку жок жол
+жолу
+
+кабыл калган кандай карата каршы катары
+келген керек кийин кол кылмыш кыргыз
+күнү көп
+
+маалымат мамлекеттик мен менен миң
+мурдагы мыйзам мындай мүмкүн
+
+ошол ошондой
+
+сүрөт сөз
+
+тарабынан турган тууралуу
+
+укук учурда
+
+чейин чек
+
+экенин эки эл эле эмес эми эч
+
+үч үчүн
+
+өз
+""".split()
+)
--- a/spacy/lang/ky/tokenizer_exceptions.py
+++ b/spacy/lang/ky/tokenizer_exceptions.py
@ -0,0 +1,55 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import ORTH, LEMMA, NORM
+
+# Maps an exact token string to the list of attribute dicts describing it;
+# filled in by the loops further down in this module.
+_exc = {}
+
+# Abbreviations paired with the full word used as their lemma. Each entry is
+# later registered in several letter-case variants, with and without a
+# trailing period.
+_abbrev_exc = [
+    # Weekdays abbreviations
+    {ORTH: "дүй", LEMMA: "дүйшөмбү"},
+    {ORTH: "шей", LEMMA: "шейшемби"},
+    {ORTH: "шар", LEMMA: "шаршемби"},
+    {ORTH: "бей", LEMMA: "бейшемби"},
+    {ORTH: "жум", LEMMA: "жума"},
+    {ORTH: "ишм", LEMMA: "ишемби"},
+    {ORTH: "жек", LEMMA: "жекшемби"},
+    # Months abbreviations
+    {ORTH: "янв", LEMMA: "январь"},
+    {ORTH: "фев", LEMMA: "февраль"},
+    {ORTH: "мар", LEMMA: "март"},
+    {ORTH: "апр", LEMMA: "апрель"},
+    {ORTH: "июн", LEMMA: "июнь"},
+    {ORTH: "июл", LEMMA: "июль"},
+    {ORTH: "авг", LEMMA: "август"},
+    {ORTH: "сен", LEMMA: "сентябрь"},
+    {ORTH: "окт", LEMMA: "октябрь"},
+    {ORTH: "ноя", LEMMA: "ноябрь"},
+    {ORTH: "дек", LEMMA: "декабрь"},
+    # Number abbreviations
+    {ORTH: "млрд", LEMMA: "миллиард"},
+    {ORTH: "млн", LEMMA: "миллион"},
+]
+
+for abbr in _abbrev_exc:
+    for orth in (abbr[ORTH], abbr[ORTH].capitalize(), abbr[ORTH].upper()):
+        _exc[orth] = [{ORTH: orth, LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}]
+        _exc[orth + "."] = [{ORTH: orth + ".", LEMMA: abbr[LEMMA], NORM: abbr[LEMMA]}]
+
+for exc_data in [  # "etc." abbreviations
+    {ORTH: "ж.б.у.с.", NORM: "жана башка ушул сыяктуу"},
+    {ORTH: "ж.б.", NORM: "жана башка"},
+    {ORTH: "ж.", NORM: "жыл"},
+    {ORTH: "б.з.ч.", NORM: "биздин заманга чейин"},
+    {ORTH: "б.з.", NORM: "биздин заман"},
+    {ORTH: "кк.", NORM: "кылымдар"},
+    {ORTH: "жж.", NORM: "жылдар"},
+    {ORTH: "к.", NORM: "кылым"},
+    {ORTH: "көч.", NORM: "көчөсү"},
+    {ORTH: "м-н", NORM: "менен"},
+    {ORTH: "б-ча", NORM: "боюнча"},
+]:
+    exc_data[LEMMA] = exc_data[NORM]
+    _exc[exc_data[ORTH]] = [exc_data]
+
+TOKENIZER_EXCEPTIONS = _exc
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@ -262,6 +262,11 @@ def tt_tokenizer():
    return get_lang_class("tt").Defaults.create_tokenizer()


+@pytest.fixture(scope="session")
+def ky_tokenizer():
+    return get_lang_class("ky").Defaults.create_tokenizer()
+
+
@pytest.fixture(scope="session")
 def uk_tokenizer():
    pytest.importorskip("pymorphy2")
--- a/spacy/tests/lang/ky/__init__.py
+++ b/spacy/tests/lang/ky/__init__.py
--- a/spacy/tests/lang/ky/test_tokenizer.py
+++ b/spacy/tests/lang/ky/test_tokenizer.py
@ -0,0 +1,91 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+
+# Each case is (input text, expected token texts).
+
+# Hyphenated compounds stay one token; final punctuation is split off.
+INFIX_HYPHEN_TESTS = [
+    ("Бала-чака жакшыбы?", "Бала-чака жакшыбы ?".split()),
+    ("Кыз-келиндер кийими.", "Кыз-келиндер кийими .".split()),
+]
+
+# Slashes, brackets and quotes inside words are split off, while decimal
+# commas in numbers ("2,13") and the abbreviation "млн." are kept intact.
+PUNC_INSIDE_WORDS_TESTS = [
+    (
+        "Пассажир саны - 2,13 млн — киши/күнүнө (2010), 783,9 млн. киши/жылына.",
+        "Пассажир саны - 2,13 млн — киши / күнүнө ( 2010 ) ,"
+        " 783,9 млн. киши / жылына .".split(),
+    ),
+    ('То"кой', 'То " кой'.split()),
+]
+
+# A digit-hyphen-word sequence is split into three tokens.
+MIXED_ORDINAL_NUMS_TESTS = [
+    ("Эртең 22-январь...", "Эртең 22 - январь ...".split())
+]
+
+# Abbreviations registered as tokenizer exceptions stay single tokens.
+ABBREV_TESTS = [
+    ("Маселе б-ча эртең келет", "Маселе б-ча эртең келет".split()),
+    ("Ахунбаев көч. турат.", "Ахунбаев көч. турат .".split()),
+    ("«3-жылы (б.з.ч.) туулган", "« 3 - жылы ( б.з.ч. ) туулган".split()),
+    ("Жүгөрү ж.б. дандар колдонулат", "Жүгөрү ж.б. дандар колдонулат".split()),
+    ("3-4 кк. курулган.", "3 - 4 kk. курулган .".replace("kk", "кк").split()),
+]
+
+# Initials attached to names: only a lowercase letter followed by "." and an
+# uppercase letter is split; the other combinations stay together.
+NAME_ABBREV_TESTS = [
+    ("М.Жумаш", "М.Жумаш".split()),
+    ("М.жумаш", "М.жумаш".split()),
+    ("м.Жумаш", "м . Жумаш".split()),
+    ("Жумаш М.Н.", "Жумаш М.Н.".split()),
+    ("Жумаш.", "Жумаш .".split()),
+]
+
+# Missing or misplaced spaces around punctuation.
+TYPOS_IN_PUNC_TESTS = [
+    ("«3-жылда , туулган", "« 3 - жылда , туулган".split()),
+    ("«3-жылда,туулган", "« 3 - жылда , туулган".split()),
+    ("«3-жылда,туулган.", "« 3 - жылда , туулган .".split()),
+    ("Ал иштейт(качан?)", "Ал иштейт ( качан ? )".split()),
+    # Here "?)" is expected as ONE token; splitting it into "?" and ")" would
+    # be the alternative outcome.
+    ("Ал (качан?)иштейт", "Ал ( качан ?) иштейт".split()),
+]
+
+# A longer multi-sentence passage.
+LONG_TEXTS_TESTS = [
+    (
+        "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар "
+        "азыраак: ал бир топ кымбат жана логистика маселесинин айынан "
+        "кыйла татаал. Мисалы, январдагы майрамдарда Мароккого үчүнчү "
+        "категориядагы маршрутка (100 чакырымдан кем эмес) барып "
+        "келгенге аракет кылдык.",
+        "Алыскы өлкөлөргө аздыр-көптүр татаалыраак жүрүштөргө чыккандар "
+        "азыраак : ал бир топ кымбат жана логистика маселесинин айынан "
+        "кыйла татаал . Мисалы , январдагы майрамдарда Мароккого үчүнчү "
+        "категориядагы маршрутка ( 100 чакырымдан кем эмес ) барып "
+        "келгенге аракет кылдык .".split(),
+    )
+]
+
+# All token-text cases, fed to the parametrized tokenizer test.
+TESTCASES = (
+    INFIX_HYPHEN_TESTS
+    + PUNC_INSIDE_WORDS_TESTS
+    + MIXED_ORDINAL_NUMS_TESTS
+    + ABBREV_TESTS
+    + NAME_ABBREV_TESTS
+    + LONG_TEXTS_TESTS
+    + TYPOS_IN_PUNC_TESTS
+)
+
+# (input text, expected norm of every token): "ж.б.у.с." is expected to be
+# normalised to its expansion.
+NORM_TESTCASES = [
+    (
+        "ит, мышык ж.б.у.с. үй жаныбарлары.",
+        ["ит", ",", "мышык", "жана башка ушул сыяктуу", "үй", "жаныбарлары", "."],
+    )
+]
+
+
+@pytest.mark.parametrize("text,expected_tokens", TESTCASES)
+def test_ky_tokenizer_handles_testcases(ky_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ky_tokenizer(text) if not token.is_space]
+    assert expected_tokens == tokens
+
+
+@pytest.mark.parametrize("text,norms", NORM_TESTCASES)
+def test_ky_tokenizer_handles_norm_exceptions(ky_tokenizer, text, norms):
+    tokens = ky_tokenizer(text)
+    assert [token.norm_ for token in tokens] == norms
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@ -151,6 +151,12 @@
        { "code": "fa", "name": "Persian", "has_examples": true },
        { "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
        { "code": "tt", "name": "Tatar", "has_examples": true },
+        {
+            "code": "ky",
+            "name": "Kyrgyz",
+            "example": "Адамга эң кыйыны — күн сайын адам болуу",
+            "has_examples": true
+        },
        { "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
        { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true },
        { "code": "ga", "name": "Irish" },