From c34420794acd4e3b656332c430a41252d33a9722 Mon Sep 17 00:00:00 2001 From: Rameshh <30867740+rameshhpathak@users.noreply.github.com> Date: Mon, 22 Jun 2020 14:10:46 +0545 Subject: [PATCH] Add Nepali Language (#5622) * added support for nepali lang * added examples and test files * added spacy contributor agreement --- .github/contributors/rameshhpathak.md | 106 ++++++ spacy/lang/ne/__init__.py | 23 ++ spacy/lang/ne/examples.py | 22 ++ spacy/lang/ne/lex_attrs.py | 98 +++++ spacy/lang/ne/stop_words.py | 498 ++++++++++++++++++++++++++ spacy/tests/conftest.py | 5 + spacy/tests/lang/ne/__init__.py | 0 spacy/tests/lang/ne/test_text.py | 19 + 8 files changed, 771 insertions(+) create mode 100644 .github/contributors/rameshhpathak.md create mode 100644 spacy/lang/ne/__init__.py create mode 100644 spacy/lang/ne/examples.py create mode 100644 spacy/lang/ne/lex_attrs.py create mode 100644 spacy/lang/ne/stop_words.py create mode 100644 spacy/tests/lang/ne/__init__.py create mode 100644 spacy/tests/lang/ne/test_text.py diff --git a/.github/contributors/rameshhpathak.md b/.github/contributors/rameshhpathak.md new file mode 100644 index 000000000..30a543307 --- /dev/null +++ b/.github/contributors/rameshhpathak.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ramesh Pathak | +| Company name (if applicable) | Diyo AI | +| Title or role (if applicable) | AI Engineer | +| Date | June 21, 2020 | +| GitHub username | rameshhpathak | +| Website (optional) |rameshhpathak.github.io| | diff --git a/spacy/lang/ne/__init__.py b/spacy/lang/ne/__init__.py new file mode 100644 index 000000000..21556277d --- /dev/null +++ b/spacy/lang/ne/__init__.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS + +from ...language import Language +from ...attrs import LANG + + +class NepaliDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters.update(LEX_ATTRS) + lex_attr_getters[LANG] = lambda text: "ne" # Nepali language ISO code + stop_words = STOP_WORDS + + +class Nepali(Language): + lang = "ne" + Defaults = NepaliDefaults + + +__all__ = ["Nepali"] diff --git a/spacy/lang/ne/examples.py b/spacy/lang/ne/examples.py new file mode 100644 index 000000000..b3c4f9e73 --- /dev/null +++ b/spacy/lang/ne/examples.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.ne.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "एप्पलले अमेरिकी स्टार्टअप १ अर्ब डलरमा किन्ने सोच्दै छ", + "स्वायत्त कारहरूले बीमा दायित्व निर्माताहरु तिर बदल्छन्", + "स्यान फ्रांसिस्कोले फुटपाथ वितरण रोबोटहरु प्रतिबंध गर्ने विचार गर्दै छ", + "लन्डन यूनाइटेड किंगडमको एक ठूलो शहर हो।", + "तिमी कहाँ छौ?", + "फ्रान्स को राष्ट्रपति को हो?", + "संयुक्त राज्यको राजधानी के हो?", + "बराक ओबामा कहिले कहिले जन्मेका हुन्?", +] diff --git a/spacy/lang/ne/lex_attrs.py b/spacy/lang/ne/lex_attrs.py new file mode 100644 index 000000000..652307577 --- /dev/null +++ b/spacy/lang/ne/lex_attrs.py @@ -0,0 +1,98 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..norm_exceptions import BASE_NORMS +from ...attrs import NORM, LIKE_NUM + + +# fmt: off +_stem_suffixes = [ + ["ा", "ि", "ी", "ु", "ू", "ृ", "े", "ै", "ो", "ौ"], + ["ँ", "ं", "्", "ः"], + ["लाई", "ले", "बाट", "को", "मा", "हरू"], + ["हरूलाई", "हरूले", "हरूबाट", "हरूको", "हरूमा"], + ["इलो", "िलो", "नु", "ाउनु", "ई", "इन", "इन्", "इनन्"], + ["एँ", "इँन्", "इस्", "इनस्", "यो", "एन", "यौं", "एनौं", "ए", "एनन्"], + ["छु", "छौँ", "छस्", "छौ", "छ", "छन्", "छेस्", "छे", "छ्यौ", "छिन्", "हुन्छ"], + ["दै", "दिन", "दिँन", "दैनस्", "दैन", "दैनौँ", "दैनौं", "दैनन्"], + ["हुन्न", "न्न", "न्न्स्", "न्नौं", "न्नौ", "न्न्न्", "िई"], + ["अ", "ओ", "ऊ", "अरी", "साथ", "वित्तिकै", "पूर्वक"], + ["याइ", "ाइ", "बार", "वार", "चाँहि"], + ["ने", "ेको", "ेकी", "ेका", "ेर", "दै", "तै", "िकन", "उ", "न", "नन्"] +] +# fmt: on + +# reference 1: https://en.wikipedia.org/wiki/Numbers_in_Nepali_language +# reference 2: https://www.imnepal.com/nepali-numbers/ +_num_words = [ + "शुन्य", + "एक", + "दुई", + "तीन", + "चार", + "पाँच", + "छ", + "सात", + "आठ", + "नौ", + "दश", + "एघार", + "बाह्र", + "तेह्र", + "चौध", + "पन्ध्र", + "सोह्र", + "सोह्र", + "सत्र", + "अठार", + "उन्नाइस", + "बीस", + "तीस", + "चालीस", + "पचास", + "साठी", + "सत्तरी", + "असी", + "नब्बे", + "सय", + "हजार", + "लाख", + 
"करोड", + "अर्ब", + "खर्ब", +] + + +def norm(string): + # normalise base exceptions, e.g. punctuation or currency symbols + if string in BASE_NORMS: + return BASE_NORMS[string] + # set stem word as norm, if available, adapted from: + # https://github.com/explosion/spaCy/blob/master/spacy/lang/hi/lex_attrs.py + # https://www.researchgate.net/publication/237261579_Structure_of_Nepali_Grammar + for suffix_group in reversed(_stem_suffixes): + for suffix in suffix_group: + # suffixes within a group differ in length, so measure the one that + # matched; a word that is nothing but the suffix is left untouched + length = len(suffix) + if len(string) > length and string.endswith(suffix): + return string[:-length] + return string + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text.lower() in _num_words: + return True + return False + + +LEX_ATTRS = {NORM: norm, LIKE_NUM: like_num} diff --git a/spacy/lang/ne/stop_words.py b/spacy/lang/ne/stop_words.py new file mode 100644 index 000000000..f008697d0 --- /dev/null +++ b/spacy/lang/ne/stop_words.py @@ -0,0 +1,498 @@ +# coding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/sanjaalcorps/NepaliStopWords/blob/master/NepaliStopWords.txt + +STOP_WORDS = set( + """ +अक्सर +अगाडि +अगाडी +अघि +अझै +अठार +अथवा +अनि +अनुसार +अन्तर्गत +अन्य +अन्यत्र +अन्यथा +अब +अरु +अरुलाई +अरू +अर्को +अर्थात +अर्थात् +अलग +अलि +अवस्था +अहिले +आए +आएका +आएको +आज +आजको +आठ +आत्म +आदि +आदिलाई +आफनो +आफू +आफूलाई +आफै +आफैँ +आफ्नै +आफ्नो +आयो +उ +उक्त +उदाहरण +उनको +उनलाई +उनले +उनि +उनी +उनीहरुको +उन्नाइस +उप +उसको +उसलाई +उसले +उहालाई +ऊ +एउटा +एउटै +एक +एकदम +एघार +ओठ +औ +औं +कता +कति +कतै +कम +कमसेकम +कसरि +कसरी +कसै +कसैको +कसैलाई +कसैले +कसैसँग +कस्तो +कहाँबाट +कहिलेकाहीं +का +काम +कारण +कि +किन +किनभने +कुन +कुनै +कुन्नी +कुरा +कृपया +के +केहि +केही +को +कोहि +कोहिपनि +कोही +कोहीपनि +क्रमशः +गए +गएको
+गएर +गयौ +गरि +गरी +गरे +गरेका +गरेको +गरेर +गरौं +गर्छ +गर्छन् +गर्छु +गर्दा +गर्दै +गर्न +गर्नु +गर्नुपर्छ +गर्ने +गैर +घर +चार +चाले +चाहनुहुन्छ +चाहन्छु +चाहिं +चाहिए +चाहिंले +चाहीं +चाहेको +चाहेर +चोटी +चौथो +चौध +छ +छन +छन् +छु +छू +छैन +छैनन् +छौ +छौं +जता +जताततै +जना +जनाको +जनालाई +जनाले +जब +जबकि +जबकी +जसको +जसबाट +जसमा +जसरी +जसलाई +जसले +जस्ता +जस्तै +जस्तो +जस्तोसुकै +जहाँ +जान +जाने +जाहिर +जुन +जुनै +जे +जो +जोपनि +जोपनी +झैं +ठाउँमा +ठीक +ठूलो +त +तता +तत्काल +तथा +तथापि +तथापी +तदनुसार +तपाइ +तपाई +तपाईको +तब +तर +तर्फ +तल +तसरी +तापनि +तापनी +तिन +तिनि +तिनिहरुलाई +तिनी +तिनीहरु +तिनीहरुको +तिनीहरू +तिनीहरूको +तिनै +तिमी +तिर +तिरको +ती +तीन +तुरन्त +तुरुन्त +तुरुन्तै +तेश्रो +तेस्कारण +तेस्रो +तेह्र +तैपनि +तैपनी +त्यत्तिकै +त्यत्तिकैमा +त्यस +त्यसकारण +त्यसको +त्यसले +त्यसैले +त्यसो +त्यस्तै +त्यस्तो +त्यहाँ +त्यहिँ +त्यही +त्यहीँ +त्यहीं +त्यो +त्सपछि +त्सैले +थप +थरि +थरी +थाहा +थिए +थिएँ +थिएन +थियो +दर्ता +दश +दिए +दिएको +दिन +दिनुभएको +दिनुहुन्छ +दुइ +दुइवटा +दुई +देखि +देखिन्छ +देखियो +देखे +देखेको +देखेर +दोश्री +दोश्रो +दोस्रो +द्वारा +धन्न +धेरै +धौ +न +नगर्नु +नगर्नू +नजिकै +नत्र +नत्रभने +नभई +नभएको +नभनेर +नयाँ +नि +निकै +निम्ति +निम्न +निम्नानुसार +निर्दिष्ट +नै +नौ +पक्का +पक्कै +पछाडि +पछाडी +पछि +पछिल्लो +पछी +पटक +पनि +पन्ध्र +पर्छ +पर्थ्यो +पर्दैन +पर्ने +पर्नेमा +पर्याप्त +पहिले +पहिलो +पहिल्यै +पाँच +पांच +पाचौँ +पाँचौं +पिच्छे +पूर्व +पो +प्रति +प्रतेक +प्रत्यक +प्राय +प्लस +फरक +फेरि +फेरी +बढी +बताए +बने +बरु +बाट +बारे +बाहिर +बाहेक +बाह्र +बिच +बिचमा +बिरुद्ध +बिशेष +बिस +बीच +बीचमा +बीस +भए +भएँ +भएका +भएकालाई +भएको +भएन +भएर +भन +भने +भनेको +भनेर +भन् +भन्छन् +भन्छु +भन्दा +भन्दै +भन्नुभयो +भन्ने +भन्या +भयेन +भयो +भर +भरि +भरी +भा +भित्र +भित्री +भीत्र +म +मध्य +मध्ये +मलाई +मा +मात्र +मात्रै +माथि +माथी +मुख्य +मुनि +मुन्तिर +मेरो +मैले +यति +यथोचित +यदि +यद्ध्यपि +यद्यपि +यस +यसका +यसको +यसपछि +यसबाहेक +यसमा +यसरी +यसले +यसो +यस्तै +यस्तो +यहाँ +यहाँसम्म +यही +या +यी +यो +र +रही +रहेका +रहेको +रहेछ +राखे +राख्छ 
+राम्रो +रुपमा +रूप +रे +लगभग +लगायत +लाई +लाख +लागि +लागेको +ले +वटा +वरीपरी +वा +वाट +वापत +वास्तवमा +शायद +सक्छ +सक्ने +सँग +संग +सँगको +सँगसँगै +सँगै +संगै +सङ्ग +सङ्गको +सट्टा +सत्र +सधै +सबै +सबैको +सबैलाई +समय +समेत +सम्भव +सम्म +सय +सरह +सहित +सहितै +सही +साँच्चै +सात +साथ +साथै +सायद +सारा +सुनेको +सुनेर +सुरु +सुरुको +सुरुमै +सो +सोचेको +सोचेर +सोही +सोह्र +स्थित +स्पष्ट +हजार +हरे +हरेक +हामी +हामीले +हाम्रा +हाम्रो +हुँदैन +हुन +हुनत +हुनु +हुने +हुनेछ +हुन् +हुन्छ +हुन्थ्यो +हैन +हो +होइन +होकि +होला +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 1f13da5d6..91b7e4d9d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -170,6 +170,11 @@ def nb_tokenizer(): return get_lang_class("nb").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ne_tokenizer(): + return get_lang_class("ne").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def nl_tokenizer(): return get_lang_class("nl").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/ne/__init__.py b/spacy/tests/lang/ne/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ne/test_text.py b/spacy/tests/lang/ne/test_text.py new file mode 100644 index 000000000..926a7de04 --- /dev/null +++ b/spacy/tests/lang/ne/test_text.py @@ -0,0 +1,19 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +def test_ne_tokenizer_handlers_long_text(ne_tokenizer): + text = """मैले पाएको सर्टिफिकेटलाई म त बोक्रो सम्झन्छु र अभ्यास तब सुरु भयो, जब मैले कलेज पार गरेँ र जीवनको पढाइ सुरु गरेँ ।""" + tokens = ne_tokenizer(text) + assert len(tokens) == 24 + + +@pytest.mark.parametrize( + "text,length", + [("समय जान कति पनि बेर लाग्दैन ।", 7), ("म ठूलो हुँदै थिएँ ।", 5)], +) +def test_ne_tokenizer_handles_cnts(ne_tokenizer, text, length): + tokens = ne_tokenizer(text) + assert len(tokens) == length \ No newline at end of file