From 00417794d3d4dfcbcf3a1d5624a6eff23a1799ad Mon Sep 17 00:00:00 2001 From: Tahar Zanouda Date: Tue, 15 May 2018 00:27:19 +0200 Subject: [PATCH] Add Arabic language (#2314) * added support for Arabic lang * added Arabic language support * updated conftest --- .github/contributors/tzano.md | 106 ++++++++++++ spacy/lang/ar/__init__.py | 31 ++++ spacy/lang/ar/examples.py | 20 +++ spacy/lang/ar/lex_attrs.py | 95 ++++++++++ spacy/lang/ar/punctuation.py | 15 ++ spacy/lang/ar/stop_words.py | 229 +++++++++++++++++++++++++ spacy/lang/ar/tokenizer_exceptions.py | 47 +++++ spacy/lang/char_classes.py | 7 +- spacy/tests/conftest.py | 17 +- spacy/tests/lang/ar/__init__.py | 0 spacy/tests/lang/ar/test_exceptions.py | 26 +++ spacy/tests/lang/ar/test_text.py | 13 ++ 12 files changed, 595 insertions(+), 11 deletions(-) create mode 100644 .github/contributors/tzano.md create mode 100644 spacy/lang/ar/__init__.py create mode 100644 spacy/lang/ar/examples.py create mode 100644 spacy/lang/ar/lex_attrs.py create mode 100644 spacy/lang/ar/punctuation.py create mode 100644 spacy/lang/ar/stop_words.py create mode 100644 spacy/lang/ar/tokenizer_exceptions.py create mode 100644 spacy/tests/lang/ar/__init__.py create mode 100644 spacy/tests/lang/ar/test_exceptions.py create mode 100644 spacy/tests/lang/ar/test_text.py diff --git a/.github/contributors/tzano.md b/.github/contributors/tzano.md new file mode 100644 index 000000000..2c20c59d4 --- /dev/null +++ b/.github/contributors/tzano.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same
+    rights to third parties through multiple levels of sublicensees or other
+    licensing arrangements;
+
+  * you agree that each of us can do all things in relation to your
+    contribution as if each of us were the sole owners, and if one of us makes
+    a derivative work of your contribution, the one who makes the derivative
+    work (or has it made) will be the sole owner of that derivative work;
+
+  * you agree that you will not assert any moral rights in your contribution
+    against us, our licensees or transferees;
+
+  * you agree that we may register a copyright in your contribution and
+    exercise all ownership rights associated with it; and
+
+  * you agree that neither of us has any duty to consult with, obtain the
+    consent of, pay or render an accounting to the other for any use or
+    distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+  * make, have made, use, sell, offer to sell, import, and otherwise transfer
+    your contribution in whole or in part, alone or in combination with or
+    included in any product, work or materials arising out of the project to
+    which your contribution was submitted, and
+
+  * at our option, to sublicense these same rights to third parties through
+    multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+  * Each contribution that you submit is and shall be an original work of
+    authorship and you can legally grant the rights set out in this SCA;
+
+  * to the best of your knowledge, each contribution will not violate any
+    third party's copyrights, trademarks, patents, or other intellectual
+    property rights; and
+
+  * each contribution shall be in compliance with U.S. export control laws and
+    other applicable export and import laws. You agree to notify us if you
+    become aware of any circumstance which would make any of the foregoing
+    representations inaccurate in any respect. We may publicly disclose your
+    participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” on one of the applicable statements below. Please do NOT
+mark both statements:
+
+  * [x] I am signing on behalf of myself as an individual and no other person
+    or entity, including my employer, has or will have rights with respect to my
+    contributions.
+
+  * [ ] I am signing on behalf of my employer or a legal entity and I have the
+    actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field                          | Entry                |
+|------------------------------- | -------------------- |
+| Name                           | Tahar Zanouda        |
+| Company name (if applicable)   |                      |
+| Title or role (if applicable)  |                      |
+| Date                           | 09-05-2018           |
+| GitHub username                | tzano                |
+| Website (optional)             |                      |
diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py
new file mode 100644
index 000000000..50bf5b157
--- /dev/null
+++ b/spacy/lang/ar/__init__.py
@@ -0,0 +1,31 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_SUFFIXES
+
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups
+
+
+class ArabicDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
+    lex_attr_getters[LANG] = lambda text: 'ar'
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = STOP_WORDS
+    suffixes = TOKENIZER_SUFFIXES
+
+
+class Arabic(Language):
+    lang = 'ar'
+    Defaults = ArabicDefaults
+
+
+__all__ = ['Arabic']
diff --git a/spacy/lang/ar/examples.py b/spacy/lang/ar/examples.py
new file mode 100644
index 000000000..b78322d1a
--- /dev/null
+++ b/spacy/lang/ar/examples.py
@@ -0,0 +1,20 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ar.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+sentences = [
+    "نال الكاتب خالد توفيق جائزة الرواية العربية في معرض الشارقة الدولي للكتاب",
+    "أين تقع دمشق ؟",
+    "كيف حالك ؟",
+    "هل يمكن ان نلتقي على الساعة الثانية عشرة ظهرا ؟",
+    "ماهي أبرز التطورات السياسية، الأمنية والاجتماعية في العالم ؟",
+    "هل بالإمكان أن نلتقي غدا؟",
+    "هناك نحو 382 مليون شخص مصاب بداء السكَّري في العالم",
+    "كشفت دراسة حديثة أن الخيل تقرأ تعبيرات الوجه وتستطيع أن تتذكر مشاعر الناس وعواطفهم"
+]
diff --git a/spacy/lang/ar/lex_attrs.py b/spacy/lang/ar/lex_attrs.py
new file mode 100644
index 000000000..d8c060077
--- /dev/null
+++ b/spacy/lang/ar/lex_attrs.py
@@ -0,0 +1,95 @@
+# coding: utf8
+from __future__ import unicode_literals
+from ...attrs import LIKE_NUM
+
+_num_words = set("""
+صفر
+واحد
+إثنان
+اثنان
+ثلاثة
+ثلاثه
+أربعة
+أربعه
+خمسة
+خمسه
+ستة
+سته
+سبعة
+سبعه
+ثمانية
+ثمانيه
+تسعة
+تسعه
+عشرة
+عشره
+عشرون
+عشرين
+ثلاثون
+ثلاثين
+اربعون
+اربعين
+أربعون
+أربعين
+خمسون
+خمسين
+ستون
+ستين
+سبعون
+سبعين
+ثمانون
+ثمانين
+تسعون
+تسعين
+مائتين
+مائتان
+ثلاثمائة
+خمسمائة
+سبعمائة
+الف
+آلاف
+ملايين
+مليون
+مليار
+مليارات
+""".split())
+
+_ordinal_words = set("""
+اول
+أول
+حاد
+واحد
+ثان
+ثاني
+ثالث
+رابع
+خامس
+سادس
+سابع
+ثامن
+تاسع
+عاشر
+""".split())
+
+
+def like_num(text):
+    """
+    Check if the text resembles a number.
+    """
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    if text in _ordinal_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
diff --git a/spacy/lang/ar/punctuation.py b/spacy/lang/ar/punctuation.py
new file mode 100644
index 000000000..9857d0d3e
--- /dev/null
+++ 
b/spacy/lang/ar/punctuation.py @@ -0,0 +1,15 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..punctuation import TOKENIZER_INFIXES +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY +from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER + +_suffixes = (LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES + + [r'(?<=[0-9])\+', + # Arabic is written from Right-To-Left + r'(?<=[0-9])(?:{})'.format(CURRENCY), + r'(?<=[0-9])(?:{})'.format(UNITS), + r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER)]) + +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/ar/stop_words.py b/spacy/lang/ar/stop_words.py new file mode 100644 index 000000000..e55ba3f52 --- /dev/null +++ b/spacy/lang/ar/stop_words.py @@ -0,0 +1,229 @@ +# coding: utf8 +from __future__ import unicode_literals + +STOP_WORDS = set(""" +من +نحو +لعل +بما +بين +وبين +ايضا +وبينما +تحت +مثلا +لدي +عنه +مع +هي +وهذا +واذا +هذان +انه +بينما +أمسى +وسوف +ولم +لذلك +إلى +منه +منها +كما +ظل +هنا +به +كذلك +اما +هما +بعد +بينهم +التي +أبو +اذا +بدلا +لها +أمام +يلي +حين +ضد +الذي +قد +صار +إذا +مابرح +قبل +كل +وليست +الذين +لهذا +وثي +انهم +باللتي +مافتئ +ولا +بهذه +بحيث +كيف +وله +علي +بات +لاسيما +حتى +وقد +و +أما +فيها +بهذا +لذا +حيث +لقد +إن +فإن +اول +ليت +فاللتي +ولقد +لسوف +هذه +ولماذا +معه +الحالي +بإن +حول +في +عليه +مايزال +ولعل +أنه +أضحى +اي +ستكون +لن +أن +ضمن +وعلى +امسى +الي +ذات +ولايزال +ذلك +فقد +هم +أي +عند +ابن +أو +فهو +فانه +سوف +ما +آل +كلا +عنها +وكذلك +ليست +لم +وأن +ماذا +لو +وهل +اللتي +ولذا +يمكن +فيه +الا +عليها +وبينهم +يوم +وبما +لما +فكان +اضحى +اصبح +لهم +بها +او +الذى +الى +إلي +قال +والتي +لازال +أصبح +ولهذا +مثل +وكانت +لكنه +بذلك +هذا +لماذا +قالت +فقط +لكن +مما +وكل +وان +وأبو +ومن +كان +مازال +هل +بينهن +هو +وما +على +وهو +لأن +واللتي +والذي +دون +عن +وايضا +هناك +بلا +جدا +ثم +منذ +اللذين +لايزال +بعض +مساء +تكون +فلا +بيننا +لا +ولكن +إذ +وأثناء +ليس +ومع +فيهم +ولسوف +بل +تلك +أحد +وهي +وكان +ومنها +وفي +ماانفك +اليوم +وماذا +هؤلاء +وليس +له +أثناء +بد +اليه +كأن +اليها +بتلك +يكون +ولما +هن +والى +كانت +وقبل +ان +لدى +""".split()) diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py new file mode 100644 index 000000000..e5b5a1767 --- /dev/null +++ b/spacy/lang/ar/tokenizer_exceptions.py @@ -0,0 +1,47 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA +import re + +_exc = {} + +# time +for exc_data in [ + {LEMMA: "قبل الميلاد", ORTH: "ق.م"}, + {LEMMA: "بعد الميلاد", ORTH: "ب. م"}, + {LEMMA: "ميلادي", ORTH: ".م"}, + {LEMMA: "هجري", ORTH: ".هـ"}, + {LEMMA: "توفي", ORTH: ".ت"}]: + _exc[exc_data[ORTH]] = [exc_data] + +# scientific abv. +for exc_data in [ + {LEMMA: "صلى الله عليه وسلم", ORTH: "صلعم"}, + {LEMMA: "الشارح", ORTH: "الشـ"}, + {LEMMA: "الظاهر", ORTH: "الظـ"}, + {LEMMA: "أيضًا", ORTH: "أيضـ"}, + {LEMMA: "إلى آخره", ORTH: "إلخ"}, + {LEMMA: "انتهى", ORTH: "اهـ"}, + {LEMMA: "حدّثنا", ORTH: "ثنا"}, + {LEMMA: "حدثني", ORTH: "ثنى"}, + {LEMMA: "أنبأنا", ORTH: "أنا"}, + {LEMMA: "أخبرنا", ORTH: "نا"}, + {LEMMA: "مصدر سابق", ORTH: "م. س"}, + {LEMMA: "مصدر نفسه", ORTH: "م. ن"}]: + _exc[exc_data[ORTH]] = [exc_data] + +# other abv. 
+for exc_data in [
+    {LEMMA: "دكتور", ORTH: "د."},
+    {LEMMA: "أستاذ دكتور", ORTH: "أ.د"},
+    {LEMMA: "أستاذ", ORTH: "أ."},
+    {LEMMA: "بروفيسور", ORTH: "ب."}]:
+    _exc[exc_data[ORTH]] = [exc_data]
+
+for exc_data in [
+    {LEMMA: "تلفون", ORTH: "ت."},
+    {LEMMA: "صندوق بريد", ORTH: "ص.ب"}]:
+    _exc[exc_data[ORTH]] = [exc_data]
+
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index e76ae89d3..c0ecf5b3f 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -3,13 +3,11 @@ from __future__ import unicode_literals
 
 import regex as re
-
 re.DEFAULT_VERSION = re.VERSION1
 
 merge_char_classes = lambda classes: '[{}]'.format('||'.join(classes))
 split_chars = lambda char: list(char.strip().split(' '))
 merge_chars = lambda char: char.strip().replace(' ', '|')
-
 _bengali = r'[\p{L}&&\p{Bengali}]'
 _hebrew = r'[\p{L}&&\p{Hebrew}]'
 _latin_lower = r'[\p{Ll}&&\p{Latin}]'
@@ -27,11 +25,11 @@ ALPHA = merge_char_classes(_upper + _lower + _uncased)
 ALPHA_LOWER = merge_char_classes(_lower + _uncased)
 ALPHA_UPPER = merge_char_classes(_upper + _uncased)
-
 _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
           'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
           'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
-          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
+          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб '
+          'كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب')
 _currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼'
 
 # These expressions contain various unicode variations, including characters
@@ -45,7 +43,6 @@ _hyphens = '- – — -- --- —— ~'
 
 # Details: https://www.compart.com/en/unicode/category/So
 _other_symbols = r'[\p{So}]'
-
 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 QUOTES = merge_chars(_quotes)
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 74c464a9b..53f0b506c 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -15,7 +15,8 @@ from ..
import util
 
 # here if it's using spaCy's tokenizer (not a different library)
 # TODO: re-implement generic tokenizer tests
 _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
-              'it', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'tr', 'xx']
+              'it', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'tr', 'ar', 'xx']
+
 _models = {'en': ['en_core_web_sm'],
            'de': ['de_core_news_md'],
            'fr': ['fr_core_news_sm'],
@@ -50,8 +51,8 @@ def RU(request):
 
 #@pytest.fixture(params=_languages)
 #def tokenizer(request):
-    #lang = util.get_lang_class(request.param)
-    #return lang.Defaults.create_tokenizer()
+#lang = util.get_lang_class(request.param)
+#return lang.Defaults.create_tokenizer()
 
 
 @pytest.fixture
@@ -152,6 +153,9 @@ def th_tokenizer():
 def tr_tokenizer():
     return util.get_lang_class('tr').Defaults.create_tokenizer()
 
+@pytest.fixture
+def ar_tokenizer():
+    return util.get_lang_class('ar').Defaults.create_tokenizer()
 
 @pytest.fixture
 def ru_tokenizer():
@@ -166,7 +170,7 @@ def stringstore():
 
 @pytest.fixture
 def en_entityrecognizer():
-    return util.get_lang_class('en').Defaults.create_entity()
+    return util.get_lang_class('en').Defaults.create_entity()
 
 
 @pytest.fixture
@@ -181,11 +185,11 @@ def text_file_b():
 
 def pytest_addoption(parser):
     parser.addoption("--models", action="store_true",
-        help="include tests that require full models")
+                     help="include tests that require full models")
     parser.addoption("--vectors", action="store_true",
-        help="include word vectors tests")
+                     help="include word vectors tests")
     parser.addoption("--slow", action="store_true",
-        help="include slow tests")
+                     help="include slow tests")
     for lang in _languages + ['all']:
         parser.addoption("--%s" % lang, action="store_true",
                          help="Use %s models" % lang)
diff --git a/spacy/tests/lang/ar/__init__.py b/spacy/tests/lang/ar/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/ar/test_exceptions.py b/spacy/tests/lang/ar/test_exceptions.py
new file mode 100644
index 000000000..e8da7f621
--- /dev/null
+++ b/spacy/tests/lang/ar/test_exceptions.py
@@ -0,0 +1,26 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text',
+                         ["ق.م", "إلخ", "ص.ب", "ت."])
+def test_ar_tokenizer_handles_abbr(ar_tokenizer, text):
+    tokens = ar_tokenizer(text)
+    assert len(tokens) == 1
+
+
+def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer):
+    text = u"تعود الكتابة الهيروغليفية إلى سنة 3200 ق.م"
+    tokens = ar_tokenizer(text)
+    assert len(tokens) == 7
+    assert tokens[6].text == "ق.م"
+    assert tokens[6].lemma_ == "قبل الميلاد"
+
+
+def test_ar_tokenizer_handles_unit_suffix_in_text(ar_tokenizer):
+    text = u"يبلغ طول مضيق طارق 14كم "
+    tokens = ar_tokenizer(text)
+    print([(tokens[i].text, tokens[i].suffix_) for i in range(len(tokens))])
+    assert len(tokens) == 6
diff --git a/spacy/tests/lang/ar/test_text.py b/spacy/tests/lang/ar/test_text.py
new file mode 100644
index 000000000..7c5e9f9c7
--- /dev/null
+++ b/spacy/tests/lang/ar/test_text.py
@@ -0,0 +1,13 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+def test_tokenizer_handles_long_text(ar_tokenizer):
+    text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين.
+    ولد نجيب محفوظ في مدينة القاهرة، حيث ترعرع و تلقى تعليمه الجامعي في جامعتها،
+    فتمكن من نيل شهادة في الفلسفة. ألف محفوظ على مدار حياته الكثير من الأعمال الأدبية، و في مقدمتها ثلاثيته الشهيرة.
+    و قد نجح في الحصول على جائزة نوبل للآداب، ليكون بذلك العربي الوحيد الذي فاز بها."""
+
+    tokens = ar_tokenizer(text)
+    assert tokens[3].is_stop
+    assert len(tokens) == 77
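
Reviewer note: below is a minimal smoke test for the new language class once the patch
is applied. It is a sketch, not part of the commit, and it assumes the spaCy 2.x API
this branch targets. No statistical model is added here, so it only exercises the
tokenizer, the stop-word list, and the LIKE_NUM lexical attribute; the expected
results mirror the assertions in the tests above.

# coding: utf8
from __future__ import unicode_literals

from spacy.lang.ar import Arabic

# A bare Language subclass: tokenizer and lexical attributes only, no pipeline.
nlp = Arabic()

# Abbreviations from tokenizer_exceptions.py survive as single tokens.
doc = nlp(u"تعود الكتابة الهيروغليفية إلى سنة 3200 ق.م")
assert len(doc) == 7 and doc[6].text == u"ق.م"

# The suffix rules split a unit off a trailing number, because char_classes.py
# now lists Arabic units (e.g. كم) in UNITS.
doc = nlp(u"يبلغ طول مضيق طارق 14كم")
assert len(doc) == 6

# Stop words come from stop_words.py, LIKE_NUM from lex_attrs.py.
assert nlp.vocab[u"من"].is_stop
assert nlp.vocab[u"خمسة"].like_num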