From 24046fef17f211ec7e131c87f7371001f15fa625 Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Wed, 10 Feb 2021 20:12:33 +0200 Subject: [PATCH 1/9] South African Setswana language Please accept the addition of the Setswana language --- spacy/lang/tn/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 spacy/lang/tn/__init__.py diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py new file mode 100644 index 000000000..911214331 --- /dev/null +++ b/spacy/lang/tn/__init__.py @@ -0,0 +1,18 @@ +from .stop_words import STOP_WORDS +from .lex_attrs import LEX_ATTRS +from .punctuation import TOKENIZER_INFIXES +from ...language import Language + + +class SetswanaDefaults(Language.Defaults): + suffixes = TOKENIZER_SUFFIXES + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS + + +class Setswana(Language): + lang = "tn" + Defaults = SetswanaDefaults + + +__all__ = ["Setswana"] From f6be28cfb231111a970d60b19efda2996c917373 Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Wed, 10 Feb 2021 20:15:13 +0200 Subject: [PATCH 2/9] Added files to Setswana Language Add South African Setswana Language --- spacy/lang/tn/examples.py | 19 ++++++ spacy/lang/tn/lex_attrs.py | 110 +++++++++++++++++++++++++++++++++++ spacy/lang/tn/punctuation.py | 19 ++++++ spacy/lang/tn/stop_words.py | 24 ++++++++ spacy/lang/tn/tag_map.py | 22 +++++++ 5 files changed, 194 insertions(+) create mode 100644 spacy/lang/tn/examples.py create mode 100644 spacy/lang/tn/lex_attrs.py create mode 100644 spacy/lang/tn/punctuation.py create mode 100644 spacy/lang/tn/stop_words.py create mode 100644 spacy/lang/tn/tag_map.py diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py new file mode 100644 index 000000000..9039a1624 --- /dev/null +++ b/spacy/lang/tn/examples.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy 
and its language models. +>>> from spacy.lang.en.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion", + "Johannesburg ke toropo e kgolo mo Afrika Borwa.", + "O ko kae?", + "ke mang presidente ya Afrika Borwa?", + "ke eng toropo kgolo ya Afrika Borwa?", + "Nelson Mandela o belegwe leng?", +] \ No newline at end of file diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py new file mode 100644 index 000000000..daef45d72 --- /dev/null +++ b/spacy/lang/tn/lex_attrs.py @@ -0,0 +1,110 @@ +coding: utf8 + +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +_num_words = [ + "lefela", + "nngwe", + "pedi", + "tharo", + "nne", + "tlhano", + "thataro", + "supa", + "robedi", + "robongwe", + "lesome", + "lesomenngwe", + "lesomepedi", + "sometharo", + "somenne", + "sometlhano", + "somethataro", + "somesupa", + "somerobedi", + "somerobongwe", + "someamabedi", + "someamararo", + "someamane", + "someamatlhano", + "someamarataro", + "someamasupa", + "someamarobedi", + "someamarobongwe", + "lekgolo", + "sekete", + "milione", + "bilione", + "terilione", + "kwatirilione", + "gajillione", + "bazillione", +] + + +_ordinal_words = [ + "ntlha", + "bobedi", + "boraro", + "bone", + "botlhano", + "borataro", + "bosupa", + "borobedi ", + "borobongwe", + "bolesome", + "bolesomengwe", + "bolesomepedi", + "bolesometharo", + "bolesomenne", + "bolesometlhano", + "bolesomethataro", + "bolesomesupa", + "bolesomerobedi", + "bolesomerobongwe", + "somamabedi", + "someamararo", + "someamane", + "someamatlhano", + "someamarataro", + "someamasupa", + "someamarobedi", + "someamarobongwe", + "lekgolo", + "sekete", + "milione", + "bilione", + "terilione", + "kwatirilione", + "gajillione", + "bazillione", +] + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if 
text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + + text_lower = text.lower() + if text_lower in _num_words: + return True + + # Check ordinal number + if text_lower in _ordinal_words: + return True + if text_lower.endswith("th"): + if text_lower[:-2].isdigit(): + return True + + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py new file mode 100644 index 000000000..241ad39af --- /dev/null +++ b/spacy/lang/tn/punctuation.py @@ -0,0 +1,19 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + + +TOKENIZER_INFIXES = _infixes \ No newline at end of file diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py new file mode 100644 index 000000000..65681f6ee --- /dev/null +++ b/spacy/lang/tn/stop_words.py @@ -0,0 +1,24 @@ +coding: utf8 + +from __future__ import unicode_literals + + +# Stop words +STOP_WORDS = set(""" +ke gareng ga selekanyo tlhwatlhwa yo mongwe se +sengwe fa go le jalo gongwe ba na mo tikologong +jaaka kwa morago nna gonne ka sa pele nako teng +tlase fela ntle magareng tsona feta bobedi kgabaganya +moo gape kgatlhanong botlhe tsotlhe bokana e esi +setseng mororo dinako golo kgolo nnye wena gago +o ntse ntle tla goreng gangwe mang yotlhe gore +eo yona tseraganyo eng ne sentle re rona thata +godimo fitlha pedi masomamabedi lesomepedi mmogo +tharo tseo boraro tseno yone jaanong bobona bona +lesome tsaya tsamaiso nngwe masomethataro thataro +tsa mmatota tota sale thoko supa dira 
tshwanetse di mmalwa masisi +bonala e tshwanang bogolo tsenya tsweetswee karolo +sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa +tlhano lesometlhano botlalo lekgolo +""".split()) +print(STOP_WORDS) \ No newline at end of file diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py new file mode 100644 index 000000000..1c7f0647f --- /dev/null +++ b/spacy/lang/tn/tag_map.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON + + +TAG_MAP = { + "INT": {POS: INTJ}, + "JUNC": {POS: CCONJ}, + "$": {POS: PUNCT}, + "PROPOSS": {POS: PRON}, + "PROQUANT": {POS: PRON}, + "PROEMP": {POS: PRON}, + "NUM": {POS: NUM}, + "N": {POS: NOUN}, + "AUX": {POS: VERB}, + "ADV": {POS: ADV}, + "ADJ": {POS: ADJ}, + "V": {POS: VERB}, + "VCOP": {POS: VERB}, +} From 7c8721b1bd3b12719a2db395e237d8b496a3414c Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Wed, 10 Feb 2021 20:21:22 +0200 Subject: [PATCH 3/9] Update tag_map.py Updated tag_map --- spacy/lang/tn/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py index 1c7f0647f..e26f4c4e1 100644 --- a/spacy/lang/tn/tag_map.py +++ b/spacy/lang/tn/tag_map.py @@ -1,4 +1,4 @@ -# coding: utf8 +coding: utf8 from __future__ import unicode_literals from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB From ed3397727e3cf3cc7b8ff9a89224fe894424392d Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Wed, 10 Feb 2021 20:41:18 +0200 Subject: [PATCH 4/9] Delete tag_map.py Tag map file is deleted. 
I will add it later because it was failing validations --- spacy/lang/tn/tag_map.py | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 spacy/lang/tn/tag_map.py diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py deleted file mode 100644 index e26f4c4e1..000000000 --- a/spacy/lang/tn/tag_map.py +++ /dev/null @@ -1,22 +0,0 @@ -coding: utf8 -from __future__ import unicode_literals - -from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB -from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON - - -TAG_MAP = { - "INT": {POS: INTJ}, - "JUNC": {POS: CCONJ}, - "$": {POS: PUNCT}, - "PROPOSS": {POS: PRON}, - "PROQUANT": {POS: PRON}, - "PROEMP": {POS: PRON}, - "NUM": {POS: NUM}, - "N": {POS: NOUN}, - "AUX": {POS: VERB}, - "ADV": {POS: ADV}, - "ADJ": {POS: ADJ}, - "V": {POS: VERB}, - "VCOP": {POS: VERB}, -} From 39eeba6760c6011e3372ea2a359bfc7b056bfa1e Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Thu, 11 Feb 2021 21:20:46 +0200 Subject: [PATCH 5/9] Update __init__.py Added infixes = TOKENIZER_INFIXES --- spacy/lang/tn/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py index 911214331..648772528 100644 --- a/spacy/lang/tn/__init__.py +++ b/spacy/lang/tn/__init__.py @@ -6,6 +6,7 @@ from ...language import Language class SetswanaDefaults(Language.Defaults): suffixes = TOKENIZER_SUFFIXES + infixes = TOKENIZER_INFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS From 37ec67f868ec803423cd76af28f8116c326ebedd Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Thu, 11 Feb 2021 21:25:58 +0200 Subject: [PATCH 6/9] Update examples.py I have removed two lines: # coding: utf8 from __future__ import unicode_literals And updated: >>> from spacy.lang.tn.examples import sentences --- spacy/lang/tn/examples.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) 
diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py index 9039a1624..7b33fae5a 100644 --- a/spacy/lang/tn/examples.py +++ b/spacy/lang/tn/examples.py @@ -1,10 +1,6 @@ -# coding: utf8 -from __future__ import unicode_literals - - """ Example sentences to test spaCy and its language models. ->>> from spacy.lang.en.examples import sentences +>>> from spacy.lang.tn.examples import sentences >>> docs = nlp.pipe(sentences) """ @@ -16,4 +12,4 @@ sentences = [ "ke mang presidente ya Afrika Borwa?", "ke eng toropo kgolo ya Afrika Borwa?", "Nelson Mandela o belegwe leng?", -] \ No newline at end of file +] From 0d57e84b7baa35aaadeba7346c63a98c07511869 Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Thu, 11 Feb 2021 21:28:23 +0200 Subject: [PATCH 7/9] Update lex_attrs.py I have removed line 1 to 4 --- spacy/lang/tn/lex_attrs.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py index daef45d72..33a16a09a 100644 --- a/spacy/lang/tn/lex_attrs.py +++ b/spacy/lang/tn/lex_attrs.py @@ -1,7 +1,3 @@ -coding: utf8 - -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ From 4e514f1ea8afcf341cc1d9b923eb7667b4b287c9 Mon Sep 17 00:00:00 2001 From: Shumi <76557637+Shumie82@users.noreply.github.com> Date: Thu, 11 Feb 2021 21:30:34 +0200 Subject: [PATCH 8/9] Update stop_words.py I have deleted line 1 to 5 and the statement print(STOP_WORDS) --- spacy/lang/tn/stop_words.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py index 65681f6ee..a627ef362 100644 --- a/spacy/lang/tn/stop_words.py +++ b/spacy/lang/tn/stop_words.py @@ -1,8 +1,3 @@ -coding: utf8 - -from __future__ import unicode_literals - - # Stop words STOP_WORDS = set(""" ke gareng ga selekanyo tlhwatlhwa yo mongwe se @@ -21,4 +16,3 @@ bonala e tshwanang bogolo tsenya tsweetswee karolo sepe tlhalosa dirwa robedi robongwe 
lesomenngwe gaisa tlhano lesometlhano botlalo lekgolo """.split()) -print(STOP_WORDS) \ No newline at end of file From 6c450decfc01e2d82f0b7c8f799654d79158fa4c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 13 Feb 2021 11:51:21 +1100 Subject: [PATCH 9/9] Fix punctuation settings and add to initialize tests --- spacy/lang/tn/__init__.py | 3 +-- spacy/tests/lang/test_initialize.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py index 648772528..99907c28a 100644 --- a/spacy/lang/tn/__init__.py +++ b/spacy/lang/tn/__init__.py @@ -5,8 +5,7 @@ from ...language import Language class SetswanaDefaults(Language.Defaults): - suffixes = TOKENIZER_SUFFIXES - infixes = TOKENIZER_INFIXES + infixes = TOKENIZER_INFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index de1871e64..46f1f2bd1 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -8,7 +8,8 @@ from spacy.util import get_lang_class LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is", "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk", - "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo'] + "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur", + "yo"] # fmt: on