Added files to Setswana Language

Add South African Setswana Language
2021-02-10 20:15:13 +02:00 · 2021-02-10 20:15:13 +02:00 · f6be28cfb2
parent 24046fef17
commit f6be28cfb2
5 changed files with 194 additions and 0 deletions
--- a/spacy/lang/tn/examples.py
+++ b/spacy/lang/tn/examples.py
@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.tn.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+# Sample sentences for this language module (spacy/lang/tn); intended to be
+# fed to a pipeline, e.g. ``nlp.pipe(sentences)``.
+sentences = [
+    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
+    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
+    "O ko kae?",
+    "ke mang presidente ya Afrika Borwa?",
+    "ke eng toropo kgolo ya Afrika Borwa?",
+    "Nelson Mandela o belegwe leng?",
+]
--- a/spacy/lang/tn/lex_attrs.py
+++ b/spacy/lang/tn/lex_attrs.py
@ -0,0 +1,110 @@
+# coding: utf8
+
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+# Cardinal number words. like_num() lower-cases its input and returns True
+# when the result is an exact member of this list.
+_num_words = [
+    "lefela",
+    "nngwe",
+    "pedi",
+    "tharo",
+    "nne",
+    "tlhano",
+    "thataro",
+    "supa",
+    "robedi",
+    "robongwe",
+    "lesome",
+    "lesomenngwe",
+    "lesomepedi",
+    "sometharo",
+    "somenne",
+    "sometlhano",
+    "somethataro",
+    "somesupa",
+    "somerobedi",
+    "somerobongwe",
+    "someamabedi",
+    "someamararo",
+    "someamane",
+    "someamatlhano",
+    "someamarataro",
+    "someamasupa",
+    "someamarobedi",
+    "someamarobongwe",
+    "lekgolo",
+    "sekete",
+    "milione",
+    "bilione",
+    "terilione",
+    "kwatirilione",
+    "gajillione",
+    "bazillione",
+]
+
+
+# Ordinal number words. like_num() lower-cases its input and returns True
+# when the result is an exact member of this list, so entries must not carry
+# surrounding whitespace.
+# NOTE(review): the entries from "someamararo" onwards repeat the cardinal
+# forms in _num_words -- confirm whether distinct ordinal forms were intended.
+_ordinal_words = [
+    "ntlha",
+    "bobedi",
+    "boraro",
+    "bone",
+    "botlhano",
+    "borataro",
+    "bosupa",
+    "borobedi",
+    "borobongwe",
+    "bolesome",
+    "bolesomengwe",
+    "bolesomepedi",
+    "bolesometharo",
+    "bolesomenne",
+    "bolesometlhano",
+    "bolesomethataro",
+    "bolesomesupa",
+    "bolesomerobedi",
+    "bolesomerobongwe",
+    "somamabedi",
+    "someamararo",
+    "someamane",
+    "someamatlhano",
+    "someamarataro",
+    "someamasupa",
+    "someamarobedi",
+    "someamarobongwe",
+    "lekgolo",
+    "sekete",
+    "milione",
+    "bilione",
+    "terilione",
+    "kwatirilione",
+    "gajillione",
+    "bazillione",
+]
+
+def like_num(text):
+    """Return True if ``text`` looks like a number, otherwise False.
+
+    One leading sign character ("+", "-", "±" or "~") is dropped, then all
+    commas and periods are removed. The remainder counts as a number when it
+    is all digits, is a ``digits/digits`` fraction, matches (case-insensitively)
+    an entry of ``_num_words`` or ``_ordinal_words``, or is digits followed
+    by "th".
+    """
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    cleaned = text.replace(",", "").replace(".", "")
+
+    # Plain digit strings, e.g. "10" or "1,000".
+    if cleaned.isdigit():
+        return True
+
+    # Simple fractions with exactly one slash, e.g. "1/2".
+    if cleaned.count("/") == 1:
+        numerator, denominator = cleaned.split("/")
+        if numerator.isdigit() and denominator.isdigit():
+            return True
+
+    # Spelled-out cardinal and ordinal words.
+    lowered = cleaned.lower()
+    if lowered in _num_words or lowered in _ordinal_words:
+        return True
+
+    # Digits with a "th" suffix, e.g. "10th".
+    return lowered.endswith("th") and lowered[:-2].isdigit()
+
+
+# Maps the LIKE_NUM attribute to the function that computes it, like_num.
+LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/tn/punctuation.py
+++ b/spacy/lang/tn/punctuation.py
@ -0,0 +1,19 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+
+# Infix patterns: the shared ellipsis and icon lists, followed by regexes that
+# match a separator character only when it sits between specific neighbours
+# (enforced with lookbehind / lookahead assertions).
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        # One of + - * ^ preceded by a digit and followed by a digit or "-".
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        # A period preceded by an ALPHA_LOWER or CONCAT_QUOTES character and
+        # followed by an ALPHA_UPPER or CONCAT_QUOTES character.
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        # A comma between two ALPHA characters.
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        # Any HYPHENS alternative preceded by an ALPHA character or digit and
+        # followed by an ALPHA character.
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        # One of : < > = / preceded by an ALPHA character or digit and
+        # followed by an ALPHA character.
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+
+# Public name under which the infix patterns are exposed by this module.
+TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/tn/stop_words.py
+++ b/spacy/lang/tn/stop_words.py
@ -0,0 +1,24 @@
+# coding: utf8
+
+from __future__ import unicode_literals
+
+
+# Stop words, built by splitting the whitespace-separated block below into a
+# set (repeated words collapse). Importing this module must stay free of side
+# effects, so nothing is printed here.
+STOP_WORDS = set("""
+ke gareng ga selekanyo tlhwatlhwa yo mongwe se 
+sengwe fa go le jalo gongwe ba na mo tikologong
+jaaka kwa morago nna gonne ka sa pele nako teng 
+tlase fela ntle magareng tsona feta bobedi kgabaganya
+moo gape kgatlhanong botlhe tsotlhe bokana e esi
+setseng mororo dinako golo kgolo nnye wena gago 
+o ntse ntle tla goreng gangwe mang yotlhe gore 
+eo yona tseraganyo eng ne sentle re rona thata 
+godimo fitlha pedi masomamabedi lesomepedi mmogo 
+tharo tseo boraro tseno yone jaanong bobona bona 
+lesome tsaya tsamaiso nngwe masomethataro thataro 
+tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
+bonala e tshwanang bogolo tsenya tsweetswee karolo 
+sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa 
+tlhano lesometlhano botlalo lekgolo           
+""".split())
--- a/spacy/lang/tn/tag_map.py
+++ b/spacy/lang/tn/tag_map.py
@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
+
+
+# Maps each tag string to a dict holding its POS symbol. Several tags share a
+# target: the three PRO* tags map to PRON, and AUX, V and VCOP all map to VERB.
+TAG_MAP = {    
+    "INT": {POS: INTJ}, 
+    "JUNC": {POS: CCONJ},   
+    "$": {POS: PUNCT},   
+    "PROPOSS": {POS: PRON},
+    "PROQUANT": {POS: PRON},
+    "PROEMP": {POS: PRON},
+    "NUM": {POS: NUM},
+    "N": {POS: NOUN},
+    "AUX": {POS: VERB},
+    "ADV": {POS: ADV},
+    "ADJ": {POS: ADJ},
+    "V": {POS: VERB},
+    "VCOP": {POS: VERB},
+}