Added files to Setswana Language

Add South African Setswana Language
2021-02-10 20:15:13 +02:00 · 2021-02-10 20:15:13 +02:00 · f6be28cfb2
parent 24046fef17
commit f6be28cfb2
5 changed files with 194 additions and 0 deletions
--- a/spacy/lang/tn/examples.py
+++ b/spacy/lang/tn/examples.py
@ -0,0 +1,19 @@
 # coding: utf8
 from __future__ import unicode_literals
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.tn.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 # Example Setswana sentences for trying out spaCy and its language models.
 sentences = [
    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
    "O ko kae?",
    "ke mang presidente ya Afrika Borwa?",
    "ke eng toropo kgolo ya Afrika Borwa?",
    "Nelson Mandela o belegwe leng?",
 ]
--- a/spacy/lang/tn/lex_attrs.py
+++ b/spacy/lang/tn/lex_attrs.py
@ -0,0 +1,110 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ...attrs import LIKE_NUM
 _num_words = [
    "lefela",
    "nngwe",
    "pedi",
    "tharo",
    "nne",
    "tlhano",
    "thataro",
    "supa",
    "robedi",
    "robongwe",
    "lesome",
    "lesomenngwe",
    "lesomepedi",
    "sometharo",
    "somenne",
    "sometlhano",
    "somethataro",
    "somesupa",
    "somerobedi",
    "somerobongwe",
    "someamabedi",
    "someamararo",
    "someamane",
    "someamatlhano",
    "someamarataro",
    "someamasupa",
    "someamarobedi",
    "someamarobongwe",
    "lekgolo",
    "sekete",
    "milione",
    "bilione",
    "terilione",
    "kwatirilione",
    "gajillione",
    "bazillione",
 ]
 _ordinal_words = [
    "ntlha",
    "bobedi",
    "boraro",
    "bone",
    "botlhano",
    "borataro",
    "bosupa",
    "borobedi ",
    "borobongwe",
    "bolesome",
    "bolesomengwe",
    "bolesomepedi",
    "bolesometharo",
    "bolesomenne",
    "bolesometlhano",
    "bolesomethataro",
    "bolesomesupa",
    "bolesomerobedi",
    "bolesomerobongwe",
    "somamabedi",
    "someamararo",
    "someamane",
    "someamatlhano",
    "someamarataro",
    "someamasupa",
    "someamarobedi",
    "someamarobongwe",
    "lekgolo",
    "sekete",
    "milione",
    "bilione",
    "terilione",
    "kwatirilione",
    "gajillione",
    "bazillione",
 ]
 def like_num(text):
    """Return True when ``text`` looks like a number, otherwise False.

    A single leading "+", "-", "±" or "~" is dropped and every "," and "."
    is removed before checking. The remainder counts as a number if it is
    all digits, a digit/digit fraction with exactly one "/", a lower-cased
    match in ``_num_words`` or ``_ordinal_words``, or digits followed by
    "th".
    """
    unsigned = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    bare = unsigned.replace(",", "").replace(".", "")
    if bare.isdigit():
        return True
    # Simple fraction: exactly one slash with digits on both sides.
    if bare.count("/") == 1:
        numerator, _, denominator = bare.partition("/")
        if numerator.isdigit() and denominator.isdigit():
            return True
    lowered = bare.lower()
    # Spelled-out number or ordinal word.
    if lowered in _num_words or lowered in _ordinal_words:
        return True
    # Digits with a "th" suffix.
    return lowered.endswith("th") and lowered[:-2].isdigit()
 # Lexical attribute table exported by this module: maps the LIKE_NUM
 # attribute to the like_num() predicate defined above.
 LEX_ATTRS = {LIKE_NUM: like_num}
--- a/spacy/lang/tn/punctuation.py
+++ b/spacy/lang/tn/punctuation.py
@ -0,0 +1,19 @@
 from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
 # Infix patterns: the shared LIST_ELLIPSES and LIST_ICONS entries followed
 # by the regular expressions below. Each regex uses a lookbehind and a
 # lookahead, so only the character(s) in the middle are matched.
 _infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # One of + - * ^ preceded by a digit and followed by a digit or "-".
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        # A "." preceded by an ALPHA_LOWER or CONCAT_QUOTES character and
        # followed by an ALPHA_UPPER or CONCAT_QUOTES character.
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        # A "," with an ALPHA character on both sides.
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # Any HYPHENS alternative preceded by an ALPHA character or digit
        # and followed by an ALPHA character.
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        # One of : < > = / preceded by an ALPHA character or digit and
        # followed by an ALPHA character.
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
 )
 # Public name for the infix list.
 # NOTE(review): presumably consumed by the language's tokenizer defaults,
 # which are not in this file -- confirm.
 TOKENIZER_INFIXES = _infixes
--- a/spacy/lang/tn/stop_words.py
+++ b/spacy/lang/tn/stop_words.py
@ -0,0 +1,24 @@
 # coding: utf8
 from __future__ import unicode_literals
 # Stop words
 # Built once at import time from the whitespace-separated word list below;
 # words that appear more than once collapse into a single set member.
 # The module must not print on import, so no debug output follows the set.
 STOP_WORDS = set("""
 ke gareng ga selekanyo tlhwatlhwa yo mongwe se
 sengwe fa go le jalo gongwe ba na mo tikologong
 jaaka kwa morago nna gonne ka sa pele nako teng
 tlase fela ntle magareng tsona feta bobedi kgabaganya
 moo gape kgatlhanong botlhe tsotlhe bokana e esi
 setseng mororo dinako golo kgolo nnye wena gago
 o ntse ntle tla goreng gangwe mang yotlhe gore
 eo yona tseraganyo eng ne sentle re rona thata
 godimo fitlha pedi masomamabedi lesomepedi mmogo
 tharo tseo boraro tseno yone jaanong bobona bona
 lesome tsaya tsamaiso nngwe masomethataro thataro
 tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
 bonala e tshwanang bogolo tsenya tsweetswee karolo
 sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
 tlhano lesometlhano botlalo lekgolo
 """.split())
--- a/spacy/lang/tn/tag_map.py
+++ b/spacy/lang/tn/tag_map.py
@ -0,0 +1,22 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
 from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
 # Maps each tag string to a dict holding its POS symbol under the POS key.
 TAG_MAP = {
    tag: {POS: pos}
    for tag, pos in (
        ("INT", INTJ),
        ("JUNC", CCONJ),
        ("$", PUNCT),
        ("PROPOSS", PRON),
        ("PROQUANT", PRON),
        ("PROEMP", PRON),
        ("NUM", NUM),
        ("N", NOUN),
        ("AUX", VERB),
        ("ADV", ADV),
        ("ADJ", ADJ),
        ("V", VERB),
        ("VCOP", VERB),
    )
 }