Added files to Setswana Language

Add South African Setswana Language
This commit is contained in:
Shumi 2021-02-10 20:15:13 +02:00 committed by GitHub
parent 24046fef17
commit f6be28cfb2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 194 additions and 0 deletions

19
spacy/lang/tn/examples.py Normal file
View File

@ -0,0 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.en.examples import sentences
>>> docs = nlp.pipe(sentences)
"""
sentences = [
"Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
"Johannesburg ke toropo e kgolo mo Afrika Borwa.",
"O ko kae?",
"ke mang presidente ya Afrika Borwa?",
"ke eng toropo kgolo ya Afrika Borwa?",
"Nelson Mandela o belegwe leng?",
]

110
spacy/lang/tn/lex_attrs.py Normal file
View File

@ -0,0 +1,110 @@
coding: utf8
from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = [
"lefela",
"nngwe",
"pedi",
"tharo",
"nne",
"tlhano",
"thataro",
"supa",
"robedi",
"robongwe",
"lesome",
"lesomenngwe",
"lesomepedi",
"sometharo",
"somenne",
"sometlhano",
"somethataro",
"somesupa",
"somerobedi",
"somerobongwe",
"someamabedi",
"someamararo",
"someamane",
"someamatlhano",
"someamarataro",
"someamasupa",
"someamarobedi",
"someamarobongwe",
"lekgolo",
"sekete",
"milione",
"bilione",
"terilione",
"kwatirilione",
"gajillione",
"bazillione",
]
_ordinal_words = [
"ntlha",
"bobedi",
"boraro",
"bone",
"botlhano",
"borataro",
"bosupa",
"borobedi ",
"borobongwe",
"bolesome",
"bolesomengwe",
"bolesomepedi",
"bolesometharo",
"bolesomenne",
"bolesometlhano",
"bolesomethataro",
"bolesomesupa",
"bolesomerobedi",
"bolesomerobongwe",
"somamabedi",
"someamararo",
"someamane",
"someamatlhano",
"someamarataro",
"someamasupa",
"someamarobedi",
"someamarobongwe",
"lekgolo",
"sekete",
"milione",
"bilione",
"terilione",
"kwatirilione",
"gajillione",
"bazillione",
]
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
text_lower = text.lower()
if text_lower in _num_words:
return True
# CHeck ordinal number
if text_lower in _ordinal_words:
return True
if text_lower.endswith("th"):
if text_lower[:-2].isdigit():
return True
return False
LEX_ATTRS = {LIKE_NUM: like_num}

View File

@ -0,0 +1,19 @@
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)
TOKENIZER_INFIXES = _infixes

View File

@ -0,0 +1,24 @@
coding: utf8
from __future__ import unicode_literals
# Stop words
STOP_WORDS = set("""
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
sengwe fa go le jalo gongwe ba na mo tikologong
jaaka kwa morago nna gonne ka sa pele nako teng
tlase fela ntle magareng tsona feta bobedi kgabaganya
moo gape kgatlhanong botlhe tsotlhe bokana e esi
setseng mororo dinako golo kgolo nnye wena gago
o ntse ntle tla goreng gangwe mang yotlhe gore
eo yona tseraganyo eng ne sentle re rona thata
godimo fitlha pedi masomamabedi lesomepedi mmogo
tharo tseo boraro tseno yone jaanong bobona bona
lesome tsaya tsamaiso nngwe masomethataro thataro
tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
bonala e tshwanang bogolo tsenya tsweetswee karolo
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
tlhano lesometlhano botlalo lekgolo
""".split())
print(STOP_WORDS)

22
spacy/lang/tn/tag_map.py Normal file
View File

@ -0,0 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
TAG_MAP = {
"INT": {POS: INTJ},
"JUNC": {POS: CCONJ},
"$": {POS: PUNCT},
"PROPOSS": {POS: PRON},
"PROQUANT": {POS: PRON},
"PROEMP": {POS: PRON},
"NUM": {POS: NUM},
"N": {POS: NOUN},
"AUX": {POS: VERB},
"ADV": {POS: ADV},
"ADJ": {POS: ADJ},
"V": {POS: VERB},
"VCOP": {POS: VERB},
}