mirror of https://github.com/explosion/spaCy.git
Added files to Setswana Language
Add South African Setswana Language
This commit is contained in:
parent
24046fef17
commit
f6be28cfb2
|
@ -0,0 +1,19 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
"""
|
||||
Example sentences to test spaCy and its language models.
>>> from spacy.lang.tn.examples import sentences
>>> docs = nlp.pipe(sentences)
|
||||
"""
|
||||
|
||||
|
||||
# Setswana example sentences used to try out the tokenizer and language
# models. The strings are runtime data and are kept exactly as authored.
sentences = [
    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
    "O ko kae?",
    "ke mang presidente ya Afrika Borwa?",
    "ke eng toropo kgolo ya Afrika Borwa?",
    "Nelson Mandela o belegwe leng?",
]
|
|
@ -0,0 +1,110 @@
|
|||
# coding: utf8
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
_num_words = [
|
||||
"lefela",
|
||||
"nngwe",
|
||||
"pedi",
|
||||
"tharo",
|
||||
"nne",
|
||||
"tlhano",
|
||||
"thataro",
|
||||
"supa",
|
||||
"robedi",
|
||||
"robongwe",
|
||||
"lesome",
|
||||
"lesomenngwe",
|
||||
"lesomepedi",
|
||||
"sometharo",
|
||||
"somenne",
|
||||
"sometlhano",
|
||||
"somethataro",
|
||||
"somesupa",
|
||||
"somerobedi",
|
||||
"somerobongwe",
|
||||
"someamabedi",
|
||||
"someamararo",
|
||||
"someamane",
|
||||
"someamatlhano",
|
||||
"someamarataro",
|
||||
"someamasupa",
|
||||
"someamarobedi",
|
||||
"someamarobongwe",
|
||||
"lekgolo",
|
||||
"sekete",
|
||||
"milione",
|
||||
"bilione",
|
||||
"terilione",
|
||||
"kwatirilione",
|
||||
"gajillione",
|
||||
"bazillione",
|
||||
]
|
||||
|
||||
|
||||
_ordinal_words = [
|
||||
"ntlha",
|
||||
"bobedi",
|
||||
"boraro",
|
||||
"bone",
|
||||
"botlhano",
|
||||
"borataro",
|
||||
"bosupa",
|
||||
"borobedi ",
|
||||
"borobongwe",
|
||||
"bolesome",
|
||||
"bolesomengwe",
|
||||
"bolesomepedi",
|
||||
"bolesometharo",
|
||||
"bolesomenne",
|
||||
"bolesometlhano",
|
||||
"bolesomethataro",
|
||||
"bolesomesupa",
|
||||
"bolesomerobedi",
|
||||
"bolesomerobongwe",
|
||||
"somamabedi",
|
||||
"someamararo",
|
||||
"someamane",
|
||||
"someamatlhano",
|
||||
"someamarataro",
|
||||
"someamasupa",
|
||||
"someamarobedi",
|
||||
"someamarobongwe",
|
||||
"lekgolo",
|
||||
"sekete",
|
||||
"milione",
|
||||
"bilione",
|
||||
"terilione",
|
||||
"kwatirilione",
|
||||
"gajillione",
|
||||
"bazillione",
|
||||
]
|
||||
|
||||
def like_num(text):
    """Return True if ``text`` looks like a number.

    A token counts as number-like when, after removing one leading
    ``+``, ``-``, ``±`` or ``~`` and all ``,`` / ``.`` characters, it is:

    * made up only of digits,
    * a simple fraction ``<digits>/<digits>``,
    * listed (case-insensitively) in ``_num_words`` or ``_ordinal_words``, or
    * digits followed by the suffix ``th``.

    text: the token text to inspect.
    RETURNS: ``True`` when one of the rules above matches, else ``False``.
    """
    # Only a single leading sign/approximation marker is dropped.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Remove separators so that "1,000" and "3.14" reduce to plain digits.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    # Exactly one slash: accept digit/digit fractions such as "1/2".
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True

    text_lower = text.lower()
    if text_lower in _num_words:
        return True

    # Check ordinal number
    if text_lower in _ordinal_words:
        return True
    if text_lower.endswith("th"):
        if text_lower[:-2].isdigit():
            return True

    return False
|
||||
|
||||
|
||||
# Lexical attribute getters defined by this module: the LIKE_NUM attribute
# is bound to the like_num() function above.
LEX_ATTRS = {LIKE_NUM: like_num}
|
|
@ -0,0 +1,19 @@
|
|||
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
|
||||
from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
||||
|
||||
# Infix patterns: the shared ellipsis and icon lists followed by regular
# expressions built from the imported character classes.
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        # One of + - * ^ preceded by a digit and followed by a digit or "-".
        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
        # A period preceded by a lower-case letter or quote and followed by
        # an upper-case letter or quote.
        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
        ),
        # A comma directly between two alphabetic characters.
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        # A hyphen variant preceded by a letter or digit, followed by a letter.
        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
        # One of : < > = / preceded by a letter or digit, followed by a letter.
        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
    ]
)


# Public name under which the infix patterns are exported.
TOKENIZER_INFIXES = _infixes
|
|
@ -0,0 +1,24 @@
|
|||
# coding: utf8
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Stop words
|
||||
# Setswana stop words, written as one whitespace-separated block and split
# into a set. Duplicates in the text collapse automatically. The module no
# longer prints the set at import time: that was a debugging leftover which
# wrote to stdout whenever the language data was loaded.
STOP_WORDS = set(
    """
ke gareng ga selekanyo tlhwatlhwa yo mongwe se
sengwe fa go le jalo gongwe ba na mo tikologong
jaaka kwa morago nna gonne ka sa pele nako teng
tlase fela ntle magareng tsona feta bobedi kgabaganya
moo gape kgatlhanong botlhe tsotlhe bokana e esi
setseng mororo dinako golo kgolo nnye wena gago
o ntse ntle tla goreng gangwe mang yotlhe gore
eo yona tseraganyo eng ne sentle re rona thata
godimo fitlha pedi masomamabedi lesomepedi mmogo
tharo tseo boraro tseno yone jaanong bobona bona
lesome tsaya tsamaiso nngwe masomethataro thataro
tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
bonala e tshwanang bogolo tsenya tsweetswee karolo
sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
tlhano lesometlhano botlalo lekgolo
""".split()
)
|
|
@ -0,0 +1,22 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
|
||||
|
||||
|
||||
# Maps each tag string to a dict that assigns it one of the imported
# part-of-speech symbols under the POS key.
TAG_MAP = {
    "INT": {POS: INTJ},
    "JUNC": {POS: CCONJ},
    "$": {POS: PUNCT},
    "PROPOSS": {POS: PRON},
    "PROQUANT": {POS: PRON},
    "PROEMP": {POS: PRON},
    "NUM": {POS: NUM},
    "N": {POS: NOUN},
    "AUX": {POS: VERB},
    "ADV": {POS: ADV},
    "ADJ": {POS: ADJ},
    "V": {POS: VERB},
    "VCOP": {POS: VERB},
}
|
Loading…
Reference in New Issue