diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py
new file mode 100644
index 000000000..99907c28a
--- /dev/null
+++ b/spacy/lang/tn/__init__.py
@@ -0,0 +1,18 @@
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from ...language import Language
+
+
+class SetswanaDefaults(Language.Defaults):
+    infixes = TOKENIZER_INFIXES
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
+
+
+class Setswana(Language):
+    lang = "tn"
+    Defaults = SetswanaDefaults
+
+
+__all__ = ["Setswana"]
diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py
new file mode 100644
index 000000000..7b33fae5a
--- /dev/null
+++ b/spacy/lang/tn/examples.py
@@ -0,0 +1,15 @@
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.tn.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
+    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
+    "O ko kae?",
+    "ke mang presidente ya Afrika Borwa?",
+    "ke eng toropo kgolo ya Afrika Borwa?",
+    "Nelson Mandela o belegwe leng?",
+]
diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py
new file mode 100644
index 000000000..33a16a09a
--- /dev/null
+++ b/spacy/lang/tn/lex_attrs.py
@@ -0,0 +1,113 @@
+from ...attrs import LIKE_NUM
+
+_num_words = [
+    "lefela",
+    "nngwe",
+    "pedi",
+    "tharo",
+    "nne",
+    "tlhano",
+    "thataro",
+    "supa",
+    "robedi",
+    "robongwe",
+    "lesome",
+    "lesomenngwe",
+    "lesomepedi",
+    "sometharo",
+    "somenne",
+    "sometlhano",
+    "somethataro",
+    "somesupa",
+    "somerobedi",
+    "somerobongwe",
+    "someamabedi",
+    "someamararo",
+    "someamane",
+    "someamatlhano",
+    "someamarataro",
+    "someamasupa",
+    "someamarobedi",
+    "someamarobongwe",
+    "lekgolo",
+    "sekete",
+    "milione",
+    "bilione",
+    "terilione",
+    "kwatirilione",
+    "gajillione",
+    "bazillione",
+]
+
+
+_ordinal_words = [
+    "ntlha",
+    "bobedi",
+    "boraro",
+    "bone",
+    "botlhano",
+    "borataro",
+    "bosupa",
+    "borobedi",
+    "borobongwe",
+    "bolesome",
+    "bolesomengwe",
+    "bolesomepedi",
+    "bolesometharo",
+    "bolesomenne",
+    "bolesometlhano",
+    "bolesomethataro",
+    "bolesomesupa",
+    "bolesomerobedi",
+    "bolesomerobongwe",
+    "somamabedi",
+    "someamararo",
+    "someamane",
+    "someamatlhano",
+    "someamarataro",
+    "someamasupa",
+    "someamarobedi",
+    "someamarobongwe",
+    "lekgolo",
+    "sekete",
+    "milione",
+    "bilione",
+    "terilione",
+    "kwatirilione",
+    "gajillione",
+    "bazillione",
+]
+
+
+def like_num(text):
+    """Return True if ``text`` resembles a number.
+
+    Accepts digit strings (optionally signed, with "," or "." separators),
+    simple fractions such as "1/2", words listed in ``_num_words`` or
+    ``_ordinal_words``, and digits followed by the suffix "th".
+    """
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    text = text.replace(",", "").replace(".", "")
+    if text.isdigit():
+        return True
+    if text.count("/") == 1:
+        num, denom = text.split("/")
+        if num.isdigit() and denom.isdigit():
+            return True
+
+    text_lower = text.lower()
+    if text_lower in _num_words:
+        return True
+
+    # Check ordinal number
+    if text_lower in _ordinal_words:
+        return True
+    if text_lower.endswith("th"):
+        if text_lower[:-2].isdigit():
+            return True
+
+    return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py
new file mode 100644
index 000000000..241ad39af
--- /dev/null
+++ b/spacy/lang/tn/punctuation.py
@@ -0,0 +1,19 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py
new file mode 100644
index 000000000..a627ef362
--- /dev/null
+++ b/spacy/lang/tn/stop_words.py
@@ -0,0 +1,18 @@
+# Stop words
+STOP_WORDS = set("""
+ke gareng ga selekanyo tlhwatlhwa yo mongwe se
+sengwe fa go le jalo gongwe ba na mo tikologong
+jaaka kwa morago nna gonne ka sa pele nako teng
+tlase fela ntle magareng tsona feta bobedi kgabaganya
+moo gape kgatlhanong botlhe tsotlhe bokana e esi
+setseng mororo dinako golo kgolo nnye wena gago
+o ntse ntle tla goreng gangwe mang yotlhe gore
+eo yona tseraganyo eng ne sentle re rona thata
+godimo fitlha pedi masomamabedi lesomepedi mmogo
+tharo tseo boraro tseno yone jaanong bobona bona
+lesome tsaya tsamaiso nngwe masomethataro thataro
+tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
+bonala e tshwanang bogolo tsenya tsweetswee karolo
+sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa
+tlhano lesometlhano botlalo lekgolo
+""".split())
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
index de1871e64..46f1f2bd1 100644
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@@ -8,7 +8,8 @@ from spacy.util import get_lang_class
 LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
              "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
              "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo']
+             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
+             "yo"]
 # fmt: on