diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py new file mode 100644 index 000000000..9039a1624 --- /dev/null +++ b/spacy/lang/tn/examples.py @@ -0,0 +1,19 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. +>>> from spacy.lang.en.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion", + "Johannesburg ke toropo e kgolo mo Afrika Borwa.", + "O ko kae?", + "ke mang presidente ya Afrika Borwa?", + "ke eng toropo kgolo ya Afrika Borwa?", + "Nelson Mandela o belegwe leng?", +] \ No newline at end of file diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py new file mode 100644 index 000000000..daef45d72 --- /dev/null +++ b/spacy/lang/tn/lex_attrs.py @@ -0,0 +1,110 @@ +coding: utf8 + +from __future__ import unicode_literals + +from ...attrs import LIKE_NUM + +_num_words = [ + "lefela", + "nngwe", + "pedi", + "tharo", + "nne", + "tlhano", + "thataro", + "supa", + "robedi", + "robongwe", + "lesome", + "lesomenngwe", + "lesomepedi", + "sometharo", + "somenne", + "sometlhano", + "somethataro", + "somesupa", + "somerobedi", + "somerobongwe", + "someamabedi", + "someamararo", + "someamane", + "someamatlhano", + "someamarataro", + "someamasupa", + "someamarobedi", + "someamarobongwe", + "lekgolo", + "sekete", + "milione", + "bilione", + "terilione", + "kwatirilione", + "gajillione", + "bazillione", +] + + +_ordinal_words = [ + "ntlha", + "bobedi", + "boraro", + "bone", + "botlhano", + "borataro", + "bosupa", + "borobedi ", + "borobongwe", + "bolesome", + "bolesomengwe", + "bolesomepedi", + "bolesometharo", + "bolesomenne", + "bolesometlhano", + "bolesomethataro", + "bolesomesupa", + "bolesomerobedi", + "bolesomerobongwe", + "somamabedi", + "someamararo", + "someamane", + "someamatlhano", + "someamarataro", + "someamasupa", + "someamarobedi", + "someamarobongwe", + "lekgolo", + "sekete", + "milione", + "bilione", + "terilione", + "kwatirilione", + "gajillione", + "bazillione", +] + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + + text_lower = text.lower() + if text_lower in _num_words: + return True + + # CHeck ordinal number + if text_lower in _ordinal_words: + return True + if text_lower.endswith("th"): + if text_lower[:-2].isdigit(): + return True + + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py new file mode 100644 index 000000000..241ad39af --- /dev/null +++ b/spacy/lang/tn/punctuation.py @@ -0,0 +1,19 @@ +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA + +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + ] +) + + +TOKENIZER_INFIXES = _infixes \ No newline at end of file diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py new file mode 100644 index 000000000..65681f6ee --- /dev/null +++ b/spacy/lang/tn/stop_words.py @@ -0,0 +1,24 @@ +coding: utf8 + +from __future__ import unicode_literals + + +# Stop words +STOP_WORDS = set(""" +ke gareng ga selekanyo tlhwatlhwa yo mongwe se +sengwe fa go le jalo gongwe ba na mo tikologong +jaaka kwa morago nna gonne ka sa pele nako teng +tlase fela ntle magareng tsona feta bobedi kgabaganya +moo gape kgatlhanong botlhe tsotlhe bokana e esi +setseng mororo dinako golo kgolo nnye wena gago +o ntse ntle tla goreng gangwe mang yotlhe gore +eo yona tseraganyo eng ne sentle re rona thata +godimo fitlha pedi masomamabedi lesomepedi mmogo +tharo tseo boraro tseno yone jaanong bobona bona +lesome tsaya tsamaiso nngwe masomethataro thataro +tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi +bonala e tshwanang bogolo tsenya tsweetswee karolo +sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa +tlhano lesometlhano botlalo lekgolo +""".split()) +print(STOP_WORDS) \ No newline at end of file diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py new file mode 100644 index 000000000..1c7f0647f --- /dev/null +++ b/spacy/lang/tn/tag_map.py @@ -0,0 +1,22 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON + + +TAG_MAP = { + "INT": {POS: INTJ}, + "JUNC": {POS: CCONJ}, + "$": {POS: PUNCT}, + "PROPOSS": {POS: PRON}, + "PROQUANT": {POS: PRON}, + "PROEMP": {POS: PRON}, + "NUM": {POS: NUM}, + "N": {POS: NOUN}, + "AUX": {POS: VERB}, + "ADV": {POS: ADV}, + "ADJ": {POS: ADJ}, + "V": {POS: VERB}, + "VCOP": {POS: VERB}, +}