From 24046fef17f211ec7e131c87f7371001f15fa625 Mon Sep 17 00:00:00 2001
From: Shumi <76557637+Shumie82@users.noreply.github.com>
Date: Wed, 10 Feb 2021 20:12:33 +0200
Subject: [PATCH 1/9] South African Setswana language

Please accept the addition of the Setswana language.
---
 spacy/lang/tn/__init__.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 spacy/lang/tn/__init__.py

diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py
new file mode 100644
index 000000000..911214331
--- /dev/null
+++ b/spacy/lang/tn/__init__.py
@@ -0,0 +1,18 @@
+from .stop_words import STOP_WORDS
+from .lex_attrs import LEX_ATTRS
+from .punctuation import TOKENIZER_INFIXES
+from ...language import Language
+
+
+class SetswanaDefaults(Language.Defaults):
+    suffixes = TOKENIZER_SUFFIXES
+    stop_words = STOP_WORDS
+    lex_attr_getters = LEX_ATTRS
+
+
+class Setswana(Language):
+    lang = "tn"
+    Defaults = SetswanaDefaults
+
+
+__all__ = ["Setswana"]

From f6be28cfb231111a970d60b19efda2996c917373 Mon Sep 17 00:00:00 2001
From: Shumi <76557637+Shumie82@users.noreply.github.com>
Date: Wed, 10 Feb 2021 20:15:13 +0200
Subject: [PATCH 2/9] Added files to Setswana Language

Add South African Setswana Language
---
 spacy/lang/tn/examples.py    |  19 ++++++
 spacy/lang/tn/lex_attrs.py   | 110 +++++++++++++++++++++++++++++++++++
 spacy/lang/tn/punctuation.py |  19 ++++++
 spacy/lang/tn/stop_words.py  |  24 ++++++++
 spacy/lang/tn/tag_map.py     |  22 +++++++
 5 files changed, 194 insertions(+)
 create mode 100644 spacy/lang/tn/examples.py
 create mode 100644 spacy/lang/tn/lex_attrs.py
 create mode 100644 spacy/lang/tn/punctuation.py
 create mode 100644 spacy/lang/tn/stop_words.py
 create mode 100644 spacy/lang/tn/tag_map.py

diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py
new file mode 100644
index 000000000..9039a1624
--- /dev/null
+++ b/spacy/lang/tn/examples.py
@@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple e nyaka go reka JSE ka tlhwatlhwa ta R1 billion",
+    "Johannesburg ke toropo e kgolo mo Afrika Borwa.",
+    "O ko kae?",
+    "ke mang presidente ya Afrika Borwa?",
+    "ke eng toropo kgolo ya Afrika Borwa?",
+    "Nelson Mandela o belegwe leng?",
+]
\ No newline at end of file
diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py
new file mode 100644
index 000000000..daef45d72
--- /dev/null
+++ b/spacy/lang/tn/lex_attrs.py
@@ -0,0 +1,110 @@
+coding: utf8
+
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+_num_words = [
+    "lefela",
+    "nngwe",
+    "pedi",
+    "tharo",
+    "nne",
+    "tlhano",
+    "thataro",
+    "supa",
+    "robedi",
+    "robongwe",
+    "lesome",
+    "lesomenngwe",
+    "lesomepedi",
+    "sometharo",
+    "somenne",
+    "sometlhano",
+    "somethataro",
+    "somesupa",
+    "somerobedi",
+    "somerobongwe",
+    "someamabedi",
+    "someamararo",
+    "someamane",
+    "someamatlhano",
+    "someamarataro",
+    "someamasupa",
+    "someamarobedi",
+    "someamarobongwe",
+    "lekgolo",
+    "sekete",
+    "milione",
+    "bilione",
+    "terilione",
+    "kwatirilione",
+    "gajillione",
+    "bazillione",
+]
+
+
+_ordinal_words = [  # compared against lower-cased text in like_num
+    "ntlha",
+    "bobedi",
+    "boraro",
+    "bone",
+    "botlhano",
+    "borataro",
+    "bosupa",
+    "borobedi",
+    "borobongwe",
+    "bolesome",
+    "bolesomengwe",
+    "bolesomepedi",
+    "bolesometharo",
+    "bolesomenne",
+    "bolesometlhano",
+    "bolesomethataro",
+    "bolesomesupa",
+    "bolesomerobedi",
+    "bolesomerobongwe",
+    "somamabedi",
+    "someamararo",
+    "someamane",
+    "someamatlhano",
+    "someamarataro",
+    "someamasupa",
+    "someamarobedi",
+    "someamarobongwe",
+    "lekgolo",
+    "sekete",
+    "milione",
+    "bilione",
+    "terilione",
+    "kwatirilione",
+    "gajillione",
+    "bazillione",
+]
+
+def like_num(text):
+    """Return True if `text` looks like a number.
+
+    Accepts digits (after dropping one leading +, -, ± or ~ and any "," or
+    "."), simple fractions, listed number words, and digits followed by "th".
+    """
+    if text.startswith(("+", "-", "±", "~")):
+        text = text[1:]
+    # Separators carry no information for this check, so drop them first.
+    stripped = text.replace(",", "").replace(".", "")
+    if stripped.isdigit():
+        return True
+    # Simple fraction: exactly one slash with digits on both sides.
+    if stripped.count("/") == 1:
+        numerator, denominator = stripped.split("/")
+        if numerator.isdigit() and denominator.isdigit():
+            return True
+    # Cardinal and ordinal number words match case-insensitively.
+    lowered = stripped.lower()
+    if lowered in _num_words or lowered in _ordinal_words:
+        return True
+    # Finally accept digits with a "th" ordinal suffix, e.g. "10th".
+    return lowered.endswith("th") and lowered[:-2].isdigit()
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/tn/punctuation.py b/spacy/lang/tn/punctuation.py
new file mode 100644
index 000000000..241ad39af
--- /dev/null
+++ b/spacy/lang/tn/punctuation.py
@@ -0,0 +1,19 @@
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS, HYPHENS
+from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+
+_infixes = (
+    LIST_ELLIPSES
+    + LIST_ICONS
+    + [
+        r"(?<=[0-9])[+\-\*^](?=[0-9-])",
+        r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
+            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
+        ),
+        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
+    ]
+)
+
+
+TOKENIZER_INFIXES = _infixes
\ No newline at end of file
diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py
new file mode 100644
index 000000000..65681f6ee
--- /dev/null
+++ b/spacy/lang/tn/stop_words.py
@@ -0,0 +1,24 @@
+coding: utf8
+
+from __future__ import unicode_literals
+
+
+# Stop words
+STOP_WORDS = set("""
+ke gareng ga selekanyo tlhwatlhwa yo mongwe se 
+sengwe fa go le jalo gongwe ba na mo tikologong
+jaaka kwa morago nna gonne ka sa pele nako teng 
+tlase fela ntle magareng tsona feta bobedi kgabaganya
+moo gape kgatlhanong botlhe tsotlhe bokana e esi
+setseng mororo dinako golo kgolo nnye wena gago 
+o ntse ntle tla goreng gangwe mang yotlhe gore 
+eo yona tseraganyo eng ne sentle re rona thata 
+godimo fitlha pedi masomamabedi lesomepedi mmogo 
+tharo tseo boraro tseno yone jaanong bobona bona 
+lesome tsaya tsamaiso nngwe masomethataro thataro 
+tsa mmatota tota sale thoko supa dira tshwanetse di mmalwa masisi
+bonala e tshwanang bogolo tsenya tsweetswee karolo 
+sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa 
+tlhano lesometlhano botlalo lekgolo           
+""".split())
+print(STOP_WORDS)
\ No newline at end of file
diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py
new file mode 100644
index 000000000..1c7f0647f
--- /dev/null
+++ b/spacy/lang/tn/tag_map.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
+
+
+TAG_MAP = {    
+    "INT": {POS: INTJ}, 
+    "JUNC": {POS: CCONJ},   
+    "$": {POS: PUNCT},   
+    "PROPOSS": {POS: PRON},
+    "PROQUANT": {POS: PRON},
+    "PROEMP": {POS: PRON},
+    "NUM": {POS: NUM},
+    "N": {POS: NOUN},
+    "AUX": {POS: VERB},
+    "ADV": {POS: ADV},
+    "ADJ": {POS: ADJ},
+    "V": {POS: VERB},
+    "VCOP": {POS: VERB},
+}

From 7c8721b1bd3b12719a2db395e237d8b496a3414c Mon Sep 17 00:00:00 2001
From: Shumi <76557637+Shumie82@users.noreply.github.com>
Date: Wed, 10 Feb 2021 20:21:22 +0200
Subject: [PATCH 3/9] Update tag_map.py

Updated tag_map
---
 spacy/lang/tn/tag_map.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py
index 1c7f0647f..e26f4c4e1 100644
--- a/spacy/lang/tn/tag_map.py
+++ b/spacy/lang/tn/tag_map.py
@@ -1,4 +1,4 @@
-# coding: utf8
+coding: utf8
 from __future__ import unicode_literals
 
 from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB

From ed3397727e3cf3cc7b8ff9a89224fe894424392d Mon Sep 17 00:00:00 2001
From: Shumi <76557637+Shumie82@users.noreply.github.com>
Date: Wed, 10 Feb 2021 20:41:18 +0200
Subject: [PATCH 4/9] Delete tag_map.py

The tag map file is deleted for now because it was failing validation; I will add it back later.
---
 spacy/lang/tn/tag_map.py | 22 ----------------------
 1 file changed, 22 deletions(-)
 delete mode 100644 spacy/lang/tn/tag_map.py

diff --git a/spacy/lang/tn/tag_map.py b/spacy/lang/tn/tag_map.py
deleted file mode 100644
index e26f4c4e1..000000000
--- a/spacy/lang/tn/tag_map.py
+++ /dev/null
@@ -1,22 +0,0 @@
-coding: utf8
-from __future__ import unicode_literals
-
-from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
-
-
-TAG_MAP = {    
-    "INT": {POS: INTJ}, 
-    "JUNC": {POS: CCONJ},   
-    "$": {POS: PUNCT},   
-    "PROPOSS": {POS: PRON},
-    "PROQUANT": {POS: PRON},
-    "PROEMP": {POS: PRON},
-    "NUM": {POS: NUM},
-    "N": {POS: NOUN},
-    "AUX": {POS: VERB},
-    "ADV": {POS: ADV},
-    "ADJ": {POS: ADJ},
-    "V": {POS: VERB},
-    "VCOP": {POS: VERB},
-}

From 39eeba6760c6011e3372ea2a359bfc7b056bfa1e Mon Sep 17 00:00:00 2001
From: Shumi <76557637+Shumie82@users.noreply.github.com>
Date: Thu, 11 Feb 2021 21:20:46 +0200
Subject: [PATCH 5/9] Update __init__.py

Added infixes = TOKENIZER_INFIXES
---
 spacy/lang/tn/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py
index 911214331..648772528 100644
--- a/spacy/lang/tn/__init__.py
+++ b/spacy/lang/tn/__init__.py
@@ -6,6 +6,7 @@ from ...language import Language
 
 class SetswanaDefaults(Language.Defaults):
     suffixes = TOKENIZER_SUFFIXES
+    infixes = TOKENIZER_INFIXES    
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
 

From 37ec67f868ec803423cd76af28f8116c326ebedd Mon Sep 17 00:00:00 2001
From: Shumi <76557637+Shumie82@users.noreply.github.com>
Date: Thu, 11 Feb 2021 21:25:58 +0200
Subject: [PATCH 6/9] Update examples.py

I have removed two lines:
# coding: utf8
from __future__ import unicode_literals

And updated: >>> from spacy.lang.tn.examples import sentences
---
 spacy/lang/tn/examples.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/spacy/lang/tn/examples.py b/spacy/lang/tn/examples.py
index 9039a1624..7b33fae5a 100644
--- a/spacy/lang/tn/examples.py
+++ b/spacy/lang/tn/examples.py
@@ -1,10 +1,6 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
->>> from spacy.lang.en.examples import sentences
+>>> from spacy.lang.tn.examples import sentences
 >>> docs = nlp.pipe(sentences)
 """
 
@@ -16,4 +12,4 @@ sentences = [
     "ke mang presidente ya Afrika Borwa?",
     "ke eng toropo kgolo ya Afrika Borwa?",
     "Nelson Mandela o belegwe leng?",
-]
\ No newline at end of file
+]

From 0d57e84b7baa35aaadeba7346c63a98c07511869 Mon Sep 17 00:00:00 2001
From: Shumi <76557637+Shumie82@users.noreply.github.com>
Date: Thu, 11 Feb 2021 21:28:23 +0200
Subject: [PATCH 7/9] Update lex_attrs.py

I have removed lines 1 to 4.
---
 spacy/lang/tn/lex_attrs.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/spacy/lang/tn/lex_attrs.py b/spacy/lang/tn/lex_attrs.py
index daef45d72..33a16a09a 100644
--- a/spacy/lang/tn/lex_attrs.py
+++ b/spacy/lang/tn/lex_attrs.py
@@ -1,7 +1,3 @@
-coding: utf8
-
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
 
 _num_words = [

From 4e514f1ea8afcf341cc1d9b923eb7667b4b287c9 Mon Sep 17 00:00:00 2001
From: Shumi <76557637+Shumie82@users.noreply.github.com>
Date: Thu, 11 Feb 2021 21:30:34 +0200
Subject: [PATCH 8/9] Update stop_words.py

I have deleted lines 1 to 5 and the statement print(STOP_WORDS).
---
 spacy/lang/tn/stop_words.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/spacy/lang/tn/stop_words.py b/spacy/lang/tn/stop_words.py
index 65681f6ee..a627ef362 100644
--- a/spacy/lang/tn/stop_words.py
+++ b/spacy/lang/tn/stop_words.py
@@ -1,8 +1,3 @@
-coding: utf8
-
-from __future__ import unicode_literals
-
-
 # Stop words
 STOP_WORDS = set("""
 ke gareng ga selekanyo tlhwatlhwa yo mongwe se 
@@ -21,4 +16,3 @@ bonala e tshwanang bogolo tsenya tsweetswee karolo
 sepe tlhalosa dirwa robedi robongwe lesomenngwe gaisa 
 tlhano lesometlhano botlalo lekgolo           
 """.split())
-print(STOP_WORDS)
\ No newline at end of file

From 6c450decfc01e2d82f0b7c8f799654d79158fa4c Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Sat, 13 Feb 2021 11:51:21 +1100
Subject: [PATCH 9/9] Fix punctuation settings and add to initialize tests

---
 spacy/lang/tn/__init__.py           | 3 +--
 spacy/tests/lang/test_initialize.py | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/lang/tn/__init__.py b/spacy/lang/tn/__init__.py
index 648772528..99907c28a 100644
--- a/spacy/lang/tn/__init__.py
+++ b/spacy/lang/tn/__init__.py
@@ -5,8 +5,7 @@ from ...language import Language
 
 
 class SetswanaDefaults(Language.Defaults):
-    suffixes = TOKENIZER_SUFFIXES
-    infixes = TOKENIZER_INFIXES    
+    infixes = TOKENIZER_INFIXES
     stop_words = STOP_WORDS
     lex_attr_getters = LEX_ATTRS
 
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
index de1871e64..46f1f2bd1 100644
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@@ -8,7 +8,8 @@ from spacy.util import get_lang_class
 LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
              "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
              "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tr", "tt", "ur", 'yo']
+             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
+             "yo"]
 # fmt: on