From bee79619278a5e055ac744798a6ea0001951c94b Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Sat, 14 Sep 2019 14:23:06 +0200
Subject: [PATCH 1/4] Add Kannada, Tamil, and Telugu unicode blocks (#4288)

Add the Kannada, Tamil, and Telugu Unicode blocks to the uncased
character classes so that the period is recognized as a suffix during
tokenization. (I'm sure a few symbols in these blocks should not be
ALPHA, but the classes are mainly relevant for suffix detection, and
this seems to be an improvement in practice.)
---
 spacy/lang/char_classes.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 9f6c3266e..131bdcd51 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -11,6 +11,12 @@ _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F"
 
 _hindi = r"\u0900-\u097F"
 
+_kannada = r"\u0C80-\u0CFF"
+
+_tamil = r"\u0B80-\u0BFF"
+
+_telugu = r"\u0C00-\u0C7F"
+
 # Latin standard
 _latin_u_standard = r"A-Z"
 _latin_l_standard = r"a-z"
@@ -195,7 +201,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
 
 _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
 _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
-_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi
+_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu
 
 ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
 ALPHA_LOWER = group_chars(_lower + _uncased)

From 6942a6a69b5a50f6864427661bcd59403acfbd72 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Sat, 14 Sep 2019 15:25:48 +0200
Subject: [PATCH 2/4] Extend default punct for sentencizer (#4290)

Most of these characters are for languages / writing systems that
aren't supported by spaCy, but I don't think it causes problems to
include them.

In the UD evals, Hindi and Urdu improve a lot as expected (from 0-10%
to 70-80%), and Persian improves a little (90% to 96%). Tamil improves
in combination with #4288.

The punctuation list is converted to a set internally because of its
increased length.

The sentence-final punctuation list was generated with:

```
unichars -gas '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' '\p{Terminal_Punctuation}'
```

See: https://stackoverflow.com/a/9508766/461847

Fixes #4269.
---
 spacy/pipeline/pipes.pyx                 | 24 ++++++++++++++++++------
 spacy/tests/pipeline/test_sentencizer.py |  4 ++--
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 412433565..190116a2e 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1371,7 +1371,16 @@ class Sentencizer(object):
     """
 
     name = "sentencizer"
-    default_punct_chars = [".", "!", "?"]
+    default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
+            '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
+            '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
+            '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶',
+            '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒',
+            '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀',
+            '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼',
+            '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐',
+            '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂',
+            '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈']
 
     def __init__(self, punct_chars=None, **kwargs):
        """Initialize the sentencizer.
@@ -1382,7 +1391,10 @@ class Sentencizer(object): DOCS: https://spacy.io/api/sentencizer#init """ - self.punct_chars = punct_chars or self.default_punct_chars + if punct_chars: + self.punct_chars = set(punct_chars) + else: + self.punct_chars = set(self.default_punct_chars) def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. @@ -1414,7 +1426,7 @@ class Sentencizer(object): DOCS: https://spacy.io/api/sentencizer#to_bytes """ - return srsly.msgpack_dumps({"punct_chars": self.punct_chars}) + return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) def from_bytes(self, bytes_data, **kwargs): """Load the sentencizer from a bytestring. @@ -1425,7 +1437,7 @@ class Sentencizer(object): DOCS: https://spacy.io/api/sentencizer#from_bytes """ cfg = srsly.msgpack_loads(bytes_data) - self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) + self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self def to_disk(self, path, exclude=tuple(), **kwargs): @@ -1435,7 +1447,7 @@ class Sentencizer(object): """ path = util.ensure_path(path) path = path.with_suffix(".json") - srsly.write_json(path, {"punct_chars": self.punct_chars}) + srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) def from_disk(self, path, exclude=tuple(), **kwargs): @@ -1446,7 +1458,7 @@ class Sentencizer(object): path = util.ensure_path(path) path = path.with_suffix(".json") cfg = srsly.read_json(path) - self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) + self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index c1b3eba45..1e03dc743 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -81,7 +81,7 @@ def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_s def test_sentencizer_serialize_bytes(en_vocab): punct_chars = [".", "~", "+"] sentencizer = Sentencizer(punct_chars=punct_chars) - assert sentencizer.punct_chars == punct_chars + assert sentencizer.punct_chars == set(punct_chars) bytes_data = sentencizer.to_bytes() new_sentencizer = Sentencizer().from_bytes(bytes_data) - assert new_sentencizer.punct_chars == punct_chars + assert new_sentencizer.punct_chars == set(punct_chars) From 76d26a3d5e2cf604d4a2247fb0bb75f5ea110333 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 14 Sep 2019 16:32:24 +0200 Subject: [PATCH 3/4] Update site.json [ci skip] --- website/meta/site.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/website/meta/site.json b/website/meta/site.json index 2b02ef953..edb60ab0c 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -10,10 +10,7 @@ "modelsRepo": "explosion/spacy-models", "social": { "twitter": "spacy_io", - "github": "explosion", - "reddit": "spacynlp", - "codepen": "explosion", - "gitter": "explosion/spaCy" + "github": "explosion" }, "theme": "#09a3d5", "analytics": "UA-58931649-1", @@ -69,6 +66,7 @@ "items": [ { "text": "Twitter", "url": "https://twitter.com/spacy_io" }, { "text": "GitHub", "url": "https://github.com/explosion/spaCy" }, + { "text": "YouTube", "url": "https://youtube.com/c/ExplosionAI" }, { "text": "Blog", "url": "https://explosion.ai/blog" } ] } From 04d36d2471bc48548abbae6c8913b2371b68d3bf Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 14 Sep 2019 16:41:19 +0200 Subject: [PATCH 4/4] Remove unused link [ci skip] --- 
website/docs/usage/v2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index 9e54106c7..a412eeba4 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -107,7 +107,7 @@ process. -**Usage:** [Models directory](/models) [Benchmarks](#benchmarks) +**Usage:** [Models directory](/models)
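
As a quick check of the behavior in patch 1/4: the sketch below is illustrative only, and assumes a spaCy build with the patch applied plus the existing Tamil language class (`spacy.lang.ta`). With the Tamil block counted as ALPHA, the default suffix rules can split a trailing period:

```
# Sketch only: assumes patch 1/4 is applied and spacy.lang.ta is available.
from spacy.lang.ta import Tamil

nlp = Tamil()
doc = nlp("வணக்கம்.")
# The Tamil block \u0B80-\u0BFF is now part of ALPHA, so the period after
# a Tamil word matches the suffix rules and becomes its own token.
print([token.text for token in doc])  # expected: ['வணக்கம்', '.']
```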
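The extended sentencizer defaults from patch 2/4 can be exercised the same way. A minimal sketch, assuming the Hindi language class and the v2-style `add_pipe` API; the sample sentences are illustrative only:

```
# Sketch only: assumes patch 2/4 is applied.
from spacy.lang.hi import Hindi
from spacy.pipeline import Sentencizer

nlp = Hindi()
nlp.add_pipe(Sentencizer())  # the devanagari danda '।' is now in the defaults
doc = nlp("यह पहला वाक्य है। यह दूसरा वाक्य है।")
print([sent.text for sent in doc.sents])
# expected: ['यह पहला वाक्य है।', 'यह दूसरा वाक्य है।']
```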
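Because `punct_chars` is now stored as a set internally and converted to a plain list for serialization, round-trips behave as in the updated test. A small sketch mirroring it:

```
# Sketch only: mirrors the updated serialization test in patch 2/4.
from spacy.pipeline import Sentencizer

sentencizer = Sentencizer(punct_chars=[".", "؟", "۔"])
assert sentencizer.punct_chars == {".", "؟", "۔"}  # stored as a set

data = sentencizer.to_bytes()  # the set is serialized as a list for msgpack
restored = Sentencizer().from_bytes(data)
assert restored.punct_chars == {".", "؟", "۔"}  # restored back into a set
```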