diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 9f6c3266e..131bdcd51 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -11,6 +11,12 @@ _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F" _hindi = r"\u0900-\u097F" +_kannada = r"\u0C80-\u0CFF" + +_tamil = r"\u0B80-\u0BFF" + +_telugu = r"\u0C00-\u0C7F" + # Latin standard _latin_u_standard = r"A-Z" _latin_l_standard = r"a-z" @@ -195,7 +201,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ" _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower -_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi +_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2553ac83d..3e4b3582c 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1375,7 +1375,16 @@ class Sentencizer(object): """ name = "sentencizer" - default_punct_chars = [".", "!", "?"] + default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', + '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', + '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', + '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', + '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', + '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', + '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', + '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', + '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', + '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈'] def __init__(self, punct_chars=None, **kwargs): """Initialize the sentencizer. @@ -1386,7 +1395,10 @@ class Sentencizer(object): DOCS: https://spacy.io/api/sentencizer#init """ - self.punct_chars = punct_chars or self.default_punct_chars + if punct_chars: + self.punct_chars = set(punct_chars) + else: + self.punct_chars = set(self.default_punct_chars) def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. @@ -1418,7 +1430,7 @@ class Sentencizer(object): DOCS: https://spacy.io/api/sentencizer#to_bytes """ - return srsly.msgpack_dumps({"punct_chars": self.punct_chars}) + return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) def from_bytes(self, bytes_data, **kwargs): """Load the sentencizer from a bytestring. @@ -1429,7 +1441,7 @@ class Sentencizer(object): DOCS: https://spacy.io/api/sentencizer#from_bytes """ cfg = srsly.msgpack_loads(bytes_data) - self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) + self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self def to_disk(self, path, exclude=tuple(), **kwargs): @@ -1439,7 +1451,7 @@ class Sentencizer(object): """ path = util.ensure_path(path) path = path.with_suffix(".json") - srsly.write_json(path, {"punct_chars": self.punct_chars}) + srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) def from_disk(self, path, exclude=tuple(), **kwargs): @@ -1450,7 +1462,7 @@ class Sentencizer(object): path = util.ensure_path(path) path = path.with_suffix(".json") cfg = srsly.read_json(path) - self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) + self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index c1b3eba45..1e03dc743 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -81,7 +81,7 @@ def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_s def test_sentencizer_serialize_bytes(en_vocab): punct_chars = [".", "~", "+"] sentencizer = Sentencizer(punct_chars=punct_chars) - assert sentencizer.punct_chars == punct_chars + assert sentencizer.punct_chars == set(punct_chars) bytes_data = sentencizer.to_bytes() new_sentencizer = Sentencizer().from_bytes(bytes_data) - assert new_sentencizer.punct_chars == punct_chars + assert new_sentencizer.punct_chars == set(punct_chars) diff --git a/website/docs/usage/v2.md b/website/docs/usage/v2.md index d7011fb2d..0ac8bfe75 100644 --- a/website/docs/usage/v2.md +++ b/website/docs/usage/v2.md @@ -107,7 +107,7 @@ process. -**Usage:** [Models directory](/models) [Benchmarks](#benchmarks) +**Usage:** [Models directory](/models) diff --git a/website/meta/site.json b/website/meta/site.json index 2b02ef953..edb60ab0c 100644 --- a/website/meta/site.json +++ b/website/meta/site.json @@ -10,10 +10,7 @@ "modelsRepo": "explosion/spacy-models", "social": { "twitter": "spacy_io", - "github": "explosion", - "reddit": "spacynlp", - "codepen": "explosion", - "gitter": "explosion/spaCy" + "github": "explosion" }, "theme": "#09a3d5", "analytics": "UA-58931649-1", @@ -69,6 +66,7 @@ "items": [ { "text": "Twitter", "url": "https://twitter.com/spacy_io" }, { "text": "GitHub", "url": "https://github.com/explosion/spaCy" }, + { "text": "YouTube", "url": "https://youtube.com/c/ExplosionAI" }, { "text": "Blog", "url": "https://explosion.ai/blog" } ] }