From 6942a6a69b5a50f6864427661bcd59403acfbd72 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sat, 14 Sep 2019 15:25:48 +0200 Subject: [PATCH] Extend default punct for sentencizer (#4290) Most of these characters are for languages / writing systems that aren't supported by spacy, but I don't think it causes problems to include them. In the UD evals, Hindi and Urdu improve a lot as expected (from 0-10% to 70-80%) and Persian improves a little (90% to 96%). Tamil improves in combination with #4288. The punctuation list is converted to a set internally because of its increased length. Sentence final punctuation generated with: ``` unichars -gas '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' '\p{Terminal_Punctuation}' ``` See: https://stackoverflow.com/a/9508766/461847 Fixes #4269. --- spacy/pipeline/pipes.pyx | 24 ++++++++++++++++++------ spacy/tests/pipeline/test_sentencizer.py | 4 ++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 412433565..190116a2e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1371,7 +1371,16 @@ class Sentencizer(object): """ name = "sentencizer" - default_punct_chars = [".", "!", "?"] + default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', + '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', + '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', + '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', + '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', + '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', + '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', + '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', + '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', + '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈'] def __init__(self, punct_chars=None, **kwargs): """Initialize the sentencizer. @@ -1382,7 +1391,10 @@ class Sentencizer(object): DOCS: https://spacy.io/api/sentencizer#init """ - self.punct_chars = punct_chars or self.default_punct_chars + if punct_chars: + self.punct_chars = set(punct_chars) + else: + self.punct_chars = set(self.default_punct_chars) def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. @@ -1414,7 +1426,7 @@ class Sentencizer(object): DOCS: https://spacy.io/api/sentencizer#to_bytes """ - return srsly.msgpack_dumps({"punct_chars": self.punct_chars}) + return srsly.msgpack_dumps({"punct_chars": list(self.punct_chars)}) def from_bytes(self, bytes_data, **kwargs): """Load the sentencizer from a bytestring. @@ -1425,7 +1437,7 @@ class Sentencizer(object): DOCS: https://spacy.io/api/sentencizer#from_bytes """ cfg = srsly.msgpack_loads(bytes_data) - self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) + self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self def to_disk(self, path, exclude=tuple(), **kwargs): @@ -1435,7 +1447,7 @@ class Sentencizer(object): """ path = util.ensure_path(path) path = path.with_suffix(".json") - srsly.write_json(path, {"punct_chars": self.punct_chars}) + srsly.write_json(path, {"punct_chars": list(self.punct_chars)}) def from_disk(self, path, exclude=tuple(), **kwargs): @@ -1446,7 +1458,7 @@ class Sentencizer(object): path = util.ensure_path(path) path = path.with_suffix(".json") cfg = srsly.read_json(path) - self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) + self.punct_chars = set(cfg.get("punct_chars", self.default_punct_chars)) return self diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index c1b3eba45..1e03dc743 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -81,7 +81,7 @@ def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_s def test_sentencizer_serialize_bytes(en_vocab): punct_chars = [".", "~", "+"] sentencizer = Sentencizer(punct_chars=punct_chars) - assert sentencizer.punct_chars == punct_chars + assert sentencizer.punct_chars == set(punct_chars) bytes_data = sentencizer.to_bytes() new_sentencizer = Sentencizer().from_bytes(bytes_data) - assert new_sentencizer.punct_chars == punct_chars + assert new_sentencizer.punct_chars == set(punct_chars)