From 1d59fdbd39876eb2aac03f66c03529fc6c40b5bc Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 17 May 2021 10:16:20 +0200
Subject: [PATCH] Update Vietnamese tokenizer (#8099)

* Adapt tokenization methods from `pyvi` to preserve text encoding and
  whitespace

* Add serialization support similar to Chinese and Japanese

Note: as with Chinese and Japanese, some settings are duplicated in
`config.cfg` and `tokenizer/cfg`.
---
 licenses/3rd_party_licenses.txt       |  31 ++++++-
 spacy/lang/vi/__init__.py             | 114 ++++++++++++++++++++++++---
 spacy/tests/conftest.py               |   6 ++
 spacy/tests/lang/vi/__init__.py       |   0
 spacy/tests/lang/vi/test_serialize.py |  33 ++++++++
 spacy/tests/lang/vi/test_tokenizer.py |  47 +++++++++++
 6 files changed, 222 insertions(+), 9 deletions(-)
 create mode 100644 spacy/tests/lang/vi/__init__.py
 create mode 100644 spacy/tests/lang/vi/test_serialize.py
 create mode 100644 spacy/tests/lang/vi/test_tokenizer.py

diff --git a/licenses/3rd_party_licenses.txt b/licenses/3rd_party_licenses.txt
index 3702ad131..7bc3d4547 100644
--- a/licenses/3rd_party_licenses.txt
+++ b/licenses/3rd_party_licenses.txt
@@ -43,8 +43,8 @@ scikit-learn
 
 * Files: scorer.py
 
-The following implementation of roc_auc_score() is adapted from
-scikit-learn, which is distributed under the following license:
+The implementation of roc_auc_score() is adapted from scikit-learn, which is
+distributed under the following license:
 
 New BSD License
 
@@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
+
+
+pyvi
+----
+
+* Files: lang/vi/__init__.py
+
+The MIT License (MIT)
+Copyright (c) 2016 Viet-Trung Tran
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py
index 1328de495..b6d873a13 100644
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,8 +1,15 @@
+from typing import Any, Dict, Union
+from pathlib import Path
+import re
+import srsly
+import string
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ... import util
 
 
 DEFAULT_CONFIG = """
@@ -40,17 +47,110 @@ class VietnameseTokenizer(DummyTokenizer):
 
     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
-            words, spaces = self.ViTokenizer.spacy_tokenize(text)
+            words = self.pyvi_tokenize(text)
+            words, spaces = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
         else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
+            words, spaces = util.get_words_and_spaces(text.split(), text)
             return Doc(self.vocab, words=words, spaces=spaces)
+
+    # The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
+    # pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
+    # See licenses/3rd_party_licenses.txt
+    def pyvi_sylabelize_with_ws(self, text):
+        """Modified from pyvi to preserve whitespace and skip unicode
+        normalization."""
+        specials = [r"==>", r"->", r"\.\.\.", r">>"]
+        digit = r"\d+([\.,_]\d+)+"
+        email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
+        web = r"\w+://[^\s]+"
+        word = r"\w+"
+        non_word = r"[^\w\s]"
+        abbreviations = [
+            r"[A-ZĐ]+\.",
+            r"Tp\.",
+            r"Mr\.",
+            r"Mrs\.",
+            r"Ms\.",
+            r"Dr\.",
+            r"ThS\.",
+        ]
+
+        patterns = []
+        patterns.extend(abbreviations)
+        patterns.extend(specials)
+        patterns.extend([web, email])
+        patterns.extend([digit, non_word, word])
+
+        patterns = r"(\s+|" + "|".join(patterns) + ")"
+        tokens = re.findall(patterns, text, re.UNICODE)
+
+        return [token[0] for token in tokens]
+
+    def pyvi_tokenize(self, text):
+        """Modified from pyvi to preserve text and whitespace."""
+        if len(text) == 0:
+            return []
+        elif text.isspace():
+            return [text]
+        segs = self.pyvi_sylabelize_with_ws(text)
+        words = []
+        preceding_ws = []
+        for i, token in enumerate(segs):
+            if not token.isspace():
+                words.append(token)
+                preceding_ws.append(
+                    "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
+                )
+        # predict per-syllable word-boundary labels with pyvi's model
+        labels = self.ViTokenizer.ViTokenizer.model.predict(
+            [self.ViTokenizer.ViTokenizer.sent2features(words, False)]
+        )
+        # merge syllables labeled I_W ("inside word") into the preceding token
+        token = words[0]
+        tokens = []
+        for i in range(1, len(labels[0])):
+            if (
+                labels[0][i] == "I_W"
+                and words[i] not in string.punctuation
+                and words[i - 1] not in string.punctuation
+                and not words[i][0].isdigit()
+                and not words[i - 1][0].isdigit()
+                and not (words[i][0].istitle() and not words[i - 1][0].istitle())
+            ):
+                token = token + preceding_ws[i] + words[i]
+            else:
+                tokens.append(token)
+                token = words[i]
+        tokens.append(token)
+        return tokens
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {"use_pyvi": self.use_pyvi}
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.use_pyvi = config.get("use_pyvi", False)
+
+    def to_bytes(self, **kwargs) -> bytes:
+        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
+        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
+        path = util.ensure_path(path)
+        deserializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
+        util.from_disk(path, deserializers, [])
+        return self
 
 
 class VietnameseDefaults(Language.Defaults):
     config = load_config_from_str(DEFAULT_CONFIG)
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 04e254c50..404783197 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -286,6 +286,12 @@ def ur_tokenizer():
     return get_lang_class("ur")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def vi_tokenizer():
+    pytest.importorskip("pyvi")
+    return get_lang_class("vi")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def yo_tokenizer():
     return get_lang_class("yo")().tokenizer
diff --git a/spacy/tests/lang/vi/__init__.py b/spacy/tests/lang/vi/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/vi/test_serialize.py b/spacy/tests/lang/vi/test_serialize.py
new file mode 100644
index 000000000..3ee5333fb
--- /dev/null
+++ b/spacy/tests/lang/vi/test_serialize.py
@@ -0,0 +1,33 @@
+from spacy.lang.vi import Vietnamese
+from ...util import make_tempdir
+
+
+def test_vi_tokenizer_serialize(vi_tokenizer):
+    tokenizer_bytes = vi_tokenizer.to_bytes()
+    nlp = Vietnamese()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+    assert nlp.tokenizer.use_pyvi is True
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        vi_tokenizer.to_disk(file_path)
+        nlp = Vietnamese()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+        assert nlp.tokenizer.use_pyvi is True
+
+    # the use_pyvi setting is (de)serialized correctly
+    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
+    nlp_bytes = nlp.to_bytes()
+    nlp_r = Vietnamese()
+    nlp_r.from_bytes(nlp_bytes)
+    assert nlp_bytes == nlp_r.to_bytes()
+    assert nlp_r.tokenizer.use_pyvi is False
+
+    with make_tempdir() as d:
+        nlp.to_disk(d)
+        nlp_r = Vietnamese()
+        nlp_r.from_disk(d)
+        assert nlp_bytes == nlp_r.to_bytes()
+        assert nlp_r.tokenizer.use_pyvi is False
diff --git a/spacy/tests/lang/vi/test_tokenizer.py b/spacy/tests/lang/vi/test_tokenizer.py
new file mode 100644
index 000000000..3d0642d1e
--- /dev/null
+++ b/spacy/tests/lang/vi/test_tokenizer.py
@@ -0,0 +1,47 @@
+import pytest
+
+from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
+from spacy.lang.vi import Vietnamese
+
+
+# fmt: off
+TOKENIZER_TESTS = [
+    ("Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này", ['Đây', 'là', 'một', 'văn bản', 'bằng', 'tiếng', 'Việt', 'Sau', 'đó', ',', 'đây', 'là', 'một', 'văn bản', 'khác', 'bằng', 'ngôn ngữ', 'này']),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_vi_tokenizer(vi_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in vi_tokenizer(text)]
+    assert tokens == expected_tokens
+
+
+def test_vi_tokenizer_extra_spaces(vi_tokenizer):
+    # note: three spaces after "I"
+    tokens = vi_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == "  "
+
+
+@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
+def test_vi_tokenizer_naughty_strings(vi_tokenizer, text):
+    tokens = vi_tokenizer(text)
+    assert tokens.text_with_ws == text
+
+
+def test_vi_tokenizer_emptyish_texts(vi_tokenizer):
+    doc = vi_tokenizer("")
+    assert len(doc) == 0
+    doc = vi_tokenizer(" ")
+    assert len(doc) == 1
+    doc = vi_tokenizer("\n\n\n \t\t \n\n\n")
+    assert len(doc) == 1
+
+
+def test_vi_tokenizer_no_pyvi():
+    """Test for whitespace tokenization without pyvi"""
+    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
+    text = "Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
+    doc = nlp(text)
+    assert [t.text for t in doc if not t.is_space] == text.split()
+    assert doc[4].text == " "
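
Note on the whitespace round-trip exercised by the last test: both tokenizer
modes re-align their word lists with the original text via
`util.get_words_and_spaces`, which lets each word absorb one trailing space
and turns any leftover run of whitespace into its own token. A minimal sketch
of that behavior (illustrative only, not part of the patch):

    from spacy.util import get_words_and_spaces

    text = "Đây là một văn  bản"  # two spaces after "văn"
    words, spaces = get_words_and_spaces(text.split(), text)
    # the second space survives as a standalone whitespace token at
    # index 4, which is what `assert doc[4].text == " "` checks above
    assert words == ["Đây", "là", "một", "văn", " ", "bản"]
    assert spaces == [True, True, True, True, False, False]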
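And a quick usage sketch of the new `use_pyvi` setting and serialization
support, mirroring the tests above (assumes the `pyvi` package is installed
for the default mode):

    from spacy.lang.vi import Vietnamese

    # default mode: word segmentation with pyvi
    nlp = Vietnamese()
    doc = nlp("Đây là một văn bản bằng tiếng Việt")
    print([t.text for t in doc])  # multi-syllable tokens such as "văn bản"

    # whitespace-only mode, no pyvi required
    nlp_ws = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})

    # the setting round-trips through the new to_bytes/from_bytes methods
    nlp_r = Vietnamese()
    nlp_r.from_bytes(nlp_ws.to_bytes())
    assert nlp_r.tokenizer.use_pyvi is False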