Update Vietnamese tokenizer (#8099)

* Adapt tokenization methods from `pyvi` to preserve text encoding and
whitespace
* Add serialization support similar to Chinese and Japanese

Note: as with Chinese and Japanese, some settings are duplicated in
`config.cfg` and `tokenizer/cfg`.
Adriane Boyd 2021-05-17 10:16:20 +02:00 committed by GitHub
parent 946a4284be
commit 1d59fdbd39
6 changed files with 220 additions and 9 deletions
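
For reference, a minimal usage sketch of the two tokenizer modes (the
`from_config` call is the same one the new tests use; the default mode
assumes the `pyvi` package is installed):

    from spacy.lang.vi import Vietnamese

    # Default config enables pyvi-based word segmentation (use_pyvi = true)
    nlp = Vietnamese()
    doc = nlp("Đây là một văn bản bằng tiếng Việt")

    # Opt out of pyvi to get plain whitespace tokenization
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})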

licenses/3rd_party_licenses.txt

@@ -43,8 +43,8 @@ scikit-learn
 * Files: scorer.py

-The following implementation of roc_auc_score() is adapted from
-scikit-learn, which is distributed under the following license:
+The implementation of roc_auc_score() is adapted from scikit-learn, which is
+distributed under the following license:

 New BSD License
@@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
pyvi
----

* Files: lang/vi/__init__.py

The MIT License (MIT)
Copyright (c) 2016 Viet-Trung Tran

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

spacy/lang/vi/__init__.py

@@ -1,8 +1,15 @@
from typing import Any, Dict, Union
from pathlib import Path
import re
import srsly
import string

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str
from ... import util


DEFAULT_CONFIG = """

@@ -40,17 +47,108 @@ class VietnameseTokenizer(DummyTokenizer):
     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
-            words, spaces = self.ViTokenizer.spacy_tokenize(text)
+            words = self.pyvi_tokenize(text)
+            words, spaces = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
         else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
+            words, spaces = util.get_words_and_spaces(text.split(), text)
             return Doc(self.vocab, words=words, spaces=spaces)
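
Both branches go through `spacy.util.get_words_and_spaces`, which aligns a
list of words against the original text and inserts whitespace-only tokens
for any whitespace the words do not account for, so `doc.text_with_ws`
reproduces the input exactly (it raises `ValueError` if the words cannot be
aligned). A hand-worked illustration with made-up strings:

    from spacy.util import get_words_and_spaces

    # Two spaces between "là" and "một": the leftover space becomes its own token
    words, spaces = get_words_and_spaces(["Đây", "là", "một"], "Đây là  một")
    assert words == ["Đây", "là", " ", "một"]
    assert spaces == [True, True, False, False]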

    # The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
    # pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
    # See licenses/3rd_party_licenses.txt
    def pyvi_sylabelize_with_ws(self, text):
        """Modified from pyvi to preserve whitespace and skip unicode
        normalization."""
        specials = [r"==>", r"->", r"\.\.\.", r">>"]
        digit = r"\d+([\.,_]\d+)+"
        email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
        web = r"\w+://[^\s]+"
        word = r"\w+"
        non_word = r"[^\w\s]"
        abbreviations = [
            r"[A-ZĐ]+\.",
            r"Tp\.",
            r"Mr\.",
            r"Mrs\.",
            r"Ms\.",
            r"Dr\.",
            r"ThS\.",
        ]

        patterns = []
        patterns.extend(abbreviations)
        patterns.extend(specials)
        patterns.extend([web, email])
        patterns.extend([digit, non_word, word])

        patterns = r"(\s+|" + "|".join(patterns) + ")"
        tokens = re.findall(patterns, text, re.UNICODE)
        return [token[0] for token in tokens]
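
To illustrate the syllable pass (the example string and expected output are
my own, worked out from the patterns above): whitespace runs survive as their
own segments, so nothing is lost before the CRF labeling step, and
abbreviations and e-mail addresses are kept as single segments.

    from spacy.lang.vi import Vietnamese

    nlp = Vietnamese()  # assumes pyvi is installed
    segs = nlp.tokenizer.pyvi_sylabelize_with_ws("Mr. Nam  gửi thư tới test@example.com")
    # ['Mr.', ' ', 'Nam', '  ', 'gửi', ' ', 'thư', ' ', 'tới', ' ', 'test@example.com']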

    def pyvi_tokenize(self, text):
        """Modified from pyvi to preserve text and whitespace."""
        if len(text) == 0:
            return []
        elif text.isspace():
            return [text]
        segs = self.pyvi_sylabelize_with_ws(text)
        words = []
        preceding_ws = []
        for i, token in enumerate(segs):
            if not token.isspace():
                words.append(token)
                preceding_ws.append(
                    "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
                )
        labels = self.ViTokenizer.ViTokenizer.model.predict(
            [self.ViTokenizer.ViTokenizer.sent2features(words, False)]
        )
        token = words[0]
        tokens = []
        for i in range(1, len(labels[0])):
            if (
                labels[0][i] == "I_W"
                and words[i] not in string.punctuation
                and words[i - 1] not in string.punctuation
                and not words[i][0].isdigit()
                and not words[i - 1][0].isdigit()
                and not (words[i][0].istitle() and not words[i - 1][0].istitle())
            ):
                token = token + preceding_ws[i] + words[i]
            else:
                tokens.append(token)
                token = words[i]
        tokens.append(token)
        return tokens
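
The CRF tags each syllable as beginning (`B_W`) or continuing (`I_W`) a word;
an `I_W` syllable is joined to the previous token with the whitespace that
originally separated them, with the punctuation/digit/titlecase guards above
blocking implausible merges. The expected output here corresponds to the
first tokens of the tokenizer test added in this commit:

    from spacy.lang.vi import Vietnamese

    nlp = Vietnamese()  # assumes pyvi is installed
    words = nlp.tokenizer.pyvi_tokenize("Đây là một văn bản bằng tiếng Việt")
    # ['Đây', 'là', 'một', 'văn bản', 'bằng', 'tiếng', 'Việt']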

    def _get_config(self) -> Dict[str, Any]:
        return {"use_pyvi": self.use_pyvi}

    def _set_config(self, config: Dict[str, Any] = {}) -> None:
        self.use_pyvi = config.get("use_pyvi", False)

    def to_bytes(self, **kwargs) -> bytes:
        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
        return util.to_bytes(serializers, [])

    def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
        util.from_bytes(data, deserializers, [])
        return self

    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
        path = util.ensure_path(path)
        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
        return util.to_disk(path, serializers, [])

    def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
        path = util.ensure_path(path)
        deserializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
        util.from_disk(path, deserializers, [])
        return self


class VietnameseDefaults(Language.Defaults):
    config = load_config_from_str(DEFAULT_CONFIG)
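
A round-trip sketch of the new serialization hooks, mirroring the
serialization test below (only the `{"use_pyvi": ...}` cfg is serialized; as
noted in the commit message, the same setting also lives in the top-level
`config.cfg`):

    from spacy.lang.vi import Vietnamese

    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    tokenizer_bytes = nlp.tokenizer.to_bytes()

    nlp2 = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp2.tokenizer.from_bytes(tokenizer_bytes)
    assert nlp2.tokenizer.use_pyvi is False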

spacy/tests/conftest.py

@@ -286,6 +286,12 @@ def ur_tokenizer():
     return get_lang_class("ur")().tokenizer


+@pytest.fixture(scope="session")
+def vi_tokenizer():
+    pytest.importorskip("pyvi")
+    return get_lang_class("vi")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def yo_tokenizer():
     return get_lang_class("yo")().tokenizer

spacy/tests/lang/vi/__init__.py (new, empty)

spacy/tests/lang/vi/test_serialize.py

@@ -0,0 +1,33 @@
from spacy.lang.vi import Vietnamese
from ...util import make_tempdir


def test_vi_tokenizer_serialize(vi_tokenizer):
    tokenizer_bytes = vi_tokenizer.to_bytes()
    nlp = Vietnamese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
    assert nlp.tokenizer.use_pyvi is True

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        vi_tokenizer.to_disk(file_path)
        nlp = Vietnamese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
        assert nlp.tokenizer.use_pyvi is True

    # the use_pyvi setting is (de)serialized correctly
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    nlp_bytes = nlp.to_bytes()
    nlp_r = Vietnamese()
    nlp_r.from_bytes(nlp_bytes)
    assert nlp_bytes == nlp_r.to_bytes()
    assert nlp_r.tokenizer.use_pyvi is False

    with make_tempdir() as d:
        nlp.to_disk(d)
        nlp_r = Vietnamese()
        nlp_r.from_disk(d)
        assert nlp_bytes == nlp_r.to_bytes()
        assert nlp_r.tokenizer.use_pyvi is False

spacy/tests/lang/vi/test_tokenizer.py

@@ -0,0 +1,47 @@
import pytest

from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
from spacy.lang.vi import Vietnamese

# fmt: off
TOKENIZER_TESTS = [
    ("Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này", ['Đây', 'là', 'một', 'văn bản', 'bằng', 'tiếng', 'Việt', 'Sau', 'đó', ',', 'đây', 'là', 'một', 'văn bản', 'khác', 'bằng', 'ngôn ngữ', 'này']),
]
# fmt: on
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_vi_tokenizer(vi_tokenizer, text, expected_tokens):
tokens = [token.text for token in vi_tokenizer(text)]
assert tokens == expected_tokens


def test_vi_tokenizer_extra_spaces(vi_tokenizer):
    # note: three spaces after "I"
    tokens = vi_tokenizer("I   like cheese.")
    assert tokens[1].orth_ == "  "
@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
def test_vi_tokenizer_naughty_strings(vi_tokenizer, text):
tokens = vi_tokenizer(text)
assert tokens.text_with_ws == text
def test_vi_tokenizer_emptyish_texts(vi_tokenizer):
doc = vi_tokenizer("")
assert len(doc) == 0
doc = vi_tokenizer(" ")
assert len(doc) == 1
doc = vi_tokenizer("\n\n\n \t\t \n\n\n")
assert len(doc) == 1


def test_vi_tokenizer_no_pyvi():
    """Test for whitespace tokenization without pyvi"""
    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
    # note: two spaces between "văn" and "bản"
    text = "Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
    doc = nlp(text)
    assert [t.text for t in doc if not t.is_space] == text.split()
    assert doc[4].text == " "