mirror of https://github.com/explosion/spaCy.git
Update Vietnamese tokenizer (#8099)
* Adapt tokenization methods from `pyvi` to preserve text encoding and whitespace
* Add serialization support similar to Chinese and Japanese

Note: as for Chinese and Japanese, some settings are duplicated in `config.cfg` and `tokenizer/cfg`.
This commit is contained in:
parent 946a4284be
commit 1d59fdbd39
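Aside (not part of the diff, a minimal usage sketch in Python): the `use_pyvi` setting that this commit serializes switches the tokenizer between pyvi-based word segmentation and plain whitespace splitting. The `Vietnamese.from_config` call is copied from the new tests; that the default pipeline needs the `pyvi` package is an assumption based on the `pytest.importorskip("pyvi")` fixture added below.

from spacy.lang.vi import Vietnamese

# Default mode: pyvi-based segmentation (assumes the `pyvi` package is installed).
nlp = Vietnamese()
assert nlp.tokenizer.use_pyvi is True

# Whitespace-only tokenization, no pyvi dependency (call taken from the new tests).
nlp_ws = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
doc = nlp_ws("Đây là một văn bản bằng tiếng Việt")
print([t.text for t in doc])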
@@ -43,8 +43,8 @@ scikit-learn
 * Files: scorer.py
 
-The following implementation of roc_auc_score() is adapted from
-scikit-learn, which is distributed under the following license:
+The implementation of roc_auc_score() is adapted from scikit-learn, which is
+distributed under the following license:
 
 New BSD License
@@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
+
+
+pyvi
+----
+
+* Files: lang/vi/__init__.py
+
+The MIT License (MIT)
+Copyright (c) 2016 Viet-Trung Tran
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -1,8 +1,15 @@
from typing import Any, Dict, Union
from pathlib import Path
import re
import srsly
import string

from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from ...language import Language
from ...tokens import Doc
from ...util import DummyTokenizer, registry, load_config_from_str
from ... import util


DEFAULT_CONFIG = """
@@ -40,17 +47,108 @@ class VietnameseTokenizer(DummyTokenizer):
 
     def __call__(self, text: str) -> Doc:
         if self.use_pyvi:
-            words, spaces = self.ViTokenizer.spacy_tokenize(text)
+            words = self.pyvi_tokenize(text)
+            words, spaces = util.get_words_and_spaces(words, text)
             return Doc(self.vocab, words=words, spaces=spaces)
         else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
+            words, spaces = util.get_words_and_spaces(text.split(), text)
             return Doc(self.vocab, words=words, spaces=spaces)
 
+    # The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
+    # pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
+    # See licenses/3rd_party_licenses.txt
+    def pyvi_sylabelize_with_ws(self, text):
+        """Modified from pyvi to preserve whitespace and skip unicode
+        normalization."""
+        specials = [r"==>", r"->", r"\.\.\.", r">>"]
+        digit = r"\d+([\.,_]\d+)+"
+        email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
+        web = r"\w+://[^\s]+"
+        word = r"\w+"
+        non_word = r"[^\w\s]"
+        abbreviations = [
+            r"[A-ZĐ]+\.",
+            r"Tp\.",
+            r"Mr\.",
+            r"Mrs\.",
+            r"Ms\.",
+            r"Dr\.",
+            r"ThS\.",
+        ]
+
+        patterns = []
+        patterns.extend(abbreviations)
+        patterns.extend(specials)
+        patterns.extend([web, email])
+        patterns.extend([digit, non_word, word])
+
+        patterns = r"(\s+|" + "|".join(patterns) + ")"
+        tokens = re.findall(patterns, text, re.UNICODE)
+
+        return [token[0] for token in tokens]
+
+    def pyvi_tokenize(self, text):
+        """Modified from pyvi to preserve text and whitespace."""
+        if len(text) == 0:
+            return []
+        elif text.isspace():
+            return [text]
+        segs = self.pyvi_sylabelize_with_ws(text)
+        words = []
+        preceding_ws = []
+        for i, token in enumerate(segs):
+            if not token.isspace():
+                words.append(token)
+                preceding_ws.append(
+                    "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
+                )
+        labels = self.ViTokenizer.ViTokenizer.model.predict(
+            [self.ViTokenizer.ViTokenizer.sent2features(words, False)]
+        )
+        token = words[0]
+        tokens = []
+        for i in range(1, len(labels[0])):
+            if (
+                labels[0][i] == "I_W"
+                and words[i] not in string.punctuation
+                and words[i - 1] not in string.punctuation
+                and not words[i][0].isdigit()
+                and not words[i - 1][0].isdigit()
+                and not (words[i][0].istitle() and not words[i - 1][0].istitle())
+            ):
+                token = token + preceding_ws[i] + words[i]
+            else:
+                tokens.append(token)
+                token = words[i]
+        tokens.append(token)
+        return tokens
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {"use_pyvi": self.use_pyvi}
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.use_pyvi = config.get("use_pyvi", False)
+
+    def to_bytes(self, **kwargs) -> bytes:
+        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
+        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
+        util.from_disk(path, serializers, [])
+        return self
+
 
 class VietnameseDefaults(Language.Defaults):
     config = load_config_from_str(DEFAULT_CONFIG)
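Aside (a minimal sketch, not part of the diff): both branches of `__call__` now go through `util.get_words_and_spaces`, whose contract is that the returned words/spaces pair reconstructs the input text exactly, whitespace and encoding included. That is what lets the pyvi-derived tokens be placed back into a `Doc` without altering the original text:

from spacy.tokens import Doc
from spacy.util import get_words_and_spaces
from spacy.vocab import Vocab

text = "Đây là  một văn bản"  # note the double space
words, spaces = get_words_and_spaces(text.split(), text)
doc = Doc(Vocab(), words=words, spaces=spaces)  # same call pattern as __call__
assert doc.text == text  # the original text, including whitespace, round-trips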
@@ -286,6 +286,12 @@ def ur_tokenizer():
     return get_lang_class("ur")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def vi_tokenizer():
+    pytest.importorskip("pyvi")
+    return get_lang_class("vi")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def yo_tokenizer():
     return get_lang_class("yo")().tokenizer
@@ -0,0 +1,33 @@
+from spacy.lang.vi import Vietnamese
+from ...util import make_tempdir
+
+
+def test_vi_tokenizer_serialize(vi_tokenizer):
+    tokenizer_bytes = vi_tokenizer.to_bytes()
+    nlp = Vietnamese()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+    assert nlp.tokenizer.use_pyvi is True
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        vi_tokenizer.to_disk(file_path)
+        nlp = Vietnamese()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+        assert nlp.tokenizer.use_pyvi is True
+
+    # mode is (de)serialized correctly
+    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
+    nlp_bytes = nlp.to_bytes()
+    nlp_r = Vietnamese()
+    nlp_r.from_bytes(nlp_bytes)
+    assert nlp_bytes == nlp_r.to_bytes()
+    assert nlp_r.tokenizer.use_pyvi == False
+
+    with make_tempdir() as d:
+        nlp.to_disk(d)
+        nlp_r = Vietnamese()
+        nlp_r.from_disk(d)
+        assert nlp_bytes == nlp_r.to_bytes()
+        assert nlp_r.tokenizer.use_pyvi == False
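Aside on the duplicated settings mentioned in the commit message (a sketch; the on-disk layout is an assumption inferred from `Language.to_disk` plus the tokenizer's `to_disk` above, not shown in the diff): after saving, `use_pyvi` appears both in the pipeline's `config.cfg` and in the tokenizer's own `tokenizer/cfg`.

from spacy.lang.vi import Vietnamese

nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
nlp.to_disk("vi_model")
# Expected layout (assumption):
#   vi_model/config.cfg     -> [nlp.tokenizer] block with use_pyvi = false
#   vi_model/tokenizer/cfg  -> {"use_pyvi": false}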
@@ -0,0 +1,47 @@
+import pytest
+
+from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
+from spacy.lang.vi import Vietnamese
+
+
+# fmt: off
+TOKENIZER_TESTS = [
+    ("Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này", ['Đây', 'là', 'một', 'văn bản', 'bằng', 'tiếng', 'Việt', 'Sau', 'đó', ',', 'đây', 'là', 'một', 'văn bản', 'khác', 'bằng', 'ngôn ngữ', 'này']),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_vi_tokenizer(vi_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in vi_tokenizer(text)]
+    assert tokens == expected_tokens
+
+
+def test_vi_tokenizer_extra_spaces(vi_tokenizer):
+    # note: three spaces after "I"
+    tokens = vi_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == "  "
+
+
+@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
+def test_vi_tokenizer_naughty_strings(vi_tokenizer, text):
+    tokens = vi_tokenizer(text)
+    assert tokens.text_with_ws == text
+
+
+def test_vi_tokenizer_emptyish_texts(vi_tokenizer):
+    doc = vi_tokenizer("")
+    assert len(doc) == 0
+    doc = vi_tokenizer(" ")
+    assert len(doc) == 1
+    doc = vi_tokenizer("\n\n\n \t\t \n\n\n")
+    assert len(doc) == 1
+
+
+def test_vi_tokenizer_no_pyvi():
+    """Test for whitespace tokenization without pyvi"""
+    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
+    text = "Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
+    doc = nlp(text)
+    assert [t.text for t in doc if not t.is_space] == text.split()
+    assert doc[4].text == " "