spaCy/spacy/lang/vi/__init__.py

169 lines
5.4 KiB
Python

import re
import string
from pathlib import Path
from typing import Any, Dict, Union
import srsly
from ... import util
from ...language import BaseDefaults, Language
from ...tokens import Doc
from ...util import DummyTokenizer, load_config_from_str, registry
from ...vocab import Vocab
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
DEFAULT_CONFIG = """
[nlp]
[nlp.tokenizer]
@tokenizers = "spacy.vi.VietnameseTokenizer"
use_pyvi = true
"""
@registry.tokenizers("spacy.vi.VietnameseTokenizer")
def create_vietnamese_tokenizer(use_pyvi: bool = True):
def vietnamese_tokenizer_factory(nlp):
return VietnameseTokenizer(nlp.vocab, use_pyvi=use_pyvi)
return vietnamese_tokenizer_factory
class VietnameseTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab, use_pyvi: bool = False):
self.vocab = vocab
self.use_pyvi = use_pyvi
if self.use_pyvi:
try:
from pyvi import ViTokenizer
self.ViTokenizer = ViTokenizer
except ImportError:
msg = (
"Pyvi not installed. Either set use_pyvi = False, "
"or install it https://pypi.python.org/pypi/pyvi"
)
raise ImportError(msg) from None
def __reduce__(self):
return VietnameseTokenizer, (self.vocab, self.use_pyvi)
def __call__(self, text: str) -> Doc:
if self.use_pyvi:
words = self.pyvi_tokenize(text)
words, spaces = util.get_words_and_spaces(words, text)
return Doc(self.vocab, words=words, spaces=spaces)
else:
words, spaces = util.get_words_and_spaces(text.split(), text)
return Doc(self.vocab, words=words, spaces=spaces)
# The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
# pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
# See licenses/3rd_party_licenses.txt
def pyvi_sylabelize_with_ws(self, text):
"""Modified from pyvi to preserve whitespace and skip unicode
normalization."""
specials = [r"==>", r"->", r"\.\.\.", r">>"]
digit = r"\d+([\.,_]\d+)+"
email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
web = r"\w+://[^\s]+"
word = r"\w+"
non_word = r"[^\w\s]"
abbreviations = [
r"[A-ZĐ]+\.",
r"Tp\.",
r"Mr\.",
r"Mrs\.",
r"Ms\.",
r"Dr\.",
r"ThS\.",
]
patterns = []
patterns.extend(abbreviations)
patterns.extend(specials)
patterns.extend([web, email])
patterns.extend([digit, non_word, word])
patterns = r"(\s+|" + "|".join(patterns) + ")"
tokens = re.findall(patterns, text, re.UNICODE)
return [token[0] for token in tokens]
def pyvi_tokenize(self, text):
"""Modified from pyvi to preserve text and whitespace."""
if len(text) == 0:
return []
elif text.isspace():
return [text]
segs = self.pyvi_sylabelize_with_ws(text)
words = []
preceding_ws = []
for i, token in enumerate(segs):
if not token.isspace():
words.append(token)
preceding_ws.append(
"" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
)
labels = self.ViTokenizer.ViTokenizer.model.predict(
[self.ViTokenizer.ViTokenizer.sent2features(words, False)]
)
token = words[0]
tokens = []
for i in range(1, len(labels[0])):
if (
labels[0][i] == "I_W"
and words[i] not in string.punctuation
and words[i - 1] not in string.punctuation
and not words[i][0].isdigit()
and not words[i - 1][0].isdigit()
and not (words[i][0].istitle() and not words[i - 1][0].istitle())
):
token = token + preceding_ws[i] + words[i]
else:
tokens.append(token)
token = words[i]
tokens.append(token)
return tokens
def _get_config(self) -> Dict[str, Any]:
return {"use_pyvi": self.use_pyvi}
def _set_config(self, config: Dict[str, Any] = {}) -> None:
self.use_pyvi = config.get("use_pyvi", False)
def to_bytes(self, **kwargs) -> bytes:
serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
return util.to_bytes(serializers, [])
def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
util.from_bytes(data, deserializers, [])
return self
def to_disk(self, path: Union[str, Path], **kwargs) -> None:
path = util.ensure_path(path)
serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
util.to_disk(path, serializers, [])
def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
path = util.ensure_path(path)
serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
util.from_disk(path, serializers, [])
return self
class VietnameseDefaults(BaseDefaults):
config = load_config_from_str(DEFAULT_CONFIG)
lex_attr_getters = LEX_ATTRS
stop_words = STOP_WORDS
class Vietnamese(Language):
lang = "vi"
Defaults = VietnameseDefaults
__all__ = ["Vietnamese"]