mirror of https://github.com/explosion/spaCy.git
Rework Chinese language initialization and tokenization (#4619)
* Rework Chinese language initialization * Create a `ChineseTokenizer` class * Modify jieba post-processing to handle whitespace correctly * Modify non-jieba character tokenization to handle whitespace correctly * Add a `create_tokenizer()` method to `ChineseDefaults` * Load lexical attributes * Update Chinese tag_map for UD v2 * Add very basic Chinese tests * Test tokenization with and without jieba * Test `like_num` attribute * Fix try_jieba_import() * Fix zh code formatting
This commit is contained in:
parent
4d85f67eee
commit
0b9a5f4074
|
@ -4,19 +4,92 @@ from __future__ import unicode_literals
|
|||
from ...attrs import LANG
|
||||
from ...language import Language
|
||||
from ...tokens import Doc
|
||||
from ...util import DummyTokenizer
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tag_map import TAG_MAP
|
||||
|
||||
|
||||
def try_jieba_import(use_jieba):
    """Return the `jieba` module if it can be imported.

    use_jieba (bool): Whether jieba segmentation has been requested.
    RETURNS: The imported `jieba` module, or None if jieba is not installed
        and `use_jieba` is falsy.
    RAISES (ImportError): If jieba is not installed and `use_jieba` is truthy.
    """
    try:
        import jieba
    except ImportError:
        # A missing package is only an error if the caller wants to use it.
        if not use_jieba:
            return None
        raise ImportError(
            "Jieba not installed. Either set Chinese.use_jieba = False, "
            "or install it https://github.com/fxsjy/jieba"
        )
    return jieba
|
||||
|
||||
|
||||
class ChineseTokenizer(DummyTokenizer):
    """Tokenizer for Chinese text.

    With `use_jieba` set, the text is segmented into words by jieba.
    Otherwise the output of the base tokenizer is split into individual
    characters. In both modes whitespace is represented either as the
    trailing space of the preceding token or as a separate whitespace token.
    """

    def __init__(self, cls, nlp=None):
        """Set up the tokenizer.

        cls: The defaults class; `use_jieba` is read from it and
            `create_vocab` is called on it when no `nlp` is given.
        nlp: Optional object whose `vocab` is reused.
        RAISES (ImportError): If `cls.use_jieba` is truthy but jieba is not
            installed.
        """
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.use_jieba = cls.use_jieba
        self.jieba_seg = try_jieba_import(self.use_jieba)
        # Base tokenizer whose tokens are split into characters when jieba
        # is not used.
        self.tokenizer = Language.Defaults().create_tokenizer(nlp)

    def __call__(self, text):
        """Tokenize a text.

        text (unicode): The text to tokenize.
        RETURNS (Doc): A `Doc` built from the resulting words and spaces.
        RAISES (ImportError): If `use_jieba` is set but jieba is not
            installed.
        """
        if self.use_jieba:
            if self.jieba_seg is None:
                # use_jieba was switched on after construction while jieba
                # was unavailable: retry so the caller gets the descriptive
                # ImportError instead of an AttributeError on None.
                self.jieba_seg = try_jieba_import(True)
            segments = [x for x in self.jieba_seg.cut(text, cut_all=False) if x]
            words, spaces = self._align_whitespace(segments)
            return Doc(self.vocab, words=words, spaces=spaces)

        # split into individual characters
        words = []
        spaces = []
        for token in self.tokenizer(text):
            if token.text.isspace():
                # whitespace tokens are kept whole
                words.append(token.text)
                spaces.append(False)
            else:
                words.extend(list(token.text))
                spaces.extend([False] * len(token.text))
            # only the last piece of a token carries its trailing space
            spaces[-1] = bool(token.whitespace_)
        return Doc(self.vocab, words=words, spaces=spaces)

    @staticmethod
    def _align_whitespace(segments):
        """Convert jieba segments into parallel `words` / `spaces` lists.

        segments (list): Non-empty strings produced by jieba, in order.
        RETURNS (tuple): `(words, spaces)`; both lists are empty when there
            are no segments.
        """
        words = []
        spaces = []
        for segment in segments:
            if not words or not segment.isspace():
                # the first segment and every non-whitespace segment start a
                # new token
                words.append(segment)
                spaces.append(False)
            elif spaces[-1]:
                # second token in adjacent whitespace following a non-space
                # token
                words.append(segment)
                spaces.append(False)
            elif segment == " " and not words[-1].isspace():
                # first space following a non-space token becomes that
                # token's trailing space
                spaces[-1] = True
            elif words[-1].isspace():
                # extend previous whitespace token with more whitespace
                words[-1] += segment
            else:
                # non-space whitespace after a non-space token is a new
                # whitespace token
                words.append(segment)
                spaces.append(False)
        return words, spaces
|
||||
|
||||
|
||||
class ChineseDefaults(Language.Defaults):
    """Language defaults for Chinese."""

    # Start from a copy of the shared getters so the base class mapping is
    # not modified, then add the Chinese-specific lexical attributes.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: "zh"
    tokenizer_exceptions = BASE_EXCEPTIONS
    stop_words = STOP_WORDS
    tag_map = TAG_MAP
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
    # Read by ChineseTokenizer to choose between jieba word segmentation and
    # per-character splitting.
    use_jieba = True

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Create a `ChineseTokenizer` configured from these defaults.

        nlp: Optional object passed through to `ChineseTokenizer`.
        RETURNS (ChineseTokenizer): The new tokenizer.
        """
        return ChineseTokenizer(cls, nlp)
|
||||
|
||||
|
||||
class Chinese(Language):
|
||||
|
@ -24,26 +97,7 @@ class Chinese(Language):
|
|||
Defaults = ChineseDefaults # override defaults
|
||||
|
||||
def make_doc(self, text):
    """Turn a text into a `Doc` by delegating to `self.tokenizer`.

    The jieba / per-character segmentation that used to be inlined here
    lives in `ChineseTokenizer`; keeping a second copy in this method
    bypassed that class's whitespace handling and left the delegation
    unreachable.
    NOTE(review): assumes `self.tokenizer` is the tokenizer returned by
    `ChineseDefaults.create_tokenizer` -- confirm against `Language`.

    text (unicode): The text to process.
    RETURNS (Doc): The tokenized document.
    """
    return self.tokenizer(text)
|
||||
|
||||
|
||||
__all__ = ["Chinese"]
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
|
||||
from ...symbols import NOUN, PART, INTJ, PRON
|
||||
from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
|
||||
from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE
|
||||
|
||||
# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set.
|
||||
# We also map the tags to the simpler Google Universal POS tag set.
|
||||
# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn
|
||||
# Treebank tag set. We also map the tags to the simpler Universal Dependencies
|
||||
# v2 tag set.
|
||||
|
||||
TAG_MAP = {
|
||||
"AS": {POS: PART},
|
||||
|
@ -38,10 +39,11 @@ TAG_MAP = {
|
|||
"OD": {POS: NUM},
|
||||
"DT": {POS: DET},
|
||||
"CC": {POS: CCONJ},
|
||||
"CS": {POS: CONJ},
|
||||
"CS": {POS: SCONJ},
|
||||
"AD": {POS: ADV},
|
||||
"JJ": {POS: ADJ},
|
||||
"P": {POS: ADP},
|
||||
"PN": {POS: PRON},
|
||||
"PU": {POS: PUNCT},
|
||||
"_SP": {POS: SPACE},
|
||||
}
|
||||
|
|
|
@ -218,3 +218,9 @@ def uk_tokenizer():
|
|||
@pytest.fixture(scope="session")
def ur_tokenizer():
    """Session-scoped tokenizer created from the "ur" language defaults."""
    lang_cls = get_lang_class("ur")
    return lang_cls.Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def zh_tokenizer():
    """Session-scoped tokenizer created from the "zh" language defaults.

    Tests using this fixture are skipped when jieba cannot be imported.
    """
    pytest.importorskip("jieba")
    lang_cls = get_lang_class("zh")
    return lang_cls.Defaults.create_tokenizer()
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "text,match",
    [
        ("10", True),
        ("1", True),
        ("999.0", True),
        ("一", True),
        ("二", True),
        ("〇", True),
        ("十一", True),
        ("狗", False),
        (",", False),
    ],
)
def test_lex_attrs_like_number(zh_tokenizer, text, match):
    """Each sample must stay a single token whose `like_num` equals `match`."""
    doc = zh_tokenizer(text)
    assert len(doc) == 1
    only_token = doc[0]
    assert only_token.like_num == match
|
|
@ -0,0 +1,31 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# fmt: off
|
||||
# Pairs of (input text, tokens expected when the tokenizer runs with jieba).
TOKENIZER_TESTS = [
    (
        "作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",
        [
            "作为", "语言", "而言", ",", "为", "世界", "使用",
            "人", "数最多", "的", "语言", ",", "目前", "世界",
            "有", "五分之一", "人口", "做", "为", "母语", "。",
        ],
    ),
]
|
||||
# fmt: on
|
||||
|
||||
|
||||
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_zh_tokenizer(zh_tokenizer, text, expected_tokens):
    """Check tokenization both without and with jieba segmentation."""
    # The fixture is session-scoped, so the flag must be restored even when
    # an assertion fails; otherwise later tests would see the changed mode.
    previous_use_jieba = zh_tokenizer.use_jieba
    try:
        # without jieba every character becomes its own token
        zh_tokenizer.use_jieba = False
        tokens = [token.text for token in zh_tokenizer(text)]
        assert tokens == list(text)

        # with jieba the text is segmented into words
        zh_tokenizer.use_jieba = True
        tokens = [token.text for token in zh_tokenizer(text)]
        assert tokens == expected_tokens
    finally:
        zh_tokenizer.use_jieba = previous_use_jieba
|
||||
|
||||
|
||||
def test_extra_spaces(zh_tokenizer):
    """Extra spaces after a word must end up in a whitespace token."""
    # note: three spaces after "I"
    tokens = zh_tokenizer("I   like cheese.")
    # The first space becomes the trailing space of "I"; the other two are
    # merged into one whitespace token (assumes jieba yields each space as a
    # separate segment -- confirm against the installed jieba version).
    assert tokens[1].orth_ == "  "
|
Loading…
Reference in New Issue