From 0b9a5f4074f2b6f5ad3322234a82ac921fe1f358 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Mon, 11 Nov 2019 14:23:21 +0100
Subject: [PATCH] Rework Chinese language initialization and tokenization (#4619)

* Rework Chinese language initialization
* Create a `ChineseTokenizer` class
* Modify jieba post-processing to handle whitespace correctly
* Modify non-jieba character tokenization to handle whitespace correctly
* Add a `create_tokenizer()` method to `ChineseDefaults`
* Load lexical attributes
* Update Chinese tag_map for UD v2
* Add very basic Chinese tests
* Test tokenization with and without jieba
* Test `like_num` attribute
* Fix try_jieba_import()
* Fix zh code formatting
---
 spacy/lang/zh/__init__.py             | 108 +++++++++++++++++++++------
 spacy/lang/zh/tag_map.py              |  12 ++--
 spacy/tests/conftest.py               |   6 ++
 spacy/tests/lang/zh/__init__.py       |   0
 spacy/tests/lang/zh/test_text.py      |  25 +++++++
 spacy/tests/lang/zh/test_tokenizer.py |  31 +++++++++
 6 files changed, 156 insertions(+), 26 deletions(-)
 create mode 100644 spacy/tests/lang/zh/__init__.py
 create mode 100644 spacy/tests/lang/zh/test_text.py
 create mode 100644 spacy/tests/lang/zh/test_tokenizer.py

diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 91daea099..5bd7b7335 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -4,19 +4,104 @@ from __future__ import unicode_literals
 from ...attrs import LANG
 from ...language import Language
 from ...tokens import Doc
+from ...util import DummyTokenizer
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 
 
+def try_jieba_import(use_jieba):
+    """Import jieba and return the module.
+
+    Returns None if jieba is not installed and use_jieba is False; raises
+    ImportError if jieba is not installed and use_jieba is True.
+    """
+    try:
+        import jieba
+        return jieba
+    except ImportError:
+        if use_jieba:
+            msg = (
+                "Jieba not installed. Either set Chinese.use_jieba = False, "
+                "or install it https://github.com/fxsjy/jieba"
+            )
+            raise ImportError(msg)
+
+
+class ChineseTokenizer(DummyTokenizer):
+    """Segment text with jieba when use_jieba is set; otherwise split every
+    non-whitespace token of the base tokenizer into single characters."""
+
+    def __init__(self, cls, nlp=None):
+        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+        self.use_jieba = cls.use_jieba
+        # None when jieba is not installed and use_jieba is False
+        self.jieba_seg = try_jieba_import(self.use_jieba)
+        self.tokenizer = Language.Defaults().create_tokenizer(nlp)
+
+    def __call__(self, text):
+        """Return a Doc holding the words and trailing-space flags of text."""
+        # use jieba
+        if self.use_jieba:
+            jieba_words = [x for x in self.jieba_seg.cut(text, cut_all=False) if x]
+            # seed from a slice instead of jieba_words[0]: when there are no
+            # segments (e.g. empty text) this yields an empty Doc rather
+            # than raising IndexError
+            words = jieba_words[:1]
+            spaces = [False] * len(words)
+            for word in jieba_words[1:]:
+                if word.isspace():
+                    # second token in adjacent whitespace following a
+                    # non-space token
+                    if spaces[-1]:
+                        words.append(word)
+                        spaces.append(False)
+                    # first space token following non-space token
+                    elif word == " " and not words[-1].isspace():
+                        spaces[-1] = True
+                    # token is non-space whitespace or any whitespace following
+                    # a whitespace token
+                    else:
+                        # extend previous whitespace token with more whitespace
+                        if words[-1].isspace():
+                            words[-1] += word
+                        # otherwise it's a new whitespace token
+                        else:
+                            words.append(word)
+                            spaces.append(False)
+                else:
+                    words.append(word)
+                    spaces.append(False)
+            return Doc(self.vocab, words=words, spaces=spaces)
+
+        # split into individual characters
+        words = []
+        spaces = []
+        for token in self.tokenizer(text):
+            if token.text.isspace():
+                words.append(token.text)
+                spaces.append(False)
+            else:
+                words.extend(list(token.text))
+                spaces.extend([False] * len(token.text))
+                spaces[-1] = bool(token.whitespace_)
+        return Doc(self.vocab, words=words, spaces=spaces)
+
+
 class ChineseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: "zh"
-    use_jieba = True
     tokenizer_exceptions = BASE_EXCEPTIONS
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    use_jieba = True
+
+    @classmethod
+    def create_tokenizer(cls, nlp=None):
+        return ChineseTokenizer(cls, nlp)
 
 
 class Chinese(Language):
@@ -24,26 +109,7 @@ class Chinese(Language):
     Defaults = ChineseDefaults  # override defaults
 
     def make_doc(self, text):
-        if self.Defaults.use_jieba:
-            try:
-                import jieba
-            except ImportError:
-                msg = (
-                    "Jieba not installed. Either set Chinese.use_jieba = False, "
-                    "or install it https://github.com/fxsjy/jieba"
-                )
-                raise ImportError(msg)
-            words = list(jieba.cut(text, cut_all=False))
-            words = [x for x in words if x]
-            return Doc(self.vocab, words=words, spaces=[False] * len(words))
-        else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
-            return Doc(self.vocab, words=words, spaces=spaces)
+        return self.tokenizer(text)
 
 
 __all__ = ["Chinese"]
diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py
index 8d2f99d01..41e2d2158 100644
--- a/spacy/lang/zh/tag_map.py
+++ b/spacy/lang/zh/tag_map.py
@@ -1,11 +1,12 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PART, INTJ, PRON
+from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
+from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE
 
-# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set.
-# We also map the tags to the simpler Google Universal POS tag set.
+# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn
+# Treebank tag set. We also map the tags to the simpler Universal Dependencies
+# v2 tag set.
 
 TAG_MAP = {
     "AS": {POS: PART},
@@ -38,10 +39,11 @@ TAG_MAP = {
     "OD": {POS: NUM},
     "DT": {POS: DET},
     "CC": {POS: CCONJ},
-    "CS": {POS: CONJ},
+    "CS": {POS: SCONJ},
     "AD": {POS: ADV},
     "JJ": {POS: ADJ},
     "P": {POS: ADP},
     "PN": {POS: PRON},
     "PU": {POS: PUNCT},
+    "_SP": {POS: SPACE},
 }
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index b0d373c42..d6b9ba11f 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -218,3 +218,9 @@ def uk_tokenizer():
 @pytest.fixture(scope="session")
 def ur_tokenizer():
     return get_lang_class("ur").Defaults.create_tokenizer()
+
+
+@pytest.fixture(scope="session")
+def zh_tokenizer():
+    pytest.importorskip("jieba")
+    return get_lang_class("zh").Defaults.create_tokenizer()
diff --git a/spacy/tests/lang/zh/__init__.py b/spacy/tests/lang/zh/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py
new file mode 100644
index 000000000..235f597a5
--- /dev/null
+++ b/spacy/tests/lang/zh/test_text.py
@@ -0,0 +1,25 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "text,match",
+    [
+        ("10", True),
+        ("1", True),
+        ("999.0", True),
+        ("一", True),
+        ("二", True),
+        ("〇", True),
+        ("十一", True),
+        ("狗", False),
+        (",", False),
+    ],
+)
+def test_lex_attrs_like_number(zh_tokenizer, text, match):
+    tokens = zh_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].like_num == match
diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py
new file mode 100644
index 000000000..36d94beb5
--- /dev/null
+++ b/spacy/tests/lang/zh/test_tokenizer.py
@@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+# fmt: off
+TOKENIZER_TESTS = [
+    ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",
+        ['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多',
+         '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做',
+         '为', '母语', '。']),
+]
+# fmt: 
on
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_zh_tokenizer(zh_tokenizer, text, expected_tokens):
+    zh_tokenizer.use_jieba = False  # character-level splitting
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == list(text)
+
+    zh_tokenizer.use_jieba = True  # jieba word segmentation
+    tokens = [token.text for token in zh_tokenizer(text)]
+    assert tokens == expected_tokens
+
+
+def test_extra_spaces(zh_tokenizer):
+    # note: three spaces after "I"; tokens[1] is the two-space whitespace token
+    tokens = zh_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == "  "