From 150a39ccca2426fcd10638c8515d7ec98cb79d8f Mon Sep 17 00:00:00 2001 From: Hiroshi Matsuda <40782025+hiroshi-matsuda-rit@users.noreply.github.com> Date: Mon, 22 Jun 2020 21:32:25 +0900 Subject: [PATCH] Japanese model: add user_dict entries and small refactor (#5573) * user_dict fields: adding inflections, reading_forms, sub_tokens deleting: unidic_tags improve code readability around the token alignment procedure * add test cases, replace fugashi with sudachipy in conftest * move bunsetu.py to spaCy Universe as a pipeline component BunsetuRecognizer * tag is space -> both surface and tag are spaces * consider len(text)==0 --- spacy/lang/ja/__init__.py | 203 +++++++++++++------------- spacy/lang/ja/bunsetu.py | 144 ------------------ spacy/tests/lang/ja/test_tokenizer.py | 53 ++++++- 3 files changed, 152 insertions(+), 248 deletions(-) delete mode 100644 spacy/lang/ja/bunsetu.py diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index a7ad0846e..fb8b9d7fe 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -20,12 +20,7 @@ from ... import util # Hold the attributes we need with convenient names -DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"]) - -# Handling for multiple spaces in a row is somewhat awkward, this simplifies -# the flow by creating a dummy with the same interface. -DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"]) -DummySpace = DummyNode(" ", " ", " ") +DetailedToken = namedtuple("DetailedToken", ["surface", "tag", "inf", "lemma", "reading", "sub_tokens"]) def try_sudachi_import(split_mode="A"): @@ -53,7 +48,7 @@ def try_sudachi_import(split_mode="A"): ) -def resolve_pos(orth, pos, next_pos): +def resolve_pos(orth, tag, next_tag): """If necessary, add a field to the POS tag for UD mapping. Under Universal Dependencies, sometimes the same Unidic POS tag can be mapped differently depending on the literal token or its context @@ -64,124 +59,77 @@ def resolve_pos(orth, pos, next_pos): # Some tokens have their UD tag decided based on the POS of the following # token. - # orth based rules - if pos[0] in TAG_ORTH_MAP: - orth_map = TAG_ORTH_MAP[pos[0]] + # apply orth based mapping + if tag in TAG_ORTH_MAP: + orth_map = TAG_ORTH_MAP[tag] if orth in orth_map: - return orth_map[orth], None + return orth_map[orth], None # current_pos, next_pos - # tag bi-gram mapping - if next_pos: - tag_bigram = pos[0], next_pos[0] + # apply tag bi-gram mapping + if next_tag: + tag_bigram = tag, next_tag if tag_bigram in TAG_BIGRAM_MAP: - bipos = TAG_BIGRAM_MAP[tag_bigram] - if bipos[0] is None: - return TAG_MAP[pos[0]][POS], bipos[1] + current_pos, next_pos = TAG_BIGRAM_MAP[tag_bigram] + if current_pos is None: # apply tag uni-gram mapping for current_pos + return TAG_MAP[tag][POS], next_pos # only next_pos is identified by tag bi-gram mapping else: - return bipos + return current_pos, next_pos - return TAG_MAP[pos[0]][POS], None + # apply tag uni-gram mapping + return TAG_MAP[tag][POS], None -# Use a mapping of paired punctuation to avoid splitting quoted sentences. -pairpunct = {'「':'」', '『': '』', '【': '】'} - - -def separate_sentences(doc): - """Given a doc, mark tokens that start sentences based on Unidic tags. - """ - - stack = [] # save paired punctuation - - for i, token in enumerate(doc[:-2]): - # Set all tokens after the first to false by default. This is necessary - # for the doc code to be aware we've done sentencization, see - # `is_sentenced`. 
- token.sent_start = (i == 0) - if token.tag_: - if token.tag_ == "補助記号-括弧開": - ts = str(token) - if ts in pairpunct: - stack.append(pairpunct[ts]) - elif stack and ts == stack[-1]: - stack.pop() - - if token.tag_ == "補助記号-句点": - next_token = doc[i+1] - if next_token.tag_ != token.tag_ and not stack: - next_token.sent_start = True - - -def get_dtokens(tokenizer, text): - tokens = tokenizer.tokenize(text) - words = [] - for ti, token in enumerate(tokens): - tag = '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']) - inf = '-'.join([xx for xx in token.part_of_speech()[4:] if xx != '*']) - dtoken = DetailedToken( - token.surface(), - (tag, inf), - token.dictionary_form()) - if ti > 0 and words[-1].pos[0] == '空白' and tag == '空白': - # don't add multiple space tokens in a row - continue - words.append(dtoken) - - # remove empty tokens. These can be produced with characters like … that - # Sudachi normalizes internally. - words = [ww for ww in words if len(ww.surface) > 0] - return words - - -def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")): +def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): + # Compare the content of tokens and text, first words = [x.surface for x in dtokens] if "".join("".join(words).split()) != "".join(text.split()): raise ValueError(Errors.E194.format(text=text, words=words)) - text_words = [] - text_lemmas = [] - text_tags = [] + + text_dtokens = [] text_spaces = [] text_pos = 0 # handle empty and whitespace-only texts if len(words) == 0: - return text_words, text_lemmas, text_tags, text_spaces + return text_dtokens, text_spaces elif len([word for word in words if not word.isspace()]) == 0: assert text.isspace() - text_words = [text] - text_lemmas = [text] - text_tags = [gap_tag] + text_dtokens = [DetailedToken(text, gap_tag, '', text, None, None)] text_spaces = [False] - return text_words, text_lemmas, text_tags, text_spaces - # normalize words to remove all whitespace tokens - norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()]) - # align words with text - for word, dtoken in zip(norm_words, norm_dtokens): + return text_dtokens, text_spaces + + # align words and dtokens by referring text, and insert gap tokens for the space char spans + for word, dtoken in zip(words, dtokens): + # skip all space tokens + if word.isspace(): + continue try: word_start = text[text_pos:].index(word) except ValueError: raise ValueError(Errors.E194.format(text=text, words=words)) + + # space token if word_start > 0: w = text[text_pos:text_pos + word_start] - text_words.append(w) - text_lemmas.append(w) - text_tags.append(gap_tag) + text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None)) text_spaces.append(False) text_pos += word_start - text_words.append(word) - text_lemmas.append(dtoken.lemma) - text_tags.append(dtoken.pos) + + # content word + text_dtokens.append(dtoken) text_spaces.append(False) text_pos += len(word) + # poll a space char after the word if text_pos < len(text) and text[text_pos] == " ": text_spaces[-1] = True text_pos += 1 + + # trailing space token if text_pos < len(text): w = text[text_pos:] - text_words.append(w) - text_lemmas.append(w) - text_tags.append(gap_tag) + text_dtokens.append(DetailedToken(w, gap_tag, '', w, None, None)) text_spaces.append(False) - return text_words, text_lemmas, text_tags, text_spaces + + return text_dtokens, text_spaces class JapaneseTokenizer(DummyTokenizer): @@ -191,29 +139,78 @@ class JapaneseTokenizer(DummyTokenizer): self.tokenizer 
= try_sudachi_import(self.split_mode) def __call__(self, text): - dtokens = get_dtokens(self.tokenizer, text) + # convert sudachipy.morpheme.Morpheme to DetailedToken and merge continuous spaces + sudachipy_tokens = self.tokenizer.tokenize(text) + dtokens = self._get_dtokens(sudachipy_tokens) + dtokens, spaces = get_dtokens_and_spaces(dtokens, text) - words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text) + # create Doc with tag bi-gram based part-of-speech identification rules + words, tags, inflections, lemmas, readings, sub_tokens_list = zip(*dtokens) if dtokens else [[]] * 6 + sub_tokens_list = list(sub_tokens_list) doc = Doc(self.vocab, words=words, spaces=spaces) - next_pos = None - for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)): - token.tag_ = unidic_tag[0] - if next_pos: + next_pos = None # for bi-gram rules + for idx, (token, dtoken) in enumerate(zip(doc, dtokens)): + token.tag_ = dtoken.tag + if next_pos: # already identified in previous iteration token.pos = next_pos next_pos = None else: token.pos, next_pos = resolve_pos( token.orth_, - unidic_tag, - unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None + dtoken.tag, + tags[idx + 1] if idx + 1 < len(tags) else None ) - # if there's no lemma info (it's an unk) just use the surface - token.lemma_ = lemma - doc.user_data["unidic_tags"] = unidic_tags + token.lemma_ = dtoken.lemma if dtoken.lemma else dtoken.surface + + doc.user_data["inflections"] = inflections + doc.user_data["reading_forms"] = readings + doc.user_data["sub_tokens"] = sub_tokens_list return doc + def _get_dtokens(self, sudachipy_tokens, need_sub_tokens=True): + sub_tokens_list = self._get_sub_tokens(sudachipy_tokens) if need_sub_tokens else None + dtokens = [ + DetailedToken( + token.surface(), # orth + '-'.join([xx for xx in token.part_of_speech()[:4] if xx != '*']), # tag + ','.join([xx for xx in token.part_of_speech()[4:] if xx != '*']), # inf + token.dictionary_form(), # lemma + token.reading_form(), # user_data['reading_forms'] + sub_tokens_list[idx] if sub_tokens_list else None, # user_data['sub_tokens'] + ) for idx, token in enumerate(sudachipy_tokens) if len(token.surface()) > 0 + # remove empty tokens which can be produced with characters like … that + ] + # Sudachi normalizes internally and outputs each space char as a token. 
+ # This is the preparation for get_dtokens_and_spaces() to merge the continuous space tokens + return [ + t for idx, t in enumerate(dtokens) if + idx == 0 or + not t.surface.isspace() or t.tag != '空白' or + not dtokens[idx - 1].surface.isspace() or dtokens[idx - 1].tag != '空白' + ] + + def _get_sub_tokens(self, sudachipy_tokens): + if self.split_mode is None or self.split_mode == "A": # do nothing for default split mode + return None + + sub_tokens_list = [] # list of (list of list of DetailedToken | None) + for token in sudachipy_tokens: + sub_a = token.split(self.tokenizer.SplitMode.A) + if len(sub_a) == 1: # no sub tokens + sub_tokens_list.append(None) + elif self.split_mode == "B": + sub_tokens_list.append([self._get_dtokens(sub_a, False)]) + else: # "C" + sub_b = token.split(self.tokenizer.SplitMode.B) + if len(sub_a) == len(sub_b): + dtokens = self._get_dtokens(sub_a, False) + sub_tokens_list.append([dtokens, dtokens]) + else: + sub_tokens_list.append([self._get_dtokens(sub_a, False), self._get_dtokens(sub_b, False)]) + return sub_tokens_list + def _get_config(self): config = OrderedDict( ( diff --git a/spacy/lang/ja/bunsetu.py b/spacy/lang/ja/bunsetu.py deleted file mode 100644 index 7c3eee336..000000000 --- a/spacy/lang/ja/bunsetu.py +++ /dev/null @@ -1,144 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from .stop_words import STOP_WORDS - - -POS_PHRASE_MAP = { - "NOUN": "NP", - "NUM": "NP", - "PRON": "NP", - "PROPN": "NP", - - "VERB": "VP", - - "ADJ": "ADJP", - - "ADV": "ADVP", - - "CCONJ": "CCONJP", -} - - -# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)] -def yield_bunsetu(doc, debug=False): - bunsetu = [] - bunsetu_may_end = False - phrase_type = None - phrase = None - prev = None - prev_tag = None - prev_dep = None - prev_head = None - for t in doc: - pos = t.pos_ - pos_type = POS_PHRASE_MAP.get(pos, None) - tag = t.tag_ - dep = t.dep_ - head = t.head.i - if debug: - print(t.i, t.orth_, pos, pos_type, dep, head, bunsetu_may_end, phrase_type, phrase, bunsetu) - - # DET is always an individual bunsetu - if pos == "DET": - if bunsetu: - yield bunsetu, phrase_type, phrase - yield [t], None, None - bunsetu = [] - bunsetu_may_end = False - phrase_type = None - phrase = None - - # PRON or Open PUNCT always splits bunsetu - elif tag == "補助記号-括弧開": - if bunsetu: - yield bunsetu, phrase_type, phrase - bunsetu = [t] - bunsetu_may_end = True - phrase_type = None - phrase = None - - # bunsetu head not appeared - elif phrase_type is None: - if bunsetu and prev_tag == "補助記号-読点": - yield bunsetu, phrase_type, phrase - bunsetu = [] - bunsetu_may_end = False - phrase_type = None - phrase = None - bunsetu.append(t) - if pos_type: # begin phrase - phrase = [t] - phrase_type = pos_type - if pos_type in {"ADVP", "CCONJP"}: - bunsetu_may_end = True - - # entering new bunsetu - elif pos_type and ( - pos_type != phrase_type or # different phrase type arises - bunsetu_may_end # same phrase type but bunsetu already ended - ): - # exceptional case: NOUN to VERB - if phrase_type == "NP" and pos_type == "VP" and prev_dep == 'compound' and prev_head == t.i: - bunsetu.append(t) - phrase_type = "VP" - phrase.append(t) - # exceptional case: VERB to NOUN - elif phrase_type == "VP" and pos_type == "NP" and ( - prev_dep == 'compound' and prev_head == t.i or - dep == 'compound' and prev == head or - prev_dep == 'nmod' and prev_head == t.i - ): - bunsetu.append(t) - phrase_type = "NP" - phrase.append(t) - else: - yield bunsetu, phrase_type, phrase - 
bunsetu = [t] - bunsetu_may_end = False - phrase_type = pos_type - phrase = [t] - - # NOUN bunsetu - elif phrase_type == "NP": - bunsetu.append(t) - if not bunsetu_may_end and (( - (pos_type == "NP" or pos == "SYM") and (prev_head == t.i or prev_head == head) and prev_dep in {'compound', 'nummod'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): - phrase.append(t) - else: - bunsetu_may_end = True - - # VERB bunsetu - elif phrase_type == "VP": - bunsetu.append(t) - if not bunsetu_may_end and pos == "VERB" and prev_head == t.i and prev_dep == 'compound': - phrase.append(t) - else: - bunsetu_may_end = True - - # ADJ bunsetu - elif phrase_type == "ADJP" and tag != '連体詞': - bunsetu.append(t) - if not bunsetu_may_end and (( - pos == "NOUN" and (prev_head == t.i or prev_head == head) and prev_dep in {'amod', 'compound'} - ) or ( - pos == "PART" and (prev == head or prev_head == head) and dep == 'mark' - )): - phrase.append(t) - else: - bunsetu_may_end = True - - # other bunsetu - else: - bunsetu.append(t) - - prev = t.i - prev_tag = t.tag_ - prev_dep = t.dep_ - prev_head = head - - if bunsetu: - yield bunsetu, phrase_type, phrase diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 26be5cf59..651e906eb 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS -from spacy.lang.ja import Japanese +from spacy.lang.ja import Japanese, DetailedToken # fmt: off TOKENIZER_TESTS = [ @@ -96,6 +96,57 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c): assert len(nlp_c(text)) == len_c +@pytest.mark.parametrize("text,sub_tokens_list_a,sub_tokens_list_b,sub_tokens_list_c", + [ + ( + "選挙管理委員会", + [None, None, None, None], + [None, None, [ + [ + DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None), + DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None), + ] + ]], + [[ + [ + DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None), + DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None), + DetailedToken(surface='委員', tag='名詞-普通名詞-一般', inf='', lemma='委員', reading='イイン', sub_tokens=None), + DetailedToken(surface='会', tag='名詞-普通名詞-一般', inf='', lemma='会', reading='カイ', sub_tokens=None), + ], [ + DetailedToken(surface='選挙', tag='名詞-普通名詞-サ変可能', inf='', lemma='選挙', reading='センキョ', sub_tokens=None), + DetailedToken(surface='管理', tag='名詞-普通名詞-サ変可能', inf='', lemma='管理', reading='カンリ', sub_tokens=None), + DetailedToken(surface='委員会', tag='名詞-普通名詞-一般', inf='', lemma='委員会', reading='イインカイ', sub_tokens=None), + ] + ]] + ), + ] +) +def test_ja_tokenizer_sub_tokens(ja_tokenizer, text, sub_tokens_list_a, sub_tokens_list_b, sub_tokens_list_c): + nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}}) + nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}}) + nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}}) + + assert ja_tokenizer(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_a(text).user_data["sub_tokens"] == sub_tokens_list_a + assert nlp_b(text).user_data["sub_tokens"] == sub_tokens_list_b + assert nlp_c(text).user_data["sub_tokens"] == sub_tokens_list_c + + 
+@pytest.mark.parametrize("text,inflections,reading_forms", + [ + ( + "取ってつけた", + ("五段-ラ行,連用形-促音便", "", "下一段-カ行,連用形-一般", "助動詞-タ,終止形-一般"), + ("トッ", "テ", "ツケ", "タ"), + ), + ] +) +def test_ja_tokenizer_inflections_reading_forms(ja_tokenizer, text, inflections, reading_forms): + assert ja_tokenizer(text).user_data["inflections"] == inflections + assert ja_tokenizer(text).user_data["reading_forms"] == reading_forms + + def test_ja_tokenizer_emptyish_texts(ja_tokenizer): doc = ja_tokenizer("") assert len(doc) == 0
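
A short usage sketch of the features this patch introduces (not part of the diff itself): it assumes a spaCy build that includes this change, with SudachiPy and its dictionary installed, and it reads back the new doc.user_data entries populated in spacy/lang/ja/__init__.py, using the same meta-based split_mode configuration as the tests above.

    # Illustrative sketch only; the input string is taken from the sub_tokens test case above.
    from spacy.lang.ja import Japanese

    # split_mode "C" keeps the longest units; sub_tokens then carry the finer A/B splits
    nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
    doc = nlp("選挙管理委員会")

    for token, reading, inflection, sub_tokens in zip(
        doc,
        doc.user_data["reading_forms"],  # katakana reading per token
        doc.user_data["inflections"],    # comma-joined inflection info per token
        doc.user_data["sub_tokens"],     # None, or lists of DetailedToken per token
    ):
        print(token.orth_, token.tag_, token.lemma_, reading, inflection, sub_tokens)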