Rename ja morph features to Inflection and Reading (#9520)

* Rename ja morph features to Inflection and Reading
This commit is contained in:
Adriane Boyd 2021-10-27 13:13:03 +02:00 committed by GitHub
parent 2ea9b58006
commit 0c97ed2746
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 4 additions and 4 deletions

View File

@@ -80,12 +80,12 @@ class JapaneseTokenizer(DummyTokenizer):
morph = {} morph = {}
if dtoken.inf: if dtoken.inf:
# it's normal for this to be empty for non-inflecting types # it's normal for this to be empty for non-inflecting types
morph["inflection"] = dtoken.inf morph["Inflection"] = dtoken.inf
token.norm_ = dtoken.norm token.norm_ = dtoken.norm
if dtoken.reading: if dtoken.reading:
# punctuation is its own reading, but we don't want values like # punctuation is its own reading, but we don't want values like
# "=" here # "=" here
morph["reading"] = re.sub("[=|]", "_", dtoken.reading) morph["Reading"] = re.sub("[=|]", "_", dtoken.reading)
token.morph = MorphAnalysis(self.vocab, morph) token.morph = MorphAnalysis(self.vocab, morph)
if self.need_subtokens: if self.need_subtokens:
doc.user_data["sub_tokens"] = sub_tokens_list doc.user_data["sub_tokens"] = sub_tokens_list

View File

@@ -144,9 +144,9 @@ def test_ja_tokenizer_inflections_reading_forms(
ja_tokenizer, text, inflections, reading_forms ja_tokenizer, text, inflections, reading_forms
): ):
tokens = ja_tokenizer(text) tokens = ja_tokenizer(text)
test_inflections = [tt.morph.get("inflection") for tt in tokens] test_inflections = [tt.morph.get("Inflection") for tt in tokens]
assert test_inflections == list(inflections) assert test_inflections == list(inflections)
test_readings = [tt.morph.get("reading") for tt in tokens] test_readings = [tt.morph.get("Reading") for tt in tokens]
assert test_readings == list(reading_forms) assert test_readings == list(reading_forms)