Handle empty and whitespace-only docs for Japanese (#5564)

Handle empty and whitespace-only docs in the custom alignment method
used by the Japanese tokenizer.
adrianeboyd 2020-06-08 21:09:23 +02:00 committed by GitHub
parent de00f967ce
commit f162815f45
2 changed files with 19 additions and 0 deletions


@@ -139,6 +139,16 @@ def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
     text_tags = []
     text_spaces = []
     text_pos = 0
+    # handle empty and whitespace-only texts
+    if len(words) == 0:
+        return text_words, text_lemmas, text_tags, text_spaces
+    elif len([word for word in words if not word.isspace()]) == 0:
+        assert text.isspace()
+        text_words = [text]
+        text_lemmas = [text]
+        text_tags = [gap_tag]
+        text_spaces = [False]
+        return text_words, text_lemmas, text_tags, text_spaces
     # normalize words to remove all whitespace tokens
     norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
     # align words with text
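
For reference, the added guard can be read in isolation as a minimal sketch (the standalone function below is illustrative, not spaCy's API; in the real method, words is derived from the tokenizer's dtokens):

def emptyish_alignment(words, text, gap_tag=("空白", "")):
    # Empty doc: nothing to align, return four empty lists.
    if len(words) == 0:
        return [], [], [], []
    # Whitespace-only doc: every token is whitespace, so the whole
    # text collapses into a single gap token tagged 空白.
    if all(word.isspace() for word in words):
        assert text.isspace()
        return [text], [text], [gap_tag], [False]
    # Otherwise, fall through to the normal alignment path.
    return None

assert emptyish_alignment([], "") == ([], [], [], [])
assert emptyish_alignment([" "], " ") == ([" "], [" "], [("空白", "")], [False])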


@@ -93,3 +93,12 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
     assert len(nlp_a(text)) == len_a
     assert len(nlp_b(text)) == len_b
     assert len(nlp_c(text)) == len_c
+
+
+def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
+    doc = ja_tokenizer("")
+    assert len(doc) == 0
+    doc = ja_tokenizer(" ")
+    assert len(doc) == 1
+    doc = ja_tokenizer("\n\n\n \t\t \n\n\n")
+    assert len(doc) == 1
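
The same behavior can be exercised end to end through a blank Japanese pipeline (a usage sketch, assuming spaCy v2.3+ with SudachiPy installed, mirroring the tests above):

import spacy

nlp = spacy.blank("ja")
assert len(nlp("")) == 0   # empty text produces an empty doc
assert len(nlp(" ")) == 1  # whitespace-only text produces one gap token
assert len(nlp("\n\n\n \t\t \n\n\n")) == 1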