mirror of https://github.com/explosion/spaCy.git
Handle empty and whitespace-only docs for Japanese (#5564)
Handle empty and whitespace-only docs in the custom alignment method used by the Japanese tokenizer.
This commit is contained in:
parent
de00f967ce
commit
f162815f45
|
@ -139,6 +139,16 @@ def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
|
||||||
text_tags = []
|
text_tags = []
|
||||||
text_spaces = []
|
text_spaces = []
|
||||||
text_pos = 0
|
text_pos = 0
|
||||||
|
# handle empty and whitespace-only texts
|
||||||
|
if len(words) == 0:
|
||||||
|
return text_words, text_lemmas, text_tags, text_spaces
|
||||||
|
elif len([word for word in words if not word.isspace()]) == 0:
|
||||||
|
assert text.isspace()
|
||||||
|
text_words = [text]
|
||||||
|
text_lemmas = [text]
|
||||||
|
text_tags = [gap_tag]
|
||||||
|
text_spaces = [False]
|
||||||
|
return text_words, text_lemmas, text_tags, text_spaces
|
||||||
# normalize words to remove all whitespace tokens
|
# normalize words to remove all whitespace tokens
|
||||||
norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
|
norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
|
||||||
# align words with text
|
# align words with text
|
||||||
|
|
|
@ -93,3 +93,12 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
||||||
assert len(nlp_a(text)) == len_a
|
assert len(nlp_a(text)) == len_a
|
||||||
assert len(nlp_b(text)) == len_b
|
assert len(nlp_b(text)) == len_b
|
||||||
assert len(nlp_c(text)) == len_c
|
assert len(nlp_c(text)) == len_c
|
||||||
|
|
||||||
|
|
||||||
|
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
    """Check token counts for empty and whitespace-only input.

    An empty string must produce a doc with no tokens, while a text made up
    solely of whitespace must produce a doc with exactly one token.
    """
    empty_doc = ja_tokenizer("")
    assert len(empty_doc) == 0

    # Whitespace-only inputs: a single space, then a mix of newlines, tabs
    # and spaces. Each is expected to yield exactly one token.
    for blank_text in (" ", "\n\n\n \t\t \n\n\n"):
        blank_doc = ja_tokenizer(blank_text)
        assert len(blank_doc) == 1
|
||||||
|
|
Loading…
Reference in New Issue