mirror of https://github.com/explosion/spaCy.git
Handle empty and whitespace-only docs for Japanese (#5564)
Handle empty and whitespace-only docs in the custom alignment method used by the Japanese tokenizer.
This commit is contained in:
parent
de00f967ce
commit
f162815f45
|
@ -139,6 +139,16 @@ def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
|
|||
text_tags = []
|
||||
text_spaces = []
|
||||
text_pos = 0
|
||||
# handle empty and whitespace-only texts
|
||||
if len(words) == 0:
|
||||
return text_words, text_lemmas, text_tags, text_spaces
|
||||
elif len([word for word in words if not word.isspace()]) == 0:
|
||||
assert text.isspace()
|
||||
text_words = [text]
|
||||
text_lemmas = [text]
|
||||
text_tags = [gap_tag]
|
||||
text_spaces = [False]
|
||||
return text_words, text_lemmas, text_tags, text_spaces
|
||||
# normalize words to remove all whitespace tokens
|
||||
norm_words, norm_dtokens = zip(*[(word, dtokens) for word, dtokens in zip(words, dtokens) if not word.isspace()])
|
||||
# align words with text
|
||||
|
|
|
@ -93,3 +93,12 @@ def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
|
|||
assert len(nlp_a(text)) == len_a
|
||||
assert len(nlp_b(text)) == len_b
|
||||
assert len(nlp_c(text)) == len_c
|
||||
|
||||
|
||||
def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
    """Regression test: empty and whitespace-only texts must tokenize
    without raising (see the paired change in the Japanese alignment
    helper that special-cases these inputs).

    ja_tokenizer is a project-provided fixture; presumably it wraps the
    Japanese tokenizer pipeline — verify against conftest.
    """
    # The empty string produces an empty Doc.
    doc = ja_tokenizer("")
    assert len(doc) == 0
    # A single space produces a Doc of length 1.
    doc = ja_tokenizer(" ")
    assert len(doc) == 1
    # Longer whitespace-only text (newlines/tabs/spaces) also produces
    # a Doc of length 1 — the whole text is kept as one token.
    doc = ja_tokenizer("\n\n\n \t\t \n\n\n")
    assert len(doc) == 1
|
||||
|
|
Loading…
Reference in New Issue