From b7e6e1b9a75ea1301ea8253cd2c6a5d3740cef12 Mon Sep 17 00:00:00 2001
From: adrianeboyd
Date: Tue, 9 Jun 2020 12:00:59 +0200
Subject: [PATCH] Disable sentence segmentation in ja tokenizer (#5566)

---
 spacy/lang/ja/__init__.py             | 1 -
 spacy/tests/lang/ja/test_tokenizer.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 39e0445c2..371cc0f98 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -209,7 +209,6 @@ class JapaneseTokenizer(DummyTokenizer):
             token.lemma_ = lemma
 
         doc.user_data["unidic_tags"] = unidic_tags
-        separate_sentences(doc)
         return doc
 
     def _get_config(self):
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index 30cba42b1..26be5cf59 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -58,6 +58,7 @@ def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     assert pos == expected_pos
 
 
+@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
 @pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_sents):
     sents = [str(sent) for sent in ja_tokenizer(text).sents]