From 30030176ee066e2de92238802d7af9d6120d689f Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 21 Feb 2022 10:26:19 +0100
Subject: [PATCH] Update Korean defaults for Tokenizer (#10322)

Update Korean defaults for `Tokenizer` for tokenization following UD Korean Kaist.
---
 spacy/lang/ko/__init__.py             |  2 ++
 spacy/lang/ko/punctuation.py          | 12 ++++++++++++
 spacy/tests/conftest.py               | 13 +++++++++++++
 spacy/tests/lang/ko/test_tokenizer.py | 20 ++++++++++++++++++++
 4 files changed, 47 insertions(+)
 create mode 100644 spacy/lang/ko/punctuation.py

diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index a03f7821a..63bc06665 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -1,5 +1,6 @@
 from typing import Iterator, Any, Dict
 
+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
@@ -85,6 +86,7 @@ class KoreanDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES
 
 
 class Korean(Language):
diff --git a/spacy/lang/ko/punctuation.py b/spacy/lang/ko/punctuation.py
new file mode 100644
index 000000000..7f7b40c5b
--- /dev/null
+++ b/spacy/lang/ko/punctuation.py
@@ -0,0 +1,12 @@
+from ..char_classes import LIST_QUOTES
+from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
+
+
+_infixes = (
+    ["·", "ㆍ", "\(", "\)"]
+    + [r"(?<=[0-9])~(?=[0-9-])"]
+    + LIST_QUOTES
+    + BASE_TOKENIZER_INFIXES
+)
+
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index ee90a9f38..f9266cb94 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -227,6 +227,19 @@ def ko_tokenizer():
     return get_lang_class("ko")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_tokenizer():
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.Tokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py
index eac309857..e6b65dee9 100644
--- a/spacy/tests/lang/ko/test_tokenizer.py
+++ b/spacy/tests/lang/ko/test_tokenizer.py
@@ -47,3 +47,23 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
 def test_ko_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
+
+
+# fmt: off
+SPACY_TOKENIZER_TESTS = [
+    ("있다.", "있다 ."),
+    ("'예'는", "' 예 ' 는"),
+    ("부 (富) 는", "부 ( 富 ) 는"),
+    ("부(富)는", "부 ( 富 ) 는"),
+    ("1982~1983.", "1982 ~ 1983 ."),
+    ("사과·배·복숭아·수박은 모두 과일이다.", "사과 · 배 · 복숭아 · 수박은 모두 과일이다 ."),
+    ("그렇구나~", "그렇구나~"),
+    ("『9시 반의 당구』,", "『 9시 반의 당구 』 ,"),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", SPACY_TOKENIZER_TESTS)
+def test_ko_spacy_tokenizer(ko_tokenizer_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_tokenizer(text)]
+    assert tokens == expected_tokens.split()
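
For context, a minimal usage sketch of the configuration that the new `ko_tokenizer_tokenizer` fixture exercises: building a Korean pipeline with the rule-based `spacy.Tokenizer.v1` so that the new `TOKENIZER_INFIXES` take effect (this also sidesteps the external mecab dependency used by the default morpheme-based Korean tokenizer). The config and calls mirror the fixture in `spacy/tests/conftest.py`; the sample sentence and expected tokens are taken from the new test cases, and the rest is illustrative.

# Sketch: Korean pipeline using the rule-based spacy.Tokenizer.v1
# instead of the default morpheme-based tokenizer, mirroring the
# ko_tokenizer_tokenizer fixture added in this patch.
from spacy.util import get_lang_class

config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy.Tokenizer.v1",
        }
    }
}
nlp = get_lang_class("ko").from_config(config)

# Input and expected segmentation taken from SPACY_TOKENIZER_TESTS above.
doc = nlp.tokenizer("사과·배·복숭아·수박은 모두 과일이다.")
print([t.text for t in doc])
# ['사과', '·', '배', '·', '복숭아', '·', '수박은', '모두', '과일이다', '.']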