mirror of https://github.com/explosion/spaCy.git
Update Korean defaults for Tokenizer (#10322)
Update the Korean language defaults for `Tokenizer` so that tokenization follows UD Korean Kaist.
parent f32ee2e533
commit 30030176ee
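In practice (a sketch, not part of the commit), the new defaults mean the rule-based tokenizer splits Korean middle dots, parentheses, quotes, and the numeric-range tilde as infixes. The override below uses the registered `spacy.Tokenizer.v1` factory, the same one the tests added in this commit use; the expected outputs are taken from those tests:

    import spacy

    # Swap in the rule-based tokenizer; the default Korean tokenizer is the
    # MeCab-based one and does not use the language's infix rules.
    config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
    nlp = spacy.blank("ko", config=config)

    print([t.text for t in nlp("부(富)는")])    # ['부', '(', '富', ')', '는']
    print([t.text for t in nlp("1982~1983.")])  # ['1982', '~', '1983', '.']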
spacy/lang/ko/__init__.py
@@ -1,5 +1,6 @@
 from typing import Iterator, Any, Dict
 
+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
@@ -85,6 +86,7 @@ class KoreanDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES
 
 
 class Korean(Language):
spacy/lang/ko/punctuation.py (new file)
@@ -0,0 +1,12 @@
+from ..char_classes import LIST_QUOTES
+from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
+
+
+_infixes = (
+    ["·", "ㆍ", "\(", "\)"]
+    + [r"(?<=[0-9])~(?=[0-9-])"]
+    + LIST_QUOTES
+    + BASE_TOKENIZER_INFIXES
+)
+
+TOKENIZER_INFIXES = _infixes
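For reference, a sketch (not part of the diff) of how these patterns are consumed: spaCy joins the infix list into a single regex with `spacy.util.compile_infix_regex`, and the rule-based tokenizer uses its `finditer` as `infix_finditer`. The import assumes the new module above is available as `spacy.lang.ko.punctuation`:

    from spacy.util import compile_infix_regex
    from spacy.lang.ko.punctuation import TOKENIZER_INFIXES

    infix_re = compile_infix_regex(TOKENIZER_INFIXES)
    # The new (?<=[0-9])~(?=[0-9-]) rule matches a tilde between digits,
    # so a numeric range like "1982~1983" gets split apart:
    print([m.group() for m in infix_re.finditer("1982~1983")])  # ['~']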
spacy/tests/conftest.py
@@ -227,6 +227,19 @@ def ko_tokenizer():
     return get_lang_class("ko")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_tokenizer():
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.Tokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
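The fixture above swaps the default MeCab-based Korean tokenizer for the registered rule-based `spacy.Tokenizer.v1`, so the new infix rules are exercised without the external MeCab dependency. Outside pytest, the same object can be built directly (a sketch, not part of the diff):

    from spacy.util import get_lang_class

    # Same config as the ko_tokenizer_tokenizer fixture above.
    config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
    tokenizer = get_lang_class("ko").from_config(config).tokenizer
    print([t.text for t in tokenizer("'예'는")])  # ["'", '예', "'", '는'] per the tests below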
spacy/tests/lang/ko/test_tokenizer.py
@@ -47,3 +47,23 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
 def test_ko_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
+
+
+# fmt: off
+SPACY_TOKENIZER_TESTS = [
+    ("있다.", "있다 ."),
+    ("'예'는", "' 예 ' 는"),
+    ("부 (富) 는", "부 ( 富 ) 는"),
+    ("부(富)는", "부 ( 富 ) 는"),
+    ("1982~1983.", "1982 ~ 1983 ."),
+    ("사과·배·복숭아·수박은 모두 과일이다.", "사과 · 배 · 복숭아 · 수박은 모두 과일이다 ."),
+    ("그렇구나~", "그렇구나~"),
+    ("『9시 반의 당구』,", "『 9시 반의 당구 』 ,"),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", SPACY_TOKENIZER_TESTS)
+def test_ko_spacy_tokenizer(ko_tokenizer_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_tokenizer(text)]
+    assert tokens == expected_tokens.split()