diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 63bc06665..0e02e4a2d 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -7,7 +7,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language, BaseDefaults
 from ...tokens import Doc
 from ...scorer import Scorer
-from ...symbols import POS
+from ...symbols import POS, X
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
 from ...vocab import Vocab
@@ -57,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
         for token, dtoken in zip(doc, dtokens):
             first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
             token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
-            token.pos = TAG_MAP[token.tag_][POS]
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
             token.lemma_ = dtoken["lemma"]
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc
diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py
index e6b65dee9..6e06e405e 100644
--- a/spacy/tests/lang/ko/test_tokenizer.py
+++ b/spacy/tests/lang/ko/test_tokenizer.py
@@ -49,6 +49,12 @@ def test_ko_empty_doc(ko_tokenizer):
     assert len(tokens) == 0
 
 
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_unknown_tag(ko_tokenizer):
+    tokens = ko_tokenizer("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
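
For reference, a minimal standalone sketch of the fallback this patch introduces: when the tagger emits a tag with no TAG_MAP entry, the token gets the generic part-of-speech X instead of raising a KeyError. The simplified TAG_MAP and the pos_for_tag helper below are hypothetical illustrations for this note only, not spaCy's actual Korean tag table or API.

# Hypothetical, simplified illustration of the fallback added above.
# TAG_MAP here is a toy mapping, not spaCy's real Korean tag table.
TAG_MAP = {"NNG": "NOUN", "VV": "VERB"}

def pos_for_tag(tag: str) -> str:
    # Unknown tags (e.g. ones produced for transliterated loanwords
    # such as "리피터") fall back to "X" rather than raising KeyError.
    return TAG_MAP.get(tag, "X")

assert pos_for_tag("NNG") == "NOUN"  # known tag: mapped POS
assert pos_for_tag("UNK") == "X"     # unknown tag: generic fallback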