mirror of https://github.com/explosion/spaCy.git
Handle unknown tags in KoreanTokenizer tag map (#10536)
commit e908a67829
parent c17980e535
spacy/lang/ko/__init__.py
@@ -7,7 +7,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language, BaseDefaults
 from ...tokens import Doc
 from ...scorer import Scorer
-from ...symbols import POS
+from ...symbols import POS, X
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
 from ...vocab import Vocab
@@ -57,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
         for token, dtoken in zip(doc, dtokens):
             first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
             token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
-            token.pos = TAG_MAP[token.tag_][POS]
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
             token.lemma_ = dtoken["lemma"]
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc
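Note: the fix guards the TAG_MAP lookup so that fine-grained tags the map does not know about fall back to the X ("other") part of speech instead of raising a KeyError. A minimal, self-contained sketch of that pattern, using a hypothetical one-entry tag map shaped like the real one in spacy.lang.ko.tag_map:

# Sketch of the fallback pattern from the diff above.
# TAG_MAP here is a hypothetical one-entry stand-in with the same shape
# as spaCy's Korean tag map: {fine-grained tag: {POS: coarse POS symbol}}.
from spacy.symbols import POS, NOUN, X

TAG_MAP = {"NNG": {POS: NOUN}}  # "NNG" = general noun in the mecab-ko tagset

def coarse_pos(fine_tag: str) -> int:
    # Map a fine-grained tag to a coarse POS id, defaulting to X
    # when the tag is missing (e.g. a tag from a newer dictionary).
    if fine_tag in TAG_MAP:
        return TAG_MAP[fine_tag][POS]
    return X

assert coarse_pos("NNG") == NOUN
assert coarse_pos("SOME-NEW-TAG") == X  # hypothetical unknown tag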
spacy/tests/lang/ko/test_tokenizer.py
@@ -49,6 +49,12 @@ def test_ko_empty_doc(ko_tokenizer):
     assert len(tokens) == 0
 
 
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_unknown_tag(ko_tokenizer):
+    tokens = ko_tokenizer("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
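For a quick interactive check of the regression test's input (this assumes MeCab and the mecab-ko dictionary are installed, which the Korean tokenizer requires via natto-py):

import spacy

nlp = spacy.blank("ko")
doc = nlp("미닛 리피터")
# Before this fix, a token whose fine-grained tag was missing from TAG_MAP
# raised a KeyError; now such tokens get the fallback coarse POS "X".
print([(t.text, t.tag_, t.pos_) for t in doc])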