From 58f06e61800a477f67d13911068fd24892ccfa15 Mon Sep 17 00:00:00 2001 From: cedar101 Date: Wed, 10 Jul 2019 05:23:16 +0900 Subject: [PATCH] Korean support (#3901) * start lang/ko * add test codes * using natto-py * add test_ko_tokenizer_full_tags() * spaCy contributor agreement * external dependency for ko * collections.namedtuple for python version < 3.5 * case fix * tuple unpacking * add jongseong(final consonant) * apply mecab option * Remove Pipfile for now Co-authored-by: Ines Montani --- .github/contributors/cedar101.md | 106 +++++++++++++++++++ .gitignore | 2 + setup.py | 1 + spacy/lang/ko/__init__.py | 118 ++++++++++++++++++++++ spacy/lang/ko/examples.py | 15 +++ spacy/lang/ko/stop_words.py | 68 +++++++++++++ spacy/lang/ko/tag_map.py | 66 ++++++++++++ spacy/tests/conftest.py | 6 ++ spacy/tests/lang/ko/__init__.py | 0 spacy/tests/lang/ko/test_lemmatization.py | 13 +++ spacy/tests/lang/ko/test_tokenizer.py | 46 +++++++++ website/meta/languages.json | 11 ++ 12 files changed, 452 insertions(+) create mode 100644 .github/contributors/cedar101.md create mode 100644 spacy/lang/ko/__init__.py create mode 100644 spacy/lang/ko/examples.py create mode 100644 spacy/lang/ko/stop_words.py create mode 100644 spacy/lang/ko/tag_map.py create mode 100644 spacy/tests/lang/ko/__init__.py create mode 100644 spacy/tests/lang/ko/test_lemmatization.py create mode 100644 spacy/tests/lang/ko/test_tokenizer.py diff --git a/.github/contributors/cedar101.md b/.github/contributors/cedar101.md new file mode 100644 index 000000000..4d04ebacf --- /dev/null +++ b/.github/contributors/cedar101.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. 
The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. 
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Kim, Baeg-il | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-07-03 | +| GitHub username | cedar101 | +| Website (optional) | | diff --git a/.gitignore b/.gitignore index ef586ac8d..35d431d48 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,8 @@ parts/ sdist/ var/ *.egg-info/ +pip-wheel-metadata/ +Pipfile.lock .installed.cfg *.egg .eggs diff --git a/setup.py b/setup.py index 33623588c..544188f4a 100755 --- a/setup.py +++ b/setup.py @@ -246,6 +246,7 @@ def setup_package(): "cuda100": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda100>=5.0.0b4"], # Language tokenizers with external dependencies "ja": ["mecab-python3==0.7"], + "ko": ["natto-py==0.9.0"], }, python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*", classifiers=[ diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py new file mode 100644 index 000000000..111d01720 --- /dev/null +++ b/spacy/lang/ko/__init__.py @@ -0,0 +1,118 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +import re +import sys + + +from .stop_words import STOP_WORDS +from .tag_map import TAG_MAP, POS +from ...attrs import LANG +from ...language import Language +from ...tokens import Doc +from ...compat import copy_reg +from ...util import DummyTokenizer +from ...compat import is_python3, is_python_pre_3_5 + +is_python_post_3_7 = is_python3 and sys.version_info[1] >= 7 + +# fmt: off +if is_python_pre_3_5: + from collections import namedtuple + Morpheme = namedtuple("Morpheme", "surface lemma tag") +elif is_python_post_3_7: + from dataclasses import dataclass + @dataclass(frozen=True) + class Morpheme: + surface: str + lemma: str + tag: str +else: + from typing import NamedTuple + class Morpheme(NamedTuple): + surface: str + lemma: str + tag: str + + +def try_mecab_import(): + try: + from natto import MeCab + return MeCab + except 
ImportError: + raise ImportError( + "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " + "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " + "and [natto-py](https://github.com/buruzaemon/natto-py)" + ) +# fmt: on + + +def check_spaces(text, tokens): # yield, for each token, whether whitespace follows it in text + token_pattern = re.compile(r"\s*".join("({})".format(re.escape(t)) for t in tokens)) # str.format, not an f-string (syntax error before Python 3.6); re.escape keeps tokens such as "." or "(" literal + m = token_pattern.match(text) + if m is not None and m.lastindex: # lastindex is None when there are no tokens + for i in range(1, m.lastindex): + yield m.end(i) < m.start(i + 1) + yield False + + +class KoreanTokenizer(DummyTokenizer): + def __init__(self, cls, nlp=None): + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) + self.Tokenizer = try_mecab_import() + + def __call__(self, text): + dtokens = list(self.detailed_tokens(text)) + surfaces = [dt.surface for dt in dtokens] + doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces))) + for token, dtoken in zip(doc, dtokens): + first_tag, sep, eomi_tags = dtoken.tag.partition("+") + token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미) + token.lemma_ = dtoken.lemma + doc.user_data["full_tags"] = [dt.tag for dt in dtokens] + return doc + + def detailed_tokens(self, text): + # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], + # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * + with self.Tokenizer("-F%f[0],%f[7]") as tokenizer: + for node in tokenizer.parse(text, as_nodes=True): + if node.is_eos(): + break + surface = node.surface + feature = node.feature + tag, _, expr = feature.partition(",") + lemma, _, remainder = expr.partition("/") + if lemma == "*": + lemma = surface + yield Morpheme(surface, lemma, tag) + + +class KoreanDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda _text: "ko" + stop_words = STOP_WORDS + tag_map = TAG_MAP + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + + 
@classmethod + def create_tokenizer(cls, nlp=None): + return KoreanTokenizer(cls, nlp) + + +class Korean(Language): + lang = "ko" + Defaults = KoreanDefaults + + def make_doc(self, text): + return self.tokenizer(text) + + +def pickle_korean(instance): + return Korean, tuple() + + +copy_reg.pickle(Korean, pickle_korean) + +__all__ = ["Korean"] diff --git a/spacy/lang/ko/examples.py b/spacy/lang/ko/examples.py new file mode 100644 index 000000000..10a6ea9bd --- /dev/null +++ b/spacy/lang/ko/examples.py @@ -0,0 +1,15 @@ +# coding: utf8 +from __future__ import unicode_literals +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ko.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.", + "자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.", + "자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.", + "런던은 영국의 수도이자 가장 큰 도시입니다." +] diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py new file mode 100644 index 000000000..53cf6f29a --- /dev/null +++ b/spacy/lang/ko/stop_words.py @@ -0,0 +1,68 @@ +# coding: utf8 +from __future__ import unicode_literals + +STOP_WORDS = set(""" +이 +있 +하 +것 +들 +그 +되 +수 +이 +보 +않 +없 +나 +주 +아니 +등 +같 +때 +년 +가 +한 +지 +오 +말 +일 +그렇 +위하 +때문 +그것 +두 +말하 +알 +그러나 +받 +못하 +일 +그런 +또 +더 +많 +그리고 +좋 +크 +시키 +그러 +하나 +살 +데 +안 +어떤 +번 +나 +다른 +어떻 +들 +이렇 +점 +싶 +말 +좀 +원 +잘 +놓 +""".split()) diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py new file mode 100644 index 000000000..ed6b58170 --- /dev/null +++ b/spacy/lang/ko/tag_map.py @@ -0,0 +1,66 @@ +# encoding: utf8 +from __future__ import unicode_literals +from collections import defaultdict + +from ...symbols import (POS, PUNCT, INTJ, X, SYM, + ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN, + NUM, DET) + +# 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴 +# https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265 +# 
https://universaldependencies.org/u/pos/ +TAG_MAP = { + # J.{1,2} 조사 + "JKS": {POS: ADP}, + "JKC": {POS: ADP}, + "JKG": {POS: ADP}, + "JKO": {POS: ADP}, + "JKB": {POS: ADP}, + "JKV": {POS: ADP}, + "JKQ": {POS: ADP}, + "JX": {POS: ADP}, # 보조사 + "JC": {POS: CONJ}, # 접속 조사 + "MAJ": {POS: CONJ}, # 접속 부사 + "MAG": {POS: ADV}, # 일반 부사 + "MM": {POS: DET}, # 관형사 + + "XPN": {POS: X}, # 접두사 + # XS. 접미사 + "XSN": {POS: X}, + "XSV": {POS: X}, + "XSA": {POS: X}, + "XR": {POS: X}, # 어근 + # E.{1,2} 어미 + "EP": {POS: X}, + "EF": {POS: X}, + "EC": {POS: X}, + "ETN": {POS: X}, + "ETM": {POS: X}, + + "IC": {POS: INTJ}, # 감탄사 + + "VV": {POS: VERB}, # 동사 + "VA": {POS: ADJ}, # 형용사 + "VX": {POS: AUX}, # 보조 용언 + "VCP": {POS: ADP}, # 긍정 지정사(이다) + "VCN": {POS: ADJ}, # 부정 지정사(아니다) + + "NNG": {POS: NOUN}, # 일반 명사(general noun) + "NNB": {POS: NOUN}, # 의존 명사 + "NNBC": {POS: NOUN}, # 의존 명사(단위: unit) + "NNP": {POS: PROPN}, # 고유 명사(proper noun) + "NP": {POS: PRON}, # 대명사 + "NR": {POS: NUM}, # 수사(numerals) + "SN": {POS: NUM}, # 숫자 + + # S.{1,2} 부호 + # 문장 부호 + "SF": {POS: PUNCT}, # period or other EOS marker + "SE": {POS: PUNCT}, + "SC": {POS: PUNCT}, # comma, etc. 
+ "SSO": {POS: PUNCT}, # open bracket + "SSC": {POS: PUNCT}, # close bracket + "SY": {POS: SYM}, # 기타 기호 + "SL": {POS: X}, # 외국어 + "SH": {POS: X}, # 한자 +} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 4bef85a1b..fdd86616d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -124,6 +124,12 @@ def ja_tokenizer(): return get_lang_class("ja").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ko_tokenizer(): + pytest.importorskip("natto") + return get_lang_class("ko").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def lt_tokenizer(): return get_lang_class("lt").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/ko/__init__.py b/spacy/tests/lang/ko/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py new file mode 100644 index 000000000..67371d4ce --- /dev/null +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize( + "word,lemma", + [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")], +) +def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): + test_lemma = ko_tokenizer(word)[0].lemma_ + assert test_lemma == lemma diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py new file mode 100644 index 000000000..bd1d94aec --- /dev/null +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +# fmt: off +TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."), + ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 .")] + +TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", + "NNP NNG NNG JKB VV EC VX EF SF"), + ("영등포구에 있는 맛집 좀 알려주세요.", + "NNP JKB VV ETM NNG MAG VV VX EP SF")] + +FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.", + "NNP JKB VV ETM NNG MAG 
VV+EC VX EP+EF SF")] + +POS_TESTS = [("서울 타워 근처에 살고 있습니다.", + "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"), + ("영등포구에 있는 맛집 좀 알려주세요.", + "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")] +# fmt: on + + +@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) +def test_ko_tokenizer(ko_tokenizer, text, expected_tokens): + tokens = [token.text for token in ko_tokenizer(text)] + assert tokens == expected_tokens.split() + + +@pytest.mark.parametrize("text,expected_tags", TAG_TESTS) +def test_ko_tokenizer_tags(ko_tokenizer, text, expected_tags): + tags = [token.tag_ for token in ko_tokenizer(text)] + assert tags == expected_tags.split() + + +@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS) +def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags): + tags = ko_tokenizer(text).user_data["full_tags"] + assert tags == expected_tags.split() + + +@pytest.mark.parametrize("text,expected_pos", POS_TESTS) +def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos): + pos = [token.pos_ for token in ko_tokenizer(text)] + assert pos == expected_pos.split() diff --git a/website/meta/languages.json b/website/meta/languages.json index cfa468d7f..1169a3397 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -153,6 +153,17 @@ "example": "これは文章です。", "has_examples": true }, + { + "code": "ko", + "name": "Korean", + "dependencies": [ + { "name": "mecab-ko", "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" }, + { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, + { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py"} + ], + "example": "이것은 문장입니다.", + "has_examples": true + }, { "code": "vi", "name": "Vietnamese",