diff --git a/spacy/_ml.py b/spacy/_ml.py index cca324b45..d16e124dc 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -661,21 +661,33 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) - pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name + pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name context_width = cfg.get("context_width") entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(entity_width, entity_width+context_width+1+ner_types)\ - >> Affine(1, entity_width, drop_factor=0.0)\ - >> logistic + model = ( + Affine(entity_width, entity_width + context_width + 1 + ner_types) + >> Affine(1, entity_width, drop_factor=0.0) + >> logistic + ) # context encoder - tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth, - bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\ - >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - >> zero_init(Affine(context_width, hidden_width)) + tok2vec = ( + Tok2Vec( + width=hidden_width, + embed_size=embed_width, + pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, + subword_features=True, + conv_depth=conv_depth, + bilstm_depth=0, + ) + >> flatten_add_lengths + >> Pooling(mean_pool) + >> Residual(zero_init(Maxout(hidden_width, hidden_width))) + >> zero_init(Affine(context_width, hidden_width)) + ) model.tok2vec = tok2vec @@ -684,6 +696,7 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): model.nO = 1 return model + @layerize def flatten(seqs, drop=0.0): ops = Model.ops diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 111d01720..f5dff75f1 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -6,7 +6,7 @@ import sys from .stop_words import STOP_WORDS -from .tag_map import TAG_MAP, POS +from .tag_map import TAG_MAP from ...attrs import LANG from ...language import Language from ...tokens import Doc @@ -22,6 +22,7 @@ if is_python_pre_3_5: Morpheme = namedtuple("Morpheme", "surface lemma tag") elif is_python_post_3_7: from dataclasses import dataclass + @dataclass(frozen=True) class Morpheme: surface: str @@ -29,6 +30,7 @@ elif is_python_post_3_7: tag: str else: from typing import NamedTuple + class Morpheme(NamedTuple): surface: str lemma: str diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py index ed6b58170..57317c969 100644 --- a/spacy/lang/ko/tag_map.py +++ b/spacy/lang/ko/tag_map.py @@ -1,66 +1,59 @@ # encoding: utf8 from __future__ import unicode_literals -from collections import defaultdict -from ...symbols import (POS, PUNCT, INTJ, X, SYM, - ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN, - NUM, DET) - +from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON +from ...symbols import VERB, ADV, PROPN, NUM, DET + # 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴 # https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265 # https://universaldependencies.org/u/pos/ TAG_MAP = { - # J.{1,2} 조사 - "JKS": {POS: ADP}, + # J.{1,2} 조사 + "JKS": {POS: ADP}, "JKC": {POS: ADP}, "JKG": {POS: ADP}, "JKO": {POS: ADP}, "JKB": {POS: ADP}, "JKV": {POS: ADP}, "JKQ": {POS: ADP}, - "JX": {POS: ADP}, # 보조사 + "JX": {POS: ADP}, # 보조사 "JC": {POS: CONJ}, # 접속 조사 - "MAJ": {POS: CONJ}, # 접속 부사 + "MAJ": {POS: CONJ}, # 접속 부사 "MAG": {POS: ADV}, # 일반 부사 - "MM": {POS: DET}, # 관형사 - + "MM": {POS: DET}, # 관형사 "XPN": {POS: X}, # 접두사 - # XS. 접미사 + # XS. 접미사 "XSN": {POS: X}, "XSV": {POS: X}, "XSA": {POS: X}, - "XR": {POS: X}, # 어근 + "XR": {POS: X}, # 어근 # E.{1,2} 어미 "EP": {POS: X}, "EF": {POS: X}, "EC": {POS: X}, "ETN": {POS: X}, "ETM": {POS: X}, - "IC": {POS: INTJ}, # 감탄사 - "VV": {POS: VERB}, # 동사 - "VA": {POS: ADJ}, # 형용사 - "VX": {POS: AUX}, # 보조 용언 + "VA": {POS: ADJ}, # 형용사 + "VX": {POS: AUX}, # 보조 용언 "VCP": {POS: ADP}, # 긍정 지정사(이다) "VCN": {POS: ADJ}, # 부정 지정사(아니다) - - "NNG": {POS: NOUN}, # 일반 명사(general noun) - "NNB": {POS: NOUN}, # 의존 명사 - "NNBC": {POS: NOUN}, # 의존 명사(단위: unit) - "NNP": {POS: PROPN}, # 고유 명사(proper noun) + "NNG": {POS: NOUN}, # 일반 명사(general noun) + "NNB": {POS: NOUN}, # 의존 명사 + "NNBC": {POS: NOUN}, # 의존 명사(단위: unit) + "NNP": {POS: PROPN}, # 고유 명사(proper noun) "NP": {POS: PRON}, # 대명사 - "NR": {POS: NUM}, # 수사(numerals) - "SN": {POS: NUM}, # 숫자 - + "NR": {POS: NUM}, # 수사(numerals) + "SN": {POS: NUM}, # 숫자 # S.{1,2} 부호 - # 문장 부호 - "SF": {POS: PUNCT}, # period or other EOS marker + # 문장 부호 + "SF": {POS: PUNCT}, # period or other EOS marker "SE": {POS: PUNCT}, - "SC": {POS: PUNCT}, # comma, etc. - "SSO": {POS: PUNCT}, # open bracket - "SSC": {POS: PUNCT}, # close bracket - "SY": {POS: SYM}, # 기타 기호 - "SL": {POS: X}, # 외국어 - "SH": {POS: X}, # 한자 + "SC": {POS: PUNCT}, # comma, etc. + "SSO": {POS: PUNCT}, # open bracket + "SSC": {POS: PUNCT}, # close bracket + "SY": {POS: SYM}, # 기타 기호 + "SL": {POS: X}, # 외국어 + "SH": {POS: X}, # 한자 } diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py index 67371d4ce..42c306c11 100644 --- a/spacy/tests/lang/ko/test_lemmatization.py +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -5,8 +5,7 @@ import pytest @pytest.mark.parametrize( - "word,lemma", - [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")], + "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")] ) def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): test_lemma = ko_tokenizer(word)[0].lemma_ diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index bd1d94aec..cc7b5fd77 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -7,15 +7,15 @@ import pytest TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."), ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 .")] -TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", +TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", "NNP NNG NNG JKB VV EC VX EF SF"), - ("영등포구에 있는 맛집 좀 알려주세요.", + ("영등포구에 있는 맛집 좀 알려주세요.", "NNP JKB VV ETM NNG MAG VV VX EP SF")] FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.", "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")] -POS_TESTS = [("서울 타워 근처에 살고 있습니다.", +POS_TESTS = [("서울 타워 근처에 살고 있습니다.", "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"), ("영등포구에 있는 맛집 좀 알려주세요.", "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")] diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py index d2550067b..cac32aa4d 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -5,16 +5,24 @@ import pytest def test_lt_tokenizer_handles_long_text(lt_tokenizer): - text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią -vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis -yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui.""" - tokens = lt_tokenizer(text.replace("\n", "")) + text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui.""" + tokens = lt_tokenizer(text) assert len(tokens) == 42 -@pytest.mark.parametrize('text,length', [ - ("177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", 15), - ("ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", 16)]) +@pytest.mark.parametrize( + "text,length", + [ + ( + "177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", + 15, + ), + ( + "ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", + 16, + ), + ], +) def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length): tokens = lt_tokenizer(text) assert len(tokens) == length @@ -26,18 +34,22 @@ def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text): assert len(tokens) == 1 -@pytest.mark.parametrize("text,match", [ - ("10", True), - ("1", True), - ("10,000", True), - ("10,00", True), - ("999.0", True), - ("vienas", True), - ("du", True), - ("milijardas", True), - ("šuo", False), - (",", False), - ("1/2", True)]) +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("999.0", True), + ("vienas", True), + ("du", True), + ("milijardas", True), + ("šuo", False), + (",", False), + ("1/2", True), + ], +) def test_lt_lex_attrs_like_number(lt_tokenizer, text, match): tokens = lt_tokenizer(text) assert len(tokens) == 1 diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 54ddd6789..013700d52 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -5,7 +5,6 @@ import pytest import re from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token -from ..util import get_doc @pytest.fixture @@ -288,24 +287,43 @@ def deps(): def dependency_matcher(en_vocab): def is_brown_yellow(text): return bool(re.compile(r"brown|yellow|over").match(text)) + IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow) pattern1 = [ {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},"PATTERN": {"ORTH": "quick", "DEP": "amod"}}, - {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, "PATTERN": {IS_BROWN_YELLOW: True}}, + { + "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, + "PATTERN": {"ORTH": "quick", "DEP": "amod"}, + }, + { + "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, + "PATTERN": {IS_BROWN_YELLOW: True}, + }, ] pattern2 = [ {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}} + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, ] pattern3 = [ {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, "PATTERN": {"ORTH": "brown"}} + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, + "PATTERN": {"ORTH": "brown"}, + }, ] matcher = DependencyMatcher(en_vocab) @@ -320,9 +338,9 @@ def test_dependency_matcher_compile(dependency_matcher): assert len(dependency_matcher) == 3 -def test_dependency_matcher(dependency_matcher, text, heads, deps): - doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) - matches = dependency_matcher(doc) - # assert matches[0][1] == [[3, 1, 2]] - # assert matches[1][1] == [[4, 3, 3]] - # assert matches[2][1] == [[4, 3, 2]] +# def test_dependency_matcher(dependency_matcher, text, heads, deps): +# doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) +# matches = dependency_matcher(doc) +# assert matches[0][1] == [[3, 1, 2]] +# assert matches[1][1] == [[4, 3, 3]] +# assert matches[2][1] == [[4, 3, 2]] diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py index ecc12afa3..6de373f11 100644 --- a/spacy/tests/regression/test_issue3880.py +++ b/spacy/tests/regression/test_issue3880.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals + from spacy.lang.en import English -import pytest def test_issue3880():