diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index e18d59a4c..5570b1cef 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -38,24 +38,20 @@ def resolve_pos(token): in the sentence. This function adds information to the POS tag to resolve ambiguous mappings. """ - # TODO: This is a first take. The rules here are crude approximations. # For many of these, full dependencies are needed to properly resolve # PoS mappings. - if token.pos == "連体詞,*,*,*": if re.match(r"[こそあど此其彼]の", token.surface): return token.pos + ",DET" if re.match(r"[こそあど此其彼]", token.surface): return token.pos + ",PRON" return token.pos + ",ADJ" - return token.pos def detailed_tokens(tokenizer, text): """Format Mecab output into a nice data structure, based on Janome.""" - node = tokenizer.parseToNode(text) node = node.next # first node is beginning of sentence and empty, skip it words = [] @@ -64,12 +60,10 @@ def detailed_tokens(tokenizer, text): base = surface # a default value. Updated if available later. parts = node.feature.split(",") pos = ",".join(parts[0:4]) - if len(parts) > 7: # this information is only available for words in the tokenizer # dictionary base = parts[7] - words.append(ShortUnitWord(surface, base, pos)) node = node.next return words @@ -78,29 +72,24 @@ def detailed_tokens(tokenizer, text): class JapaneseTokenizer(DummyTokenizer): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) - self.tokenizer = try_mecab_import().Tagger() self.tokenizer.parseToNode("") # see #2901 def __call__(self, text): dtokens = detailed_tokens(self.tokenizer, text) - words = [x.surface for x in dtokens] spaces = [False] * len(words) doc = Doc(self.vocab, words=words, spaces=spaces) - for token, dtoken in zip(doc, dtokens): token._.mecab_tag = dtoken.pos token.tag_ = resolve_pos(dtoken) token.lemma_ = dtoken.lemma - return doc class JapaneseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda _text: "ja" - tag_map = TAG_MAP @classmethod