Fix whitespace

Ines Montani 2019-03-06 14:20:34 +01:00
parent 48a206a95f
commit 85deb96278
1 changed file with 0 additions and 11 deletions

@@ -38,24 +38,20 @@ def resolve_pos(token):
    in the sentence. This function adds information to the POS tag to
    resolve ambiguous mappings.
    """
    # TODO: This is a first take. The rules here are crude approximations.
    # For many of these, full dependencies are needed to properly resolve
    # PoS mappings.
    if token.pos == "連体詞,*,*,*":
        if re.match(r"[こそあど此其彼]の", token.surface):
            return token.pos + ",DET"
        if re.match(r"[こそあど此其彼]", token.surface):
            return token.pos + ",PRON"
        return token.pos + ",ADJ"
    return token.pos


def detailed_tokens(tokenizer, text):
    """Format Mecab output into a nice data structure, based on Janome."""
    node = tokenizer.parseToNode(text)
    node = node.next  # first node is beginning of sentence and empty, skip it
    words = []
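For context, resolve_pos only touches the ambiguous 連体詞 (pre-noun adjectival) tag: demonstratives followed by の map to DET, bare demonstrative stems to PRON, and everything else to ADJ. A minimal sketch of that behaviour with a hypothetical SimpleToken stand-in (the real inputs are the ShortUnitWord tuples built by detailed_tokens below, and the import assumes this file is the spacy.lang.ja module):

    # SimpleToken is illustrative only; it just mimics the .pos / .surface fields.
    from collections import namedtuple
    from spacy.lang.ja import resolve_pos  # assumption: this diff is the ja language module

    SimpleToken = namedtuple("SimpleToken", ["pos", "surface"])

    resolve_pos(SimpleToken("連体詞,*,*,*", "この"))    # -> "連体詞,*,*,*,DET"
    resolve_pos(SimpleToken("連体詞,*,*,*", "こんな"))  # -> "連体詞,*,*,*,PRON"
    resolve_pos(SimpleToken("連体詞,*,*,*", "大きな"))  # -> "連体詞,*,*,*,ADJ"
    resolve_pos(SimpleToken("動詞,自立,*,*", "走る"))   # -> "動詞,自立,*,*" (unchanged)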
@@ -64,12 +60,10 @@ def detailed_tokens(tokenizer, text):
        base = surface  # a default value. Updated if available later.
        parts = node.feature.split(",")
        pos = ",".join(parts[0:4])
        if len(parts) > 7:
            # this information is only available for words in the tokenizer
            # dictionary
            base = parts[7]
        words.append(ShortUnitWord(surface, base, pos))
        node = node.next
    return words
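The base-form lookup above simply slices MeCab's comma-separated feature string; how many fields there are, and what they contain, depends on the dictionary. A small sketch of the same slicing on a placeholder feature string (the values are illustrative, not real dictionary output):

    # Placeholder feature string; the real field layout depends on the MeCab dictionary.
    feature = "名詞,一般,*,*,*,*,*,ねこ,ネコ"
    surface = "ねこ"

    parts = feature.split(",")
    pos = ",".join(parts[0:4])                      # "名詞,一般,*,*"
    base = parts[7] if len(parts) > 7 else surface  # base form if available, else surface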
@@ -78,29 +72,24 @@ def detailed_tokens(tokenizer, text):
class JapaneseTokenizer(DummyTokenizer):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        self.tokenizer = try_mecab_import().Tagger()
        self.tokenizer.parseToNode("")  # see #2901

    def __call__(self, text):
        dtokens = detailed_tokens(self.tokenizer, text)
        words = [x.surface for x in dtokens]
        spaces = [False] * len(words)
        doc = Doc(self.vocab, words=words, spaces=spaces)
        for token, dtoken in zip(doc, dtokens):
            token._.mecab_tag = dtoken.pos
            token.tag_ = resolve_pos(dtoken)
            token.lemma_ = dtoken.lemma
        return doc


class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda _text: "ja"
    tag_map = TAG_MAP

    @classmethod
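Assuming mecab-python3 and a MeCab dictionary are installed, the tokenizer touched by this commit would typically be exercised through a blank Japanese pipeline. A sketch (the exact tags and lemmas depend on the dictionary in use):

    # Sketch only: requires mecab-python3 plus a MeCab dictionary.
    import spacy

    nlp = spacy.blank("ja")
    doc = nlp("これは日本語の文です。")
    for token in doc:
        # tag_ comes from resolve_pos, lemma_ from the MeCab base form, and the
        # mecab_tag extension (set in __call__ above) holds the raw 4-field POS string.
        print(token.text, token.tag_, token.lemma_, token._.mecab_tag)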