mirror of https://github.com/explosion/spaCy.git
Before this patch, half-width spaces between words were simply lost in Japanese text. This wasn't immediately noticeable because much Japanese text never uses spaces at all.
parent 3c3658ef9f
commit 29a9e636eb
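For context, a minimal usage sketch of the behaviour this patch targets (not part of the commit; it assumes spaCy with this patch applied plus MeCab and a Japanese dictionary installed, which the ja tokenizer requires). After the change, the extra half-width spaces survive tokenization, as the new test at the bottom asserts:

from spacy.lang.ja import Japanese

nlp = Japanese()
# note: three spaces after "I", mirroring the new test_extra_spaces below
doc = nlp("I   like cheese.")
print([t.orth_ for t in doc])        # the second and third tokens should be single spaces
print([t.whitespace_ for t in doc])  # trailing whitespace now comes from the spaces list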
@@ -37,6 +37,11 @@ def resolve_pos(token):
     in the sentence. This function adds information to the POS tag to
     resolve ambiguous mappings.
     """
+
+    # this is only used for consecutive ascii spaces
+    if token.pos == '空白':
+        return '空白'
+
     # TODO: This is a first take. The rules here are crude approximations.
     # For many of these, full dependencies are needed to properly resolve
     # PoS mappings.
@@ -54,6 +59,7 @@ def detailed_tokens(tokenizer, text):
     node = tokenizer.parseToNode(text)
     node = node.next # first node is beginning of sentence and empty, skip it
     words = []
+    spaces = []
     while node.posid != 0:
         surface = node.surface
         base = surface # a default value. Updated if available later.
@@ -64,8 +70,20 @@ def detailed_tokens(tokenizer, text):
             # dictionary
             base = parts[7]
         words.append(ShortUnitWord(surface, base, pos))
+
+        # The way MeCab stores spaces is that the rlength of the next token is
+        # the length of that token plus any preceding whitespace, **in bytes**.
+        # also note that this is only for half-width / ascii spaces. Full width
+        # spaces just become tokens.
+        scount = node.next.rlength - node.next.length
+        spaces.append(bool(scount))
+        while scount > 1:
+            words.append(ShortUnitWord(' ', ' ', '空白'))
+            spaces.append(False)
+            scount -= 1
+
         node = node.next
-    return words
+    return words, spaces


 class JapaneseTokenizer(DummyTokenizer):
@@ -75,9 +93,8 @@ class JapaneseTokenizer(DummyTokenizer):
         self.tokenizer.parseToNode("") # see #2901

     def __call__(self, text):
-        dtokens = detailed_tokens(self.tokenizer, text)
+        dtokens, spaces = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
         mecab_tags = []
         for token, dtoken in zip(doc, dtokens):
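To illustrate the rlength/length arithmetic the comment above describes, here is a small standalone sketch against the mecab-python3 bindings that the tokenizer wraps. It is an illustration only (assumes MeCab and a dictionary are installed); the node attributes are the same ones detailed_tokens() reads:

import MeCab

tagger = MeCab.Tagger()
tagger.parseToNode("")  # initial call, as in JapaneseTokenizer.__init__ (see #2901)

node = tagger.parseToNode("a  b")  # two ascii spaces between the words
node = node.next  # skip the empty beginning-of-sentence node
while node.posid != 0:
    # rlength covers the surface plus any preceding whitespace, in bytes,
    # so the difference is the number of space bytes before this token
    preceding = node.rlength - node.length
    print(repr(node.surface), "preceded by", preceding, "space byte(s)")
    node = node.next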
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals

 from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET
+from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE


 TAG_MAP = {
@@ -21,6 +21,8 @@ TAG_MAP = {
     "感動詞,一般,*,*": {POS: INTJ},
     # this is specifically for unicode full-width space
     "空白,*,*,*": {POS: X},
+    # This is used when sequential half-width spaces are present
+    "空白": {POS: SPACE},
     "形状詞,一般,*,*": {POS: ADJ},
     "形状詞,タリ,*,*": {POS: ADJ},
     "形状詞,助動詞語幹,*,*": {POS: ADJ},
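How the two pieces fit together, as a condensed sketch (simplified, not spaCy's actual lookup code): resolve_pos() now returns the bare 空白 tag for runs of ascii spaces, and the new bare "空白" entry in TAG_MAP maps that tag to SPACE:

from spacy.symbols import POS, SPACE, X

# condensed fragment of the tag map above
TAG_MAP = {
    "空白,*,*,*": {POS: X},   # full-width space token produced by MeCab
    "空白": {POS: SPACE},      # extra half-width space tokens from detailed_tokens()
}

def resolve_pos(pos):
    # simplified stand-in for the early return added to resolve_pos() above,
    # which checks the token's MeCab tag rather than a plain string
    if pos == '空白':
        return '空白'
    return pos  # the real function appends disambiguating context here

assert TAG_MAP[resolve_pos('空白')] == {POS: SPACE}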
@@ -47,3 +47,9 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
+
+def test_extra_spaces(ja_tokenizer):
+    # note: three spaces after "I"
+    tokens = ja_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == ' '
+    assert tokens[2].orth_ == ' '