2016-04-24 16:44:24 +00:00
|
|
|
from ..language import Language
|
2016-05-05 09:39:12 +00:00
|
|
|
from ..tokenizer import Tokenizer
|
|
|
|
from ..tagger import Tagger
|
|
|
|
|
|
|
|
|
|
|
|
class CharacterTokenizer(Tokenizer):
    """Tokenizer that emits one token per character of the input."""

    def __call__(self, text):
        """Split ``text`` into its individual characters and tokenize them.

        The characters are handed to ``self.tokens_from_list`` and its
        result is returned unchanged.
        """
        characters = list(text)
        return self.tokens_from_list(characters)
|
2016-04-24 16:44:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Chinese(Language):
    """Chinese pipeline: split into characters, tag, then merge characters.

    The text is first split into one token per character. After tagging,
    runs of characters are merged into larger tokens based on their tags
    (see ``merge_characters``).
    """
    lang = u'zh'

    def __call__(self, text):
        """Process ``text`` and return the resulting document.

        Steps, in order:
          1. build a document with one token per character of ``text``;
          2. run ``self.tagger`` on that document;
          3. merge character tokens into chunks via ``merge_characters``.
        """
        doc = self.tokenizer.tokens_from_list(list(text))
        self.tagger(doc)
        self.merge_characters(doc)
        return doc

    def merge_characters(self, doc):
        """Merge runs of character tokens in ``doc`` into single tokens.

        A token whose ``tag_`` is not ``'CHAR'`` closes the current chunk:
        the chunk spans from the token after the previous chunk's end up to
        and including that token. Tokens tagged ``'CHAR'`` that follow the
        last non-``'CHAR'`` token are not part of any chunk and are left
        as they are.

        Each chunk is merged by calling ``chunk.merge`` with the tag of its
        last token, the chunk's text, and an empty string. The document is
        modified in place; nothing is returned.
        """
        start = 0
        chunks = []
        # Collect every span before merging anything, so the document is not
        # altered while it is being iterated.
        for token in doc:
            if token.tag_ != 'CHAR':
                chunks.append(doc[start : token.i + 1])
                start = token.i + 1
        for chunk in chunks:
            chunk.merge(chunk[-1].tag_, chunk.text, u'')
|