Add draft Jieba tokenizer for Chinese

Matthew Honnibal 2016-11-02 19:57:38 +01:00
parent f7fee6c24b
commit 5363224395
1 changed file with 6 additions and 24 deletions


@@ -1,30 +1,12 @@
+import jieba
+
 from ..language import Language
-from ..tokenizer import Tokenizer
-from ..tagger import Tagger
-
-
-class CharacterTokenizer(Tokenizer):
-    def __call__(self, text):
-        return self.tokens_from_list(list(text))
+from ..tokens import Doc
 
 
 class Chinese(Language):
     lang = u'zh'
 
-    def __call__(self, text):
-        doc = self.tokenizer.tokens_from_list(list(text))
-        self.tagger(doc)
-        self.merge_characters(doc)
-        return doc
-
-    def merge_characters(self, doc):
-        start = 0
-        chunks = []
-        for token in doc:
-            if token.tag_ != 'CHAR':
-                chunk = doc[start : token.i + 1]
-                chunks.append(chunk)
-                start = token.i + 1
-        text = doc.text
-        for chunk in chunks:
-            chunk.merge(chunk[-1].tag_, chunk.text, u'')
+    def make_doc(self, text):
+        words = list(jieba.cut(text, cut_all=True))
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
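
For context, a minimal sketch of what the new make_doc path does, assuming jieba and spaCy are installed. The example sentence and the standalone Vocab are illustrative only and not part of the commit (the commit uses the Language instance's own self.vocab):

import jieba
from spacy.vocab import Vocab
from spacy.tokens import Doc

# Jieba's "full mode" (cut_all=True) yields segments in order; they may overlap,
# and the draft tokenizer simply takes them as the Doc's words.
text = u'我来到北京清华大学'
words = list(jieba.cut(text, cut_all=True))

# Chinese text has no whitespace between tokens, hence spaces=[False] * len(words).
doc = Doc(Vocab(), words=words, spaces=[False] * len(words))
print([t.text for t in doc])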