From 5363224395b26528465417ff550d6a2163cbe8e6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 2 Nov 2016 19:57:38 +0100
Subject: [PATCH] Add draft Jieba tokenizer for Chinese

---
 spacy/zh/__init__.py | 30 ++++++------------------------
 1 file changed, 6 insertions(+), 24 deletions(-)

diff --git a/spacy/zh/__init__.py b/spacy/zh/__init__.py
index 243d8525c..3f4e36207 100644
--- a/spacy/zh/__init__.py
+++ b/spacy/zh/__init__.py
@@ -1,30 +1,12 @@
+import jieba
+
 from ..language import Language
-from ..tokenizer import Tokenizer
-from ..tagger import Tagger
-
-
-class CharacterTokenizer(Tokenizer):
-    def __call__(self, text):
-        return self.tokens_from_list(list(text))
+from ..tokens import Doc


 class Chinese(Language):
     lang = u'zh'

-    def __call__(self, text):
-        doc = self.tokenizer.tokens_from_list(list(text))
-        self.tagger(doc)
-        self.merge_characters(doc)
-        return doc
-
-    def merge_characters(self, doc):
-        start = 0
-        chunks = []
-        for token in doc:
-            if token.tag_ != 'CHAR':
-                chunk = doc[start : token.i + 1]
-                chunks.append(chunk)
-                start = token.i + 1
-        text = doc.text
-        for chunk in chunks:
-            chunk.merge(chunk[-1].tag_, chunk.text, u'')
+    def make_doc(self, text):
+        words = list(jieba.cut(text, cut_all=True))
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
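
For reference only (not part of the patch): a minimal standalone sketch of the jieba call that the new make_doc relies on. It assumes jieba is installed; the exact segmentation depends on jieba's bundled dictionary, and full mode (cut_all=True) can emit overlapping segments, so the resulting words may not concatenate back to the original text.

# -*- coding: utf-8 -*-
# Standalone sketch of the segmentation step used by make_doc in the patch.
# Assumption: jieba is installed; output depends on jieba's dictionary.
import jieba

# Full mode returns every dictionary word it finds, so segments can overlap.
words = list(jieba.cut(u'我来到北京清华大学', cut_all=True))
print(u'/'.join(words))  # e.g. 我/来到/北京/清华/清华大学/华大/大学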