mirror of https://github.com/explosion/spaCy.git
Add draft Jieba tokenizer for Chinese
parent f7fee6c24b
commit 5363224395
@@ -1,30 +1,12 @@
+import jieba
+
 from ..language import Language
-from ..tokenizer import Tokenizer
-from ..tagger import Tagger
+from ..tokens import Doc
 
 
-class CharacterTokenizer(Tokenizer):
-    def __call__(self, text):
-        return self.tokens_from_list(list(text))
-
-
 class Chinese(Language):
     lang = u'zh'
 
-    def __call__(self, text):
-        doc = self.tokenizer.tokens_from_list(list(text))
-        self.tagger(doc)
-        self.merge_characters(doc)
-        return doc
-
-    def merge_characters(self, doc):
-        start = 0
-        chunks = []
-        for token in doc:
-            if token.tag_ != 'CHAR':
-                chunk = doc[start : token.i + 1]
-                chunks.append(chunk)
-                start = token.i + 1
-        text = doc.text
-        for chunk in chunks:
-            chunk.merge(chunk[-1].tag_, chunk.text, u'')
+    def make_doc(self, text):
+        words = list(jieba.cut(text, cut_all=True))
+        return Doc(self.vocab, words=words, spaces=[False]*len(words))
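For context (not part of the commit): a minimal standalone sketch of what the new make_doc produces, assuming jieba and spaCy are installed. The sample sentence is the one used in jieba's README; cut_all=True is jieba's "full mode", which yields every dictionary word it finds, including overlapping segments, and the spaces flags are all False because written Chinese has no whitespace between words.

import jieba
from spacy.vocab import Vocab
from spacy.tokens import Doc

text = u'我来到北京清华大学'
# Full mode: emit every dictionary word jieba can find, so segments may overlap.
words = list(jieba.cut(text, cut_all=True))
# No trailing whitespace after any token in Chinese text.
doc = Doc(Vocab(), words=words, spaces=[False] * len(words))
print([t.text for t in doc])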