From 9b406181cdc7061e9c5545fdc4d3dfa599650b42 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 25 Feb 2018 15:12:38 +0100
Subject: [PATCH] Add Chinese.Defaults.use_jieba setting, for UD

---
 examples/training/conllu.py |  4 ++++
 spacy/lang/zh/__init__.py   | 28 ++++++++++++++++++++--------
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/examples/training/conllu.py b/examples/training/conllu.py
index f7c9b5fef..605905361 100644
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@@ -26,6 +26,10 @@ import cytoolz
 
 import conll17_ud_eval
 
+import spacy.lang.zh
+
+spacy.lang.zh.Chinese.Defaults.use_jieba = False
+
 random.seed(0)
 numpy.random.seed(0)
 
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index a2a2dcacd..bdf739fd7 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -9,6 +9,7 @@ from ...tokens import Doc
 class ChineseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'zh' # for pickling
+    use_jieba = True
 
 
 class Chinese(Language):
@@ -16,14 +17,25 @@ class Chinese(Language):
     Defaults = ChineseDefaults # override defaults
 
     def make_doc(self, text):
-        try:
-            import jieba
-        except ImportError:
-            raise ImportError("The Chinese tokenizer requires the Jieba library: "
-                              "https://github.com/fxsjy/jieba")
-        words = list(jieba.cut(text, cut_all=False))
-        words = [x for x in words if x]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        if self.Defaults.use_jieba:
+            try:
+                import jieba
+            except ImportError:
+                msg = ("Jieba not installed. Either set Chinese.use_jieba = False, "
+                       "or install it https://github.com/fxsjy/jieba")
+                raise ImportError(msg)
+            words = list(jieba.cut(text, cut_all=False))
+            words = [x for x in words if x]
+            return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        else:
+            words = []
+            spaces = []
+            doc = self.tokenizer(text)
+            for token in self.tokenizer(text):
+                words.extend(list(token.text))
+                spaces.extend([False]*len(token.text))
+                spaces[-1] = bool(token.whitespace_)
+            return Doc(self.vocab, words=words, spaces=spaces)
 
 
 __all__ = ['Chinese']
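
A minimal usage sketch of the new setting (a hedged example, not part of the patch: it assumes the patch is applied and spaCy importable; jieba is only needed while use_jieba stays at its default of True, and the sample text and output are illustrative):

    import spacy.lang.zh

    # Turn off jieba so make_doc() runs the default tokenizer and then splits
    # each token into single characters, as the UD training script above does.
    spacy.lang.zh.Chinese.Defaults.use_jieba = False

    nlp = spacy.lang.zh.Chinese()
    doc = nlp.make_doc('我爱北京')
    # With use_jieba = False this should yield one token per character,
    # e.g. ['我', '爱', '北', '京'].
    print([t.text for t in doc])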