From 5faae803c6e6e26c7997b3476e9cec3825837c7d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 26 Feb 2018 09:39:46 +0100
Subject: [PATCH] Add option to not use Janome for Japanese tokenization

---
 examples/training/conllu.py |  1 +
 spacy/lang/ja/__init__.py   | 20 +++++++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/examples/training/conllu.py b/examples/training/conllu.py
index 605905361..ac4305d7b 100644
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@@ -29,6 +29,7 @@ import conll17_ud_eval
 
 import spacy.lang.zh
 spacy.lang.zh.Chinese.Defaults.use_jieba = False
+spacy.lang.ja.Japanese.Defaults.use_janome = False
 
 random.seed(0)
 numpy.random.seed(0)
diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 3b67c5489..8231b0be3 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
     def from_disk(self, path, **exclude):
         return self
 
+class JapaneseCharacterSegmenter(object):
+    def __init__(self, vocab):
+        self.vocab = vocab
+
+    def __call__(self, text):
+        words = []
+        spaces = []
+        doc = self.tokenizer(text)
+        for token in self.tokenizer(text):
+            words.extend(list(token.text))
+            spaces.extend([False]*len(token.text))
+            spaces[-1] = bool(token.whitespace_)
+        return Doc(self.vocab, words=words, spaces=spaces)
+
 class JapaneseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'ja'
+    use_janome = True
 
     @classmethod
     def create_tokenizer(cls, nlp=None):
-        return JapaneseTokenizer(cls, nlp)
+        if cls.use_janome:
+            return JapaneseTokenizer(cls, nlp)
+        else:
+            return JapaneseCharacterSegmenter(cls, nlp.vocab)
 
 
 class Japanese(Language):