mirror of https://github.com/explosion/spaCy.git
Add option to not use Janome for Japanese tokenization
This commit is contained in:
parent
9b406181cd
commit
5faae803c6
|
@ -29,6 +29,7 @@ import conll17_ud_eval
|
|||
import spacy.lang.zh
import spacy.lang.ja

# Switch off the optional segmenter flags on the Chinese and Japanese language
# defaults. The Japanese flag lives on the ``Japanese`` class of
# ``spacy.lang.ja``; the previous line addressed ``spacy.lang.ja.Chinese``,
# a copy-paste of the line above it.
spacy.lang.zh.Chinese.Defaults.use_jieba = False
spacy.lang.ja.Japanese.Defaults.use_janome = False

# Seed both random number generators with a fixed value.
random.seed(0)
numpy.random.seed(0)
|
||||
|
|
|
@ -35,14 +35,32 @@ class JapaneseTokenizer(object):
|
|||
def from_disk(self, path, **exclude):
    """Return ``self`` unchanged.

    ``path`` and any ``exclude`` keyword arguments are accepted but not used:
    nothing is read from disk.
    """
    return self
|
||||
|
||||
class JapaneseCharacterSegmenter(object):
    """Tokenizer replacement that emits one token per character.

    The text is first split by ``self.tokenizer``; every token it yields is
    then broken up into its individual characters. Only the last character of
    each token carries that token's trailing-whitespace flag.
    """

    def __init__(self, vocab, tokenizer=None):
        """Create the segmenter.

        vocab: Passed through to ``Doc`` when the result is built.
        tokenizer: Optional callable taking the text and returning an
            iterable of tokens that expose ``text`` and ``whitespace_``.
            It may also be assigned to ``self.tokenizer`` after construction.
        """
        self.vocab = vocab
        # Previously this attribute was never set anywhere in the class, so
        # every call failed with AttributeError.
        self.tokenizer = tokenizer

    def __call__(self, text):
        """Segment ``text`` into single-character tokens.

        text: The string to segment.
        RETURNS: A ``Doc`` built from the per-character words and spaces.
        RAISES: ValueError if no tokenizer has been provided.
        """
        if self.tokenizer is None:
            raise ValueError(
                "JapaneseCharacterSegmenter needs a tokenizer to pre-split "
                "the text; pass one to __init__ or set the 'tokenizer' "
                "attribute before calling it.")
        words = []
        spaces = []
        # Tokenize once; the text used to be tokenized twice and the first
        # result was thrown away.
        for token in self.tokenizer(text):
            chars = list(token.text)
            if not chars:
                # An empty token contributes no characters; skipping it keeps
                # spaces[-1] from touching the previous token's flag (or
                # raising IndexError when nothing has been emitted yet).
                continue
            words.extend(chars)
            spaces.extend([False] * len(chars))
            spaces[-1] = bool(token.whitespace_)
        return Doc(self.vocab, words=words, spaces=spaces)
|
||||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
    """Language defaults for Japanese.

    ``use_janome`` selects which tokenizer ``create_tokenizer`` builds.
    """
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ja'
    # True: build a JapaneseTokenizer. False: build a
    # JapaneseCharacterSegmenter (one token per character) instead.
    use_janome = True

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Build the tokenizer selected by ``cls.use_janome``.

        nlp: Optional language object; its ``vocab`` is used for the
            character segmenter when given.
        RETURNS: A ``JapaneseTokenizer`` when ``use_janome`` is true,
            otherwise a ``JapaneseCharacterSegmenter``.
        """
        # The unconditional early return that used to sit here made the
        # use_janome switch below unreachable.
        if cls.use_janome:
            return JapaneseTokenizer(cls, nlp)
        # NOTE(review): assumes Language.Defaults provides a create_tokenizer
        # classmethod returning a callable tokenizer with a ``vocab``
        # attribute -- confirm against the base class. It supplies the
        # pre-segmentation the character segmenter iterates over.
        presegmenter = super(JapaneseDefaults, cls).create_tokenizer(nlp)
        # nlp defaults to None, so do not dereference it unconditionally.
        vocab = nlp.vocab if nlp is not None else presegmenter.vocab
        # The segmenter's constructor takes the vocab only; the tokenizer is
        # attached as the attribute its __call__ reads.
        segmenter = JapaneseCharacterSegmenter(vocab)
        segmenter.tokenizer = presegmenter
        return segmenter
|
||||
|
||||
|
||||
class Japanese(Language):
|
||||
|
|
Loading…
Reference in New Issue