updated Russian tokenizer

Moved the attempt to import pymorphy2 into __init__
This commit is contained in:
yuukos 2017-10-13 13:57:29 +07:00 committed by Vadim Mazaev
parent 3aad66cf00
commit 7401152289
1 changed file with 17 additions and 9 deletions

View File

@ -8,6 +8,9 @@ from .language_data import *
class RussianTokenizer(object): class RussianTokenizer(object):
_morph = None
def __init__(self, spacy_tokenizer, cls, nlp=None):
try: try:
from pymorphy2 import MorphAnalyzer from pymorphy2 import MorphAnalyzer
except ImportError: except ImportError:
@ -16,9 +19,8 @@ class RussianTokenizer(object):
"try to fix it with " "try to fix it with "
"pip install pymorphy2==0.8") "pip install pymorphy2==0.8")
_morph = MorphAnalyzer() RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer)
def __init__(self, spacy_tokenizer, cls, nlp=None):
self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp) self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp)
self._spacy_tokenizer = spacy_tokenizer self._spacy_tokenizer = spacy_tokenizer
@ -36,6 +38,12 @@ class RussianTokenizer(object):
def _normalize(cls, word): def _normalize(cls, word):
return cls._morph.parse(word)[0].normal_form return cls._morph.parse(word)[0].normal_form
@classmethod
def _create_morph(cls, morph_analyzer_class):
if not cls._morph:
cls._morph = morph_analyzer_class()
return cls._morph
class RussianDefaults(Language.Defaults): class RussianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters = dict(Language.Defaults.lex_attr_getters)