spaCy/spacy/ru/__init__.py

57 lines
1.6 KiB
Python

# encoding: utf8
from __future__ import unicode_literals, print_function
from ..language import Language
from ..attrs import LANG
from ..tokens import Doc
from .language_data import *
class RussianTokenizer(object):
    """Wrap a spaCy tokenizer and emit pymorphy2-normalized words.

    Every token produced by the wrapped tokenizer is reduced to its lemma
    (or to its raw text when the lemma is empty) and then replaced by the
    ``normal_form`` of the first parse pymorphy2 returns for that string.

    The pymorphy2 import and the analyzer construction live in the class
    body, so both happen once, when this class statement is executed; a
    missing pymorphy2 therefore surfaces as an ImportError at that point.
    """

    try:
        from pymorphy2 import MorphAnalyzer
    except ImportError:
        raise ImportError(
            "The Russian tokenizer requires the pymorphy2 library: "
            "try to fix it with "
            "pip install pymorphy2==0.8")

    # One analyzer instance, stored on the class and shared by all instances.
    _morph = MorphAnalyzer()

    def __init__(self, spacy_tokenizer, cls, nlp=None):
        """Remember the wrapped tokenizer and choose the vocabulary.

        spacy_tokenizer: callable applied to the input text in __call__.
        cls: object providing ``create_vocab``; consulted only when ``nlp``
            is falsy.
        nlp: optional object whose ``vocab`` attribute is reused when truthy.
        """
        if nlp:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)
        self._spacy_tokenizer = spacy_tokenizer

    def __call__(self, text):
        """Tokenize ``text`` and return a Doc built from normalized words.

        The Doc receives the vocabulary, the list of normalized words and,
        as third positional argument, a list holding one ``False`` per word.
        """
        normalized = []
        for token in self._spacy_tokenizer(text):
            surface = RussianTokenizer._get_word(token)
            normalized.append(self._normalize(surface))
        flags = [False] * len(normalized)
        return Doc(self.vocab, normalized, flags)

    @staticmethod
    def _get_word(token):
        """Return the token's lemma, or its text when the lemma is empty."""
        lemma = token.lemma_
        if len(lemma) == 0:
            return token.text
        return lemma

    @classmethod
    def _normalize(cls, word):
        """Return the ``normal_form`` of the analyzer's first parse of ``word``."""
        parses = cls._morph.parse(word)
        first_parse = parses[0]
        return first_parse.normal_form
class RussianDefaults(Language.Defaults):
    """Language defaults for Russian.

    Supplies the Russian stop words and tokenizer exceptions, a LANG
    attribute getter that always answers 'ru', and a tokenizer factory that
    wraps the inherited tokenizer in a RussianTokenizer.
    """

    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS

    # Work on a copy so the getters on Language.Defaults are left untouched.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update({LANG: lambda text: 'ru'})

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Build the inherited tokenizer and wrap it in a RussianTokenizer.

        nlp: optional object forwarded both to the parent factory and to
            the RussianTokenizer constructor.
        """
        return RussianTokenizer(
            super(RussianDefaults, cls).create_tokenizer(nlp),
            cls,
            nlp)
class Russian(Language):
    """Language subclass for Russian, configured through RussianDefaults."""

    Defaults = RussianDefaults
    lang = 'ru'