mirror of https://github.com/explosion/spaCy.git
57 lines
1.6 KiB
Python
57 lines
1.6 KiB
Python
|
# encoding: utf8
|
||
|
from __future__ import unicode_literals, print_function
|
||
|
|
||
|
from ..language import Language
|
||
|
from ..attrs import LANG
|
||
|
from ..tokens import Doc
|
||
|
from .language_data import *
|
||
|
|
||
|
|
||
|
class RussianTokenizer(object):
    """Callable that tokenizes text and replaces each token by its normal form.

    The wrapped spaCy tokenizer splits the text; each resulting token is
    reduced to its lemma (or its raw text when the lemma is empty) and then
    passed through pymorphy2 to obtain the ``normal_form`` of the first parse.
    """

    # pymorphy2 is imported inside the class body, so a missing installation
    # is reported as soon as this class statement is executed.
    try:
        from pymorphy2 import MorphAnalyzer
    except ImportError:
        raise ImportError(
            "The Russian tokenizer requires the pymorphy2 library: "
            "try to fix it with "
            "pip install pymorphy2==0.8")

    # A single analyzer stored on the class and reused by every instance.
    _morph = MorphAnalyzer()

    def __init__(self, spacy_tokenizer, cls, nlp=None):
        """Store the wrapped tokenizer and pick the vocabulary to use.

        spacy_tokenizer: callable that turns a text into an iterable of tokens.
        cls: object providing ``create_vocab(nlp)``; used when ``nlp`` is falsy.
        nlp: optional pipeline object whose ``vocab`` is reused when truthy.
        """
        if nlp:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)
        self._spacy_tokenizer = spacy_tokenizer

    def __call__(self, text):
        """Tokenize ``text`` and return a ``Doc`` built from normalized words."""
        normalized = []
        for tok in self._spacy_tokenizer(text):
            base_form = RussianTokenizer._get_word(tok)
            normalized.append(self._normalize(base_form))

        # NOTE(review): the third argument is all-False for every word;
        # presumably these are the per-token trailing-space flags, in which
        # case the original spacing is not kept -- confirm against Doc.
        no_spaces = [False] * len(normalized)
        return Doc(self.vocab, normalized, no_spaces)

    @staticmethod
    def _get_word(token):
        """Return the token's lemma, or its text when the lemma is empty."""
        if len(token.lemma_) > 0:
            return token.lemma_
        return token.text

    @classmethod
    def _normalize(cls, word):
        """Return the ``normal_form`` of the first pymorphy2 parse of ``word``."""
        first_parse = cls._morph.parse(word)[0]
        return first_parse.normal_form
|
||
|
|
||
|
|
||
|
class RussianDefaults(Language.Defaults):
    """Language defaults for Russian: data tables and tokenizer factory."""

    # Data tables pulled in by the star import from .language_data.
    stop_words = STOP_WORDS
    tokenizer_exceptions = TOKENIZER_EXCEPTIONS

    # Work on a copy so the base class' getter table is left untouched,
    # then make the LANG attribute always resolve to 'ru'.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ru'

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Build the base tokenizer and wrap it in a ``RussianTokenizer``."""
        base_tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp)
        return RussianTokenizer(base_tokenizer, cls, nlp)
|
||
|
|
||
|
|
||
|
class Russian(Language):
    """Russian language class: binds the 'ru' code to ``RussianDefaults``."""

    Defaults = RussianDefaults
    lang = 'ru'
|