mirror of https://github.com/explosion/spaCy.git
Updated Russian Language, added lemmatizer, norm exceptions and lex
attrs
This commit is contained in:
parent
a0739a06d4
commit
52ee1f9bf9
|
@ -1,64 +1,36 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals, print_function
|
||||
|
||||
from ..language import Language
|
||||
from ..attrs import LANG
|
||||
from ..tokens import Doc
|
||||
from .language_data import *
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
from .norm_exceptions import NORM_EXCEPTIONS
|
||||
from .lex_attrs import LEX_ATTRS
|
||||
from .lemmatizer import RussianLemmatizer
|
||||
|
||||
|
||||
class RussianTokenizer(object):
    """Tokenizer wrapper that emits pymorphy2 normal forms instead of raw tokens.

    The wrapped spaCy tokenizer splits the text; every resulting token is then
    replaced by the ``normal_form`` of its first pymorphy2 parse and the words
    are packed into a new ``Doc``.
    """

    # Analyzer shared by all instances; filled in by the first constructor call.
    _morph = None

    def __init__(self, spacy_tokenizer, cls, nlp=None):
        """Set up the wrapper.

        spacy_tokenizer: callable that turns a text into an iterable of tokens.
        cls: object providing ``create_vocab(nlp)``; used only when ``nlp`` is falsy.
        nlp: optional object whose ``vocab`` attribute is reused.

        Raises ImportError when pymorphy2 is not installed.
        """
        try:
            from pymorphy2 import MorphAnalyzer
        except ImportError:
            raise ImportError(
                "The Russian tokenizer requires the pymorphy2 library: "
                "try to fix it with "
                "pip install pymorphy2==0.8")

        RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer)

        if nlp:
            self.vocab = nlp.vocab
        else:
            self.vocab = cls.create_vocab(nlp)
        self._spacy_tokenizer = spacy_tokenizer

    def __call__(self, text):
        """Tokenize ``text`` and return a Doc of normalized words (no trailing spaces)."""
        normalized = []
        for token in self._spacy_tokenizer(text):
            normalized.append(self._normalize(RussianTokenizer._get_word(token)))

        no_spaces = [False] * len(normalized)
        return Doc(self.vocab, normalized, no_spaces)

    @staticmethod
    def _get_word(token):
        """Return the token's lemma when it is non-empty, otherwise its text."""
        if len(token.lemma_) > 0:
            return token.lemma_
        return token.text

    @classmethod
    def _normalize(cls, word):
        """Return the normal form of the first parse the shared analyzer gives for ``word``."""
        first_parse = cls._morph.parse(word)[0]
        return first_parse.normal_form

    @classmethod
    def _create_morph(cls, morph_analyzer_class):
        """Return the shared analyzer, instantiating it first if none is set yet."""
        if cls._morph:
            return cls._morph
        cls._morph = morph_analyzer_class()
        return cls._morph
|
||||
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||||
from ..norm_exceptions import BASE_NORMS
|
||||
from ...util import update_exc, add_lookups
|
||||
from ...language import Language
|
||||
from ...attrs import LANG, LIKE_NUM, NORM
|
||||
|
||||
|
||||
class RussianDefaults(Language.Defaults):
    """Default settings for the Russian language class.

    Starts from a copy of the base lexical attribute getters, adds the Russian
    ``LEX_ATTRS``, pins ``LANG`` to ``'ru'`` and wraps the base ``NORM`` getter
    with ``add_lookups`` over ``BASE_NORMS`` and ``NORM_EXCEPTIONS``.
    """
    # Copy first so the updates below never touch the base class' dict.
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'ru'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                         BASE_NORMS, NORM_EXCEPTIONS)

    # The earlier plain ``tokenizer_exceptions = TOKENIZER_EXCEPTIONS`` was a
    # dead store: it was overwritten by this assignment before ever being read.
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Build the base tokenizer and wrap it in a RussianTokenizer."""
        tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp)
        return RussianTokenizer(tokenizer, cls, nlp)

    # Needs its own decorator: a decorator binds only to the definition that
    # directly follows it, so without this line ``cls`` would receive whatever
    # the caller passed as ``nlp``.
    @classmethod
    def create_lemmatizer(cls, nlp=None):
        """Return a new RussianLemmatizer; ``nlp`` is accepted but not used."""
        return RussianLemmatizer()
|
||||
|
||||
|
||||
class Russian(Language):
    """Russian language class: code ``'ru'``, configured by ``RussianDefaults``."""
    Defaults = RussianDefaults
    lang = 'ru'
|
||||
|
||||
|
||||
__all__ = ['Russian']
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .. import language_data as base
|
||||
from ..language_data import update_exc, strings_to_exc
|
||||
|
||||
from .stop_words import STOP_WORDS
|
||||
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
|
||||
|
||||
|
||||
STOP_WORDS = set(STOP_WORDS)
|
||||
TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
|
||||
|
||||
|
||||
update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
|
||||
|
||||
|
||||
__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]
|
|
@ -0,0 +1,232 @@
|
|||
# coding: utf8
|
||||
from ...symbols import (
|
||||
ADJ, DET, NOUN, NUM, PRON, PROPN, VERB
|
||||
)
|
||||
from ...lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
class RussianLemmatizer(Lemmatizer):
    """Lemmatizer for Russian that delegates morphological analysis to pymorphy2.

    Candidate analyses from pymorphy2 are filtered by the requested universal
    POS tag and, when given, by morphological features; the normal forms of
    the surviving analyses are returned as lemmas.
    """

    # Analyzer shared by every instance; created by the first constructor call.
    _morph = None

    def __init__(self):
        """Initialise the base lemmatizer and make sure the shared analyzer exists.

        Raises ImportError when pymorphy2 is not installed.
        """
        # Explicit two-argument form: the zero-argument ``super()`` only exists
        # on Python 3, and the sibling defaults class already uses this spelling.
        super(RussianLemmatizer, self).__init__()
        try:
            from pymorphy2 import MorphAnalyzer
        except ImportError:
            raise ImportError(
                'The Russian lemmatizer requires the pymorphy2 library: '
                'try to fix it with "pip install pymorphy2"')

        if RussianLemmatizer._morph is None:
            RussianLemmatizer._morph = MorphAnalyzer()

    def __call__(self, string, univ_pos, morphology=None):
        """Return the list of candidate lemmas for ``string``.

        string: the surface form to lemmatize.
        univ_pos: universal POS, as a string (any case) or a symbol known to
            ``normalize_univ_pos``.
        morphology: optional mapping of feature name to value, used to narrow
            down the analyses; features absent from either side are not
            treated as a conflict.

        Returns ``[string.lower()]`` when the POS is not one of the handled
        ones or when no known analysis survives the filters; otherwise the
        distinct normal forms in the order pymorphy2 produced them.
        """
        univ_pos = self.normalize_univ_pos(univ_pos)
        if univ_pos not in ('ADJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PROPN', 'VERB'):
            # Skip unchangeable pos
            return [string.lower()]

        pos_matches = []
        for analysis in self._morph.parse(string):
            if not analysis.is_known:
                # Skip suggested parse variant for unknown word for pymorphy
                continue
            analysis_pos, _ = oc2ud(str(analysis.tag))
            # NOUN and PROPN are treated as interchangeable.
            if analysis_pos == univ_pos or (
                    analysis_pos in ('NOUN', 'PROPN')
                    and univ_pos in ('NOUN', 'PROPN')):
                pos_matches.append(analysis)

        if not pos_matches:
            return [string.lower()]
        if morphology is None:
            return self._unique_normal_forms(pos_matches)

        if univ_pos in ('ADJ', 'DET', 'NOUN', 'PROPN'):
            features_to_compare = ['Case', 'Number', 'Gender']
        elif univ_pos == 'NUM':
            features_to_compare = ['Case', 'Gender']
        elif univ_pos == 'PRON':
            features_to_compare = ['Case', 'Number', 'Gender', 'Person']
        else:  # VERB
            features_to_compare = ['Aspect', 'Gender', 'Mood', 'Number',
                                   'Tense', 'VerbForm', 'Voice']

        morph_matches = []
        for analysis in pos_matches:
            _, analysis_morph = oc2ud(str(analysis.tag))
            for feature in features_to_compare:
                # Only a feature present on BOTH sides can disagree; indexing
                # analysis_morph unconditionally raised KeyError for analyses
                # that simply do not carry the feature.
                if (feature in morphology and feature in analysis_morph
                        and morphology[feature] != analysis_morph[feature]):
                    break
            else:
                morph_matches.append(analysis)

        if not morph_matches:
            return [string.lower()]
        return self._unique_normal_forms(morph_matches)

    @staticmethod
    def _unique_normal_forms(analyses):
        """Return the distinct ``normal_form`` values of ``analyses`` in first-seen order."""
        seen = set()
        forms = []
        for analysis in analyses:
            form = analysis.normal_form
            if form not in seen:
                seen.add(form)
                forms.append(form)
        return forms

    @staticmethod
    def normalize_univ_pos(univ_pos):
        """Return the POS as an upper-case string, or None if it is not recognised.

        Strings are upper-cased as they are; the seven symbols listed below are
        translated to their names; anything else yields None.
        """
        if isinstance(univ_pos, str):
            return univ_pos.upper()

        symbols_to_str = {
            ADJ: 'ADJ',
            DET: 'DET',
            NOUN: 'NOUN',
            NUM: 'NUM',
            PRON: 'PRON',
            PROPN: 'PROPN',
            VERB: 'VERB'
        }
        if univ_pos in symbols_to_str:
            return symbols_to_str[univ_pos]
        return None

    def is_base_form(self, univ_pos, morphology=None):
        """Not implemented for Russian; always raises NotImplementedError."""
        # TODO
        raise NotImplementedError

    # ('ADJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PROPN', 'VERB'):
    def det(self, string, morphology=None):
        """Lemmatize ``string`` as a determiner."""
        return self(string, 'det', morphology)

    def num(self, string, morphology=None):
        """Lemmatize ``string`` as a numeral."""
        return self(string, 'num', morphology)

    def pron(self, string, morphology=None):
        """Lemmatize ``string`` as a pronoun."""
        return self(string, 'pron', morphology)

    def lookup(self, string):
        """Return the normal form when pymorphy2 gives exactly one analysis, else ``string``."""
        analyses = self._morph.parse(string)
        if len(analyses) == 1:
            return analyses[0].normal_form
        return string
|
||||
|
||||
|
||||
# Grammeme -> value tables, keyed by the feature they populate. The special
# '_POS' table yields the part of speech instead of a morphology entry.
# Built once at import time rather than on every oc2ud() call.
_OC2UD_GRAM_MAP = {
    '_POS': {
        'ADJF': 'ADJ',
        'ADJS': 'ADJ',
        'ADVB': 'ADV',
        'Apro': 'DET',
        'COMP': 'ADJ',  # Can also be an ADV - unchangeable
        'CONJ': 'CCONJ',  # Can also be a SCONJ - both unchangeable ones
        'GRND': 'VERB',
        'INFN': 'VERB',
        'INTJ': 'INTJ',
        'NOUN': 'NOUN',
        'NPRO': 'PRON',
        'NUMR': 'NUM',
        'NUMB': 'NUM',
        'PNCT': 'PUNCT',
        'PRCL': 'PART',
        'PREP': 'ADP',
        'PRTF': 'VERB',
        'PRTS': 'VERB',
        'VERB': 'VERB',
    },
    'Animacy': {
        'anim': 'Anim',
        'inan': 'Inan',
    },
    'Aspect': {
        'impf': 'Imp',
        'perf': 'Perf',
    },
    'Case': {
        'ablt': 'Ins',
        'accs': 'Acc',
        'datv': 'Dat',
        'gen1': 'Gen',
        'gen2': 'Gen',
        'gent': 'Gen',
        'loc2': 'Loc',
        'loct': 'Loc',
        'nomn': 'Nom',
        'voct': 'Voc',
    },
    'Degree': {
        'COMP': 'Cmp',
        'Supr': 'Sup',
    },
    'Gender': {
        'femn': 'Fem',
        'masc': 'Masc',
        'neut': 'Neut',
    },
    'Mood': {
        'impr': 'Imp',
        'indc': 'Ind',
    },
    'Number': {
        'plur': 'Plur',
        'sing': 'Sing',
    },
    'NumForm': {
        'NUMB': 'Digit',
    },
    'Person': {
        '1per': '1',
        '2per': '2',
        '3per': '3',
        'excl': '2',
        'incl': '1',
    },
    'Tense': {
        'futr': 'Fut',
        'past': 'Past',
        'pres': 'Pres',
    },
    'Variant': {
        'ADJS': 'Brev',
        'PRTS': 'Brev',
    },
    'VerbForm': {
        'GRND': 'Conv',
        'INFN': 'Inf',
        'PRTF': 'Part',
        'PRTS': 'Part',
        'VERB': 'Fin',
    },
    'Voice': {
        'actv': 'Act',
        'pssv': 'Pass',
    },
    'Abbr': {
        'Abbr': 'Yes'
    }
}

# Grammemes outside the tables above that turn the word into a proper noun.
_OC2UD_PROPN_GRAMS = ('Name', 'Patr', 'Surn', 'Geox', 'Orgn')


def oc2ud(oc_tag):
    """Convert a pymorphy2 tag string into a (pos, morphology) pair.

    oc_tag: grammemes separated by commas and/or spaces,
        e.g. ``'NOUN,inan,femn plur,nomn'``.

    Returns a tuple ``(pos, morphology)``: ``pos`` is a POS string (``'X'``
    when no grammeme determines one) and ``morphology`` is a new dict mapping
    feature names to values.

    Grammemes found in the tables are applied first, in tag order. The
    remaining ones are then checked, also in tag order, for the few markers
    that override the result (proper-name markers -> ``'PROPN'``, ``'Auxt'``
    -> ``'AUX'``, ``'Pltm'`` -> ``Number=Ptan``); everything else is ignored.
    When several such markers compete, the one appearing last in the tag wins.
    """
    pos = 'X'
    morphology = {}
    # A list, not a set: the override pass below must see grammemes in a
    # reproducible order so the outcome never depends on hash ordering.
    unmatched = []

    for gram in oc_tag.replace(' ', ',').split(','):
        matched = False
        # A grammeme may sit in several tables (e.g. 'PRTS'); each table
        # writes a different key, so table iteration order does not matter.
        for categ, gmap in _OC2UD_GRAM_MAP.items():
            if gram not in gmap:
                continue
            matched = True
            if categ == '_POS':
                pos = gmap[gram]
            else:
                morphology[categ] = gmap[gram]
        if not matched:
            unmatched.append(gram)

    for gram in unmatched:
        if gram in _OC2UD_PROPN_GRAMS:
            pos = 'PROPN'
        elif gram == 'Auxt':
            pos = 'AUX'
        elif gram == 'Pltm':
            morphology['Number'] = 'Ptan'

    return pos, morphology
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
l = RussianLemmatizer()
|
||||
print(l.noun('гвоздики', {'Gender': 'Fem'}))
|
|
@ -0,0 +1,35 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...attrs import LIKE_NUM
|
||||
|
||||
|
||||
# Number words recognised by like_num(), all lower case.
# Spellings corrected: the tens and hundreds built on пять/шесть/семь/восемь/
# девять keep the soft sign (пятьдесят, пятьсот, ...), and "миллиард" has an "р".
_num_words = [
    'ноль', 'один', 'два', 'три', 'четыре', 'пять', 'шесть', 'семь', 'восемь', 'девять',

    'десять', 'одиннадцать', 'двенадцать', 'тринадцать', 'четырнадцать',
    'пятнадцать', 'шестнадцать', 'семнадцать', 'восемнадцать', 'девятнадцать',

    'двадцать', 'тридцать', 'сорок', 'пятьдесят', 'шестьдесят', 'семьдесят',
    'восемьдесят', 'девяносто',

    'сто', 'двести', 'триста', 'четыреста', 'пятьсот', 'шестьсот', 'семьсот',
    'восемьсот', 'девятьсот',

    'тысяча', 'миллион', 'миллиард', 'триллион', 'квадриллион', 'квинтиллион']


def like_num(text):
    """Return True if ``text`` looks like a number.

    Accepted forms: digits (commas and periods are stripped first), a simple
    fraction ``digits/digits``, or one of the words in ``_num_words`` in any
    letter case.
    """
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    # Lower-case before the lookup so capitalised, sentence-initial number
    # words are recognised too.
    return text.lower() in _num_words
|
||||
|
||||
|
||||
LEX_ATTRS = {
|
||||
LIKE_NUM: like_num
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
# Base table: lower-case spelling -> normalized form.
_exc = {
    # Slang
    'прив': 'привет',

    # Weekdays abbreviations
    "пн.": "понедельник",
    "вт.": "вторник",
    "ср.": "среда",
    "чт.": "четверг",
    "пт.": "пятница",
    "сб.": "суббота",
    "вс.": "воскресенье",

    # Months abbreviations
    "янв.": "январь",
    "фев.": "февраль",
    "мар.": "март",
    "апр.": "апрель",
}


# Expanded table: every base key is registered as written and title-cased;
# keys ending in a period are additionally registered without that period.
NORM_EXCEPTIONS = {}

for string, norm in _exc.items():
    titled = string.title()
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[titled] = norm
    if not string.endswith('.'):
        continue
    NORM_EXCEPTIONS[string[:-1]] = norm
    NORM_EXCEPTIONS[titled[:-1]] = norm
|
|
@ -1,29 +1,9 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..symbols import *
|
||||
from ...symbols import ORTH, LEMMA
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = {
|
||||
"Пн.": [
|
||||
{ORTH: "Пн.", LEMMA: "Понедельник"}
|
||||
],
|
||||
"Вт.": [
|
||||
{ORTH: "Вт.", LEMMA: "Вторник"}
|
||||
],
|
||||
"Ср.": [
|
||||
{ORTH: "Ср.", LEMMA: "Среда"}
|
||||
],
|
||||
"Чт.": [
|
||||
{ORTH: "Чт.", LEMMA: "Четверг"}
|
||||
],
|
||||
"Пт.": [
|
||||
{ORTH: "Пт.", LEMMA: "Пятница"}
|
||||
],
|
||||
"Сб.": [
|
||||
{ORTH: "Сб.", LEMMA: "Суббота"}
|
||||
],
|
||||
"Вс.": [
|
||||
{ORTH: "Вс.", LEMMA: "Воскресенье"}
|
||||
],
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue