From 52ee1f9bf93e75df649b2f25f3d106f8602483ea Mon Sep 17 00:00:00 2001
From: Vadim Mazaev
Date: Tue, 21 Nov 2017 11:44:46 +0300
Subject: [PATCH] Updated Russian Language, added lemmatizer, norm exceptions and lex attrs

---
 spacy/lang/ru/__init__.py             |  66 +++-----
 spacy/lang/ru/language_data.py        |  18 --
 spacy/lang/ru/lemmatizer.py           | 244 ++++++++++++++++++++++++++
 spacy/lang/ru/lex_attrs.py            |  36 ++++
 spacy/lang/ru/norm_exceptions.py      |  34 ++++
 spacy/lang/ru/tokenizer_exceptions.py |  26 +--
 6 files changed, 336 insertions(+), 88 deletions(-)
 delete mode 100644 spacy/lang/ru/language_data.py
 create mode 100644 spacy/lang/ru/lemmatizer.py
 create mode 100644 spacy/lang/ru/lex_attrs.py
 create mode 100644 spacy/lang/ru/norm_exceptions.py

diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py
index 12b480a8a..29b6c7082 100644
--- a/spacy/lang/ru/__init__.py
+++ b/spacy/lang/ru/__init__.py
@@ -1,64 +1,36 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 
-from ..language import Language
-from ..attrs import LANG
-from ..tokens import Doc
-from .language_data import *
+from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
+from .lemmatizer import RussianLemmatizer
 
-
-class RussianTokenizer(object):
-    _morph = None
-
-    def __init__(self, spacy_tokenizer, cls, nlp=None):
-        try:
-            from pymorphy2 import MorphAnalyzer
-        except ImportError:
-            raise ImportError(
-                "The Russian tokenizer requires the pymorphy2 library: "
-                "try to fix it with "
-                "pip install pymorphy2==0.8")
-
-        RussianTokenizer._morph = RussianTokenizer._create_morph(MorphAnalyzer)
-
-        self.vocab = nlp.vocab if nlp else cls.create_vocab(nlp)
-        self._spacy_tokenizer = spacy_tokenizer
-
-    def __call__(self, text):
-        words = [self._normalize(RussianTokenizer._get_word(token))
-                 for token in self._spacy_tokenizer(text)]
-
-        return Doc(self.vocab, words, [False] * len(words))
-
-    @staticmethod
-    def _get_word(token):
-        return token.lemma_ if len(token.lemma_) > 0 else token.text
-
-    @classmethod
-    def _normalize(cls, word):
-        return cls._morph.parse(word)[0].normal_form
-
-    @classmethod
-    def _create_morph(cls, morph_analyzer_class):
-        if not cls._morph:
-            cls._morph = morph_analyzer_class()
-        return cls._morph
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...util import update_exc, add_lookups
+from ...language import Language
+from ...attrs import LANG, LIKE_NUM, NORM
 
 
 class RussianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters.update(LEX_ATTRS)
     lex_attr_getters[LANG] = lambda text: 'ru'
-
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
+    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
+                                         BASE_NORMS, NORM_EXCEPTIONS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
 
     @classmethod
-    def create_tokenizer(cls, nlp=None):
-        tokenizer = super(RussianDefaults, cls).create_tokenizer(nlp)
-        return RussianTokenizer(tokenizer, cls, nlp)
+    def create_lemmatizer(cls, nlp=None):
+        return RussianLemmatizer()
 
 
 class Russian(Language):
     lang = 'ru'
-
     Defaults = RussianDefaults
+
+
+__all__ = ['Russian']
diff --git a/spacy/lang/ru/language_data.py b/spacy/lang/ru/language_data.py
deleted file mode 100644
index 75ca41b65..000000000
--- a/spacy/lang/ru/language_data.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# encoding: utf8
-from __future__ import unicode_literals
-
-from .. import language_data as base
-from ..language_data import update_exc, strings_to_exc
-
-from .stop_words import STOP_WORDS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
-
-
-STOP_WORDS = set(STOP_WORDS)
-TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
-
-
-update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
-
-
-__all__ = ["STOP_WORDS", "TOKENIZER_EXCEPTIONS"]
\ No newline at end of file
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
new file mode 100644
index 000000000..538c4aaef
--- /dev/null
+++ b/spacy/lang/ru/lemmatizer.py
@@ -0,0 +1,244 @@
+# coding: utf8
+from ...symbols import (
+    ADJ, DET, NOUN, NUM, PRON, PROPN, VERB
+)
+from ...lemmatizer import Lemmatizer
+
+
+class RussianLemmatizer(Lemmatizer):
+    """Russian lemmatizer that delegates to a shared pymorphy2 analyzer."""
+    _morph = None
+
+    def __init__(self):
+        super(RussianLemmatizer, self).__init__()
+        try:
+            from pymorphy2 import MorphAnalyzer
+        except ImportError:
+            raise ImportError(
+                'The Russian lemmatizer requires the pymorphy2 library: '
+                'try to fix it with "pip install pymorphy2"')
+
+        if RussianLemmatizer._morph is None:
+            RussianLemmatizer._morph = MorphAnalyzer()
+
+    def __call__(self, string, univ_pos, morphology=None):
+        """Return the list of candidate lemmas for `string`.
+
+        `univ_pos` may be a POS name or a spaCy symbol; `morphology` is an
+        optional dict of UD features used to narrow pymorphy2's analyses.
+        Falls back to `[string.lower()]` when nothing matches.
+        """
+        univ_pos = self.normalize_univ_pos(univ_pos)
+        if univ_pos not in ('ADJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PROPN', 'VERB'):
+            # Skip unchangeable pos
+            return [string.lower()]
+
+        analyses = self._morph.parse(string)
+        filtered_analyses = []
+        for analysis in analyses:
+            if not analysis.is_known:
+                # Skip suggested parse variant for unknown word for pymorphy
+                continue
+            analysis_pos, _ = oc2ud(str(analysis.tag))
+            if analysis_pos == univ_pos \
+                    or (analysis_pos in ('NOUN', 'PROPN') and univ_pos in ('NOUN', 'PROPN')):
+                filtered_analyses.append(analysis)
+
+        if not len(filtered_analyses):
+            return [string.lower()]
+        if morphology is None:
+            return list(set([analysis.normal_form for analysis in filtered_analyses]))
+
+        if univ_pos in ('ADJ', 'DET', 'NOUN', 'PROPN'):
+            features_to_compare = ['Case', 'Number', 'Gender']
+        elif univ_pos == 'NUM':
+            features_to_compare = ['Case', 'Gender']
+        elif univ_pos == 'PRON':
+            features_to_compare = ['Case', 'Number', 'Gender', 'Person']
+        else:  # VERB
+            features_to_compare = ['Aspect', 'Gender', 'Mood', 'Number', 'Tense', 'VerbForm', 'Voice']
+
+        analyses, filtered_analyses = filtered_analyses, []
+        for analysis in analyses:
+            _, analysis_morph = oc2ud(str(analysis.tag))
+            for feature in features_to_compare:
+                # A feature absent from the analysis cannot contradict it
+                if (feature in morphology and feature in analysis_morph
+                        and morphology[feature] != analysis_morph[feature]):
+                    break
+            else:
+                filtered_analyses.append(analysis)
+
+        if not len(filtered_analyses):
+            return [string.lower()]
+        return list(set([analysis.normal_form for analysis in filtered_analyses]))
+
+    @staticmethod
+    def normalize_univ_pos(univ_pos):
+        """Return the upper-cased string, or a known POS symbol's name, else None."""
+        if isinstance(univ_pos, str):
+            return univ_pos.upper()
+
+        symbols_to_str = {
+            ADJ: 'ADJ',
+            DET: 'DET',
+            NOUN: 'NOUN',
+            NUM: 'NUM',
+            PRON: 'PRON',
+            PROPN: 'PROPN',
+            VERB: 'VERB'
+        }
+        if univ_pos in symbols_to_str:
+            return symbols_to_str[univ_pos]
+        return None
+
+    def is_base_form(self, univ_pos, morphology=None):
+        # TODO
+        raise NotImplementedError
+
+    # ('ADJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PROPN', 'VERB'):
+    def det(self, string, morphology=None):
+        return self(string, 'det', morphology)
+
+    def num(self, string, morphology=None):
+        return self(string, 'num', morphology)
+
+    def pron(self, string, morphology=None):
+        return self(string, 'pron', morphology)
+
+    def lookup(self, string):
+        """Return the lemma if pymorphy2 yields exactly one parse, else `string`."""
+        analyses = self._morph.parse(string)
+        if len(analyses) == 1:
+            return analyses[0].normal_form
+        return string
+
+
+def oc2ud(oc_tag):
+    """Map a pymorphy2 tag string to a `(pos, morphology)` pair of UD labels."""
+    gram_map = {
+        '_POS': {
+            'ADJF': 'ADJ',
+            'ADJS': 'ADJ',
+            'ADVB': 'ADV',
+            'Apro': 'DET',
+            'COMP': 'ADJ',  # Can also be an ADV - unchangeable
+            'CONJ': 'CCONJ',  # Can also be a SCONJ - both unchangeable ones
+            'GRND': 'VERB',
+            'INFN': 'VERB',
+            'INTJ': 'INTJ',
+            'NOUN': 'NOUN',
+            'NPRO': 'PRON',
+            'NUMR': 'NUM',
+            'NUMB': 'NUM',
+            'PNCT': 'PUNCT',
+            'PRCL': 'PART',
+            'PREP': 'ADP',
+            'PRTF': 'VERB',
+            'PRTS': 'VERB',
+            'VERB': 'VERB',
+        },
+        'Animacy': {
+            'anim': 'Anim',
+            'inan': 'Inan',
+        },
+        'Aspect': {
+            'impf': 'Imp',
+            'perf': 'Perf',
+        },
+        'Case': {
+            'ablt': 'Ins',
+            'accs': 'Acc',
+            'datv': 'Dat',
+            'gen1': 'Gen',
+            'gen2': 'Gen',
+            'gent': 'Gen',
+            'loc2': 'Loc',
+            'loct': 'Loc',
+            'nomn': 'Nom',
+            'voct': 'Voc',
+        },
+        'Degree': {
+            'COMP': 'Cmp',
+            'Supr': 'Sup',
+        },
+        'Gender': {
+            'femn': 'Fem',
+            'masc': 'Masc',
+            'neut': 'Neut',
+        },
+        'Mood': {
+            'impr': 'Imp',
+            'indc': 'Ind',
+        },
+        'Number': {
+            'plur': 'Plur',
+            'sing': 'Sing',
+        },
+        'NumForm': {
+            'NUMB': 'Digit',
+        },
+        'Person': {
+            '1per': '1',
+            '2per': '2',
+            '3per': '3',
+            'excl': '2',
+            'incl': '1',
+        },
+        'Tense': {
+            'futr': 'Fut',
+            'past': 'Past',
+            'pres': 'Pres',
+        },
+        'Variant': {
+            'ADJS': 'Brev',
+            'PRTS': 'Brev',
+        },
+        'VerbForm': {
+            'GRND': 'Conv',
+            'INFN': 'Inf',
+            'PRTF': 'Part',
+            'PRTS': 'Part',
+            'VERB': 'Fin',
+        },
+        'Voice': {
+            'actv': 'Act',
+            'pssv': 'Pass',
+        },
+        'Abbr': {
+            'Abbr': 'Yes'
+        }
+    }
+
+    pos = 'X'
+    morphology = dict()
+    unmatched = set()
+
+    grams = oc_tag.replace(' ', ',').split(',')
+    for gram in grams:
+        match = False
+        for categ, gmap in sorted(gram_map.items()):
+            if gram in gmap:
+                match = True
+                if categ == '_POS':
+                    pos = gmap[gram]
+                else:
+                    morphology[categ] = gmap[gram]
+        if not match:
+            unmatched.add(gram)
+
+    while len(unmatched) > 0:
+        gram = unmatched.pop()
+        if gram in ('Name', 'Patr', 'Surn', 'Geox', 'Orgn'):
+            pos = 'PROPN'
+        elif gram == 'Auxt':
+            pos = 'AUX'
+        elif gram == 'Pltm':
+            morphology['Number'] = 'Ptan'
+
+    return pos, morphology
+
+
+if __name__ == '__main__':
+    l = RussianLemmatizer()
+    print(l.noun('гвоздики', {'Gender': 'Fem'}))
diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py
new file mode 100644
index 000000000..f3a889cad
--- /dev/null
+++ b/spacy/lang/ru/lex_attrs.py
@@ -0,0 +1,36 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = [
+    'ноль', 'один', 'два', 'три', 'четыре', 'пять', 'шесть', 'семь', 'восемь', 'девять',
+
+    'десять', 'одиннадцать', 'двенадцать', 'тринадцать', 'четырнадцать',
+    'пятнадцать', 'шестнадцать', 'семнадцать', 'восемнадцать', 'девятнадцать',
+
+    'двадцать', 'тридцать', 'сорок', 'пятьдесят', 'шестьдесят', 'семьдесят', 'восемьдесят', 'девяносто',
+
+    'сто', 'двести', 'триста', 'четыреста', 'пятьсот', 'шестьсот', 'семьсот', 'восемьсот', 'девятьсот',
+
+    'тысяча', 'миллион', 'миллиард', 'триллион', 'квадриллион', 'квинтиллион']
+
+
+def like_num(text):
+    """Return True if `text` is digits, a digit fraction or a number word."""
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    if text in _num_words:
+        return True
+    return False
+
+
+LEX_ATTRS = {
+    LIKE_NUM: like_num
+}
diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py
new file mode 100644
index 000000000..6c3a5007c
--- /dev/null
+++ b/spacy/lang/ru/norm_exceptions.py
@@ -0,0 +1,34 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+_exc = {
+    # Slang
+    'прив': 'привет',
+
+    # Weekdays abbreviations
+    "пн.": "понедельник",
+    "вт.": "вторник",
+    "ср.": "среда",
+    "чт.": "четверг",
+    "пт.": "пятница",
+    "сб.": "суббота",
+    "вс.": "воскресенье",
+
+    # Months abbreviations
+    "янв.": "январь",
+    "фев.": "февраль",
+    "мар.": "март",
+    "апр.": "апрель",
+
+}
+
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+    NORM_EXCEPTIONS[string] = norm
+    NORM_EXCEPTIONS[string.title()] = norm
+    if string.endswith('.'):
+        NORM_EXCEPTIONS[string[:-1]] = norm
+        NORM_EXCEPTIONS[string.title()[:-1]] = norm
diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py
index 8df57a402..707dd388c 100644
--- a/spacy/lang/ru/tokenizer_exceptions.py
+++ b/spacy/lang/ru/tokenizer_exceptions.py
@@ -1,29 +1,9 @@
 # encoding: utf8
 from __future__ import unicode_literals
 
-from ..symbols import *
+from ...symbols import ORTH, LEMMA
 
 
 TOKENIZER_EXCEPTIONS = {
-    "Пн.": [
-        {ORTH: "Пн.", LEMMA: "Понедельник"}
-    ],
-    "Вт.": [
-        {ORTH: "Вт.", LEMMA: "Вторник"}
-    ],
-    "Ср.": [
-        {ORTH: "Ср.", LEMMA: "Среда"}
-    ],
-    "Чт.": [
-        {ORTH: "Чт.", LEMMA: "Четверг"}
-    ],
-    "Пт.": [
-        {ORTH: "Пт.", LEMMA: "Пятница"}
-    ],
-    "Сб.": [
-        {ORTH: "Сб.", LEMMA: "Суббота"}
-    ],
-    "Вс.": [
-        {ORTH: "Вс.", LEMMA: "Воскресенье"}
-    ],
-}
\ No newline at end of file
+
+}