spaCy/spacy/lang/el/lemmatizer/__init__.py

# coding: utf8
from __future__ import unicode_literals

from ....symbols import NOUN, VERB, ADJ, PUNCT


class GreekLemmatizer(object):
    """
    Greek language lemmatizer applies the default rule based lemmatization
    procedure with some modifications for better Greek language support.

    The first modification is that it checks if the word for lemmatization is
    already a lemma and if yes, it just returns it.
    The second modification is about removing the base forms function which is
    not applicable for Greek language.
    """

    @classmethod
    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
        return cls(index, exc, rules, lookup)

    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
        self.index = index
        self.exc = exceptions
        self.rules = rules
        self.lookup_table = lookup if lookup is not None else {}

    def __call__(self, string, univ_pos, morphology=None):
        if not self.rules:
            return [self.lookup_table.get(string, string)]
        if univ_pos in (NOUN, "NOUN", "noun"):
            univ_pos = "noun"
        elif univ_pos in (VERB, "VERB", "verb"):
            univ_pos = "verb"
        elif univ_pos in (ADJ, "ADJ", "adj"):
            univ_pos = "adj"
        elif univ_pos in (PUNCT, "PUNCT", "punct"):
            univ_pos = "punct"
        else:
            return list(set([string.lower()]))
        lemmas = lemmatize(
            string,
            self.index.get(univ_pos, {}),
            self.exc.get(univ_pos, {}),
            self.rules.get(univ_pos, []),
        )
        return lemmas

    def lookup(self, string, orth=None):
        key = orth if orth is not None else string
        if key in self.lookup_table:
            return self.lookup_table[key]
        return string


def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)
        return forms
    forms.extend(exceptions.get(string, []))
    oov_forms = []
    if not forms:
        for old, new in rules:
            if string.endswith(old):
                form = string[: len(string) - len(old)] + new
                if not form:
                    pass
                elif form in index or not form.isalpha():
                    forms.append(form)
                else:
                    oov_forms.append(form)
    if not forms:
        forms.extend(oov_forms)
    if not forms:
        forms.append(string)
    return list(set(forms))