2020-11-25 09:33:49 +00:00
|
|
|
from typing import List
|
2020-10-15 13:55:01 +00:00
|
|
|
from collections import OrderedDict
|
|
|
|
|
2020-11-25 09:33:49 +00:00
|
|
|
from ...pipeline import Lemmatizer
|
|
|
|
from ...tokens import Token
|
2020-10-15 13:55:01 +00:00
|
|
|
|
|
|
|
|
|
|
|
class MacedonianLemmatizer(Lemmatizer):
    """Lemmatizer subclass that overrides the rule-based lemmatization step."""

    def rule_lemmatize(self, token: Token) -> List[str]:
        """Return candidate lemmas for *token* using suffix rules and exceptions.

        The method reads three tables from ``self.lookups``: ``lemma_index``,
        ``lemma_exc`` and ``lemma_rules``, each keyed by the lower-cased
        coarse part-of-speech tag of the token.

        token: The token to lemmatize; its ``text`` and ``pos_`` are read.
        RETURNS: A list of lemma strings. Exception entries come first,
            followed by rule-derived forms; if neither produced anything the
            (possibly suffix-stripped) token text is returned unchanged.
        """
        string = token.text
        univ_pos = token.pos_.lower()

        # Tokens without a usable POS tag are returned lower-cased as-is.
        if univ_pos in ("", "eol", "space"):
            return [string.lower()]

        # Words ending in "јќи" are stripped of that ending and treated as
        # verbs from here on (presumably the verbal-adverb suffix - confirm).
        # NOTE(review): a token that is exactly "јќи" becomes "" here.
        if string[-3:] == "јќи":
            string = string[:-3]
            univ_pos = "verb"

        index_table = self.lookups.get_table("lemma_index", {})
        exc_table = self.lookups.get_table("lemma_exc", {})
        rules_table = self.lookups.get_table("lemma_rules", {})

        # With no index, exceptions or rules for this POS there is nothing to
        # apply: keep the casing of proper nouns, lower-case everything else.
        if not any(
            (
                index_table.get(univ_pos),
                exc_table.get(univ_pos),
                rules_table.get(univ_pos),
            )
        ):
            if univ_pos == "propn":
                return [string]
            else:
                return [string.lower()]

        index = index_table.get(univ_pos, {})
        exceptions = exc_table.get(univ_pos, {})
        rules = rules_table.get(univ_pos, [])

        # Keep the original casing for the fallback; match on lower case.
        orig = string
        string = string.lower()
        forms = []

        # Each rule is an (old_suffix, new_suffix) pair.
        for old, new in rules:
            if string.endswith(old):
                form = string[: len(string) - len(old)] + new
                if not form:
                    continue
                # Accept forms found in the index, and any non-alphabetic form.
                if form in index or not form.isalpha():
                    forms.append(form)

        # De-duplicate while preserving first-seen order.
        forms = list(OrderedDict.fromkeys(forms))

        # Exceptions take priority, so they are placed ahead of rule output.
        for form in exceptions.get(string, []):
            if form not in forms:
                forms.insert(0, form)

        # Nothing matched: fall back to the text with its original casing.
        if not forms:
            forms.append(orig)

        return forms