From 1eed101be9adc8f94036761099d512f26439a2c5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 26 May 2020 09:56:12 +0200 Subject: [PATCH] Fix Polish lemmatizer for deserialized models Restructure Polish lemmatizer not to depend on lookups data in `__init__` since the lemmatizer is initialized before the lookups data is loaded from a saved model. The lookups tables are accessed first in `__call__` instead once the data is available. --- spacy/lang/pl/lemmatizer.py | 87 +++++++++++++------------------------ 1 file changed, 31 insertions(+), 56 deletions(-) diff --git a/spacy/lang/pl/lemmatizer.py b/spacy/lang/pl/lemmatizer.py index d0d843b2a..8b8d7fe27 100644 --- a/spacy/lang/pl/lemmatizer.py +++ b/spacy/lang/pl/lemmatizer.py @@ -6,98 +6,73 @@ from ...parts_of_speech import NAMES class PolishLemmatizer(Lemmatizer): - # This lemmatizer implements lookup lemmatization based on - # the Morfeusz dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS - # It utilizes some prefix based improvements for - # verb and adjectives lemmatization, as well as case-sensitive - # lemmatization for nouns - def __init__(self, lookups, *args, **kwargs): - # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules - super(PolishLemmatizer, self).__init__(lookups) - self.lemma_lookups = {} - for tag in [ - "ADJ", - "ADP", - "ADV", - "AUX", - "NOUN", - "NUM", - "PART", - "PRON", - "VERB", - "X", - ]: - self.lemma_lookups[tag] = self.lookups.get_table( - "lemma_lookup_" + tag.lower(), {} - ) - self.lemma_lookups["DET"] = self.lemma_lookups["X"] - self.lemma_lookups["PROPN"] = self.lemma_lookups["NOUN"] - + # This lemmatizer implements lookup lemmatization based on the Morfeusz + # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS. + # It utilizes some prefix based improvements for verb and adjectives + # lemmatization, as well as case-sensitive lemmatization for nouns. def __call__(self, string, univ_pos, morphology=None): if isinstance(univ_pos, int): univ_pos = NAMES.get(univ_pos, "X") univ_pos = univ_pos.upper() + lookup_pos = univ_pos.lower() + if univ_pos == "PROPN": + lookup_pos = "noun" + lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {}) + if univ_pos == "NOUN": - return self.lemmatize_noun(string, morphology) + return self.lemmatize_noun(string, morphology, lookup_table) if univ_pos != "PROPN": string = string.lower() if univ_pos == "ADJ": - return self.lemmatize_adj(string, morphology) + return self.lemmatize_adj(string, morphology, lookup_table) elif univ_pos == "VERB": - return self.lemmatize_verb(string, morphology) + return self.lemmatize_verb(string, morphology, lookup_table) - lemma_dict = self.lemma_lookups.get(univ_pos, {}) - return [lemma_dict.get(string, string.lower())] + return [lookup_table.get(string, string.lower())] - def lemmatize_adj(self, string, morphology): + def lemmatize_adj(self, string, morphology, lookup_table): # this method utilizes different procedures for adjectives # with 'nie' and 'naj' prefixes - lemma_dict = self.lemma_lookups["ADJ"] - if string[:3] == "nie": search_string = string[3:] if search_string[:3] == "naj": naj_search_string = search_string[3:] - if naj_search_string in lemma_dict: - return [lemma_dict[naj_search_string]] - if search_string in lemma_dict: - return [lemma_dict[search_string]] + if naj_search_string in lookup_table: + return [lookup_table[naj_search_string]] + if search_string in lookup_table: + return [lookup_table[search_string]] if string[:3] == "naj": naj_search_string = string[3:] - if naj_search_string in lemma_dict: - return [lemma_dict[naj_search_string]] + if naj_search_string in lookup_table: + return [lookup_table[naj_search_string]] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] - def lemmatize_verb(self, string, morphology): + def lemmatize_verb(self, string, morphology, lookup_table): # this method utilizes a different procedure for verbs # with 'nie' prefix - lemma_dict = self.lemma_lookups["VERB"] - if string[:3] == "nie": search_string = string[3:] - if search_string in lemma_dict: - return [lemma_dict[search_string]] + if search_string in lookup_table: + return [lookup_table[search_string]] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] - def lemmatize_noun(self, string, morphology): + def lemmatize_noun(self, string, morphology, lookup_table): # this method is case-sensitive, in order to work # for incorrectly tagged proper names - lemma_dict = self.lemma_lookups["NOUN"] - if string != string.lower(): - if string.lower() in lemma_dict: - return [lemma_dict[string.lower()]] - elif string in lemma_dict: - return [lemma_dict[string]] + if string.lower() in lookup_table: + return [lookup_table[string.lower()]] + elif string in lookup_table: + return [lookup_table[string]] return [string.lower()] - return [lemma_dict.get(string, string)] + return [lookup_table.get(string, string)] def lookup(self, string, orth=None): return string.lower()