From 4eb3405df770c79f0823404a8934a7bf0b4bd5d9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 5 Jul 2018 13:49:29 +0200
Subject: [PATCH] Fix lemmatizer ordering, re Issue #1387

---
 spacy/lemmatizer.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index ee1a35ef1..1738b5e5e 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -93,7 +93,6 @@ def lemmatize(string, index, exceptions, rules):
     orig = string
     string = string.lower()
     forms = []
-    forms.extend(exceptions.get(string, []))
     oov_forms = []
     for old, new in rules:
         if string.endswith(old):
@@ -104,8 +103,18 @@ def lemmatize(string, index, exceptions, rules):
                 forms.append(form)
             else:
                 oov_forms.append(form)
+    # Remove duplicates, and sort forms generated by rules alphabetically.
+    forms = list(set(forms))
+    forms.sort()
+    # Put exceptions at the front of the list, so they get priority.
+    # This is a dodgy heuristic -- but it's the best we can do until we get
+    # frequencies on this. We can at least prune out problematic exceptions,
+    # if they shadow more frequent analyses.
+    for form in exceptions.get(string, []):
+        if form not in forms:
+            forms.insert(0, form)
     if not forms:
         forms.extend(oov_forms)
     if not forms:
         forms.append(orig)
-    return list(set(forms))
+    return forms