diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 1aea308f9..f9e35f44a 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,5 +1,6 @@ # coding: utf8 from __future__ import unicode_literals +from collections import OrderedDict from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos @@ -118,8 +119,8 @@ def lemmatize(string, index, exceptions, rules): forms.append(form) else: oov_forms.append(form) - # Remove duplicates, and sort forms generated by rules alphabetically. - forms = list(set(forms)) + # Remove duplicates but preserve the ordering of applied "rules" + forms = list(OrderedDict.fromkeys(forms)) # Put exceptions at the front of the list, so they get priority. # This is a dodgy heuristic -- but it's the best we can do until we get # frequencies on this. We can at least prune out problematic exceptions,