mirror of https://github.com/explosion/spaCy.git
Fix lemmatizer ordering, re Issue #1387
This commit is contained in:
parent
a0db2e0077
commit
4eb3405df7
|
@ -93,7 +93,6 @@ def lemmatize(string, index, exceptions, rules):
|
||||||
orig = string
|
orig = string
|
||||||
string = string.lower()
|
string = string.lower()
|
||||||
forms = []
|
forms = []
|
||||||
forms.extend(exceptions.get(string, []))
|
|
||||||
oov_forms = []
|
oov_forms = []
|
||||||
for old, new in rules:
|
for old, new in rules:
|
||||||
if string.endswith(old):
|
if string.endswith(old):
|
||||||
|
@ -104,8 +103,18 @@ def lemmatize(string, index, exceptions, rules):
|
||||||
forms.append(form)
|
forms.append(form)
|
||||||
else:
|
else:
|
||||||
oov_forms.append(form)
|
oov_forms.append(form)
|
||||||
|
# Remove duplicates, and sort forms generated by rules alphabetically.
|
||||||
|
forms = list(set(forms))
|
||||||
|
forms.sort()
|
||||||
|
# Put exceptions at the front of the list, so they get priority.
|
||||||
|
# This is a dodgy heuristic -- but it's the best we can do until we get
|
||||||
|
# frequencies on this. We can at least prune out problematic exceptions,
|
||||||
|
# if they shadow more frequent analyses.
|
||||||
|
for form in exceptions.get(string, []):
|
||||||
|
if form not in forms:
|
||||||
|
forms.insert(0, form)
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.extend(oov_forms)
|
forms.extend(oov_forms)
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.append(orig)
|
forms.append(orig)
|
||||||
return list(set(forms))
|
return forms
|
||||||
|
|
Loading…
Reference in New Issue