mirror of https://github.com/explosion/spaCy.git
Merge pull request #866 from juanmirocks/master
Fix lemmatization of OOV words
This commit is contained in:
commit
fea9fe08af
|
@ -70,11 +70,14 @@ def lemmatize(string, index, exceptions, rules):
|
||||||
#if string in index:
|
#if string in index:
|
||||||
# forms.append(string)
|
# forms.append(string)
|
||||||
forms.extend(exceptions.get(string, []))
|
forms.extend(exceptions.get(string, []))
|
||||||
|
oov_forms = []
|
||||||
for old, new in rules:
|
for old, new in rules:
|
||||||
if string.endswith(old):
|
if string.endswith(old):
|
||||||
form = string[:len(string) - len(old)] + new
|
form = string[:len(string) - len(old)] + new
|
||||||
if form in index or not form.isalpha():
|
if form in index or not form.isalpha():
|
||||||
forms.append(form)
|
forms.append(form)
|
||||||
|
else:
|
||||||
|
oov_forms.append(form)
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.append(string)
|
forms.extend(oov_forms)
|
||||||
return set(forms)
|
return set(forms)
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
# Note: "chromosomes" already worked prior to the bug fix
|
||||||
|
@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
|
||||||
|
def test_issue781(path, lemmatizer, word, lemmas):
|
||||||
|
assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)
|
Loading…
Reference in New Issue