diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 323dddd3a..e7960dcd2 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -70,11 +70,20 @@ def lemmatize(string, index, exceptions, rules):
     #if string in index:
     #    forms.append(string)
     forms.extend(exceptions.get(string, []))
+    # Rule outputs that are alphabetic but missing from the index; used only
+    # as a fallback when no in-vocabulary candidate is found.
+    oov_forms = []
     for old, new in rules:
         if string.endswith(old):
             form = string[:len(string) - len(old)] + new
             if form in index or not form.isalpha():
                 forms.append(form)
+            else:
+                oov_forms.append(form)
     if not forms:
+        forms.extend(oov_forms)
+    if not forms:
+        # No exception and no rule matched: return the input unchanged, as
+        # before, rather than an empty set.
         forms.append(string)
     return set(forms)
diff --git a/spacy/tests/regression/test_issue781.py b/spacy/tests/regression/test_issue781.py
new file mode 100644
index 000000000..8a98b48b8
--- /dev/null
+++ b/spacy/tests/regression/test_issue781.py
@@ -0,0 +1,10 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+# Note: "chromosomes" worked prior to the bug fix
+@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
+def test_issue781(path, lemmatizer, word, lemmas):
+    assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)