Merge pull request #866 from juanmirocks/master

Fix lemmatization of OOV words
2017-03-16 23:37:36 +01:00 · 2017-03-16 23:37:36 +01:00 · fea9fe08af
parent 28bb546939 25c29f072d
commit fea9fe08af
2 changed files with 14 additions and 1 deletions
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -70,11 +70,14 @@ def lemmatize(string, index, exceptions, rules):
    #if string in index:
    #    forms.append(string)
    forms.extend(exceptions.get(string, []))
    oov_forms = []
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index or not form.isalpha():
                forms.append(form)
            else:
                oov_forms.append(form)
    if not forms:
-        forms.append(string)
+        forms.extend(oov_forms)
    return set(forms)
--- a/spacy/tests/regression/test_issue781.py
+++ b/spacy/tests/regression/test_issue781.py
@ -0,0 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import pytest
 # Note: "chromosomes" already worked before the bug fix
@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
 def test_issue781(path, lemmatizer, word, lemmas):
    # Regression check for issue #781: lemmatizing each plural noun must
    # yield exactly the expected set of lemmas (compared order-insensitively
    # as a set).
    assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)