From a471114eb2c9dffff27de12772e69b58e58f4dd1 Mon Sep 17 00:00:00 2001 From: Juan Miguel Cejuela Date: Wed, 1 Mar 2017 21:30:51 +0100 Subject: [PATCH 1/3] #781 add regression test, failing previous bug fix --- spacy/tests/regression/test_issue781.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 spacy/tests/regression/test_issue781.py diff --git a/spacy/tests/regression/test_issue781.py b/spacy/tests/regression/test_issue781.py new file mode 100644 index 000000000..56a1a9ef0 --- /dev/null +++ b/spacy/tests/regression/test_issue781.py @@ -0,0 +1,10 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +# Note: "chromosomes" worked previous the bug fix +@pytest.mark.parametrize('word,lemma', [("chromosomes", "chromosome"), ("endosomes", "endosome"), ("colocalizes", "colocalize")]) +def test_issue781(path, lemmatizer, word, lemma): + assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set([lemma]) From a8cfde46d33076672aa4fdbfa0f448e0bc1c593d Mon Sep 17 00:00:00 2001 From: Juan Miguel Cejuela Date: Wed, 1 Mar 2017 21:43:08 +0100 Subject: [PATCH 2/3] =?UTF-8?q?#781=20Fix=20test=20=E2=80=94=20colocalizes?= =?UTF-8?q?=20is=20lemmatized=20to=20colocaliz=20and=20colocalize?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spacy/tests/regression/test_issue781.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/regression/test_issue781.py b/spacy/tests/regression/test_issue781.py index 56a1a9ef0..8a98b48b8 100644 --- a/spacy/tests/regression/test_issue781.py +++ b/spacy/tests/regression/test_issue781.py @@ -5,6 +5,6 @@ import pytest # Note: "chromosomes" worked previous the bug fix -@pytest.mark.parametrize('word,lemma', [("chromosomes", "chromosome"), ("endosomes", "endosome"), ("colocalizes", "colocalize")]) -def test_issue781(path, lemmatizer, word, lemma): - assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == 
set([lemma]) +@pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])]) +def test_issue781(path, lemmatizer, word, lemmas): + assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas) From 25c29f072d32784290f51ac8a21490bc58405bb2 Mon Sep 17 00:00:00 2001 From: Juan Miguel Cejuela Date: Wed, 1 Mar 2017 21:44:17 +0100 Subject: [PATCH 3/3] apply patch --- spacy/lemmatizer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 960467a0b..630334bf7 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -86,13 +86,16 @@ def lemmatize(string, index, exceptions, rules): #if string in index: # forms.append(string) forms.extend(exceptions.get(string, [])) + oov_forms = [] for old, new in rules: if string.endswith(old): form = string[:len(string) - len(old)] + new if form in index or not form.isalpha(): forms.append(form) + else: + oov_forms.append(form) if not forms: - forms.append(string) + forms.extend(oov_forms) return set(forms)