Merge pull request #866 from juanmirocks/master

Fix lemmatization of OOV words
This commit is contained in:
Matthew Honnibal 2017-03-16 23:37:36 +01:00 committed by GitHub
commit fea9fe08af
2 changed files with 14 additions and 1 deletions

View File

@ -70,11 +70,14 @@ def lemmatize(string, index, exceptions, rules):
#if string in index: #if string in index:
# forms.append(string) # forms.append(string)
forms.extend(exceptions.get(string, [])) forms.extend(exceptions.get(string, []))
oov_forms = []
for old, new in rules: for old, new in rules:
if string.endswith(old): if string.endswith(old):
form = string[:len(string) - len(old)] + new form = string[:len(string) - len(old)] + new
if form in index or not form.isalpha(): if form in index or not form.isalpha():
forms.append(form) forms.append(form)
else:
oov_forms.append(form)
if not forms: if not forms:
forms.append(string) forms.extend(oov_forms)
return set(forms) return set(forms)

View File

@ -0,0 +1,10 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
# Note: "chromosomes" already worked before the bug fix
@pytest.mark.parametrize(
    'word,lemmas',
    [
        ("chromosomes", ["chromosome"]),
        ("endosomes", ["endosome"]),
        ("colocalizes", ["colocalize", "colocaliz"]),
    ]
)
def test_issue781(path, lemmatizer, word, lemmas):
    """Regression test for issue 781.

    Calls the ``lemmatizer`` fixture on ``word`` as a plural noun and checks
    that the returned value equals the set built from ``lemmas``.

    ``path`` is requested as a fixture but is not used in the test body.
    """
    # Lemmatize as a noun, passing plural-number morphology.
    result = lemmatizer(word, 'noun', morphology={'number': 'plur'})
    # Compare as a set: order and duplicates in ``lemmas`` do not matter.
    expected = set(lemmas)
    assert result == expected