mirror of https://github.com/explosion/spaCy.git
Merge pull request #866 from juanmirocks/master
Fix lemmatization of OOV words
This commit is contained in:
commit
fea9fe08af
|
@ -70,11 +70,14 @@ def lemmatize(string, index, exceptions, rules):
|
|||
#if string in index:
|
||||
# forms.append(string)
|
||||
forms.extend(exceptions.get(string, []))
|
||||
oov_forms = []
|
||||
for old, new in rules:
|
||||
if string.endswith(old):
|
||||
form = string[:len(string) - len(old)] + new
|
||||
if form in index or not form.isalpha():
|
||||
forms.append(form)
|
||||
else:
|
||||
oov_forms.append(form)
|
||||
if not forms:
|
||||
forms.append(string)
|
||||
forms.extend(oov_forms)
|
||||
return set(forms)
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Note: "chromosomes" already worked before the bug fix
|
||||
@pytest.mark.parametrize(
    'word,lemmas',
    [
        ("chromosomes", ["chromosome"]),
        ("endosomes", ["endosome"]),
        ("colocalizes", ["colocalize", "colocaliz"]),
    ],
)
def test_issue781(path, lemmatizer, word, lemmas):
    """Check the lemmas produced for plural nouns (issue 781).

    Each parametrized case pairs a plural word with the complete
    collection of lemmas the ``lemmatizer`` fixture must return for it
    when called as a noun with ``number: plur`` morphology. The result
    is compared for exact set equality, so extra or missing lemmas both
    fail the test.

    The ``path`` fixture is requested but not used in the body.
    """
    predicted = lemmatizer(word, 'noun', morphology={'number': 'plur'})
    expected = set(lemmas)
    assert predicted == expected
|
Loading…
Reference in New Issue