mirror of https://github.com/explosion/spaCy.git
Prevent lemmatization of base nouns
Update lemmatizer's base-form check, for change in morphology class. Closes #903.
This commit is contained in:
parent
850d35dcb3
commit
4f400fa486
|
@ -44,18 +44,16 @@ class Lemmatizer(object):
|
|||
def is_base_form(self, univ_pos, morphology=None):
    '''Check whether we're dealing with an uninflected paradigm, so we can
    avoid lemmatization entirely.

    univ_pos: Coarse part-of-speech name; compared against 'noun', 'verb'
        and 'adj'.
    morphology: Optional mapping of morphological feature names to values.
        None is treated as an empty mapping.

    Returns True when any of the following holds, False otherwise:
      * a noun whose 'Number' is 'sing' and which has no other features;
      * a verb whose 'VerbForm' is 'inf' and which has no other features;
      * an adjective whose 'Degree' is 'pos';
      * the value stored under 'morph' (default 0) is one of VerbForm_inf,
        VerbForm_none, Number_sing or Degree_pos.
    '''
    morphology = {} if morphology is None else morphology
    # Features that do not count against base-form status. The lookups below
    # use the capitalised names ('Number', 'VerbForm'), so those must be
    # ignorable here as well -- otherwise a mapping holding only that feature
    # would always have "other" features and the noun/verb branches could
    # never fire. The lower-case spellings are kept for backward
    # compatibility with mappings that still use them.
    ignorable = ('Number', 'VerbForm', 'number', 'pos', 'verbform')
    others = [key for key in morphology
              if key not in ignorable and key != POS]
    true_morph_key = morphology.get('morph', 0)
    if univ_pos == 'noun' and morphology.get('Number') == 'sing' and not others:
        return True
    elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf' and not others:
        return True
    elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
        return True
    elif true_morph_key in (VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
        return True
    else:
        return False
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from ...tokens import Doc
|
||||
|
||||
|
||||
@pytest.mark.parametrize('text,tag,lemma',
                         [("anus", "NN", "anus"),
                          ("princess", "NN", "princess")])
def test_issue912(en_vocab, text, tag, lemma):
    '''Test base-forms of nouns are preserved.

    Each case is a single word tagged "NN" whose expected lemma is the word
    itself.
    '''
    # NOTE(review): the commit message for this change cites issue #903,
    # while this function is named for issue 912 -- confirm which is intended.
    doc = Doc(en_vocab, words=[text])
    # The tag is assigned before the lemma is read.
    doc[0].tag_ = tag
    assert doc[0].lemma_ == lemma
|
||||
|
Loading…
Reference in New Issue