Block lemmatization of base-form adjectives

Fix the check that an adjective is a base form (as opposed to a
comparative or superlative), so that base forms are not lemmatized,
e.g. inner -!> inn. Closes #912.
This commit is contained in:
Matthew Honnibal 2017-03-25 21:29:57 +01:00
parent 97814f8da6
commit 4454c1b23f
2 changed files with 21 additions and 1 deletions

View File

@ -7,6 +7,8 @@ import ujson as json
from .en.lemmatizer import INDEX, EXC, RULES from .en.lemmatizer import INDEX, EXC, RULES
from .symbols import POS, NOUN, VERB, ADJ, PUNCT from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import VerbForm_inf, VerbForm_none from .symbols import VerbForm_inf, VerbForm_none
from .symbols import Number_sing
from .symbols import Degree_pos
class Lemmatizer(object): class Lemmatizer(object):
@ -42,6 +44,7 @@ class Lemmatizer(object):
def is_base_form(self, univ_pos, morphology=None): def is_base_form(self, univ_pos, morphology=None):
'''Check whether we're dealing with an uninflected paradigm, so we can '''Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.''' avoid lemmatization entirely.'''
print("Is base form?", univ_pos, morphology)
morphology = {} if morphology is None else morphology morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
true_morph_key = morphology.get('morph', 0) true_morph_key = morphology.get('morph', 0)
@ -49,7 +52,10 @@ class Lemmatizer(object):
return True return True
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others: elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
return True return True
elif true_morph_key in (VerbForm_inf, VerbForm_none): elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
return True
elif true_morph_key in \
(VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
return True return True
else: else:
return False return False

View File

@ -0,0 +1,14 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ...tokens import Doc
@pytest.mark.parametrize('text,tag,lemma', [("inner", "JJ", "inner")])
def test_issue912(en_vocab, text, tag, lemma):
    '''Regression test for issue #912.

    Builds a one-word Doc from `text`, assigns `tag` to its only token and
    checks that the token's lemma equals `lemma`. For the parametrized case
    the expected lemma is the word itself ("inner" tagged "JJ").
    '''
    # Single-token document; the tag is set by hand on that token.
    token = Doc(en_vocab, words=[text])[0]
    token.tag_ = tag
    assert token.lemma_ == lemma