mirror of https://github.com/explosion/spaCy.git
Block lemmatization of base-form adjectives
Fix the check that an adjective is in its base form (as opposed to a comparative or superlative), so that base-form adjectives are not lemmatized, e.g. "inner" is no longer reduced to "inn". Closes #912.
This commit is contained in:
parent
97814f8da6
commit
4454c1b23f
|
@ -7,6 +7,8 @@ import ujson as json
|
||||||
from .en.lemmatizer import INDEX, EXC, RULES
|
from .en.lemmatizer import INDEX, EXC, RULES
|
||||||
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
||||||
from .symbols import VerbForm_inf, VerbForm_none
|
from .symbols import VerbForm_inf, VerbForm_none
|
||||||
|
from .symbols import Number_sing
|
||||||
|
from .symbols import Degree_pos
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer(object):
|
class Lemmatizer(object):
|
||||||
|
@ -42,6 +44,7 @@ class Lemmatizer(object):
|
||||||
def is_base_form(self, univ_pos, morphology=None):
|
def is_base_form(self, univ_pos, morphology=None):
|
||||||
'''Check whether we're dealing with an uninflected paradigm, so we can
|
'''Check whether we're dealing with an uninflected paradigm, so we can
|
||||||
avoid lemmatization entirely.'''
|
avoid lemmatization entirely.'''
|
||||||
|
print("Is base form?", univ_pos, morphology)
|
||||||
morphology = {} if morphology is None else morphology
|
morphology = {} if morphology is None else morphology
|
||||||
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
||||||
true_morph_key = morphology.get('morph', 0)
|
true_morph_key = morphology.get('morph', 0)
|
||||||
|
@ -49,7 +52,10 @@ class Lemmatizer(object):
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
|
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
|
||||||
return True
|
return True
|
||||||
elif true_morph_key in (VerbForm_inf, VerbForm_none):
|
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
|
||||||
|
return True
|
||||||
|
elif true_morph_key in \
|
||||||
|
(VerbForm_inf, VerbForm_none, Number_sing, Degree_pos):
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from ...tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text,tag,lemma', [("inner", "JJ", "inner")])
|
||||||
|
def test_issue912(en_vocab, text, tag, lemma):
|
||||||
|
'''Test base-forms of adjectives are preserved.'''
|
||||||
|
doc = Doc(en_vocab, words=[text])
|
||||||
|
doc[0].tag_ = tag
|
||||||
|
assert doc[0].lemma_ == lemma
|
||||||
|
|
Loading…
Reference in New Issue