Update base-form check in lemmatizer, for UD 2.0 morphology

This commit is contained in:
Matthew Honnibal 2017-03-16 17:59:31 -05:00
parent 1e10383e1b
commit c4351e1165
1 changed file with 6 additions and 0 deletions

View File

@@ -6,6 +6,7 @@ import ujson as json
from .en.lemmatizer import INDEX, EXC, RULES
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import VerbForm_inf, VerbForm_none
class Lemmatizer(object):
@@ -43,10 +44,13 @@ class Lemmatizer(object):
avoid lemmatization entirely.'''
morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
return True
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
return True
elif true_morph_key in (VerbForm_inf, VerbForm_none):
return True
else:
return False
@@ -80,4 +84,6 @@ def lemmatize(string, index, exceptions, rules):
oov_forms.append(form)
if not forms:
forms.extend(oov_forms)
if not forms:
forms.append(string)
return set(forms)