Update base-form check in lemmatizer, for UD 2.0 morphology

This commit is contained in:
Matthew Honnibal 2017-03-16 17:59:31 -05:00
parent 1e10383e1b
commit c4351e1165
1 changed file with 6 additions and 0 deletions

View File

@@ -6,6 +6,7 @@ import ujson as json
from .en.lemmatizer import INDEX, EXC, RULES from .en.lemmatizer import INDEX, EXC, RULES
from .symbols import POS, NOUN, VERB, ADJ, PUNCT from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import VerbForm_inf, VerbForm_none
class Lemmatizer(object): class Lemmatizer(object):
@@ -43,10 +44,13 @@ class Lemmatizer(object):
avoid lemmatization entirely.''' avoid lemmatization entirely.'''
morphology = {} if morphology is None else morphology morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others: if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
return True return True
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others: elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
return True return True
elif true_morph_key in (VerbForm_inf, VerbForm_none):
return True
else: else:
return False return False
@@ -80,4 +84,6 @@ def lemmatize(string, index, exceptions, rules):
oov_forms.append(form) oov_forms.append(form)
if not forms: if not forms:
forms.extend(oov_forms) forms.extend(oov_forms)
if not forms:
forms.append(string)
return set(forms) return set(forms)