Fix lemmatizer rules

This commit is contained in:
Matthew Honnibal 2017-09-06 19:13:24 +02:00
parent dd9cab0faf
commit 5c3ff06924
1 changed file with 6 additions and 2 deletions

View File

@ -25,6 +25,7 @@ class Lemmatizer(object):
elif univ_pos == PUNCT: elif univ_pos == PUNCT:
univ_pos = 'punct' univ_pos = 'punct'
# See Issue #435 for example of where this logic is required. # See Issue #435 for example of where this logic is required.
print("Check base form", string)
if self.is_base_form(univ_pos, morphology): if self.is_base_form(univ_pos, morphology):
return set([string.lower()]) return set([string.lower()])
lemmas = lemmatize(string, self.index.get(univ_pos, {}), lemmas = lemmatize(string, self.index.get(univ_pos, {}),
@ -38,7 +39,8 @@ class Lemmatizer(object):
avoid lemmatization entirely. avoid lemmatization entirely.
""" """
morphology = {} if morphology is None else morphology morphology = {} if morphology is None else morphology
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] others = [key for key in morphology
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
true_morph_key = morphology.get('morph', 0) true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('Number') == 'sing': if univ_pos == 'noun' and morphology.get('Number') == 'sing':
return True return True
@ -47,7 +49,9 @@ class Lemmatizer(object):
# This maps 'VBP' to base form -- probably just need 'IS_BASE' # This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology # morphology
elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \ elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
morphology.get('Tense') == 'pres'): morphology.get('Tense') == 'pres' and \
morphology.get('Number') is None and \
not others):
return True return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos': elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
return True return True