mirror of https://github.com/explosion/spaCy.git
Update base-form check in lemmatizer, for UD 2.0 morphology
This commit is contained in:
parent 1e10383e1b
commit c4351e1165
|
@ -6,6 +6,7 @@ import ujson as json
|
||||||
|
|
||||||
from .en.lemmatizer import INDEX, EXC, RULES
|
from .en.lemmatizer import INDEX, EXC, RULES
|
||||||
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
|
||||||
|
from .symbols import VerbForm_inf, VerbForm_none
|
||||||
|
|
||||||
|
|
||||||
class Lemmatizer(object):
|
class Lemmatizer(object):
|
||||||
|
@ -43,10 +44,13 @@ class Lemmatizer(object):
|
||||||
avoid lemmatization entirely.'''
|
avoid lemmatization entirely.'''
|
||||||
morphology = {} if morphology is None else morphology
|
morphology = {} if morphology is None else morphology
|
||||||
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
||||||
|
true_morph_key = morphology.get('morph', 0)
|
||||||
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
|
if univ_pos == 'noun' and morphology.get('number') == 'sing' and not others:
|
||||||
return True
|
return True
|
||||||
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
|
elif univ_pos == 'verb' and morphology.get('verbform') == 'inf' and not others:
|
||||||
return True
|
return True
|
||||||
|
elif true_morph_key in (VerbForm_inf, VerbForm_none):
|
||||||
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@ -80,4 +84,6 @@ def lemmatize(string, index, exceptions, rules):
|
||||||
oov_forms.append(form)
|
oov_forms.append(form)
|
||||||
if not forms:
|
if not forms:
|
||||||
forms.extend(oov_forms)
|
forms.extend(oov_forms)
|
||||||
|
if not forms:
|
||||||
|
forms.append(string)
|
||||||
return set(forms)
|
return set(forms)
|
||||||
|
|
Loading…
Reference in New Issue