* Generalize lemmatizer

This commit is contained in:
Matthew Honnibal 2015-08-25 15:46:19 +02:00
parent 8083a07c3e
commit 82217c6ec6
1 changed file with 5 additions and 36 deletions

View File

@ -3,39 +3,6 @@ from os import path
import codecs import codecs
# Rule table handed to lemmatize() as its `rules` argument for nouns.
# Each entry is a pair of strings; presumably (suffix to strip, replacement
# suffix) -- confirm against lemmatize(). The pairs look like WordNet's
# "morphy" detachment rules, but that is not verifiable from this file alone.
# Entry order is kept exactly as originally written.
NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y'),
)
# Rule table handed to lemmatize() as its `rules` argument for verbs.
# Each entry is a pair of strings; presumably (suffix to strip, replacement
# suffix) -- confirm against lemmatize(). Some suffixes appear twice with
# different replacements (e.g. "es" -> "e" and "es" -> ""), so both
# candidates are listed. Entry order is kept exactly as originally written.
VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", ""),
)
# Rule table handed to lemmatize() as its `rules` argument for adjectives.
# Each entry is a pair of strings; presumably (suffix to strip, replacement
# suffix) -- confirm against lemmatize(). Both the bare-stem and the
# "+e" variants are listed for "er" and "est". Entry order is kept exactly
# as originally written.
ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e"),
)
class Lemmatizer(object): class Lemmatizer(object):
def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id): def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
self.noun_id = noun_id self.noun_id = noun_id
@ -48,6 +15,8 @@ class Lemmatizer(object):
self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
def __call__(self, string, pos): def __call__(self, string, pos):
return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
if pos == self.noun_id: if pos == self.noun_id:
return self.noun(string) return self.noun(string)
elif pos == self.verb_id: elif pos == self.verb_id:
@ -58,13 +27,13 @@ class Lemmatizer(object):
raise Exception("Cannot lemmatize with unknown pos: %s" % pos) raise Exception("Cannot lemmatize with unknown pos: %s" % pos)
def noun(self, string): def noun(self, string):
return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) return self(string, 'noun')
def verb(self, string): def verb(self, string):
return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) return self(string, 'verb')
def adj(self, string): def adj(self, string):
return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) return self(string, 'adj')
def lemmatize(string, index, exceptions, rules): def lemmatize(string, index, exceptions, rules):