From 7b68f911cf882d5f2694eb7ea26eddf37b9c9070 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 01:39:13 +1100 Subject: [PATCH] * Add WordNet lemmatizer --- spacy/lemmatizer.py | 87 +++++++++++++++++++++++++ tests/{test_ner.py => depr_test_ner.py} | 0 tests/test_lemmatizer.py | 34 ++++++++++ 3 files changed, 121 insertions(+) create mode 100644 spacy/lemmatizer.py rename tests/{test_ner.py => depr_test_ner.py} (100%) create mode 100644 tests/test_lemmatizer.py diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py new file mode 100644 index 000000000..a42a5daee --- /dev/null +++ b/spacy/lemmatizer.py @@ -0,0 +1,87 @@ +from os import path + + +NOUN_RULES = ( + ('s', ''), + ('ses', 's'), + ('ves', 'f'), + ('xes', 'x'), + ('zes', 'z'), + ('ches', 'ch'), + ('shes', 'sh'), + ('men', 'man'), + ('ies', 'y') +) + + +VERB_RULES = ( + ("s", ""), + ("ies", "y"), + ("es", "e"), + ("es", ""), + ("ed", "e"), + ("ed", ""), + ("ing", "e"), + ("ing", "") +) + + +ADJ_RULES = ( + ("er", ""), + ("est", ""), + ("er", "e"), + ("est", "e") +) + + +class Lemmatizer(object): + def __init__(self, wn_dict_dir): + self.index = {} + self.exc = {} + for pos in ['adj', 'adv', 'noun', 'verb']: + self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) + self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) + + def noun(self, string): + return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) + + def verb(self, string): + return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) + + def adj(self, string): + return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) + + +def lemmatize(string, index, exceptions, rules): + forms = [] + if string in index: + forms.append(string) + forms.extend(exceptions.get(string, [])) + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if form in index: + forms.append(form) + return set(forms) + + +def read_index(loc): + 
def read_index(loc):
    """Read a WordNet ``index.<pos>`` file; return the set of single words.

    Lines starting with a space (the file's license header) are skipped,
    as are multi-word entries (those containing ``_``).
    """
    index = set()
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            word = pieces[0]
            if word.count('_') == 0:
                index.add(word)
    return index


def read_exc(loc):
    """Read a WordNet ``<pos>.exc`` file into {inflected: (lemmas...)}.

    Lines starting with a space (license header) are skipped.
    """
    exceptions = {}
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions


# The patch also renames tests/test_ner.py -> tests/depr_test_ner.py
# (100% similarity) and creates the following new file.
# --- tests/test_lemmatizer.py ---
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import DATA_DIR
from os import path

import pytest


def test_read_index():
    # Requires the WordNet data shipped under DATA_DIR.
    wn = path.join(DATA_DIR, 'wordnet')
    index = read_index(path.join(wn, 'index.noun'))
    assert 'man' in index
    assert 'plantes' not in index
    assert 'plant' in index


def test_read_exc():
    wn = path.join(DATA_DIR, 'wordnet')
    exc = read_exc(path.join(wn, 'verb.exc'))
    assert exc['was'] == ('be',)


@pytest.fixture
def lemmatizer():
    return Lemmatizer(path.join(DATA_DIR, 'wordnet'))


def test_noun_lemmas(lemmatizer):
    do = lemmatizer.noun

    assert do('aardwolves') == set(['aardwolf'])
    assert do('aardwolf') == set(['aardwolf'])
    assert do('planets') == set(['planet'])
    assert do('ring') == set(['ring'])
    assert do('axes') == set(['axis', 'axe', 'ax'])