mirror of https://github.com/explosion/spaCy.git
* Add WordNet lemmatizer
This commit is contained in:
parent
c20dd79748
commit
7b68f911cf
|
@ -0,0 +1,87 @@
|
|||
from os import path
|
||||
|
||||
|
||||
# Suffix-detachment rules for nouns, following WordNet's "morphy" algorithm:
# each (suffix, replacement) pair proposes a candidate lemma, which is only
# kept if it appears in the WordNet noun index (see lemmatize()).
NOUN_RULES = (
    ('s', ''),
    ('ses', 's'),
    ('ves', 'f'),
    ('xes', 'x'),
    ('zes', 'z'),
    ('ches', 'ch'),
    ('shes', 'sh'),
    ('men', 'man'),
    ('ies', 'y')
)
|
||||
|
||||
|
||||
# Suffix-detachment rules for verbs (WordNet morphy). Several suffixes map to
# more than one replacement (e.g. "ed" -> "e" or "") so that both candidate
# stems are tried against the index.
VERB_RULES = (
    ("s", ""),
    ("ies", "y"),
    ("es", "e"),
    ("es", ""),
    ("ed", "e"),
    ("ed", ""),
    ("ing", "e"),
    ("ing", "")
)
|
||||
|
||||
|
||||
# Suffix-detachment rules for adjectives (WordNet morphy): strip comparative
# and superlative endings, trying both the bare stem and the stem + "e".
ADJ_RULES = (
    ("er", ""),
    ("est", ""),
    ("er", "e"),
    ("est", "e")
)
|
||||
|
||||
|
||||
class Lemmatizer(object):
    """Lemmatize English words using WordNet's index and exception files.

    Candidate lemmas are produced per part-of-speech from the per-POS
    exception lists plus suffix-detachment rules (WordNet's "morphy"
    algorithm), and kept only if present in the corresponding index.
    """
    def __init__(self, wn_dict_dir):
        """Load ``index.<pos>`` and ``<pos>.exc`` files from *wn_dict_dir*.

        wn_dict_dir: path to a WordNet ``dict`` directory containing the
        index and exception files for adj, adv, noun and verb.
        """
        self.index = {}
        self.exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))

    def noun(self, string):
        """Return the set of noun lemmas for *string*."""
        return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES)

    def verb(self, string):
        """Return the set of verb lemmas for *string*."""
        return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES)

    def adj(self, string):
        """Return the set of adjective lemmas for *string*."""
        return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES)

    def adv(self, string):
        """Return the set of adverb lemmas for *string*.

        The 'adv' data was already loaded in __init__ but had no accessor.
        WordNet defines no suffix rules for adverbs, so only the index and
        the exception list apply.
        """
        return lemmatize(string, self.index['adv'], self.exc['adv'], ())
|
||||
|
||||
|
||||
def lemmatize(string, index, exceptions, rules):
    """Return the set of candidate lemmas for *string*.

    A candidate is accepted if: the string itself is in *index*; it is
    listed for the string in *exceptions*; or it results from applying a
    (suffix, replacement) pair from *rules* and appears in *index*.
    """
    forms = set()
    if string in index:
        forms.add(string)
    forms.update(exceptions.get(string, []))
    for suffix, replacement in rules:
        if not string.endswith(suffix):
            continue
        candidate = string[:len(string) - len(suffix)] + replacement
        if candidate in index:
            forms.add(candidate)
    return forms
|
||||
|
||||
|
||||
def read_index(loc):
    """Parse a WordNet ``index.<pos>`` file into a set of single words.

    Lines starting with a space (the license header) are skipped, as are
    multi-word entries (WordNet joins collocations with '_').

    loc: path to the index file.
    Returns: set of lemma strings.
    """
    index = set()
    # Use a context manager so the file handle is always closed (the
    # original left it open until garbage collection).
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            # Guard against blank lines, which previously raised IndexError.
            if not pieces:
                continue
            word = pieces[0]
            if word.count('_') == 0:
                index.add(word)
    return index
|
||||
|
||||
|
||||
def read_exc(loc):
    """Parse a WordNet ``<pos>.exc`` exception file.

    Each line maps an irregular inflected form to one or more lemmas,
    e.g. ``was be``. Lines starting with a space (license header) are
    skipped.

    loc: path to the exception file.
    Returns: dict mapping form -> tuple of lemma strings.
    """
    exceptions = {}
    # Use a context manager so the file handle is always closed (the
    # original left it open until garbage collection).
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            # Guard against blank lines, which previously raised IndexError
            # via pieces[0] on an empty split.
            if not pieces:
                continue
            exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions
|
|
@ -0,0 +1,34 @@
|
|||
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
|
||||
from spacy.util import DATA_DIR
|
||||
from os import path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def test_read_index():
    # The noun index should contain single-word lemmas only: real lemmas
    # are present, inflected forms and absent words are not.
    wordnet_dir = path.join(DATA_DIR, 'wordnet')
    index = read_index(path.join(wordnet_dir, 'index.noun'))
    for lemma in ('man', 'plant'):
        assert lemma in index
    assert 'plantes' not in index
|
||||
|
||||
|
||||
def test_read_exc():
    # The verb exception file maps irregular forms to tuples of lemmas.
    wordnet_dir = path.join(DATA_DIR, 'wordnet')
    exceptions = read_exc(path.join(wordnet_dir, 'verb.exc'))
    assert exceptions['was'] == ('be',)
|
||||
|
||||
|
||||
@pytest.fixture
def lemmatizer():
    # Shared Lemmatizer instance backed by the bundled WordNet data.
    wordnet_dir = path.join(DATA_DIR, 'wordnet')
    return Lemmatizer(wordnet_dir)
|
||||
|
||||
|
||||
def test_noun_lemmas(lemmatizer):
    # Exercise the noun lemmatizer on regular plurals, irregulars handled
    # by the exception list, words already in lemma form, and an ambiguous
    # form with several valid lemmas.
    do = lemmatizer.noun

    cases = [
        ('aardwolves', set(['aardwolf'])),
        ('aardwolf', set(['aardwolf'])),
        ('planets', set(['planet'])),
        ('ring', set(['ring'])),
        ('axes', set(['axis', 'axe', 'ax'])),
    ]
    for word, expected in cases:
        assert do(word) == expected
|
Loading…
Reference in New Issue