From 7b68f911cf882d5f2694eb7ea26eddf37b9c9070 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 8 Dec 2014 01:39:13 +1100 Subject: [PATCH] * Add WordNet lemmatizer --- spacy/lemmatizer.py | 87 +++++++++++++++++++++++++ tests/{test_ner.py => depr_test_ner.py} | 0 tests/test_lemmatizer.py | 34 ++++++++++ 3 files changed, 121 insertions(+) create mode 100644 spacy/lemmatizer.py rename tests/{test_ner.py => depr_test_ner.py} (100%) create mode 100644 tests/test_lemmatizer.py diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py new file mode 100644 index 000000000..a42a5daee --- /dev/null +++ b/spacy/lemmatizer.py @@ -0,0 +1,87 @@ +from os import path + + +NOUN_RULES = ( + ('s', ''), + ('ses', 's'), + ('ves', 'f'), + ('xes', 'x'), + ('zes', 'z'), + ('ches', 'ch'), + ('shes', 'sh'), + ('men', 'man'), + ('ies', 'y') +) + + +VERB_RULES = ( + ("s", ""), + ("ies", "y"), + ("es", "e"), + ("es", ""), + ("ed", "e"), + ("ed", ""), + ("ing", "e"), + ("ing", "") +) + + +ADJ_RULES = ( + ("er", ""), + ("est", ""), + ("er", "e"), + ("est", "e") +) + + +class Lemmatizer(object): + def __init__(self, wn_dict_dir): + self.index = {} + self.exc = {} + for pos in ['adj', 'adv', 'noun', 'verb']: + self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) + self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) + + def noun(self, string): + return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) + + def verb(self, string): + return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) + + def adj(self, string): + return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) + + +def lemmatize(string, index, exceptions, rules): + forms = [] + if string in index: + forms.append(string) + forms.extend(exceptions.get(string, [])) + for old, new in rules: + if string.endswith(old): + form = string[:len(string) - len(old)] + new + if form in index: + forms.append(form) + return set(forms) + + +def read_index(loc): + 
def read_index(loc):
    """Read a WordNet ``index.<pos>`` file; return the set of single words.

    Lines starting with a space (the file's license header) are skipped,
    as are multi-word entries (those containing ``_``).
    """
    index = set()
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            word = pieces[0]
            if word.count('_') == 0:
                index.add(word)
    return index


def read_exc(loc):
    """Read a WordNet ``<pos>.exc`` file into {inflected: (lemmas...)}.

    Lines starting with a space (license header) are skipped.
    """
    exceptions = {}
    # 'with' guarantees the handle is closed; the original leaked it.
    with open(loc) as file_:
        for line in file_:
            if line.startswith(' '):
                continue
            pieces = line.split()
            exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions


# The patch also renames tests/test_ner.py -> tests/depr_test_ner.py
# (100% similarity) and creates the following new file.
# --- tests/test_lemmatizer.py ---
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
from spacy.util import DATA_DIR
from os import path

import pytest


def test_read_index():
    # Requires the WordNet data shipped under DATA_DIR.
    wn = path.join(DATA_DIR, 'wordnet')
    index = read_index(path.join(wn, 'index.noun'))
    assert 'man' in index
    assert 'plantes' not in index
    assert 'plant' in index


def test_read_exc():
    wn = path.join(DATA_DIR, 'wordnet')
    exc = read_exc(path.join(wn, 'verb.exc'))
    assert exc['was'] == ('be',)


@pytest.fixture
def lemmatizer():
    return Lemmatizer(path.join(DATA_DIR, 'wordnet'))


def test_noun_lemmas(lemmatizer):
    do = lemmatizer.noun

    assert do('aardwolves') == set(['aardwolf'])
    assert do('aardwolf') == set(['aardwolf'])
    assert do('planets') == set(['planet'])
    assert do('ring') == set(['ring'])
    assert do('axes') == set(['axis', 'axe', 'ax'])