Use new Lemmatizer data and remove file import

Since there's currently only an English lemmatizer, the global
Lemmatizer imports from spacy.en. This is not ideal and still needs to
be fixed.
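
As a quick sketch of the effect of this change (illustrative only, mirroring the diff below): load() now copies the packaged English tables instead of reading WordNet files from disk, so the path argument goes unused. The comments on the table shapes are inferred from how the old, file-based code built them, keyed by part of speech.

from spacy.en.lemmatizer import INDEX, EXC, RULES

index = dict(INDEX)   # known base forms, keyed by part of speech
exc = dict(EXC)       # irregular forms, e.g. 'was' mapping to ('be',)
rules = dict(RULES)   # suffix rewrite rules, e.g. '-ies' to '-y' for nouns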
ines 2017-03-12 13:48:27 +01:00
parent 0957737ee8
commit 1da29a7146
2 changed files with 4 additions and 63 deletions

spacy/lemmatizer.py (View File)

@@ -4,32 +4,16 @@ import pathlib
 
 import ujson as json
 
+from .en.lemmatizer import INDEX, EXC, RULES
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
 
 
 class Lemmatizer(object):
     @classmethod
     def load(cls, path, rules=None):
-        index = {}
-        exc = {}
-        for pos in ['adj', 'noun', 'verb']:
-            pos_index_path = path / 'wordnet' / 'index.{pos}'.format(pos=pos)
-            if pos_index_path.exists():
-                with pos_index_path.open() as file_:
-                    index[pos] = read_index(file_)
-            else:
-                index[pos] = set()
-            pos_exc_path = path / 'wordnet' / '{pos}.exc'.format(pos=pos)
-            if pos_exc_path.exists():
-                with pos_exc_path.open() as file_:
-                    exc[pos] = read_exc(file_)
-            else:
-                exc[pos] = {}
-        if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
-            with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
-                rules = json.load(file_)
-        elif rules is None:
-            rules = {}
+        index = dict(INDEX)
+        exc = dict(EXC)
+        rules = dict(RULES)
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
@@ -94,25 +78,3 @@ def lemmatize(string, index, exceptions, rules):
     if not forms:
         forms.append(string)
     return set(forms)
-
-
-def read_index(fileobj):
-    index = set()
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        word = pieces[0]
-        if word.count('_') == 0:
-            index.add(word)
-    return index
-
-
-def read_exc(fileobj):
-    exceptions = {}
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        exceptions[pieces[0]] = tuple(pieces[1:])
-    return exceptions
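
For reference, here is how the two deleted helpers behaved on the WordNet formats they parsed. The function bodies are copied from the removed lines above; the sample data is invented for illustration (real index.noun and verb.exc files follow the same shape, with a whitespace-indented license header).

import io

def read_index(fileobj):
    # copied from the code removed above
    index = set()
    for line in fileobj:
        if line.startswith(' '):    # skip the WordNet license header
            continue
        pieces = line.split()
        word = pieces[0]
        if word.count('_') == 0:    # skip multiword entries
            index.add(word)
    return index

def read_exc(fileobj):
    # copied from the code removed above
    exceptions = {}
    for line in fileobj:
        if line.startswith(' '):
            continue
        pieces = line.split()
        exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions

index = read_index(io.StringIO(
    "  1 This software and database... (license header, skipped)\n"
    "man n 5\n"
    "plant n 3\n"
    "sea_urchin n 1\n"))
assert index == {'man', 'plant'}

exc = read_exc(io.StringIO("was be\n"))
assert exc['was'] == ('be',)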

spacy/tests/tagger/test_lemmatizer.py (View File)

@@ -1,8 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...lemmatizer import read_index, read_exc
-
 import pytest
@@ -41,25 +39,6 @@ def test_tagger_lemmatizer_punct(lemmatizer):
     assert lemmatizer.punct('“') == set(['"'])
 
 
-@pytest.mark.models
-def test_tagger_lemmatizer_read_index(path):
-    if path is not None:
-        with (path / 'wordnet' / 'index.noun').open() as file_:
-            index = read_index(file_)
-        assert 'man' in index
-        assert 'plantes' not in index
-        assert 'plant' in index
-
-
-@pytest.mark.models
-@pytest.mark.parametrize('text,lemma', [("was", "be")])
-def test_tagger_lemmatizer_read_exc(path, text, lemma):
-    if path is not None:
-        with (path / 'wordnet' / 'verb.exc').open() as file_:
-            exc = read_exc(file_)
-        assert exc[text] == (lemma,)
-
-
 @pytest.mark.models
 def test_tagger_lemmatizer_lemma_assignment(EN):
     text = "Bananas in pyjamas are geese."
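
The two removed tests covered only the deleted file readers; the surviving punct and lemma-assignment tests still exercise the data-driven path. A minimal sanity check along those lines, assuming the module lives at spacy/lemmatizer.py as the relative imports suggest:

from spacy.lemmatizer import Lemmatizer

# path is never touched by the new load(), so None is fine here
lemmatizer = Lemmatizer.load(path=None)

# same check as the punct test kept above
assert lemmatizer.punct('“') == set(['"'])

One side effect visible in the diff: the new load() also overwrites a caller-supplied rules argument with dict(RULES) unconditionally.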