mirror of https://github.com/explosion/spaCy.git
Use new Lemmatizer data and remove file import
Since there's currently only an English lemmatizer, the global Lemmatizer imports from spacy.en. This is not ideal and still needs to be fixed.
parent 0957737ee8
commit 1da29a7146
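Concretely, the dependency now points from the language-independent module into the English subpackage. A minimal sketch of the direction this creates (the import is the one added in the diff below; the suggested future shape is hypothetical):

    # spacy/lemmatizer.py is language-independent, yet now imports English data:
    from spacy.en.lemmatizer import INDEX, EXC, RULES

    # Once other languages ship lemmatizer data, the cleaner direction would be
    # for each language to hand its own tables to the shared class, e.g.:
    #     Lemmatizer(de_index, de_exc, de_rules)  # hypothetical, not in this commit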
spacy/lemmatizer.py
@@ -4,32 +4,16 @@ import pathlib
 
 import ujson as json
 
+from .en.lemmatizer import INDEX, EXC, RULES
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
 
 
 class Lemmatizer(object):
     @classmethod
     def load(cls, path, rules=None):
-        index = {}
-        exc = {}
-        for pos in ['adj', 'noun', 'verb']:
-            pos_index_path = path / 'wordnet' / 'index.{pos}'.format(pos=pos)
-            if pos_index_path.exists():
-                with pos_index_path.open() as file_:
-                    index[pos] = read_index(file_)
-            else:
-                index[pos] = set()
-            pos_exc_path = path / 'wordnet' / '{pos}.exc'.format(pos=pos)
-            if pos_exc_path.exists():
-                with pos_exc_path.open() as file_:
-                    exc[pos] = read_exc(file_)
-            else:
-                exc[pos] = {}
-        if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
-            with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
-                rules = json.load(file_)
-        elif rules is None:
-            rules = {}
+        index = dict(INDEX)
+        exc = dict(EXC)
+        rules = dict(RULES)
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
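With the file parsing gone, load() just copies three module-level objects from spacy.en.lemmatizer. A minimal sketch of the shapes those objects presumably have, inferred from how the old loop and lemmatize() consume them; the sample entries are illustrative, not the real data:

    from spacy.lemmatizer import Lemmatizer

    # Assumed shapes, keyed by coarse POS like the old per-file loop:
    INDEX = {'adj': set(), 'noun': {'man', 'plant'}, 'verb': {'be'}}            # known lemmas
    EXC = {'adj': {}, 'noun': {'geese': ('goose',)}, 'verb': {'was': ('be',)}}  # irregular forms
    RULES = {'adj': [], 'noun': [['s', '']], 'verb': [['ing', '']]}             # suffix rewrites

    # The new load() is now equivalent to:
    lemmatizer = Lemmatizer(dict(INDEX), dict(EXC), dict(RULES))

Note that both arguments of load() are now ignored: path is never touched and the rules keyword is overwritten by dict(RULES), so existing call sites keep working but the WordNet files and lemma_rules.json are no longer read.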
@@ -94,25 +78,3 @@ def lemmatize(string, index, exceptions, rules):
     if not forms:
         forms.append(string)
     return set(forms)
-
-
-def read_index(fileobj):
-    index = set()
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        word = pieces[0]
-        if word.count('_') == 0:
-            index.add(word)
-    return index
-
-
-def read_exc(fileobj):
-    exceptions = {}
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        exceptions[pieces[0]] = tuple(pieces[1:])
-    return exceptions
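For context on what the deleted helpers parsed: the WordNet index and exception files are plain text with a space-indented license header, which the startswith(' ') check skips. A rough sketch with illustrative file fragments:

    import io

    # Fragment shaped like WordNet's index.noun: lemma first, details after.
    index_noun = io.StringIO(
        '  1 This software and database... (license header, skipped)\n'
        'man n 1 ...\n'
        'plant n 3 ...\n'
        'sea_urchin n 1 ...\n'  # entries containing '_' were dropped
    )
    # Fragment shaped like verb.exc: inflected form, then its lemma(s).
    verb_exc = io.StringIO('was be\nate eat\n')

    # The removed helpers would have produced:
    #   read_index(index_noun) -> {'man', 'plant'}
    #   read_exc(verb_exc)     -> {'was': ('be',), 'ate': ('eat',)}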
spacy/tests/tagger/test_lemmatizer.py
@@ -1,8 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...lemmatizer import read_index, read_exc
-
 import pytest
 
 
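Dropping the helpers is also a small breaking change for any code that imported them, which is exactly what this test module has to stop doing:

    # After this commit, this import raises ImportError:
    from spacy.lemmatizer import read_index, read_exc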
@@ -41,25 +39,6 @@ def test_tagger_lemmatizer_punct(lemmatizer):
     assert lemmatizer.punct('“') == set(['"'])
 
 
-@pytest.mark.models
-def test_tagger_lemmatizer_read_index(path):
-    if path is not None:
-        with (path / 'wordnet' / 'index.noun').open() as file_:
-            index = read_index(file_)
-        assert 'man' in index
-        assert 'plantes' not in index
-        assert 'plant' in index
-
-
-@pytest.mark.models
-@pytest.mark.parametrize('text,lemma', [("was", "be")])
-def test_tagger_lemmatizer_read_exc(path, text, lemma):
-    if path is not None:
-        with (path / 'wordnet' / 'verb.exc').open() as file_:
-            exc = read_exc(file_)
-        assert exc[text] == (lemma,)
-
-
 @pytest.mark.models
 def test_tagger_lemmatizer_lemma_assignment(EN):
     text = "Bananas in pyjamas are geese."
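The tests for the file readers go away with the readers; what remains is gated on an installed model via the models marker. Assuming the file lives at spacy/tests/tagger/test_lemmatizer.py (inferred from the relative import above), the surviving tests can be selected with pytest's marker filter:

    python -m pytest spacy/tests/tagger/test_lemmatizer.py -m models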