Use new Lemmatizer data and remove file import

Since there's currently only an English lemmatizer, the global
Lemmatizer imports from spacy.en. This is not ideal and still needs to
be fixed.
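
As a quick sketch of the effect of this change (illustrative only, mirroring the diff below): load() now copies the packaged English tables instead of reading WordNet files from disk, so the path argument goes unused. The comments on the table shapes are inferred from how the old, file-based code built them, keyed by part of speech.

from spacy.en.lemmatizer import INDEX, EXC, RULES

index = dict(INDEX)   # known base forms, keyed by part of speech
exc = dict(EXC)       # irregular forms, e.g. 'was' mapping to ('be',)
rules = dict(RULES)   # suffix rewrite rules, e.g. '-ies' to '-y' for nouns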
ines 2017-03-12 13:48:27 +01:00
parent 0957737ee8
commit 1da29a7146
2 changed files with 4 additions and 63 deletions

spacy/lemmatizer.py (View File)

@@ -4,32 +4,16 @@ import pathlib
 
 import ujson as json
 
+from .en.lemmatizer import INDEX, EXC, RULES
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
 
 
 class Lemmatizer(object):
     @classmethod
     def load(cls, path, rules=None):
-        index = {}
-        exc = {}
-        for pos in ['adj', 'noun', 'verb']:
-            pos_index_path = path / 'wordnet' / 'index.{pos}'.format(pos=pos)
-            if pos_index_path.exists():
-                with pos_index_path.open() as file_:
-                    index[pos] = read_index(file_)
-            else:
-                index[pos] = set()
-            pos_exc_path = path / 'wordnet' / '{pos}.exc'.format(pos=pos)
-            if pos_exc_path.exists():
-                with pos_exc_path.open() as file_:
-                    exc[pos] = read_exc(file_)
-            else:
-                exc[pos] = {}
-        if rules is None and (path / 'vocab' / 'lemma_rules.json').exists():
-            with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_:
-                rules = json.load(file_)
-        elif rules is None:
-            rules = {}
+        index = dict(INDEX)
+        exc = dict(EXC)
+        rules = dict(RULES)
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
@@ -94,25 +78,3 @@ def lemmatize(string, index, exceptions, rules):
     if not forms:
         forms.append(string)
     return set(forms)
-
-
-def read_index(fileobj):
-    index = set()
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        word = pieces[0]
-        if word.count('_') == 0:
-            index.add(word)
-    return index
-
-
-def read_exc(fileobj):
-    exceptions = {}
-    for line in fileobj:
-        if line.startswith(' '):
-            continue
-        pieces = line.split()
-        exceptions[pieces[0]] = tuple(pieces[1:])
-    return exceptions
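
For reference, here is how the two deleted helpers behaved on the WordNet formats they parsed. The function bodies are copied from the removed lines above; the sample data is invented for illustration (real index.noun and verb.exc files follow the same shape, with a whitespace-indented license header).

import io

def read_index(fileobj):
    # copied from the code removed above
    index = set()
    for line in fileobj:
        if line.startswith(' '):    # skip the WordNet license header
            continue
        pieces = line.split()
        word = pieces[0]
        if word.count('_') == 0:    # skip multiword entries
            index.add(word)
    return index

def read_exc(fileobj):
    # copied from the code removed above
    exceptions = {}
    for line in fileobj:
        if line.startswith(' '):
            continue
        pieces = line.split()
        exceptions[pieces[0]] = tuple(pieces[1:])
    return exceptions

index = read_index(io.StringIO(
    "  1 This software and database... (license header, skipped)\n"
    "man n 5\n"
    "plant n 3\n"
    "sea_urchin n 1\n"))
assert index == {'man', 'plant'}

exc = read_exc(io.StringIO("was be\n"))
assert exc['was'] == ('be',)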

spacy/tests/tagger/test_lemmatizer.py (View File)

@@ -1,8 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...lemmatizer import read_index, read_exc
-
 import pytest
@@ -41,25 +39,6 @@ def test_tagger_lemmatizer_punct(lemmatizer):
     assert lemmatizer.punct('“') == set(['"'])
 
 
-@pytest.mark.models
-def test_tagger_lemmatizer_read_index(path):
-    if path is not None:
-        with (path / 'wordnet' / 'index.noun').open() as file_:
-            index = read_index(file_)
-        assert 'man' in index
-        assert 'plantes' not in index
-        assert 'plant' in index
-
-
-@pytest.mark.models
-@pytest.mark.parametrize('text,lemma', [("was", "be")])
-def test_tagger_lemmatizer_read_exc(path, text, lemma):
-    if path is not None:
-        with (path / 'wordnet' / 'verb.exc').open() as file_:
-            exc = read_exc(file_)
-        assert exc[text] == (lemma,)
-
-
 @pytest.mark.models
 def test_tagger_lemmatizer_lemma_assignment(EN):
     text = "Bananas in pyjamas are geese."
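
The two removed tests covered only the deleted file readers; the surviving punct and lemma-assignment tests still exercise the data-driven path. A minimal sanity check along those lines, assuming the module lives at spacy/lemmatizer.py as the relative imports suggest:

from spacy.lemmatizer import Lemmatizer

# path is never touched by the new load(), so None is fine here
lemmatizer = Lemmatizer.load(path=None)

# same check as the punct test kept above
assert lemmatizer.punct('“') == set(['"'])

One side effect visible in the diff: the new load() also overwrites a caller-supplied rules argument with dict(RULES) unconditionally.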