From 1da29a7146f2a8b8f60e1dcf5599dddb67612b95 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 12 Mar 2017 13:48:27 +0100 Subject: [PATCH] Use new Lemmatizer data and remove file import Since there's currently only an English lemmatizer, the global Lemmatizer imports from spacy.en. This is not ideal and still needs to be fixed. --- spacy/lemmatizer.py | 46 +++------------------------ spacy/tests/tagger/test_lemmatizer.py | 21 ------------ 2 files changed, 4 insertions(+), 63 deletions(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 960467a0b..323dddd3a 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -4,32 +4,16 @@ import pathlib import ujson as json +from .en.lemmatizer import INDEX, EXC, RULES from .symbols import POS, NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @classmethod def load(cls, path, rules=None): - index = {} - exc = {} - for pos in ['adj', 'noun', 'verb']: - pos_index_path = path / 'wordnet' / 'index.{pos}'.format(pos=pos) - if pos_index_path.exists(): - with pos_index_path.open() as file_: - index[pos] = read_index(file_) - else: - index[pos] = set() - pos_exc_path = path / 'wordnet' / '{pos}.exc'.format(pos=pos) - if pos_exc_path.exists(): - with pos_exc_path.open() as file_: - exc[pos] = read_exc(file_) - else: - exc[pos] = {} - if rules is None and (path / 'vocab' / 'lemma_rules.json').exists(): - with (path / 'vocab' / 'lemma_rules.json').open('r', encoding='utf8') as file_: - rules = json.load(file_) - elif rules is None: - rules = {} + index = dict(INDEX) + exc = dict(EXC) + rules = dict(RULES) return cls(index, exc, rules) def __init__(self, index, exceptions, rules): @@ -94,25 +78,3 @@ def lemmatize(string, index, exceptions, rules): if not forms: forms.append(string) return set(forms) - - -def read_index(fileobj): - index = set() - for line in fileobj: - if line.startswith(' '): - continue - pieces = line.split() - word = pieces[0] - if word.count('_') == 0: - index.add(word) - return index - - -def 
read_exc(fileobj): - exceptions = {} - for line in fileobj: - if line.startswith(' '): - continue - pieces = line.split() - exceptions[pieces[0]] = tuple(pieces[1:]) - return exceptions diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 37ebc3518..3e2933fcd 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -from ...lemmatizer import read_index, read_exc - import pytest @@ -41,25 +39,6 @@ def test_tagger_lemmatizer_punct(lemmatizer): assert lemmatizer.punct('“') == set(['"']) -@pytest.mark.models -def test_tagger_lemmatizer_read_index(path): - if path is not None: - with (path / 'wordnet' / 'index.noun').open() as file_: - index = read_index(file_) - assert 'man' in index - assert 'plantes' not in index - assert 'plant' in index - - -@pytest.mark.models -@pytest.mark.parametrize('text,lemma', [("was", "be")]) -def test_tagger_lemmatizer_read_exc(path, text, lemma): - if path is not None: - with (path / 'wordnet' / 'verb.exc').open() as file_: - exc = read_exc(file_) - assert exc[text] == (lemma,) - - @pytest.mark.models def test_tagger_lemmatizer_lemma_assignment(EN): text = "Bananas in pyjamas are geese."