Modernise lemmatizer tests

Ines Montani 2017-01-12 23:41:17 +01:00
parent 33d9cf87f9
commit affcf1b19d
1 changed file with 38 additions and 78 deletions


@@ -1,34 +1,43 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import os
-import io
-import pickle
-import pathlib
-
-from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy import util
+from ...lemmatizer import read_index, read_exc
 
 import pytest
 
 
-@pytest.fixture
-def path():
-    if 'SPACY_DATA' in os.environ:
-        return pathlib.Path(os.environ['SPACY_DATA'])
-    else:
-        return util.match_best_version('en', None, util.get_data_path())
-
-
-@pytest.fixture
-def lemmatizer(path):
-    if path is not None:
-        return Lemmatizer.load(path)
-    else:
-        return None
-
-
-def test_read_index(path):
+@pytest.mark.parametrize('text,lemmas', [("aardwolves", ["aardwolf"]),
+                                         ("aardwolf", ["aardwolf"]),
+                                         ("planets", ["planet"]),
+                                         ("ring", ["ring"]),
+                                         ("axes", ["axis", "axe", "ax"])])
+def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.noun(text) == set(lemmas)
+
+
+def test_tagger_lemmatizer_base_forms(lemmatizer):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
+    assert lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
+
+
+def test_tagger_lemmatizer_base_form_verb(lemmatizer):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
+
+
+def test_tagger_lemmatizer_punct(lemmatizer):
+    if lemmatizer is None:
+        return None
+    assert lemmatizer.punct('“') == set(['"'])
+    assert lemmatizer.punct('”') == set(['"'])
+
+
+def test_tagger_lemmatizer_read_index(path):
     if path is not None:
         with (path / 'wordnet' / 'index.noun').open() as file_:
             index = read_index(file_)
@@ -37,67 +46,18 @@ def test_read_index(path):
         assert 'plant' in index
 
 
-def test_read_exc(path):
+@pytest.mark.parametrize('text,lemma', [("was", "be")])
+def test_tagger_lemmatizer_read_exc(path, text, lemma):
     if path is not None:
         with (path / 'wordnet' / 'verb.exc').open() as file_:
             exc = read_exc(file_)
-        assert exc['was'] == ('be',)
+        assert exc[text] == (lemma,)
 
 
-def test_noun_lemmas(lemmatizer):
-    if lemmatizer is None:
-        return None
-    do = lemmatizer.noun
-    assert do('aardwolves') == set(['aardwolf'])
-    assert do('aardwolf') == set(['aardwolf'])
-    assert do('planets') == set(['planet'])
-    assert do('ring') == set(['ring'])
-    assert do('axes') == set(['axis', 'axe', 'ax'])
-
-
-def test_base_form_dive(lemmatizer):
-    if lemmatizer is None:
-        return None
-    do = lemmatizer.noun
-    assert do('dive', {'number': 'sing'}) == set(['dive'])
-    assert do('dive', {'number': 'plur'}) == set(['diva'])
-
-
-def test_base_form_saw(lemmatizer):
-    if lemmatizer is None:
-        return None
-    do = lemmatizer.verb
-    assert do('saw', {'verbform': 'past'}) == set(['see'])
-
-
-def test_smart_quotes(lemmatizer):
-    if lemmatizer is None:
-        return None
-    do = lemmatizer.punct
-    assert do('“') == set(['"'])
-    assert do('”') == set(['"'])
-
-
-def test_pickle_lemmatizer(lemmatizer):
-    if lemmatizer is None:
-        return None
-    file_ = io.BytesIO()
-    pickle.dump(lemmatizer, file_)
-    file_.seek(0)
-    loaded = pickle.load(file_)
-
-
 @pytest.mark.models
-def test_tagger_lemma_assignment(EN):
-    tokens = u'Bananas in pyjamas are geese .'.split(' ')
-    doc = EN.tokenizer.tokens_from_list(tokens)
-    assert all( t.lemma_ == u'' for t in doc )
+def test_tagger_lemmatizer_lemma_assignment(EN):
+    text = "Bananas in pyjamas are geese."
+    doc = EN.tokenizer(text)
+    assert all(t.lemma_ == '' for t in doc)
     EN.tagger(doc)
-    assert all( t.lemma_ != u'' for t in doc )
+    assert all(t.lemma_ != '' for t in doc)
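
Note: the modernised tests still request the lemmatizer and path fixtures even though this commit deletes their definitions from the module, so they presumably come from a shared conftest.py. The sketch below shows what such a conftest could contain, simply reusing the loading logic removed above; the file location and exact contents are an assumption, not part of this commit.

# conftest.py (hypothetical location) -- minimal sketch of shared fixtures,
# reusing the loading logic this commit removes from the test module itself.
import os
import pathlib

import pytest

from spacy.lemmatizer import Lemmatizer
from spacy import util


@pytest.fixture
def path():
    # Prefer an explicit SPACY_DATA override; otherwise fall back to the
    # best installed 'en' data directory, as the removed fixture did.
    if 'SPACY_DATA' in os.environ:
        return pathlib.Path(os.environ['SPACY_DATA'])
    return util.match_best_version('en', None, util.get_data_path())


@pytest.fixture
def lemmatizer(path):
    # Return None when no model data is installed; the tests guard with
    # "if lemmatizer is None: return None" instead of failing outright.
    if path is None:
        return None
    return Lemmatizer.load(path)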