From 1786331cd82674a5d4ec14cce74d135278dae84d Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker
Date: Tue, 3 May 2016 12:51:47 +0200
Subject: [PATCH] add model sanity test

---
 spacy/tests/conftest.py                      | 14 ++---
 spacy/tests/integration/__init__.py          |  0
 spacy/tests/integration/test_model_sanity.py | 62 ++++++++++++++++++++
 3 files changed, 68 insertions(+), 8 deletions(-)
 create mode 100644 spacy/tests/integration/__init__.py
 create mode 100644 spacy/tests/integration/test_model_sanity.py

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 83a39a03a..cf7fd223a 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,17 +1,15 @@
-from spacy.en import English
-
 import pytest
 import os
+import spacy
 
 
 @pytest.fixture(scope="session")
 def EN():
-    if os.environ.get('SPACY_DATA'):
-        data_dir = os.environ.get('SPACY_DATA')
-    else:
-        data_dir = None
-    print("Load EN from %s" % data_dir)
-    return English(data_dir=data_dir)
+    return spacy.load("en")
+
+@pytest.fixture(scope="session")
+def DE():
+    return spacy.load("de")
 
 
 def pytest_addoption(parser):
diff --git a/spacy/tests/integration/__init__.py b/spacy/tests/integration/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/integration/test_model_sanity.py b/spacy/tests/integration/test_model_sanity.py
new file mode 100644
index 000000000..0cddb85dd
--- /dev/null
+++ b/spacy/tests/integration/test_model_sanity.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+import numpy
+
+@pytest.mark.models
+class TestModelSanity:
+    """
+    These tests make sure the model works as expected by checking that values are actually set.
+    They are not meant to evaluate the content of the output, only that the output is formally okay.
+    """
+
+    @pytest.fixture(scope='class', params=['en', 'de'])
+    def example(self, request, EN, DE):
+        if request.param == 'en':
+            return EN(u'There was a stranger standing at the big street talking to herself.')
+        elif request.param == 'de':
+            return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')
+
+    def test_tokenization(self, example):
+        # tokenization should split the document into tokens
+        assert len(example) > 1
+
+    def test_tagging(self, example):
+        # if tagging was done properly, pos tags shouldn't be empty
+        assert example.is_tagged
+        assert all(t.pos != 0 for t in example)
+        assert all(t.tag != 0 for t in example)
+
+    def test_parsing(self, example):
+        # if parsing was done properly
+        # - dependency labels shouldn't be empty
+        # - at least one token should have a head other than itself (i.e. not everything is root)
+        assert example.is_parsed
+        assert all(t.dep != 0 for t in example)
+        assert any(t.head.i != t.i for t in example)
+
+    def test_ner(self, example):
+        # if ner was done properly, ent_iob shouldn't be empty
+        assert all(t.ent_iob != 0 for t in example)
+
+    def test_vectors(self, example):
+        # if vectors are available, they should differ on different words
+        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # but that's very unlikely and a good indicator if something is wrong
+        vector0 = example[0].vector
+        vector1 = example[1].vector
+        vector2 = example[2].vector
+        assert not numpy.array_equal(vector0, vector1)
+        assert not numpy.array_equal(vector0, vector2)
+        assert not numpy.array_equal(vector1, vector2)
+
+    def test_probs(self, example):
+        # if frequencies/probabilities are okay, they should differ for different words
+        # this isn't a perfect test since this could in principle fail in a sane model as well,
+        # but that's very unlikely and a good indicator if something is wrong
+        prob0 = example[0].prob
+        prob1 = example[1].prob
+        prob2 = example[2].prob
+        assert prob0 != prob1
+        assert prob0 != prob2
+        assert prob1 != prob2
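
Usage note: the new test class is marked with pytest.mark.models, so it is only meaningful on a machine where the model data is installed. For quick debugging of a model installation outside the test runner, the same checks can be run as a small standalone script. The sketch below is not part of the patch and assumes the English model data is installed; it uses only the Doc/Token attributes the tests above already rely on (is_tagged, is_parsed, pos, dep, ent_iob, vector, prob).

    # standalone sketch of the sanity checks above (assumes the "en" model data is installed)
    import numpy
    import spacy

    nlp = spacy.load("en")
    doc = nlp(u"There was a stranger standing at the big street talking to herself.")

    assert len(doc) > 1                                          # tokenization produced several tokens
    assert doc.is_tagged and all(t.pos != 0 for t in doc)        # tagger assigned POS ids
    assert doc.is_parsed and all(t.dep != 0 for t in doc)        # parser assigned dependency labels
    assert any(t.head.i != t.i for t in doc)                     # not every token is its own head
    assert all(t.ent_iob != 0 for t in doc)                      # NER assigned IOB codes
    assert not numpy.array_equal(doc[0].vector, doc[1].vector)   # vectors differ between words
    assert doc[0].prob != doc[1].prob                            # unigram log-probs differ between words
    print("model sanity checks passed")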