mirror of https://github.com/explosion/spaCy.git
add model sanity test
This commit is contained in:
parent
1f1532142f
commit
1786331cd8
|
@ -1,17 +1,15 @@
|
|||
from spacy.en import English
|
||||
|
||||
import pytest
|
||||
import os
|
||||
|
||||
import spacy
|
||||
|
||||
@pytest.fixture(scope="session")
def EN():
    """Session-scoped fixture returning the English pipeline.

    The pipeline is loaded once per test session through ``spacy.load`` so
    that every test requesting ``EN`` shares the same loaded object.
    """
    return spacy.load("en")
|
||||
|
||||
# The keyword must be ``scope``; a misspelled keyword is rejected by
# ``pytest.fixture`` and the fixture would never be created.
@pytest.fixture(scope="session")
def DE():
    """Session-scoped fixture returning the German pipeline.

    The pipeline is loaded once per test session through ``spacy.load`` so
    that every test requesting ``DE`` shares the same loaded object.
    """
    return spacy.load("de")
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import pytest
|
||||
import numpy
|
||||
|
||||
@pytest.mark.models
class TestModelSanity:
    """
    This is to make sure the model works as expected. The tests make sure that values are properly set.
    Tests are not meant to evaluate the content of the output, only make sure the output is formally okay.
    """

    @pytest.fixture(scope='class', params=['en','de'])
    def example(self, request, EN, DE):
        """Return one processed example document per language parameter.

        ``request.param`` selects which of the ``EN`` / ``DE`` fixtures is
        called on its example sentence; every test using this fixture
        therefore runs once for each language in ``params``.
        """
        if request.param == 'en':
            return EN(u'There was a stranger standing at the big street talking to herself.')
        elif request.param == 'de':
            return DE(u'An der großen Straße stand eine merkwürdige Gestalt und führte Selbstgespräche.')

    def test_tokenization(self, example):
        """The document must have been split into more than one token."""
        # tokenization should split the document into tokens
        assert len(example) > 1

    def test_tagging(self, example):
        """Every token must carry a non-empty coarse and fine-grained tag."""
        # if tagging was done properly, pos tags shouldn't be empty
        assert example.is_tagged
        assert all(t.pos != 0 for t in example)
        assert all(t.tag != 0 for t in example)

    def test_parsing(self, example):
        """Every token must have a dependency label and not all may be roots."""
        # if parsing was done properly
        # - dependency labels shouldn't be empty
        # - at least one token should be attached to a token other than itself
        assert example.is_parsed
        assert all(t.dep != 0 for t in example)
        # Compare the index of the head with the index of the token itself.
        # Comparing the dependency label ID with the token index (as done
        # previously) is almost always true and therefore checks nothing.
        assert any(t.head.i != t.i for t in example)

    def test_ner(self, example):
        """Every token must have an entity IOB code assigned."""
        # if ner was done properly, ent_iob shouldn't be empty
        assert all(t.ent_iob != 0 for t in example)

    def test_vectors(self, example):
        """The vectors of the first three tokens must differ pairwise."""
        # if vectors are available, they should differ on different words
        # this isn't a perfect test since this could in principle fail in a sane model as well,
        # but that's very unlikely and a good indicator if something is wrong
        vector0 = example[0].vector
        vector1 = example[1].vector
        vector2 = example[2].vector
        assert not numpy.array_equal(vector0, vector1)
        assert not numpy.array_equal(vector0, vector2)
        assert not numpy.array_equal(vector1, vector2)

    def test_probs(self, example):
        """The probabilities of the first three tokens must differ pairwise."""
        # if frequencies/probabilities are okay, they should differ for different words
        # this isn't a perfect test since this could in principle fail in a sane model as well,
        # but that's very unlikely and a good indicator if something is wrong
        prob0 = example[0].prob
        prob1 = example[1].prob
        prob2 = example[2].prob
        assert prob0 != prob1
        assert prob0 != prob2
        assert prob1 != prob2
|
Loading…
Reference in New Issue