From 908809d488fb7c9ba25fde8d8077a328a12376f4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 24 Oct 2017 17:05:15 +0200
Subject: [PATCH] Update tests

---
 spacy/tests/doc/test_doc_api.py          | 18 +++++-------
 spacy/tests/doc/test_token_api.py        | 35 +++++++++++-------------
 spacy/tests/regression/test_issue1305.py | 11 +++++---
 spacy/tests/regression/test_issue781.py  |  2 +-
 4 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 5e052f771..46c615973 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -2,6 +2,8 @@
 from __future__ import unicode_literals
 
 from ..util import get_doc
+from ...tokens import Doc
+from ...vocab import Vocab
 
 import pytest
 import numpy
@@ -204,17 +206,11 @@ def test_doc_api_right_edge(en_tokenizer):
     assert doc[6].right_edge.text == ','
 
 
-@pytest.mark.xfail
-@pytest.mark.parametrize('text,vectors', [
-    ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
-])
-def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):
-    text_file.write('\n'.join(vectors))
-    text_file.seek(0)
-    vector_length = en_tokenizer.vocab.load_vectors(text_file)
-    assert vector_length == 3
-
-    doc = en_tokenizer(text)
+def test_doc_api_has_vector():
+    vocab = Vocab()
+    vocab.clear_vectors(2)
+    vocab.vectors.add('kitten', numpy.asarray([0., 2.], dtype='f'))
+    doc = Doc(vocab, words=['kitten'])
     assert doc.has_vector
 
 def test_lowest_common_ancestor(en_tokenizer):
diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py
index 00caa1445..0ab723f7a 100644
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
 
 from ...attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
 from ..util import get_doc
+from ...vocab import Vocab
+from ...tokens import Doc
 
 import pytest
 import numpy
@@ -68,26 +70,21 @@ def test_doc_token_api_is_properties(en_vocab):
     assert doc[5].like_email
 
 
-@pytest.mark.xfail
-@pytest.mark.parametrize('text,vectors', [
-    ("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
-])
-def test_doc_token_api_vectors(en_tokenizer, text_file, text, vectors):
-    text_file.write('\n'.join(vectors))
-    text_file.seek(0)
-    vector_length = en_tokenizer.vocab.load_vectors(text_file)
-    assert vector_length == 3
+def test_doc_token_api_vectors():
+    vocab = Vocab()
+    vocab.clear_vectors(2)
+    vocab.vectors.add('apples', numpy.asarray([0., 2.], dtype='f'))
+    vocab.vectors.add('oranges', numpy.asarray([0., 1.], dtype='f'))
+    doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
+    assert doc.has_vector
 
-    tokens = en_tokenizer(text)
-    assert tokens[0].has_vector
-    assert tokens[1].has_vector
-    assert not tokens[2].has_vector
-    assert tokens[0].similarity(tokens[1]) > tokens[0].similarity(tokens[2])
-    assert tokens[0].similarity(tokens[1]) == tokens[1].similarity(tokens[0])
-    assert sum(tokens[0].vector) != sum(tokens[1].vector)
-    assert numpy.isclose(
-        tokens[0].vector_norm,
-        numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector)))
+    assert doc[0].has_vector
+    assert doc[1].has_vector
+    assert not doc[2].has_vector
+    apples_norm = (0*0 + 2*2) ** 0.5
+    oranges_norm = (0*0 + 1*1) ** 0.5
+    cosine = ((0*0) + (2*1)) / (apples_norm * oranges_norm)
+    assert doc[0].similarity(doc[1]) == cosine
 
 
 def test_doc_token_api_ancestors(en_tokenizer):
diff --git a/spacy/tests/regression/test_issue1305.py b/spacy/tests/regression/test_issue1305.py
index e123ce0ba..d1d5eb93d 100644
--- a/spacy/tests/regression/test_issue1305.py
+++ b/spacy/tests/regression/test_issue1305.py
@@ -1,8 +1,11 @@
 import pytest
+import spacy
 
-@pytest.mark.models('en')
-def test_issue1305(EN):
+#@pytest.mark.models('en')
+def test_issue1305():
     '''Test lemmatization of English VBZ'''
-    assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
-    doc = EN(u'This app works well')
+    nlp = spacy.load('en_core_web_sm')
+    assert nlp.vocab.morphology.lemmatizer('works', 'verb') == ['work']
+    doc = nlp(u'This app works well')
+    print([(w.text, w.tag_) for w in doc])
     assert doc[2].lemma_ == 'work'
diff --git a/spacy/tests/regression/test_issue781.py b/spacy/tests/regression/test_issue781.py
index e3f391a37..2c77e68cd 100644
--- a/spacy/tests/regression/test_issue781.py
+++ b/spacy/tests/regression/test_issue781.py
@@ -9,4 +9,4 @@ import pytest
 @pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
 def test_issue781(EN, word, lemmas):
     lemmatizer = EN.Defaults.create_lemmatizer()
-    assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)
+    assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == lemmas
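
For reference, the hand-computed cosine in test_doc_token_api_vectors can be cross-checked with a direct numpy calculation. The sketch below is a standalone illustration, not part of the patch or the spaCy test suite; it reuses the same toy vectors the test registers (apples = [0, 2], oranges = [0, 1]) and follows the standard cosine formula.

    # Standalone cross-check of the cosine asserted in test_doc_token_api_vectors.
    # Mirrors the toy vectors added to the test's Vocab above.
    import numpy

    apples = numpy.asarray([0., 2.], dtype='f')
    oranges = numpy.asarray([0., 1.], dtype='f')

    # cosine(u, v) = (u . v) / (|u| * |v|)
    cosine = numpy.dot(apples, oranges) / (
        numpy.linalg.norm(apples) * numpy.linalg.norm(oranges))

    # The two vectors are parallel, so the similarity is exactly 1.0,
    # matching the test's ((0*0) + (2*1)) / (2.0 * 1.0) arithmetic.
    assert numpy.isclose(cosine, 1.0)

Because doc[0].similarity(doc[1]) in the rewritten test reduces to this same dot-product-over-norms computation, the equality assertion in the patch holds without any tolerance.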