diff --git a/spacy/tests/regression/test_issue118.py b/spacy/tests/regression/test_issue118.py
index 7996b1f2f..ffdade1d0 100644
--- a/spacy/tests/regression/test_issue118.py
+++ b/spacy/tests/regression/test_issue118.py
@@ -1,7 +1,4 @@
 # coding: utf-8
-"""Test a bug that arose from having overlapping matches"""
-
-
 from __future__ import unicode_literals
 
 from ...matcher import Matcher
@@ -25,6 +22,7 @@ def doc(en_tokenizer):
 
 @pytest.mark.parametrize('pattern', [pattern1, pattern2])
 def test_issue118(doc, pattern):
+    """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
     matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
 
@@ -41,6 +39,7 @@ def test_issue118(doc, pattern):
 
 @pytest.mark.parametrize('pattern', [pattern3, pattern4])
 def test_issue118_prefix_reorder(doc, pattern):
+    """Test a bug that arose from having overlapping matches"""
     ORG = doc.vocab.strings['ORG']
     matcher = Matcher(doc.vocab, {'BostonCeltics': ('ORG', {}, pattern)})
 
diff --git a/spacy/tests/regression/test_issue242.py b/spacy/tests/regression/test_issue242.py
index 752620209..a4acf04b3 100644
--- a/spacy/tests/regression/test_issue242.py
+++ b/spacy/tests/regression/test_issue242.py
@@ -9,7 +9,6 @@ import pytest
 
 def test_issue242(en_tokenizer):
     """Test overlapping multi-word phrases."""
-
     text = "There are different food safety standards in different countries."
     patterns = [[{LOWER: 'food'}, {LOWER: 'safety'}],
                 [{LOWER: 'safety'}, {LOWER: 'standards'}]]
diff --git a/spacy/tests/regression/test_issue309.py b/spacy/tests/regression/test_issue309.py
index 4d69482a0..84756c6b1 100644
--- a/spacy/tests/regression/test_issue309.py
+++ b/spacy/tests/regression/test_issue309.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 from ..util import get_doc
 
 
-def test_sbd_empty_string(en_tokenizer):
+def test_issue309(en_tokenizer):
     """Test Issue #309: SBD fails on empty string"""
     tokens = en_tokenizer(" ")
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[0], deps=['ROOT'])
diff --git a/spacy/tests/regression/test_issue351.py b/spacy/tests/regression/test_issue351.py
index 06f24715c..95dbec35a 100644
--- a/spacy/tests/regression/test_issue351.py
+++ b/spacy/tests/regression/test_issue351.py
@@ -1,16 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...en import English
-
 import pytest
 
 
-@pytest.fixture
-def en_tokenizer():
-    return English.Defaults.create_tokenizer()
-
-
 def test_issue351(en_tokenizer):
     doc = en_tokenizer("   This is a cat.")
     assert doc[0].idx == 0
diff --git a/spacy/tests/regression/test_issue360.py b/spacy/tests/regression/test_issue360.py
index d0b55032a..a2c007f16 100644
--- a/spacy/tests/regression/test_issue360.py
+++ b/spacy/tests/regression/test_issue360.py
@@ -1,16 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...en import English
-
 import pytest
 
 
-@pytest.fixture
-def en_tokenizer():
-    return English.Defaults.create_tokenizer()
-
-
-def test_big_ellipsis(en_tokenizer):
+def test_issue360(en_tokenizer):
+    """Test tokenization of big ellipsis"""
     tokens = en_tokenizer('$45...............Asking')
     assert len(tokens) > 2
diff --git a/spacy/tests/regression/test_issue429.py b/spacy/tests/regression/test_issue429.py
index 1412a54f2..5b76f05e6 100644
--- a/spacy/tests/regression/test_issue429.py
+++ b/spacy/tests/regression/test_issue429.py
@@ -1,31 +1,25 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import spacy
-from spacy.attrs import ORTH
+from ...attrs import ORTH
+from ...matcher import Matcher
 
 import pytest
 
 
 @pytest.mark.models
-def test_issue429():
-
-    nlp = spacy.load('en', parser=False)
-
-
+def test_issue429(EN):
     def merge_phrases(matcher, doc, i, matches):
         if i != len(matches) - 1:
             return None
         spans = [(ent_id, label, doc[start:end]) for ent_id, label, start, end in matches]
         for ent_id, label, span in spans:
-            span.merge('NNP' if label else span.root.tag_, span.text, nlp.vocab.strings[label])
+            span.merge('NNP' if label else span.root.tag_, span.text, EN.vocab.strings[label])
 
-    doc = nlp('a')
-    nlp.matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
-    doc = nlp.tokenizer('a b c')
-    nlp.tagger(doc)
-    nlp.matcher(doc)
-
-    for word in doc:
-        print(word.text, word.ent_iob_, word.ent_type_)
-    nlp.entity(doc)
+    doc = EN('a')
+    matcher = Matcher(EN.vocab)
+    matcher.add('key', label='TEST', attrs={}, specs=[[{ORTH: 'a'}]], on_match=merge_phrases)
+    doc = EN.tokenizer('a b c')
+    EN.tagger(doc)
+    matcher(doc)
+    EN.entity(doc)
diff --git a/spacy/tests/regression/test_issue514.py b/spacy/tests/regression/test_issue514.py
new file mode 100644
index 000000000..a21b7333e
--- /dev/null
+++ b/spacy/tests/regression/test_issue514.py
@@ -0,0 +1,21 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ..util import get_doc
+
+import pytest
+
+
+@pytest.mark.models
+def test_issue514(EN):
+    """Test serializing after adding entity"""
+    text = ["This", "is", "a", "sentence", "about", "pasta", "."]
+    vocab = EN.entity.vocab
+    doc = get_doc(vocab, text)
+    EN.entity.add_label("Food")
+    EN.entity(doc)
+    label_id = vocab.strings[u'Food']
+    doc.ents = [(label_id, 5, 6)]
+    assert [(ent.label_, ent.text) for ent in doc.ents] == [("Food", "pasta")]
+    doc2 = get_doc(EN.entity.vocab).from_bytes(doc.to_bytes())
+    assert [(ent.label_, ent.text) for ent in doc2.ents] == [("Food", "pasta")]
diff --git a/spacy/tests/regression/test_issue54.py b/spacy/tests/regression/test_issue54.py
index c743715d8..9085457f6 100644
--- a/spacy/tests/regression/test_issue54.py
+++ b/spacy/tests/regression/test_issue54.py
@@ -6,5 +6,5 @@ import pytest
 
 @pytest.mark.models
 def test_issue54(EN):
-    text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
+    text = "Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1)."
     tokens = EN(text)
diff --git a/spacy/tests/regression/test_issue587.py b/spacy/tests/regression/test_issue587.py
index 8815b346a..1a9620236 100644
--- a/spacy/tests/regression/test_issue587.py
+++ b/spacy/tests/regression/test_issue587.py
@@ -1,21 +1,20 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import spacy
-import spacy.matcher
-from spacy.attrs import IS_PUNCT, ORTH
+from ...matcher import Matcher
+from ...attrs import IS_PUNCT, ORTH
 
 import pytest
 
 
 @pytest.mark.models
-def test_matcher_segfault():
-    nlp = spacy.load('en', parser=False, entity=False)
-    matcher = spacy.matcher.Matcher(nlp.vocab)
+def test_issue587(EN):
+    """Test that Matcher doesn't segfault on particular input"""
+    matcher = Matcher(EN.vocab)
     content = '''a b; c'''
     matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}]])
-    matcher(nlp(content))
+    matcher(EN(content))
     matcher.add(entity_key='2', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]])
-    matcher(nlp(content))
+    matcher(EN(content))
     matcher.add(entity_key='3', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]])
-    matcher(nlp(content))
+    matcher(EN(content))
diff --git a/spacy/tests/regression/test_issue588.py b/spacy/tests/regression/test_issue588.py
index 0b05ac74e..1002da226 100644
--- a/spacy/tests/regression/test_issue588.py
+++ b/spacy/tests/regression/test_issue588.py
@@ -1,14 +1,12 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...vocab import Vocab
-from ...tokens import Doc
 from ...matcher import Matcher
 
 import pytest
 
 
-def test_issue588():
-    matcher = Matcher(Vocab())
+def test_issue588(en_vocab):
+    matcher = Matcher(en_vocab)
     with pytest.raises(ValueError):
         matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[]])
diff --git a/spacy/tests/regression/test_issue589.py b/spacy/tests/regression/test_issue589.py
index bcbfb0a6a..27363739d 100644
--- a/spacy/tests/regression/test_issue589.py
+++ b/spacy/tests/regression/test_issue589.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 from ...vocab import Vocab
-from ...tokens import Doc
+from ..util import get_doc
 
 import pytest
 
@@ -10,4 +10,4 @@ import pytest
 def test_issue589():
     vocab = Vocab()
     vocab.strings.set_frozen(True)
-    doc = Doc(vocab, words=['whata'])
+    doc = get_doc(vocab, ['whata'])
diff --git a/spacy/tests/regression/test_issue590.py b/spacy/tests/regression/test_issue590.py
index fedc9eaf4..443239cf1 100644
--- a/spacy/tests/regression/test_issue590.py
+++ b/spacy/tests/regression/test_issue590.py
@@ -1,37 +1,22 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...attrs import *
+from ...attrs import ORTH, IS_ALPHA, LIKE_NUM
 from ...matcher import Matcher
-from ...tokens import Doc
-from ...en import English
+from ..util import get_doc
 
 
-def test_overlapping_matches():
-    vocab = English.Defaults.create_vocab()
-    doc = Doc(vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%'])
-
-    matcher = Matcher(vocab)
-    matcher.add_entity(
-        "ab",
-        acceptor=None,
-        on_match=None
-    )
-    matcher.add_pattern(
-        'ab',
-        [
-            {IS_ALPHA: True},
-            {ORTH: ':'},
-            {LIKE_NUM: True},
-            {ORTH: '%'}
-        ], label='a')
-    matcher.add_pattern(
-        'ab',
-        [
-            {IS_ALPHA: True},
-            {ORTH: '='},
-            {LIKE_NUM: True},
-        ], label='b')
+def test_issue590(en_vocab):
+    """Test overlapping matches"""
+    doc = get_doc(en_vocab, ['n', '=', '1', ';', 'a', ':', '5', '%'])
+    matcher = Matcher(en_vocab)
+    matcher.add_entity("ab", acceptor=None, on_match=None)
+    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: ':'},
+                               {LIKE_NUM: True}, {ORTH: '%'}],
+                        label='a')
+    matcher.add_pattern('ab', [{IS_ALPHA: True}, {ORTH: '='},
+                               {LIKE_NUM: True}],
+                        label='b')
 
     matches = matcher(doc)
     assert len(matches) == 2
diff --git a/spacy/tests/regression/test_issue595.py b/spacy/tests/regression/test_issue595.py
index e61ff5273..6c73a621a 100644
--- a/spacy/tests/regression/test_issue595.py
+++ b/spacy/tests/regression/test_issue595.py
@@ -2,43 +2,23 @@
 from __future__ import unicode_literals
 
 from ...symbols import POS, VERB, VerbForm_inf
-from ...tokens import Doc
 from ...vocab import Vocab
 from ...lemmatizer import Lemmatizer
+from ..util import get_doc
 
 import pytest
 
 
-@pytest.fixture
-def index():
-    return {'verb': {}}
+def test_issue595():
+    """Test lemmatization of base forms"""
+    words = ["Do", "n't", "feed", "the", "dog"]
+    tag_map = {'VB': {POS: VERB, 'morph': VerbForm_inf}}
+    rules = {"verb": [["ed", "e"]]}
 
-@pytest.fixture
-def exceptions():
-    return {'verb': {}}
+    lemmatizer = Lemmatizer({'verb': {}}, {'verb': {}}, rules)
+    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
+    doc = get_doc(vocab, words)
 
-@pytest.fixture
-def rules():
-    return {"verb": [["ed", "e"]]}
-
-@pytest.fixture
-def lemmatizer(index, exceptions, rules):
-    return Lemmatizer(index, exceptions, rules)
-
-
-@pytest.fixture
-def tag_map():
-    return {'VB': {POS: VERB, 'morph': VerbForm_inf}}
-
-
-@pytest.fixture
-def vocab(lemmatizer, tag_map):
-    return Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
-
-
-def test_not_lemmatize_base_forms(vocab):
-    doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"])
-    feed = doc[2]
-    feed.tag_ = 'VB'
-    assert feed.text == 'feed'
-    assert feed.lemma_ == 'feed'
+    doc[2].tag_ = 'VB'
+    assert doc[2].text == 'feed'
+    assert doc[2].lemma_ == 'feed'
diff --git a/spacy/tests/regression/test_issue599.py b/spacy/tests/regression/test_issue599.py
index 9f8721676..9e187b3d4 100644
--- a/spacy/tests/regression/test_issue599.py
+++ b/spacy/tests/regression/test_issue599.py
@@ -1,15 +1,13 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...tokens import Doc
-from ...vocab import Vocab
+from ..util import get_doc
 
 
-def test_issue599():
-    doc = Doc(Vocab())
+def test_issue599(en_vocab):
+    doc = get_doc(en_vocab)
     doc.is_tagged = True
     doc.is_parsed = True
-    bytes_ = doc.to_bytes()
-    doc2 = Doc(doc.vocab)
-    doc2.from_bytes(bytes_)
+    doc2 = get_doc(doc.vocab)
+    doc2.from_bytes(doc.to_bytes())
     assert doc2.is_parsed
diff --git a/spacy/tests/regression/test_issue600.py b/spacy/tests/regression/test_issue600.py
index 5fc1bc68c..45511fd48 100644
--- a/spacy/tests/regression/test_issue600.py
+++ b/spacy/tests/regression/test_issue600.py
@@ -1,11 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...tokens import Doc
 from ...vocab import Vocab
-from ...attrs import POS
+from ..util import get_doc
 
 
 def test_issue600():
-    doc = Doc(Vocab(tag_map={'NN': {'pos': 'NOUN'}}), words=['hello'])
+    vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}})
+    doc = get_doc(vocab, ["hello"])
     doc[0].tag_ = 'NN'
diff --git a/spacy/tests/regression/test_issue605.py b/spacy/tests/regression/test_issue605.py
index 16bcea472..14b619ebf 100644
--- a/spacy/tests/regression/test_issue605.py
+++ b/spacy/tests/regression/test_issue605.py
@@ -1,27 +1,21 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from ...attrs import LOWER, ORTH
-from ...tokens import Doc
-from ...vocab import Vocab
+from ...attrs import ORTH
 from ...matcher import Matcher
+from ..util import get_doc
 
 
-def return_false(doc, ent_id, label, start, end):
-    return False
+def test_issue605(en_vocab):
+    def return_false(doc, ent_id, label, start, end):
+        return False
 
-
-def test_matcher_accept():
-    doc = Doc(Vocab(), words=['The', 'golf', 'club', 'is', 'broken'])
-
-    golf_pattern = [
-        { ORTH: "golf"},
-        { ORTH: "club"}
-    ]
+    words = ["The", "golf", "club", "is", "broken"]
+    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
+    label = "Sport_Equipment"
+    doc = get_doc(en_vocab, words)
     matcher = Matcher(doc.vocab)
-
-    matcher.add_entity('Sport_Equipment', acceptor=return_false)
-    matcher.add_pattern("Sport_Equipment", golf_pattern)
+    matcher.add_entity(label, acceptor=return_false)
+    matcher.add_pattern(label, pattern)
     match = matcher(doc)
-
     assert match == []
diff --git a/spacy/tests/regression/test_issue615.py b/spacy/tests/regression/test_issue615.py
index 26594aaa8..393b34b34 100644
--- a/spacy/tests/regression/test_issue615.py
+++ b/spacy/tests/regression/test_issue615.py
@@ -19,7 +19,7 @@ def test_issue615(en_tokenizer):
         span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])
 
     text = "The golf club is broken"
-    pattern = [{ ORTH: "golf"}, { ORTH: "club"}]
+    pattern = [{ORTH: "golf"}, {ORTH: "club"}]
     label = "Sport_Equipment"
 
     doc = en_tokenizer(text)
diff --git a/spacy/tests/regression/test_issue617.py b/spacy/tests/regression/test_issue617.py
index 0f4d63b97..f17342565 100644
--- a/spacy/tests/regression/test_issue617.py
+++ b/spacy/tests/regression/test_issue617.py
@@ -4,7 +4,8 @@
 from ...vocab import Vocab
 
 
-def test_load_vocab_with_string():
+def test_issue617():
+    """Test loading Vocab with string"""
     try:
         vocab = Vocab.load('/tmp/vocab')
     except IOError: