diff --git a/spacy/tests/spans/test_merge.py b/spacy/tests/spans/test_merge.py
index b7f0a961e..86712f771 100644
--- a/spacy/tests/spans/test_merge.py
+++ b/spacy/tests/spans/test_merge.py
@@ -1,89 +1,117 @@
+# coding: utf-8
 from __future__ import unicode_literals
-from spacy.attrs import HEAD
+
+from ..util import get_doc
+
 import pytest
-import numpy
 
 
-def test_merge_tokens(EN):
-    tokens = EN(u'Los Angeles start.')
-    tokens.from_array([HEAD], numpy.asarray([[1, 1, 0, -1]], dtype='int32').T)
-    assert len(tokens) == 4
-    assert tokens[0].head.orth_ == 'Angeles'
-    assert tokens[1].head.orth_ == 'start'
-    tokens.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
-    assert len(tokens) == 3
-    assert tokens[0].orth_ == 'Los Angeles'
-    assert tokens[0].head.orth_ == 'start'
+def test_spans_merge_tokens(en_tokenizer):
+    text = "Los Angeles start."
+    heads = [1, 1, 0, -1]
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+    assert len(doc) == 4
+    assert doc[0].head.text == 'Angeles'
+    assert doc[1].head.text == 'start'
+    doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
+    assert len(doc) == 3
+    assert doc[0].text == 'Los Angeles'
+    assert doc[0].head.text == 'start'
 
 
-@pytest.mark.models
-def test_merge_heads(EN):
-    tokens = EN(u'I found a pilates class near work.')
-    assert len(tokens) == 8
-    tokens.merge(tokens[3].idx, tokens[4].idx + len(tokens[4]), tokens[4].tag_,
-                 'pilates class', 'O')
-    assert len(tokens) == 7
-    assert tokens[0].head.i == 1
-    assert tokens[1].head.i == 1
-    assert tokens[2].head.i == 3
-    assert tokens[3].head.i == 1
-    assert tokens[4].head.i in [1, 3]
-    assert tokens[5].head.i == 4
+def test_spans_merge_heads(en_tokenizer):
+    text = "I found a pilates class near work."
+    heads = [1, 0, 2, 1, -3, -1, -1, -6]
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+
+    assert len(doc) == 8
+    doc.merge(doc[3].idx, doc[4].idx + len(doc[4]), doc[4].tag_, 'pilates class', 'O')
+    assert len(doc) == 7
+    assert doc[0].head.i == 1
+    assert doc[1].head.i == 1
+    assert doc[2].head.i == 3
+    assert doc[3].head.i == 1
+    assert doc[4].head.i in [1, 3]
+    assert doc[5].head.i == 4
+
+
+def test_span_np_merges(en_tokenizer):
+    text = "displaCy is a parse tool built with Javascript"
+    heads = [1, 0, 2, 1, -3, -1, -1, -1]
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
 
-@pytest.mark.models
-def test_np_merges(EN):
-    text = u'displaCy is a parse tool built with Javascript'
-    tokens = EN(text)
-    assert tokens[4].head.i == 1
-    tokens.merge(tokens[2].idx, tokens[4].idx + len(tokens[4]), u'NP', u'tool', u'O')
-    assert tokens[2].head.i == 1
-    tokens = EN('displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript.')
-
-    ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_)
-            for e in tokens.ents]
+    assert doc[4].head.i == 1
+    doc.merge(doc[2].idx, doc[4].idx + len(doc[4]), 'NP', 'tool', 'O')
+    assert doc[2].head.i == 1
+
+    text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript."
+    heads = [1, 0, 8, 3, -1, -2, 4, 3, 1, 1, -9, -1, -1, -1, -1, -2, -15]
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+
+    ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents]
     for start, end, label, lemma in ents:
-        merged = tokens.merge(start, end, label, lemma, label)
-        assert merged != None, (start, end, label, lemma)
+        merged = doc.merge(start, end, label, lemma, label)
+        assert merged is not None, (start, end, label, lemma)
 
-    tokens = EN(u'One test with entities like New York City so the ents list is not void')
+    text = "One test with entities like New York City so the ents list is not void"
+    heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
 
-    for span in tokens.ents:
-        merged = span.merge()
-        assert merged != None, (span.start, span.end, span.label_, span.lemma_)
+    for span in doc.ents:
+        merged = span.merge()
+        assert merged is not None, (span.start, span.end, span.label_, span.lemma_)
 
 
-@pytest.mark.models
-def test_entity_merge(EN):
-    tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n')
-    assert(len(tokens) == 17)
-    for ent in tokens.ents:
+
+def test_spans_entity_merge(en_tokenizer):
+    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n"
+    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1]
+    tags = ['NNP', 'NNP', 'VBZ', 'DT', 'VB', 'RP', 'NN', 'WP', 'VBZ', 'IN', 'NNP', 'CC', 'VBZ', 'NNP', 'NNP', '.', 'SP']
+    ents = [('Stewart Lee', 'PERSON', 0, 2), ('England', 'GPE', 10, 11), ('Joe Pasquale', 'PERSON', 13, 15)]
+
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, tags=tags, ents=ents)
+    assert len(doc) == 17
+    for ent in doc.ents:
         label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent))
         ent.merge(label, lemma, type_)
     # check looping is ok
-    assert(len(tokens) == 15)
+    assert len(doc) == 15
 
 
-@pytest.mark.models
-def test_sentence_update_after_merge(EN):
-    tokens = EN(u'Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale.')
-    sent1, sent2 = list(tokens.sents)
+def test_spans_sentence_update_after_merge(en_tokenizer):
+    text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale."
+    heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7]
+    deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
+            'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj',
+            'compound', 'dobj', 'punct']
+
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
+    sent1, sent2 = list(doc.sents)
     init_len = len(sent1)
     init_len2 = len(sent2)
-    merge_me = tokens[0:2]
-    merge_me.merge(u'none', u'none', u'none')
-    merge_me2 = tokens[-2:]
-    merge_me2.merge(u'none', u'none', u'none')
-    assert(len(sent1) == init_len - 1)
-    assert(len(sent2) == init_len2 - 1)
+    doc[0:2].merge('none', 'none', 'none')
+    doc[-2:].merge('none', 'none', 'none')
+    assert len(sent1) == init_len - 1
+    assert len(sent2) == init_len2 - 1
 
 
-@pytest.mark.models
-def test_subtree_size_check(EN):
-    tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale')
-    sent1 = list(tokens.sents)[0]
+def test_spans_subtree_size_check(en_tokenizer):
+    text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale"
+    heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2]
+    deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr',
+            'nsubj', 'relcl', 'prep', 'pobj', 'cc', 'conj', 'compound',
+            'dobj']
+
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, deps=deps)
+    sent1 = list(doc.sents)[0]
    init_len = len(list(sent1.root.subtree))
-    merge_me = tokens[0:2]
-    merge_me.merge(u'none', u'none', u'none')
-    assert(len(list(sent1.root.subtree)) == init_len - 1)
+    doc[0:2].merge('none', 'none', 'none')
+    assert len(list(sent1.root.subtree)) == init_len - 1
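
Note for reviewers: the rewritten tests build annotated Doc objects through the shared get_doc helper imported from spacy/tests/util.py. As a rough guide, below is a minimal sketch of such a helper, reconstructed only from how the tests call it (words, heads as relative offsets, optional tags/deps/ents); it is an illustrative assumption, not the actual implementation in util.py.

    # Illustrative sketch only -- the real helper lives in spacy/tests/util.py
    # and may differ in signature and details.
    from spacy.attrs import HEAD, DEP
    from spacy.tokens import Doc, Span

    def get_doc(vocab, words, heads=None, deps=None, tags=None, ents=None):
        """Build a Doc and attach heads, deps, tags and entities."""
        heads = heads or [0] * len(words)
        deps = deps or [''] * len(words)
        doc = Doc(vocab, words=words)
        # HEAD values in to_array/from_array are offsets relative to each
        # token, which is how the heads lists in the tests are written.
        attrs = doc.to_array([HEAD, DEP])
        for i, (head, dep) in enumerate(zip(heads, deps)):
            attrs[i, 0] = head
            attrs[i, 1] = vocab.strings[dep]
        doc.from_array([HEAD, DEP], attrs)
        if ents:
            # Test ents are (text, label, start, end) tuples; only the
            # label and token offsets are needed to build the spans.
            doc.ents = [Span(doc, start, end, label=vocab.strings[label])
                        for ent_text, label, start, end in ents]
        if tags:
            for i, tag in enumerate(tags):
                doc[i].tag_ = tag  # Token.tag_ is writable
        return doc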