From b11cbb06c6da2ff6646b6e9f0efcabe6dea0121f Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker
Date: Mon, 2 May 2016 14:36:35 +0200
Subject: [PATCH] remove old tests for sentence boundary detection

---
 spacy/tests/parser/test_parse.py |  71 ---------------
 spacy/tests/parser/test_sbd.py   | 151 +++++++++++--------------------
 2 files changed, 53 insertions(+), 169 deletions(-)

diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index 2590ad13d..c966c1610 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -25,74 +25,3 @@ def apply_transition_sequence(model, doc, sequence):
     with model.parser.step_through(doc) as stepwise:
         for transition in sequence:
             stepwise.transition(transition)
-
-
-@pytest.mark.models
-def test_arc_eager_finalize_state(EN):
-    # right branching
-    example = EN.tokenizer.tokens_from_list(u"a b c d e".split(' '))
-    apply_transition_sequence(EN, example, ['R-nsubj','D','R-nsubj','R-nsubj','D','R-ROOT'])
-
-    assert example[0].n_lefts == 0
-    assert example[0].n_rights == 2
-    assert example[0].left_edge.i == 0
-    assert example[0].right_edge.i == 3
-    assert example[0].head.i == 0
-
-    assert example[1].n_lefts == 0
-    assert example[1].n_rights == 0
-    assert example[1].left_edge.i == 1
-    assert example[1].right_edge.i == 1
-    assert example[1].head.i == 0
-
-    assert example[2].n_lefts == 0
-    assert example[2].n_rights == 1
-    assert example[2].left_edge.i == 2
-    assert example[2].right_edge.i == 3
-    assert example[2].head.i == 0
-
-    assert example[3].n_lefts == 0
-    assert example[3].n_rights == 0
-    assert example[3].left_edge.i == 3
-    assert example[3].right_edge.i == 3
-    assert example[3].head.i == 2
-
-    assert example[4].n_lefts == 0
-    assert example[4].n_rights == 0
-    assert example[4].left_edge.i == 4
-    assert example[4].right_edge.i == 4
-    assert example[4].head.i == 4
-
-    # left branching
-    example = EN.tokenizer.tokens_from_list(u"a b c d e".split(' '))
-    apply_transition_sequence(EN, example, ['S','L-nsubj','L-ROOT','S','L-nsubj','L-nsubj'])
-
-    assert example[0].n_lefts == 0
-    assert example[0].n_rights == 0
-    assert example[0].left_edge.i == 0
-    assert example[0].right_edge.i == 0
-    assert example[0].head.i == 0
-
-    assert example[1].n_lefts == 0
-    assert example[1].n_rights == 0
-    assert example[1].left_edge.i == 1
-    assert example[1].right_edge.i == 1
-    assert example[1].head.i == 2
-
-    assert example[2].n_lefts == 1
-    assert example[2].n_rights == 0
-    assert example[2].left_edge.i == 1
-    assert example[2].right_edge.i == 2
-    assert example[2].head.i == 4
-
-    assert example[3].n_lefts == 0
-    assert example[3].n_rights == 0
-    assert example[3].left_edge.i == 3
-    assert example[3].right_edge.i == 3
-    assert example[3].head.i == 4
-
-    assert example[4].n_lefts == 2
-    assert example[4].n_rights == 0
-    assert example[4].left_edge.i == 1
-    assert example[4].right_edge.i == 4
-    assert example[4].head.i == 4
diff --git a/spacy/tests/parser/test_sbd.py b/spacy/tests/parser/test_sbd.py
index d72cef32d..8c3fa9f72 100644
--- a/spacy/tests/parser/test_sbd.py
+++ b/spacy/tests/parser/test_sbd.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
 from spacy.tokens import Doc
+from spacy.syntax.nonproj import PseudoProjectivity
 
 
 @pytest.mark.models
@@ -41,88 +42,42 @@ def test_single_question(EN):
 
 
 @pytest.mark.models
-def test_sentence_breaks_no_space(EN):
+def test_sentence_breaks(EN):
     doc = EN.tokenizer.tokens_from_list(u'This is a sentence . This is another one .'.split(' '))
     EN.tagger(doc)
     with EN.parser.step_through(doc) as stepwise:
-        # stack empty, automatic Shift (This)
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
+        stepwise.transition('L-nsubj')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift a
+        stepwise.transition('S')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-det') # attach a
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach sentence
-        stepwise.transition('D') # remove sentence
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
+        stepwise.transition('L-det')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('B-ROOT') # set sentence start on This
-        # automatic reduction of the stack, automatic Shift to start second sentence
+        stepwise.transition('R-attr')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
+        stepwise.transition('D')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift another
+        stepwise.transition('R-punct')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-attr') # attach another
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach one
+        stepwise.transition('B-ROOT')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('D') # remove one
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
-        # buffer empty, automatic cleanup
+        stepwise.transition('L-nsubj')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('S')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-attr')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-attr')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('D')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-punct')
     assert len(list(doc.sents)) == 2
     for tok in doc:
         assert tok.dep != 0 or tok.is_space
     assert [ tok.head.i for tok in doc ] == [1,1,3,1,1,6,6,8,6,6]
 
 
-@pytest.mark.models
-def test_sentence_breaks_with_space(EN):
-    doc = EN.tokenizer.tokens_from_list(u'\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
-    EN.tagger(doc)
-    with EN.parser.step_through(doc) as stepwise:
-        # stack empty, automatic Shift (This)
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift a
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-det') # attach a
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach sentence
-        stepwise.transition('D') # remove sentence
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('B-ROOT') # set sentence start on This
-        # automatic reduction of the stack, automatic Shift to start second sentence
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift another
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-attr') # attach another
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach one
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('D') # remove one
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
-        # buffer empty, automatic cleanup
-    assert len(list(doc.sents)) == 2
-    for tok in doc:
-        assert tok.dep != 0 or tok.is_space
-    assert [ tok.head.i for tok in doc ] == [1,2,2,2,5,2,5,5,2,8,8,8,13,13,16,14,13,13]
-
-
-
 def apply_transition_sequence(model, doc, sequence):
     with model.parser.step_through(doc) as stepwise:
         for transition in sequence:
@@ -130,46 +85,46 @@
             stepwise.transition(transition)
 
 
 @pytest.mark.models
-def test_sbd_for_root_label_dependents(EN):
+def test_sbd_serialization_projective(EN):
     """
-    make sure that the parser properly introduces a sentence boundary without
-    the break transition by checking for dependents with the root label
-    """
-    example = EN.tokenizer.tokens_from_list(u"I saw a firefly It glowed".split(' '))
-    EN.tagger(example)
-    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','S','L-nsubj','R-ROOT'])
-
-    assert example[1].head.i == 1
-    assert example[5].head.i == 5
-
-    sents = list(example.sents)
-    assert len(sents) == 2
-    assert sents[1][0].orth_ == u'It'
-
-
-
-@pytest.mark.models
-def test_sbd_serialization(EN):
-    """
-    test that before and after serialization, the sentence boundaries are the same even
-    if the parser predicted two roots for the sentence that were made into two sentences
-    after parsing by arc_eager.finalize()
-
-    This is actually an interaction between the sentence boundary prediction and doc.from_array
-    The process is the following: if the parser doesn't predict a sentence boundary but attaches
-    a word with the ROOT label, the second root node is made root of its own sentence after parsing.
-    During serialization, sentence boundary information is lost and reintroduced when the code
-    is deserialized by introducing sentence starts at every left-edge of every root node.
-
-    BUG that is tested here: So far, the parser wasn't introducing a sentence start when
-    it introduced the second root node.
+    test that before and after serialization, the sentence boundaries are the same.
     """
     example = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
     EN.tagger(example)
-    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','D','D','S','L-nsubj','R-ROOT','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
+    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
 
     example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())
 
     assert example.to_bytes() == example_serialized.to_bytes()
     assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
+
+
+# TODO:
+# @pytest.mark.models
+# def test_sbd_serialization_nonprojective(DE):
+#     """
+#     test that before and after serialization, the sentence boundaries are the same in a non-projective sentence.
+#     """
+#     example = DE.tokenizer.tokens_from_list(u"Den Mann hat Peter nicht gesehen . Er war zu langsam .".split(' '))
+#     DE.tagger(example)
+#     apply_transition_sequence(DE, example, ['L-nk','L-oa||oc','R-sb','D','S','L-ng','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
+#     print([(t.dep_,t.head.i) for t in example])
+
+#     example_serialized = Doc(DE.vocab).from_bytes(example.to_bytes())
+
+#     assert example.to_bytes() == example_serialized.to_bytes()
+#     assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
+
+
+
+
+
+
+
+
+
+
+
+
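
Note on the test pattern: the surviving tests all follow the same shape: build a Doc from pre-split tokens, tag it, drive the parser through a fixed transition sequence (with 'B-ROOT' as the sentence-opening transition), then check that sentence boundaries survive a to_bytes()/from_bytes() round trip. Below is a minimal sketch of that pattern using only the test-suite APIs that appear in this patch; the helper name check_sbd_roundtrip and the `model` parameter are illustrative, not part of the patch, and `model` stands for a loaded pipeline as provided by the EN/DE pytest fixtures.

    from spacy.tokens import Doc

    def check_sbd_roundtrip(model, words, transitions):
        # Build a Doc from pre-split tokens and tag it (no parse yet).
        doc = model.tokenizer.tokens_from_list(words)
        model.tagger(doc)
        # Drive the parser through a fixed transition sequence;
        # 'B-ROOT' is the transition that starts a new sentence.
        with model.parser.step_through(doc) as stepwise:
            for transition in transitions:
                stepwise.transition(transition)
        # Serialize and deserialize; the sentence boundaries
        # should survive the round trip byte-for-byte.
        doc_roundtrip = Doc(model.vocab).from_bytes(doc.to_bytes())
        assert doc.to_bytes() == doc_roundtrip.to_bytes()
        assert [s.text for s in doc.sents] == [s.text for s in doc_roundtrip.sents]
        return list(doc.sents)

With such a helper, test_sbd_serialization_projective would reduce to a single call: check_sbd_roundtrip(EN, u"I bought a couch from IKEA. It was n't very comfortable .".split(' '), [...]) with the transition list shown in the hunk above.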