diff --git a/spacy/tests/parser/test_sbd.py b/spacy/tests/parser/test_sbd.py
index 1f66368e4..fea6b4162 100644
--- a/spacy/tests/parser/test_sbd.py
+++ b/spacy/tests/parser/test_sbd.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
 from spacy.tokens import Doc
+from spacy.syntax.nonproj import PseudoProjectivity
 
 
 @pytest.mark.models
@@ -41,88 +42,42 @@ def test_single_question(EN):
 
 
 @pytest.mark.models
-def test_sentence_breaks_no_space(EN):
+def test_sentence_breaks(EN):
     doc = EN.tokenizer.tokens_from_list(u'This is a sentence . This is another one .'.split(' '))
     EN.tagger(doc)
     with EN.parser.step_through(doc) as stepwise:
-        # stack empty, automatic Shift (This)
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
+        stepwise.transition('L-nsubj')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift a
+        stepwise.transition('S')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-det') # attach a
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach sentence
-        stepwise.transition('D') # remove sentence
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
+        stepwise.transition('L-det')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('B-ROOT') # set sentence start on This
-        # automatic reduction of the stack, automatic Shift to start second sentence
+        stepwise.transition('R-attr')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
+        stepwise.transition('D')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift another
+        stepwise.transition('R-punct')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-attr') # attach another
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach one
+        stepwise.transition('B-ROOT')
         assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('D') # remove one
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
-        # buffer empty, automatic cleanup
+        stepwise.transition('L-nsubj')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('S')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-attr')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-attr')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('D')
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-punct')
     assert len(list(doc.sents)) == 2
     for tok in doc:
         assert tok.dep != 0 or tok.is_space
     assert [ tok.head.i for tok in doc ] == [1,1,3,1,1,6,6,8,6,6]
 
 
-@pytest.mark.models
-def test_sentence_breaks_with_space(EN):
-    doc = EN.tokenizer.tokens_from_list(u'\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
-    EN.tagger(doc)
-    with EN.parser.step_through(doc) as stepwise:
-        # stack empty, automatic Shift (This)
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift a
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-det') # attach a
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach sentence
-        stepwise.transition('D') # remove sentence
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('B-ROOT') # set sentence start on This
-        # automatic reduction of the stack, automatic Shift to start second sentence
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-nsubj') # attach This
-        # stack empty, automatic Shift (is)
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('S') # shift another
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('L-attr') # attach another
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-attr') # attach one
-        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('D') # remove one
-        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
-        stepwise.transition('R-punct') # attach .
-        # buffer empty, automatic cleanup
-    assert len(list(doc.sents)) == 2
-    for tok in doc:
-        assert tok.dep != 0 or tok.is_space
-    assert [ tok.head.i for tok in doc ] == [1,2,2,2,5,2,5,5,2,8,8,8,13,13,16,14,13,13]
-
-
-
 def apply_transition_sequence(model, doc, sequence):
     with model.parser.step_through(doc) as stepwise:
         for transition in sequence:
@@ -130,46 +85,46 @@
 
 
 @pytest.mark.models
-def test_sbd_for_root_label_dependents(EN):
+def test_sbd_serialization_projective(EN):
     """
-    make sure that the parser properly introduces a sentence boundary without
-    the break transition by checking for dependents with the root label
-    """
-    example = EN.tokenizer.tokens_from_list(u"I saw a firefly It glowed".split(' '))
-    EN.tagger(example)
-    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','S','L-nsubj','R-ROOT'])
-
-    assert example[1].head.i == 1
-    assert example[5].head.i == 5
-
-    sents = list(example.sents)
-    assert len(sents) == 2
-    assert sents[1][0].orth_ == u'It'
-
-
-
-@pytest.mark.models
-def test_sbd_serialization(EN):
-    """
-    test that before and after serialization, the sentence boundaries are the same even
-    if the parser predicted two roots for the sentence that were made into two sentences
-    after parsing by arc_eager.finalize()
-
-    This is actually an interaction between the sentence boundary prediction and doc.from_array
-    The process is the following: if the parser doesn't predict a sentence boundary but attaches
-    a word with the ROOT label, the second root node is made root of its own sentence after parsing.
-    During serialization, sentence boundary information is lost and reintroduced when the code
-    is deserialized by introducing sentence starts at every left-edge of every root node.
-
-    BUG that is tested here: So far, the parser wasn't introducing a sentence start when
-    it introduced the second root node.
+    test that before and after serialization, the sentence boundaries are the same.
     """
     example = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
     EN.tagger(example)
-    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','D','D','S','L-nsubj','R-ROOT','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
+    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
 
     example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())
 
     assert example.to_bytes() == example_serialized.to_bytes()
     assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
+
+
+# TODO:
+# @pytest.mark.models
+# def test_sbd_serialization_nonprojective(DE):
+#     """
+#     test that before and after serialization, the sentence boundaries are the same in a non-projective sentence.
+#     """
+#     example = DE.tokenizer.tokens_from_list(u"Den Mann hat Peter nicht gesehen . Er war zu langsam .".split(' '))
+#     DE.tagger(example)
+#     apply_transition_sequence(DE, example, ['L-nk','L-oa||oc','R-sb','D','S','L-ng','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
+#     print([(t.dep_, t.head.i) for t in example])
+
+#     example_serialized = Doc(DE.vocab).from_bytes(example.to_bytes())
+
+#     assert example.to_bytes() == example_serialized.to_bytes()
+#     assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
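
Note on the pattern the tests in this file rely on: `step_through()` hands the test manual control of the arc-eager parser, so it can force an exact transition sequence instead of depending on model predictions, and `'B-ROOT'` is the move that introduces a sentence break. A minimal sketch of that pattern, assuming `nlp` is a loaded spaCy 1.x English pipeline (what the `EN` fixture provides); the helper name `parse_with_transitions` is hypothetical:

```python
from __future__ import unicode_literals


def parse_with_transitions(nlp, words, transitions):
    # Build a Doc from pre-tokenized words and tag it, as the tests do.
    doc = nlp.tokenizer.tokens_from_list(words)
    nlp.tagger(doc)
    # Drive the parser manually: check each move is legal in the current
    # parser state, then apply it.
    with nlp.parser.step_through(doc) as stepwise:
        for move in transitions:
            assert nlp.parser.moves.is_valid(stepwise.stcls, move)
            stepwise.transition(move)
    return doc


# 'B-ROOT' inserts a sentence break, so one token list parses into two sentences
# (this is exactly the sequence test_sentence_breaks applies):
words = u'This is a sentence . This is another one .'.split(' ')
moves = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT',
         'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct']
doc = parse_with_transitions(nlp, words, moves)
assert len(list(doc.sents)) == 2
```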
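The projective serialization test reduces to one round-trip property: deserializing a parsed `Doc` and re-serializing it must be byte-identical, and the sentence boundaries, which are recomputed from the deserialized heads rather than stored directly, must come out the same. A standalone sketch of that check, assuming `doc` is any parsed `Doc` (the helper name is hypothetical):

```python
from spacy.tokens import Doc


def assert_sents_survive_roundtrip(doc):
    # Deserialize into a fresh Doc backed by the same vocab.
    restored = Doc(doc.vocab).from_bytes(doc.to_bytes())
    # Byte-level round trip: re-serializing must reproduce the same bytes.
    assert doc.to_bytes() == restored.to_bytes()
    # Sentence boundaries are reintroduced after deserialization, so the
    # recomputed sentences must match the originals.
    assert [s.text for s in doc.sents] == [s.text for s in restored.sents]
```

The new `PseudoProjectivity` import is presumably only needed once the commented-out non-projective test is enabled: the decorated label `'L-oa||oc'` in that sequence looks like the `||`-delimited scheme `spacy.syntax.nonproj` uses to encode non-projective attachments.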