diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 1b76f724c..5cc04c96c 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -447,6 +447,7 @@ cdef class ArcEager(TransitionSystem):
             # note that this can create non-projective trees if there are arcs
             # between nodes on both sides of the new root node
             st._sent[i].head = 0
+            st._sent[st._sent[i].l_edge].sent_start = True
 
     cdef int set_valid(self, int* output, const StateC* st) nogil:
         cdef bint[N_MOVES] is_valid
diff --git a/spacy/tests/parser/test_sbd.py b/spacy/tests/parser/test_sbd.py
index 771e2401f..247c14a34 100644
--- a/spacy/tests/parser/test_sbd.py
+++ b/spacy/tests/parser/test_sbd.py
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals
 
 import pytest
-
+from spacy.tokens import Doc
 
 
 @pytest.mark.models
@@ -42,7 +42,7 @@ def test_single_question(EN):
 
 @pytest.mark.models
 def test_sentence_breaks_no_space(EN):
-    doc = EN.tokenizer.tokens_from_list('This is a sentence . This is another one .'.split(' '))
+    doc = EN.tokenizer.tokens_from_list(u'This is a sentence . This is another one .'.split(' '))
     EN.tagger(doc)
     with EN.parser.step_through(doc) as stepwise:
         # stack empty, automatic Shift (This)
@@ -83,7 +83,7 @@ def test_sentence_breaks_no_space(EN):
 
 @pytest.mark.models
 def test_sentence_breaks_with_space(EN):
-    doc = EN.tokenizer.tokens_from_list('\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
+    doc = EN.tokenizer.tokens_from_list(u'\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
     EN.tagger(doc)
     with EN.parser.step_through(doc) as stepwise:
         # stack empty, automatic Shift (This)
@@ -120,3 +120,71 @@ def test_sentence_breaks_with_space(EN):
     for tok in doc:
         assert tok.dep != 0 or tok.is_space
     assert [ tok.head.i for tok in doc ] == [1,2,2,2,5,2,5,5,2,8,8,8,13,13,16,14,13,13]
+
+
+
+@pytest.fixture
+@pytest.mark.models
+def example(EN):
+    def apply_transition_sequence(model, doc, sequence):
+        with model.parser.step_through(doc) as stepwise:
+            for transition in sequence:
+                stepwise.transition(transition)
+    doc = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
+    EN.tagger(doc)
+    apply_transition_sequence(EN, doc, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','D','D','S','L-nsubj','R-ROOT','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])
+    return doc
+
+
+def test_sbd_for_root_label_dependents(example):
+    """
+    Make sure that the parser properly introduces a sentence boundary without
+    the break transition, by checking for dependents with the ROOT label.
+    """
+
+    assert example[1].head.i == 1
+    assert example[7].head.i == 7
+
+    sents = list(example.sents)
+    assert len(sents) == 2
+    assert sents[1][0].orth_ == u'It'
+
+
+
+@pytest.mark.models
+def test_sbd_serialization(EN, example):
+    """
+    Test that the sentence boundaries are the same before and after serialization,
+    even if the parser predicted two roots for the sentence that were made into
+    two sentences after parsing by arc_eager.finalize().
+
+    This is actually an interaction between sentence boundary prediction and doc.from_array.
+    The process is the following: if the parser doesn't predict a sentence boundary but attaches
+    a word with the ROOT label, the second root node is made root of its own sentence after parsing.
+    During serialization, sentence boundary information is lost and reintroduced when the doc
+    is deserialized, by introducing sentence starts at the left edge of every root node.
+
+    BUG that is tested here: so far, the parser wasn't introducing a sentence start when
+    it introduced the second root node.
+    """
+
+    example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())
+
+    assert example.to_bytes() == example_serialized.to_bytes()
+    assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
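For context, the invariant exercised by the new tests can be sketched in plain Python: a root that is only introduced at finalization must also flag a sentence start at its left edge, because deserialization recovers boundaries purely from those flags. This is a hypothetical illustration, not spaCy's internals; Tok, finalize() and boundaries() are stand-in names, and the real parser stores head offsets and maintains l_edge incrementally.

# A minimal pure-Python sketch (hypothetical; not spaCy's data structures) of the
# invariant restored by the one-line arc_eager.pyx change: every root token must
# have sent_start flagged at its left edge, otherwise the boundary cannot be
# recovered after a to_bytes()/from_bytes() round trip.

class Tok:
    def __init__(self, i):
        self.i = i               # token index
        self.head = None         # absolute index of the head token; None = unattached
        self.l_edge = i          # leftmost descendant of this token's subtree
        self.sent_start = False  # explicit sentence-boundary flag


def finalize(sent):
    # Mimic the fixed finalization step: an unattached word becomes the root of
    # its own sentence AND the token at its left edge is marked as a sentence
    # start (the behaviour added by the arc_eager.pyx hunk above).
    for tok in sent:
        if tok.head is None:
            tok.head = tok.i
            sent[tok.l_edge].sent_start = True


def boundaries(sent):
    # Recover sentence starts the way deserialization does: at the left edge
    # of every root node (tokens that are their own head).
    return sorted({t.l_edge for t in sent if t.head == t.i})


# Example: token 3 is never attached by the "parser", so finalize() makes it a
# second root and flags a boundary there; boundaries() then finds both roots.
toks = [Tok(i) for i in range(6)]
toks[0].head, toks[1].head, toks[2].head = 1, 1, 1   # first tree, root at token 1
toks[1].l_edge = 0                                   # root 1 spans from token 0
toks[4].head, toks[5].head = 3, 3                    # second tree, root 3 left unattached
finalize(toks)
assert boundaries(toks) == [0, 3]
assert toks[3].sent_start is True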