Fix re-parsing of previously parsed text

If a Doc object had already been parsed, running the parser on it again
could produce an invalid parse. There were two problems:

1) The parse was only being partially erased
2) The RightArc action was able to create a 1-cycle.

This patch fixes both errors, and avoids resetting the parse if one is
present. In theory this might allow a better parse to be predicted by
running the parser twice.

Closes #1253.
This commit is contained in:
Matthew Honnibal 2017-10-20 16:24:48 +02:00
parent 61bc203f3f
commit f111b228e0
2 changed files with 30 additions and 4 deletions

View File

@ -212,7 +212,8 @@ cdef class LeftArc:
cdef class RightArc:
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
return st.B_(0).sent_start != 1
# If a (perhaps partial) parse is pre-set, don't allow a cycle.
return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0)
@staticmethod
cdef int transition(StateC* st, attr_t label) nogil:
@ -446,14 +447,19 @@ cdef class ArcEager(TransitionSystem):
cdef int initialize_state(self, StateC* st) nogil:
for i in range(st.length):
st._sent[i].l_edge = i
st._sent[i].r_edge = i
if st._sent[i].dep == 0:
st._sent[i].l_edge = i
st._sent[i].r_edge = i
st._sent[i].head = 0
st._sent[i].dep = 0
st._sent[i].l_kids = 0
st._sent[i].r_kids = 0
st.fast_forward()
cdef int finalize_state(self, StateC* st) nogil:
cdef int i
for i in range(st.length):
if st._sent[i].head == 0 and st._sent[i].dep == 0:
if st._sent[i].head == 0:
st._sent[i].dep = self.root_label
def finalize_doc(self, doc):

View File

@ -0,0 +1,20 @@
from __future__ import unicode_literals
import pytest
import spacy
def ss(tt):
    """Touch ``.root`` on every slice ``tt[start:stop]`` with start < stop < len(tt).

    Nothing is returned; the attribute is read only so that any error raised
    while resolving a slice's root surfaces to the caller.
    """
    length = len(tt)
    for start in range(length - 1):
        for stop in range(start + 1, length):
            # The value is discarded on purpose: the access itself is the check.
            _ = tt[start:stop].root
@pytest.mark.models('en')
def test_access_parse_for_merged():
    """Regression test: parsing the same doc twice must leave usable spans.

    The doc is tokenized, tagged, and then run through the parser two times;
    afterwards every sub-span's ``.root`` is accessed via ``ss``. The test
    passes if none of those steps raises.
    """
    nlp = spacy.load('en_core_web_sm')
    doc = nlp.tokenizer("Highly rated - I'll definitely")
    nlp.tagger(doc)
    # The second pass is the point of the test: it re-parses an already-parsed doc.
    for _ in range(2):
        nlp.parser(doc)
    ss(doc)