mirror of https://github.com/explosion/spaCy.git
Fix re-parsing of previously parsed text
If a Doc object had been previously parsed, it was possible for invalid parses to be added. There were two problems: 1) the parse was only being partially erased; 2) the RightArc action was able to create a 1-cycle. This patch fixes both errors, and avoids resetting the parse if one is already present. In theory, this might allow a better parse to be predicted by running the parser twice. Closes #1253.
This commit is contained in:
parent
61bc203f3f
commit
f111b228e0
|
@ -212,7 +212,8 @@ cdef class LeftArc:
|
|||
cdef class RightArc:
|
||||
@staticmethod
cdef bint is_valid(const StateC* st, attr_t label) nogil:
    # Report whether the RightArc action may be applied in state `st`.
    #
    # `label` is not consulted; validity depends only on the state.
    # The action is rejected when either of these holds:
    #   * `st.B_(0).sent_start == 1`
    #     (presumably: the first buffer token opens a new sentence --
    #     confirm against StateC / TokenC), or
    #   * `st.H(st.S(0)) == st.B(0)`.
    #
    # If a (perhaps partial) parse is pre-set, don't allow a cycle: the
    # second test refuses the arc when the value H() reports for S(0) is
    # already B(0).
    return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0)
|
||||
|
||||
@staticmethod
|
||||
cdef int transition(StateC* st, attr_t label) nogil:
|
||||
|
@ -446,14 +447,19 @@ cdef class ArcEager(TransitionSystem):
|
|||
|
||||
cdef int initialize_state(self, StateC* st) nogil:
    # Prepare `st` for parsing, then call `st.fast_forward()`.
    #
    # Only tokens whose `dep` is 0 are reset; a token with a non-zero
    # `dep` is left untouched, so a parse that is already present on the
    # state is not erased.
    for i in range(st.length):
        if st._sent[i].dep != 0:
            # Already labelled: keep whatever parse is pre-set.
            continue
        # Clear the attachment fields of the unlabelled token ...
        st._sent[i].head = 0
        st._sent[i].dep = 0
        st._sent[i].l_kids = 0
        st._sent[i].r_kids = 0
        # ... and make both of its edges point at the token itself.
        st._sent[i].l_edge = i
        st._sent[i].r_edge = i
    st.fast_forward()
|
||||
|
||||
cdef int finalize_state(self, StateC* st) nogil:
    # Assign `self.root_label` as the `dep` of every token whose `head`
    # is 0.
    #
    # The label is written regardless of the token's current `dep`; the
    # earlier `dep == 0` requirement is intentionally gone, so tokens
    # that already carried a label are relabelled as well.
    # NOTE(review): `head` looks like a relative offset, so 0 would mean
    # "token is its own head" (a root) -- confirm against TokenC.
    cdef int i
    for i in range(st.length):
        if st._sent[i].head == 0:
            st._sent[i].dep = self.root_label
|
||||
|
||||
def finalize_doc(self, doc):
|
||||
|
|
|
@ -0,0 +1,20 @@
|
|||
from __future__ import unicode_literals
|
||||
import pytest
|
||||
import spacy
|
||||
|
||||
|
||||
def ss(tt):
    """Access ``.root`` on every non-empty contiguous slice of ``tt``.

    The attribute values are discarded; the helper exists only to trigger
    whatever error the ``.root`` lookup may raise on a badly formed parse.

    Slices run over ``tt[start:end]`` for ``0 <= start < end <= len(tt)``,
    so spans that end on the final token are exercised too.
    """
    length = len(tt)
    for start in range(length):
        # ``end`` is exclusive, hence the upper bound of ``length + 1``.
        for end in range(start + 1, length + 1):
            tt[start:end].root  # evaluated for its side effects only
|
||||
|
||||
|
||||
@pytest.mark.models('en')
def test_access_parse_for_merged():
    """Parse the same text twice, then read ``.root`` from its spans.

    Regression test for issue #1253: the test passes when no exception is
    raised while ``ss`` walks the spans of the re-parsed document.
    """
    pipeline = spacy.load('en_core_web_sm')
    doc = pipeline.tokenizer("Highly rated - I'll definitely")
    pipeline.tagger(doc)
    # The second pass runs on a document that already carries a parse.
    for _ in range(2):
        pipeline.parser(doc)
    ss(doc)
|
||||
|
Loading…
Reference in New Issue