Fix re-parsing of previously parsed text

If a Doc object had already been parsed, re-parsing it could add invalid parses. There were two problems: 1) the existing parse was only partially erased, and 2) the RightArc action was able to create a 1-cycle. This patch fixes both errors and avoids resetting the parse if one is already present. In theory, this might allow a better parse to be predicted by running the parser twice. Closes #1253.
2017-10-20 16:24:48 +02:00 · 2017-10-20 16:24:48 +02:00 · f111b228e0
parent 61bc203f3f
commit f111b228e0
2 changed files with 30 additions and 4 deletions
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -212,7 +212,8 @@ cdef class LeftArc:
 cdef class RightArc:
    @staticmethod
    cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return st.B_(0).sent_start != 1
+        # If there's (perhaps partial) parse pre-set, don't allow cycle.
+        return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0)

    @staticmethod
    cdef int transition(StateC* st, attr_t label) nogil:
@@ -446,14 +447,19 @@ cdef class ArcEager(TransitionSystem):

    cdef int initialize_state(self, StateC* st) nogil:
        for i in range(st.length):
-            st._sent[i].l_edge = i
-            st._sent[i].r_edge = i
+            if st._sent[i].dep == 0:
+                st._sent[i].l_edge = i
+                st._sent[i].r_edge = i
+                st._sent[i].head = 0
+                st._sent[i].dep = 0
+                st._sent[i].l_kids = 0
+                st._sent[i].r_kids = 0
        st.fast_forward()

    cdef int finalize_state(self, StateC* st) nogil:
        cdef int i
        for i in range(st.length):
-            if st._sent[i].head == 0 and st._sent[i].dep == 0:
+            if st._sent[i].head == 0:
                st._sent[i].dep = self.root_label

    def finalize_doc(self, doc):
--- a/spacy/tests/regression/test_issue1253.py
+++ b/spacy/tests/regression/test_issue1253.py
@@ -0,0 +1,20 @@
+from __future__ import unicode_literals
+import pytest
+import spacy
+
+
+def ss(tt):
+    for i in range(len(tt)-1):
+        for j in range(i+1, len(tt)):
+            tt[i:j].root
+
+
+@pytest.mark.models('en')
+def test_access_parse_for_merged():
+    nlp = spacy.load('en_core_web_sm')
+    t_t = nlp.tokenizer("Highly rated - I'll definitely")
+    nlp.tagger(t_t)
+    nlp.parser(t_t)
+    nlp.parser(t_t)
+    ss(t_t)
+