* Fix sentence-final whitespace issue

2015-11-07 17:34:46 +11:00 · 2015-11-07 17:34:46 +11:00 · 1cfa20fb17
parent 7663970d5f
commit 1cfa20fb17
1 changed file with 4 additions and 13 deletions
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -238,6 +238,9 @@ cdef class Break:
        elif (st.S(0) + 1) != st.B(0):
            # Must break at the token boundary
            return False
+        # Don't allow spaces to be the first word of a sentence
+        elif Lexeme.c_check_flag(st.B_(0).lex, IS_SPACE):
+            return False
        else:
            return True
@@ -382,19 +385,7 @@ cdef class ArcEager(TransitionSystem):
    cdef int finalize_state(self, StateClass st) nogil:
        cdef int i
        for i in range(st.length):
-            # Always attach spaces to the previous word
+            if st._sent[i].head == 0 and st._sent[i].dep == 0:
-            if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE):
-                if st._sent[i].sent_start and st._sent[i].head == -1:
-                    st._sent[i].sent_start = False
-                    # If we had this space token as the start of a sentence,
-                    # move that sentence start forward one
-                    if (i + 1) < st.length and not st._sent[i+1].sent_start:
-                        st._sent[i+1].sent_start = True
-                    if i >= 1:
-                        st.add_arc(i-1, i, st._sent[i].dep)
-                    else:
-                        st.add_arc(i+1, i, st._sent[i].dep)
-            elif st._sent[i].head == 0 and st._sent[i].dep == 0:
                st._sent[i].dep = self.root_label
            # If we're not using the Break transition, we segment via root-labelled
            # arcs between the root words.