From 9dd2f25c7438c81f7122f9de28f4d35e1e6b0911 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Sat, 10 Oct 2015 15:53:30 +1100
Subject: [PATCH] * Fix Issue #131: Force whitespace characters to attach
 syntactically to previous token, and ensure they cannot serve as stand-alone
 'sentence' units.

---
 spacy/syntax/arc_eager.pyx | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 07595d4ab..561308928 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -380,10 +380,17 @@ cdef class ArcEager(TransitionSystem):
         st.fast_forward()
 
     cdef int finalize_state(self, StateClass st) nogil:
+        cdef int i
         for i in range(st.length):
             # Always attach spaces to the previous word
             if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE):
                 st._sent[i].head = -1 if (i >= 1) else 1
+                if st._sent[i].sent_start and st._sent[i].head == -1:
+                    st._sent[i].sent_start = False
+                    # This space token was marked as a sentence start: hand
+                    # that mark to the next token, if there is one.
+                    if (i + 1) < st.length and not st._sent[i+1].sent_start:
+                        st._sent[i+1].sent_start = True
             elif st._sent[i].head == 0 and st._sent[i].dep == 0:
                 st._sent[i].dep = self.root_label
             # If we're not using the Break transition, we segment via root-labelled