From 7195b6742d50a950a567bf3671bddec47b92b3d3 Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker <seeker@spacy.io>
Date: Mon, 28 Mar 2016 10:40:52 +0200
Subject: [PATCH 1/3] add restrictions to L-arc and R-arc to prevent space
 heads

---
 spacy/syntax/arc_eager.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index b92b66230..2257317bc 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -9,7 +9,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse
 from ..gold cimport GoldParseC
-from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
+from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
 from ..lexeme cimport Lexeme
 
 from libc.stdint cimport uint32_t
@@ -166,7 +166,7 @@ cdef class Reduce:
 cdef class LeftArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, int label) nogil:
-        return not st.B_(0).sent_start
+        return not st.B_(0).sent_start and not Lexeme.c_check_flag(st.B_(0).lex,IS_SPACE)
 
     @staticmethod
     cdef int transition(StateC* st, int label) nogil:
@@ -199,7 +199,7 @@ cdef class LeftArc:
 cdef class RightArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, int label) nogil:
-        return not st.B_(0).sent_start
+        return not st.B_(0).sent_start and not Lexeme.c_check_flag(st.S_(0).lex,IS_SPACE)
 
     @staticmethod
     cdef int transition(StateC* st, int label) nogil:

From d99a9cbce97dc0b771e2afdc1bbb445574cabbf1 Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker <seeker@spacy.io>
Date: Wed, 13 Apr 2016 15:28:28 +0200
Subject: [PATCH 2/3] different handling of space tokens

Space tokens are now always attached to the previous non-space token.
There are two exceptions:
- leading space tokens are attached to the first following non-space token;
- in input that consists exclusively of space tokens, the last space token
  is the head of all others.
---
 spacy/syntax/_state.pxd                     | 105 ++++++++++++++++----
 spacy/syntax/arc_eager.pyx                  |  24 +++--
 spacy/syntax/parser.pyx                     |  12 ++-
 spacy/syntax/stateclass.pyx                 |   4 +-
 spacy/tests/parser/test_parse.py            |  13 +++
 spacy/tests/parser/test_sbd.py              |  83 ++++++++++++++++
 spacy/tests/parser/test_space_attachment.py |  66 ++++++++++++
 7 files changed, 276 insertions(+), 31 deletions(-)

diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 401be9bf6..76cae3eb4 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -8,6 +8,10 @@ from ..symbols cimport punct
 from ..attrs cimport IS_SPACE
 
 
+cdef inline bint is_space_token(const TokenC* token) nogil:
+    return Lexeme.c_check_flag(token.lex, IS_SPACE)
+
+
 cdef cppclass StateC:
     int* _stack
     int* _buffer
@@ -292,23 +296,88 @@ cdef cppclass StateC:
         this._break = src._break
 
     void fast_forward() nogil:
-        while this.buffer_length() == 0 \
-        or this.stack_depth() == 0 \
-        or Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
-            if this.buffer_length() == 1 and this.stack_depth() == 0:
-                this.push()
-                this.pop()
-            elif this.buffer_length() == 0 and this.stack_depth() == 1:
-                this.pop()
-            elif this.buffer_length() == 0 and this.stack_depth() >= 2:
-                if this.has_head(this.S(0)):
+        # while this.buffer_length() == 0 \
+        # or this.stack_depth() == 0 \
+        # or Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
+        #     if this.buffer_length() == 1 and this.stack_depth() == 0:
+        #         this.push()
+        #         this.pop()
+        #     elif this.buffer_length() == 0 and this.stack_depth() == 1:
+        #         this.pop()
+        #     elif this.buffer_length() == 0 and this.stack_depth() >= 2:
+        #         if this.has_head(this.S(0)):
+        #             this.pop()
+        #         else:
+        #             this.unshift()
+        #     elif (this.length - this._b_i) >= 1 and this.stack_depth() == 0:
+        #         this.push()
+        #     elif Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
+        #         this.add_arc(this.B(0), this.S(0), 0)
+        #         this.pop()
+        #     else:
+        #         break
+
+        # space token attachement policy:
+        # - attach space tokens always to the last preceding real token
+        # - except if it's the beginning of a sentence, then attach to the first following
+        # - boundary case: a document containing multiple space tokens but nothing else,
+        #   then make the last space token the head of all others
+
+        while is_space_token(this.B_(0)) \
+        or this.buffer_length() == 0 \
+        or this.stack_depth() == 0:
+            if this.buffer_length() == 0:
+                # remove the last sentence's root from the stack
+                if this.stack_depth() == 1:
                     this.pop()
-                else:
-                    this.unshift()
-            elif (this.length - this._b_i) >= 1 and this.stack_depth() == 0:
-                this.push()
-            elif Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
-                this.add_arc(this.B(0), this.S(0), 0)
-                this.pop()
-            else:
+                # parser got stuck: reduce stack or unshift
+                elif this.stack_depth() > 1:
+                    if this.has_head(this.S(0)):
+                        this.pop()
+                    else:
+                        this.unshift()
+                # stack is empty but there is another sentence on the buffer
+                elif (this.length - this._b_i) >= 1:
+                    this.push()
+                else: # stack empty and nothing else coming
+                    break
+
+            elif is_space_token(this.B_(0)):
+                # the normal case: we're somewhere inside a sentence
+                if this.stack_depth() > 0:
+                    # assert not is_space_token(this.S_(0))
+                    # attach all coming space tokens to their last preceding
+                    # real token (which should be on the top of the stack)
+                    while is_space_token(this.B_(0)):
+                        this.add_arc(this.S(0),this.B(0),0)
+                        this.push()
+                        this.pop()
+                # the rare case: we're at the beginning of a document:
+                # space tokens are attached to the first real token on the buffer
+                elif this.stack_depth() == 0:
+                    # store all space tokens on the stack until a real token shows up
+                    # or the last token on the buffer is reached
+                    while is_space_token(this.B_(0)) and this.buffer_length() > 1:
+                        this.push()
+                    # empty the stack by attaching all space tokens to the
+                    # first token on the buffer
+                    # boundary case: if all tokens are space tokens, the last one
+                    # becomes the head of all others
+                    while this.stack_depth() > 0:
+                        this.add_arc(this.B(0),this.S(0),0)
+                        this.pop()
+                    # move the first token onto the stack
+                    this.push()
+
+            elif this.stack_depth() == 0:
+                # only one token left on the buffer: shift it and pop it right away
+                if this.buffer_length() == 1:
+                    this.push()
+                    this.pop()
+                # with an empty stack and a non-empty buffer
+                # only shift is valid anyway
+                elif (this.length - this._b_i) >= 1:
+                    this.push()
+
+            else: # unreachable: the loop condition implies one of the branches above
                 break
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 2257317bc..2305c309f 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -17,7 +17,7 @@ from libc.string cimport memcpy
 
 from cymem.cymem cimport Pool
 from .stateclass cimport StateClass
-from ._state cimport StateC
+from ._state cimport StateC, is_space_token
 
 
 DEF NON_MONOTONIC = True
@@ -166,7 +166,7 @@ cdef class Reduce:
 cdef class LeftArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, int label) nogil:
-        return not st.B_(0).sent_start and not Lexeme.c_check_flag(st.B_(0).lex,IS_SPACE)
+        return not st.B_(0).sent_start
 
     @staticmethod
     cdef int transition(StateC* st, int label) nogil:
@@ -199,7 +199,7 @@ cdef class LeftArc:
 cdef class RightArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, int label) nogil:
-        return not st.B_(0).sent_start and not Lexeme.c_check_flag(st.S_(0).lex,IS_SPACE)
+        return not st.B_(0).sent_start
 
     @staticmethod
     cdef int transition(StateC* st, int label) nogil:
@@ -233,11 +233,23 @@ cdef class Break:
             return False
         elif st.at_break():
             return False
-        elif st.B(0) == 0:
-            return False
+        # unnecessary, since the first item in the buffer is always put onto the stack
+        # automatically by fast_forward() in initialize_state()
+        # elif st.B(0) == 0:
+        #     return False
         elif st.stack_depth() < 1:
             return False
-        elif (st.S(0) + 1) != st.B(0):
+        # It is okay to predict a sentence boundary if the top item on the stack
+        # and the first item on the buffer are adjacent tokens. If this is not the
+        # case, it is still okay if there are only space tokens in between.
+        # This is checked by testing whether the head of a space token immediately
+        # preceding the first item in the buffer is the top item on the stack.
+        # Intervening space tokens must be attached to the previous non-space token.
+        # Therefore, if the head of a space token that immediately precedes the first
+        # item on the buffer is the top item on the stack, a sentence boundary can be
+        # predicted.
+        elif (st.S(0) + 1) != st.B(0) \
+        and not (is_space_token(st.safe_get(st.B(0)-1)) and st.H(st.B(0)-1) == st.S(0)):
             # Must break at the token boundary
             return False
         else:
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index ba1f4f1b8..20cce7bb6 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -188,9 +188,11 @@ cdef class Parser:
 
             action = self.moves.c[guess]
             if not eg.is_valid[guess]:
-                with gil:
-                    move_name = self.moves.move_name(action.move, action.label)
-                    return 1
+                # with gil:
+                #     move_name = self.moves.move_name(action.move, action.label)
+                #     print 'invalid action:', move_name
+                return 1
+
             action.do(state, action.label)
             memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class)
             for i in range(eg.nr_class):
@@ -275,12 +277,12 @@ cdef class StepwiseState:
 
     @property
     def heads(self):
-        return [self.stcls.H(i) for i in range(self.stcls.length)]
+        return [self.stcls.H(i) for i in range(self.stcls.c.length)]
 
     @property
     def deps(self):
         return [self.doc.vocab.strings[self.stcls.c._sent[i].dep]
-                for i in range(self.stcls.length)]
+                for i in range(self.stcls.c.length)]
 
     def predict(self):
         self.eg.reset()
diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index 775e613cd..a18cc284a 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -17,11 +17,11 @@ cdef class StateClass:
 
     @property
     def stack(self):
-        return {self.S(i) for i in range(self._s_i)}
+        return {self.S(i) for i in range(self.c._s_i)}
 
     @property
     def queue(self):
-        return {self.B(i) for i in range(self._b_i)}
+        return {self.B(i) for i in range(self.c._b_i)}
 
     def print_state(self, words):
         words = list(words) + ['_']
diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py
index e99148933..d4b633d0d 100644
--- a/spacy/tests/parser/test_parse.py
+++ b/spacy/tests/parser/test_parse.py
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import pytest
 
 
@@ -6,3 +8,14 @@ def test_root(EN):
     tokens = EN(u"i don't have other assistance")
     for t in tokens:
         assert t.dep != 0, t.orth_
+
+
+@pytest.mark.models
+def test_one_word_sentence(EN):
+    """A one-token document must end up with a non-zero dependency label."""
+    doc = EN.tokenizer.tokens_from_list(['Hello'])
+    EN.tagger(doc)
+    assert len(doc) == 1
+    with EN.parser.step_through(doc) as _:
+        pass
+    assert doc[0].dep != 0
diff --git a/spacy/tests/parser/test_sbd.py b/spacy/tests/parser/test_sbd.py
index 57a79525f..771e2401f 100644
--- a/spacy/tests/parser/test_sbd.py
+++ b/spacy/tests/parser/test_sbd.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 import pytest
 
 
+
 @pytest.mark.models
 def test_single_period(EN):
     string = 'A test sentence.'
@@ -37,3 +38,85 @@ def test_single_question(EN):
     assert len(words) == 4
     assert len(list(words.sents)) == 1
     assert sum(len(sent) for sent in words.sents) == len(words)
+
+
+@pytest.mark.models
+def test_sentence_breaks_no_space(EN):
+    doc = EN.tokenizer.tokens_from_list('This is a sentence . This is another one .'.split(' '))
+    EN.tagger(doc)
+    with EN.parser.step_through(doc) as stepwise:
+        # stack empty, automatic Shift (This)
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-nsubj') # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('S') # shift a
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-det') # attach a
+        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-attr') # attach sentence
+        stepwise.transition('D') # remove sentence
+        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-punct') # attach .
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('B-ROOT') # set sentence start on This
+        # automatic reduction of the stack, automatic Shift to start second sentence
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-nsubj') # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('S') # shift another
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-attr') # attach another
+        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-attr') # attach one
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('D') # remove one
+        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-punct') # attach .
+        # buffer empty, automatic cleanup
+    assert len(list(doc.sents)) == 2
+    for tok in doc:
+        assert tok.dep != 0 or tok.is_space
+    assert [ tok.head.i for tok in doc ] == [1,1,3,1,1,6,6,8,6,6]
+
+
+@pytest.mark.models
+def test_sentence_breaks_with_space(EN):
+    doc = EN.tokenizer.tokens_from_list('\t This is \n a sentence \n \n . \n \t \n This is another \t one .'.split(' '))
+    EN.tagger(doc)
+    with EN.parser.step_through(doc) as stepwise:
+        # stack empty, automatic Shift (This)
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-nsubj') # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('S') # shift a
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-det') # attach a
+        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-attr') # attach sentence
+        stepwise.transition('D') # remove sentence
+        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-punct') # attach .
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('B-ROOT') # set sentence start on This
+        # automatic reduction of the stack, automatic Shift to start second sentence
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-nsubj') # attach This
+        # stack empty, automatic Shift (is)
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('S') # shift another
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('L-attr') # attach another
+        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-attr') # attach one
+        assert EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('D') # remove one
+        assert not EN.parser.moves.is_valid(stepwise.stcls,'B-ROOT')
+        stepwise.transition('R-punct') # attach .
+        # buffer empty, automatic cleanup
+    assert len(list(doc.sents)) == 2
+    for tok in doc:
+        assert tok.dep != 0 or tok.is_space
+    assert [ tok.head.i for tok in doc ] == [1,2,2,2,5,2,5,5,2,8,8,8,13,13,16,14,13,13]
diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py
index ca533e3ef..102618446 100644
--- a/spacy/tests/parser/test_space_attachment.py
+++ b/spacy/tests/parser/test_space_attachment.py
@@ -4,6 +4,12 @@ import pytest
 import numpy
 from spacy.attrs import HEAD
 
+def make_doc(EN, sentstr):
+    """Return a tagged Doc built from `sentstr` split on single spaces."""
+    doc = EN.tokenizer.tokens_from_list(sentstr.split(' '))
+    EN.tagger(doc)
+    return doc
+
 
 @pytest.mark.models
 def test_space_attachment(EN):
@@ -22,3 +28,63 @@ def test_sentence_space(EN):
     doc = EN(text)
     assert len(list(doc.sents)) == 2
 
+
+@pytest.mark.models
+def test_space_attachment_leading_space(EN):
+    """Leading space tokens attach to the first following non-space token."""
+    doc = make_doc(EN, '\t \n This is a sentence .')
+    assert doc[0].is_space
+    assert doc[1].is_space
+    assert doc[2].orth_ == 'This'
+    with EN.parser.step_through(doc) as stepwise:
+        pass
+    assert doc[0].head.i == 2
+    assert doc[1].head.i == 2
+    assert stepwise.stack == set([2])
+
+
+@pytest.mark.models
+def test_space_attachment_intermediate_and_trailing_space(EN):
+    """Inner and trailing space tokens attach to the preceding non-space token."""
+    doc = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n')
+    assert doc[2].is_space
+    assert doc[4].is_space
+    assert doc[5].is_space
+    assert doc[8].is_space
+    assert doc[9].is_space
+    with EN.parser.step_through(doc) as stepwise:
+        stepwise.transition('L-nsubj')
+        stepwise.transition('S')
+        stepwise.transition('L-det')
+        stepwise.transition('R-attr')
+        stepwise.transition('D')
+        stepwise.transition('R-punct')
+    assert stepwise.stack == set([])
+    for tok in doc:
+        assert tok.dep != 0 or tok.is_space
+    assert [tok.head.i for tok in doc] == [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
+
+
+@pytest.mark.models
+def test_space_attachment_one_space_sentence(EN):
+    """A document made of a single space token: that token is its own head."""
+    doc = make_doc(EN, '\n')
+    assert len(doc) == 1
+    with EN.parser.step_through(doc) as _:
+        pass
+    assert doc[0].is_space
+    assert doc[0].head.i == 0
+
+
+@pytest.mark.models
+def test_space_attachment_only_space_sentence(EN):
+    """In a space-only document the last space token heads every token."""
+    doc = make_doc(EN, '\n \t \n\n \t')
+    assert len(doc) == 4
+    for tok in doc:
+        assert tok.is_space
+    with EN.parser.step_through(doc) as _:
+        pass
+    # all tokens are attached to the last one
+    for tok in doc:
+        assert tok.head.i == 3

From 289b10f441069c9ccccdd7205c43c6775cc68ddf Mon Sep 17 00:00:00 2001
From: Wolfgang Seeker <seeker@spacy.io>
Date: Thu, 14 Apr 2016 15:37:51 +0200
Subject: [PATCH 3/3] remove some comments

---
 spacy/syntax/_state.pxd    | 21 ---------------------
 spacy/syntax/arc_eager.pyx |  4 ----
 2 files changed, 25 deletions(-)

diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 76cae3eb4..d7a24dbd1 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -296,27 +296,6 @@ cdef cppclass StateC:
         this._break = src._break
 
     void fast_forward() nogil:
-        # while this.buffer_length() == 0 \
-        # or this.stack_depth() == 0 \
-        # or Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
-        #     if this.buffer_length() == 1 and this.stack_depth() == 0:
-        #         this.push()
-        #         this.pop()
-        #     elif this.buffer_length() == 0 and this.stack_depth() == 1:
-        #         this.pop()
-        #     elif this.buffer_length() == 0 and this.stack_depth() >= 2:
-        #         if this.has_head(this.S(0)):
-        #             this.pop()
-        #         else:
-        #             this.unshift()
-        #     elif (this.length - this._b_i) >= 1 and this.stack_depth() == 0:
-        #         this.push()
-        #     elif Lexeme.c_check_flag(this.S_(0).lex, IS_SPACE):
-        #         this.add_arc(this.B(0), this.S(0), 0)
-        #         this.pop()
-        #     else:
-        #         break
-
         # space token attachement policy:
         # - attach space tokens always to the last preceding real token
         # - except if it's the beginning of a sentence, then attach to the first following
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 2305c309f..48614b591 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -233,10 +233,6 @@ cdef class Break:
             return False
         elif st.at_break():
             return False
-        # unnecessary, since the first item in the buffer is always put onto the stack
-        # automatically by fast_forward() in initialize_state()
-        # elif st.B(0) == 0:
-        #     return False
         elif st.stack_depth() < 1:
             return False
         # It is okay to predict a sentence boundary if the top item on the stack