From 04177debd0a4a75065dd00a7f9dabf927012a01c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 19 Jan 2016 02:54:15 +0100
Subject: [PATCH] * Unwind limit to sentence boundary detection that prevents
 it from inserting boundaries on whitespace. Replace it with a check for
 whitespace in StateClass.fast_forward, so that whitespace is LeftArced when
 it's on the stack. This should prevent the previous problem of
 whitespace-only sentences. Should fix Issue #184, but may cause further
 problems. Needs testing.

---
 spacy/syntax/arc_eager.pyx                  |  5 +----
 spacy/syntax/stateclass.pyx                 | 10 +++++++++-
 spacy/tests/parser/test_space_attachment.py |  4 ----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 1a198e460..79c75646d 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -9,7 +9,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse
 from ..gold cimport GoldParseC
-from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
+from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 from ..lexeme cimport Lexeme
 
 from libc.stdint cimport uint32_t
@@ -238,9 +238,6 @@ cdef class Break:
         elif (st.S(0) + 1) != st.B(0):
             # Must break at the token boundary
             return False
-        # Don't allow spaces to be the first word of a sentence
-        elif Lexeme.c_check_flag(st.B_(0).lex, IS_SPACE):
-            return False
         else:
             return True
 
diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx
index 9603b6607..1ee103f61 100644
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -2,6 +2,9 @@ from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
 from ..vocab cimport EMPTY_LEXEME
 from ..structs cimport Entity
+from ..lexeme cimport Lexeme
+from ..symbols cimport punct
+from ..attrs cimport IS_SPACE
 
 
 cdef class StateClass:
@@ -119,7 +122,9 @@ cdef class StateClass:
         self.shifted[self.B(0)] = True
 
     cdef void fast_forward(self) nogil:
-        while self.buffer_length() == 0 or self.stack_depth() == 0:
+        while self.buffer_length() == 0 \
+        or self.stack_depth() == 0 \
+        or Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
             if self.buffer_length() == 1 and self.stack_depth() == 0:
                 self.push()
                 self.pop()
@@ -132,6 +137,9 @@ cdef class StateClass:
                     self.unshift()
             elif (self.length - self._b_i) >= 1 and self.stack_depth() == 0:
                 self.push()
+            elif Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
+                self.add_arc(self.B(0), self.S(0), 0)
+                self.pop()
             else:
                 break
 
diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py
index d1e520a0f..f2a34f4c6 100644
--- a/spacy/tests/parser/test_space_attachment.py
+++ b/spacy/tests/parser/test_space_attachment.py
@@ -15,13 +15,9 @@ def test_space_attachment(EN):
             assert not sent[-1].is_space
 
 
-@pytest.mark.xfail
 def test_sentence_space(EN):
     text = ('''I look forward to using Thingamajig. I've been told it will '''
             '''make my life easier...''')
     doc = EN(text)
-    doc.from_array([HEAD], numpy.asarray([[1, 0, -1, -2, -1, -1, -5,
-                                           4, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]],
-                                           dtype='int32').T)
     assert len(list(doc.sents)) == 2
 