* Remove the restriction on sentence boundary detection that prevented it from inserting boundaries on whitespace. Replace it with a check for whitespace in StateClass.fast_forward, so that a whitespace token is LeftArced (attached to the first word of the buffer and popped) when it is on top of the stack. This should prevent the previous problem of whitespace-only sentences. Should fix Issue #184, but may cause further problems. Needs testing.

This commit is contained in:
Matthew Honnibal 2016-01-19 02:54:15 +01:00
parent 7893de3203
commit 04177debd0
3 changed files with 10 additions and 9 deletions

View File

@ -9,7 +9,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse
from ..gold cimport GoldParseC
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..lexeme cimport Lexeme
from libc.stdint cimport uint32_t
@ -238,9 +238,6 @@ cdef class Break:
elif (st.S(0) + 1) != st.B(0):
# Must break at the token boundary
return False
# Don't allow spaces to be the first word of a sentence
elif Lexeme.c_check_flag(st.B_(0).lex, IS_SPACE):
return False
else:
return True

View File

@ -2,6 +2,9 @@ from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t
from ..vocab cimport EMPTY_LEXEME
from ..structs cimport Entity
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
cdef class StateClass:
@ -119,7 +122,9 @@ cdef class StateClass:
self.shifted[self.B(0)] = True
cdef void fast_forward(self) nogil:
while self.buffer_length() == 0 or self.stack_depth() == 0:
while self.buffer_length() == 0 \
or self.stack_depth() == 0 \
or Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
if self.buffer_length() == 1 and self.stack_depth() == 0:
self.push()
self.pop()
@ -132,6 +137,9 @@ cdef class StateClass:
self.unshift()
elif (self.length - self._b_i) >= 1 and self.stack_depth() == 0:
self.push()
elif Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
self.add_arc(self.B(0), self.S(0), 0)
self.pop()
else:
break

View File

@ -15,13 +15,9 @@ def test_space_attachment(EN):
assert not sent[-1].is_space
@pytest.mark.xfail
def test_sentence_space(EN):
text = ('''I look forward to using Thingamajig. I've been told it will '''
'''make my life easier...''')
doc = EN(text)
doc.from_array([HEAD], numpy.asarray([[1, 0, -1, -2, -1, -1, -5,
4, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]],
dtype='int32').T)
assert len(list(doc.sents)) == 2