mirror of https://github.com/explosion/spaCy.git
* Remove the restriction in sentence boundary detection that prevented it from inserting a boundary before whitespace. Replace it with a whitespace check in StateClass.fast_forward, so that a whitespace token is LeftArced whenever it is on top of the stack. This should prevent the earlier problem of whitespace-only sentences. Should fix Issue #184, but may cause further problems. Needs testing.
This commit is contained in:
parent
7893de3203
commit
04177debd0
|
@ -9,7 +9,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
|
|||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||
from ..gold cimport GoldParse
|
||||
from ..gold cimport GoldParseC
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
|
||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||
from ..lexeme cimport Lexeme
|
||||
|
||||
from libc.stdint cimport uint32_t
|
||||
|
@ -238,9 +238,6 @@ cdef class Break:
|
|||
elif (st.S(0) + 1) != st.B(0):
|
||||
# Must break at the token boundary
|
||||
return False
|
||||
# Don't allow spaces to be the first word of a sentence
|
||||
elif Lexeme.c_check_flag(st.B_(0).lex, IS_SPACE):
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
|
|
@ -2,6 +2,9 @@ from libc.string cimport memcpy, memset
|
|||
from libc.stdint cimport uint32_t
|
||||
from ..vocab cimport EMPTY_LEXEME
|
||||
from ..structs cimport Entity
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..symbols cimport punct
|
||||
from ..attrs cimport IS_SPACE
|
||||
|
||||
|
||||
cdef class StateClass:
|
||||
|
@ -119,7 +122,9 @@ cdef class StateClass:
|
|||
self.shifted[self.B(0)] = True
|
||||
|
||||
cdef void fast_forward(self) nogil:
|
||||
while self.buffer_length() == 0 or self.stack_depth() == 0:
|
||||
while self.buffer_length() == 0 \
|
||||
or self.stack_depth() == 0 \
|
||||
or Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
|
||||
if self.buffer_length() == 1 and self.stack_depth() == 0:
|
||||
self.push()
|
||||
self.pop()
|
||||
|
@ -132,6 +137,9 @@ cdef class StateClass:
|
|||
self.unshift()
|
||||
elif (self.length - self._b_i) >= 1 and self.stack_depth() == 0:
|
||||
self.push()
|
||||
elif Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
|
||||
self.add_arc(self.B(0), self.S(0), 0)
|
||||
self.pop()
|
||||
else:
|
||||
break
|
||||
|
||||
|
|
|
@ -15,13 +15,9 @@ def test_space_attachment(EN):
|
|||
assert not sent[-1].is_space
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_sentence_space(EN):
|
||||
text = ('''I look forward to using Thingamajig. I've been told it will '''
|
||||
'''make my life easier...''')
|
||||
doc = EN(text)
|
||||
doc.from_array([HEAD], numpy.asarray([[1, 0, -1, -2, -1, -1, -5,
|
||||
4, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]],
|
||||
dtype='int32').T)
|
||||
assert len(list(doc.sents)) == 2
|
||||
|
||||
|
|
Loading…
Reference in New Issue