mirror of https://github.com/explosion/spaCy.git
* Remove the restriction on sentence boundary detection that prevented it from inserting a boundary before a whitespace token. Replace it with a whitespace check in StateClass.fast_forward, so that a whitespace token on top of the stack is attached with a left-arc to the first buffer token and then popped. This should prevent the previous problem of whitespace-only sentences. Should fix Issue #184, but may cause further problems. Needs testing.
This commit is contained in:
parent
7893de3203
commit
04177debd0
|
@ -9,7 +9,7 @@ from .transition_system cimport do_func_t, get_cost_func_t
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
|
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
|
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
|
@ -238,9 +238,6 @@ cdef class Break:
|
||||||
elif (st.S(0) + 1) != st.B(0):
|
elif (st.S(0) + 1) != st.B(0):
|
||||||
# Must break at the token boundary
|
# Must break at the token boundary
|
||||||
return False
|
return False
|
||||||
# Don't allow spaces to be the first word of a sentence
|
|
||||||
elif Lexeme.c_check_flag(st.B_(0).lex, IS_SPACE):
|
|
||||||
return False
|
|
||||||
else:
|
else:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,9 @@ from libc.string cimport memcpy, memset
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from ..vocab cimport EMPTY_LEXEME
|
from ..vocab cimport EMPTY_LEXEME
|
||||||
from ..structs cimport Entity
|
from ..structs cimport Entity
|
||||||
|
from ..lexeme cimport Lexeme
|
||||||
|
from ..symbols cimport punct
|
||||||
|
from ..attrs cimport IS_SPACE
|
||||||
|
|
||||||
|
|
||||||
cdef class StateClass:
|
cdef class StateClass:
|
||||||
|
@ -119,7 +122,9 @@ cdef class StateClass:
|
||||||
self.shifted[self.B(0)] = True
|
self.shifted[self.B(0)] = True
|
||||||
|
|
||||||
cdef void fast_forward(self) nogil:
|
cdef void fast_forward(self) nogil:
|
||||||
while self.buffer_length() == 0 or self.stack_depth() == 0:
|
while self.buffer_length() == 0 \
|
||||||
|
or self.stack_depth() == 0 \
|
||||||
|
or Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
|
||||||
if self.buffer_length() == 1 and self.stack_depth() == 0:
|
if self.buffer_length() == 1 and self.stack_depth() == 0:
|
||||||
self.push()
|
self.push()
|
||||||
self.pop()
|
self.pop()
|
||||||
|
@ -132,6 +137,9 @@ cdef class StateClass:
|
||||||
self.unshift()
|
self.unshift()
|
||||||
elif (self.length - self._b_i) >= 1 and self.stack_depth() == 0:
|
elif (self.length - self._b_i) >= 1 and self.stack_depth() == 0:
|
||||||
self.push()
|
self.push()
|
||||||
|
elif Lexeme.c_check_flag(self.S_(0).lex, IS_SPACE):
|
||||||
|
self.add_arc(self.B(0), self.S(0), 0)
|
||||||
|
self.pop()
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
|
@ -15,13 +15,9 @@ def test_space_attachment(EN):
|
||||||
assert not sent[-1].is_space
|
assert not sent[-1].is_space
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
def test_sentence_space(EN):
|
def test_sentence_space(EN):
|
||||||
text = ('''I look forward to using Thingamajig. I've been told it will '''
|
text = ('''I look forward to using Thingamajig. I've been told it will '''
|
||||||
'''make my life easier...''')
|
'''make my life easier...''')
|
||||||
doc = EN(text)
|
doc = EN(text)
|
||||||
doc.from_array([HEAD], numpy.asarray([[1, 0, -1, -2, -1, -1, -5,
|
|
||||||
4, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7]],
|
|
||||||
dtype='int32').T)
|
|
||||||
assert len(list(doc.sents)) == 2
|
assert len(list(doc.sents)) == 2
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue