mirror of https://github.com/explosion/spaCy.git
* Fix Issue #131: Force whitespace tokens to attach syntactically to the previous token (or to the following token when the whitespace is the first token), and ensure they cannot serve as stand-alone 'sentence' units.
This commit is contained in:
parent
8b39feefbe
commit
9dd2f25c74
|
@ -380,10 +380,17 @@ cdef class ArcEager(TransitionSystem):
|
|||
st.fast_forward()
|
||||
|
||||
cdef int finalize_state(self, StateClass st) nogil:
|
||||
cdef int i
|
||||
for i in range(st.length):
|
||||
# Always attach spaces to the previous word
|
||||
if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE):
|
||||
st._sent[i].head = -1 if (i >= 1) else 1
|
||||
if st._sent[i].sent_start and st._sent[i].head == -1:
|
||||
st._sent[i].sent_start = False
|
||||
# If we had this space token as the start of a sentence,
|
||||
# move that sentence start forward one
|
||||
if (i + 1) < st.length and not st._sent[i+1].sent_start:
|
||||
st._sent[i+1].sent_start = True
|
||||
elif st._sent[i].head == 0 and st._sent[i].dep == 0:
|
||||
st._sent[i].dep = self.root_label
|
||||
# If we're not using the Break transition, we segment via root-labelled
|
||||
|
|
Loading…
Reference in New Issue