mirror of https://github.com/explosion/spaCy.git
* Add dependency post-process rule to ensure spaces are attached to neighbouring tokens, so that they can't be sentence boundaries
This commit is contained in:
parent
1521cf25c9
commit
8b39feefbe
|
@ -9,7 +9,8 @@ from .transition_system cimport do_func_t, get_cost_func_t
|
||||||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||||
from ..gold cimport GoldParse
|
from ..gold cimport GoldParse
|
||||||
from ..gold cimport GoldParseC
|
from ..gold cimport GoldParseC
|
||||||
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
|
||||||
|
from ..lexeme cimport Lexeme
|
||||||
|
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
|
@ -380,7 +381,10 @@ cdef class ArcEager(TransitionSystem):
|
||||||
|
|
||||||
cdef int finalize_state(self, StateClass st) nogil:
|
cdef int finalize_state(self, StateClass st) nogil:
|
||||||
for i in range(st.length):
|
for i in range(st.length):
|
||||||
if st._sent[i].head == 0 and st._sent[i].dep == 0:
|
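# Always attach spaces to the previous word; a space that is the first token has no predecessor, so it is attached to the following word instead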
|
||||||
|
if Lexeme.c_check_flag(st._sent[i].lex, IS_SPACE):
|
||||||
|
st._sent[i].head = -1 if (i >= 1) else 1
|
||||||
|
elif st._sent[i].head == 0 and st._sent[i].dep == 0:
|
||||||
st._sent[i].dep = self.root_label
|
st._sent[i].dep = self.root_label
|
||||||
# If we're not using the Break transition, we segment via root-labelled
|
# If we're not using the Break transition, we segment via root-labelled
|
||||||
# arcs between the root words.
|
# arcs between the root words.
|
||||||
|
|
Loading…
Reference in New Issue