diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index b473e76b0..3f6d96304 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -10,6 +10,8 @@ from ._state cimport StateC from .transition_system cimport Transition from .transition_system cimport do_func_t from ..gold cimport GoldParseC, GoldParse +from ..lexeme cimport Lexeme +from ..attrs cimport IS_SPACE from ..errors import Errors @@ -273,6 +275,9 @@ cdef class Begin: # Don't allow entities to extend across sentence boundaries elif st.B_(1).sent_start == 1: return False + # Don't allow entities to start on whitespace + elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE): + return False else: return label != 0 and not st.entity_is_open() @@ -366,6 +371,9 @@ cdef class Last: cdef bint is_valid(const StateC* st, attr_t label) nogil: if st.B_(1).ent_iob == 1: return False + # Don't allow entities to end on whitespace + elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE): + return False return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod @@ -418,6 +426,8 @@ cdef class Unit: return False elif st.B_(1).ent_iob == 1: return False + elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE): + return False return label != 0 and not st.entity_is_open() @staticmethod