From 1e6725e9b734862e61081a916baf440697b9971e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 7 Dec 2018 00:12:12 +0000
Subject: [PATCH] Try to prevent spaces from being tagged as entities

---
 spacy/syntax/ner.pyx | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index b473e76b0..3f6d96304 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -10,6 +10,8 @@ from ._state cimport StateC
 from .transition_system cimport Transition
 from .transition_system cimport do_func_t
 from ..gold cimport GoldParseC, GoldParse
+from ..lexeme cimport Lexeme
+from ..attrs cimport IS_SPACE
 
 from ..errors import Errors
 
@@ -273,6 +275,9 @@ cdef class Begin:
         # Don't allow entities to extend across sentence boundaries
         elif st.B_(1).sent_start == 1:
             return False
+        # Don't allow entities to start on whitespace
+        elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
+            return False
         else:
             return label != 0 and not st.entity_is_open()
 
@@ -366,6 +371,9 @@ cdef class Last:
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
         if st.B_(1).ent_iob == 1:
             return False
+        # Don't allow entities to end on whitespace
+        elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
+            return False
         return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
 
     @staticmethod
@@ -418,6 +426,8 @@ cdef class Unit:
             return False
         elif st.B_(1).ent_iob == 1:
             return False
+        elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
+            return False
         return label != 0 and not st.entity_is_open()
 
     @staticmethod