From 334c4b2b57c5b671bd8e5348680fbbfc6fda8a6b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 18 Jan 2016 18:14:09 +0100 Subject: [PATCH] * Disprefer punctuation and spaces as heads of spans --- spacy/tokens/span.pyx | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 455cd2045..e4e6ad582 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -12,6 +12,8 @@ from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t from ..util import normalize_slice from .doc cimport token_by_start, token_by_end +from ..attrs cimport IS_PUNCT, IS_SPACE +from ..lexeme cimport Lexeme cdef class Span: @@ -183,12 +185,17 @@ cdef class Span: # For each word, we count the path length, and arg min this measure. # We could use better tree logic to save steps here...But I think this # should be okay. - cdef int current_best = _count_words_to_root(&self.doc.c[self.start], - self.doc.length) - cdef int root = self.start + cdef int current_best = self.doc.length + cdef int root = -1 for i in range(self.start, self.end): if self.start <= (i+self.doc.c[i].head) < self.end: continue + # Don't allow punctuation or spaces to be the root, if there are + # better candidates + if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_PUNCT): + continue + if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_SPACE): + continue words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length) if words_to_root < current_best: current_best = words_to_root