* Disprefer punctuation and spaces as heads of spans

This commit is contained in:
Matthew Honnibal 2016-01-18 18:14:09 +01:00
parent bed36ab0ff
commit 334c4b2b57
1 changed file with 10 additions and 3 deletions

View File

@@ -12,6 +12,8 @@ from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
 from .doc cimport token_by_start, token_by_end
+from ..attrs cimport IS_PUNCT, IS_SPACE
+from ..lexeme cimport Lexeme
 cdef class Span:
@@ -183,12 +185,17 @@ cdef class Span:
 # For each word, we count the path length, and arg min this measure.
 # We could use better tree logic to save steps here...But I think this
 # should be okay.
-cdef int current_best = _count_words_to_root(&self.doc.c[self.start],
-                                             self.doc.length)
-cdef int root = self.start
+cdef int current_best = self.doc.length
+cdef int root = -1
 for i in range(self.start, self.end):
     if self.start <= (i+self.doc.c[i].head) < self.end:
         continue
+    # Don't allow punctuation or spaces to be the root, if there are
+    # better candidates
+    if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_PUNCT):
+        continue
+    if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_SPACE):
+        continue
     words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
     if words_to_root < current_best:
         current_best = words_to_root