mirror of https://github.com/explosion/spaCy.git
* Disprefer punctuation and spaces as heads of spans
This commit is contained in:
parent
bed36ab0ff
commit
334c4b2b57
|
@@ -12,6 +12,8 @@ from ..attrs cimport attr_id_t
|
||||||
from ..parts_of_speech cimport univ_pos_t
|
from ..parts_of_speech cimport univ_pos_t
|
||||||
from ..util import normalize_slice
|
from ..util import normalize_slice
|
||||||
from .doc cimport token_by_start, token_by_end
|
from .doc cimport token_by_start, token_by_end
|
||||||
|
from ..attrs cimport IS_PUNCT, IS_SPACE
|
||||||
|
from ..lexeme cimport Lexeme
|
||||||
|
|
||||||
|
|
||||||
cdef class Span:
|
cdef class Span:
|
||||||
|
@@ -183,12 +185,17 @@ cdef class Span:
|
||||||
# For each word, we count the path length, and arg min this measure.
|
# For each word, we count the path length, and arg min this measure.
|
||||||
# We could use better tree logic to save steps here...But I think this
|
# We could use better tree logic to save steps here...But I think this
|
||||||
# should be okay.
|
# should be okay.
|
||||||
cdef int current_best = _count_words_to_root(&self.doc.c[self.start],
|
cdef int current_best = self.doc.length
|
||||||
self.doc.length)
|
cdef int root = -1
|
||||||
cdef int root = self.start
|
|
||||||
for i in range(self.start, self.end):
|
for i in range(self.start, self.end):
|
||||||
if self.start <= (i+self.doc.c[i].head) < self.end:
|
if self.start <= (i+self.doc.c[i].head) < self.end:
|
||||||
continue
|
continue
|
||||||
|
# Don't allow punctuation or spaces to be the root, if there are
|
||||||
|
# better candidates
|
||||||
|
if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_PUNCT):
|
||||||
|
continue
|
||||||
|
if root != -1 and Lexeme.c_check_flag(self.doc.c[i].lex, IS_SPACE):
|
||||||
|
continue
|
||||||
words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
|
words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
|
||||||
if words_to_root < current_best:
|
if words_to_root < current_best:
|
||||||
current_best = words_to_root
|
current_best = words_to_root
|
||||||
|
|
Loading…
Reference in New Issue