mirror of https://github.com/explosion/spaCy.git
* Fix calculation of root token in Span. Now take root to be word with shortest tree path. Avoids parse trees ending up in inconsistent state, as had occurred in Issue #214.
This commit is contained in:
parent
c1039fa4b4
commit
8cbcc3a799
|
@ -125,8 +125,10 @@ cdef class Span:
|
|||
return u''.join([t.text_with_ws for t in self])
|
||||
|
||||
property root:
|
||||
"""The first ancestor of the first word of the span that has its head
|
||||
outside the span.
|
||||
"""The word of the span that is highest in the parse tree, i.e. has the
|
||||
shortest path to the root of the sentence (or is the root itself).
|
||||
|
||||
If multiple words are equally high in the tree, the first word is taken.
|
||||
|
||||
For example:
|
||||
|
||||
|
@ -149,45 +151,37 @@ cdef class Span:
|
|||
>>> new_york.root.orth_
|
||||
'York'
|
||||
|
||||
When there are multiple words with external dependencies, we take the first:
|
||||
Here's a more complicated case, raise by Issue #214
|
||||
|
||||
>>> toks[autumn].head.orth_, toks[dot].head.orth_
|
||||
('in', like')
|
||||
>>> autumn_dot = toks[autumn:]
|
||||
>>> autumn_dot.root.orth_
|
||||
'Autumn'
|
||||
>>> toks = nlp(u'to, north and south carolina')
|
||||
>>> to, north, and_, south, carolina = toks
|
||||
>>> south.head.text, carolina.head.text
|
||||
('north', 'to')
|
||||
|
||||
Here 'south' is a child of 'north', which is a child of 'carolina'.
|
||||
Carolina is the root of the span:
|
||||
|
||||
>>> south_carolina = toks[-2:]
|
||||
>>> south_carolina.root.text
|
||||
'carolina'
|
||||
"""
|
||||
def __get__(self):
|
||||
self._recalculate_indices()
|
||||
# This should probably be called 'head', and the other one called
|
||||
# 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/
|
||||
cdef const TokenC* start = &self.doc.c[self.start]
|
||||
cdef const TokenC* end = &self.doc.c[self.end]
|
||||
head = start
|
||||
cdef int nr_iter = 0
|
||||
while start <= (head + head.head) < end and head.head != 0:
|
||||
head += head.head
|
||||
# Guard against infinite loops
|
||||
if nr_iter >= (self.doc.length+1):
|
||||
# Retrieve the words without getting the Python tokens, to
|
||||
# avoid potential problems
|
||||
try:
|
||||
words = [self.doc.vocab.strings[self.doc.c[i].lex.orth] for i
|
||||
in range(self.doc.length)]
|
||||
except:
|
||||
words = '<Exception retrieving words!>'
|
||||
try:
|
||||
heads = [self.doc.c[i].head for i in range(self.doc.length)]
|
||||
except:
|
||||
heads = '<Exception retrieving heads!>'
|
||||
raise RuntimeError(
|
||||
"Invalid dependency parse, leading to potentially infinite loop. " +
|
||||
"Please report this error on the issue tracker.\n" +
|
||||
("Words: %s\n" % repr(words)) +
|
||||
("Heads: %s\n" % repr(heads)))
|
||||
nr_iter += 1
|
||||
return self.doc[head - self.doc.c]
|
||||
|
||||
cdef int i
|
||||
cdef int current_best = _count_words_to_root(&self.doc.c[self.start],
|
||||
self.doc.length)
|
||||
cdef int root = self.start
|
||||
for i in range(self.start, self.end):
|
||||
if current_best == 0:
|
||||
break
|
||||
words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
|
||||
if words_to_root < current_best:
|
||||
current_best = words_to_root
|
||||
root = i
|
||||
return self.doc[root]
|
||||
|
||||
property lefts:
|
||||
"""Tokens that are to the left of the Span, whose head is within the Span."""
|
||||
def __get__(self):
|
||||
|
@ -228,3 +222,16 @@ cdef class Span:
|
|||
def __get__(self):
|
||||
return self.doc.vocab.strings[self.label]
|
||||
|
||||
|
||||
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
||||
cdef int n = 0
|
||||
while token.head != 0:
|
||||
token += token.head
|
||||
n += 1
|
||||
if n >= sent_length:
|
||||
raise RuntimeError(
|
||||
"Array bounds exceeded while searching for root word. This likely "
|
||||
"means the parse tree is in an invalid state. Please report this "
|
||||
"issue here: http://github.com/honnibal/spaCy/")
|
||||
token += token.head
|
||||
return n
|
||||
|
|
Loading…
Reference in New Issue