mirror of https://github.com/explosion/spaCy.git
* Fix calculation of root token in Span. Now take root to be word with shortest tree path. Avoids parse trees ending up in inconsistent state, as had occurred in Issue #214.
This commit is contained in:
parent
c1039fa4b4
commit
8cbcc3a799
|
@ -125,8 +125,10 @@ cdef class Span:
|
||||||
return u''.join([t.text_with_ws for t in self])
|
return u''.join([t.text_with_ws for t in self])
|
||||||
|
|
||||||
property root:
|
property root:
|
||||||
"""The first ancestor of the first word of the span that has its head
|
"""The word of the span that is highest in the parse tree, i.e. has the
|
||||||
outside the span.
|
shortest path to the root of the sentence (or is the root itself).
|
||||||
|
|
||||||
|
If multiple words are equally high in the tree, the first word is taken.
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
|
|
||||||
|
@ -149,45 +151,37 @@ cdef class Span:
|
||||||
>>> new_york.root.orth_
|
>>> new_york.root.orth_
|
||||||
'York'
|
'York'
|
||||||
|
|
||||||
When there are multiple words with external dependencies, we take the first:
|
Here's a more complicated case, raised by Issue #214:
|
||||||
|
|
||||||
>>> toks[autumn].head.orth_, toks[dot].head.orth_
|
>>> toks = nlp(u'to, north and south carolina')
|
||||||
('in', like')
|
>>> to, north, and_, south, carolina = toks
|
||||||
>>> autumn_dot = toks[autumn:]
|
>>> south.head.text, carolina.head.text
|
||||||
>>> autumn_dot.root.orth_
|
('north', 'to')
|
||||||
'Autumn'
|
|
||||||
|
Here 'south' is a child of 'north', which is a child of 'carolina'.
|
||||||
|
Carolina is the root of the span:
|
||||||
|
|
||||||
|
>>> south_carolina = toks[-2:]
|
||||||
|
>>> south_carolina.root.text
|
||||||
|
'carolina'
|
||||||
"""
|
"""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
self._recalculate_indices()
|
self._recalculate_indices()
|
||||||
# This should probably be called 'head', and the other one called
|
# This should probably be called 'head', and the other one called
|
||||||
# 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
|
# 'gov'. But we went with 'head' elsewhere, and now we're stuck =/
|
||||||
cdef const TokenC* start = &self.doc.c[self.start]
|
cdef int i
|
||||||
cdef const TokenC* end = &self.doc.c[self.end]
|
cdef int current_best = _count_words_to_root(&self.doc.c[self.start],
|
||||||
head = start
|
self.doc.length)
|
||||||
cdef int nr_iter = 0
|
cdef int root = self.start
|
||||||
while start <= (head + head.head) < end and head.head != 0:
|
for i in range(self.start, self.end):
|
||||||
head += head.head
|
if current_best == 0:
|
||||||
# Guard against infinite loops
|
break
|
||||||
if nr_iter >= (self.doc.length+1):
|
words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length)
|
||||||
# Retrieve the words without getting the Python tokens, to
|
if words_to_root < current_best:
|
||||||
# avoid potential problems
|
current_best = words_to_root
|
||||||
try:
|
root = i
|
||||||
words = [self.doc.vocab.strings[self.doc.c[i].lex.orth] for i
|
return self.doc[root]
|
||||||
in range(self.doc.length)]
|
|
||||||
except:
|
|
||||||
words = '<Exception retrieving words!>'
|
|
||||||
try:
|
|
||||||
heads = [self.doc.c[i].head for i in range(self.doc.length)]
|
|
||||||
except:
|
|
||||||
heads = '<Exception retrieving heads!>'
|
|
||||||
raise RuntimeError(
|
|
||||||
"Invalid dependency parse, leading to potentially infinite loop. " +
|
|
||||||
"Please report this error on the issue tracker.\n" +
|
|
||||||
("Words: %s\n" % repr(words)) +
|
|
||||||
("Heads: %s\n" % repr(heads)))
|
|
||||||
nr_iter += 1
|
|
||||||
return self.doc[head - self.doc.c]
|
|
||||||
|
|
||||||
property lefts:
|
property lefts:
|
||||||
"""Tokens that are to the left of the Span, whose head is within the Span."""
|
"""Tokens that are to the left of the Span, whose head is within the Span."""
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
@ -228,3 +222,16 @@ cdef class Span:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self.doc.vocab.strings[self.label]
|
return self.doc.vocab.strings[self.label]
|
||||||
|
|
||||||
|
|
||||||
|
# Count how many head-links separate `token` from the root of its parse tree.
#
# token:       pointer to the token to start from. `token.head` is read as a
#              relative offset to the token's head; an offset of 0 is treated
#              as "this token is the root" and ends the walk.
# sent_length: upper bound on the number of steps. A well-formed tree over
#              `sent_length` tokens has no path longer than `sent_length - 1`,
#              so reaching the bound means the head offsets contain a cycle.
#
# Returns the number of steps taken (0 when `token` is itself the root).
# Raises RuntimeError when the bound is reached; the `except -1` clause lets
# that exception propagate out of this C-level function.
cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
    cdef int n = 0
    while token.head != 0:
        # Advance exactly one head-link per counted step. Stepping more than
        # once per increment of `n` would under-count the depth of deeper
        # tokens and distort the comparison between candidate roots.
        token += token.head
        n += 1
        if n >= sent_length:
            raise RuntimeError(
                "Array bounds exceeded while searching for root word. This likely "
                "means the parse tree is in an invalid state. Please report this "
                "issue here: http://github.com/honnibal/spaCy/")
    return n
|
||||||
|
|
Loading…
Reference in New Issue