From 8cbcc3a799070217554823f6997dbeb8109c224e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 16 Jan 2016 15:38:50 +0100 Subject: [PATCH] * Fix calculation of root token in Span. Now take root to be word with shortest tree path. Avoids parse trees ending up in inconsistent state, as had occurred in Issue #214. --- spacy/tokens/span.pyx | 77 +++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 1e72fb8c9..4a47e6c42 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -125,8 +125,10 @@ cdef class Span: return u''.join([t.text_with_ws for t in self]) property root: - """The first ancestor of the first word of the span that has its head - outside the span. + """The word of the span that is highest in the parse tree, i.e. has the + shortest path to the root of the sentence (or is the root itself). + + If multiple words are equally high in the tree, the first word is taken. For example: @@ -149,45 +151,37 @@ cdef class Span: >>> new_york.root.orth_ 'York' - When there are multiple words with external dependencies, we take the first: + Here's a more complicated case, raise by Issue #214 - >>> toks[autumn].head.orth_, toks[dot].head.orth_ - ('in', like') - >>> autumn_dot = toks[autumn:] - >>> autumn_dot.root.orth_ - 'Autumn' + >>> toks = nlp(u'to, north and south carolina') + >>> to, north, and_, south, carolina = toks + >>> south.head.text, carolina.head.text + ('north', 'to') + + Here 'south' is a child of 'north', which is a child of 'carolina'. + Carolina is the root of the span: + + >>> south_carolina = toks[-2:] + >>> south_carolina.root.text + 'carolina' """ def __get__(self): self._recalculate_indices() # This should probably be called 'head', and the other one called # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/ - cdef const TokenC* start = &self.doc.c[self.start] - cdef const TokenC* end = &self.doc.c[self.end] - head = start - cdef int nr_iter = 0 - while start <= (head + head.head) < end and head.head != 0: - head += head.head - # Guard against infinite loops - if nr_iter >= (self.doc.length+1): - # Retrieve the words without getting the Python tokens, to - # avoid potential problems - try: - words = [self.doc.vocab.strings[self.doc.c[i].lex.orth] for i - in range(self.doc.length)] - except: - words = '' - try: - heads = [self.doc.c[i].head for i in range(self.doc.length)] - except: - heads = '' - raise RuntimeError( - "Invalid dependency parse, leading to potentially infinite loop. " + - "Please report this error on the issue tracker.\n" + - ("Words: %s\n" % repr(words)) + - ("Heads: %s\n" % repr(heads))) - nr_iter += 1 - return self.doc[head - self.doc.c] - + cdef int i + cdef int current_best = _count_words_to_root(&self.doc.c[self.start], + self.doc.length) + cdef int root = self.start + for i in range(self.start, self.end): + if current_best == 0: + break + words_to_root = _count_words_to_root(&self.doc.c[i], self.doc.length) + if words_to_root < current_best: + current_best = words_to_root + root = i + return self.doc[root] + property lefts: """Tokens that are to the left of the Span, whose head is within the Span.""" def __get__(self): @@ -228,3 +222,16 @@ cdef class Span: def __get__(self): return self.doc.vocab.strings[self.label] + +cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: + cdef int n = 0 + while token.head != 0: + token += token.head + n += 1 + if n >= sent_length: + raise RuntimeError( + "Array bounds exceeded while searching for root word. This likely " + "means the parse tree is in an invalid state. Please report this " + "issue here: http://github.com/honnibal/spaCy/") + token += token.head + return n