diff --git a/spacy/spans.pyx b/spacy/spans.pyx index 19a1f640c..052e977f2 100644 --- a/spacy/spans.pyx +++ b/spacy/spans.pyx @@ -5,6 +5,10 @@ from collections import defaultdict cdef class Span: """A slice from a Doc object.""" def __cinit__(self, Doc tokens, int start, int end, int label=0): + if start < 0: + start = tokens.length - start + if end < 0: + end = tokens.length - end self._seq = tokens self.start = start self.end = end @@ -37,20 +41,48 @@ cdef class Span: for i in range(self.start, self.end): yield self._seq[i] - property head: - """The highest Token in the dependency tree in the Span, or None if - the Span is not internally connected (i.e. there are multiple heads). + property root: + """The first ancestor of the first word of the span that has its head + outside the span. + + For example: + + >>> toks = nlp(u'I like New York in Autumn.') + + Let's name the indices --- easier than writing "toks[4]" etc. + + >>> i, like, new, york, in_, autumn, dot = range(len(toks)) + + The head of 'new' is 'York', and the head of 'York' is 'like' + + >>> toks[new].head.orth_ + 'York' + >>> toks[york].head.orth_ + 'like' + + Create a span for "New York". Its root is "York". + + >>> new_york = toks[new:york+1] + >>> new_york.root.orth_ + 'York' + + When there are multiple words with external dependencies, we take the first: + + >>> toks[autumn].head.orth_, toks[dot].head.orth_ + ('in', like') + >>> autumn_dot = toks[autumn:] + >>> autumn_dot.root.orth_ + 'Autumn' """ def __get__(self): - heads = [] - for token in self: - head_i = token.head.i - if token.head is token or head_i >= self.end or head_i < self.start: - heads.append(token) - if len(heads) != 1: - return None - else: - return heads[0] + # This should probably be called 'head', and the other one called + # 'gov'. But we went with 'head' elsehwhere, and now we're stuck =/ + cdef const TokenC* start = &self._seq.data[self.start] + cdef const TokenC* end = &self._seq.data[self.end] + head = start + while start <= (head + head.head) < end and head.head != 0: + head += head.head + return self[head - self._seq.data] property lefts: """Tokens that are to the left of the Span, whose head is within the Span.""" @@ -68,6 +100,14 @@ cdef class Span: if right.i >= self.end: yield right + property subtree: + def __get__(self): + for word in self.lefts: + yield from word.subtree + yield from self + for word in self.rights: + yield from word.subtree + property orth_: def __get__(self): return ''.join([t.string for t in self]).strip() @@ -84,10 +124,3 @@ cdef class Span: def __get__(self): return self._seq.vocab.strings[self.label] - property subtree: - def __get__(self): - for word in self.lefts: - yield from word.subtree - yield from self - for word in self.rights: - yield from word.subtree diff --git a/tests/spans/test_span.py b/tests/spans/test_span.py index 47867fc08..9ef5fb147 100644 --- a/tests/spans/test_span.py +++ b/tests/spans/test_span.py @@ -5,8 +5,8 @@ import pytest @pytest.fixture -def doc(en_nlp): - return en_nlp('This is a sentence. This is another sentence. And a third.') +def doc(EN): + return EN('This is a sentence. This is another sentence. And a third.') def test_sent_spans(doc): @@ -15,3 +15,11 @@ def test_sent_spans(doc): assert sents[0].end == 5 assert len(sents) == 3 assert sum(len(sent) for sent in sents) == len(doc) + + +def test_root(doc): + np = doc[2:4] + assert len(np) == 2 + assert np.orth_ == 'a sentence' + assert np.root.orth_ == 'sentence' + assert nlp.root.head.orth_ == 'is'