From a27b23cc8f8ac7abf928e08e5d795dfd54626842 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 22 Jan 2015 22:24:44 +1100 Subject: [PATCH] * Have SBD return start/end indices --- spacy/tokens.pyx | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 209d5288f..a879b7359 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -92,15 +92,18 @@ cdef class Tokens: cdef attr_t period = self.vocab.strings['.'] cdef attr_t question = self.vocab.strings['?'] cdef attr_t exclamation = self.vocab.strings['!'] + spans = [] + start = None for i in range(self.length): - sent.push_back(self.data[i].idx, &self.data[i]) + if start is None: + start = i if self.data[i].lex.sic == period or self.data[i].lex.sic == exclamation or \ self.data[i].lex.sic == question: - sentences.append(sent) - sent = Tokens(self.vocab, self._string[self.data[i].idx:]) - if sent.length: - sentences.append(sent) - return sentences + spans.append((start, i+1)) + start = None + if start is not None: + spans.append((start, self.length)) + return spans def __getitem__(self, i): """Retrieve a token.