From 5fd72bc220ee35d193892b2b27f4603a472798bf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 24 Jan 2015 07:32:38 +1100 Subject: [PATCH] * Have 'string' refer to the whitespace-padded string --- spacy/tokens.pxd | 1 + spacy/tokens.pyx | 23 +++++++---------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 25263db29..873819706 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -65,3 +65,4 @@ cdef class Token: cdef readonly attr_t dep cdef readonly ndarray repvec + cdef readonly unicode string diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 19922cf4c..dbc76ba71 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -235,16 +235,10 @@ cdef class Token: self.tag = t.tag self.dep = t.dep self.repvec = numpy.asarray( t.lex.repvec) - - def __unicode__(self): - cdef const TokenC* t = &self._seq.data[self.i] - cdef int end_idx = t.idx + t.lex.length - if self.i + 1 == self._seq.length: - return self.string - if end_idx == t[1].idx: - return self.string - else: - return self.string + ' ' + cdef int next_idx = (t+1).idx + if next_idx <= self.idx: + next_idx = self.idx + self.length + self.string = tokens._string[self.idx:next_idx] def __len__(self): """The number of unicode code-points in the original string. @@ -260,13 +254,10 @@ cdef class Token: cdef const TokenC* t = &self._seq.data[self.i] return Token(self._seq, self.i + t.head) - property string: + property whitespace: def __get__(self): - cdef const TokenC* t = &self._seq.data[self.i] - if t.lex.orth == 0: - return '' - cdef unicode py_ustr = self._seq.vocab.strings[t.lex.orth] - return py_ustr + cdef int end_idx = self.idx + self.length + property orth_: def __get__(self):