diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index d948dd5bb..9f8362b90 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -30,7 +30,7 @@ cdef class Language:
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
-    cpdef list tokenize(self, unicode text)
+    cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
     cdef _tokenize(self, Tokens tokens, unicode string)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 96ec6797c..a31c3925e 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -62,7 +62,7 @@ cdef class Language:
         """
         return self.lexicon.lookup(string)
 
-    cpdef list tokenize(self, unicode string):
+    cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
 
         The tokenization rules are defined in two places:
@@ -78,12 +78,12 @@ cdef class Language:
            tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
         cdef size_t length = len(string)
+        cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
-            return []
+            return tokens
 
         cdef size_t start = 0
         cdef size_t i = 0
-        cdef Tokens tokens = self.tokens_class()
         for c in string:
             if c == ' ':
                 if start < i:
@@ -92,11 +92,7 @@ cdef class Language:
             i += 1
         if start < i:
             self._tokenize(tokens, string[start:i])
-        assert tokens
-        output = []
-        for i in range(tokens.length):
-            output.append(Lexeme(tokens.lexemes[i]))
-        return output
+        return tokens
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef list lexemes
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 8495dbae0..75816bebe 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -30,6 +30,14 @@ cdef class Tokens:
         self.size = size
         self.length = 0
 
+    def __getitem__(self, i):
+        if i >= self.length:
+            raise IndexError
+        return Lexeme(self.lexemes[i])
+
+    def __len__(self):
+        return self.length
+
     def append(self, Lexeme lexeme):
         self.push_back(lexeme._c)
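
Taken together, these changes make tokenize() return the Tokens container directly, and give Tokens the __getitem__/__len__ sequence protocol, so callers keep list-like access without the old per-call copy into a Python list of Lexeme objects. A rough usage sketch follows; the concrete Language subclass (here called English) is an assumption for illustration and not confirmed by the diff:

    lang = English()                        # hypothetical Language subclass
    tokens = lang.tokenize(u'Hello world')  # now returns a Tokens object, not a list

    print(len(tokens))                      # __len__ reports the token count
    first = tokens[0]                       # __getitem__ wraps the C lexeme in a Lexeme
    for lexeme in tokens:                   # plain for-loops also work: Python calls
        pass                                # __getitem__ with 0, 1, ... until IndexError

Because __getitem__ raises IndexError once the index reaches self.length, ordinary iteration works through Python's legacy iteration protocol even though no __iter__ is defined, and the Lexeme wrappers are only constructed for the items a caller actually touches.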