From 563047e90f326df93726d4df479bf9c83a78879e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Sep 2014 21:37:32 +0200 Subject: [PATCH] * Switch to returning a Tokens object --- spacy/lang.pxd | 2 +- spacy/lang.pyx | 12 ++++-------- spacy/tokens.pyx | 8 ++++++++ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/spacy/lang.pxd b/spacy/lang.pxd index d948dd5bb..9f8362b90 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -30,7 +30,7 @@ cdef class Language: cpdef readonly Lexicon lexicon cpdef readonly object tokens_class - cpdef list tokenize(self, unicode text) + cpdef Tokens tokenize(self, unicode text) cpdef Lexeme lookup(self, unicode text) cdef _tokenize(self, Tokens tokens, unicode string) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 96ec6797c..a31c3925e 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -62,7 +62,7 @@ cdef class Language: """ return self.lexicon.lookup(string) - cpdef list tokenize(self, unicode string): + cpdef Tokens tokenize(self, unicode string): """Tokenize a string. The tokenization rules are defined in two places: @@ -78,12 +78,12 @@ cdef class Language: tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs. """ cdef size_t length = len(string) + cdef Tokens tokens = self.tokens_class(length) if length == 0: - return [] + return tokens cdef size_t start = 0 cdef size_t i = 0 - cdef Tokens tokens = self.tokens_class() for c in string: if c == ' ': if start < i: @@ -92,11 +92,7 @@ cdef class Language: i += 1 if start < i: self._tokenize(tokens, string[start:i]) - assert tokens - output = [] - for i in range(tokens.length): - output.append(Lexeme(tokens.lexemes[i])) - return output + return tokens cdef _tokenize(self, Tokens tokens, unicode string): cdef list lexemes diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 8495dbae0..75816bebe 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -30,6 +30,14 @@ cdef class Tokens: self.size = size self.length = 0 + def __getitem__(self, i): + if i >= self.length: + raise IndexError + return Lexeme(self.lexemes[i]) + + def __len__(self): + return self.length + def append(self, Lexeme lexeme): self.push_back(lexeme._c)