* Switch to returning a Tokens object

Matthew Honnibal 2014-09-11 21:37:32 +02:00
parent 1a3222af4b
commit 563047e90f
3 changed files with 13 additions and 9 deletions
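
In effect, Language.tokenize no longer copies its internal buffer into a plain Python list of Lexeme objects before returning; it returns the Tokens container directly, which gains __getitem__ and __len__ so callers can keep treating the result as a sequence. A minimal before/after sketch of caller code (lang is a stand-in for a Language instance, not a name from this commit):

    # Sketch only: `lang` stands in for a Language instance; it is not
    # a name introduced by this commit.
    tokens = lang.tokenize(u'Hello world')

    # Before this commit: a plain list of Lexeme objects.
    # After: a Tokens object that still behaves like a sequence.
    n = len(tokens)      # via the new __len__
    first = tokens[0]    # via the new __getitem__, a Lexeme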

View File

@@ -30,7 +30,7 @@ cdef class Language:
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
-    cpdef list tokenize(self, unicode text)
+    cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
     cdef _tokenize(self, Tokens tokens, unicode string)

View File

@@ -62,7 +62,7 @@ cdef class Language:
         """
         return self.lexicon.lookup(string)
 
-    cpdef list tokenize(self, unicode string):
+    cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
 
         The tokenization rules are defined in two places:
@@ -78,12 +78,12 @@ cdef class Language:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
         cdef size_t length = len(string)
+        cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
-            return []
+            return tokens
 
         cdef size_t start = 0
         cdef size_t i = 0
-        cdef Tokens tokens = self.tokens_class()
         for c in string:
             if c == ' ':
                 if start < i:
@@ -92,11 +92,7 @@ cdef class Language:
             i += 1
         if start < i:
             self._tokenize(tokens, string[start:i])
-        assert tokens
-        output = []
-        for i in range(tokens.length):
-            output.append(Lexeme(<size_t>tokens.lexemes[i]))
-        return output
+        return tokens
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef list lexemes
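
For context, the surrounding loop (unchanged here) is a whitespace pre-tokenizer: each space-delimited span of the input is handed to _tokenize, which applies the special-case rules and punctuation splitting, and the filled Tokens buffer is now returned as-is instead of being copied into a list of Lexemes. A pure-Python sketch of that control flow, with tokens and _tokenize as stand-ins for the Cython objects defined elsewhere in this file:

    def tokenize(string, tokens, _tokenize):
        # Pure-Python sketch of the Cython method above; `tokens` and
        # `_tokenize` stand in for the Tokens object and the special-case
        # splitter -- this is not a drop-in replacement for the real API.
        if len(string) == 0:
            return tokens
        start = 0
        i = 0
        for c in string:
            if c == ' ':
                if start < i:      # non-empty span before the space
                    _tokenize(tokens, string[start:i])
                start = i + 1      # skip past the space
            i += 1
        if start < i:              # trailing span after the last space
            _tokenize(tokens, string[start:i])
        return tokens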

View File

@@ -30,6 +30,14 @@ cdef class Tokens:
         self.size = size
         self.length = 0
 
+    def __getitem__(self, i):
+        if i >= self.length:
+            raise IndexError
+        return Lexeme(<size_t>self.lexemes[i])
+
+    def __len__(self):
+        return self.length
+
     def append(self, Lexeme lexeme):
         self.push_back(lexeme._c)
 
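
With __getitem__ and __len__ in place, Tokens quacks like a read-only sequence: indexing wraps the stored C lexeme pointer in a Lexeme object on access, and because __getitem__ raises IndexError past the end, a plain for-loop over a Tokens object also works via Python's legacy sequence-iteration protocol. A usage sketch, again assuming the hypothetical lang tokenizer instance from above:

    tokens = lang.tokenize(u'The quick brown fox')

    assert len(tokens) == 4    # 4 space-delimited tokens, assuming no further splits
    lexeme = tokens[0]         # a Lexeme, wrapped on access

    # IndexError from __getitem__ is enough to terminate iteration:
    for lexeme in tokens:
        pass

Note that the new bounds check only guards the upper end, so negative indices are not range-checked here.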