* Switch to returning a Tokens object

Matthew Honnibal 2014-09-11 21:37:32 +02:00
parent 1a3222af4b
commit 563047e90f
3 changed files with 13 additions and 9 deletions

View File

@@ -30,7 +30,7 @@ cdef class Language:
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
-    cpdef list tokenize(self, unicode text)
+    cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
     cdef _tokenize(self, Tokens tokens, unicode string)

View File

@@ -62,7 +62,7 @@ cdef class Language:
         """
         return self.lexicon.lookup(string)
 
-    cpdef list tokenize(self, unicode string):
+    cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.
 
         The tokenization rules are defined in two places:
@@ -78,12 +78,12 @@ cdef class Language:
         tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
         cdef size_t length = len(string)
+        cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
-            return []
+            return tokens
         cdef size_t start = 0
         cdef size_t i = 0
-        cdef Tokens tokens = self.tokens_class()
         for c in string:
             if c == ' ':
                 if start < i:
@@ -92,11 +92,7 @@ cdef class Language:
             i += 1
         if start < i:
             self._tokenize(tokens, string[start:i])
-        assert tokens
-        output = []
-        for i in range(tokens.length):
-            output.append(Lexeme(<size_t>tokens.lexemes[i]))
-        return output
+        return tokens
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef list lexemes

View File

@@ -30,6 +30,14 @@ cdef class Tokens:
         self.size = size
         self.length = 0
 
+    def __getitem__(self, i):
+        if i >= self.length:
+            raise IndexError
+        return Lexeme(<size_t>self.lexemes[i])
+
+    def __len__(self):
+        return self.length
+
     def append(self, Lexeme lexeme):
         self.push_back(lexeme._c)
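
With this change, tokenize() hands back the Tokens object directly instead of copying each LexID into a Python list, and the new __getitem__ and __len__ methods let that object be used like a sequence of Lexeme instances. A minimal usage sketch, assuming a constructed Language instance bound to the hypothetical name nlp (the variable name and input string are illustrative, not part of this commit):

    # Sketch only: `nlp` stands in for a constructed Language instance.
    tokens = nlp.tokenize(u'Hello world')  # now a Tokens object, not a list of Lexemes
    print(len(tokens))                     # __len__ added in this commit
    first = tokens[0]                      # __getitem__ wraps the stored LexID in a Lexeme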