From e68a431e5ec85882795518838580dfb312934607 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 15 Sep 2014 04:01:38 +0200
Subject: [PATCH] * Pass only the tokens vector to _tokenize, instead of the
 whole python object.

---
 spacy/lang.pxd |  2 +-
 spacy/lang.pyx | 20 ++++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 3ea29e53d..4ccc0f078 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -42,5 +42,5 @@ cdef class Language:
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, String* string)
+    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 894d9a3c4..b8fd5368d 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -182,25 +182,25 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     string_from_slice(&span, chars, start, i)
-                    self._tokenize(tokens, &span)
+                    self._tokenize(tokens.v, &span)
                 start = i + 1
             i += 1
         if start < i:
             string_from_slice(&span, chars, start, i)
-            self._tokenize(tokens, &span)
+            self._tokenize(tokens.v, &span)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, String* string):
+    cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string):
         cdef LexemeC** lexemes = <LexemeC**>self.cache.get(string.key)
         cdef size_t i
         if lexemes != NULL:
             i = 0
             while lexemes[i] != NULL:
-                tokens.v.push_back(lexemes[i])
+                tokens_v.push_back(lexemes[i])
                 i += 1
             return 0
         cdef uint64_t key = string.key
-        cdef size_t first_token = len(tokens)
+        cdef size_t first_token = tokens_v.size()
         cdef int split
         cdef int remaining = string.n
         cdef String prefix
@@ -212,14 +212,14 @@ cdef class Language:
             if lexemes != NULL:
                 i = 0
                 while lexemes[i] != NULL:
-                    tokens.v.push_back(lexemes[i])
+                    tokens_v.push_back(lexemes[i])
                     i += 1
             else:
-                tokens.v.push_back(self.lexicon.get(&prefix))
-        lexemes = <LexemeC**>calloc(len(tokens) - first_token, sizeof(LexemeC*))
+                tokens_v.push_back(self.lexicon.get(&prefix))
+        lexemes = <LexemeC**>calloc(tokens_v.size() - first_token, sizeof(LexemeC*))
         cdef size_t j
-        for i, j in enumerate(range(first_token, tokens.v.size())):
-            lexemes[i] = tokens.v[0][j]
+        for i, j in enumerate(range(first_token, tokens_v.size())):
+            lexemes[i] = tokens_v[0][j]
         self.cache.set(key, lexemes)
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
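
The pattern in a minimal, self-contained Cython sketch (Tokens and LexemeC
are simplified stand-ins and append_token is a hypothetical helper; only the
calling convention mirrors the real _tokenize): a cdef function that receives
the vector pointer works on the C++ container directly, so the hot loop does
no attribute lookups on the Tokens wrapper and size checks go through
.size() rather than the Python-level len(tokens).

    # distutils: language = c++
    # Sketch of the calling convention adopted by this patch; the struct and
    # class bodies are simplified stand-ins, not the real definitions.
    from libcpp.vector cimport vector

    cdef struct LexemeC:
        int length

    cdef class Tokens:
        # Tokens owns its lexemes as a heap-allocated C++ vector, reachable
        # from Cython code as tokens.v
        cdef vector[LexemeC*] *v

        def __cinit__(self):
            self.v = new vector[LexemeC*]()

        def __dealloc__(self):
            del self.v

    cdef int append_token(vector[LexemeC*] *tokens_v, LexemeC* lex):
        # Only the raw vector pointer crosses the call boundary: no per-call
        # Python attribute access (tokens.v) and no len(tokens) in the hot
        # path. Cython auto-dereferences the pointer for push_back.
        tokens_v.push_back(lex)
        return 0

The caller pays the tokens.v attribute load once per call, as tokenize() now
does with self._tokenize(tokens.v, &span), and bookkeeping such as
first_token can read tokens_v.size() without touching the Python object.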