* Use fused type in Tokens.push_back, simplifying the use of the cache

This commit is contained in:
Matthew Honnibal 2014-12-09 16:50:01 +11:00
parent 516f0f1e14
commit 495e1c7366
4 changed files with 47 additions and 25 deletions

View File

@ -13,6 +13,17 @@ from .tagger cimport univ_tag_t
from .utf8string cimport StringStore, UniStr
# Union of the two pointer kinds a cache entry can carry: `lexemes` points at
# const Lexeme pointers, `tokens` points at TokenC structs. Only one member is
# meaningful at a time; the Cached struct records which one via `is_lex`.
cdef union LexemesOrTokens:
const Lexeme* const* lexemes
TokenC* tokens
# One cache entry: a LexemesOrTokens payload together with a tag saying which
# union member is in use and how many elements it holds.
cdef struct Cached:
LexemesOrTokens data
# True when `data.lexemes` is the active member, False when `data.tokens` is.
bint is_lex
# Number of elements reachable through the active pointer.
int length
cdef class Lexicon:
cpdef public get_lex_props
cdef Pool mem

View File

@ -137,21 +137,19 @@ cdef class Language:
# Try to satisfy a tokenization request from stored results. `key` is looked up
# first in `self._specials` and then in `self._cache`; on a hit the stored
# entries are appended to `tokens` and True is returned, otherwise False.
# NOTE(review): this span appears to be a flattened diff -- lines from before
# and after the change are interleaved (two lookups in `_specials`, two
# assignments to `cached` in the else-branch), so it is not compilable as shown.
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
cdef int i
# Lookup that walks a NULL-terminated TokenC array and copies the tag fields
# onto each token after pushing its lexeme.
specials = <TokenC*>self._specials.get(key)
if specials != NULL:
i = 0
while specials[i].lex != NULL:
tokens.push_back(idx, specials[i].lex)
tokens.data[tokens.length - 1].pos = specials[i].pos
tokens.data[tokens.length - 1].morph = specials[i].morph
tokens.data[tokens.length - 1].lemma = specials[i].lemma
tokens.data[tokens.length - 1].sense = specials[i].sense
i += 1
# Lookup that reads a Cached entry whose length is stored explicitly and hands
# each TokenC pointer to push_back, carrying the returned offset forward in idx.
cdef TokenC* token
cached = <Cached*>self._specials.get(key)
if cached != NULL:
# Entries in `_specials` are expected to hold tokens, not lexemes.
assert not cached.is_lex
for i in range(cached.length):
token = &cached.data.tokens[i]
idx = tokens.push_back(idx, token)
return True
else:
cached = <const Lexeme* const*>self._cache.get(key)
cached = <Cached*>self._cache.get(key)
if cached != NULL:
# NOTE(review): `i` is passed as the first argument to extend() below, but
# nothing assigns `i` on this path; `idx` looks like the intended value --
# confirm against Tokens.extend's first parameter.
tokens.extend(i, cached, 0)
# Entries in `_cache` are expected to hold lexemes.
assert cached.is_lex == True
tokens.extend(i, cached.data.lexemes, cached.length)
return True
else:
return False
@ -244,11 +242,14 @@ cdef class Language:
for i in range(n):
if tokens[i].lex.id == 1:
return 0
lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
cached.length = n
cached.is_lex = True
lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
for i in range(n):
lexemes[i] = tokens[i].lex
lexemes[i + 1] = NULL
self._cache.set(key, lexemes)
cached.data.lexemes = <const Lexeme* const*>lexemes
self._cache.set(key, cached)
cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
cdef unicode string = chars[:length]
@ -287,10 +288,12 @@ cdef class Language:
if lemma:
tokens[i].lemma = self.lexicon.strings[lemma]
set_morph_from_dict(&tokens[i].morph, props)
# Null-terminated array
tokens[i+1].lex = NULL
cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
cached.length = len(substrings)
cached.is_lex = False
cached.data.tokens = tokens
slice_unicode(&string, chunk, 0, len(chunk))
self._specials.set(string.key, tokens)
self._specials.set(string.key, cached)
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:

View File

@ -30,6 +30,14 @@ cdef struct TokenC:
int sense
# Single-identifier aliases for the two pointer types, which are the names
# listed as members of the fused type below.
ctypedef const Lexeme* const_Lexeme_ptr
ctypedef TokenC* TokenC_ptr
# Fused type that is either a const Lexeme pointer or a TokenC pointer; it is
# the type of the `lex_or_tok` argument of Tokens.push_back.
ctypedef fused LexemeOrToken:
const_Lexeme_ptr
TokenC_ptr
cdef class Tokens:
cdef Pool mem
cdef StringStore _string_store
@ -40,7 +48,7 @@ cdef class Tokens:
cdef int max_length
cdef int extend(self, int i, const Lexeme* const* lexemes, int n) except -1
cdef int push_back(self, int i, const Lexeme* lexeme) except -1
cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
cpdef int set_tag(self, int i, int tag_type, int tag) except -1
cpdef np.ndarray[long, ndim=2] get_array(self, list features)

View File

@ -60,16 +60,16 @@ cdef class Tokens:
# Report the value of the `length` attribute as the object's len().
def __len__(self):
return self.length
# Write one entry into self.data[self.length], doubling the buffer via
# _realloc first when length has reached max_length, then increment length and
# return idx plus the length of the stored lexeme.
# NOTE(review): this span appears to be a flattened diff -- it shows two
# signatures and two return statements (a `const Lexeme*` form and a fused
# `LexemeOrToken` form), so it is not compilable as shown.
cdef int push_back(self, int idx, const Lexeme* lexeme) except -1:
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
if self.length == self.max_length:
self._realloc(self.length * 2)
cdef TokenC* t = &self.data[self.length]
t.lex = lexeme
t.idx = idx
t.pos = 0
t.sense = 0
# Branch on the fused-type specialization: a TokenC pointer is copied in as a
# whole struct, a Lexeme pointer only fills in the `lex` field.
# NOTE(review): if the idx/pos/sense assignments above still precede this
# point, the whole-struct copy overwrites them for TokenC input -- confirm
# that is intended.
if LexemeOrToken is TokenC_ptr:
t[0] = lex_or_tok[0]
else:
t.lex = lex_or_tok
self.length += 1
return idx + lexeme.length
# Reads the length through the stored token, so it works for both input kinds.
return idx + t.lex.length
cdef int extend(self, int idx, const Lexeme* const* lexemes, int n) except -1:
cdef int i