From 495e1c7366d30f79ef3332d05c94dbf14abd909d Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 9 Dec 2014 16:50:01 +1100
Subject: [PATCH] * Use fused type in Tokens.push_back, simplifying the use of the cache

---
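Notes (not part of the commit): push_back now takes a Cython fused type, so a
single method accepts both a `const Lexeme*` (from the lexeme cache) and a
`TokenC*` (from the specials cache), and the caller no longer copies pos,
morph, lemma and sense by hand. A fused parameter makes Cython compile one
specialization of the function per member type, and a test such as
`if LexemeOrToken is TokenC_ptr` is resolved at compile time. Below is a
minimal sketch of that mechanism; the names (int_ptr, double_ptr,
first_as_double) are invented for illustration and are not spaCy's:

    ctypedef int* int_ptr
    ctypedef double* double_ptr

    ctypedef fused IntOrDoublePtr:
        int_ptr
        double_ptr

    cdef double first_as_double(IntOrDoublePtr p):
        # Cython emits one version of this function per member type; the
        # `is` test below is decided while each version is compiled, so no
        # branch survives at runtime.
        if IntOrDoublePtr is int_ptr:
            return <double>p[0]
        else:
            return p[0]

The ctypedef'd pointer names mirror the patch's const_Lexeme_ptr and
TokenC_ptr, which appear to sidestep declaring a const-qualified pointer
type directly inside the fused block.
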
 spacy/lang.pxd   | 11 +++++++++++
 spacy/lang.pyx   | 39 +++++++++++++++++++++------------------
 spacy/tokens.pxd | 10 +++++++++-
 spacy/tokens.pyx | 12 ++++++------
 4 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 0307e12fe..8a6aa5f97 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -13,6 +13,17 @@ from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
 
 
+cdef union LexemesOrTokens:
+    const Lexeme* const* lexemes
+    TokenC* tokens
+
+
+cdef struct Cached:
+    LexemesOrTokens data
+    bint is_lex
+    int length
+
+
 cdef class Lexicon:
     cpdef public get_lex_props
     cdef Pool mem
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index cdae8644a..044bfb7bc 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -137,21 +137,19 @@ cdef class Language:
 
     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
         cdef int i
-        specials = <TokenC*>self._specials.get(key)
-        if specials != NULL:
-            i = 0
-            while specials[i].lex != NULL:
-                tokens.push_back(idx, specials[i].lex)
-                tokens.data[tokens.length - 1].pos = specials[i].pos
-                tokens.data[tokens.length - 1].morph = specials[i].morph
-                tokens.data[tokens.length - 1].lemma = specials[i].lemma
-                tokens.data[tokens.length - 1].sense = specials[i].sense
-                i += 1
+        cdef TokenC* token
+        cached = <Cached*>self._specials.get(key)
+        if cached != NULL:
+            assert not cached.is_lex
+            for i in range(cached.length):
+                token = &cached.data.tokens[i]
+                idx = tokens.push_back(idx, token)
             return True
         else:
-            cached = <const Lexeme* const*>self._cache.get(key)
+            cached = <Cached*>self._cache.get(key)
             if cached != NULL:
-                tokens.extend(i, cached, 0)
+                assert cached.is_lex == True
+                tokens.extend(i, cached.data.lexemes, cached.length)
                 return True
             else:
                 return False
@@ -244,11 +242,14 @@ cdef class Language:
         for i in range(n):
            if tokens[i].lex.id == 1:
                 return 0
-        lexemes = <const Lexeme**>self.mem.alloc(n + 1, sizeof(Lexeme**))
+        cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
+        cached.length = n
+        cached.is_lex = True
+        lexemes = <const Lexeme**>self.mem.alloc(n, sizeof(Lexeme**))
         for i in range(n):
             lexemes[i] = tokens[i].lex
-        lexemes[i + 1] = NULL
-        self._cache.set(key, lexemes)
+        cached.data.lexemes = lexemes
+        self._cache.set(key, cached)
 
     cdef int _find_infix(self, Py_UNICODE* chars, size_t length) except -1:
         cdef unicode string = chars[:length]
@@ -287,10 +288,12 @@ cdef class Language:
             if lemma:
                 tokens[i].lemma = self.lexicon.strings[lemma]
             set_morph_from_dict(&tokens[i].morph, props)
-        # Null-terminated array
-        tokens[i+1].lex = NULL
+        cached = <Cached*>self.mem.alloc(1, sizeof(Cached))
+        cached.length = len(substrings)
+        cached.is_lex = False
+        cached.data.tokens = tokens
         slice_unicode(&string, chunk, 0, len(chunk))
-        self._specials.set(string.key, tokens)
+        self._specials.set(string.key, cached)
 
 
 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index f3d6011ec..01bec6815 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -30,6 +30,14 @@ cdef struct TokenC:
     int sense
 
 
+ctypedef const Lexeme* const_Lexeme_ptr
+ctypedef TokenC* TokenC_ptr
+
+ctypedef fused LexemeOrToken:
+    const_Lexeme_ptr
+    TokenC_ptr
+
+
 cdef class Tokens:
     cdef Pool mem
     cdef StringStore _string_store
@@ -40,8 +48,8 @@ cdef class Tokens:
     cdef int length
     cdef int max_length
 
     cdef int extend(self, int i, const Lexeme* const* lexemes, int n) except -1
-    cdef int push_back(self, int i, const Lexeme* lexeme) except -1
+    cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
     cpdef int set_tag(self, int i, int tag_type, int tag) except -1
 
     cpdef np.ndarray[long, ndim=2] get_array(self, list features)
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 004d0578c..4075e64d7 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -60,16 +60,16 @@ cdef class Tokens:
     def __len__(self):
         return self.length
 
-    cdef int push_back(self, int idx, const Lexeme* lexeme) except -1:
+    cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
         if self.length == self.max_length:
             self._realloc(self.length * 2)
         cdef TokenC* t = &self.data[self.length]
-        t.lex = lexeme
-        t.idx = idx
-        t.pos = 0
-        t.sense = 0
+        if LexemeOrToken is TokenC_ptr:
+            t[0] = lex_or_tok[0]
+        else:
+            t.lex = lex_or_tok
         self.length += 1
-        return idx + lexeme.length
+        return idx + t.lex.length
 
     cdef int extend(self, int idx, const Lexeme* const* lexemes, int n) except -1:
         cdef int i
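
Note (not part of the commit): the new Cached struct is a tagged union. The
is_lex flag records which member of LexemesOrTokens is live, and the explicit
length field replaces the old NULL-terminator convention, so no sentinel slot
has to be allocated and zero-length entries are representable. A
self-contained sketch of the same pattern, using invented names (IntsOrChars,
Entry, make_int_entry, read_entry) rather than spaCy's:

    from cymem.cymem cimport Pool

    cdef union IntsOrChars:
        int* ints
        char* chars

    cdef struct Entry:
        IntsOrChars data
        bint is_ints
        int length

    cdef Entry* make_int_entry(Pool mem, int n) except NULL:
        # Allocate through the pool so the entry is freed together with its
        # owner, much as Language.mem owns the cache entries in the patch.
        cdef Entry* e = <Entry*>mem.alloc(1, sizeof(Entry))
        e.is_ints = True
        e.length = n
        e.data.ints = <int*>mem.alloc(n, sizeof(int))
        return e

    cdef int read_entry(const Entry* e) except -1:
        # Check the tag before touching the union, as _try_cache does with
        # cached.is_lex.
        cdef int i, total = 0
        if e.is_ints:
            for i in range(e.length):
                total += e.data.ints[i]
        return total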