From 45865be37e35bb7a63575f38f31486b0c033cb58 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 13 Sep 2014 17:02:06 +0200 Subject: [PATCH] * Switch hash interface, using void* instead of size_t, to avoid casts. --- spacy/_hashing.pyx | 19 +++++++++---------- spacy/lang.pyx | 39 ++++++++++++++++++++------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx index 2218fb1c5..250a963eb 100644 --- a/spacy/_hashing.pyx +++ b/spacy/_hashing.pyx @@ -18,20 +18,19 @@ cdef class PointerHash: def __getitem__(self, key_t key): assert key != 0 - cdef val_t value = self.lookup(key) - return value if value != 0 else None + cdef val_t value = self.get(key) + return value if value != NULL else None - def __setitem__(self, key_t key, val_t value): - assert key != 0 - assert value != 0 - self.insert(key, value) + def __setitem__(self, key_t key, size_t value): + assert key != 0 and value != 0 + self.set(key, value) - cdef val_t lookup(self, key_t key): + cdef val_t get(self, key_t key): cell = _find_cell(self.cells, self.size, key) self._last = cell return cell.value - cdef void insert(self, key_t key, val_t value) except *: + cdef void set(self, key_t key, val_t value) except *: cdef Cell* cell if self._last != NULL and key == self._last.key: cell = self._last @@ -60,8 +59,8 @@ cdef class PointerHash: cdef size_t slot for i in range(old_size): if old_cells[i].key != 0: - assert old_cells[i].value != 0, i - self.insert(old_cells[i].key, old_cells[i].value) + assert old_cells[i].value != NULL, i + self.set(old_cells[i].key, old_cells[i].value) free(old_cells) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 172a99de2..50f421005 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -109,7 +109,7 @@ cdef class Language: return tokens cdef int _tokenize(self, Tokens tokens, String* string): - cdef LexemeC** lexemes = self.cache.lookup(string.key) + cdef LexemeC** lexemes = self.cache.get(string.key) cdef size_t i if lexemes != NULL: i = 0 @@ -127,7 +127,7 @@ cdef class Language: split = self._split_one(string.chars, string.n) remaining -= split string_slice_prefix(string, &prefix, split) - lexemes = self.specials.lookup(prefix.key) + lexemes = self.specials.get(prefix.key) if lexemes != NULL: i = 0 while lexemes[i] != NULL: @@ -139,7 +139,7 @@ cdef class Language: cdef size_t j for i, j in enumerate(range(first_token, tokens.length)): lexemes[i] = tokens.lexemes[j] - self.cache.insert(key, lexemes) + self.cache.set(key, lexemes) cdef int _split_one(self, Py_UNICODE* characters, size_t length): return length @@ -166,8 +166,8 @@ cdef class Language: lexemes[i] = self.lexicon.get(&string) lexemes[i + 1] = NULL string_from_unicode(&string, uni_string) - self.specials[string.key] = lexemes - self.cache.insert(string.key, lexemes) + self.specials.set(string.key, lexemes) + self.cache.set(string.key, lexemes) cdef class Lexicon: @@ -177,26 +177,27 @@ cdef class Lexicon: self._string_features = string_features self._dict = PointerHash(2 ** 20) self.size = 0 - cdef Lexeme word - for string in words: - prob = probs.get(string, 0.0) - cluster = clusters.get(string, 0.0) - cases = case_stats.get(string, {}) - tags = tag_stats.get(string, {}) - views = [string_view(string, prob, cluster, cases, tags) + cdef String string + for uni_string in words: + prob = probs.get(uni_string, 0.0) + cluster = clusters.get(uni_string, 0.0) + cases = case_stats.get(uni_string, {}) + tags = tag_stats.get(uni_string, {}) + views = [string_view(uni_string, prob, cluster, cases, tags) for string_view in self._string_features] flags = set() for i, flag_feature in enumerate(self._flag_features): - if flag_feature(string, prob, cluster, cases, tags): + if flag_feature(uni_string, prob, cluster, cases, tags): flags.add(i) - lexeme = lexeme_init(string, prob, cluster, views, flags) - self._dict[string] = lexeme + lexeme = lexeme_init(uni_string, prob, cluster, views, flags) + string_from_unicode(&string, uni_string) + self._dict.set(string.key, lexeme) self.size += 1 cdef size_t get(self, String* string): - cdef size_t lex_addr = self._dict.lookup(string.key) - if lex_addr != 0: - return lex_addr + cdef LexemeC* lex_addr = self._dict.get(string.key) + if lex_addr != NULL: + return lex_addr cdef unicode uni_string = string.chars[:string.n] views = [string_view(uni_string, 0.0, 0, {}, {}) @@ -207,7 +208,7 @@ cdef class Lexicon: flags.add(i) cdef LexemeC* lexeme = lexeme_init(uni_string, 0, 0, views, flags) - self._dict.insert(string.key, lexeme) + self._dict.set(string.key, lexeme) self.size += 1 return lexeme