From f39211b2b172273a6e7dbc69561ef03530a4d350 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 1 Aug 2014 07:27:21 +0100
Subject: [PATCH] * Add FixedTable for hashing

---
 spacy/_hashing.pxd | 11 +++++++++++
 spacy/_hashing.pyx | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 spacy/spacy.pxd    |  6 ++----
 spacy/spacy.pyx    | 25 ++++++-------------------
 4 files changed, 67 insertions(+), 23 deletions(-)
 create mode 100644 spacy/_hashing.pxd
 create mode 100644 spacy/_hashing.pyx

diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd
new file mode 100644
index 000000000..44733451d
--- /dev/null
+++ b/spacy/_hashing.pxd
@@ -0,0 +1,11 @@
+from libc.stdint cimport uint64_t
+
+
+cdef class FixedTable:
+    cdef size_t size
+    cdef uint64_t* keys
+    cdef size_t* values
+
+    cdef int insert(self, uint64_t key, size_t value) nogil
+    cdef size_t get(self, uint64_t key) nogil
+    cdef int erase(self, uint64_t key) nogil
diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
new file mode 100644
index 000000000..bd59e0469
--- /dev/null
+++ b/spacy/_hashing.pyx
@@ -0,0 +1,48 @@
+from libc.stdlib cimport calloc, free
+import cython
+
+
+cdef class FixedTable:
+    def __cinit__(self, const size_t size):
+        self.size = size
+        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
+        self.values = <size_t*>calloc(self.size, sizeof(size_t))
+
+    def __dealloc__(self):
+        free(self.keys)
+        free(self.values)
+
+    def __getitem__(self, uint64_t key):
+        return self.get(key)
+
+    def __setitem__(self, uint64_t key, size_t value):
+        self.insert(key, value)
+
+    def pop(self, uint64_t key):
+        self.erase(key)
+
+    def bucket(self, uint64_t key):
+        return _find(key, self.size)
+
+    cdef int insert(self, uint64_t key, size_t value) nogil:
+        cdef size_t bucket = _find(key, self.size)
+        self.keys[bucket] = key
+        self.values[bucket] = value
+
+    cdef size_t get(self, uint64_t key) nogil:
+        cdef size_t bucket = _find(key, self.size)
+        if self.keys[bucket] == key:
+            return self.values[bucket]
+        else:
+            return 0
+
+    cdef int erase(self, uint64_t key) nogil:
+        cdef size_t bucket = _find(key, self.size)
+        self.keys[bucket] = 0
+
+
+@cython.cdivision
+cdef inline size_t _find(uint64_t key, size_t size) nogil:
+    return key % size
+
+
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index 2c8b5e141..befcf9a2f 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -2,14 +2,12 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t
 
 from sparsehash.dense_hash_map cimport dense_hash_map
-from sparsehash.sparse_hash_map cimport sparse_hash_map
-
+from _hashing cimport FixedTable
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
 ctypedef dense_hash_map[StringHash, size_t] Vocab
-ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
 
 from spacy.lexeme cimport Lexeme
 from spacy.tokens cimport Tokens
@@ -27,7 +25,7 @@ from spacy.lexeme cimport Orthography
 
 cdef class Language:
     cdef object name
-    cdef SparseVocab* happax
+    cdef FixedTable happax
     cdef Vocab* vocab
     cdef Vocab* distri
    cdef Vocab* ortho
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 535f2ae55..5db6c26dd 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -55,18 +55,17 @@ def set_orth_flags(lex, length):
     return 0
 
 
-DEF MAX_HAPPAX = 1000000
+DEF MAX_HAPPAX = 1048576
 
 
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.happax = new SparseVocab()
+        self.happax = FixedTable(MAX_HAPPAX)
         self.vocab = new Vocab()
         self.ortho = new Vocab()
         self.distri = new Vocab()
-        self.happax[0].set_deleted_key(0)
         self.vocab[0].set_empty_key(0)
         self.distri[0].set_empty_key(0)
         self.ortho[0].set_empty_key(0)
@@ -108,7 +107,7 @@ cdef class Language:
 
     cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
         '''Hash unicode with MurmurHash64A'''
-        return mrmr.hash64(s, length * sizeof(Py_UNICODE), 0)
+        return mrmr.real_hash64(s, length * sizeof(Py_UNICODE), 0)
 
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
@@ -128,32 +127,20 @@ cdef class Language:
         cdef Lexeme* word_ptr = self.vocab[0][hashed]
         if word_ptr == NULL:
             # Now check words seen exactly once
-            word_ptr = self.happax[0][hashed]
+            word_ptr = self.happax.get(hashed)
             if word_ptr == NULL:
                 start = self.find_split(string, length) if start == -1 else start
                 word_ptr = self._add(hashed, string, start, length)
             else:
                 # Second time word seen, move to vocab
                 self.vocab[0][hashed] = word_ptr
-                self.happax[0].erase(hashed)
+                self.happax.erase(hashed)
         return word_ptr
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
         cdef size_t i
-        cdef sparse_hash_map[StringHash, size_t].iterator it
-        cdef pair[StringHash, size_t] last_elem
-        if self.happax[0].size() >= MAX_HAPPAX:
-            # Delete last element.
-            last_elem = deref(self.happax[0].end())
-            free(self.ortho[0][last_elem.first])
-            # TODO: Do this when we set distributions
-            #free(self.distri[0][last_elem.first])
-            free(last_elem.second)
-            self.happax[0].erase(last_elem.first)
-            self.ortho[0].erase(last_elem.first)
-            self.distri[0].erase(last_elem.first)
         word = self.init_lexeme(string, hashed, split, length)
-        self.happax[0][hashed] = word
+        self.happax.insert(hashed, word)
         self.bacov[hashed] = string
         return word
 
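
Note (illustrative, not part of the patch): FixedTable is a fixed-size array of (key, value) slots with no collision resolution. The bucket is just key % size, an insert into an occupied bucket overwrites whatever was there, and a missed lookup returns 0, which the callers in spacy.pyx treat as a NULL Lexeme pointer. The sketch below exercises the Python-level wrappers (__setitem__, __getitem__, bucket); it assumes the module built from spacy/_hashing.pyx is importable as spacy._hashing, and the keys and values are arbitrary.

    # Usage sketch only; assumes the compiled extension is importable
    # as spacy._hashing.
    from spacy._hashing import FixedTable

    table = FixedTable(1024)     # bucket count is fixed at construction
    table[7] = 42                # __setitem__ -> insert(); bucket = 7 % 1024
    assert table[7] == 42        # __getitem__ -> get()
    assert table[8] == 0         # miss: get() returns 0 (treated as NULL by callers)

    table[7 + 1024] = 99         # hashes to the same bucket as key 7...
    assert table.bucket(7) == table.bucket(7 + 1024)
    assert table[7 + 1024] == 99
    assert table[7] == 0         # ...so the earlier entry was silently overwritten

Because collisions overwrite in place, the table can never grow past its fixed size; that is what lets this commit drop the manual eviction loop that _add previously ran against the sparse_hash_map once MAX_HAPPAX entries had accumulated.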