* Add FixedTable for hashing

2014-08-01 07:27:21 +01:00 · 2014-08-01 07:27:21 +01:00 · f39211b2b1
parent a44e15f623
commit f39211b2b1
4 changed files with 67 additions and 23 deletions
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@ -0,0 +1,11 @@
+from libc.stdint cimport uint64_t
+
+
+cdef class FixedTable:
+    # Fixed-capacity table mapping uint64_t keys to size_t values. Each key
+    # maps to the single bucket `key % size`; inserting a colliding key
+    # overwrites whatever that bucket held before.
+
+    # Number of buckets; both arrays below are allocated with this length.
+    cdef size_t size
+    # Key currently stored in each bucket. Buckets start zeroed (calloc), so
+    # 0 acts as the "empty" marker.
+    cdef uint64_t* keys
+    # Value stored alongside the key in the same bucket.
+    cdef size_t* values
+
+    # Store `value` under `key` in the bucket the key maps to.
+    cdef int insert(self, uint64_t key, size_t value) nogil
+    # Return the value stored for `key`, or 0 when its bucket does not hold
+    # `key`.
+    cdef size_t get(self, uint64_t key) nogil
+    # Remove `key` from the bucket it maps to.
+    cdef int erase(self, uint64_t key) nogil
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@ -0,0 +1,48 @@
+from libc.stdlib cimport calloc, free
+import cython
+
+
+cdef class FixedTable:
+    """Fixed-size, direct-mapped hash table from uint64_t keys to size_t values.
+
+    Every key maps to exactly one bucket (``key % size``). There is no
+    probing or chaining: inserting a key whose bucket is already occupied
+    overwrites the previous occupant. The key 0 is reserved as the "empty
+    bucket" marker, and ``get`` reports a missing key by returning 0, so
+    neither 0 keys nor 0 values can be stored meaningfully.
+    """
+    def __cinit__(self, const size_t size):
+        # Bucket selection takes key % size, so a zero-sized table would
+        # divide by zero on the first lookup.
+        if size == 0:
+            raise ValueError("FixedTable size must be greater than zero")
+        self.size = size
+        # calloc zero-fills both arrays, so every bucket starts out empty.
+        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
+        self.values = <size_t*>calloc(self.size, sizeof(size_t))
+        # __dealloc__ still runs after a failed __cinit__; free(NULL) is a
+        # no-op, so raising here does not leak the array that did allocate.
+        if self.keys == NULL or self.values == NULL:
+            raise MemoryError()
+
+    def __dealloc__(self):
+        free(self.keys)
+        free(self.values)
+
+    def __getitem__(self, uint64_t key):
+        """Return the value stored for ``key``, or 0 if it is not present."""
+        return self.get(key)
+
+    def __setitem__(self, uint64_t key, size_t value):
+        """Store ``value`` under ``key``, evicting any colliding entry."""
+        self.insert(key, value)
+
+    def pop(self, uint64_t key):
+        """Remove ``key`` from the table if it is present."""
+        self.erase(key)
+
+    def bucket(self, uint64_t key):
+        """Return the index of the bucket that ``key`` maps to."""
+        return _find(key, self.size)
+
+    cdef int insert(self, uint64_t key, size_t value) nogil:
+        # Unconditionally claim the bucket; a previous occupant with a
+        # different key is overwritten.
+        cdef size_t bucket = _find(key, self.size)
+        self.keys[bucket] = key
+        self.values[bucket] = value
+        return 0
+
+    cdef size_t get(self, uint64_t key) nogil:
+        # Only a bucket holding exactly this key counts as a hit; anything
+        # else (empty bucket or a colliding key) is reported as 0.
+        cdef size_t bucket = _find(key, self.size)
+        if self.keys[bucket] == key:
+            return self.values[bucket]
+        else:
+            return 0
+
+    cdef int erase(self, uint64_t key) nogil:
+        cdef size_t bucket = _find(key, self.size)
+        # Leave the bucket alone when it holds a different (colliding) key,
+        # otherwise erasing an absent key would evict an unrelated entry.
+        if self.keys[bucket] == key:
+            self.keys[bucket] = 0
+            # Reset the value too, so the emptied bucket cannot hand a stale
+            # value back to a lookup of the reserved key 0.
+            self.values[bucket] = 0
+        return 0
+
+
+# C division semantics: skips Cython's Python-style zero-division check and
+# sign adjustment, which are not wanted in this nogil hot path. Callers must
+# therefore guarantee that size is non-zero.
+@cython.cdivision(True)
+cdef inline size_t _find(uint64_t key, size_t size) nogil:
+    # Map a key to its bucket index in a table with `size` buckets.
+    return key % size
+
+
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@ -2,14 +2,12 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t

 from sparsehash.dense_hash_map cimport dense_hash_map
-from sparsehash.sparse_hash_map cimport sparse_hash_map
-
+from _hashing cimport FixedTable

 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
 ctypedef dense_hash_map[StringHash, size_t] Vocab
-ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
 from spacy.lexeme cimport Lexeme

 from spacy.tokens cimport Tokens
@ -27,7 +25,7 @@ from spacy.lexeme cimport Orthography

 cdef class Language:
    cdef object name
-    cdef SparseVocab* happax
+    cdef FixedTable happax
    cdef Vocab* vocab
    cdef Vocab* distri
    cdef Vocab* ortho
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@ -55,18 +55,17 @@ def set_orth_flags(lex, length):
    return 0


-DEF MAX_HAPPAX = 1000000
+DEF MAX_HAPPAX = 1048576


 cdef class Language:
    def __cinit__(self, name):
        self.name = name
        self.bacov = {}
-        self.happax = new SparseVocab()
+        self.happax = FixedTable(MAX_HAPPAX)
        self.vocab = new Vocab()
        self.ortho = new Vocab()
        self.distri = new Vocab()
-        self.happax[0].set_deleted_key(0)
        self.vocab[0].set_empty_key(0)
        self.distri[0].set_empty_key(0)
        self.ortho[0].set_empty_key(0)
@ -108,7 +107,7 @@ cdef class Language:
   
    cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
        '''Hash unicode with MurmurHash64A'''
-        return mrmr.hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)
+        return mrmr.real_hash64(<Py_UNICODE*>s, length * sizeof(Py_UNICODE), 0)

    cdef unicode unhash(self, StringHash hash_value):
        '''Fetch a string from the reverse index, given its hash value.'''
@ -128,32 +127,20 @@ cdef class Language:
        cdef Lexeme* word_ptr = <Lexeme*>self.vocab[0][hashed]
        if word_ptr == NULL:
            # Now check words seen exactly once
-            word_ptr = <Lexeme*>self.happax[0][hashed]
+            word_ptr = <Lexeme*>self.happax.get(hashed)
            if word_ptr == NULL:
                start = self.find_split(string, length) if start == -1 else start
                word_ptr = self._add(hashed, string, start, length)
            else:
                # Second time word seen, move to vocab
                self.vocab[0][hashed] = <Lexeme_addr>word_ptr
-                self.happax[0].erase(hashed)
+                self.happax.erase(hashed)
        return <Lexeme_addr>word_ptr

    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
        cdef size_t i
-        cdef sparse_hash_map[StringHash, size_t].iterator it
-        cdef pair[StringHash, size_t] last_elem
-        if self.happax[0].size() >= MAX_HAPPAX:
-            # Delete last element.
-            last_elem = deref(self.happax[0].end())
-            free(<Orthography*>self.ortho[0][last_elem.first])
-            # TODO: Do this when we set distributions
-            #free(<Distribution*>self.distri[0][last_elem.first])
-            free(<Lexeme*>last_elem.second)
-            self.happax[0].erase(last_elem.first)
-            self.ortho[0].erase(last_elem.first)
-            self.distri[0].erase(last_elem.first)
        word = self.init_lexeme(string, hashed, split, length)
-        self.happax[0][hashed] = <Lexeme_addr>word
+        self.happax.insert(hashed, <size_t>word)
        self.bacov[hashed] = string
        return word