From f39211b2b172273a6e7dbc69561ef03530a4d350 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 1 Aug 2014 07:27:21 +0100
Subject: [PATCH] * Add FixedTable for hashing

---
 spacy/_hashing.pxd | 11 +++++++++++
 spacy/_hashing.pyx | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 spacy/spacy.pxd    |  6 ++----
 spacy/spacy.pyx    | 25 ++++++-------------------
 4 files changed, 67 insertions(+), 23 deletions(-)
 create mode 100644 spacy/_hashing.pxd
 create mode 100644 spacy/_hashing.pyx

diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd
new file mode 100644
index 000000000..44733451d
--- /dev/null
+++ b/spacy/_hashing.pxd
@@ -0,0 +1,11 @@
+from libc.stdint cimport uint64_t
+
+
+cdef class FixedTable:
+    cdef size_t size
+    cdef uint64_t* keys
+    cdef size_t* values
+
+    cdef int insert(self, uint64_t key, size_t value) nogil
+    cdef size_t get(self, uint64_t key) nogil
+    cdef int erase(self, uint64_t key) nogil
diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
new file mode 100644
index 000000000..bd59e0469
--- /dev/null
+++ b/spacy/_hashing.pyx
@@ -0,0 +1,48 @@
+from libc.stdlib cimport calloc, free
+import cython
+
+
+cdef class FixedTable:
+    def __cinit__(self, const size_t size):
+        self.size = size
+        self.keys = <uint64_t*>calloc(self.size, sizeof(uint64_t))
+        self.values = <size_t*>calloc(self.size, sizeof(size_t))
+
+    def __dealloc__(self):
+        free(self.keys)
+        free(self.values)
+
+    def __getitem__(self, uint64_t key):
+        return self.get(key)
+
+    def __setitem__(self, uint64_t key, size_t value):
+        self.insert(key, value)
+
+    def pop(self, uint64_t key):
+        self.erase(key)
+
+    def bucket(self, uint64_t key):
+        return _find(key, self.size)
+
+    cdef int insert(self, uint64_t key, size_t value) nogil:
+        cdef size_t bucket = _find(key, self.size)
+        self.keys[bucket] = key
+        self.values[bucket] = value
+
+    cdef size_t get(self, uint64_t key) nogil:
+        cdef size_t bucket = _find(key, self.size)
+        if self.keys[bucket] == key:
+            return self.values[bucket]
+        else:
+            return 0
+
+    cdef int erase(self, uint64_t key) nogil:
+        cdef size_t bucket = _find(key, self.size)
+        self.keys[bucket] = 0
+
+
+@cython.cdivision
+cdef inline size_t _find(uint64_t key, size_t size) nogil:
+    return key % size
+
+
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index 2c8b5e141..befcf9a2f 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -2,14 +2,12 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t
 
 from sparsehash.dense_hash_map cimport dense_hash_map
-from sparsehash.sparse_hash_map cimport sparse_hash_map
-
+from _hashing cimport FixedTable
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
 ctypedef uint64_t StringHash
 ctypedef dense_hash_map[StringHash, size_t] Vocab
-ctypedef sparse_hash_map[StringHash, size_t] SparseVocab
 
 from spacy.lexeme cimport Lexeme
 from spacy.tokens cimport Tokens
@@ -27,7 +25,7 @@ from spacy.lexeme cimport Orthography
 
 cdef class Language:
     cdef object name
-    cdef SparseVocab* happax
+    cdef FixedTable happax
     cdef Vocab* vocab
     cdef Vocab* distri
    cdef Vocab* ortho
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 535f2ae55..5db6c26dd 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -55,18 +55,17 @@ def set_orth_flags(lex, length):
     return 0
 
 
-DEF MAX_HAPPAX = 1000000
+DEF MAX_HAPPAX = 1048576
 
 
 cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.happax = new SparseVocab()
+        self.happax = FixedTable(MAX_HAPPAX)
         self.vocab = new Vocab()
         self.ortho = new Vocab()
         self.distri = new Vocab()
-        self.happax[0].set_deleted_key(0)
         self.vocab[0].set_empty_key(0)
         self.distri[0].set_empty_key(0)
         self.ortho[0].set_empty_key(0)
@@ -108,7 +107,7 @@ cdef class Language:
 
     cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
         '''Hash unicode with MurmurHash64A'''
-        return mrmr.hash64(s, length * sizeof(Py_UNICODE), 0)
+        return mrmr.real_hash64(s, length * sizeof(Py_UNICODE), 0)
 
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
@@ -128,32 +127,20 @@ cdef class Language:
         cdef Lexeme* word_ptr = self.vocab[0][hashed]
         if word_ptr == NULL:
             # Now check words seen exactly once
-            word_ptr = self.happax[0][hashed]
+            word_ptr = self.happax.get(hashed)
             if word_ptr == NULL:
                 start = self.find_split(string, length) if start == -1 else start
                 word_ptr = self._add(hashed, string, start, length)
             else:
                 # Second time word seen, move to vocab
                 self.vocab[0][hashed] = word_ptr
-                self.happax[0].erase(hashed)
+                self.happax.erase(hashed)
         return word_ptr
 
     cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
         cdef size_t i
-        cdef sparse_hash_map[StringHash, size_t].iterator it
-        cdef pair[StringHash, size_t] last_elem
-        if self.happax[0].size() >= MAX_HAPPAX:
-            # Delete last element.
-            last_elem = deref(self.happax[0].end())
-            free(self.ortho[0][last_elem.first])
-            # TODO: Do this when we set distributions
-            #free(self.distri[0][last_elem.first])
-            free(last_elem.second)
-            self.happax[0].erase(last_elem.first)
-            self.ortho[0].erase(last_elem.first)
-            self.distri[0].erase(last_elem.first)
         word = self.init_lexeme(string, hashed, split, length)
-        self.happax[0][hashed] = word
+        self.happax.insert(hashed, word)
         self.bacov[hashed] = string
         return word
 
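
Note (illustrative, not part of the patch): FixedTable is a fixed-size array of (key, value) slots with no collision resolution. The bucket is just key % size, an insert into an occupied bucket overwrites whatever was there, and a missed lookup returns 0, which the callers in spacy.pyx treat as a NULL Lexeme pointer. The sketch below exercises the Python-level wrappers (__setitem__, __getitem__, bucket); it assumes the module built from spacy/_hashing.pyx is importable as spacy._hashing, and the keys and values are arbitrary.

    # Usage sketch only; assumes the compiled extension is importable
    # as spacy._hashing.
    from spacy._hashing import FixedTable

    table = FixedTable(1024)     # bucket count is fixed at construction
    table[7] = 42                # __setitem__ -> insert(); bucket = 7 % 1024
    assert table[7] == 42        # __getitem__ -> get()
    assert table[8] == 0         # miss: get() returns 0 (treated as NULL by callers)

    table[7 + 1024] = 99         # hashes to the same bucket as key 7...
    assert table.bucket(7) == table.bucket(7 + 1024)
    assert table[7 + 1024] == 99
    assert table[7] == 0         # ...so the earlier entry was silently overwritten

Because collisions overwrite in place, the table can never grow past its fixed size; that is what lets this commit drop the manual eviction loop that _add previously ran against the sparse_hash_map once MAX_HAPPAX entries had accumulated.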