From d6e07aa9222a801925ba32123a8da0225cc5bbec Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 2 Aug 2014 21:51:52 +0100
Subject: [PATCH] * Switch to 32bit hash for strings

---
 spacy/lexeme.pxd |  3 ++-
 spacy/spacy.pxd  |  3 ++-
 spacy/spacy.pyx  | 14 +++++++-------
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 17ea473f9..6b9dc5c7a 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,8 +1,9 @@
+from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 
 # Put these above import to avoid circular import problem
 ctypedef int ClusterID
-ctypedef uint64_t StringHash
+ctypedef uint32_t StringHash
 ctypedef size_t Lexeme_addr
 ctypedef char Bits8
 ctypedef uint64_t Bits64
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index f5316a618..862a84b7f 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -1,4 +1,5 @@
 from libcpp.vector cimport vector
+from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 
 from sparsehash.dense_hash_map cimport dense_hash_map
@@ -6,7 +7,7 @@ from _hashing cimport FixedTable
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
-ctypedef uint64_t StringHash
+ctypedef uint32_t StringHash
 ctypedef dense_hash_map[StringHash, size_t] Vocab
 
 from spacy.lexeme cimport Lexeme
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 1e31ecdb2..515ed4bbd 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -107,11 +107,11 @@ cdef class Language:
 
     cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
         '''Hash unicode with MurmurHash64A'''
-        return mrmr.hash64(s, length * sizeof(Py_UNICODE), 0)
+        return mrmr.hash32(s, length * sizeof(Py_UNICODE), 0)
 
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
-        return self.bacov[hash_value]
+        return self.bacov[hash_value].decode('utf8')
 
     cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
         '''Fetch a Lexeme representing a word string. If the word has not been seen,
@@ -147,7 +147,7 @@ cdef class Language:
             self._happax_to_vocab(self.happax.keys[hashed % self.happax.size],
                                   self.happax.values[hashed % self.happax.size])
         self.happax.insert(hashed, word)
-        self.bacov[hashed] = string
+        self.bacov[hashed] = string.encode('utf8')
         return word
 
     cpdef Tokens tokenize(self, unicode string):
@@ -202,7 +202,7 @@ cdef class Language:
             tail_string = ''
 
         word.lex = self.hash_string(lex, len(lex))
-        self.bacov[word.lex] = lex
+        self.bacov[word.lex] = lex.encode('utf8')
         word.orth = self.ortho[0][word.lex]
         if word.orth == NULL:
             word.orth = self.init_orth(word.lex, lex)
@@ -231,9 +231,9 @@ cdef class Language:
         orth.shape = self.hash_string(shape, len(shape))
         orth.norm = self.hash_string(norm, len(norm))
 
-        self.bacov[orth.last3] = last3.encode('utf8')
-        self.bacov[orth.shape] = shape.encode('utf8')
-        self.bacov[orth.norm] = norm.encode('utf8')
+        self.bacov[orth.last3] = last3.encode('utf8')
+        self.bacov[orth.shape] = shape.encode('utf8')
+        self.bacov[orth.norm] = norm.encode('utf8')
 
         self.ortho[0][hashed] = orth
         return orth
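
A minimal Python sketch of the convention these hunks adopt: keys in the reverse index (bacov) become 32-bit MurmurHash values, strings are encoded to UTF-8 bytes on insertion, and unhash() decodes them back to unicode on lookup. The mmh3 package and the ReverseIndex class below are assumptions made for illustration only; the patch itself goes through the project's mrmr Cython wrapper and a dense_hash_map-backed vocab.

import mmh3  # assumed stand-in for the mrmr MurmurHash wrapper used in the Cython sources


class ReverseIndex:
    '''Illustrative hash -> string table mirroring the bacov pattern in spacy.pyx.'''

    def __init__(self):
        self.bacov = {}  # 32-bit hash -> UTF-8 encoded bytes

    def hash_string(self, s):
        # mmh3.hash returns a signed 32-bit int; mask to an unsigned value,
        # mirroring the uint32_t StringHash typedef introduced by this patch.
        return mmh3.hash(s.encode('utf8'), 0) & 0xFFFFFFFF

    def intern(self, s):
        h = self.hash_string(s)
        self.bacov[h] = s.encode('utf8')      # encode on store
        return h

    def unhash(self, h):
        return self.bacov[h].decode('utf8')   # decode on fetch


if __name__ == '__main__':
    idx = ReverseIndex()
    h = idx.intern('Hello')
    assert idx.unhash(h) == 'Hello'
    print(hex(h))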