From d6e07aa9222a801925ba32123a8da0225cc5bbec Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 2 Aug 2014 21:51:52 +0100
Subject: [PATCH] * Switch to 32bit hash for strings

---
 spacy/lexeme.pxd |  3 ++-
 spacy/spacy.pxd  |  3 ++-
 spacy/spacy.pyx  | 14 +++++++-------
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 17ea473f9..6b9dc5c7a 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,8 +1,9 @@
+from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 
 # Put these above import to avoid circular import problem
 ctypedef int ClusterID
-ctypedef uint64_t StringHash
+ctypedef uint32_t StringHash
 ctypedef size_t Lexeme_addr
 ctypedef char Bits8
 ctypedef uint64_t Bits64
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index f5316a618..862a84b7f 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -1,4 +1,5 @@
 from libcpp.vector cimport vector
+from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 
 from sparsehash.dense_hash_map cimport dense_hash_map
@@ -6,7 +7,7 @@ from _hashing cimport FixedTable
 
 # Circular import problems here
 ctypedef size_t Lexeme_addr
-ctypedef uint64_t StringHash
+ctypedef uint32_t StringHash
 ctypedef dense_hash_map[StringHash, size_t] Vocab
 
 from spacy.lexeme cimport Lexeme
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 1e31ecdb2..515ed4bbd 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -107,11 +107,11 @@ cdef class Language:
 
     cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
         '''Hash unicode with MurmurHash64A'''
-        return mrmr.hash64(s, length * sizeof(Py_UNICODE), 0)
+        return mrmr.hash32(s, length * sizeof(Py_UNICODE), 0)
 
     cdef unicode unhash(self, StringHash hash_value):
         '''Fetch a string from the reverse index, given its hash value.'''
-        return self.bacov[hash_value]
+        return self.bacov[hash_value].decode('utf8')
 
     cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
         '''Fetch a Lexeme representing a word string. If the word has not been seen,
@@ -147,7 +147,7 @@ cdef class Language:
             self._happax_to_vocab(self.happax.keys[hashed % self.happax.size],
                                   self.happax.values[hashed % self.happax.size])
         self.happax.insert(hashed, word)
-        self.bacov[hashed] = string
+        self.bacov[hashed] = string.encode('utf8')
         return word
 
     cpdef Tokens tokenize(self, unicode string):
@@ -202,7 +202,7 @@ cdef class Language:
             tail_string = ''
 
         word.lex = self.hash_string(lex, len(lex))
-        self.bacov[word.lex] = lex
+        self.bacov[word.lex] = lex.encode('utf8')
         word.orth = self.ortho[0][word.lex]
         if word.orth == NULL:
             word.orth = self.init_orth(word.lex, lex)
@@ -231,9 +231,9 @@ cdef class Language:
         orth.shape = self.hash_string(shape, len(shape))
         orth.norm = self.hash_string(norm, len(norm))
 
-        self.bacov[orth.last3] = last3.encode('utf8')
-        self.bacov[orth.shape] = shape.encode('utf8')
-        self.bacov[orth.norm] = norm.encode('utf8')
+        self.bacov[orth.last3] = last3.encode('utf8')
+        self.bacov[orth.shape] = shape.encode('utf8')
+        self.bacov[orth.norm] = norm.encode('utf8')
 
         self.ortho[0][hashed] = orth
         return orth
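
A minimal Python sketch of the convention these hunks adopt: keys in the reverse index (bacov) become 32-bit MurmurHash values, strings are encoded to UTF-8 bytes on insertion, and unhash() decodes them back to unicode on lookup. The mmh3 package and the ReverseIndex class below are assumptions made for illustration only; the patch itself goes through the project's mrmr Cython wrapper and a dense_hash_map-backed vocab.

import mmh3  # assumed stand-in for the mrmr MurmurHash wrapper used in the Cython sources


class ReverseIndex:
    '''Illustrative hash -> string table mirroring the bacov pattern in spacy.pyx.'''

    def __init__(self):
        self.bacov = {}  # 32-bit hash -> UTF-8 encoded bytes

    def hash_string(self, s):
        # mmh3.hash returns a signed 32-bit int; mask to an unsigned value,
        # mirroring the uint32_t StringHash typedef introduced by this patch.
        return mmh3.hash(s.encode('utf8'), 0) & 0xFFFFFFFF

    def intern(self, s):
        h = self.hash_string(s)
        self.bacov[h] = s.encode('utf8')      # encode on store
        return h

    def unhash(self, h):
        return self.bacov[h].decode('utf8')   # decode on fetch


if __name__ == '__main__':
    idx = ReverseIndex()
    h = idx.intern('Hello')
    assert idx.unhash(h) == 'Hello'
    print(hex(h))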