From 865cacfaf7dc3cef0178c85cd8402d2fd86a2dce Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 16 Aug 2014 17:37:09 +0200 Subject: [PATCH] * Remove dependence on murmurhash --- setup.py | 4 ---- spacy/spacy.pyx | 24 +++++++++--------------- spacy/string_tools.pyx | 2 -- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/setup.py b/setup.py index 7a08ef900..1b0093808 100644 --- a/setup.py +++ b/setup.py @@ -44,8 +44,6 @@ else: # If you're not using virtualenv, set your include dir here. pass -print includes -print cython_includes exts = [ Extension("spacy.en", ["spacy/en.pyx"], language="c++", @@ -64,8 +62,6 @@ exts = [ cython_include_dirs=cython_includes), Extension("spacy.string_tools", ["spacy/string_tools.pyx"], language="c++", include_dirs=includes, cython_include_dirs=cython_includes), - Extension("murmurhash.mrmr", ["murmurhash/mrmr.pyx", 'murmurhash/MurmurHash2.cpp', 'murmurhash/MurmurHash3.cpp'], language="c++", - include_dirs=includes, cython_include_dirs=cython_includes) ] diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx index 12b96270d..743ebc771 100644 --- a/spacy/spacy.pyx +++ b/spacy/spacy.pyx @@ -5,7 +5,6 @@ from libc.stdlib cimport calloc, free from libcpp.pair cimport pair from cython.operator cimport dereference as deref -from murmurhash cimport mrmr from spacy.lexeme cimport Lexeme from spacy.lexeme cimport BLANK_WORD @@ -16,11 +15,6 @@ from os import path cimport cython -cdef inline StringHash hash_string(Py_UNICODE* string, size_t length) nogil: - '''Hash unicode with MurmurHash64A''' - return mrmr.hash32(string, length * sizeof(Py_UNICODE), 0) - - def get_normalized(unicode lex, size_t length): if lex.isalpha() and lex.islower(): return lex @@ -97,7 +91,7 @@ cdef class Language: if length == 0: return &BLANK_WORD - cdef StringHash hashed = hash_string(string, len(string)) + cdef StringHash hashed = hash(string) # First, check words seen 2+ times cdef Lexeme* word_ptr = self.vocab[0][hashed] if word_ptr == NULL: @@ -112,7 +106,7 @@ cdef class Language: cdef size_t length = len(string) if length == 0: return &BLANK_WORD - cdef StringHash hashed = hash_string(string, length) + cdef StringHash hashed = hash(string) # First, check words seen 2+ times cdef Lexeme* word_ptr = self.vocab[0][hashed] cdef int split @@ -141,7 +135,7 @@ cdef class Language: cdef Lexeme* new_lexeme(self, StringHash key, unicode string) except NULL: cdef Lexeme* word = calloc(1, sizeof(Lexeme)) word.sic = key - word.lex = hash_string(string, len(string)) + word.lex = hash(string) self.bacov[word.lex] = string word.orth = self.lookup_orth(word.lex, string) word.dist = self.lookup_dist(word.lex) @@ -162,11 +156,11 @@ cdef class Language: orth.flags = set_orth_flags(lex, orth.length) orth.norm = hashed last3 = substr(lex, length - 3, length, length) - orth.last3 = hash_string(last3, len(last3)) + orth.last3 = hash(last3) norm = get_normalized(lex, length) - orth.norm = hash_string(norm, len(norm)) + orth.norm = hash(norm) shape = get_word_shape(lex, length) - orth.shape = hash_string(shape, len(shape)) + orth.shape = hash(shape) self.bacov[orth.last3] = last3 self.bacov[orth.norm] = norm @@ -191,12 +185,12 @@ cdef class Language: cdef Lexeme* word cdef StringHash hashed for chunk, lex, tokens in token_rules: - hashed = hash_string(chunk, len(chunk)) + hashed = hash(chunk) word = self.new_lexeme(hashed, lex) for i, lex in enumerate(tokens): token_string = '%s:@:%d:@:%s' % (chunk, i, lex) length = len(token_string) - hashed = hash_string(token_string, len(token_string)) + hashed = hash(token_string) word.tail = self.new_lexeme(hashed, lex) word = word.tail @@ -214,7 +208,7 @@ cdef class Language: # the first 4 bits. See redshift._parse_features.pyx cluster = int(cluster_str[::-1], 2) upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0)) - hashed = hash_string(token_string, len(token_string)) + hashed = hash(token_string) word = self.init_lexeme(hashed, token_string) diff --git a/spacy/string_tools.pyx b/spacy/string_tools.pyx index 8fc7995b4..7a8304c2a 100644 --- a/spacy/string_tools.pyx +++ b/spacy/string_tools.pyx @@ -1,6 +1,4 @@ # cython: profile=True -from murmurhash cimport mrmr - cpdef bytes to_bytes(unicode string): return string.encode('utf8')