From 3ada25b92d122ad0acdcc443e9c410c3d93db834 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 15 Aug 2014 23:06:46 +0200
Subject: [PATCH] * Shifting to WordTree instead of dense_hash_map for storage.

---
 spacy/_hashing.pxd     | 15 +++++
 spacy/_hashing.pyx     | 53 +++++++++++++++
 spacy/en.pyx           |  8 ++-
 spacy/lexeme.pyx       |  5 ++
 spacy/spacy.pxd        |  5 +-
 spacy/spacy.pyx        | 146 ++++++++++++++---------------------------
 spacy/string_tools.pyx |  6 ++
 7 files changed, 137 insertions(+), 101 deletions(-)

diff --git a/spacy/_hashing.pxd b/spacy/_hashing.pxd
index d87704c1a..eac56941d 100644
--- a/spacy/_hashing.pxd
+++ b/spacy/_hashing.pxd
@@ -1,6 +1,9 @@
 from libc.stdint cimport uint64_t

+from chartree cimport CharTree
+
+cdef bytes to_utf8(unicode string)

 cdef class FixedTable:
     cdef size_t size
     cdef uint64_t* keys
@@ -9,3 +12,15 @@ cdef class FixedTable:
     cdef size_t insert(self, uint64_t key, size_t value) nogil
     cdef size_t get(self, uint64_t key) nogil
     cdef int erase(self, uint64_t key) nogil
+
+
+cdef class WordTree:
+    cdef size_t max_length
+    cdef size_t default
+    cdef CharTree* _trees
+    cdef dict _dict
+
+    cdef size_t get(self, bytes string) except *
+    cdef int set(self, bytes string, size_t value) except *
+    cdef bint contains(self, bytes string) except *

diff --git a/spacy/_hashing.pyx b/spacy/_hashing.pyx
index 99c8e7406..df7da7053 100644
--- a/spacy/_hashing.pyx
+++ b/spacy/_hashing.pyx
@@ -1,6 +1,8 @@
 from libc.stdlib cimport calloc, free
 import cython

+cimport chartree
+

 cdef class FixedTable:
     def __cinit__(self, const size_t size):
@@ -51,3 +53,54 @@ cdef class FixedTable:
 @cython.cdivision
 cdef inline size_t _find(uint64_t key, size_t size) nogil:
     return key % size
+
+
+cdef bytes to_utf8(unicode string):
+    cdef bytes py_byte_string = string.encode('UTF-8')
+    return py_byte_string
+
+
+cdef unicode to_unicode(unsigned char[:] c_string, size_t length):
+    # This prevents a call to strlen
+    cdef bytes py_string = c_string[:length]
+    return py_string.decode('utf8')
+
+
+cdef class WordTree:
+    def __cinit__(self, size_t default, size_t max_length):
+        self.max_length = max_length
+        self.default = default
+        self._trees = <CharTree*>calloc(max_length, sizeof(CharTree))
+        for i in range(self.max_length):
+            chartree.init(&self._trees[i], i)
+        self._dict = {}
+
+    cdef size_t get(self, bytes string) except *:
+        cdef size_t length = len(string)
+        if length >= self.max_length:
+            return self._dict.get(string, 0)
+        else:
+            return chartree.getitem(&self._trees[length], string)
+
+    cdef int set(self, bytes string, size_t value) except *:
+        cdef size_t length = len(string)
+        if length >= self.max_length:
+            self._dict[string] = value
+        else:
+            chartree.setitem(&self._trees[length], string, value)
+
+    cdef bint contains(self, bytes string) except *:
+        cdef size_t length = len(string)
+        if length >= self.max_length:
+            return string in self._dict
+        else:
+            return chartree.contains(&self._trees[length], string)
+
+    def __getitem__(self, unicode key):
+        return self.get(to_utf8(key))
+
+    def __setitem__(self, unicode key, size_t value):
+        self.set(to_utf8(key), value)
+
+    def __contains__(self, unicode key):
+        return self.contains(to_utf8(key))
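The WordTree that replaces dense_hash_map buckets entries by byte length: strings shorter than max_length are stored in a CharTree specialized to that exact length (chartree.init takes the length, suggesting each tree holds fixed-length keys), and anything longer falls back to a Python dict. A minimal pure-Python model of that routing, with plain dicts standing in for the CharTree internals, which this patch does not show:

    # Toy model of WordTree: one map per string length, plus a fallback
    # dict for rare long strings. Missing keys report 0, as in the patch.
    class PyWordTree(object):
        def __init__(self, default, max_length):
            self.default = default
            self.max_length = max_length
            self._trees = [{} for _ in range(max_length)]
            self._dict = {}

        def _bucket(self, key):
            # Route on length: short keys go to the per-length "tree",
            # long keys to the general-purpose dict.
            if len(key) >= self.max_length:
                return self._dict
            return self._trees[len(key)]

        def __setitem__(self, key, value):
            key = key.encode('utf8')
            self._bucket(key)[key] = value

        def __getitem__(self, key):
            key = key.encode('utf8')
            return self._bucket(key).get(key, self.default)

        def __contains__(self, key):
            key = key.encode('utf8')
            return key in self._bucket(key)

    vocab = PyWordTree(0, 10)
    vocab[u'the'] = 42
    assert vocab[u'the'] == 42 and u'the' in vocab
    assert vocab[u'uncharacteristically'] == 0  # >= 10 bytes: dict path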
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 1775d097c..926e954fe 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -43,9 +43,8 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
     # Don't count commas as punct if the next char is a number
     if word[i] == "," and i < (length - 1) and word[i+1].isdigit():
         return False
-    # Don't count periods as punct if the next char is not whitespace
-    if word[i] == "." and i < (length - 1) and not word[i+1].isspace():
-        return False
+    if word[i] == ".":
+        return True
     return not word[i].isalnum()
@@ -62,3 +61,6 @@ cpdef Lexeme_addr lookup(unicode string) except 0:

 cpdef unicode unhash(StringHash hash_value):
     return EN.unhash(hash_value)
+
+def words():
+    return EN.words

diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 430033db0..fd193b57e 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -20,6 +20,11 @@ from spacy.spacy cimport StringHash
 #SHAPE = StringAttr.shape
 #LAST3 = StringAttr.last3

+cdef Lexeme* init(StringHash hashed, bytes lex_string) except NULL:
+    cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
+    word.lex = hashed
+    return word
+

 cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
     if attr == SIC:

diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index cf592a338..bff494232 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -4,6 +4,7 @@ from libc.stdint cimport uint64_t
 from sparsehash.dense_hash_map cimport dense_hash_map
 from _hashing cimport FixedTable
+from _hashing cimport WordTree

 # Circular import problems here
 ctypedef size_t Lexeme_addr
@@ -26,7 +27,7 @@ from spacy.lexeme cimport Orthography

 cdef class Language:
     cdef object name
-    cdef Vocab* vocab
+    cdef WordTree vocab
     cdef Vocab* distri
     cdef Vocab* ortho
     cdef dict bacov
@@ -37,7 +38,7 @@ cdef class Language:
     cdef unicode unhash(self, StringHash hashed)

     cpdef Tokens tokenize(self, unicode text)
-    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length)
+    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length) except NULL
     cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed, int split, size_t length)
     cdef Orthography* init_orth(self, StringHash hashed, unicode lex)
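Throughout the Language changes that follow, a Lexeme behaves as a node in a NULL-terminated singly linked list: lookup hands back the head of a chain, and callers walk .tail. A small Python sketch of that shape (PyLexeme is a hypothetical stand-in, trimmed to the two fields this patch touches; the real struct has more members):

    # Stand-in for the C Lexeme struct.
    class PyLexeme(object):
        def __init__(self, lex_hash):
            self.lex = lex_hash  # hash of this sub-token's string
            self.tail = None     # next sub-token; None models NULL

    def walk(head):
        # Mirrors the `while token != NULL: ... token = token.tail` loops.
        while head is not None:
            yield head
            head = head.tail

    do = PyLexeme(hash(u'do'))
    do.tail = PyLexeme(hash(u"n't"))
    assert [n.lex for n in walk(do)] == [hash(u'do'), hash(u"n't")]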
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index ffdc1522f..30c3ead43 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -5,12 +5,12 @@ from libc.stdlib cimport calloc, free
 from libcpp.pair cimport pair
 from cython.operator cimport dereference as deref

-from murmurhash cimport mrmr
 from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport BLANK_WORD

 from spacy.string_tools cimport substr
-
+
+from _hashing cimport WordTree
+from _hashing cimport to_utf8

 from . import util
 from os import path
@@ -58,28 +58,27 @@ cdef class Language:
     def __cinit__(self, name):
         self.name = name
         self.bacov = {}
-        self.vocab = new Vocab()
+        self.vocab = WordTree(0, 10)
         self.ortho = new Vocab()
         self.distri = new Vocab()
-        self.vocab[0].set_empty_key(0)
         self.distri[0].set_empty_key(0)
         self.ortho[0].set_empty_key(0)
-        self.vocab[0].set_deleted_key(1)
         self.distri[0].set_deleted_key(1)
         self.ortho[0].set_deleted_key(1)
         self.load_tokenization(util.read_tokenization(name))

+    property words:
+        def __get__(self):
+            return self.bacov.keys()
+
     def load_tokenization(self, token_rules=None):
         cdef Lexeme* word
         cdef StringHash hashed
         for chunk, lex, tokens in token_rules:
-            hashed = self.hash_string(chunk, len(chunk))
-            word = self._add(hashed, lex, len(lex), len(lex))
+            word = self.init_lexeme(chunk)
             for i, lex in enumerate(tokens):
                 token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
-                length = len(token_string)
-                hashed = self.hash_string(token_string, length)
-                word.tail = self._add(hashed, lex, 0, len(lex))
+                word.tail = self.init_lexeme(lex)
                 word = word.tail

     def load_clusters(self):
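The load_clusters hunk that follows keeps the existing trick of parsing Brown cluster paths reversed: with the root branch in the least significant bit, a mask like `& 15` extracts the first four branches of the path. A quick self-contained demonstration of why this works:

    # Brown cluster paths are bit strings, root branch first, e.g. '10110'.
    # Parsing the *reversed* string makes the root the lowest bit...
    cluster_str = '10110'
    cluster = int(cluster_str[::-1], 2)

    # ...so `& 15` keeps exactly the first four branches of the path.
    first_four = cluster & 15
    prefix = ''.join(str((first_four >> i) & 1) for i in range(4))
    assert prefix == cluster_str[:4]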
@@ -89,111 +88,59 @@ cdef class Language:
         brown_loc = path.join(data_dir, 'clusters')
         cdef size_t start
         cdef int end
+        cdef unicode token_unicode
+        cdef bytes token_bytes
         with util.utf8open(brown_loc) as browns_file:
             for i, line in enumerate(browns_file):
-                cluster_str, token_string, freq_str = line.split()
+                cluster_str, token_unicode, freq_str = line.split()
+                token_bytes = token_unicode.encode('utf8')
                 # Decode as a little-endian string, so that we can do & 15 to get
                 # the first 4 bits. See redshift._parse_features.pyx
                 cluster = int(cluster_str[::-1], 2)
-                upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
+                upper_pc, title_pc = case_stats.get(token_unicode.lower(), (0.0, 0.0))
-                hashed = self.hash_string(token_string, len(token_string))
-                word = self._add(hashed, token_string,
-                                 len(token_string), len(token_string))
-
-    cdef StringHash hash_string(self, Py_UNICODE* s, size_t length) except 0:
-        '''Hash unicode with MurmurHash64A'''
-        return mrmr.hash32(s, length * sizeof(Py_UNICODE), 0)
-
-    cdef unicode unhash(self, StringHash hash_value):
-        '''Fetch a string from the reverse index, given its hash value.'''
-        return self.bacov[hash_value].decode('utf8')
-
-    cdef Lexeme_addr lookup(self, int start, Py_UNICODE* string, size_t length) except 0:
-        '''Fetch a Lexeme representing a word string. If the word has not been seen,
-        construct one, splitting off any attached punctuation or clitics. A
-        reference to BLANK_WORD is returned for the empty string.
-
-        To specify the boundaries of the word if it has not been seen, use lookup_chunk.
-        '''
-        if length == 0:
-            return &BLANK_WORD
-        cdef StringHash hashed = self.hash_string(string, length)
-        # First, check words seen 2+ times
-        cdef Lexeme* word_ptr = self.vocab[0][hashed]
-        if word_ptr == NULL:
-            start = self.find_split(string, length) if start == -1 else start
-            word_ptr = self._add(hashed, string, start, length)
-        return word_ptr
-
-    cdef Lexeme* _add(self, StringHash hashed, unicode string, int split, size_t length):
-        cdef size_t i
-        word = self.init_lexeme(string, hashed, split, length)
-        self.vocab[0][hashed] = word
-        self.bacov[hashed] = string.encode('utf8')
-        return word
-
-    cpdef Tokens tokenize(self, unicode string):
-        cdef size_t length = len(string)
-        cdef Py_UNICODE* characters = string
-
-        cdef size_t i
-        cdef Py_UNICODE c
+                word = self.init_lexeme(token_bytes)

+    cpdef Tokens tokenize(self, unicode unicode_string):
+        cdef bytes characters = unicode_string.encode('utf8')
+        cdef size_t length = len(characters)
+
         cdef Tokens tokens = Tokens(self)
-        cdef Py_UNICODE* current = <Py_UNICODE*>calloc(len(string), sizeof(Py_UNICODE))
-        cdef size_t word_len = 0
+
+        cdef size_t start = 0
+        cdef Lexeme* token
+        cdef size_t i
+        cdef unsigned char c
+
         for i in range(length):
             c = characters[i]
-            if _is_whitespace(c):
-                if word_len != 0:
-                    token = self.lookup(-1, current, word_len)
+            if c == b' ':
+                if start < i:
+                    token = <Lexeme*>self.lookup(characters[start:i])
                     while token != NULL:
                         tokens.append(token)
                         token = token.tail
-                for j in range(word_len+1):
-                    current[j] = 0
-                word_len = 0
-            else:
-                current[word_len] = c
-                word_len += 1
-        if word_len != 0:
-            token = self.lookup(-1, current, word_len)
+                start = i + 1
+        if start < length:
+            token = <Lexeme*>self.lookup(characters[start:])
             while token != NULL:
                 tokens.append(token)
                 token = token.tail
-        free(current)
         return tokens

-    cdef int find_split(self, unicode word, size_t length):
-        return -1
-
-    cdef Lexeme* init_lexeme(self, unicode string, StringHash hashed,
-                             int split, size_t length):
-        cdef Lexeme* word = <Lexeme*>calloc(1, sizeof(Lexeme))
-
-        word.sic = hashed
-
-        cdef unicode tail_string
-        cdef unicode lex
-        if split != 0 and split < length:
-            lex = substr(string, 0, split, length)
-            tail_string = substr(string, split, length, length)
-        else:
-            lex = string
-            tail_string = ''
-
-        word.lex = self.hash_string(lex, len(lex))
-        self.bacov[word.lex] = lex.encode('utf8')
-        word.orth = self.ortho[0][word.lex]
-        if word.orth == NULL:
-            word.orth = self.init_orth(word.lex, lex)
-        word.dist = self.distri[0][word.lex]
-
-        # Now recurse, and deal with the tail
-        if tail_string:
-            word.tail = self.lookup(-1, tail_string, len(tail_string))
-        return word
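The new tokenize works on the UTF-8 bytes directly: it slices between ASCII spaces and chains vocabulary lookups, rather than copying characters into a scratch buffer. In outline (pure Python, with Language.lookup stubbed out by a function; note the flush after the loop, which catches a token that runs to the end of the string):

    # Sketch of the byte-slicing tokenizer loop; `lookup` stands in for
    # Language.lookup, which would return a chain of Lexemes.
    def tokenize(unicode_string, lookup=lambda b: [b]):
        characters = unicode_string.encode('utf8')
        tokens = []
        start = 0
        for i in range(len(characters)):
            if characters[i:i+1] == b' ':
                if start < i:
                    tokens.extend(lookup(characters[start:i]))
                start = i + 1
        if start < len(characters):      # flush the final token
            tokens.extend(lookup(characters[start:]))
        return tokens

    assert tokenize(u"like don't ") == [b'like', b"don't"]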
+    cdef Lexeme_addr lookup(self, bytes string) except 0:
+        '''Fetch a Lexeme representing a word string. If the word has not been seen,
+        construct one, splitting off any attached punctuation or clitics. A
+        reference to BLANK_WORD is returned for the empty string.
+        '''
+        cdef size_t length = len(string)
+        if length == 0:
+            return <Lexeme_addr>&BLANK_WORD
+        cdef Lexeme* word_ptr = <Lexeme*>self.vocab.get(string)
+        if word_ptr == NULL:
+            start = self.find_split(string, length)
+            word_ptr = self.init_lexeme(string[start:])
+            self.vocab.set(string[start:], <size_t>word_ptr)
+        return <Lexeme_addr>word_ptr

     cdef Orthography* init_orth(self, StringHash hashed, unicode lex):
         cdef Orthography* orth = <Orthography*>calloc(1, sizeof(Orthography))
@@ -219,6 +166,13 @@ cdef class Language:
         self.ortho[0][hashed] = orth
         return orth

+    cdef unicode unhash(self, StringHash hash_value):
+        '''Fetch a string from the reverse index, given its hash value.'''
+        return self.bacov[hash_value].decode('utf8')
+
+    cdef int find_split(self, unicode word, size_t length):
+        return -1
+

 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
     if c == ' ':

diff --git a/spacy/string_tools.pyx b/spacy/string_tools.pyx
index 2f199766f..d36cf1215 100644
--- a/spacy/string_tools.pyx
+++ b/spacy/string_tools.pyx
@@ -1,4 +1,10 @@
 # cython: profile=True
+from murmurhash cimport mrmr
+
+
+cdef StringHash hash_string(unsigned char* s, size_t length) except 0:
+    '''Hash bytes with MurmurHash32'''
+    return mrmr.hash32(s, length * sizeof(unsigned char), 0)


 cpdef unicode substr(unicode string, int start, int end, size_t length):
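The relocated hash_string pairs with the bacov reverse index: every string is remembered under its hash so unhash can invert the mapping. The same pattern in pure Python, with the mmh3 package standing in for the murmurhash wrapper (an assumed substitute, not a dependency of this patch):

    import mmh3  # assumed stand-in for murmurhash's mrmr.hash32

    bacov = {}   # reverse index: hash -> UTF-8 bytes

    def hash_string(s):
        utf8 = s.encode('utf8')
        hashed = mmh3.hash(utf8, 0) & 0xffffffff  # unsigned 32-bit MurmurHash3
        bacov[hashed] = utf8                      # remember the reverse mapping
        return hashed

    def unhash(hash_value):
        return bacov[hash_value].decode('utf8')

    assert unhash(hash_string(u'potato')) == u'potato'

Note that with a 32-bit hash, distinct strings can collide in bacov; the code as written would keep only the most recently hashed string for a colliding key.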