From ff1869ff07f78606e1bbfea7e08a845c4585f7e9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 07:36:43 +0200
Subject: [PATCH 1/4] * Fixed major efficiency problem, from not quite grokking pass by reference in cython c++

---
 ext/murmurhash.pxd     | 10 +++--
 ext/murmurhash.pyx     |  1 +
 ext/sparsehash.pyx     |  1 +
 spacy/en.pxd           |  2 +-
 spacy/en.pyx           |  9 +++--
 spacy/en_ptb.pxd       |  2 +-
 spacy/en_ptb.pyx       |  4 +-
 spacy/lexeme.pxd       |  4 +-
 spacy/lexeme.pyx       |  8 ++--
 spacy/spacy.pxd        | 11 +++---
 spacy/spacy.pyx        | 85 ++++++++++++++++++++++++++++++------------
 spacy/string_tools.pyx |  2 +
 spacy/util.py          | 45 +---------------------
 13 files changed, 96 insertions(+), 88 deletions(-)

diff --git a/ext/murmurhash.pxd b/ext/murmurhash.pxd
index 67b0cd06c..9ded57240 100644
--- a/ext/murmurhash.pxd
+++ b/ext/murmurhash.pxd
@@ -1,11 +1,13 @@
+# cython profile=True
+
 from libc.stdint cimport uint64_t, int64_t
 
 
 cdef extern from "../include/MurmurHash3.h":
-    void MurmurHash3_x86_32(void * key, uint64_t len, uint64_t seed, void* out)
-    void MurmurHash3_x86_128(void * key, uint64_t len, uint64_t seed, void* out)
+    void MurmurHash3_x86_32(void * key, uint64_t len, uint64_t seed, void* out) nogil
+    void MurmurHash3_x86_128(void * key, uint64_t len, uint64_t seed, void* out) nogil
 
 
 cdef extern from "../include/MurmurHash2.h":
-    uint64_t MurmurHash64A(void * key, uint64_t len, int64_t seed)
-    uint64_t MurmurHash64B(void * key, uint64_t len, int64_t seed)
+    uint64_t MurmurHash64A(void * key, uint64_t len, int64_t seed) nogil
+    uint64_t MurmurHash64B(void * key, uint64_t len, int64_t seed) nogil
diff --git a/ext/murmurhash.pyx b/ext/murmurhash.pyx
index e69de29bb..54652d22a 100644
--- a/ext/murmurhash.pyx
+++ b/ext/murmurhash.pyx
@@ -0,0 +1 @@
+# cython: profile=True
diff --git a/ext/sparsehash.pyx b/ext/sparsehash.pyx
index e69de29bb..54f2811e1 100644
--- a/ext/sparsehash.pyx
+++ b/ext/sparsehash.pyx
@@ -0,0 +1 @@
+# cython profile=True
diff --git a/spacy/en.pxd b/spacy/en.pxd
index 183490102..efced3606 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -6,7 +6,7 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr
 
 
-cdef Vocab VOCAB
+cdef Vocab* VOCAB
 cdef dict BACOV
 
 
diff --git a/spacy/en.pyx b/spacy/en.pyx
index df8d30ff9..986468988 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -1,3 +1,4 @@
+# cython: profile=True
 '''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
 so that strings can be retrieved from hashes. Use 64-bit hash values and
 boldly assume no collisions.
@@ -15,19 +16,18 @@ from . import util
 cimport spacy
 
 BACOV = {}
-VOCAB = Vocab()
+VOCAB = new Vocab(100000)
 VOCAB.set_empty_key(0)
 
 spacy.load_tokenization(VOCAB, BACOV, util.read_tokenization('en'))
 
-
 cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
     return spacy.tokenize(VOCAB, BACOV, find_split, string)
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
 
 
 cpdef unicode unhash(StringHash hash_value):
@@ -72,3 +72,6 @@ cdef bint is_punct(unicode word, size_t i, size_t length):
     if word[i] == "." and i < (length - 1) and word[i+1].isdigit():
         return False
     return not word[i].isalnum()
+
+
+#spacy.load_browns(VOCAB, BACOV, find_split)
diff --git a/spacy/en_ptb.pxd b/spacy/en_ptb.pxd
index 183490102..efced3606 100644
--- a/spacy/en_ptb.pxd
+++ b/spacy/en_ptb.pxd
@@ -6,7 +6,7 @@ from spacy.lexeme cimport Lexeme
 from spacy.lexeme cimport Lexeme_addr
 
 
-cdef Vocab VOCAB
+cdef Vocab* VOCAB
 cdef dict BACOV
 
 
diff --git a/spacy/en_ptb.pyx b/spacy/en_ptb.pyx
index 2ad8f96b2..d950c2133 100644
--- a/spacy/en_ptb.pyx
+++ b/spacy/en_ptb.pyx
@@ -15,7 +15,7 @@ from . import util
 cimport spacy
 
 BACOV = {}
-VOCAB = Vocab()
+VOCAB = new Vocab(100000)
 VOCAB.set_empty_key(0)
 
 
@@ -27,7 +27,7 @@ cpdef vector[Lexeme_addr] tokenize(unicode string) except *:
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
-    return spacy.lookup(VOCAB, BACOV, find_split, -1, string)
+    return spacy.lookup(VOCAB, BACOV, find_split, -1, string, len(string))
 
 
 cpdef unicode unhash(StringHash hash_value):
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 9d6be64b7..2cd38e709 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -25,9 +25,9 @@ cdef struct Lexeme:
 
 cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL)
 
-cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
                          unicode string, StringHash hashed,
-                         int split, size_t length) except NULL
+                         int split, size_t length)
 
 # Use these to access the Lexeme fields via get_attr(Lexeme*, LexAttr), which
 # has a conditional to pick out the correct item.  This allows safe iteration
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 4d760f0a0..2bc56969b 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,3 +1,4 @@
+# cython: profile=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
@@ -13,9 +14,9 @@ from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
 
-cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
+cdef Lexeme* init_lexeme(Vocab* vocab, dict bacov, Splitter find_split,
                          unicode string, StringHash hashed,
-                         int split, size_t length) except NULL:
+                         int split, size_t length):
     assert split <= length
 
     cdef Lexeme* word = calloc(1, sizeof(Lexeme))
@@ -54,7 +55,8 @@ cdef Lexeme* init_lexeme(Vocab vocab, dict bacov, Splitter find_split,
 
 
     # Now recurse, and deal with the tail
     if tail_string:
-        word.tail = lookup(vocab, bacov, find_split, -1, tail_string)
+        word.tail = lookup(vocab, bacov, find_split, -1, tail_string,
+                           len(tail_string))
     return word
 
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index db3226d23..ac1132ca2 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -12,12 +12,13 @@ ctypedef int (*Splitter)(unicode word, size_t length)
 
 
 from spacy.lexeme cimport Lexeme
 
 
-cdef load_tokenization(Vocab& vocab, dict bacov, token_rules)
-cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+cdef load_tokenization(Vocab* vocab, dict bacov, token_rules)
+cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split)
+cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
                                   unicode string) except *
-cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter splitter, int start,
-                        unicode string) except 0
-cdef StringHash hash_string(unicode s, size_t length) except 0
+cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter splitter, int start,
+                        Py_UNICODE* string, size_t length) except 0
+cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil
 cdef unicode unhash(dict bacov, StringHash hash_value)
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index ca04ad82c..1cc73ac3c 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -1,3 +1,4 @@
+# cython: profile=True
 from __future__ import unicode_literals
 
 from ext.murmurhash cimport MurmurHash64A
@@ -9,14 +10,16 @@ from spacy.lexeme cimport BLANK_WORD
 from spacy.string_tools cimport is_whitespace
 
 from . import util
+from os import path
+cimport cython
 
 
-cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
+cdef load_tokenization(Vocab* vocab, dict bacov, token_rules):
     cdef Lexeme* word
     cdef StringHash hashed
     for chunk, lex, tokens in token_rules:
         hashed = hash_string(chunk, len(chunk))
-        assert vocab[hashed] == 0, chunk
+        assert vocab[0][hashed] == 0, chunk
         word = _add(vocab, bacov, NULL, hashed, lex, len(lex), len(lex))
         for i, lex in enumerate(tokens):
             token_string = '%s:@:%d:@:%s' % (chunk, i, lex)
@@ -26,7 +29,29 @@ cdef load_tokenization(Vocab& vocab, dict bacov, token_rules):
             word = word.tail
 
 
-cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
+cdef load_browns(Vocab* vocab, dict bacov, Splitter find_split):
+    cdef Lexeme* w
+    data_dir = path.join(path.dirname(__file__), '..', 'data', 'en')
+    case_stats = util.load_case_stats(data_dir)
+    brown_loc = path.join(data_dir, 'clusters')
+    cdef size_t start
+    cdef int end
+    with util.utf8open(brown_loc) as browns_file:
+        for i, line in enumerate(browns_file):
+            cluster_str, token_string, freq_str = line.split()
+            # Decode as a little-endian string, so that we can do & 15 to get
+            # the first 4 bits. See redshift._parse_features.pyx
+            cluster = int(cluster_str[::-1], 2)
+            upper_pc, title_pc = case_stats.get(token_string.lower(), (0.0, 0.0))
+            start = 0
+            end = -1
+            hashed = hash_string(token_string, len(token_string))
+
+            word = _add(vocab, bacov, find_split, hashed, token_string,
+                        len(token_string), len(token_string))
+
+
+cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
                                   unicode string) except *:
     cdef size_t length = len(string)
     cdef Py_UNICODE* characters = string
@@ -35,40 +60,54 @@ cdef vector[Lexeme_addr] tokenize(Vocab& vocab, dict bacov, Splitter splitter,
     cdef Py_UNICODE c
     cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
 
-    cdef unicode current = u''
+    cdef Py_UNICODE[1000] current
+    for i in range(1000):
+        current[i] = 0
+    cdef size_t word_len = 0
     cdef Lexeme* token
     for i in range(length):
         c = characters[i]
-        if is_whitespace(c):
-            if current:
-                token = lookup(vocab, bacov, splitter, -1, current)
+        if _is_whitespace(c):
+            if word_len != 0:
+                token = lookup(vocab, bacov, splitter, -1, current, word_len)
                 while token != NULL:
                     tokens.push_back(token)
                     token = token.tail
-                current = u''
+                for j in range(word_len+1):
+                    current[j] = 0
+                word_len = 0
         else:
-            current += c
-    if current:
-        token = lookup(vocab, bacov, splitter, -1, current)
+            current[word_len] = c
+            word_len += 1
+    if word_len != 0:
+        token = lookup(vocab, bacov, splitter, -1, current, word_len)
         while token != NULL:
             tokens.push_back(token)
             token = token.tail
     return tokens
 
+cdef inline bint _is_whitespace(Py_UNICODE c) nogil:
+    if c == ' ':
+        return True
+    elif c == '\n':
+        return True
+    elif c == '\t':
+        return True
+    else:
+        return False
 
-cdef Lexeme_addr lookup(Vocab& vocab, dict bacov, Splitter find_split, int start,
-                        unicode string) except 0:
+cdef Lexeme_addr lookup(Vocab* vocab, dict bacov, Splitter find_split, int start,
+                        Py_UNICODE* string, size_t length) except 0:
     '''Fetch a Lexeme representing a word string. If the word has not been seen,
    construct one, splitting off any attached punctuation or clitics.  A
    reference to BLANK_WORD is returned for the empty string.
 
    To specify the boundaries of the word if it has not been seen, use lookup_chunk.
    '''
-    if string == '':
+    if length == 0:
         return &BLANK_WORD
-    cdef size_t length = len(string)
     cdef StringHash hashed = hash_string(string, length)
-    cdef Lexeme* word_ptr = vocab[hashed]
+    cdef Lexeme* word_ptr = vocab[0][hashed]
     if word_ptr == NULL:
         start = find_split(string, length) if start == -1 else start
         word_ptr = _add(vocab, bacov, find_split, hashed, string, start, length)
@@ -84,9 +123,8 @@ cpdef vector[size_t] expand_chunk(size_t addr) except *:
     return tokens
 
 
-cdef StringHash hash_string(unicode s, size_t length) except 0:
+cdef StringHash hash_string(Py_UNICODE* s, size_t length) nogil:
     '''Hash unicode with MurmurHash64A'''
-    assert length
     return MurmurHash64A(s, length * sizeof(Py_UNICODE), 0)
 
 
@@ -95,11 +133,12 @@ cdef unicode unhash(dict bacov, StringHash hash_value):
     return bacov[hash_value]
 
 
-cdef Lexeme* _add(Vocab& vocab, dict bacov, Splitter find_split, StringHash hashed,
-                  unicode string, int split, size_t length) except NULL:
-    assert string
-    assert split <= length
+@cython.nonecheck(False)
+cdef Lexeme* _add(Vocab* vocab, dict bacov, Splitter find_split, StringHash hashed,
+                  unicode string, int split, size_t length):
     word = init_lexeme(vocab, bacov, find_split, string, hashed, split, length)
-    vocab[hashed] = word
+    vocab[0][hashed] = word
     bacov[hashed] = string
     return word
+
+
diff --git a/spacy/string_tools.pyx b/spacy/string_tools.pyx
index 437fc152a..5397fd647 100644
--- a/spacy/string_tools.pyx
+++ b/spacy/string_tools.pyx
@@ -1,3 +1,5 @@
+# cython: profile=True
+
 cpdef unicode substr(unicode string, int start, int end, size_t length):
     if end >= length:
         end = -1
diff --git a/spacy/util.py b/spacy/util.py
index 64dee8877..4e080d0b3 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -10,7 +10,7 @@ def utf8open(loc, mode='r'):
 
 
 def load_case_stats(data_dir):
-    case_loc = path.join(data_dir, 'english.case')
+    case_loc = path.join(data_dir, 'case')
     case_stats = {}
     with utf8open(case_loc) as cases_file:
         for line in cases_file:
@@ -42,46 +42,3 @@ def read_tokenization(lang):
             seen.add(chunk)
             entries.append((chunk, lex, pieces))
     return entries
-
-
-"""
-    def load_browns(self, data_dir):
-        cdef Lexeme* w
-        case_stats = load_case_stats(data_dir)
-        brown_loc = path.join(data_dir, 'bllip-clusters')
-        assert path.exists(brown_loc)
-        cdef size_t start
-        cdef int end
-        with utf8open(brown_loc) as browns_file:
-            for i, line in enumerate(browns_file):
-                cluster_str, word, freq_str = line.split()
-                # Decode as a little-endian string, so that we can do & 15 to get
-                # the first 4 bits. See redshift._parse_features.pyx
-                cluster = int(cluster_str[::-1], 2)
-                upper_pc, title_pc = case_stats.get(word.lower(), (0.0, 0.0))
-                start = 0
-                end = -1
-                find_slice(&start, &end, word)
-                print "Load", repr(word), start, end
-                w = init_word(word, start, end, cluster,
-                              upper_pc, title_pc, int(freq_str))
-                self.words[_hash_str(word)] = w
-                self.strings[w] = word
-
-    def load_clitics(self, data_dir):
-        cdef unicode orig_str
-        cdef unicode clitic
-        for orig_str, norm_form, clitic_strs in util.load_clitics(data_dir):
-            w = init_clitic(orig_str, self.lookup_slice(norm_form, 0, -1))
-            self.words[w.orig] = w
-            self.strings[w] = orig_str
-            assert len(clitic_strs) < MAX_CLITICS
-            assert clitic_strs
-            for i, clitic in enumerate(clitic_strs):
-                # If we write punctuation here, assume we want to keep it,
-                # so tell it the slice boundaries (the full string)
-                w.clitics[i] = self.lookup_slice(clitic, 0, -1)
-                # Ensure we null terminate
-                w.clitics[i+1] = 0
-
-"""

From 0575f16ade8e067543ef4cb261c93c1a55ecf8ae Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 07:37:29 +0200
Subject: [PATCH 2/4] * Upd requirements

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 029a6618e..f6629e024 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1 @@
 cython
-sparsehash

From 0074ae2fc0b82153139fed16bf77ab215b38faab Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 08:05:29 +0200
Subject: [PATCH 3/4] * Switch to dynamically allocating array, based on the document length

---
 spacy/spacy.pyx | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 1cc73ac3c..51ab59da2 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -1,6 +1,8 @@
 # cython: profile=True
 from __future__ import unicode_literals
 
+from libc.stdlib cimport calloc, free
+
 from ext.murmurhash cimport MurmurHash64A
 from ext.murmurhash cimport MurmurHash64B
 
@@ -60,9 +62,7 @@ cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
     cdef Py_UNICODE c
     cdef vector[Lexeme_addr] tokens = vector[Lexeme_addr]()
 
-    cdef Py_UNICODE[1000] current
-    for i in range(1000):
-        current[i] = 0
+    cdef Py_UNICODE* current = calloc(len(string), sizeof(Py_UNICODE))
     cdef size_t word_len = 0
     cdef Lexeme* token
     for i in range(length):
@@ -84,6 +84,7 @@ cdef vector[Lexeme_addr] tokenize(Vocab* vocab, dict bacov, Splitter splitter,
         while token != NULL:
             tokens.push_back(token)
             token = token.tail
+    free(current)
     return tokens
 
 cdef inline bint _is_whitespace(Py_UNICODE c) nogil:

From 6668e449614a2c29109e24bfa846a7d2402fb186 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 7 Jul 2014 08:15:44 +0200
Subject: [PATCH 4/4] * Whitespace

---
 spacy/spacy.pyx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index 51ab59da2..59cc2fd51 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -141,5 +141,3 @@ cdef Lexeme* _add(Vocab* vocab, dict bacov, Splitter find_split, StringHash hash
     vocab[0][hashed] = word
     bacov[hashed] = string
     return word
-
-
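
A note for readers of PATCH 1/4: the commit attributes the slowdown to "not quite grokking pass by reference in cython c++". The old code declared the sparsehash-backed Vocab by value (cdef Vocab VOCAB) and took Vocab& parameters; the fix heap-allocates it (new Vocab(100000)) and passes Vocab* everywhere, indexing through vocab[0][hashed]. As a rough illustration of why by-value C++ arguments are costly in Cython, here is a minimal sketch. It is not part of the patches: it uses libcpp.map in place of sparsehash's dense_hash_map, and the names Counts, total_by_value, and total_by_pointer are hypothetical.

    # distutils: language = c++
    # Illustrative sketch only -- not part of the patches above. libcpp.map
    # stands in for the dense_hash_map behind Vocab; all names are hypothetical.
    from libcpp.map cimport map as cpp_map
    from libcpp.pair cimport pair

    ctypedef cpp_map[size_t, size_t] Counts


    cdef size_t total_by_value(Counts counts):
        # The parameter is a C++ object taken by value, so every call runs the
        # map's copy constructor and duplicates the whole table -- the kind of
        # hidden cost the PATCH 1/4 commit message describes.
        cdef size_t total = 0
        cdef pair[size_t, size_t] item
        for item in counts:
            total += item.second
        return total


    cdef size_t total_by_pointer(Counts* counts):
        # Taking a pointer (as the patch does with Vocab*) copies nothing; the
        # callee works on the caller's map, and counts[0] dereferences it in
        # the same way as vocab[0][hashed] in the diff.
        cdef size_t total = 0
        cdef pair[size_t, size_t] item
        for item in counts[0]:
            total += item.second
        return total

The later change in PATCH 3/4 follows the same theme of making costs explicit: the fixed Py_UNICODE[1000] buffer becomes a calloc sized to the input string, freed before the function returns.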