From fb54052ae09ebe7f1aa3d0356ad19b5fab5f7ad0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 16 Jul 2015 17:46:46 +0200 Subject: [PATCH] * Work on serializer design --- spacy/serialize.pyx | 111 +++++++++++++++++++++++--------------------- 1 file changed, 59 insertions(+), 52 deletions(-) diff --git a/spacy/serialize.pyx b/spacy/serialize.pyx index 1ea58092d..a354203d4 100644 --- a/spacy/serialize.pyx +++ b/spacy/serialize.pyx @@ -67,7 +67,6 @@ cdef class BitArray: def append(self, bint bit): cdef uint64_t one = 1 - print 'append', bit if bit: self.byte |= one << self.bit_of_byte else: @@ -95,10 +94,11 @@ cdef class BitArray: cdef class Serializer: # Manage codecs, maintain consistent format for io - def __init__(self, Vocab vocab, model_dir): - self.vocab = vocab - self.lex = None - self.codecs = [] + def __init__(self, Vocab vocab, data_dir): + model_dir = path.join(data_dir, 'bitter') + self.vocab = vocab # Vocab owns the word codec, the big one + self.cfg = Config.read(model_dir, 'config') + self.codecs = tuple([CodecWrapper(attr) for attr in self.cfg.attrs]) def __call__(self, doc_or_bits): if isinstance(doc_or_bits, Doc): @@ -109,66 +109,67 @@ cdef class Serializer: raise ValueError(doc_or_bits) def train(self, doc): - array = doc.to_array(self.attrs) - for i, attr in enumerate(self.attrs): - for j in range(doc.length): - self.freqs[attr].inc(array[i, j], 1) - self.freqs[attr].inc(self.eol, 1) + array = doc.to_array([codec.id for codec in self.codecs]) + for i, codec in enumerate(self.codecs): + codec.count(array[i]) def serialize(self, doc): bits = BitArray() array = doc.to_array(self.attrs) - for i, attr in enumerate(self.attrs, self.codecs): + for i, codec in enumerate(self.codecs): codec.encode(array[i,], bits) return bits @cython.boundscheck(False) def deserialize(self, bits): - cdef Doc doc = Doc(self.vocab) biterator = iter(bits) - ids = self.codecs[0].decode(bits) + cdef Doc doc = Doc(self.vocab) + ids = self.vocab.codec.decode(biterator) cdef int id_ cdef bint is_spacy for id_ in ids: is_spacy = biterator.next() doc.push_back(vocab.lexemes.at(id_), is_spacy) - + cdef int length = doc.length - cdef int i - cdef attr_t value - cdef attr_id_t attr_id - cdef attr_t[:] values - cdef TokenC* tokens = doc.data - for codec in vocab.codecs[1:]: - values = codec.decode(biterator) - attr_id = codec.id - if attr_id == HEAD: - for i in range(length): - tokens[i].head = values[i] - elif attr_id == TAG: - for i in range(length): - tokens[i].tag = values[i] - elif attr_id == DEP: - for i in range(length): - tokens[i].dep = values[i] - elif attr_id == ENT_IOB: - for i in range(length): - tokens[i].ent_iob = values[i] - elif attr_id == ENT_TYPE: - for i in range(length): - tokens[i].ent_type = values[i] + array = numpy.zeros(shape=(length, len(self.codecs)), dtype=numpy.int) + for i, codec in enumerate(self.codecs): + array[i] = codec.decode(biterator) + doc.from_array([c.id for c in self.codecs], array) return doc - def lex_codec(self): - cdef Address mem - cdef int i - cdef float[:] cv_probs - mem = Address(len(self), sizeof(float)) - probs = mem.ptr - for i in range(len(self.vocab)): - probs[i] = c_exp(self.lexemes[i].prob) - cv_probs = probs - return HuffmanCodec(cv_probs, 0, id=ID) + +cdef class AttributeEncoder: + """Wrapper around HuffmanCodec""" + def __init__(self, freqs, id=0): + cdef uint64_t key + cdef uint64_t count + cdef pair[uint64_t] item + cdef priority_queue[pair[uint64_t]] items + for key, count in freqs: + item.first = count + item.second = key + items.push(item) + + weights = array('f') + keys = array('i') + key_to_i = PreshMap() + i = 0 + while not items.empty(): + item = items.top() + weights.append(item.first) + keys.append(item.second) + key_to_i[item.second] = i + i += 1 + items.pop() + + def encode(self, symbols): + indices = [self.table[symbol] for symbol in symbols] + return self._codec.encode(indices) + + def decode(self, bits): + indices = self._codec.decode(bits) + return [self.symbols[i] for i in indices] cdef class HuffmanCodec: @@ -182,19 +183,17 @@ cdef class HuffmanCodec: the EOL symbol in your message. Arguments: - probs (float[:]): A descending-sorted sequence of probabilities/weights. + weights (float[:]): A descending-sorted sequence of probabilities/weights. Must include a weight for an EOL symbol. eol (uint32_t): The index of the weight of the EOL symbol. """ - def __init__(self, float[:] probs, uint32_t eol, id=0): - self.id = id - self.eol = eol + def __init__(self, float[:] weights, unt32_t eol): self.codes.resize(len(probs)) for i in range(len(self.codes)): self.codes[i].bits = 0 self.codes[i].length = 0 - populate_nodes(self.nodes, probs) + populate_nodes(self.nodes, weights) cdef Code path path.bits = 0 path.length = 0 @@ -270,6 +269,7 @@ cdef int populate_nodes(vector[Node]& nodes, float[:] probs) except -1: return 0 cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil: + """Introduce a new non-terminal, over two non-terminals)""" cdef Node node node.left = j node.right = j+1 @@ -278,6 +278,7 @@ cdef int _cover_two_nodes(vector[Node]& nodes, int j) nogil: cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float prob) nogil: + """Introduce a new non-terminal, over one terminal and one non-terminal.""" cdef Node node # Encode leaves as negative integers, where the integer is the index of the # word in the vocabulary. @@ -295,6 +296,7 @@ cdef int _cover_one_word_one_node(vector[Node]& nodes, int j, int id_, float pro cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nogil: + """Introduce a new node, over two non-terminals.""" cdef Node node node.left = -(id1+1) node.right = -(id2+1) @@ -303,6 +305,11 @@ cdef int _cover_two_words(vector[Node]& nodes, int id1, int id2, float prob) nog cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1: + """Recursively assign paths, from the top down. At the end, the entry codes[i] + knows the bit-address of the node[j] that points to entry i in the vocabulary. + So, to encode i, we go to codes[i] and read its bit-string. To decode, we + navigate nodes recursively. + """ cdef Code left_path = bit_append(path, 0) cdef Code right_path = bit_append(path, 1)