2015-07-16 23:19:29 +00:00
|
|
|
from libc.stdint cimport uint32_t
|
|
|
|
from libc.stdint cimport uint64_t
|
|
|
|
from libc.math cimport exp as c_exp
|
|
|
|
from libcpp.queue cimport priority_queue
|
|
|
|
from libcpp.pair cimport pair
|
|
|
|
|
|
|
|
from cymem.cymem cimport Address, Pool
|
|
|
|
from preshed.maps cimport PreshMap
|
|
|
|
|
2015-07-17 14:38:29 +00:00
|
|
|
from ..attrs cimport ID, ORTH, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
2015-07-16 23:19:29 +00:00
|
|
|
from ..tokens.doc cimport Doc
|
|
|
|
from ..vocab cimport Vocab
|
|
|
|
from ..typedefs cimport attr_t
|
|
|
|
from .bits cimport BitArray
|
|
|
|
from .huffman cimport HuffmanCodec
|
|
|
|
|
|
|
|
from os import path
|
|
|
|
import numpy
|
|
|
|
|
|
|
|
cimport cython
|
|
|
|
|
|
|
|
|
|
|
|
# Format
# - Total number of bytes in message (32 bit int) --- handled outside this module
# - Number of words (32 bit int)
# - Words, terminating in an EOL symbol, Huffman coded, ~12 bits per word
# - Spaces, 1 bit per word
# - Attributes:
#     POS tag
#     Head offset
#     Dep label
#     Entity IOB
#     Entity tag
|
|
|
|
|
|
|
|
|
|
|
|
def make_vocab_codec(Vocab vocab):
    """Build a HuffmanCodec whose weights come from the vocabulary.

    Entry ``i`` of the weight buffer is ``exp(vocab.lexemes[i].prob)``, so
    symbol ``i`` of the returned codec corresponds to ``vocab.lexemes[i]``.
    (Presumably ``prob`` is a log-probability -- confirm against Vocab.)
    """
    cdef int n_lexemes = len(vocab)
    cdef Address buffer = Address(n_lexemes, sizeof(float))
    cdef float* weights = <float*>buffer.ptr
    cdef float[:] weights_view
    cdef int idx
    for idx in range(n_lexemes):
        weights[idx] = <float>c_exp(vocab.lexemes[idx].prob)
    # NOTE(review): this view borrows the memory held by `buffer`, which is
    # a local -- confirm HuffmanCodec does not keep the view after __init__.
    weights_view = <float[:n_lexemes]>weights
    return HuffmanCodec(weights_view)
|
|
|
|
|
|
|
|
|
|
|
|
cdef class _BinaryCodec:
    """Codec that copies message values straight to and from a bit array.

    Packer uses it for the SPACY attribute.
    """
    def encode(self, attr_t[:] msg, BitArray bits):
        """Append every element of `msg` to `bits`, in order."""
        cdef int n_values = len(msg)
        cdef int pos = 0
        while pos < n_values:
            bits.append(msg[pos])
            pos += 1

    def decode(self, bits, attr_t[:] msg):
        """Fill `msg` in place, one `bits.next()` value per slot."""
        for slot in range(len(msg)):
            msg[slot] = bits.next()
|
2015-07-16 23:19:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
cdef class _AttributeCodec:
    """Huffman codec for one attribute, built from value frequencies.

    Attribute values are ranked by descending frequency. Value -> rank is
    kept in `_map`, rank -> value in `_keys`, and the ranks are what the
    wrapped HuffmanCodec actually encodes.
    """
    cdef Pool mem
    cdef attr_t* _keys
    # A dict rather than a PreshMap: the most frequent value has rank 0, and
    # a map that cannot tell a stored 0 from "missing" would lose that entry.
    cdef dict _map
    cdef HuffmanCodec _codec

    def __init__(self, freqs):
        """Build the rank tables and the Huffman codec.

        freqs: sized iterable yielding (value, count) pairs; `len(freqs)`
            sets the table sizes.
        """
        cdef uint64_t key
        cdef uint64_t count
        cdef pair[uint64_t, uint64_t] item
        cdef priority_queue[pair[uint64_t, uint64_t]] items
        cdef int i = 0

        self.mem = Pool()
        for key, count in freqs:
            # Count goes first so the priority queue orders by frequency.
            item.first = count
            item.second = key
            items.push(item)

        weights = numpy.ndarray(shape=(len(freqs),), dtype=numpy.float32)
        self._keys = <attr_t*>self.mem.alloc(len(freqs), sizeof(attr_t))
        self._map = {}
        # Popping the max-heap yields values from most to least frequent,
        # so rank 0 is the most common value.
        while not items.empty():
            item = items.top()
            self._keys[i] = item.second
            weights[i] = item.first
            self._map[self._keys[i]] = i
            items.pop()
            i += 1
        self._codec = HuffmanCodec(weights)

    def encode(self, attr_t[:] msg, BitArray dest):
        """Encode `msg` into `dest`.

        `msg` is overwritten in place with each value's rank before being
        handed to the Huffman codec.

        Raises KeyError if `msg` holds a value that was not in `freqs`.
        """
        cdef int i
        for i in range(len(msg)):
            msg[i] = <attr_t>self._map[msg[i]]
        self._codec.encode(msg, dest)

    def decode(self, BitArray bits, attr_t[:] dest):
        """Decode `len(dest)` values from `bits` into `dest`.

        The Huffman codec writes ranks into `dest`; each rank is then
        replaced by the attribute value it stands for.
        """
        cdef int i
        self._codec.decode(bits, dest)
        for i in range(len(dest)):
            dest[i] = <attr_t>self._keys[dest[i]]
|
2015-07-16 23:19:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
cdef class Packer:
    """Pack a Doc's attribute arrays into a bit array and back.

    One codec is held per attribute, in the same order as `self.attrs`.
    """
    def __init__(self, Vocab vocab, list_of_attr_freqs):
        """Create one codec per attribute.

        vocab: vocabulary used for the ORTH codec and when rebuilding docs.
        list_of_attr_freqs: iterable of (attr, freqs) pairs. ORTH gets a
            vocabulary-wide Huffman codec, SPACY a _BinaryCodec, and any
            other attribute an _AttributeCodec built from its `freqs`.
        """
        self.vocab = vocab
        self.attrs = []
        codecs = []
        for attr, freqs in list_of_attr_freqs:
            if attr == ORTH:
                codecs.append(make_vocab_codec(vocab))
            elif attr == SPACY:
                codecs.append(_BinaryCodec())
            else:
                codecs.append(_AttributeCodec(freqs))
            self.attrs.append(attr)
        self._codecs = tuple(codecs)

    def pack(self, Doc doc):
        """Serialize `doc` and return the resulting BitArray.

        The number of rows of `doc.to_array(self.attrs)` is written first
        as a 32-bit count, followed by each attribute encoded by its codec.
        """
        cdef BitArray bits = BitArray()
        cdef uint32_t length
        # Assumes to_array gives one row per token and one column per
        # attribute, so column i belongs to codec i -- TODO confirm.
        array = doc.to_array(self.attrs)
        length = len(array)
        bits.extend(length, 32)
        for i, codec in enumerate(self._codecs):
            codec.encode(array[:, i], bits)
        return bits

    def unpack(self, bits):
        """Rebuild a Doc from bits produced by `pack`.

        Reads the 32-bit count, lets each codec fill its column in place,
        and passes the array to Doc.from_array.
        """
        cdef uint32_t length = bits.read(32)
        # The decode methods take attr_t[:] buffers, so derive the dtype
        # from attr_t itself (width and signedness) instead of guessing.
        cdef attr_t probe = <attr_t>-1
        kind = 'i' if probe < 0 else 'u'
        attr_dtype = numpy.dtype('%s%d' % (kind, sizeof(attr_t)))
        array = numpy.zeros(shape=(length, len(self._codecs)), dtype=attr_dtype)
        for i, codec in enumerate(self._codecs):
            # decode() writes into the buffer it is given; it returns nothing.
            codec.decode(bits, array[:, i])
        return Doc.from_array(self.vocab, self.attrs, array)
|