mirror of https://github.com/explosion/spaCy.git
* Clean up encoder a bit. Now ready for integration into Vocab.
This commit is contained in:
parent
8d0f1d98da
commit
083b6ea7ae
|
@ -43,6 +43,37 @@ cdef Code bit_append(Code code, bint bit) nogil:
|
||||||
return code
|
return code
|
||||||
|
|
||||||
|
|
||||||
|
cdef class BitArray:
    """Accumulate individual bits into a byte string.

    Bits are staged in ``byte`` starting at its least-significant position.
    Once eight bits have been collected, the finished byte is appended to
    ``data`` and staging restarts from an all-zero byte.
    """
    cdef int length  # Declared but not read or written by this class.
    cdef bytes data  # Bytes that have been completely filled.
    cdef unsigned char byte  # Byte currently being filled.
    cdef unsigned char bit_of_byte  # Position of the next bit in ``byte``.

    def __init__(self):
        self.data = b''
        self.byte = 0
        self.bit_of_byte = 0

    def as_bytes(self):
        """Return everything written so far as a byte string.

        If the last byte is only partially filled it is still included; its
        unwritten high bits are zero.
        """
        if self.bit_of_byte != 0:
            # chr() yields a unicode string on Python 3, which cannot be
            # concatenated with bytes. Going through bytearray produces a
            # one-byte bytes object on both Python 2 and Python 3.
            return self.data + bytes(bytearray((self.byte,)))
        return self.data

    cdef int extend(self, uint64_t code, char n_bits) except -1:
        """Append the lowest ``n_bits`` bits of ``code``, lowest bit first.

        A non-positive ``n_bits`` appends nothing.

        Returns 0 on success. Raises ValueError if ``n_bits`` exceeds 64,
        the width of ``code``.
        """
        cdef uint64_t one = 1
        cdef unsigned char bit_of_code
        if n_bits > 64:
            # Shifting a 64-bit value by 64 or more is undefined in C.
            raise ValueError(
                "n_bits must be at most 64, got %d" % n_bits)
        for bit_of_code in range(n_bits):
            if code & (one << bit_of_code):
                self.byte |= one << self.bit_of_byte
            else:
                self.byte &= ~(one << self.bit_of_byte)
            self.bit_of_byte += 1
            if self.bit_of_byte == 8:
                # Byte is full: flush it and start a fresh one.
                self.data += bytes(bytearray((self.byte,)))
                self.byte = 0
                self.bit_of_byte = 0
        return 0
|
||||||
|
|
||||||
|
|
||||||
cdef class HuffmanCodec:
|
cdef class HuffmanCodec:
|
||||||
"""Create a Huffman code table, and use it to pack and unpack sequences into
|
"""Create a Huffman code table, and use it to pack and unpack sequences into
|
||||||
byte strings. Emphasis is on efficiency, so API is quite strict:
|
byte strings. Emphasis is on efficiency, so API is quite strict:
|
||||||
|
@ -75,27 +106,11 @@ cdef class HuffmanCodec:
|
||||||
assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
|
assign_codes(self.nodes, self.codes, len(self.nodes) - 1, path)
|
||||||
|
|
||||||
def encode(self, uint32_t[:] sequence):
    """Pack ``sequence`` into a byte string using this codec's code table.

    The code of each item in ``sequence`` is written in order, followed by
    the code stored for ``self.eol``. A trailing, partially filled byte is
    included in the result.
    """
    cdef BitArray packed = BitArray()
    cdef Code code
    for symbol in sequence:
        code = self.codes[symbol]
        packed.extend(code.bits, code.length)
    # Terminate the stream with the code for ``self.eol``.
    code = self.codes[self.eol]
    packed.extend(code.bits, code.length)
    return packed.as_bytes()
|
|
||||||
|
|
||||||
def decode(self, bytes data):
|
def decode(self, bytes data):
|
||||||
node = self.nodes.back()
|
node = self.nodes.back()
|
||||||
|
|
Loading…
Reference in New Issue