spaCy/spacy/serialize/bits.pyx

121 lines
3.6 KiB
Cython
Raw Normal View History

2015-07-23 17:36:21 +00:00
from __future__ import unicode_literals
from libc.string cimport memcpy
# Note that we're setting the most significant bits here first, when in practice
# we're actually wanting the last bit to be most significant (for Huffman coding,
# anyway).
cdef Code bit_append(Code code, bint bit) nogil:
    # Return a copy of `code` with `bit` written at position `code.length`
    # and the length bumped by one. `code` is received by value, so the
    # caller's struct is only updated through the returned copy.
    cdef uint64_t mask = (<uint64_t>1) << code.length
    if bit:
        code.bits |= mask
    else:
        # Clear explicitly rather than assume the slot is already zero.
        code.bits &= ~mask
    code.length += 1
    return code
cdef class BitArray:
    """Growable sequence of bits with a read/write cursor.

    Completed bytes are kept in ``self.data`` (a bytearray). Bits that do not
    yet fill a whole byte are staged in ``self.byte``, and ``self.bit_of_byte``
    counts how many bits are staged. Within every byte, bit 0 (the least
    significant) is the first bit in sequence order.

    ``self.i`` is a bit offset: ``append``/``extend`` advance it by one per
    bit written, ``seek`` sets it, and ``__iter__``/``read32`` read from it.

    NOTE(review): the attribute and cdef-method declarations are not in this
    file (presumably a matching .pxd) -- the cdef signatures below must stay
    in sync with them.
    """

    def __init__(self, data=b''):
        """Start from the whole bytes in `data`, with the cursor at bit 0."""
        self.data = bytearray(data)
        self.byte = 0
        self.bit_of_byte = 0
        self.i = 0

    def __len__(self):
        """Return the number of bits stored (completed bytes plus staged bits)."""
        return 8 * len(self.data) + self.bit_of_byte

    def __str__(self):
        """Return every stored bit as a string of '0'/'1', in sequence order."""
        cdef uchar one = 1
        cdef uchar byte
        cdef int j
        chars = []
        # Iterating a bytearray yields ints, so no ord() is needed, and there
        # is no narrow index variable that could wrap on large buffers.
        for byte in self.data:
            for j in range(8):
                chars.append('1' if (byte & (one << j)) else '0')
        # The trailing, not-yet-complete bits live in self.byte, not in data.
        byte = self.byte
        for j in range(self.bit_of_byte):
            chars.append('1' if (byte & (one << j)) else '0')
        return ''.join(chars)

    def seek(self, i):
        """Move the cursor to bit offset `i`."""
        self.i = i

    def __iter__(self):
        """Yield bits (0 or 1) from the cursor onwards.

        The cursor ``self.i`` is advanced by one for every bit yielded, so a
        fresh iteration continues where the previous one stopped; call
        ``seek(0)`` first to iterate from the beginning.
        """
        cdef uchar byte, i
        cdef uchar one = 1
        start_byte = self.i // 8
        start_bit = self.i % 8
        # Cursor is in the middle of a completed byte: drain the rest of it.
        if start_bit != 0 and start_byte < len(self.data):
            byte = self.data[start_byte]
            for i in range(start_bit, 8):
                self.i += 1
                yield 1 if (byte & (one << i)) else 0
            start_byte += 1
            start_bit = 0
        # Remaining completed bytes.
        for byte in self.data[start_byte:]:
            for i in range(8):
                self.i += 1
                yield 1 if byte & (one << i) else 0
        # Staged bits. start_bit is still non-zero here only when the cursor
        # was already positioned inside the staged byte.
        if self.bit_of_byte != 0:
            for i in range(start_bit, self.bit_of_byte):
                self.i += 1
                yield 1 if self.byte & (one << i) else 0

    cdef uint32_t read32(self) except 0:
        # Read four consecutive bytes of self.data, starting at the byte that
        # contains the cursor, as one uint32, then advance the cursor 32 bits.
        #
        # Only self.i // 8 is used: any sub-byte offset of the cursor is
        # ignored, and bits staged in self.byte are never consulted.
        #
        # NOTE(review): the signature is `except 0`, so a stored value of 0
        # collides with the error sentinel. Switching to `except? 0` needs the
        # matching declaration outside this file to change too -- confirm.
        cdef int start_byte = self.i // 8
        cdef uint32_t output
        # TODO portability: memcpy reinterprets the bytes in host byte order.
        cdef uchar[4] chars
        chars[0] = self.data[start_byte]
        chars[1] = self.data[start_byte+1]
        chars[2] = self.data[start_byte+2]
        chars[3] = self.data[start_byte+3]
        memcpy(&output, chars, 4)
        self.i += 32
        return output

    def as_bytes(self):
        """Return the contents as a new bytearray.

        If some bits are staged, the partially filled byte is included as the
        final byte. The result is always a fresh object, so mutating it cannot
        corrupt this BitArray.
        """
        if self.bit_of_byte != 0:
            return self.data + bytearray((self.byte,))
        return bytearray(self.data)

    def append(self, bint bit):
        """Append a single bit and advance the cursor by one."""
        # Same packing logic as extend(); delegate so the two cannot diverge.
        self.extend(1 if bit else 0, 1)

    cdef int extend(self, uint64_t code, char n_bits) except -1:
        # Append the low `n_bits` bits of `code`, least-significant bit first,
        # advancing the cursor by one per bit.
        #
        # Raises ValueError if n_bits > 64: `code` only holds 64 bits, and a
        # larger shift would be undefined behaviour in C.
        cdef uint64_t one = 1
        cdef unsigned char bit_of_code
        if n_bits > 64:
            raise ValueError("n_bits must be at most 64, got %d" % n_bits)
        for bit_of_code in range(n_bits):
            if code & (one << bit_of_code):
                self.byte |= one << self.bit_of_byte
            else:
                self.byte &= ~(one << self.bit_of_byte)
            self.bit_of_byte += 1
            if self.bit_of_byte == 8:
                # Staged byte is full: move it into data and start a new one.
                self.data.append(self.byte)
                self.byte = 0
                self.bit_of_byte = 0
            self.i += 1