mirror of https://github.com/explosion/spaCy.git
* Fix bytes problems for Python3
This commit is contained in:
parent
ce984f471c
commit
c4ff410fdb
|
@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
|
|||
|
||||
|
||||
cdef class BitArray:
|
||||
cdef bytes data
|
||||
cdef bytearray data
|
||||
cdef uchar byte
|
||||
cdef uchar bit_of_byte
|
||||
cdef uint32_t i
|
||||
|
|
|
@ -16,8 +16,8 @@ cdef Code bit_append(Code code, bint bit) nogil:
|
|||
|
||||
|
||||
cdef class BitArray:
|
||||
def __init__(self, bytes data=b''):
|
||||
self.data = data
|
||||
def __init__(self, data=b''):
|
||||
self.data = bytearray(data)
|
||||
self.byte = 0
|
||||
self.bit_of_byte = 0
|
||||
self.i = 0
|
||||
|
@ -47,7 +47,7 @@ cdef class BitArray:
|
|||
start_bit = self.i % 8
|
||||
|
||||
if start_bit != 0 and start_byte < len(self.data):
|
||||
byte = ord(self.data[start_byte])
|
||||
byte = self.data[start_byte]
|
||||
for i in range(start_bit, 8):
|
||||
self.i += 1
|
||||
yield 1 if (byte & (one << i)) else 0
|
||||
|
@ -70,10 +70,10 @@ cdef class BitArray:
|
|||
|
||||
# TODO portability
|
||||
cdef uchar[4] chars
|
||||
chars[0] = <uchar>ord(self.data[start_byte])
|
||||
chars[1] = <uchar>ord(self.data[start_byte+1])
|
||||
chars[2] = <uchar>ord(self.data[start_byte+2])
|
||||
chars[3] = <uchar>ord(self.data[start_byte+3])
|
||||
chars[0] = self.data[start_byte]
|
||||
chars[1] = self.data[start_byte+1]
|
||||
chars[2] = self.data[start_byte+2]
|
||||
chars[3] = self.data[start_byte+3]
|
||||
cdef uint32_t output
|
||||
memcpy(&output, chars, 4)
|
||||
self.i += 32
|
||||
|
@ -85,8 +85,7 @@ cdef class BitArray:
|
|||
byte = chr(self.byte)
|
||||
# Jump through some hoops for Python3
|
||||
if isinstance(byte, unicode):
|
||||
byte_char = <unsigned char>byte
|
||||
return self.data + <bytes>&byte_char
|
||||
return self.data + <bytes>(&self.byte)[:1]
|
||||
else:
|
||||
return self.data + chr(self.byte)
|
||||
else:
|
||||
|
@ -101,7 +100,7 @@ cdef class BitArray:
|
|||
self.bit_of_byte += 1
|
||||
self.i += 1
|
||||
if self.bit_of_byte == 8:
|
||||
self.data += chr(self.byte)
|
||||
self.data += bytearray((self.byte,))
|
||||
self.byte = 0
|
||||
self.bit_of_byte = 0
|
||||
|
||||
|
|
|
@ -110,14 +110,14 @@ cdef class HuffmanCodec:
|
|||
cdef int branch
|
||||
|
||||
cdef int n_msg = msg.shape[0]
|
||||
cdef bytes bytes_ = bits.as_bytes()
|
||||
cdef bytearray bytes_ = bits.as_bytes()
|
||||
cdef unsigned char byte
|
||||
cdef int i_msg = 0
|
||||
cdef int i_byte = bits.i // 8
|
||||
cdef unsigned char i_bit = 0
|
||||
cdef unsigned char one = 1
|
||||
while i_msg < n_msg:
|
||||
byte = ord(bytes_[i_byte])
|
||||
byte = bytes_[i_byte]
|
||||
i_byte += 1
|
||||
for i_bit in range(8):
|
||||
branch = node.right if (byte & (one << i_bit)) else node.left
|
||||
|
@ -138,11 +138,11 @@ cdef class HuffmanCodec:
|
|||
def __get__(self):
|
||||
output = []
|
||||
cdef int i, j
|
||||
cdef bytes string
|
||||
cdef unicode string
|
||||
cdef Code code
|
||||
for i in range(self.codes.size()):
|
||||
code = self.codes[i]
|
||||
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
|
||||
string = '{0:b}'.format(code.bits).rjust(code.length, '0')
|
||||
string = string[::-1]
|
||||
output.append(string)
|
||||
return output
|
||||
|
|
|
@ -66,7 +66,7 @@ def _gen_orths(Vocab vocab):
|
|||
def _gen_chars(Vocab vocab):
|
||||
cdef attr_t orth
|
||||
cdef size_t addr
|
||||
char_weights = {chr(i): 1e-20 for i in range(256)}
|
||||
char_weights = {i: 1e-20 for i in range(256)}
|
||||
cdef unicode string
|
||||
cdef bytes char
|
||||
cdef bytes utf8_str
|
||||
|
@ -75,9 +75,9 @@ def _gen_chars(Vocab vocab):
|
|||
string = vocab.strings[lex.orth]
|
||||
utf8_str = string.encode('utf8')
|
||||
for char in utf8_str:
|
||||
char_weights.setdefault(char, 0.0)
|
||||
char_weights[char] += c_exp(lex.prob)
|
||||
char_weights[b' '] += c_exp(lex.prob)
|
||||
char_weights.setdefault(ord(char), 0.0)
|
||||
char_weights[ord(char)] += c_exp(lex.prob)
|
||||
char_weights[ord(' ')] += c_exp(lex.prob)
|
||||
return char_weights.items()
|
||||
|
||||
|
||||
|
@ -110,12 +110,12 @@ cdef class Packer:
|
|||
codec.encode(array[:, i], bits)
|
||||
return bits.as_bytes()
|
||||
|
||||
def unpack(self, bytes data):
|
||||
def unpack(self, data):
|
||||
doc = Doc(self.vocab)
|
||||
self.unpack_into(data, doc)
|
||||
return doc
|
||||
|
||||
def unpack_into(self, bytes byte_string, Doc doc):
|
||||
def unpack_into(self, byte_string, Doc doc):
|
||||
bits = BitArray(byte_string)
|
||||
bits.seek(0)
|
||||
cdef int32_t length = bits.read32()
|
||||
|
@ -149,7 +149,7 @@ cdef class Packer:
|
|||
cdef int32_t length = len(utf8_str)
|
||||
# Signal chars with negative length
|
||||
bits.extend(-length, 32)
|
||||
self.char_codec.encode(utf8_str, bits)
|
||||
self.char_codec.encode(bytearray(utf8_str), bits)
|
||||
cdef int i, j
|
||||
for i in range(doc.length):
|
||||
for j in range(doc.data[i].lex.length-1):
|
||||
|
@ -167,7 +167,7 @@ cdef class Packer:
|
|||
spaces = iter(bits)
|
||||
for i in range(n):
|
||||
orth = orths[i]
|
||||
space = spaces.next()
|
||||
space = next(spaces)
|
||||
lex = self.vocab.get_by_orth(doc.mem, orth)
|
||||
doc.push_back(lex, space)
|
||||
return doc
|
||||
|
|
Loading…
Reference in New Issue