Remove spacy.serialize

This commit is contained in:
Matthew Honnibal 2017-05-09 17:22:06 +02:00
parent 62ecdea9f2
commit b53f7dfdc3
8 changed files with 0 additions and 552 deletions

View File

@ -1,23 +0,0 @@
from libc.stdint cimport uint64_t
from libc.stdint cimport int32_t, uint32_t
ctypedef unsigned char uchar
# A variable-length bit string packed into a single 64-bit word.
cdef struct Code:
    # Bit payload. bit_append() stores the k-th appended bit at bit position k.
    uint64_t bits
    # Number of bits of `bits` that are in use.
    char length
# Return a copy of `code` with `bit` written at position `code.length` and
# `length` increased by one. Declared nogil, so it never touches Python objects.
cdef Code bit_append(Code code, bint bit) nogil
# Bit buffer made of completed bytes plus one partially filled byte.
cdef class BitArray:
    # Bytes that have been completely filled (or supplied to the constructor).
    cdef bytearray data
    # The byte currently being filled; only its low `bit_of_byte` bits are set.
    cdef uchar byte
    # How many bits of `byte` have been written so far (0..7).
    cdef uchar bit_of_byte
    # Bit cursor: advanced by append/extend when writing and by iteration and
    # read32 when reading; repositioned with seek().
    cdef uint32_t i

    # Append the low `n_bits` bits of `code`, least-significant bit first.
    cdef int extend(self, uint64_t code, char n_bits) except -1
    # Read four bytes of `data` at the cursor's byte position and advance the
    # cursor by 32 bits.
    # NOTE(review): `except 0` makes a return value of 0 the error sentinel --
    # confirm callers never expect to read a stored 0.
    cpdef int32_t read32(self) except 0

View File

@ -1,120 +0,0 @@
from __future__ import unicode_literals
from libc.string cimport memcpy
# Bit order: the k-th appended bit is stored at bit position k of `code.bits`,
# so the first bit appended ends up least significant and the most recently
# appended bit is the most significant one in use.
cdef Code bit_append(Code code, bint bit) nogil:
    """Return `code` extended by one bit.

    `code` is passed by value, so the caller's struct is not modified; the
    updated copy is returned.
    """
    cdef uint64_t mask = <uint64_t>1 << code.length
    if bit:
        code.bits |= mask
    else:
        # Clear explicitly in case the slot holds a stale 1.
        code.bits &= ~mask
    code.length += 1
    return code
cdef class BitArray:
    """Bit buffer: completed bytes live in `data`, pending bits in `byte`.

    Bits are written least-significant-bit first inside each byte. Once eight
    bits have been written the byte is moved to `data` and a fresh one is
    started. `i` is a bit cursor shared by the write methods (`append`,
    `extend`) and the read methods (`__iter__`, `read32`); `seek` repositions
    it.
    """
    def __init__(self, data=b''):
        """Start from the bytes in `data` (copied), with no pending bits."""
        self.data = bytearray(data)
        self.byte = 0
        self.bit_of_byte = 0
        self.i = 0

    def __len__(self):
        """Return the number of bits stored, including pending ones."""
        return 8 * len(self.data) + self.bit_of_byte

    def __str__(self):
        """Render every stored bit as '0'/'1', in write order.

        Returns text rather than bytes so that `str()` works on Python 3.
        """
        cdef uchar one = 1
        cdef uchar byte
        cdef int j
        chars = []
        for value in self.data:
            # Elements of a bytearray are already integers; no ord() needed.
            byte = value
            for j in range(8):
                chars.append('1' if (byte & (one << j)) else '0')
        # The trailing bits live in the pending byte, not in the last full one.
        byte = self.byte
        for j in range(self.bit_of_byte):
            chars.append('1' if (byte & (one << j)) else '0')
        return ''.join(chars)

    def seek(self, i):
        """Move the bit cursor to absolute bit position `i`."""
        self.i = i

    def __iter__(self):
        """Yield bits (0 or 1) from the cursor onward, advancing the cursor.

        The cursor is incremented before each bit is yielded, so a consumer
        that stops early leaves `i` pointing just past the last bit it took.
        """
        cdef uchar byte, i
        cdef uchar one = 1
        start_byte = self.i // 8
        start_bit = self.i % 8

        # Finish the byte the cursor is in the middle of, if it is a full one.
        if start_bit != 0 and start_byte < len(self.data):
            byte = self.data[start_byte]
            for i in range(start_bit, 8):
                self.i += 1
                yield 1 if (byte & (one << i)) else 0
            start_byte += 1
            start_bit = 0

        # Whole bytes.
        for byte in self.data[start_byte:]:
            for i in range(8):
                self.i += 1
                yield 1 if byte & (one << i) else 0

        # Pending bits that have not been flushed to `data` yet. start_bit is
        # still non-zero here only if the cursor began inside the pending byte.
        if self.bit_of_byte != 0:
            byte = self.byte
            for i in range(start_bit, self.bit_of_byte):
                self.i += 1
                yield 1 if byte & (one << i) else 0

    cpdef int32_t read32(self) except 0:
        """Read four bytes of `data` at the cursor and advance it by 32 bits.

        The read starts at byte index `i // 8`; any sub-byte offset of the
        cursor is ignored, and the pending byte is never consulted. The four
        bytes are copied verbatim into a 32-bit integer, so the result depends
        on the platform's byte order. Indexing past the end of `data` raises.

        NOTE(review): the signature is `except 0`, so a decoded value of 0
        collides with the error sentinel -- confirm no caller stores 0.
        """
        cdef int start_byte = self.i // 8

        # TODO portability
        cdef uchar[4] chars
        chars[0] = self.data[start_byte]
        chars[1] = self.data[start_byte+1]
        chars[2] = self.data[start_byte+2]
        chars[3] = self.data[start_byte+3]
        cdef uint32_t output
        memcpy(&output, chars, 4)
        self.i += 32
        return output

    def as_bytes(self):
        """Return the buffer as a bytearray, pending bits included.

        When there are pending bits a new bytearray is returned with the
        partial byte appended (unused high bits are zero). Otherwise `data`
        itself is returned, not a copy.
        """
        if self.bit_of_byte != 0:
            # Same single-byte construction that append() uses when flushing;
            # works identically on Python 2 and 3.
            return self.data + bytearray((self.byte,))
        return self.data

    def append(self, bint bit):
        """Write one bit at the end of the buffer and advance the cursor."""
        cdef uint64_t one = 1
        if bit:
            self.byte |= one << self.bit_of_byte
        else:
            self.byte &= ~(one << self.bit_of_byte)
        self.bit_of_byte += 1
        self.i += 1
        if self.bit_of_byte == 8:
            self.data += bytearray((self.byte,))
            self.byte = 0
            self.bit_of_byte = 0

    cdef int extend(self, uint64_t code, char n_bits) except -1:
        """Write the low `n_bits` bits of `code`, least-significant first.

        The cursor advances by one for every bit written.
        """
        cdef uint64_t one = 1
        cdef unsigned char bit_of_code
        for bit_of_code in range(n_bits):
            if code & (one << bit_of_code):
                self.byte |= one << self.bit_of_byte
            else:
                self.byte &= ~(one << self.bit_of_byte)
            self.bit_of_byte += 1
            if self.bit_of_byte == 8:
                # Flush exactly the way append() does.
                self.data += bytearray((self.byte,))
                self.byte = 0
                self.bit_of_byte = 0
            self.i += 1

View File

@ -1,24 +0,0 @@
from libcpp.vector cimport vector
from libc.stdint cimport uint32_t
from libc.stdint cimport int64_t
from libc.stdint cimport int32_t
from libc.stdint cimport uint64_t
from .bits cimport BitArray, Code
# Internal node of the Huffman tree. Each field is a child reference:
# a value >= 0 is an index into the codec's `nodes` vector, a negative value
# refers to leaf number -(value + 1).
cdef struct Node:
    # Child followed when the next bit is 0.
    int32_t left
    # Child followed when the next bit is 1.
    int32_t right
# Huffman encoder/decoder built from (symbol, weight) pairs.
cdef class HuffmanCodec:
    # Internal tree nodes, in the order they were created.
    cdef vector[Node] nodes
    # Bit code of each leaf, indexed by leaf number.
    cdef vector[Code] codes
    # Node at which decoding of every symbol starts.
    cdef Node root
    # Symbols, indexed by leaf number.
    cdef readonly list leaves
    # Symbol -> leaf number.
    cdef readonly dict _map

    # Append the codes for `msg` to `bits`; returns the number of bits written,
    # or 0 if a symbol is not in the table.
    cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1
    # Fill `msg` with symbols decoded from `bits`, starting at its cursor.
    cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1

View File

@ -1,176 +0,0 @@
# cython: profile=True
from __future__ import unicode_literals
cimport cython
from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair
import numpy
from ..typedefs cimport attr_t
from .bits cimport bit_append
from .bits cimport BitArray
cdef class HuffmanCodec:
    """Huffman codec over a fixed table of symbols.

    Leaves are numbered in the order the symbols are supplied. In the tree,
    a child reference >= 0 is an index into `self.nodes`; a negative reference
    `r` denotes leaf number `-(r + 1)`.
    """
    def __init__(self, freqs):
        """Build the tree and per-leaf codes.

        freqs: iterable of `(symbol, weight)` pairs. An empty iterable yields
        a codec with no symbols.
        """
        cdef Code code
        cdef Code path
        cdef Node node
        cdef pair[float, int] item
        cdef pair[float, int] item1
        cdef pair[float, int] item2
        cdef priority_queue[pair[float, int]] queue
        cdef int i = 0

        self._map = {}
        self.leaves = []
        for word, weight in freqs:
            # priority_queue pops its largest element first, so weights are
            # negated to make the lightest entries come out first.
            item.first = -weight
            item.second = -(i+1)
            queue.push(item)

            self.leaves.append(word)
            code.bits = 0
            code.length = 0
            self.codes.push_back(code)
            self._map[word] = i
            i += 1

        # Repeatedly merge the two lightest entries under a new internal node.
        while queue.size() >= 2:
            item1 = queue.top(); queue.pop()
            item2 = queue.top(); queue.pop()

            node = Node(left=item1.second, right=item2.second)
            self.nodes.push_back(node)

            item.first = item1.first + item2.first
            item.second = self.nodes.size()-1
            queue.push(item)

        # Careful of empty freqs dicts
        if queue.size() >= 1:
            item = queue.top()
            if item.second < 0:
                # Exactly one symbol was supplied, so no internal node exists
                # yet and `item.second` is a leaf reference, not an index into
                # `nodes`. Give the lone leaf a parent whose two branches both
                # lead to it, so it gets a one-bit code and can be decoded.
                node = Node(left=item.second, right=item.second)
                self.nodes.push_back(node)
                item.second = self.nodes.size()-1
            self.root = self.nodes[item.second]
            path.bits = 0
            path.length = 0
            assign_codes(self.nodes, self.codes, item.second, path)

    def encode(self, msg, BitArray bits=None):
        """Append the code of every symbol in `msg` to `bits` and return it.

        A new BitArray is created when `bits` is None. Raises KeyError for a
        symbol that is not in the table.
        """
        if bits is None:
            bits = BitArray()
        cdef int i
        for word in msg:
            i = self._map[word]
            bits.extend(self.codes[i].bits, self.codes[i].length)
        return bits

    cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1:
        """Append the codes for `msg` to `bits`; return the bits written.

        Returns 0 as soon as a symbol is not in the table. Codes for the
        symbols before it have already been appended to `bits` at that point.
        """
        cdef int msg_i
        cdef int leaf_i
        cdef int length = 0
        cdef Code code
        for msg_i in range(msg.shape[0]):
            leaf_i = self._map.get(msg[msg_i], -1)
            # Value comparison; `is` on integers tests identity.
            if leaf_i == -1:
                return 0
            code = self.codes[leaf_i]
            bits.extend(code.bits, code.length)
            length += code.length
        return length

    def n_bits(self, msg, overhead=0):
        """Return the encoded size of `msg` in bits, plus `overhead` per symbol.

        Returns `numpy.nan` if any symbol is not in the table.
        """
        cdef int i
        length = 0
        for word in msg:
            if word not in self._map:
                return numpy.nan
            i = self._map[word]
            length += self.codes[i].length
        return length + overhead * len(msg)

    def decode(self, bits, msg):
        """Decode `len(msg)` symbols from the iterable `bits` into `msg`.

        Stops consuming `bits` as soon as `msg` is full. Raises Exception if
        `bits` runs out first. An empty `msg` consumes nothing.
        """
        cdef Node node = self.root
        cdef int i = 0
        cdef int n = len(msg)
        cdef int branch
        cdef bint bit
        if n == 0:
            # Nothing requested: avoid writing msg[0] or reporting exhaustion.
            return
        for bit in bits:
            branch = node.right if bit else node.left
            if branch >= 0:
                node = self.nodes.at(branch)
            else:
                msg[i] = self.leaves[-(branch + 1)]
                # Restart from the root, as decode_int32 does. The root is the
                # last node created, so this equals `self.nodes.back()`.
                node = self.root
                i += 1
                if i == n:
                    break
        else:
            raise Exception("Buffer exhausted at %d/%d symbols read." % (i, len(msg)))

    @cython.boundscheck(False)
    cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1:
        """Decode `msg.shape[0]` symbols from `bits` into `msg`.

        Reading starts at the byte the cursor `bits.i` points to, which must
        be byte-aligned. The cursor is advanced one bit at a time and ends
        just past the last bit consumed. Raises Exception if the buffer ends
        before `msg` is full.
        """
        assert bits.i % 8 == 0
        cdef Node node = self.root
        cdef int branch

        cdef int n_msg = msg.shape[0]
        cdef bytearray bytes_ = bits.as_bytes()
        cdef int n_bytes = len(bytes_)
        cdef unsigned char byte
        cdef int i_msg = 0
        cdef int i_byte = bits.i // 8
        cdef unsigned char i_bit = 0
        cdef unsigned char one = 1
        while i_msg < n_msg:
            # Bounds checking is switched off for this function, so guard the
            # buffer read explicitly instead of running off the end.
            if i_byte >= n_bytes:
                raise Exception("Buffer exhausted at %d/%d symbols read." % (i_msg, n_msg))
            byte = bytes_[i_byte]
            i_byte += 1
            for i_bit in range(8):
                branch = node.right if (byte & (one << i_bit)) else node.left
                bits.i += 1
                if branch >= 0:
                    node = self.nodes.at(branch)
                else:
                    msg[i_msg] = self.leaves[-(branch + 1)]
                    i_msg += 1
                    if i_msg == n_msg:
                        break
                    node = self.root

    property strings:
        """List of the leaf codes as '0'/'1' strings, in leaf order.

        Each string is written in transmission order (first bit first).
        """
        @cython.boundscheck(False)
        @cython.wraparound(False)
        @cython.nonecheck(False)
        def __get__(self):
            output = []
            cdef int i
            cdef unicode string
            cdef Code code
            for i in range(self.codes.size()):
                code = self.codes[i]
                string = '{0:b}'.format(code.bits).rjust(code.length, '0')
                # format() prints the most significant bit first; the first
                # transmitted bit is the least significant one, so reverse.
                string = string[::-1]
                output.append(string)
            return output
cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
    """Walk the subtree rooted at nodes[i] and record each leaf's code.

    `path` is the bit string that leads from the root to nodes[i]. Taking the
    left branch appends a 0 to it, taking the right branch appends a 1. When a
    branch is a leaf reference (negative value r), the extended path is stored
    in codes[-(r + 1)]; otherwise the function recurses into that child node.
    """
    cdef Node current = nodes[i]
    cdef Code zero_path = bit_append(path, 0)
    cdef Code one_path = bit_append(path, 1)

    # Left branch first, then right, matching the order codes are overwritten.
    if current.left < 0:
        codes[-(current.left + 1)] = zero_path
    else:
        assign_codes(nodes, codes, current.left, zero_path)

    if current.right < 0:
        codes[-(current.right + 1)] = one_path
    else:
        assign_codes(nodes, codes, current.right, one_path)
    return 0

View File

@ -1,9 +0,0 @@
from ..vocab cimport Vocab
# Serializes Doc objects to a compact bit stream and back.
cdef class Packer:
    # Attribute IDs that are packed, in the same order as `_codecs`.
    cdef readonly tuple attrs
    # One HuffmanCodec per entry of `attrs`.
    cdef readonly tuple _codecs
    # HuffmanCodec over the vocabulary's orth IDs.
    cdef readonly object orth_codec
    # HuffmanCodec over byte values, used when a Doc cannot be orth-encoded.
    cdef readonly object char_codec
    # Vocabulary used to look up lexemes when unpacking.
    cdef readonly Vocab vocab

View File

@ -1,200 +0,0 @@
# cython: profile=True
from __future__ import unicode_literals
from libc.stdint cimport uint32_t, int32_t
from libc.stdint cimport uint64_t
from libc.math cimport exp as c_exp
from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair
from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
import json
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..structs cimport LexemeC
from ..typedefs cimport attr_t
from .bits cimport BitArray
from .huffman cimport HuffmanCodec
from os import path
import numpy
from .. import util
cimport cython
# Format
# - Total number of bytes in message (32 bit int) --- handled outside this
# - Number of words (32 bit int)
# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
# - Spaces 1 bit per word
# - Attributes:
# POS tag
# Head offset
# Dep label
# Entity IOB
# Entity tag
cdef class _BinaryCodec:
    """Codec that stores each value of a message as a single bit."""
    def encode(self, attr_t[:] msg, BitArray bits):
        """Append one bit per element of `msg` to `bits`."""
        cdef int position
        cdef int n_values = len(msg)
        for position in range(n_values):
            bits.append(msg[position])

    def decode(self, BitArray bits, attr_t[:] msg):
        """Fill `msg` from `bits`, stopping once `msg` is full or `bits` ends."""
        cdef int filled = 0
        for bit in bits:
            msg[filled] = bit
            filled += 1
            if filled == len(msg):
                break
def _gen_orths(Vocab vocab):
    """Yield `(orth, exp(prob))` for every entry of `vocab._by_orth`.

    The mapping's values are lexeme addresses; `prob` is read from the
    LexemeC struct stored at that address.
    """
    cdef attr_t orth
    cdef size_t addr
    for orth, addr in vocab._by_orth.items():
        yield orth, c_exp((<LexemeC*>addr).prob)
def _gen_chars(Vocab vocab):
    """Return `(byte value, weight)` pairs for all 256 byte values.

    Every byte value starts with a tiny weight (1e-20) so that it is always
    present in the result. Each lexeme in `vocab._by_orth` then adds
    `exp(prob)` once per byte of its UTF-8 encoded string, plus once to the
    space character.
    """
    cdef attr_t orth
    cdef size_t addr
    cdef unicode string
    cdef bytes char
    cdef bytes utf8_str
    cdef double weight
    char_weights = {i: 1e-20 for i in range(256)}
    space = ord(' ')
    for orth, addr in vocab._by_orth.items():
        lex = <LexemeC*>addr
        # Same for every byte of this lexeme; compute it once.
        weight = c_exp(lex.prob)
        string = vocab.strings[lex.orth]
        utf8_str = string.encode('utf8')
        for char in utf8_str:
            # All 256 byte values are pre-seeded above, so the key exists.
            char_weights[ord(char)] += weight
        char_weights[space] += weight
    return char_weights.items()
cdef class Packer:
    """Pack Doc objects into bytes and unpack them again.

    The text is stored either as Huffman-coded orth IDs (header holds the
    token count) or, as a fallback, as Huffman-coded UTF-8 bytes (header holds
    the negated byte count). The selected token attributes follow, one
    Huffman-coded column per attribute.
    """
    def __init__(self, Vocab vocab, attr_freqs, char_freqs=None):
        """Build the codecs.

        vocab: vocabulary supplying orth frequencies and lexemes.
        attr_freqs: iterable of `(attr_id, freqs)` pairs; ORTH, ID and SPACY
            are skipped because they are stored through other means.
        char_freqs: `(byte value, weight)` pairs for the fallback codec;
            derived from `vocab` when omitted.
        """
        if char_freqs is None:
            char_freqs = _gen_chars(vocab)
        self.vocab = vocab
        self.orth_codec = HuffmanCodec(_gen_orths(vocab))
        self.char_codec = HuffmanCodec(char_freqs)

        selected = [(attr, freqs) for attr, freqs in sorted(attr_freqs)
                    if attr not in (ORTH, ID, SPACY)]
        self._codecs = tuple([HuffmanCodec(freqs) for _, freqs in selected])
        self.attrs = tuple([attr for attr, _ in selected])

    def pack(self, Doc doc):
        """Return the packed form of `doc`; an empty Doc packs to b''."""
        cdef int column
        if len(doc) == 0:
            return b''
        # Prefer orth IDs; fall back to characters when that is not possible.
        bits = self._orth_encode(doc)
        if bits is None:
            bits = self._char_encode(doc)
        if self.attrs:
            values = doc.to_array(self.attrs)
            for column, codec in enumerate(self._codecs):
                codec.encode(values[:, column], bits)
        return bits.as_bytes()

    def unpack(self, data):
        """Create a new Doc on this packer's vocab and unpack `data` into it."""
        doc = Doc(self.vocab)
        self.unpack_into(data, doc)
        return doc

    def unpack_into(self, byte_string, Doc doc):
        """Unpack `byte_string` into `doc` and return it.

        Returns None, leaving `doc` untouched, when `byte_string` is empty.
        """
        cdef int32_t header
        if byte_string == b'':
            return None
        bits = BitArray(byte_string)
        bits.seek(0)
        header = bits.read32()
        # The sign of the header tells which text encoding was used.
        if header >= 0:
            self._orth_decode(bits, header, doc)
        else:
            self._char_decode(bits, -header, doc)

        n_columns = len(self._codecs)
        values = numpy.zeros(shape=(len(doc), n_columns), dtype=numpy.int32)
        for column, codec in enumerate(self._codecs):
            codec.decode(bits, values[:, column])

        doc.from_array(self.attrs, values)
        return doc

    def _orth_encode(self, Doc doc):
        """Encode the text of `doc` as orth IDs plus one space bit per token.

        Returns None if any token is out of vocabulary or the orth codec
        cannot encode one of the IDs.
        """
        cdef BitArray bits
        cdef int32_t n_tokens
        for token in doc:
            if token.is_oov:
                return None
        bits = BitArray()
        n_tokens = len(doc)
        bits.extend(n_tokens, 32)
        orth_ids = doc.to_array([ORTH])
        if self.orth_codec.encode_int32(orth_ids[:, 0], bits) == 0:
            return None
        for token in doc:
            bits.append(bool(token.whitespace_))
        return bits

    def _char_encode(self, Doc doc):
        """Encode the text of `doc` as UTF-8 bytes plus token-boundary bits.

        After the bytes, one bit is written per character of each token (set
        only on the token's last character), followed by an extra unset bit
        for the space when the token has trailing whitespace.
        """
        cdef bytes utf8_str = doc.string.encode('utf8')
        cdef BitArray bits = BitArray()
        cdef int32_t length = len(utf8_str)
        cdef int token_i, char_i
        # Signal chars with negative length
        bits.extend(-length, 32)
        self.char_codec.encode(bytearray(utf8_str), bits)
        for token_i in range(doc.length):
            for char_i in range(doc.c[token_i].lex.length-1):
                bits.append(False)
            bits.append(True)
            if doc.c[token_i].spacy:
                bits.append(False)
        return bits

    def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
        """Read `n` orth IDs and `n` space bits from `bits` into `doc`."""
        cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
        cdef int token_i
        cdef bint has_space
        self.orth_codec.decode_int32(bits, orths)
        space_bits = iter(bits)
        for token_i in range(n):
            orth = orths[token_i]
            has_space = next(space_bits)
            lex = self.vocab.get_by_orth(doc.mem, orth)
            doc.push_back(lex, has_space)
        return doc

    def _char_decode(self, BitArray bits, int32_t n_bytes, Doc doc):
        """Read `n_bytes` UTF-8 bytes and the boundary bits into `doc`.

        One bit is consumed per character of the decoded string; a set bit
        marks the last character of a token. A token is flagged as followed
        by a space when the next character of the string is a space.
        """
        cdef bytearray utf8_str = bytearray(n_bytes)
        self.char_codec.decode(bits, utf8_str)
        cdef unicode string = utf8_str.decode('utf8')
        cdef int n_unicode_chars = len(string)
        cdef int start = 0
        cdef int i = 0
        cdef bint is_spacy
        cdef bint is_end_token
        for is_end_token in bits:
            if is_end_token:
                span = string[start:i+1]
                lex = self.vocab.get(doc.mem, span)
                is_spacy = (i+1) < n_unicode_chars and string[i+1] == u' '
                doc.push_back(lex, is_spacy)
                # Skip over the space, if any, when starting the next token.
                start = i + 1 + is_spacy
            i += 1
            if i >= n_unicode_chars:
                break
        return doc