mirror of https://github.com/explosion/spaCy.git
Remove spacy.serialize
This commit is contained in:
parent
62ecdea9f2
commit
b53f7dfdc3
|
@ -1,23 +0,0 @@
|
|||
from libc.stdint cimport uint64_t
|
||||
from libc.stdint cimport int32_t, uint32_t
|
||||
|
||||
# One unsigned byte; BitArray uses it for its byte accumulator and for the
# count of bits held in that accumulator.
ctypedef unsigned char uchar


# A variable-length bit string packed into a single 64-bit word.
cdef struct Code:
    # Bit pattern. bit_append() writes each new bit at index `length`.
    uint64_t bits
    # Number of bits of `bits` that are in use.
    char length


# Return a copy of `code` with `bit` stored at index `code.length` and the
# length incremented by one. Implemented in bits.pyx.
cdef Code bit_append(Code code, bint bit) nogil
|
||||
|
||||
|
||||
cdef class BitArray:
    # C-level attributes of the bit buffer implemented in bits.pyx.
    cdef:
        # Completed (fully written) bytes.
        bytearray data
        # Byte currently being filled; not yet part of `data`.
        uchar byte
        # How many bits of `byte` have been written so far (0..7).
        uchar bit_of_byte
        # Bit cursor; advanced by both the write and the read methods.
        uint32_t i

    # Append the lowest `n_bits` bits of `code`, least significant bit first.
    cdef int extend(self, uint64_t code, char n_bits) except -1

    # Read the four bytes at the cursor as one 32-bit integer and advance the
    # cursor by 32 bits.
    cpdef int32_t read32(self) except 0
|
|
@ -1,120 +0,0 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from libc.string cimport memcpy
|
||||
|
||||
# Note that we're setting the most significant bits here first, when in practice
|
||||
# we're actually wanting the last bit to be most significant (for Huffman coding,
|
||||
# anyway).
|
||||
cdef Code bit_append(Code code, bint bit) nogil:
    # Extend `code` by one bit and return the result.
    #
    # The bit at index `code.length` of `code.bits` is set when `bit` is true
    # and cleared otherwise, then the length grows by one. `code` is received
    # by value, so the caller's struct is not modified.
    cdef uint64_t one = 1
    cdef uint64_t mask = one << code.length
    if bit:
        code.bits |= mask
    else:
        code.bits &= ~mask
    code.length += 1
    return code
|
||||
|
||||
|
||||
cdef class BitArray:
    """Growable bit buffer with a single bit cursor.

    Bits are packed least-significant-bit first. Completed bytes are kept in
    `data`; the byte under construction is `byte`, of which the lowest
    `bit_of_byte` bits are valid. `i` is a bit cursor that is advanced by
    writes (`append`, `extend`) as well as by reads (iteration, `read32`);
    use `seek()` to reposition it before reading.
    """
    def __init__(self, data=b''):
        """Create a buffer pre-filled with the bytes of `data`."""
        self.data = bytearray(data)
        self.byte = 0
        self.bit_of_byte = 0
        self.i = 0

    def __len__(self):
        """Return the number of stored bits, including the partial byte."""
        return 8 * len(self.data) + self.bit_of_byte

    def __str__(self):
        """Return the stored bits as a string of '0'/'1' in storage order."""
        cdef int value
        cdef int j
        chars = []
        for value in self.data:
            for j in range(8):
                chars.append('1' if (value >> j) & 1 else '0')
        # The trailing bits live in the unflushed accumulator, not in `data`.
        value = self.byte
        for j in range(self.bit_of_byte):
            chars.append('1' if (value >> j) & 1 else '0')
        return ''.join(chars)

    def seek(self, i):
        """Move the bit cursor to the absolute bit offset `i`."""
        self.i = i

    def __iter__(self):
        """Yield the bits (0 or 1) from the cursor to the end of the buffer.

        The cursor `self.i` is advanced by one just before each bit is
        yielded, so a consumer that stops early leaves the cursor on the
        first unread bit.
        """
        cdef uchar byte, i
        cdef uchar one = 1
        start_byte = self.i // 8
        start_bit = self.i % 8

        # Finish the byte the cursor is currently inside of.
        if start_bit != 0 and start_byte < len(self.data):
            byte = self.data[start_byte]
            for i in range(start_bit, 8):
                self.i += 1
                yield 1 if (byte & (one << i)) else 0
            start_byte += 1
            start_bit = 0

        # Whole bytes.
        for byte in self.data[start_byte:]:
            for i in range(8):
                self.i += 1
                yield 1 if byte & (one << i) else 0

        # Bits still sitting in the unflushed accumulator.
        if self.bit_of_byte != 0:
            byte = self.byte
            for i in range(start_bit, self.bit_of_byte):
                self.i += 1
                yield 1 if self.byte & (one << i) else 0

    cpdef int32_t read32(self) except 0:
        """Read four bytes at the cursor as a 32-bit integer.

        The bytes are taken from `data` starting at byte `self.i // 8` (any
        bit offset within that byte is ignored) and the cursor is advanced by
        32 bits. Indexing past the end of `data` raises IndexError.
        """
        # NOTE(review): the declared `except 0` makes a return value of 0
        # indistinguishable from an error for callers -- confirm that a
        # stored value of 0 is never read through this method.
        cdef int start_byte = self.i // 8
        cdef uint32_t b0 = self.data[start_byte]
        cdef uint32_t b1 = self.data[start_byte+1]
        cdef uint32_t b2 = self.data[start_byte+2]
        cdef uint32_t b3 = self.data[start_byte+3]
        # extend() emits the least significant bit first, so the first byte
        # carries the low 8 bits. Assemble explicitly as little-endian rather
        # than relying on the host byte order.
        cdef uint32_t output = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24)
        self.i += 32
        return output

    def as_bytes(self):
        """Return the buffer contents as a bytearray.

        When a partial byte is pending, a new bytearray with that byte
        appended is returned; otherwise `self.data` itself is returned.
        """
        if self.bit_of_byte != 0:
            return self.data + bytearray((self.byte,))
        return self.data

    def append(self, bint bit):
        """Append a single bit and advance the cursor by one."""
        self.extend(bit, 1)

    cdef int extend(self, uint64_t code, char n_bits) except -1:
        """Append the lowest `n_bits` bits of `code`, bit 0 first.

        Each written bit advances the cursor `self.i` by one. Whenever the
        accumulator holds eight bits it is flushed to `data`.
        """
        cdef uint64_t one = 1
        cdef unsigned char bit_of_code
        for bit_of_code in range(n_bits):
            if code & (one << bit_of_code):
                self.byte |= one << self.bit_of_byte
            else:
                self.byte &= ~(one << self.bit_of_byte)
            self.bit_of_byte += 1
            if self.bit_of_byte == 8:
                self.data.append(self.byte)
                self.byte = 0
                self.bit_of_byte = 0
            self.i += 1
|
|
@ -1,24 +0,0 @@
|
|||
from libcpp.vector cimport vector
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.stdint cimport int64_t
|
||||
from libc.stdint cimport int32_t
|
||||
from libc.stdint cimport uint64_t
|
||||
|
||||
from .bits cimport BitArray, Code
|
||||
|
||||
|
||||
# Internal node of the Huffman tree. A non-negative child value is an index
# into HuffmanCodec.nodes; a negative value v refers to leaf number -(v + 1).
cdef struct Node:
    # Child followed when the next bit is 0.
    int32_t left
    # Child followed when the next bit is 1.
    int32_t right
|
||||
|
||||
|
||||
cdef class HuffmanCodec:
    # C-level attributes of the codec implemented in huffman.pyx.
    cdef:
        # Internal tree nodes, in creation order.
        vector[Node] nodes
        # codes[k] is the bit string assigned to leaf k.
        vector[Code] codes
        # Copy of the node decoding starts from.
        Node root

    cdef readonly:
        # leaves[k] is the symbol stored at leaf k.
        list leaves
        # Maps a symbol to its leaf index.
        dict _map

    # Append the codes of the symbols in `msg` to `bits`.
    cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1

    # Fill `msg` with symbols decoded from `bits`.
    cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1
|
|
@ -1,176 +0,0 @@
|
|||
# cython: profile=True
|
||||
from __future__ import unicode_literals
|
||||
cimport cython
|
||||
from libcpp.queue cimport priority_queue
|
||||
from libcpp.pair cimport pair
|
||||
import numpy
|
||||
|
||||
from ..typedefs cimport attr_t
|
||||
|
||||
from .bits cimport bit_append
|
||||
from .bits cimport BitArray
|
||||
|
||||
|
||||
cdef class HuffmanCodec:
    """Huffman coder built from an iterable of (symbol, weight) pairs.

    `leaves[k]` is the k-th symbol of `freqs`, `_map[symbol]` is k, and
    `codes[k]` is the bit string assigned to it. Tree children that are
    non-negative index into `nodes`; a negative child v denotes leaf
    -(v + 1).
    """
    def __init__(self, freqs):
        """Build the tree and the per-symbol codes.

        freqs: iterable of (symbol, weight) pairs. An empty iterable yields
        a codec with no nodes and no codes.
        """
        cdef Code code
        cdef Code path
        cdef Node node
        cdef pair[float, int] item
        cdef pair[float, int] item1
        cdef pair[float, int] item2
        cdef priority_queue[pair[float, int]] queue
        cdef int i = 0
        self._map = {}
        self.leaves = []
        for word, weight in freqs:
            # Weights are negated so that the max-heap pops the lightest
            # entry first. Leaves are queued under the id -(i + 1).
            item.first = -weight
            item.second = -(i+1)
            queue.push(item)

            self.leaves.append(word)
            code.bits = 0
            code.length = 0
            self.codes.push_back(code)
            self._map[word] = i
            i += 1

        # Repeatedly merge the two lightest entries into a new internal node.
        while queue.size() >= 2:
            item1 = queue.top(); queue.pop()
            item2 = queue.top(); queue.pop()

            node = Node(left=item1.second, right=item2.second)
            self.nodes.push_back(node)

            item.first = item1.first + item2.first
            item.second = self.nodes.size()-1
            queue.push(item)

        # Careful of empty freqs dicts
        if queue.size() >= 1:
            item = queue.top()
            if item.second < 0:
                # Exactly one symbol: no internal node was created, and the
                # queue holds the bare leaf id. Indexing `nodes` with it
                # would read outside the (empty) vector, so give the tree a
                # root whose two branches both lead to that leaf. The symbol
                # then gets a one-bit code.
                node = Node(left=item.second, right=item.second)
                self.nodes.push_back(node)
                item.second = self.nodes.size()-1
            self.root = self.nodes[item.second]
            path.bits = 0
            path.length = 0
            assign_codes(self.nodes, self.codes, item.second, path)

    def encode(self, msg, BitArray bits=None):
        """Append the code of every symbol in `msg` to `bits` and return it.

        A new BitArray is created when `bits` is None. A symbol that is not
        in the table raises KeyError.
        """
        cdef int i
        if bits is None:
            bits = BitArray()
        for word in msg:
            i = self._map[word]
            bits.extend(self.codes[i].bits, self.codes[i].length)
        return bits

    cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1:
        """Append the codes of the integers in `msg` to `bits`.

        Returns the number of bits written. Returns 0 as soon as a value is
        not in the table; codes of earlier values have already been written
        to `bits` at that point.
        """
        cdef int msg_i
        cdef int leaf_i
        cdef int length = 0
        cdef Code code
        for msg_i in range(msg.shape[0]):
            leaf_i = self._map.get(msg[msg_i], -1)
            if leaf_i == -1:
                return 0
            code = self.codes[leaf_i]
            bits.extend(code.bits, code.length)
            length += code.length
        return length

    def n_bits(self, msg, overhead=0):
        """Return the encoded size of `msg` in bits, plus `overhead` bits per
        symbol. Returns NaN if any symbol is not in the table."""
        cdef int i
        length = 0
        for word in msg:
            if word not in self._map:
                return numpy.nan
            i = self._map[word]
            length += self.codes[i].length
        return length + overhead * len(msg)

    def decode(self, bits, msg):
        """Decode `len(msg)` symbols from the iterable of bits into `msg`.

        Raises Exception if `bits` runs out before `msg` has been filled.
        Nothing is read when `msg` is empty.
        """
        cdef Node node = self.root
        cdef int i = 0
        cdef int n = len(msg)
        cdef int branch
        cdef bint bit
        if n == 0:
            return
        for bit in bits:
            branch = node.right if bit else node.left
            if branch >= 0:
                node = self.nodes.at(branch)
            else:
                msg[i] = self.leaves[-(branch + 1)]
                # Restart from the top of the tree for the next symbol.
                node = self.root
                i += 1
                if i == n:
                    break
        else:
            raise Exception("Buffer exhausted at %d/%d symbols read." % (i, len(msg)))

    @cython.boundscheck(False)
    cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1:
        """Decode `msg.shape[0]` symbols from `bits` into `msg`.

        Reading starts at the cursor `bits.i`, which must be byte-aligned,
        and the cursor is advanced by one for every bit consumed. Raises
        Exception if the buffer ends before `msg` has been filled.
        """
        cdef Node node = self.root
        cdef int branch

        cdef int n_msg = msg.shape[0]
        cdef bytearray bytes_ = bits.as_bytes()
        cdef int n_bytes = len(bytes_)
        cdef unsigned char byte
        cdef int i_msg = 0
        cdef int i_byte = bits.i // 8
        cdef unsigned char i_bit = 0
        cdef unsigned char one = 1
        assert bits.i % 8 == 0
        while i_msg < n_msg:
            # Bounds checking is switched off for this function, so the
            # buffer length has to be verified by hand.
            if i_byte >= n_bytes:
                raise Exception("Buffer exhausted at %d/%d symbols read." % (i_msg, n_msg))
            byte = bytes_[i_byte]
            i_byte += 1
            for i_bit in range(8):
                branch = node.right if (byte & (one << i_bit)) else node.left
                bits.i += 1
                if branch >= 0:
                    node = self.nodes.at(branch)
                else:
                    msg[i_msg] = self.leaves[-(branch + 1)]
                    i_msg += 1
                    if i_msg == n_msg:
                        break
                    node = self.root

    property strings:
        # The code of every leaf as a '0'/'1' string, first emitted bit first.
        @cython.boundscheck(False)
        @cython.wraparound(False)
        @cython.nonecheck(False)
        def __get__(self):
            output = []
            cdef int i
            cdef unicode string
            cdef Code code
            for i in range(self.codes.size()):
                code = self.codes[i]
                string = '{0:b}'.format(code.bits).rjust(code.length, '0')
                # format() prints the highest bit first; bit 0 is emitted
                # first on the wire, so reverse.
                string = string[::-1]
                output.append(string)
            return output
|
||||
|
||||
|
||||
cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
    """Walk the tree below nodes[i] and record the path to every leaf.

    `path` is the bit string that leads to nodes[i]. Taking the left branch
    appends a 0 bit, the right branch a 1 bit. When a branch ends in a leaf
    (negative child value v), the extended path is stored in
    codes[-(v + 1)]; otherwise the function recurses into the child node.
    """
    cdef Node node = nodes[i]
    cdef Code zero_path = bit_append(path, 0)
    cdef Code one_path = bit_append(path, 1)

    # Left branch
    if node.left < 0:
        codes[-(node.left + 1)] = zero_path
    else:
        assign_codes(nodes, codes, node.left, zero_path)

    # Right branch
    if node.right < 0:
        codes[-(node.right + 1)] = one_path
    else:
        assign_codes(nodes, codes, node.right, one_path)
|
|
@ -1,9 +0,0 @@
|
|||
from ..vocab cimport Vocab
|
||||
|
||||
|
||||
cdef class Packer:
    # Attributes of the Packer implemented in packer.pyx; all are readable
    # from Python but cannot be reassigned from there.
    cdef readonly:
        # Attribute ids that get their own codec, in codec order.
        tuple attrs
        # One HuffmanCodec per entry of `attrs`.
        tuple _codecs
        # Codec over the vocabulary's orth ids.
        object orth_codec
        # Codec over byte values, used when the orth codec cannot be used.
        object char_codec
        Vocab vocab
|
|
@ -1,200 +0,0 @@
|
|||
# cython: profile=True
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from libc.stdint cimport uint32_t, int32_t
|
||||
from libc.stdint cimport uint64_t
|
||||
from libc.math cimport exp as c_exp
|
||||
from libcpp.queue cimport priority_queue
|
||||
from libcpp.pair cimport pair
|
||||
|
||||
from cymem.cymem cimport Address, Pool
|
||||
from preshed.maps cimport PreshMap
|
||||
from preshed.counter cimport PreshCounter
|
||||
import json
|
||||
|
||||
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..vocab cimport Vocab
|
||||
from ..structs cimport LexemeC
|
||||
from ..typedefs cimport attr_t
|
||||
from .bits cimport BitArray
|
||||
from .huffman cimport HuffmanCodec
|
||||
|
||||
from os import path
|
||||
import numpy
|
||||
from .. import util
|
||||
|
||||
cimport cython
|
||||
|
||||
|
||||
# Format
|
||||
# - Total number of bytes in message (32 bit int) --- handled outside this
|
||||
# - Number of words (32 bit int)
|
||||
# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
|
||||
# - Spaces 1 bit per word
|
||||
# - Attributes:
|
||||
# POS tag
|
||||
# Head offset
|
||||
# Dep label
|
||||
# Entity IOB
|
||||
# Entity tag
|
||||
|
||||
|
||||
cdef class _BinaryCodec:
    """Codec that stores each value of a message as a single bit."""
    def encode(self, attr_t[:] msg, BitArray bits):
        """Append one bit per element of `msg` to `bits`."""
        cdef int i
        for i in range(len(msg)):
            bits.append(msg[i])

    def decode(self, BitArray bits, attr_t[:] msg):
        """Fill `msg` with the next `len(msg)` bits read from `bits`.

        Stops early, leaving the remaining elements untouched, if `bits`
        runs out.
        """
        cdef int i = 0
        cdef int n = len(msg)
        # encode() writes nothing for an empty message, so nothing must be
        # consumed here either; without this guard one bit would be read and
        # then stored at msg[0], which does not exist.
        if n == 0:
            return
        for bit in bits:
            msg[i] = bit
            i += 1
            if i == n:
                break
|
||||
|
||||
|
||||
def _gen_orths(Vocab vocab):
    """Yield an (orth id, exp(prob)) pair for every entry of
    `vocab._by_orth`.

    Each value stored in `vocab._by_orth` is used as the address of a
    LexemeC struct.
    """
    cdef attr_t orth
    cdef size_t lex_addr
    for orth, lex_addr in vocab._by_orth.items():
        lex = <LexemeC*>lex_addr
        weight = c_exp(lex.prob)
        yield orth, weight
|
||||
|
||||
|
||||
def _gen_chars(Vocab vocab):
    """Return (byte value, weight) pairs estimated from the vocabulary.

    Every byte value 0..255 starts with a weight of 1e-20. For each lexeme,
    exp(prob) is added once per byte of its UTF-8 encoded string and once
    more to the space character.
    """
    cdef attr_t orth
    cdef size_t lex_addr
    cdef unicode string
    cdef bytes char
    cdef bytes utf8_str
    char_weights = {byte_value: 1e-20 for byte_value in range(256)}
    for orth, lex_addr in vocab._by_orth.items():
        lex = <LexemeC*>lex_addr
        weight = c_exp(lex.prob)
        string = vocab.strings[lex.orth]
        utf8_str = string.encode('utf8')
        for char in utf8_str:
            byte_value = ord(char)
            char_weights.setdefault(byte_value, 0.0)
            char_weights[byte_value] += weight
        char_weights[ord(' ')] += weight
    return char_weights.items()
|
||||
|
||||
|
||||
cdef class Packer:
    """Serialize Doc objects to compact byte strings and back.

    A message begins with a 32-bit integer. When it is non-negative it is
    the number of tokens, followed by the Huffman-coded orth ids and one
    whitespace bit per token. When it is negative its magnitude is the
    number of UTF-8 bytes of the text, followed by the Huffman-coded bytes
    and a run of token-boundary bits. The Huffman-coded values of each
    attribute in `attrs` follow, one attribute after another.
    """
    def __init__(self, Vocab vocab, attr_freqs, char_freqs=None):
        """Build the codecs.

        vocab: vocabulary used for the orth codec and for decoding.
        attr_freqs: iterable of (attribute id, frequencies) pairs; ORTH, ID
            and SPACY entries are ignored.
        char_freqs: frequencies for the byte codec; derived from `vocab`
            when None.
        """
        if char_freqs is None:
            char_freqs = _gen_chars(vocab)
        self.vocab = vocab
        self.orth_codec = HuffmanCodec(_gen_orths(vocab))
        self.char_codec = HuffmanCodec(char_freqs)

        attr_codecs = []
        attr_ids = []
        for attr, freqs in sorted(attr_freqs):
            if attr not in (ORTH, ID, SPACY):
                attr_codecs.append(HuffmanCodec(freqs))
                attr_ids.append(attr)
        self._codecs = tuple(attr_codecs)
        self.attrs = tuple(attr_ids)

    def pack(self, Doc doc):
        """Return `doc` serialized to bytes; an empty doc gives b''."""
        cdef int i
        if len(doc) == 0:
            return b''
        # Prefer the word-level encoding; fall back to bytes when it is not
        # applicable.
        bits = self._orth_encode(doc)
        if bits is None:
            bits = self._char_encode(doc)
        if self.attrs:
            attr_values = doc.to_array(self.attrs)
            for i, codec in enumerate(self._codecs):
                codec.encode(attr_values[:, i], bits)
        return bits.as_bytes()

    def unpack(self, data):
        """Return a new Doc decoded from `data`."""
        doc = Doc(self.vocab)
        self.unpack_into(data, doc)
        return doc

    def unpack_into(self, byte_string, Doc doc):
        """Decode `byte_string` into `doc` and return `doc`.

        Returns None, leaving `doc` untouched, when `byte_string` is empty.
        """
        if byte_string == b'':
            return None
        bits = BitArray(byte_string)
        bits.seek(0)
        cdef int32_t length = bits.read32()
        # The sign of the leading integer selects the text encoding.
        if length < 0:
            self._char_decode(bits, -length, doc)
        else:
            self._orth_decode(bits, length, doc)

        attr_values = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
        for i, codec in enumerate(self._codecs):
            codec.decode(bits, attr_values[:, i])
        doc.from_array(self.attrs, attr_values)
        return doc

    def _orth_encode(self, Doc doc):
        """Encode the tokens of `doc` by orth id.

        Returns the BitArray, or None when a token is out of vocabulary or
        the orth codec reports that it wrote no bits.
        """
        cdef BitArray bits
        cdef int32_t length
        for tok in doc:
            if tok.is_oov:
                return None
        bits = BitArray()
        length = len(doc)
        bits.extend(length, 32)
        orths = doc.to_array([ORTH])
        n_bits = self.orth_codec.encode_int32(orths[:, 0], bits)
        if n_bits == 0:
            return None
        # One bit per token: is it followed by whitespace?
        for token in doc:
            bits.append(bool(token.whitespace_))
        return bits

    def _char_encode(self, Doc doc):
        """Encode the text of `doc` byte by byte, plus token boundaries."""
        cdef int i, j
        cdef bytes utf8_str = doc.string.encode('utf8')
        cdef BitArray bits = BitArray()
        cdef int32_t length = len(utf8_str)
        # Signal chars with negative length
        bits.extend(-length, 32)
        self.char_codec.encode(bytearray(utf8_str), bits)
        # Boundary bits: for every token, `lex.length - 1` zeros and then a
        # one; a token flagged `spacy` contributes one more zero.
        for i in range(doc.length):
            for j in range(doc.c[i].lex.length-1):
                bits.append(False)
            bits.append(True)
            if doc.c[i].spacy:
                bits.append(False)
        return bits

    def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
        """Decode `n` orth-coded tokens from `bits` and append them to
        `doc`."""
        cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
        cdef int i
        cdef bint space
        self.orth_codec.decode_int32(bits, orths)
        # The whitespace flags follow the orth ids in the stream.
        spaces = iter(bits)
        for i in range(n):
            orth = orths[i]
            space = next(spaces)
            lex = self.vocab.get_by_orth(doc.mem, orth)
            doc.push_back(lex, space)
        return doc

    def _char_decode(self, BitArray bits, int32_t n_bytes, Doc doc):
        """Decode `n_bytes` bytes of UTF-8 text from `bits`, split the text
        at the stored token boundaries and append the tokens to `doc`."""
        cdef bytearray utf8_str = bytearray(n_bytes)
        self.char_codec.decode(bits, utf8_str)

        cdef unicode string = utf8_str.decode('utf8')
        cdef int n_unicode_chars = len(string)
        cdef int start = 0
        cdef int i = 0
        cdef bint is_spacy
        cdef bint is_end_token
        # One boundary bit is consumed per character of the decoded text.
        for is_end_token in bits:
            if is_end_token:
                span = string[start:i+1]
                lex = self.vocab.get(doc.mem, span)
                is_spacy = (i+1) < n_unicode_chars and string[i+1] == u' '
                doc.push_back(lex, is_spacy)
                # Skip the space that belongs to the token just pushed.
                start = i + 1 + is_spacy
            i += 1
            if i >= n_unicode_chars:
                break
        return doc
|
Loading…
Reference in New Issue