diff --git a/spacy/serialize/__init__.pxd b/spacy/serialize/__init__.pxd
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/serialize/__init__.py b/spacy/serialize/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/spacy/serialize/bits.pxd b/spacy/serialize/bits.pxd
deleted file mode 100644
index 5e462a2da..000000000
--- a/spacy/serialize/bits.pxd
+++ /dev/null
@@ -1,23 +0,0 @@
-from libc.stdint cimport uint64_t
-from libc.stdint cimport int32_t, uint32_t
-
-ctypedef unsigned char uchar
-
-
-cdef struct Code:
-    uint64_t bits
-    char length
-
-
-cdef Code bit_append(Code code, bint bit) nogil
-
-
-cdef class BitArray:
-    cdef bytearray data
-    cdef uchar byte
-    cdef uchar bit_of_byte
-    cdef uint32_t i
-
-    cdef int extend(self, uint64_t code, char n_bits) except -1
-
-    cpdef int32_t read32(self) except 0
diff --git a/spacy/serialize/bits.pyx b/spacy/serialize/bits.pyx
deleted file mode 100644
index 71a198a46..000000000
--- a/spacy/serialize/bits.pyx
+++ /dev/null
@@ -1,120 +0,0 @@
-from __future__ import unicode_literals
-
-from libc.string cimport memcpy
-
-# Note that we're setting the most significant bits here first, when in practice
-# we're actually wanting the last bit to be most significant (for Huffman coding,
-# anyway).
-cdef Code bit_append(Code code, bint bit) nogil:
-    cdef uint64_t one = 1
-    if bit:
-        code.bits |= one << code.length
-    else:
-        code.bits &= ~(one << code.length)
-    code.length += 1
-    return code
-
-
-cdef class BitArray:
-    def __init__(self, data=b''):
-        self.data = bytearray(data)
-        self.byte = 0
-        self.bit_of_byte = 0
-        self.i = 0
-
-    def __len__(self):
-        return 8 * len(self.data) + self.bit_of_byte
-
-    def __str__(self):
-        cdef uchar byte, i
-        cdef uchar one = 1
-        string = b''
-        for i in range(len(self.data)):
-            byte = ord(self.data[i])
-            for j in range(8):
-                string += b'1' if (byte & (one << j)) else b'0'
-        for i in range(self.bit_of_byte):
-            string += b'1' if (byte & (one << i)) else b'0'
-        return string
-
-    def seek(self, i):
-        self.i = i
-
-    def __iter__(self):
-        cdef uchar byte, i
-        cdef uchar one = 1
-        start_byte = self.i // 8
-        start_bit = self.i % 8
-
-        if start_bit != 0 and start_byte < len(self.data):
-            byte = self.data[start_byte]
-            for i in range(start_bit, 8):
-                self.i += 1
-                yield 1 if (byte & (one << i)) else 0
-            start_byte += 1
-            start_bit = 0
-
-        for byte in self.data[start_byte:]:
-            for i in range(8):
-                self.i += 1
-                yield 1 if byte & (one << i) else 0
-
-        if self.bit_of_byte != 0:
-            byte = self.byte
-            for i in range(start_bit, self.bit_of_byte):
-                self.i += 1
-                yield 1 if self.byte & (one << i) else 0
-
-    cpdef int32_t read32(self) except 0:
-        cdef int start_byte = self.i // 8
-
-        # TODO portability
-        cdef uchar[4] chars
-        chars[0] = self.data[start_byte]
-        chars[1] = self.data[start_byte+1]
-        chars[2] = self.data[start_byte+2]
-        chars[3] = self.data[start_byte+3]
-        cdef uint32_t output
-        memcpy(&output, chars, 4)
-        self.i += 32
-        return output
-
-    def as_bytes(self):
-        cdef unsigned char byte_char
-        if self.bit_of_byte != 0:
-            byte = chr(self.byte)
-            # Jump through some hoops for Python3
-            if isinstance(byte, unicode):
-                return self.data + (&self.byte)[:1]
-            else:
-                return self.data + chr(self.byte)
-        else:
-            return self.data
-
-    def append(self, bint bit):
-        cdef uint64_t one = 1
-        if bit:
-            self.byte |= one << self.bit_of_byte
-        else:
-            self.byte &= ~(one << self.bit_of_byte)
-        self.bit_of_byte += 1
-        self.i += 1
-        if self.bit_of_byte == 8:
-            self.data += bytearray((self.byte,))
-            self.byte = 0
-            self.bit_of_byte = 0
-
-    cdef int extend(self, uint64_t code, char n_bits) except -1:
-        cdef uint64_t one = 1
-        cdef unsigned char bit_of_code
-        for bit_of_code in range(n_bits):
-            if code & (one << bit_of_code):
-                self.byte |= one << self.bit_of_byte
-            else:
-                self.byte &= ~(one << self.bit_of_byte)
-            self.bit_of_byte += 1
-            if self.bit_of_byte == 8:
-                self.data += self.byte
-                self.byte = 0
-                self.bit_of_byte = 0
-            self.i += 1
diff --git a/spacy/serialize/huffman.pxd b/spacy/serialize/huffman.pxd
deleted file mode 100644
index e2f0600c8..000000000
--- a/spacy/serialize/huffman.pxd
+++ /dev/null
@@ -1,24 +0,0 @@
-from libcpp.vector cimport vector
-from libc.stdint cimport uint32_t
-from libc.stdint cimport int64_t
-from libc.stdint cimport int32_t
-from libc.stdint cimport uint64_t
-
-from .bits cimport BitArray, Code
-
-
-cdef struct Node:
-    int32_t left
-    int32_t right
-
-
-cdef class HuffmanCodec:
-    cdef vector[Node] nodes
-    cdef vector[Code] codes
-    cdef Node root
-
-    cdef readonly list leaves
-    cdef readonly dict _map
-
-    cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1
-    cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1
diff --git a/spacy/serialize/huffman.pyx b/spacy/serialize/huffman.pyx
deleted file mode 100644
index 34ab75925..000000000
--- a/spacy/serialize/huffman.pyx
+++ /dev/null
@@ -1,176 +0,0 @@
-# cython: profile=True
-from __future__ import unicode_literals
-cimport cython
-from libcpp.queue cimport priority_queue
-from libcpp.pair cimport pair
-import numpy
-
-from ..typedefs cimport attr_t
-
-from .bits cimport bit_append
-from .bits cimport BitArray
-
-
-cdef class HuffmanCodec:
-    def __init__(self, freqs):
-        cdef float count
-        cdef Code code
-
-        cdef pair[float, int] item
-        cdef pair[float, int] item1
-        cdef pair[float, int] item2
-        cdef priority_queue[pair[float, int]] queue
-        cdef int i = 0
-        self._map = {}
-        self.leaves = []
-        for word, weight in freqs:
-            item.first = -weight
-            item.second = -(i+1)
-            queue.push(item)
-
-            self.leaves.append(word)
-            code.bits = 0
-            code.length = 0
-            self.codes.push_back(code)
-            self._map[word] = i
-            i += 1
-
-        cdef Node node
-        while queue.size() >= 2:
-            item1 = queue.top(); queue.pop()
-            item2 = queue.top(); queue.pop()
-
-            node = Node(left=item1.second, right=item2.second)
-            self.nodes.push_back(node)
-
-            item.first = item1.first + item2.first
-            item.second = self.nodes.size()-1
-            queue.push(item)
-        # Careful of empty freqs dicts
-        cdef Code path
-        if queue.size() >= 1:
-            item = queue.top()
-            self.root = self.nodes[item.second]
-            path.bits = 0
-            path.length = 0
-            assign_codes(self.nodes, self.codes, item.second, path)
-
-    def encode(self, msg, BitArray bits=None):
-        if bits is None:
-            bits = BitArray()
-        cdef int i
-        for word in msg:
-            i = self._map[word]
-            bits.extend(self.codes[i].bits, self.codes[i].length)
-        return bits
-
-    cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1:
-        cdef int msg_i
-        cdef int leaf_i
-        cdef int length = 0
-        for msg_i in range(msg.shape[0]):
-            leaf_i = self._map.get(msg[msg_i], -1)
-            if leaf_i is -1:
-                return 0
-            code = self.codes[leaf_i]
-            bits.extend(code.bits, code.length)
-            length += code.length
-        return length
-
-    def n_bits(self, msg, overhead=0):
-        cdef int i
-        length = 0
-        for word in msg:
-            if word not in self._map:
-                return numpy.nan
-            i = self._map[word]
-            length += self.codes[i].length
-        return length + overhead * len(msg)
-
-    def decode(self, bits, msg):
-        node = self.root
-        cdef int i = 0
-        cdef int n = len(msg)
-        cdef int branch
-        cdef bint bit
-        for bit in bits:
-            branch = node.right if bit else node.left
-            if branch >= 0:
-                node = self.nodes.at(branch)
-            else:
-                msg[i] = self.leaves[-(branch + 1)]
-                node = self.nodes.back()
-                i += 1
-                if i == n:
-                    break
-        else:
-            raise Exception("Buffer exhausted at %d/%d symbols read." % (i, len(msg)))
-
-    @cython.boundscheck(False)
-    cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1:
-        assert bits.i % 8 == 0
-        cdef Node node = self.root
-        cdef int branch
-
-        cdef int n_msg = msg.shape[0]
-        cdef bytearray bytes_ = bits.as_bytes()
-        cdef unsigned char byte
-        cdef int i_msg = 0
-        cdef int i_byte = bits.i // 8
-        cdef unsigned char i_bit = 0
-        cdef unsigned char one = 1
-        while i_msg < n_msg:
-            byte = bytes_[i_byte]
-            i_byte += 1
-            for i_bit in range(8):
-                branch = node.right if (byte & (one << i_bit)) else node.left
-                bits.i += 1
-                if branch >= 0:
-                    node = self.nodes.at(branch)
-                else:
-                    msg[i_msg] = self.leaves[-(branch + 1)]
-                    i_msg += 1
-                    if i_msg == n_msg:
-                        break
-                    node = self.root
-
-    property strings:
-        @cython.boundscheck(False)
-        @cython.wraparound(False)
-        @cython.nonecheck(False)
-        def __get__(self):
-            output = []
-            cdef int i, j
-            cdef unicode string
-            cdef Code code
-            for i in range(self.codes.size()):
-                code = self.codes[i]
-                string = '{0:b}'.format(code.bits).rjust(code.length, '0')
-                string = string[::-1]
-                output.append(string)
-            return output
-
-
-cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
-    """Recursively assign paths, from the top down. At the end, the entry codes[i]
-    knows the bit-address of the node[j] that points to entry i in the vocabulary.
-    So, to encode i, we go to codes[i] and read its bit-string. To decode, we
-    navigate nodes recursively.
-    """
-    cdef Code left_path = bit_append(path, 0)
-    cdef Code right_path = bit_append(path, 1)
-
-    # Assign down left branch
-    if nodes[i].left >= 0:
-        assign_codes(nodes, codes, nodes[i].left, left_path)
-    else:
-        # Leaf on left
-        id_ = -(nodes[i].left + 1)
-        codes[id_] = left_path
-    # Assign down right branch
-    if nodes[i].right >= 0:
-        assign_codes(nodes, codes, nodes[i].right, right_path)
-    else:
-        # Leaf on right
-        id_ = -(nodes[i].right + 1)
-        codes[id_] = right_path
diff --git a/spacy/serialize/packer.pxd b/spacy/serialize/packer.pxd
deleted file mode 100644
index c4974eb60..000000000
--- a/spacy/serialize/packer.pxd
+++ /dev/null
@@ -1,9 +0,0 @@
-from ..vocab cimport Vocab
-
-
-cdef class Packer:
-    cdef readonly tuple attrs
-    cdef readonly tuple _codecs
-    cdef readonly object orth_codec
-    cdef readonly object char_codec
-    cdef readonly Vocab vocab
diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx
deleted file mode 100644
index 4b6a3dd1a..000000000
--- a/spacy/serialize/packer.pyx
+++ /dev/null
@@ -1,200 +0,0 @@
-# cython: profile=True
-from __future__ import unicode_literals
-
-from libc.stdint cimport uint32_t, int32_t
-from libc.stdint cimport uint64_t
-from libc.math cimport exp as c_exp
-from libcpp.queue cimport priority_queue
-from libcpp.pair cimport pair
-
-from cymem.cymem cimport Address, Pool
-from preshed.maps cimport PreshMap
-from preshed.counter cimport PreshCounter
-import json
-
-from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
-from ..tokens.doc cimport Doc
-from ..vocab cimport Vocab
-from ..structs cimport LexemeC
-from ..typedefs cimport attr_t
-from .bits cimport BitArray
-from .huffman cimport HuffmanCodec
-
-from os import path
-import numpy
-from .. import util
-
-cimport cython
-
-
-# Format
-# - Total number of bytes in message (32 bit int) --- handled outside this
-# - Number of words (32 bit int)
-# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
-# - Spaces 1 bit per word
-# - Attributes:
-#       POS tag
-#       Head offset
-#       Dep label
-#       Entity IOB
-#       Entity tag
-
-
-cdef class _BinaryCodec:
-    def encode(self, attr_t[:] msg, BitArray bits):
-        cdef int i
-        for i in range(len(msg)):
-            bits.append(msg[i])
-
-    def decode(self, BitArray bits, attr_t[:] msg):
-        cdef int i = 0
-        for bit in bits:
-            msg[i] = bit
-            i += 1
-            if i == len(msg):
-                break
-
-
-def _gen_orths(Vocab vocab):
-    cdef attr_t orth
-    cdef size_t addr
-    for orth, addr in vocab._by_orth.items():
-        lex = <LexemeC*>addr
-        yield orth, c_exp(lex.prob)
-
-
-def _gen_chars(Vocab vocab):
-    cdef attr_t orth
-    cdef size_t addr
-    char_weights = {i: 1e-20 for i in range(256)}
-    cdef unicode string
-    cdef bytes char
-    cdef bytes utf8_str
-    for orth, addr in vocab._by_orth.items():
-        lex = <LexemeC*>addr
-        string = vocab.strings[lex.orth]
-        utf8_str = string.encode('utf8')
-        for char in utf8_str:
-            char_weights.setdefault(ord(char), 0.0)
-            char_weights[ord(char)] += c_exp(lex.prob)
-        char_weights[ord(' ')] += c_exp(lex.prob)
-    return char_weights.items()
-
-
-cdef class Packer:
-    def __init__(self, Vocab vocab, attr_freqs, char_freqs=None):
-        if char_freqs is None:
-            char_freqs = _gen_chars(vocab)
-        self.vocab = vocab
-        self.orth_codec = HuffmanCodec(_gen_orths(vocab))
-        self.char_codec = HuffmanCodec(char_freqs)
-
-        codecs = []
-        attrs = []
-        for attr, freqs in sorted(attr_freqs):
-            if attr in (ORTH, ID, SPACY):
-                continue
-            codecs.append(HuffmanCodec(freqs))
-            attrs.append(attr)
-        self._codecs = tuple(codecs)
-        self.attrs = tuple(attrs)
-
-    def pack(self, Doc doc):
-        if len(doc) == 0:
-            return b''
-        bits = self._orth_encode(doc)
-        if bits is None:
-            bits = self._char_encode(doc)
-        cdef int i
-        if self.attrs:
-            array = doc.to_array(self.attrs)
-            for i, codec in enumerate(self._codecs):
-                codec.encode(array[:, i], bits)
-        return bits.as_bytes()
-
-    def unpack(self, data):
-        doc = Doc(self.vocab)
-        self.unpack_into(data, doc)
-        return doc
-
-    def unpack_into(self, byte_string, Doc doc):
-        if byte_string == b'':
-            return None
-        bits = BitArray(byte_string)
-        bits.seek(0)
-        cdef int32_t length = bits.read32()
-        if length >= 0:
-            self._orth_decode(bits, length, doc)
-        else:
-            self._char_decode(bits, -length, doc)
-        array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
-        for i, codec in enumerate(self._codecs):
-            codec.decode(bits, array[:, i])
-        doc.from_array(self.attrs, array)
-        return doc
-
-    def _orth_encode(self, Doc doc):
-        for t in doc:
-            if t.is_oov:
-                return None
-        cdef BitArray bits = BitArray()
-        cdef int32_t length = len(doc)
-        bits.extend(length, 32)
-        orths = doc.to_array([ORTH])
-        n_bits = self.orth_codec.encode_int32(orths[:, 0], bits)
-        if n_bits == 0:
-            return None
-        for token in doc:
-            bits.append(bool(token.whitespace_))
-        return bits
-
-    def _char_encode(self, Doc doc):
-        cdef bytes utf8_str = doc.string.encode('utf8')
-        cdef BitArray bits = BitArray()
-        cdef int32_t length = len(utf8_str)
-        # Signal chars with negative length
-        bits.extend(-length, 32)
-        self.char_codec.encode(bytearray(utf8_str), bits)
-        cdef int i, j
-        for i in range(doc.length):
-            for j in range(doc.c[i].lex.length-1):
-                bits.append(False)
-            bits.append(True)
-            if doc.c[i].spacy:
-                bits.append(False)
-        return bits
-
-    def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
-        cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
-        self.orth_codec.decode_int32(bits, orths)
-        cdef int i
-        cdef bint space
-        spaces = iter(bits)
-        for i in range(n):
-            orth = orths[i]
-            space = next(spaces)
-            lex = self.vocab.get_by_orth(doc.mem, orth)
-            doc.push_back(lex, space)
-        return doc
-
-    def _char_decode(self, BitArray bits, int32_t n_bytes, Doc doc):
-        cdef bytearray utf8_str = bytearray(n_bytes)
-        self.char_codec.decode(bits, utf8_str)
-
-        cdef unicode string = utf8_str.decode('utf8')
-        cdef int start = 0
-        cdef bint is_spacy
-        cdef int n_unicode_chars = len(string)
-        cdef int i = 0
-        cdef bint is_end_token
-        for is_end_token in bits:
-            if is_end_token:
-                span = string[start:i+1]
-                lex = self.vocab.get(doc.mem, span)
-                is_spacy = (i+1) < n_unicode_chars and string[i+1] == u' '
-                doc.push_back(lex, is_spacy)
-                start = i + 1 + is_spacy
-            i += 1
-            if i >= n_unicode_chars:
-                break
-        return doc