Remove spacy.serialize

2017-05-09 17:22:06 +02:00 · 2017-05-09 17:22:06 +02:00 · b53f7dfdc3
parent 62ecdea9f2
commit b53f7dfdc3
8 changed files with 0 additions and 552 deletions
--- a/spacy/serialize/__init__.pxd
+++ b/spacy/serialize/__init__.pxd
--- a/spacy/serialize/__init__.py
+++ b/spacy/serialize/__init__.py
--- a/spacy/serialize/bits.pxd
+++ b/spacy/serialize/bits.pxd
@@ -1,23 +0,0 @@
-from libc.stdint cimport uint64_t
-from libc.stdint cimport int32_t, uint32_t
-
-ctypedef unsigned char uchar
-
-
-cdef struct Code:
-    uint64_t bits
-    char length
-
-
-cdef Code bit_append(Code code, bint bit) nogil
-
-
-cdef class BitArray:
-    cdef bytearray data
-    cdef uchar byte
-    cdef uchar bit_of_byte
-    cdef uint32_t i
-    
-    cdef int extend(self, uint64_t code, char n_bits) except -1
-
-    cpdef int32_t read32(self) except 0
--- a/spacy/serialize/bits.pyx
+++ b/spacy/serialize/bits.pyx
@@ -1,120 +0,0 @@
-from __future__ import unicode_literals
-
-from libc.string cimport memcpy
-
-# Note that we're setting the most significant bits here first, when in practice
-# we're actually wanting the last bit to be most significant (for Huffman coding,
-# anyway).
-cdef Code bit_append(Code code, bint bit) nogil:
-    cdef uint64_t one = 1
-    if bit:
-        code.bits |= one << code.length
-    else:
-        code.bits &= ~(one << code.length)
-    code.length += 1
-    return code
-
-
-cdef class BitArray:
-    def __init__(self, data=b''):
-        self.data = bytearray(data)
-        self.byte = 0
-        self.bit_of_byte = 0
-        self.i = 0
-
-    def __len__(self):
-        return 8 * len(self.data) + self.bit_of_byte
-
-    def __str__(self):
-        cdef uchar byte, i
-        cdef uchar one = 1
-        string = b''
-        for i in range(len(self.data)):
-            byte = ord(self.data[i])
-            for j in range(8):
-                string += b'1' if (byte & (one << j)) else b'0'
-        for i in range(self.bit_of_byte):
-            string += b'1' if (byte & (one << i)) else b'0'
-        return string
-
-    def seek(self, i):
-        self.i = i
-
-    def __iter__(self):
-        cdef uchar byte, i
-        cdef uchar one = 1
-        start_byte = self.i // 8
-        start_bit = self.i % 8
-
-        if start_bit != 0 and start_byte < len(self.data):
-            byte = self.data[start_byte]
-            for i in range(start_bit, 8):
-                self.i += 1
-                yield 1 if (byte & (one << i)) else 0
-            start_byte += 1
-            start_bit = 0
-
-        for byte in self.data[start_byte:]:
-            for i in range(8):
-                self.i += 1
-                yield 1 if byte & (one << i) else 0
-
-        if self.bit_of_byte != 0:
-            byte = self.byte
-            for i in range(start_bit, self.bit_of_byte):
-                self.i += 1
-                yield 1 if self.byte & (one << i) else 0
-
-    cpdef int32_t read32(self) except 0:
-        cdef int start_byte = self.i // 8
-
-        # TODO portability
-        cdef uchar[4] chars
-        chars[0] = self.data[start_byte]
-        chars[1] = self.data[start_byte+1]
-        chars[2] = self.data[start_byte+2]
-        chars[3] = self.data[start_byte+3]
-        cdef uint32_t output
-        memcpy(&output, chars, 4)
-        self.i += 32
-        return output
-
-    def as_bytes(self):
-        cdef unsigned char byte_char
-        if self.bit_of_byte != 0:
-            byte = chr(self.byte)
-            # Jump through some hoops for Python3
-            if isinstance(byte, unicode):
-                return self.data + <bytes>(&self.byte)[:1]
-            else:
-                return self.data + chr(self.byte)
-        else:
-            return self.data
-
-    def append(self, bint bit):
-        cdef uint64_t one = 1
-        if bit:
-            self.byte |= one << self.bit_of_byte
-        else:
-            self.byte &= ~(one << self.bit_of_byte)
-        self.bit_of_byte += 1
-        self.i += 1
-        if self.bit_of_byte == 8:
-            self.data += bytearray((self.byte,))
-            self.byte = 0
-            self.bit_of_byte = 0
-
-    cdef int extend(self, uint64_t code, char n_bits) except -1:
-        cdef uint64_t one = 1
-        cdef unsigned char bit_of_code
-        for bit_of_code in range(n_bits):
-            if code & (one << bit_of_code):
-                self.byte |= one << self.bit_of_byte
-            else:
-                self.byte &= ~(one << self.bit_of_byte)
-            self.bit_of_byte += 1
-            if self.bit_of_byte == 8:
-                self.data += <bytes>self.byte
-                self.byte = 0
-                self.bit_of_byte = 0
-            self.i += 1
--- a/spacy/serialize/huffman.pxd
+++ b/spacy/serialize/huffman.pxd
@@ -1,24 +0,0 @@
-from libcpp.vector cimport vector
-from libc.stdint cimport uint32_t
-from libc.stdint cimport int64_t
-from libc.stdint cimport int32_t
-from libc.stdint cimport uint64_t
-
-from .bits cimport BitArray, Code
-
-
-cdef struct Node:
-    int32_t left
-    int32_t right
-
-
-cdef class HuffmanCodec:
-    cdef vector[Node] nodes
-    cdef vector[Code] codes
-    cdef Node root
-
-    cdef readonly list leaves
-    cdef readonly dict _map 
-    
-    cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1
-    cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1
--- a/spacy/serialize/huffman.pyx
+++ b/spacy/serialize/huffman.pyx
@@ -1,176 +0,0 @@
-# cython: profile=True
-from __future__ import unicode_literals
-cimport cython
-from libcpp.queue cimport priority_queue
-from libcpp.pair cimport pair
-import numpy
-
-from ..typedefs cimport attr_t
-
-from .bits cimport bit_append
-from .bits cimport BitArray
-
-
-cdef class HuffmanCodec:
-    def __init__(self, freqs):
-        cdef float count
-        cdef Code code
-
-        cdef pair[float, int] item
-        cdef pair[float, int] item1
-        cdef pair[float, int] item2
-        cdef priority_queue[pair[float, int]] queue
-        cdef int i = 0
-        self._map = {}
-        self.leaves = []
-        for word, weight in freqs:
-            item.first = -weight
-            item.second = -(i+1)
-            queue.push(item)
-            
-            self.leaves.append(word)
-            code.bits = 0
-            code.length = 0
-            self.codes.push_back(code)
-            self._map[word] = i
-            i += 1
-
-        cdef Node node
-        while queue.size() >= 2:
-            item1 = queue.top(); queue.pop()
-            item2 = queue.top(); queue.pop()
-            
-            node = Node(left=item1.second, right=item2.second)
-            self.nodes.push_back(node)
-
-            item.first = item1.first + item2.first
-            item.second = self.nodes.size()-1
-            queue.push(item)
-        # Careful of empty freqs dicts
-        cdef Code path
-        if queue.size() >= 1:
-            item = queue.top()
-            self.root = self.nodes[item.second]
-            path.bits = 0
-            path.length = 0
-            assign_codes(self.nodes, self.codes, item.second, path)
-
-    def encode(self, msg, BitArray bits=None):
-        if bits is None:
-            bits = BitArray()
-        cdef int i
-        for word in msg:
-            i = self._map[word]
-            bits.extend(self.codes[i].bits, self.codes[i].length)
-        return bits
-
-    cpdef int encode_int32(self, int32_t[:] msg, BitArray bits) except -1:
-        cdef int msg_i
-        cdef int leaf_i
-        cdef int length = 0
-        for msg_i in range(msg.shape[0]):
-            leaf_i = self._map.get(msg[msg_i], -1)
-            if leaf_i is -1:
-                return 0
-            code = self.codes[leaf_i]
-            bits.extend(code.bits, code.length)
-            length += code.length
-        return length
-
-    def n_bits(self, msg, overhead=0):
-        cdef int i
-        length = 0
-        for word in msg:
-            if word not in self._map:
-                return numpy.nan
-            i = self._map[word]
-            length += self.codes[i].length
-        return length + overhead * len(msg)
-
-    def decode(self, bits, msg):
-        node = self.root
-        cdef int i = 0
-        cdef int n = len(msg)
-        cdef int branch
-        cdef bint bit
-        for bit in bits:
-            branch = node.right if bit else node.left
-            if branch >= 0:
-                node = self.nodes.at(branch)
-            else:
-                msg[i] = self.leaves[-(branch + 1)]
-                node = self.nodes.back()
-                i += 1
-                if i == n:
-                    break
-        else:
-            raise Exception("Buffer exhausted at %d/%d symbols read." % (i, len(msg)))
-
-    @cython.boundscheck(False)
-    cpdef int decode_int32(self, BitArray bits, int32_t[:] msg) except -1:
-        assert bits.i % 8 == 0
-        cdef Node node = self.root
-        cdef int branch
-
-        cdef int n_msg = msg.shape[0]
-        cdef bytearray bytes_ = bits.as_bytes()
-        cdef unsigned char byte
-        cdef int i_msg = 0
-        cdef int i_byte = bits.i // 8
-        cdef unsigned char i_bit = 0
-        cdef unsigned char one = 1
-        while i_msg < n_msg:
-            byte = bytes_[i_byte]
-            i_byte += 1
-            for i_bit in range(8):
-                branch = node.right if (byte & (one << i_bit)) else node.left
-                bits.i += 1
-                if branch >= 0:
-                    node = self.nodes.at(branch)
-                else:
-                    msg[i_msg] = self.leaves[-(branch + 1)]
-                    i_msg += 1
-                    if i_msg == n_msg:
-                        break
-                    node = self.root
-
-    property strings:
-        @cython.boundscheck(False)
-        @cython.wraparound(False)
-        @cython.nonecheck(False)
-        def __get__(self):
-            output = []
-            cdef int i, j
-            cdef unicode string
-            cdef Code code
-            for i in range(self.codes.size()):
-                code = self.codes[i]
-                string = '{0:b}'.format(code.bits).rjust(code.length, '0')
-                string = string[::-1]
-                output.append(string)
-            return output
-
-
-cdef int assign_codes(vector[Node]& nodes, vector[Code]& codes, int i, Code path) except -1:
-    """Recursively assign paths, from the top down. At the end, the entry codes[i]
-    knows the bit-address of the node[j] that points to entry i in the vocabulary.
-    So, to encode i, we go to codes[i] and read its bit-string. To decode, we
-    navigate nodes recursively.
-    """
-    cdef Code left_path = bit_append(path, 0)
-    cdef Code right_path = bit_append(path, 1)
-    
-    # Assign down left branch
-    if nodes[i].left >= 0:
-        assign_codes(nodes, codes, nodes[i].left, left_path)
-    else:
-        # Leaf on left
-        id_ = -(nodes[i].left + 1)
-        codes[id_] = left_path
-    # Assign down right branch
-    if nodes[i].right >= 0:
-        assign_codes(nodes, codes, nodes[i].right, right_path)
-    else:
-        # Leaf on right
-        id_ = -(nodes[i].right + 1)
-        codes[id_] = right_path
--- a/spacy/serialize/packer.pxd
+++ b/spacy/serialize/packer.pxd
@@ -1,9 +0,0 @@
-from ..vocab cimport Vocab
-
-
-cdef class Packer:
-    cdef readonly tuple attrs
-    cdef readonly tuple _codecs
-    cdef readonly object orth_codec
-    cdef readonly object char_codec
-    cdef readonly Vocab vocab
--- a/spacy/serialize/packer.pyx
+++ b/spacy/serialize/packer.pyx
@@ -1,200 +0,0 @@
-# cython: profile=True
-from __future__ import unicode_literals
-
-from libc.stdint cimport uint32_t, int32_t
-from libc.stdint cimport uint64_t
-from libc.math cimport exp as c_exp
-from libcpp.queue cimport priority_queue
-from libcpp.pair cimport pair
-
-from cymem.cymem cimport Address, Pool
-from preshed.maps cimport PreshMap
-from preshed.counter cimport PreshCounter
-import json
-
-from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
-from ..tokens.doc cimport Doc
-from ..vocab cimport Vocab
-from ..structs cimport LexemeC
-from ..typedefs cimport attr_t
-from .bits cimport BitArray
-from .huffman cimport HuffmanCodec
-
-from os import path
-import numpy
-from .. import util
-
-cimport cython
-
-
-# Format
-# - Total number of bytes in message (32 bit int) --- handled outside this
-# - Number of words (32 bit int)
-# - Words, terminating in an EOL symbol, huffman coded ~12 bits per word
-# - Spaces 1 bit per word
-# - Attributes:
-#       POS tag
-#       Head offset
-#       Dep label
-#       Entity IOB
-#       Entity tag
-
-
-cdef class _BinaryCodec:
-    def encode(self, attr_t[:] msg, BitArray bits):
-        cdef int i
-        for i in range(len(msg)):
-            bits.append(msg[i])
-
-    def decode(self, BitArray bits, attr_t[:] msg):
-        cdef int i = 0 
-        for bit in bits:
-            msg[i] = bit
-            i += 1
-            if i == len(msg):
-                break
-
-
-def _gen_orths(Vocab vocab):
-    cdef attr_t orth
-    cdef size_t addr
-    for orth, addr in vocab._by_orth.items():
-        lex = <LexemeC*>addr
-        yield orth, c_exp(lex.prob)
-
-
-def _gen_chars(Vocab vocab):
-    cdef attr_t orth
-    cdef size_t addr
-    char_weights = {i: 1e-20 for i in range(256)}
-    cdef unicode string
-    cdef bytes char
-    cdef bytes utf8_str
-    for orth, addr in vocab._by_orth.items():
-        lex = <LexemeC*>addr
-        string = vocab.strings[lex.orth]
-        utf8_str = string.encode('utf8')
-        for char in utf8_str:
-            char_weights.setdefault(ord(char), 0.0)
-            char_weights[ord(char)] += c_exp(lex.prob)
-        char_weights[ord(' ')] += c_exp(lex.prob)
-    return char_weights.items()
-
-
-cdef class Packer:
-    def __init__(self, Vocab vocab, attr_freqs, char_freqs=None):
-        if char_freqs is None:
-            char_freqs = _gen_chars(vocab)
-        self.vocab = vocab
-        self.orth_codec = HuffmanCodec(_gen_orths(vocab))
-        self.char_codec = HuffmanCodec(char_freqs)
-        
-        codecs = []
-        attrs = []
-        for attr, freqs in sorted(attr_freqs):
-            if attr in (ORTH, ID, SPACY):
-                continue
-            codecs.append(HuffmanCodec(freqs))
-            attrs.append(attr)
-        self._codecs = tuple(codecs)
-        self.attrs = tuple(attrs)
-
-    def pack(self, Doc doc):
-        if len(doc) == 0:
-            return b''
-        bits = self._orth_encode(doc)
-        if bits is None:
-            bits = self._char_encode(doc)
-        cdef int i
-        if self.attrs:
-            array = doc.to_array(self.attrs)
-            for i, codec in enumerate(self._codecs):
-                codec.encode(array[:, i], bits)
-        return bits.as_bytes()
-
-    def unpack(self, data):
-        doc = Doc(self.vocab)
-        self.unpack_into(data, doc)
-        return doc
-
-    def unpack_into(self, byte_string, Doc doc):
-        if byte_string == b'':
-            return None
-        bits = BitArray(byte_string)
-        bits.seek(0)
-        cdef int32_t length = bits.read32()
-        if length >= 0:
-            self._orth_decode(bits, length, doc)
-        else:
-            self._char_decode(bits, -length, doc)
-        array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
-        for i, codec in enumerate(self._codecs):
-            codec.decode(bits, array[:, i])
-        doc.from_array(self.attrs, array)
-        return doc
-
-    def _orth_encode(self, Doc doc):
-        for t in doc:
-            if t.is_oov:
-                return None
-        cdef BitArray bits = BitArray()
-        cdef int32_t length = len(doc)
-        bits.extend(length, 32) 
-        orths = doc.to_array([ORTH])
-        n_bits = self.orth_codec.encode_int32(orths[:, 0], bits)
-        if n_bits == 0:
-            return None
-        for token in doc:
-            bits.append(bool(token.whitespace_))
-        return bits
-
-    def _char_encode(self, Doc doc):
-        cdef bytes utf8_str = doc.string.encode('utf8')
-        cdef BitArray bits = BitArray()
-        cdef int32_t length = len(utf8_str)
-        # Signal chars with negative length
-        bits.extend(-length, 32)
-        self.char_codec.encode(bytearray(utf8_str), bits)
-        cdef int i, j
-        for i in range(doc.length):
-            for j in range(doc.c[i].lex.length-1):
-                bits.append(False)
-            bits.append(True)
-            if doc.c[i].spacy:
-                bits.append(False)
-        return bits
-
-    def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
-        cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
-        self.orth_codec.decode_int32(bits, orths)
-        cdef int i
-        cdef bint space
-        spaces = iter(bits)
-        for i in range(n):
-            orth = orths[i]
-            space = next(spaces)
-            lex = self.vocab.get_by_orth(doc.mem, orth)
-            doc.push_back(lex, space)
-        return doc
-
-    def _char_decode(self, BitArray bits, int32_t n_bytes, Doc doc):
-        cdef bytearray utf8_str = bytearray(n_bytes)
-        self.char_codec.decode(bits, utf8_str)
-
-        cdef unicode string = utf8_str.decode('utf8')
-        cdef int start = 0
-        cdef bint is_spacy
-        cdef int n_unicode_chars = len(string)
-        cdef int i = 0
-        cdef bint is_end_token
-        for is_end_token in bits:
-            if is_end_token:
-                span = string[start:i+1]
-                lex = self.vocab.get(doc.mem, span)
-                is_spacy = (i+1) < n_unicode_chars and string[i+1] == u' '
-                doc.push_back(lex, is_spacy)
-                start = i + 1 + is_spacy
-            i += 1
-            if i >= n_unicode_chars:
-                break
-        return doc