From 1f31d96bf940cbc01ea9490ee42b43d1518b0296 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 23 Jul 2015 01:12:00 +0200 Subject: [PATCH] * Fix Packer API, so that it reads and writes bytes strings, instead of BitArray. Docs are always byte aligned anyway. --- spacy/serialize/packer.pyx | 55 +++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/spacy/serialize/packer.pyx b/spacy/serialize/packer.pyx index 17d687669..bc0cad217 100644 --- a/spacy/serialize/packer.pyx +++ b/spacy/serialize/packer.pyx @@ -10,6 +10,7 @@ from libcpp.pair cimport pair from cymem.cymem cimport Address, Pool from preshed.maps cimport PreshMap from preshed.counter cimport PreshCounter +import json from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE from ..tokens.doc cimport Doc @@ -98,33 +99,34 @@ cdef class Packer: self._codecs = tuple(codecs) self.attrs = tuple(attrs) - @classmethod - def from_dir(cls, Vocab vocab, data_dir): - return cls(vocab, util.read_encoding_freqs(data_dir)) - def pack(self, Doc doc): bits = self._orth_encode(doc) if bits is None: bits = self._char_encode(doc) - cdef int i if self.attrs: array = doc.to_array(self.attrs) for i, codec in enumerate(self._codecs): - codec.encode_int32(array[:, i], bits) - return bits + codec.encode(array[:, i], bits) + return bits.as_bytes() - def unpack(self, BitArray bits): + def unpack(self, bytes data): + doc = Doc(self.vocab) + self.unpack_into(data, doc) + return doc + + def unpack_into(self, bytes byte_string, Doc doc): + bits = BitArray(byte_string) bits.seek(0) cdef int32_t length = bits.read32() if length >= 0: - doc = self._orth_decode(bits, length) + self._orth_decode(bits, length, doc) else: - doc = self._char_decode(bits, -length) - + self._char_decode(bits, -length, doc) + array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32) for i, codec in enumerate(self._codecs): - codec.decode_int32(bits, array[:, i]) + codec.decode(bits, array[:, i]) doc.from_array(self.attrs, array) return doc @@ -141,13 +143,6 @@ cdef class Packer: bits.append(bool(token.whitespace_)) return bits - def _orth_decode(self, BitArray bits, n): - orths = numpy.ndarray(shape=(n,), dtype=numpy.int32) - self.orth_codec.decode_int32(bits, orths) - orths_and_spaces = zip(orths, bits) - cdef Doc doc = Doc(self.vocab, orths_and_spaces) - return doc - def _char_encode(self, Doc doc): cdef bytes utf8_str = doc.string.encode('utf8') cdef BitArray bits = BitArray() @@ -164,12 +159,24 @@ cdef class Packer: bits.append(False) return bits - def _char_decode(self, BitArray bits, n): + def _orth_decode(self, BitArray bits, int32_t n, Doc doc): + cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32) + self.orth_codec.decode_int32(bits, orths) + cdef int i + cdef bint space + spaces = iter(bits) + for i in range(n): + orth = orths[i] + space = spaces.next() + lex = self.vocab.get_by_orth(doc.mem, orth) + doc.push_back(lex, space) + return doc + + def _char_decode(self, BitArray bits, int32_t n, Doc doc): cdef bytearray utf8_str = bytearray(n) self.char_codec.decode(bits, utf8_str) cdef unicode string = utf8_str.decode('utf8') - cdef Doc tokens = Doc(self.vocab) cdef int start = 0 cdef bint is_spacy cdef int length = len(string) @@ -178,11 +185,11 @@ cdef class Packer: for is_end_token in bits: if is_end_token: span = string[start:i+1] - lex = self.vocab.get(tokens.mem, span) + lex = self.vocab.get(doc.mem, span) is_spacy = (i+1) < length and string[i+1] == u' ' - tokens.push_back(lex, is_spacy) + doc.push_back(lex, is_spacy) start = i + 1 + is_spacy i += 1 if i >= n: break - return tokens + return doc