Strip serializer from code

Matthew Honnibal 2017-05-09 17:28:50 +02:00
parent 825c6403d8
commit 9e167b7bb6
4 changed files with 7 additions and 67 deletions
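In user-facing terms, this commit rips out the old bit-packing serializer rather than replacing it: Doc.to_bytes() and Doc.from_bytes() become stubs that raise NotImplementedError, Doc.read_bytes() is deleted outright, and Vocab loses its Packer, _serializer and serializer_freqs plumbing. A minimal sketch of what this means for calling code after the commit (spaCy 1.x-era API; spacy.load('en') and an installed 'en' model are assumptions for illustration, not part of the diff):

    # Illustrative only: how the stripped serializer surfaces to callers after
    # this commit. Assumes a spaCy 1.x install with the 'en' model available.
    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'This is a document.')

    try:
        doc.to_bytes()   # body deleted below; the method now just raises
    except NotImplementedError:
        print('Doc.to_bytes() is a stub until a new serializer lands')

    # Doc.read_bytes() and vocab.serializer are gone entirely, so the old
    # file-based round-trip shown in the deleted docstring no longer works.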

View File

@@ -203,7 +203,6 @@ class Language(object):
                 parser=False,
                 entity=False,
                 matcher=False,
-                serializer=False,
                 vectors=False,
                 pipeline=False)

View File

@@ -22,7 +22,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
 from ..compat import is_config
@@ -81,11 +80,6 @@ cdef class Doc:
         """
         Create a Doc object.
 
-        Aside: Implementation
-        This method of constructing a `Doc` object is usually only used
-        for deserialization. Standard usage is to construct the document via
-        a call to the language object.
-
         Arguments:
             vocab:
                 A Vocabulary object, which must match any models you want to
@@ -615,46 +609,13 @@ cdef class Doc:
         """
         Serialize, producing a byte string.
         """
-        byte_string = self.vocab.serializer.pack(self)
-        cdef uint32_t length = len(byte_string)
-        return struct.pack('I', length) + byte_string
+        raise NotImplementedError
 
     def from_bytes(self, data):
         """
         Deserialize, loading from bytes.
         """
-        self.vocab.serializer.unpack_into(data[4:], self)
-        return self
-
-    @staticmethod
-    def read_bytes(file_):
-        """
-        A static method, used to read serialized #[code Doc] objects from
-        a file. For example:
-
-        Example:
-            from spacy.tokens.doc import Doc
-            loc = 'test_serialize.bin'
-            with open(loc, 'wb') as file_:
-                file_.write(nlp(u'This is a document.').to_bytes())
-                file_.write(nlp(u'This is another.').to_bytes())
-            docs = []
-            with open(loc, 'rb') as file_:
-                for byte_string in Doc.read_bytes(file_):
-                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
-            assert len(docs) == 2
-        """
-        keep_reading = True
-        while keep_reading:
-            try:
-                n_bytes_str = file_.read(4)
-                if len(n_bytes_str) < 4:
-                    break
-                n_bytes = struct.unpack('I', n_bytes_str)[0]
-                data = file_.read(n_bytes)
-            except StopIteration:
-                keep_reading = False
-            yield n_bytes_str + data
+        raise NotImplementedError
 
     def merge(self, int start_idx, int end_idx, *args, **attributes):
         """

View File

@@ -29,10 +29,8 @@ cdef class Vocab:
     cpdef readonly StringStore strings
     cpdef readonly Morphology morphology
     cdef readonly int length
-    cdef public object _serializer
     cdef public object data_dir
     cdef public object lex_attr_getters
-    cdef public object serializer_freqs
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL

View File

@@ -15,7 +15,6 @@ from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
 from .tokens.token cimport Token
-from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
 from .compat import copy_reg, pickle
@@ -41,7 +40,7 @@ cdef class Vocab:
     """
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
-             tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+             tag_map=True, oov_prob=True, **deprecated_kwargs):
         """
         Load the vocabulary from a path.
@@ -80,22 +79,17 @@ cdef class Vocab:
             lex_attr_getters[PROB] = lambda text: oov_prob
         if lemmatizer is True:
             lemmatizer = Lemmatizer.load(path)
-        if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists():
-            with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_:
-                serializer_freqs = ujson.load(file_)
-        else:
-            serializer_freqs = None
 
         with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
             strings_list = ujson.load(file_)
         cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
-                              lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
+                              lemmatizer=lemmatizer,
                               strings=strings_list)
         self.load_lexemes(path / 'vocab' / 'lexemes.bin')
         return self
 
     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
-                 serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
+                 strings=tuple(), **deprecated_kwargs):
         """
         Create the vocabulary.
@@ -119,7 +113,6 @@ cdef class Vocab:
         tag_map = tag_map if tag_map is not None else {}
         if lemmatizer in (None, True, False):
             lemmatizer = Lemmatizer({}, {}, {})
-        serializer_freqs = serializer_freqs if serializer_freqs is not None else {}
 
         self.mem = Pool()
         self._by_hash = PreshMap()
@@ -141,17 +134,8 @@ cdef class Vocab:
             _ = self.strings[name]
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.serializer_freqs = serializer_freqs
         self.length = 1
-        self._serializer = None
-
-    property serializer:
-        # Having the serializer live here is super messy :(
-        def __get__(self):
-            if self._serializer is None:
-                self._serializer = Packer(self, self.serializer_freqs)
-            return self._serializer
 
     property lang:
         def __get__(self):
@@ -630,7 +614,6 @@ def pickle_vocab(vocab):
     sstore = vocab.strings
     morph = vocab.morphology
     length = vocab.length
-    serializer = vocab._serializer
     data_dir = vocab.data_dir
     lex_attr_getters = vocab.lex_attr_getters
@@ -638,11 +621,11 @@ def pickle_vocab(vocab):
     vectors_length = vocab.vectors_length
 
     return (unpickle_vocab,
-        (sstore, morph, serializer, data_dir, lex_attr_getters,
+        (sstore, morph, data_dir, lex_attr_getters,
         lexemes_data, length, vectors_length))
 
 
-def unpickle_vocab(sstore, morphology, serializer, data_dir,
+def unpickle_vocab(sstore, morphology, data_dir,
                    lex_attr_getters, bytes lexemes_data, int length, int vectors_length):
     cdef Vocab vocab = Vocab()
     vocab.length = length
@@ -650,7 +633,6 @@ def unpickle_vocab(sstore, morphology, serializer, data_dir,
     vocab.strings = sstore
     cdef CFile fp = StringCFile('r', data=lexemes_data)
     vocab.morphology = morphology
-    vocab._serializer = serializer
     vocab.data_dir = data_dir
     vocab.lex_attr_getters = lex_attr_getters
     vocab._deserialize_lexemes(fp)
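
On the Vocab side the change is mechanical: load() and __init__() drop the serializer_freqs argument, the attribute declarations disappear from the .pxd, and the pickle reduce tuple loses its serializer slot. A rough sketch of calling the updated loader after this commit (the model directory path is a placeholder, not a path from the repo):

    # Hypothetical usage of the slimmed-down Vocab.load(); 'path/to/model' stands
    # in for a directory containing vocab/strings.json and vocab/lexemes.bin.
    from pathlib import Path
    from spacy.vocab import Vocab

    model_dir = Path('path/to/model')      # assumption: a spaCy 1.x model directory
    vocab = Vocab.load(model_dir,
                       lemmatizer=False,   # skip Lemmatizer.load(path)
                       oov_prob=-20.0)     # still accepted
    # serializer_freqs=... is no longer part of the signature after this commit.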