Draft Vocab to/from disk/bytes

This commit is contained in:
Matthew Honnibal 2017-05-28 23:34:12 +02:00
parent e6dd01fc90
commit 2edd96ce47
1 changed file with 20 additions and 11 deletions

View File

@@ -275,9 +275,9 @@ cdef class Vocab:
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
strings_loc = path / 'strings.json' self.strings.to_disk(path / 'strings.json')
with strings_loc.open('w', encoding='utf8') as file_: with (path / 'lexemes.bin').open('wb') as file_:
self.strings.dump(file_) file_.write(self.lexemes_to_bytes())
def from_disk(self, path): def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
@@ -288,11 +288,10 @@ cdef class Vocab:
RETURNS (Vocab): The modified `Vocab` object. RETURNS (Vocab): The modified `Vocab` object.
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: self.strings.from_disk(path / 'strings.json')
strings_list = ujson.load(file_) with (path / 'lexemes.bin').open('rb') as file_:
for string in strings_list: self.lexemes_from_bytes(file_.read())
self.strings.add(string) return self
self.load_lexemes(path / 'lexemes.bin')
def to_bytes(self, **exclude): def to_bytes(self, **exclude):
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
@@ -300,7 +299,12 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being serialized. **exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vocab` object. RETURNS (bytes): The serialized form of the `Vocab` object.
""" """
raise NotImplementedError() data = {}
if 'strings' not in exclude:
data['strings'] = self.strings.to_bytes()
if 'lexemes' not in exclude:
data['lexemes'] = self.lexemes_to_bytes
return ujson.dumps(data)
def from_bytes(self, bytes_data, **exclude): def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string. """Load state from a binary string.
@@ -309,9 +313,14 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being loaded. **exclude: Named attributes to prevent from being loaded.
RETURNS (Vocab): The `Vocab` object. RETURNS (Vocab): The `Vocab` object.
""" """
raise NotImplementedError() data = ujson.loads(bytes_data)
if 'strings' not in exclude:
self.strings.from_bytes(data['strings'])
if 'lexemes' not in exclude:
self.lexemes_from_bytes(data['lexemes'])
return self
def lexemes_to_bytes(self, **exclude): def lexemes_to_bytes(self):
cdef hash_t key cdef hash_t key
cdef size_t addr cdef size_t addr
cdef LexemeC* lexeme = NULL cdef LexemeC* lexeme = NULL