mirror of https://github.com/explosion/spaCy.git
* Start trying to pickle Vocab
This commit is contained in:
parent
5ca57bd859
commit
85e7944572
|
@ -25,7 +25,6 @@ cdef struct _Cached:
|
|||
|
||||
|
||||
cdef class Vocab:
|
||||
cpdef public lexeme_props_getter
|
||||
cdef Pool mem
|
||||
cpdef readonly StringStore strings
|
||||
cpdef readonly Morphology morphology
|
||||
|
@ -33,7 +32,6 @@ cdef class Vocab:
|
|||
cdef public object _serializer
|
||||
cdef public object data_dir
|
||||
cdef public object get_lex_attr
|
||||
cdef public object pos_tags
|
||||
cdef public object serializer_freqs
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
|
||||
|
|
|
@ -10,6 +10,8 @@ from os import path
|
|||
import io
|
||||
import math
|
||||
import json
|
||||
import tempfile
|
||||
import copy_reg
|
||||
|
||||
from .lexeme cimport EMPTY_LEXEME
|
||||
from .lexeme cimport Lexeme
|
||||
|
@ -96,6 +98,18 @@ cdef class Vocab:
|
|||
"""The current number of lexemes stored."""
|
||||
return self.length
|
||||
|
||||
def __reduce__(self):
    """Support pickling by dumping the vocabulary to a temporary directory.

    Writes three files into a freshly created temporary directory:
    ``lexemes.bin`` (via ``self.dump``), ``strings.txt`` (via
    ``self.strings.dump``) and ``tag_map.json`` (the morphology's tag map
    serialized as JSON).

    Returns:
        A 4-tuple in the ``__reduce__`` protocol format asking pickle to
        rebuild the object by calling
        ``Vocab.from_dir(tmp_dir, self.get_lex_attr)``, with no extra state
        and no list items.

    NOTE(review): the temporary directory is not removed here; presumably
    it has to survive until the pickle is loaded -- confirm who is
    responsible for cleaning it up.
    """
    # The stdlib function is ``mkdtemp``; the previous ``mkdtmp`` spelling
    # raised AttributeError on every pickling attempt.
    tmp_dir = tempfile.mkdtemp()
    lex_loc = path.join(tmp_dir, 'lexemes.bin')
    str_loc = path.join(tmp_dir, 'strings.txt')
    map_loc = path.join(tmp_dir, 'tag_map.json')

    self.dump(lex_loc)
    self.strings.dump(str_loc)
    # Close the handle explicitly so the JSON is flushed to disk before
    # anything tries to read the directory back.
    with open(map_loc, 'w') as file_:
        json.dump(self.morphology.tag_map, file_)

    return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None)
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
|
@ -339,6 +353,9 @@ cdef class Vocab:
|
|||
return vec_len
|
||||
|
||||
|
||||
# Declare Vocab.from_dir as a valid constructor for pickling: it is the
# callable that Vocab.__reduce__ returns as the first element of its
# reduce tuple. copy_reg.constructor raises TypeError if the object passed
# to it is not callable.
copy_reg.constructor(Vocab.from_dir)
|
||||
|
||||
|
||||
def write_binary_vectors(in_loc, out_loc):
|
||||
cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
cdef Address mem
|
||||
|
|
Loading…
Reference in New Issue