diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 929c7b345..d850bf929 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -25,7 +25,6 @@ cdef struct _Cached:
 
 
 cdef class Vocab:
-    cpdef public lexeme_props_getter
     cdef Pool mem
     cpdef readonly StringStore strings
     cpdef readonly Morphology morphology
@@ -33,7 +32,6 @@ cdef class Vocab:
     cdef public object _serializer
     cdef public object data_dir
     cdef public object get_lex_attr
-    cdef public object pos_tags
     cdef public object serializer_freqs
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index af9161d6b..7f07a64ba 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -10,6 +10,8 @@ from os import path
 import io
 import math
 import json
+import tempfile
+import copy_reg
 
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
@@ -96,6 +98,26 @@ cdef class Vocab:
         """The current number of lexemes stored."""
         return self.length
 
+    def __reduce__(self):
+        """Support pickling by dumping state to a temporary directory.
+
+        Writes the lexemes, the strings and the morphology tag map into a
+        freshly created temp directory and returns a reduce tuple that calls
+        Vocab.from_dir(tmp_dir, self.get_lex_attr). The directory is not
+        removed here.
+        """
+        tmp_dir = tempfile.mkdtemp()
+        lex_loc = path.join(tmp_dir, 'lexemes.bin')
+        str_loc = path.join(tmp_dir, 'strings.txt')
+        map_loc = path.join(tmp_dir, 'tag_map.json')
+
+        self.dump(lex_loc)
+        self.strings.dump(str_loc)
+        # Close the file explicitly so the JSON is fully flushed to disk.
+        with open(map_loc, 'w') as file_:
+            json.dump(self.morphology.tag_map, file_)
+
+        return (Vocab.from_dir, (tmp_dir, self.get_lex_attr), None, None)
+
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
         '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
@@ -339,6 +361,9 @@ cdef class Vocab:
         return vec_len
 
 
+copy_reg.constructor(Vocab.from_dir)
+
+
 def write_binary_vectors(in_loc, out_loc):
     cdef CFile out_file = CFile(out_loc, 'wb')
     cdef Address mem