Make vocabs more compatible across versions

Previously, symbols were inserted into the string-store
before strings were loaded. This meant that adding a symbol
would shift the IDs assigned to the saved strings, and so
invalidate saved models. We now load the saved strings first,
so that they are restored faithfully and compatibility is maintained.
This commit is contained in:
Matthew Honnibal 2017-03-17 18:29:04 +01:00
parent 6f11f22204
commit 854cfce7cf
1 changed file with 8 additions and 5 deletions

View File

@ -98,16 +98,16 @@ cdef class Vocab:
else: else:
serializer_freqs = None serializer_freqs = None
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
self.strings.load(file_) strings_list = json.load(file_)
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
strings=strings_list)
self.load_lexemes(path / 'vocab' / 'lexemes.bin') self.load_lexemes(path / 'vocab' / 'lexemes.bin')
return self return self
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
serializer_freqs=None, **deprecated_kwargs): serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
'''Create the vocabulary. '''Create the vocabulary.
lex_attr_getters (dict): lex_attr_getters (dict):
@ -136,6 +136,9 @@ cdef class Vocab:
self._by_hash = PreshMap() self._by_hash = PreshMap()
self._by_orth = PreshMap() self._by_orth = PreshMap()
self.strings = StringStore() self.strings = StringStore()
if strings:
for string in strings:
self.strings[string]
# Load strings in a special order, so that we have an onset number for # Load strings in a special order, so that we have an onset number for
# the vocabulary. This way, when words are added in order, the orth ID # the vocabulary. This way, when words are added in order, the orth ID
# is the frequency rank of the word, plus a certain offset. The structural # is the frequency rank of the word, plus a certain offset. The structural