From 854cfce7cfd8c947cf4e4c629234b482dab3bd12 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 17 Mar 2017 18:29:04 +0100 Subject: [PATCH] Make vocabs more compatible across versions Previously, symbols were inserted into the string-store before strings were loaded. This meant that adding a symbol would invalidate saved models. We now make sure that strings are loaded faithfully, so that compatibility is maintained. --- spacy/vocab.pyx | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 8d41c48b3..ddd3134cb 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -98,16 +98,16 @@ cdef class Vocab: else: serializer_freqs = None - cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map, - lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) - with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: - self.strings.load(file_) + strings_list = json.load(file_) + cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map, + lemmatizer=lemmatizer, serializer_freqs=serializer_freqs, + strings=strings_list) self.load_lexemes(path / 'vocab' / 'lexemes.bin') return self def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, - serializer_freqs=None, **deprecated_kwargs): + serializer_freqs=None, strings=tuple(), **deprecated_kwargs): '''Create the vocabulary. lex_attr_getters (dict): @@ -136,6 +136,9 @@ cdef class Vocab: self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() + if strings: + for string in strings: + self.strings[string] # Load strings in a special order, so that we have an onset number for # the vocabulary. This way, when words are added in order, the orth ID # is the frequency rank of the word, plus a certain offset. The structural