mirror of https://github.com/explosion/spaCy.git
Make vocabs more compatible across versions
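Previously, symbols were inserted into the string-store before the saved strings were loaded. Because string IDs depend on insertion order, adding a new symbol would shift the IDs of the saved strings and thereby invalidate saved models. We now make sure that the strings are loaded faithfully, before anything else is inserted, so that compatibility is maintained.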
This commit is contained in:
parent
6f11f22204
commit
854cfce7cf
|
@ -98,16 +98,16 @@ cdef class Vocab:
|
||||||
else:
|
else:
|
||||||
serializer_freqs = None
|
serializer_freqs = None
|
||||||
|
|
||||||
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
|
|
||||||
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
|
|
||||||
|
|
||||||
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
||||||
self.strings.load(file_)
|
strings_list = json.load(file_)
|
||||||
|
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
|
||||||
|
lemmatizer=lemmatizer, serializer_freqs=serializer_freqs,
|
||||||
|
strings=strings_list)
|
||||||
self.load_lexemes(path / 'vocab' / 'lexemes.bin')
|
self.load_lexemes(path / 'vocab' / 'lexemes.bin')
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||||
serializer_freqs=None, **deprecated_kwargs):
|
serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
|
||||||
'''Create the vocabulary.
|
'''Create the vocabulary.
|
||||||
|
|
||||||
lex_attr_getters (dict):
|
lex_attr_getters (dict):
|
||||||
|
@ -136,6 +136,9 @@ cdef class Vocab:
|
||||||
self._by_hash = PreshMap()
|
self._by_hash = PreshMap()
|
||||||
self._by_orth = PreshMap()
|
self._by_orth = PreshMap()
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
|
if strings:
|
||||||
|
for string in strings:
|
||||||
|
self.strings[string]
|
||||||
# Load strings in a special order, so that we have an onset number for
|
# Load strings in a special order, so that we have an onset number for
|
||||||
# the vocabulary. This way, when words are added in order, the orth ID
|
# the vocabulary. This way, when words are added in order, the orth ID
|
||||||
# is the frequency rank of the word, plus a certain offset. The structural
|
# is the frequency rank of the word, plus a certain offset. The structural
|
||||||
|
|
Loading…
Reference in New Issue