mirror of https://github.com/explosion/spaCy.git
Clean all caches
This commit is contained in:
parent 4e378dc4a4
commit 91e2fa6561
@@ -559,13 +559,14 @@ class Language(object):
                     nr_seen += 1
                 elif len(old_refs) == 0:
                     old_refs, recent_refs = recent_refs, old_refs
-                    self.vocab.strings._cleanup_stale_strings()
+                    keys, strings = self.vocab.strings._cleanup_stale_strings()
+                    self.vocab._reset_cache(keys, strings)
+                    self.tokenizer._reset_cache(keys)
                     for string in original_strings_data:
                         self.vocab.strings.add(string)
                     nr_seen = 0
         # We can't know which strings from the last batch have really expired.
+        # So we don't erase the strings — we just extend with the original
+        # content.
         for string in original_strings_data:
             self.vocab.strings.add(string)
-        # So we don't erase the strings.
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
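Why the swap above is safe: a cleanup only fires once every Doc in the "old" generation has been garbage-collected, so no live Doc can still reference that batch's strings. Below is a minimal, self-contained sketch of that two-generation weakref scheme. The Pipeline class is an illustrative stand-in, not spaCy's API, and the batch threshold is an assumption; the threshold check itself sits just above the context shown in the hunk.

import weakref

class Pipeline:
    # Illustrative stand-in, not spaCy's API. The batch size is assumed
    # for the sketch; the hunk's visible context starts below that check.
    CLEANUP_BATCH = 10000

    def __init__(self):
        self.recent_refs = weakref.WeakSet()
        self.old_refs = weakref.WeakSet()
        self.nr_seen = 0

    def pipe(self, docs):
        for doc in docs:
            yield doc
            self.recent_refs.add(doc)
            if self.nr_seen < self.CLEANUP_BATCH:
                # Still filling the "old" generation.
                self.old_refs.add(doc)
                self.nr_seen += 1
            elif len(self.old_refs) == 0:
                # The WeakSet emptied itself: every doc in the old generation
                # was garbage-collected, so nothing can still be holding that
                # batch's strings. Rotate generations and clean the caches.
                self.old_refs, self.recent_refs = self.recent_refs, self.old_refs
                self.nr_seen = 0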
@@ -250,15 +250,23 @@ cdef class StringStore:
             self.add(string)
 
     def _cleanup_stale_strings(self):
         """
         RETURNS (keys, strings): Dropped keys and strings, so they can be evicted from other caches as well
         """
+        if self.hits.size() == 0:
+            # If we don't have any hits, just skip cleanup
+            return
         cdef vector[hash_t] tmp
+        dropped_strings = []
+        dropped_keys = []
         for i in range(self.keys.size()):
             key = self.keys[i]
             if self.hits.count(key) != 0:
                 tmp.push_back(key)
+            else:
+                dropped_keys.append(key)
+                dropped_strings.append(self[key])
 
         self.keys.swap(tmp)
         strings = list(self)
@@ -266,6 +274,8 @@ cdef class StringStore:
         # We keep the surviving strings, but their hit counts should be reset
         self.hits.clear()
 
+        return dropped_keys, dropped_strings
+
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
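Read together, the two strings.pyx hunks implement a small mark-and-sweep: keys that were looked up since the last sweep survive via the vector swap, and everything else is reported back as dropped_keys and dropped_strings so the dependent caches can evict the same entries. The dict-backed model below is purely illustrative (the real store keeps 64-bit hashes in C++ vectors), but follows the same logic, including the early return when no hits have been recorded yet:

class ToyStringStore:
    # Illustrative model only, not the Cython implementation.
    def __init__(self):
        self.strings = {}   # key -> string
        self.hits = set()   # keys looked up since the last sweep

    def add(self, string):
        key = hash(string)
        self.strings[key] = string
        return key

    def __getitem__(self, key):
        self.hits.add(key)  # mark: a lookup counts as a hit
        return self.strings[key]

    def cleanup_stale_strings(self):
        if not self.hits:
            # No usage recorded yet; a sweep would drop everything.
            return None
        # Sweep: anything not hit since the last sweep is stale.
        dropped_keys = [k for k in self.strings if k not in self.hits]
        dropped_strings = [self.strings.pop(k) for k in dropped_keys]
        # Keep the surviving strings, but start a fresh observation window.
        self.hits.clear()
        return dropped_keys, dropped_strings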
@@ -132,6 +132,11 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)
 
+    def _reset_cache(self, keys):
+        for k in keys:
+            del self._cache[k]
+            del self._specials[k]
+
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
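One caveat with the tokenizer hunk: both del statements assume every dropped key is present in the respective cache, which need not hold for strings that were interned without ever passing through the tokenizer, and only a minority of keys ever have a special-cases entry. A defensive, dict-based variant (an illustration, not the commit's code) would tolerate misses:

def reset_cache(cache, specials, keys):
    # Defensive sketch with plain dicts: evict stale keys from both caches
    # while tolerating keys that were never cached in one of them.
    for k in keys:
        cache.pop(k, None)
        specials.pop(k, None)

# Example: key 2 is missing from both caches and is silently ignored.
cache, specials = {1: "cached-doc"}, {}
reset_cache(cache, specials, [1, 2])
assert cache == {} and specials == {}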
@@ -465,6 +465,12 @@ cdef class Vocab:
         self._by_orth.set(lexeme.orth, lexeme)
         self.length += 1
 
+    def _reset_cache(self, keys, strings):
+        for k in keys:
+            del self._by_hash[k]
+
+        self._by_orth = PreshMap()
+
 
 def pickle_vocab(vocab):
     sstore = vocab.strings
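Taken together, the four hunks wire a single sweep through three caches: the StringStore decides what is stale, then the Vocab and Tokenizer evict the same keys. A hypothetical driver (the name clean_all_caches and the nlp handle are illustrative; the internal methods are the ones added above) makes the order of operations, and the early-return case, explicit:

def clean_all_caches(nlp, original_strings_data):
    # Hypothetical glue mirroring the new language.py logic, not a public API.
    result = nlp.vocab.strings._cleanup_stale_strings()
    if result is None:
        # The guard in _cleanup_stale_strings returned early: no hits were
        # recorded yet, so nothing was swept and the caches are untouched.
        return
    keys, strings = result
    # Evict the same keys from the caches that index into the string store.
    nlp.vocab._reset_cache(keys, strings)
    nlp.tokenizer._reset_cache(keys)
    # We can't know which strings from the last batch really expired, so
    # re-add the protected originals instead of risking dangling references.
    for string in original_strings_data:
        nlp.vocab.strings.add(string)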