diff --git a/spacy/language.py b/spacy/language.py
index b328cef36..d1c2cf1b2 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -559,13 +559,14 @@ class Language(object):
                     nr_seen += 1
                 elif len(old_refs) == 0:
                     old_refs, recent_refs = recent_refs, old_refs
-                    self.vocab.strings._cleanup_stale_strings()
+                    keys, strings = self.vocab.strings._cleanup_stale_strings()
+                    self.vocab._reset_cache(keys, strings)
+                    self.tokenizer._reset_cache(keys)
+                    for string in original_strings_data:
+                        self.vocab.strings.add(string)
                     nr_seen = 0
         # We can't know which strings from the last batch have really expired.
-        # So we don't erase the strings — we just extend with the original
-        # content.
-        for string in original_strings_data:
-            self.vocab.strings.add(string)
+        # So we don't erase the strings.
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index d08d96bb2..48c9a0cc8 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -250,15 +250,23 @@ cdef class StringStore:
             self.add(string)
 
     def _cleanup_stale_strings(self):
+        """
+        RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places
+        """
         if self.hits.size() == 0:
             # If we don't have any hits, just skip cleanup
             return
 
         cdef vector[hash_t] tmp
+        dropped_strings = []
+        dropped_keys = []
         for i in range(self.keys.size()):
             key = self.keys[i]
             if self.hits.count(key) != 0:
                 tmp.push_back(key)
+            else:
+                dropped_keys.append(key)
+                dropped_strings.append(self[key])
 
         self.keys.swap(tmp)
         strings = list(self)
@@ -266,6 +274,8 @@
         # Here we have strings but hits to it should be reseted
         self.hits.clear()
 
+        return dropped_keys, dropped_strings
+
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 67ff47743..543a04256 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -132,6 +132,11 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)
 
+    def _reset_cache(self, keys):
+        for k in keys:
+            del self._cache[k]
+            del self._specials[k]
+
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 675e4a805..122aa80dc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -465,6 +465,12 @@ cdef class Vocab:
         self._by_orth.set(lexeme.orth, lexeme)
         self.length += 1
 
+    def _reset_cache(self, keys, strings):
+        for k in keys:
+            del self._by_hash[k]
+
+        self._by_orth = PreshMap()
+
 
 def pickle_vocab(vocab):
     sstore = vocab.strings
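
For reference, the sequence this patch wires into Language.pipe once a batch of old Doc references has expired is sketched below in plain Python. The sketch is illustrative only: nlp stands in for the Language instance (self in the patch) and original_strings_data for the string snapshot taken earlier in pipe(); note that _cleanup_stale_strings() still returns None through its early exit when no hits have been recorded.

    keys, strings = nlp.vocab.strings._cleanup_stale_strings()   # hashes and strings that were dropped
    nlp.vocab._reset_cache(keys, strings)    # delete the dropped hashes from _by_hash and reset _by_orth
    nlp.tokenizer._reset_cache(keys)         # delete the same keys from the tokenizer _cache and _specials
    for string in original_strings_data:     # re-add the strings recorded before the batch
        nlp.vocab.strings.add(string)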