mirror of https://github.com/explosion/spaCy.git
Clean all caches
This commit is contained in:
parent 4e378dc4a4
commit 91e2fa6561
@@ -559,13 +559,14 @@ class Language(object):
                     nr_seen += 1
                 elif len(old_refs) == 0:
                     old_refs, recent_refs = recent_refs, old_refs
-                    self.vocab.strings._cleanup_stale_strings()
+                    keys, strings = self.vocab.strings._cleanup_stale_strings()
+                    self.vocab._reset_cache(keys, strings)
+                    self.tokenizer._reset_cache(keys)
                     for string in original_strings_data:
                         self.vocab.strings.add(string)
                     nr_seen = 0
         # We can't know which strings from the last batch have really expired.
+        # So we don't erase the strings — we just extend with the original
+        # content.
         for string in original_strings_data:
             self.vocab.strings.add(string)
-        # So we don't erase the strings.
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
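Why the swap above is safe: a cleanup only fires once every Doc in the "old" generation has been garbage-collected, so no live Doc can still reference that batch's strings. Below is a minimal, self-contained sketch of that two-generation weakref scheme. The Pipeline class is an illustrative stand-in, not spaCy's API, and the batch threshold is an assumption; the threshold check itself sits just above the context shown in the hunk.

import weakref

class Pipeline:
    # Illustrative stand-in, not spaCy's API. The batch size is assumed
    # for the sketch; the hunk's visible context starts below that check.
    CLEANUP_BATCH = 10000

    def __init__(self):
        self.recent_refs = weakref.WeakSet()
        self.old_refs = weakref.WeakSet()
        self.nr_seen = 0

    def pipe(self, docs):
        for doc in docs:
            yield doc
            self.recent_refs.add(doc)
            if self.nr_seen < self.CLEANUP_BATCH:
                # Still filling the "old" generation.
                self.old_refs.add(doc)
                self.nr_seen += 1
            elif len(self.old_refs) == 0:
                # The WeakSet emptied itself: every doc in the old generation
                # was garbage-collected, so nothing can still be holding that
                # batch's strings. Rotate generations and clean the caches.
                self.old_refs, self.recent_refs = self.recent_refs, self.old_refs
                self.nr_seen = 0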
@@ -250,15 +250,23 @@ cdef class StringStore:
             self.add(string)
 
     def _cleanup_stale_strings(self):
         """
         RETURNS (keys, strings): Dropped keys and strings, so they can be evicted from other caches as well
         """
+        if self.hits.size() == 0:
+            # If we don't have any hits, just skip cleanup
+            return
         cdef vector[hash_t] tmp
+        dropped_strings = []
+        dropped_keys = []
         for i in range(self.keys.size()):
             key = self.keys[i]
             if self.hits.count(key) != 0:
                 tmp.push_back(key)
+            else:
+                dropped_keys.append(key)
+                dropped_strings.append(self[key])
 
         self.keys.swap(tmp)
         strings = list(self)
@@ -266,6 +274,8 @@ cdef class StringStore:
         # We keep the surviving strings, but their hit counts should be reset
         self.hits.clear()
 
+        return dropped_keys, dropped_strings
+
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
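Read together, the two strings.pyx hunks implement a small mark-and-sweep: keys that were looked up since the last sweep survive via the vector swap, and everything else is reported back as dropped_keys and dropped_strings so the dependent caches can evict the same entries. The dict-backed model below is purely illustrative (the real store keeps 64-bit hashes in C++ vectors), but follows the same logic, including the early return when no hits have been recorded yet:

class ToyStringStore:
    # Illustrative model only, not the Cython implementation.
    def __init__(self):
        self.strings = {}   # key -> string
        self.hits = set()   # keys looked up since the last sweep

    def add(self, string):
        key = hash(string)
        self.strings[key] = string
        return key

    def __getitem__(self, key):
        self.hits.add(key)  # mark: a lookup counts as a hit
        return self.strings[key]

    def cleanup_stale_strings(self):
        if not self.hits:
            # No usage recorded yet; a sweep would drop everything.
            return None
        # Sweep: anything not hit since the last sweep is stale.
        dropped_keys = [k for k in self.strings if k not in self.hits]
        dropped_strings = [self.strings.pop(k) for k in dropped_keys]
        # Keep the surviving strings, but start a fresh observation window.
        self.hits.clear()
        return dropped_keys, dropped_strings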
@@ -132,6 +132,11 @@ cdef class Tokenizer:
         for text in texts:
             yield self(text)
 
+    def _reset_cache(self, keys):
+        for k in keys:
+            del self._cache[k]
+            del self._specials[k]
+
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
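One caveat with the tokenizer hunk: both del statements assume every dropped key is present in the respective cache, which need not hold for strings that were interned without ever passing through the tokenizer, and only a minority of keys ever have a special-cases entry. A defensive, dict-based variant (an illustration, not the commit's code) would tolerate misses:

def reset_cache(cache, specials, keys):
    # Defensive sketch with plain dicts: evict stale keys from both caches
    # while tolerating keys that were never cached in one of them.
    for k in keys:
        cache.pop(k, None)
        specials.pop(k, None)

# Example: key 2 is missing from both caches and is silently ignored.
cache, specials = {1: "cached-doc"}, {}
reset_cache(cache, specials, [1, 2])
assert cache == {} and specials == {}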
@@ -465,6 +465,12 @@ cdef class Vocab:
         self._by_orth.set(lexeme.orth, lexeme)
         self.length += 1
 
+    def _reset_cache(self, keys, strings):
+        for k in keys:
+            del self._by_hash[k]
+
+        self._by_orth = PreshMap()
+
 
 def pickle_vocab(vocab):
     sstore = vocab.strings
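Taken together, the four hunks wire a single sweep through three caches: the StringStore decides what is stale, then the Vocab and Tokenizer evict the same keys. A hypothetical driver (the name clean_all_caches and the nlp handle are illustrative; the internal methods are the ones added above) makes the order of operations, and the early-return case, explicit:

def clean_all_caches(nlp, original_strings_data):
    # Hypothetical glue mirroring the new language.py logic, not a public API.
    result = nlp.vocab.strings._cleanup_stale_strings()
    if result is None:
        # The guard in _cleanup_stale_strings returned early: no hits were
        # recorded yet, so nothing was swept and the caches are untouched.
        return
    keys, strings = result
    # Evict the same keys from the caches that index into the string store.
    nlp.vocab._reset_cache(keys, strings)
    nlp.tokenizer._reset_cache(keys)
    # We can't know which strings from the last batch really expired, so
    # re-add the protected originals instead of risking dangling references.
    for string in original_strings_data:
        nlp.vocab.strings.add(string)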