From 046c38bd265fdb504006f6ffbeccc5a9b765afee Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 1 Sep 2020 16:12:15 +0200
Subject: [PATCH] Remove 'cleanup' of strings (#6007)

A long time ago we went to some trouble to try to clean up "unused"
strings, to avoid the `StringStore` growing in long-running processes.
This never really worked reliably, and I think it was a really wrong
approach. It's much better to let the user reload the `nlp` object as
necessary, now that the string encoding is stable (in v1, the string IDs
were sequential integers, making reloading the NLP object really
annoying). The extra book-keeping does make some performance difference,
and the feature is unused, so it's past time we killed it.
---
 spacy/language.py | 29 -----------------------------
 spacy/strings.pxd |  1 -
 spacy/strings.pyx | 36 ------------------------------------
 3 files changed, 66 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e20bbdd80..8e7c39b90 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1314,7 +1314,6 @@ class Language:
         as_tuples: bool = False,
         batch_size: int = 1000,
         disable: Iterable[str] = SimpleFrozenList(),
-        cleanup: bool = False,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         n_process: int = 1,
     ):
@@ -1326,8 +1325,6 @@ class Language:
             (doc, context) tuples. Defaults to False.
         batch_size (int): The number of texts to buffer.
         disable (List[str]): Names of the pipeline components to disable.
-        cleanup (bool): If True, unneeded strings are freed to control memory
-            use. Experimental.
         component_cfg (Dict[str, Dict]): An optional dictionary with extra
             keyword arguments for specific components.
         n_process (int): Number of processors to process texts. If -1, set
             `multiprocessing.cpu_count()`.
@@ -1378,35 +1375,9 @@ class Language:
             for pipe in pipes:
                 docs = pipe(docs)
-        # Track weakrefs of "recent" documents, so that we can see when they
-        # expire from memory. When they do, we know we don't need old strings.
-        # This way, we avoid maintaining an unbounded growth in string entries
-        # in the string store.
-        recent_refs = weakref.WeakSet()
-        old_refs = weakref.WeakSet()
-        # Keep track of the original string data, so that if we flush old strings,
-        # we can recover the original ones. However, we only want to do this if we're
-        # really adding strings, to save up-front costs.
-        original_strings_data = None
         nr_seen = 0
         for doc in docs:
             yield doc
-            if cleanup:
-                recent_refs.add(doc)
-                if nr_seen < 10000:
-                    old_refs.add(doc)
-                    nr_seen += 1
-                elif len(old_refs) == 0:
-                    old_refs, recent_refs = recent_refs, old_refs
-                    if original_strings_data is None:
-                        original_strings_data = list(self.vocab.strings)
-                    else:
-                        keys, strings = self.vocab.strings._cleanup_stale_strings(
-                            original_strings_data
-                        )
-                        self.vocab._reset_cache(keys, strings)
-                        self.tokenizer._reset_cache(keys)
-                    nr_seen = 0

     def _multiprocessing_pipe(
         self,
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index ba2476ec7..07768d347 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -23,7 +23,6 @@ cdef class StringStore:
     cdef Pool mem

     cdef vector[hash_t] keys
-    cdef set[hash_t] hits
     cdef public PreshMap _map

     cdef const Utf8Str* intern_unicode(self, unicode py_string)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 136eda9ff..6a1d68221 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -127,7 +127,6 @@ cdef class StringStore:
             return SYMBOLS_BY_INT[string_or_id]
         else:
             key = string_or_id
-            self.hits.insert(key)
             utf8str = self._map.get(key)
             if utf8str is NULL:
                 raise KeyError(Errors.E018.format(hash_value=string_or_id))
@@ -198,7 +197,6 @@ cdef class StringStore:
         if key < len(SYMBOLS_BY_INT):
             return True
         else:
-            self.hits.insert(key)
             return self._map.get(key) is not NULL

     def __iter__(self):
@@ -210,7 +208,6 @@ cdef class StringStore:
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
-            self.hits.insert(key)
             utf8str = self._map.get(key)
             yield decode_Utf8Str(utf8str)
         # TODO: Iterate OOV here?
@@ -269,41 +266,9 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self.keys.clear()
-        self.hits.clear()
         for string in strings:
             self.add(string)

-    def _cleanup_stale_strings(self, excepted):
-        """
-        excepted (list): Strings that should not be removed.
-        RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places
-        """
-        if self.hits.size() == 0:
-            # If we don't have any hits, just skip cleanup
-            return
-
-        cdef vector[hash_t] tmp
-        dropped_strings = []
-        dropped_keys = []
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            # Here we cannot use __getitem__ because it also set hit.
-            utf8str = self._map.get(key)
-            value = decode_Utf8Str(utf8str)
-            if self.hits.count(key) != 0 or value in excepted:
-                tmp.push_back(key)
-            else:
-                dropped_keys.append(key)
-                dropped_strings.append(value)
-
-        self.keys.swap(tmp)
-        strings = list(self)
-        self._reset_and_load(strings)
-        # Here we have strings but hits to it should be reseted
-        self.hits.clear()
-
-        return dropped_keys, dropped_strings
-
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
@@ -319,6 +284,5 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, utf8_string, length)
         self._map.set(key, value)
-        self.hits.insert(key)
         self.keys.push_back(key)
         return value
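
Note for users with long-running or unbounded streams: with the automatic cleanup
removed, the alternative suggested in the commit message is to reload the `nlp`
object yourself once StringStore growth becomes a concern. The sketch below is
illustrative only and not part of this patch; the model name, chunk size, and
reload interval are placeholders, and it assumes a trained package such as
en_core_web_sm is installed.

import itertools
import spacy

def pipe_with_reload(texts, model="en_core_web_sm", reload_every=10000):
    # Yield Doc objects for a (possibly unbounded) stream of texts, reloading
    # the pipeline every `reload_every` texts so the StringStore cannot grow
    # without bound. String IDs are hash-based, so they stay stable across
    # reloads and annotations from different nlp instances remain comparable.
    texts = iter(texts)
    nlp = spacy.load(model)
    while True:
        chunk = list(itertools.islice(texts, reload_every))
        if not chunk:
            break
        yield from nlp.pipe(chunk, batch_size=1000)
        # Replace the pipeline, and with it the Vocab and StringStore. Docs
        # already yielded still reference the old vocab, so its strings are
        # only freed once those Docs are garbage-collected.
        nlp = spacy.load(model)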