From 046c38bd265fdb504006f6ffbeccc5a9b765afee Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 1 Sep 2020 16:12:15 +0200
Subject: [PATCH] Remove 'cleanup' of strings (#6007)

A long time ago we went to some trouble to try to clean up "unused"
strings, to avoid the `StringStore` growing in long-running processes.
This never really worked reliably, and I think it was a really wrong
approach. It's much better to let the user reload the `nlp` object as
necessary, now that the string encoding is stable (in v1, the string IDs
were sequential integers, making reloading the NLP object really
annoying). The extra book-keeping does make some performance difference,
and the feature is unused, so it's past time we killed it.
---
 spacy/language.py | 29 -----------------------------
 spacy/strings.pxd |  1 -
 spacy/strings.pyx | 36 ------------------------------------
 3 files changed, 66 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index e20bbdd80..8e7c39b90 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1314,7 +1314,6 @@ class Language:
         as_tuples: bool = False,
         batch_size: int = 1000,
         disable: Iterable[str] = SimpleFrozenList(),
-        cleanup: bool = False,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         n_process: int = 1,
     ):
@@ -1326,8 +1325,6 @@ class Language:
             (doc, context) tuples. Defaults to False.
         batch_size (int): The number of texts to buffer.
         disable (List[str]): Names of the pipeline components to disable.
-        cleanup (bool): If True, unneeded strings are freed to control memory
-            use. Experimental.
         component_cfg (Dict[str, Dict]): An optional dictionary with extra
             keyword arguments for specific components.
         n_process (int): Number of processors to process texts. If -1, set
             `multiprocessing.cpu_count()`.
@@ -1378,35 +1375,9 @@ class Language:
             for pipe in pipes:
                 docs = pipe(docs)
-        # Track weakrefs of "recent" documents, so that we can see when they
-        # expire from memory. When they do, we know we don't need old strings.
-        # This way, we avoid maintaining an unbounded growth in string entries
-        # in the string store.
-        recent_refs = weakref.WeakSet()
-        old_refs = weakref.WeakSet()
-        # Keep track of the original string data, so that if we flush old strings,
-        # we can recover the original ones. However, we only want to do this if we're
-        # really adding strings, to save up-front costs.
-        original_strings_data = None
         nr_seen = 0
         for doc in docs:
             yield doc
-            if cleanup:
-                recent_refs.add(doc)
-                if nr_seen < 10000:
-                    old_refs.add(doc)
-                    nr_seen += 1
-                elif len(old_refs) == 0:
-                    old_refs, recent_refs = recent_refs, old_refs
-                    if original_strings_data is None:
-                        original_strings_data = list(self.vocab.strings)
-                    else:
-                        keys, strings = self.vocab.strings._cleanup_stale_strings(
-                            original_strings_data
-                        )
-                        self.vocab._reset_cache(keys, strings)
-                        self.tokenizer._reset_cache(keys)
-                    nr_seen = 0

     def _multiprocessing_pipe(
         self,
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index ba2476ec7..07768d347 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -23,7 +23,6 @@ cdef class StringStore:
     cdef Pool mem

     cdef vector[hash_t] keys
-    cdef set[hash_t] hits
     cdef public PreshMap _map

     cdef const Utf8Str* intern_unicode(self, unicode py_string)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 136eda9ff..6a1d68221 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -127,7 +127,6 @@ cdef class StringStore:
             return SYMBOLS_BY_INT[string_or_id]
         else:
             key = string_or_id
-            self.hits.insert(key)
             utf8str = self._map.get(key)
             if utf8str is NULL:
                 raise KeyError(Errors.E018.format(hash_value=string_or_id))
@@ -198,7 +197,6 @@ cdef class StringStore:
         if key < len(SYMBOLS_BY_INT):
             return True
         else:
-            self.hits.insert(key)
             return self._map.get(key) is not NULL

     def __iter__(self):
@@ -210,7 +208,6 @@ cdef class StringStore:
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
-            self.hits.insert(key)
             utf8str = self._map.get(key)
             yield decode_Utf8Str(utf8str)
         # TODO: Iterate OOV here?
@@ -269,41 +266,9 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self.keys.clear()
-        self.hits.clear()
         for string in strings:
             self.add(string)

-    def _cleanup_stale_strings(self, excepted):
-        """
-        excepted (list): Strings that should not be removed.
-        RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places
-        """
-        if self.hits.size() == 0:
-            # If we don't have any hits, just skip cleanup
-            return
-
-        cdef vector[hash_t] tmp
-        dropped_strings = []
-        dropped_keys = []
-        for i in range(self.keys.size()):
-            key = self.keys[i]
-            # Here we cannot use __getitem__ because it also set hit.
-            utf8str = self._map.get(key)
-            value = decode_Utf8Str(utf8str)
-            if self.hits.count(key) != 0 or value in excepted:
-                tmp.push_back(key)
-            else:
-                dropped_keys.append(key)
-                dropped_strings.append(value)
-
-        self.keys.swap(tmp)
-        strings = list(self)
-        self._reset_and_load(strings)
-        # Here we have strings but hits to it should be reseted
-        self.hits.clear()
-
-        return dropped_keys, dropped_strings
-
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode("utf8")
@@ -319,6 +284,5 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, utf8_string, length)
         self._map.set(key, value)
-        self.hits.insert(key)
         self.keys.push_back(key)
         return value
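
Note for users with long-running or unbounded streams: with the automatic cleanup
removed, the alternative suggested in the commit message is to reload the `nlp`
object yourself once StringStore growth becomes a concern. The sketch below is
illustrative only and not part of this patch; the model name, chunk size, and
reload interval are placeholders, and it assumes a trained package such as
en_core_web_sm is installed.

import itertools
import spacy

def pipe_with_reload(texts, model="en_core_web_sm", reload_every=10000):
    # Yield Doc objects for a (possibly unbounded) stream of texts, reloading
    # the pipeline every `reload_every` texts so the StringStore cannot grow
    # without bound. String IDs are hash-based, so they stay stable across
    # reloads and annotations from different nlp instances remain comparable.
    texts = iter(texts)
    nlp = spacy.load(model)
    while True:
        chunk = list(itertools.islice(texts, reload_every))
        if not chunk:
            break
        yield from nlp.pipe(chunk, batch_size=1000)
        # Replace the pipeline, and with it the Vocab and StringStore. Docs
        # already yielded still reference the old vocab, so its strings are
        # only freed once those Docs are garbage-collected.
        nlp = spacy.load(model)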