From a33d5a068d073dc1a283edcacab692ea83a5d5b0 Mon Sep 17 00:00:00 2001 From: Roman Domrachev Date: Tue, 14 Nov 2017 22:40:03 +0300 Subject: [PATCH] Try to hold origin data instead of restore it --- spacy/language.py | 6 +----- spacy/strings.pyx | 7 ++++--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index d1c2cf1b2..c43f4e4c5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -559,14 +559,10 @@ class Language(object): nr_seen += 1 elif len(old_refs) == 0: old_refs, recent_refs = recent_refs, old_refs - keys, strings = self.vocab.strings._cleanup_stale_strings() + keys, strings = self.vocab.strings._cleanup_stale_strings(original_strings_data) self.vocab._reset_cache(keys, strings) self.tokenizer._reset_cache(keys) - for string in original_strings_data: - self.vocab.strings.add(string) nr_seen = 0 - # We can't know which strings from the last batch have really expired. - # So we don't erase the strings. def to_disk(self, path, disable=tuple()): """Save the current state to a directory. If a model is loaded, this diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 48c9a0cc8..4243c8193 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -249,7 +249,7 @@ cdef class StringStore: for string in strings: self.add(string) - def _cleanup_stale_strings(self): + def _cleanup_stale_strings(self, excepted): """ RETURNS (keys, strings): Dropped strings and keys that can be dropped from other places """ @@ -262,11 +262,12 @@ cdef class StringStore: dropped_keys = [] for i in range(self.keys.size()): key = self.keys[i] - if self.hits.count(key) != 0: + value = self[key] + if self.hits.count(key) != 0 or value in excepted: tmp.push_back(key) else: dropped_keys.append(key) - dropped_strings.append(self[key]) + dropped_strings.append(value) self.keys.swap(tmp) strings = list(self)