From a2745b0e84f15867758fca2867500fba9784623c Mon Sep 17 00:00:00 2001 From: Roman Domrachev Date: Tue, 14 Nov 2017 17:45:50 +0300 Subject: [PATCH] StringStore now actually cleaned Do not lose docs in ref tracking --- spacy/language.py | 1 + spacy/strings.pyx | 3 +++ spacy/tests/regression/test_issue1506.py | 22 +++++++++++++++++++--- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 739e7665d..d42c75fa9 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -558,6 +558,7 @@ class Language(object): old_refs.add(doc) nr_seen += 1 elif len(old_refs) == 0: + old_refs, recent_refs = recent_refs, old_refs self.vocab.strings._cleanup_stale_strings() nr_seen = 0 # Last batch can be not garbage collected and we cannot know it — last diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b8628cef1..f088b955c 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -260,6 +260,9 @@ cdef class StringStore: if self.hits.count(key) != 0: tmp.push_back(key) + strings = list(self) + self._reset_and_load(strings) + self.keys.swap(tmp) self.hits.clear() diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py index d9ba1ac97..338126d3a 100644 --- a/spacy/tests/regression/test_issue1506.py +++ b/spacy/tests/regression/test_issue1506.py @@ -1,6 +1,8 @@ # coding: utf8 from __future__ import unicode_literals +import gc + from ...lang.en import English @@ -11,12 +13,26 @@ def test_issue1506(): for _ in range(10001): yield "It's sentence produced by that bug." + yield "Oh snap." + for _ in range(10001): yield "I erase lemmas." for _ in range(10001): yield "It's sentence produced by that bug." - for d in nlp.pipe(string_generator()): - for t in d: - str(t.lemma_) + for _ in range(10001): + yield "It's sentence produced by that bug." + + anchor = None + remember = None + for i, d in enumerate(nlp.pipe(string_generator())): + if i == 9999: + anchor = d + elif 10001 == i: + remember = d + elif i == 10002: + del anchor + gc.collect() + + assert remember.text == 'Oh snap.'