From 3c600adf23e2ed08bd91bbd5dec3972f8f723492 Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Sat, 11 Nov 2017 03:11:27 +0300
Subject: [PATCH] Try to fix StringStore clean up (see #1506)

---
 spacy/language.py                        | 18 +++++----------
 spacy/strings.pxd                        |  2 ++
 spacy/strings.pyx                        | 20 +++++++++++++++++
 spacy/tests/regression/test_issue1506.py | 28 ++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 12 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue1506.py

diff --git a/spacy/language.py b/spacy/language.py
index 92ed6133c..c4a282793 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -12,6 +12,7 @@ from copy import copy
 from thinc.neural import Model
 from thinc.neural.optimizers import Adam
 
+from .strings import StringStore
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
@@ -547,27 +548,20 @@ class Language(object):
         # in the string store.
         recent_refs = weakref.WeakSet()
         old_refs = weakref.WeakSet()
-        original_strings_data = self.vocab.strings.to_bytes()
-        StringStore = self.vocab.strings.__class__
-        recent_strings = StringStore().from_bytes(original_strings_data)
+        # Snapshot the strings currently in the store so that, after the
+        # stream is exhausted, the original contents can be restored.
+        original_strings_data = list(self.vocab.strings)
         nr_seen = 0
         for doc in docs:
             yield doc
-            for word in doc:
-                recent_strings.add(word.text)
             recent_refs.add(doc)
             if nr_seen < 10000:
                 old_refs.add(doc)
                 nr_seen += 1
             elif len(old_refs) == 0:
-                # All the docs in the 'old' set have expired, so the only
-                # difference between the backup strings and the current
-                # string-store should be obsolete. We therefore swap out the
-                # old strings data.
-                old_refs, recent_refs = recent_refs, old_refs
-                self.vocab.strings._reset_and_load(recent_strings)
-                recent_strings = StringStore().from_bytes(original_strings_data)
+                self.vocab.strings._cleanup_stale_strings()
                 nr_seen = 0
+        self.vocab.strings._reset_and_load(original_strings_data)
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 4f987baed..e436fb33b 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,5 +1,6 @@
 from libc.stdint cimport int64_t
 from libcpp.vector cimport vector
+from libcpp.set cimport set
 
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
@@ -23,6 +24,7 @@ cdef class StringStore:
     cdef Pool mem
 
     cdef vector[hash_t] keys
+    cdef set[hash_t] hits
     cdef public PreshMap _map
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 647f140bb..b8628cef1 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -4,6 +4,7 @@ from __future__ import unicode_literals, absolute_import
 
 cimport cython
 from libc.string cimport memcpy
+from libcpp.set cimport set
 from libc.stdint cimport uint32_t
 from murmurhash.mrmr cimport hash64, hash32
 import ujson
@@ -111,6 +112,7 @@ cdef class StringStore:
             return SYMBOLS_BY_INT[string_or_id]
         else:
             key = string_or_id
+            self.hits.insert(key)
             utf8str = self._map.get(key)
             if utf8str is NULL:
                 raise KeyError(string_or_id)
@@ -168,6 +170,7 @@ cdef class StringStore:
         if key < len(SYMBOLS_BY_INT):
             return True
         else:
+            self.hits.insert(key)
             return self._map.get(key) is not NULL
 
     def __iter__(self):
@@ -179,6 +182,7 @@ cdef class StringStore:
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
+            self.hits.insert(key)
             utf8str = self._map.get(key)
             yield decode_Utf8Str(utf8str)
         # TODO: Iterate OOV here?
@@ -241,9 +245,24 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self.keys.clear()
+        self.hits.clear()
         for string in strings:
             self.add(string)
 
+    def _cleanup_stale_strings(self):
+        if self.hits.size() == 0:
+            # No strings were marked as used since the last reset, so
+            # skip the cleanup rather than wipe the whole store.
+            return
+
+        cdef vector[hash_t] tmp
+        for i in range(self.keys.size()):
+            key = self.keys[i]
+            if self.hits.count(key) != 0:
+                tmp.push_back(key)
+
+        self.keys.swap(tmp)
+        self.hits.clear()
+
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
@@ -259,5 +278,6 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, utf8_string, length)
         self._map.set(key, value)
+        self.hits.insert(key)
         self.keys.push_back(key)
         return value
diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py
new file mode 100644
index 000000000..1a4ba6399
--- /dev/null
+++ b/spacy/tests/regression/test_issue1506.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import random
+import string
+
+import itertools
+from ...compat import izip
+
+from ...lang.en import English
+
+
+def test_issue1506():
+    nlp = English()
+
+    def string_generator():
+        for (_, t) in izip(range(10001), itertools.repeat("It's sentence produced by that bug.")):
+            yield t
+
+        for (_, t) in izip(range(10001), itertools.repeat("I erase lemmas.")):
+            yield t
+
+        for (_, t) in izip(range(10001), itertools.repeat("It's sentence produced by that bug.")):
+            yield t
+
+    for d in nlp.pipe(string_generator()):
+        for t in d:
+            str(t.lemma_)