From 59c216196cb0a502ca9214318a17efa4934b1268 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:22:11 +0200
Subject: [PATCH 1/7] Allow weakrefs on Doc objects

---
 spacy/tokens/doc.pxd | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index ad2b9876d..f34c455c6 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -54,6 +54,8 @@ cdef class Doc:
 
     cdef public object noun_chunks_iterator
 
+    cdef object __weakref__
+
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
 
     cpdef np.ndarray to_array(self, object features)

From 5c14f3f033232b9329183148e706c0884d9d043f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:22:40 +0200
Subject: [PATCH 2/7] Create a rolling buffer for the StringStore in
 Language.pipe()

---
 spacy/language.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 047c94a37..f092c9806 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -8,6 +8,7 @@ import random
 import ujson
 from collections import OrderedDict
 import itertools
+import weakref
 
 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -510,8 +511,33 @@ class Language(object):
             else:
                 # Apply the function, but yield the doc
                 docs = _pipe(proc, docs)
+        # Track weakrefs of "recent" documents, so that we can see when they
+        # expire from memory. When they do, we know we don't need old strings.
+        # This way, we avoid unbounded growth in the number of entries in the
+        # string store.
+        recent_refs = weakref.WeakSet()
+        old_refs = weakref.WeakSet()
+        original_strings_data = self.vocab.strings.to_bytes()
+        StringStore = self.vocab.strings.__class__
+        recent_strings = StringStore().from_bytes(original_strings_data)
+        nr_seen = 0
         for doc in docs:
             yield doc
+            for word in doc:
+                recent_strings.add(word.text)
+            recent_refs.add(doc)
+            if nr_seen < 1000:
+                old_refs.add(doc)
+                nr_seen += 1
+            elif len(old_refs) == 0:
+                # All the docs in the 'old' set have expired, so the only
+                # difference between the backup strings and the current
+                # string store should be obsolete strings. We can therefore
+                # swap out the old strings data.
+                old_refs, recent_refs = recent_refs, old_refs
+                self.vocab.strings._reset_and_load(recent_strings)
+                recent_strings = StringStore().from_bytes(original_strings_data)
+                nr_seen = 0
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
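The scheme in PATCH 2/7 is a two-generation expiry check: a string may only be discarded once no live Doc could still look it up, and the weakly referenced "old" generation of docs is the probe for that condition. Only when every doc in old_refs has been garbage-collected is it safe to reset the store to the recently seen strings plus the original baseline. Here is a minimal sketch of the same scheme outside spaCy (pipe_with_rolling_strings, the plain-set store, and the doc.words attribute are hypothetical stand-ins, not spaCy API):

    import weakref

    def pipe_with_rolling_strings(docs, store, baseline, buffer_size=1000):
        # `store` is a mutable set of interned strings; `baseline` is a
        # snapshot of the strings that must survive every reset.
        recent_refs = weakref.WeakSet()    # docs yielded since the last swap
        old_refs = weakref.WeakSet()       # the previous generation of docs
        recent_strings = set(baseline)     # strings recent docs may need
        nr_seen = 0
        for doc in docs:
            yield doc
            recent_strings.update(doc.words)
            recent_refs.add(doc)
            if nr_seen < buffer_size:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
                # Every doc in the old generation has been collected, so
                # nothing alive can still need the strings dropped here.
                old_refs, recent_refs = recent_refs, old_refs
                store.clear()
                store.update(recent_strings)
                recent_strings = set(baseline)
                nr_seen = 0

Note the swap: the docs seen since the last reset become the new "old" generation, so the check never discards a string that a still-living doc might need.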
From 3e037054c88476e11ca6c0bc2e0ee2ce32d0997e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:23:10 +0200
Subject: [PATCH 3/7] Remove obsolete is_frozen functionality from StringStore

---
 spacy/strings.pxd |  2 --
 spacy/strings.pyx | 21 +--------------------
 2 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 0ad403cf1..4f987baed 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -21,11 +21,9 @@ ctypedef union Utf8Str:
 
 cdef class StringStore:
     cdef Pool mem
-    cdef bint is_frozen
 
     cdef vector[hash_t] keys
     cdef public PreshMap _map
-    cdef public PreshMap _oov
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 6f676c79a..29a706996 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -86,8 +86,6 @@ cdef class StringStore:
         """
         self.mem = Pool()
         self._map = PreshMap()
-        self._oov = PreshMap()
-        self.is_frozen = freeze
         if strings is not None:
             for string in strings:
                 self.add(string)
@@ -243,21 +241,12 @@ cdef class StringStore:
             self.add(word)
         return self
 
-    def set_frozen(self, bint is_frozen):
-        # TODO
-        self.is_frozen = is_frozen
-
-    def flush_oov(self):
-        self._oov = PreshMap()
-
-    def _reset_and_load(self, strings, freeze=False):
+    def _reset_and_load(self, strings):
         self.mem = Pool()
         self._map = PreshMap()
-        self._oov = PreshMap()
         self.keys.clear()
         for string in strings:
             self.add(string)
-        self.is_frozen = freeze
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
@@ -275,14 +264,6 @@ cdef class StringStore:
         value = self._oov.get(key)
         if value is not NULL:
             return value
-        if self.is_frozen:
-            # OOV store uses 32 bit hashes. Pretty ugly :(
-            key32 = hash32_utf8(utf8_string, length)
-            # Important: Make the OOV store own the memory. That way it's trivial
-            # to flush them all.
-            value = _allocate(self._oov.mem, utf8_string, length)
-            self._oov.set(key32, value)
-            return NULL
 
         value = _allocate(self.mem, utf8_string, length)
         self._map.set(key, value)
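After PATCH 3/7, resetting the store is trivial, which is exactly what the rolling buffer in PATCH 2/7 needs from _reset_and_load(). One leftover self._oov lookup survives in _intern_utf8() here; PATCH 5/7 below removes it. As a Python-only toy model of the simplified store (hypothetical: the real StringStore interns UTF-8 bytes in a Cython Pool and keys them with a 64-bit hash, not Python's built-in hash()):

    class ToyStringStore(object):
        def __init__(self, strings=None):
            self._map = {}                   # key -> string, like the PreshMap
            if strings is not None:
                for string in strings:
                    self.add(string)

        def add(self, string):
            # Intern a string and return its key.
            key = hash(string)
            self._map.setdefault(key, string)
            return key

        def _reset_and_load(self, strings):
            # With the freeze flag and the OOV side-store gone, a reset is
            # just: drop everything, then re-intern the strings to keep.
            self._map = {}
            for string in strings:
                self.add(string)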
From a002264fec3f49e85f530bf8cb3d16be0a049071 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:34:21 +0200
Subject: [PATCH 4/7] Remove caching of Token in Doc, as it caused a cycle

---
 spacy/tokens/doc.pyx   | 13 ++-----------
 spacy/tokens/token.pxd |  3 ---
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 05d393d2b..bf48cf4f5 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -140,7 +140,6 @@ cdef class Doc:
         self.user_span_hooks = {}
         self.tensor = numpy.zeros((0,), dtype='float32')
         self.user_data = {}
-        self._py_tokens = []
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef unicode orth
@@ -209,10 +208,7 @@ cdef class Doc:
         if i < 0:
             i = self.length + i
         bounds_check(i, self.length, PADDING)
-        if self._py_tokens[i] is not None:
-            return self._py_tokens[i]
-        else:
-            return Token.cinit(self.vocab, &self.c[i], i, self)
+        return Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __iter__(self):
         """Iterate over `Token` objects, from which the annotations can be
@@ -226,10 +222,7 @@ cdef class Doc:
         """
         cdef int i
         for i in range(self.length):
-            if self._py_tokens[i] is not None:
-                yield self._py_tokens[i]
-            else:
-                yield Token.cinit(self.vocab, &self.c[i], i, self)
+            yield Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __len__(self):
         """The number of tokens in the document.
@@ -535,7 +528,6 @@ cdef class Doc:
         self.length += 1
         # Set morphological attributes, e.g. by lemma, if possible
         self.vocab.morphology.assign_untagged(t)
-        self._py_tokens.append(None)
         return t.idx + t.lex.length + t.spacy
 
     @cython.boundscheck(False)
@@ -841,7 +833,6 @@ cdef class Doc:
         # Set the left/right children, left/right edges
         set_children_from_heads(self.c, self.length)
         # Clear the cached Python objects
-        self._py_tokens = [None] * self.length
         # Return the merged Python object
         return self[start]
 
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index f63a0490c..b408e04eb 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -19,10 +19,7 @@ cdef class Token:
         if offset < 0 or offset >= doc.length:
             msg = "Attempt to access token at %d, max length %d"
             raise IndexError(msg % (offset, doc.length))
-        if doc._py_tokens[offset] != None:
-            return doc._py_tokens[offset]
         cdef Token self = Token.__new__(Token, vocab, doc, offset)
-        doc._py_tokens[offset] = self
         return self
 
     #cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):

From 66e2eb8f397c82505d5b44c1b52071fcda2a5a1c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:34:41 +0200
Subject: [PATCH 5/7] Clean up remnant of frozen functionality in StringStore

---
 spacy/strings.pyx | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 29a706996..e6926a75d 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -261,10 +261,6 @@ cdef class StringStore:
         cdef Utf8Str* value = self._map.get(key)
         if value is not NULL:
             return value
-        value = self._oov.get(key)
-        if value is not NULL:
-            return value
-
         value = _allocate(self.mem, utf8_string, length)
         self._map.set(key, value)
         self.keys.push_back(key)

From 2bc06e4b222c7f38505235b30105bca1d15bf286 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:38:29 +0200
Subject: [PATCH 6/7] Bump rolling buffer size to 10k

---
 spacy/language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index f092c9806..7fd56ed56 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -526,7 +526,7 @@ class Language(object):
             for word in doc:
                 recent_strings.add(word.text)
             recent_refs.add(doc)
-            if nr_seen < 1000:
+            if nr_seen < 10000:
                 old_refs.add(doc)
                 nr_seen += 1
             elif len(old_refs) == 0:
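PATCH 4/7 is what makes the weakref bookkeeping from PATCH 2/7 reliable. A Doc that stores its own Token wrappers, while every Token points back at its Doc, forms a reference cycle; CPython reclaims cycles only in the cyclic garbage collector, not by reference counting, so such docs would linger in old_refs and the len(old_refs) == 0 condition might never fire at a useful time. A small demonstration of the difference (CachingDoc and PlainDoc are hypothetical stand-ins for Doc before and after this patch):

    import gc
    import weakref

    class Token(object):
        def __init__(self, doc, i):
            self.doc = doc                  # Token -> Doc back-reference
            self.i = i

    class CachingDoc(object):
        # Stores its Token wrappers, like Doc before this patch.
        def __init__(self, n):
            self._py_tokens = [Token(self, i) for i in range(n)]

    class PlainDoc(object):
        # Builds Token wrappers on demand, like Doc after this patch.
        def __init__(self, n):
            self.length = n

        def __getitem__(self, i):
            return Token(self, i)           # fresh wrapper, no stored cycle

    refs = weakref.WeakSet()
    doc = CachingDoc(3)
    refs.add(doc)
    del doc
    print(len(refs))    # 1: the Doc<->Token cycle keeps the doc alive
    gc.collect()
    print(len(refs))    # 0: only the cyclic collector could reclaim it

    doc = PlainDoc(3)
    refs.add(doc)
    del doc
    print(len(refs))    # 0: reference counting frees it immediately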
From 41744771611305363484f046b0271b5f0ea071aa Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:50:35 +0200
Subject: [PATCH 7/7] Fix equality check in test

---
 spacy/tests/parser/test_parse_navigate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py
index 4d909f0d6..da59b0b59 100644
--- a/spacy/tests/parser/test_parse_navigate.py
+++ b/spacy/tests/parser/test_parse_navigate.py
@@ -57,9 +57,9 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
     for head in doc:
         for child in head.lefts:
-            assert child.head is head
+            assert child.head == head
         for child in head.rights:
-            assert child.head is head
+            assert child.head == head
 
 
 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
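The test fix follows directly from PATCH 4/7: with the cache gone, indexing a Doc builds a fresh Token wrapper on every access, so two lookups of the same position are equal but no longer identical, and child.head is head can fail even when the parse is correct. Roughly (assuming a loaded pipeline nlp; illustration only):

    doc = nlp(u'The dog chased the cat')
    first = doc[1]
    second = doc[1]
    assert first == second        # same underlying token data
    assert first is not second    # distinct wrapper objects per access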