mirror of https://github.com/explosion/spaCy.git
Merge pull request #1424 from explosion/feature/streaming-data-memory-growth
💫 Fix streaming data memory growth (!!)
Commit: fc797a58de
@@ -8,6 +8,7 @@ import random
import ujson
from collections import OrderedDict
import itertools
import weakref

from .tokenizer import Tokenizer
from .vocab import Vocab
@@ -510,8 +511,33 @@ class Language(object):
            else:
                # Apply the function, but yield the doc
                docs = _pipe(proc, docs)
        # Track weakrefs of "recent" documents, so that we can see when they
        # expire from memory. When they do, we know we don't need old strings.
        # This way, we avoid maintaining an unbounded growth in string entries
        # in the string store.
        recent_refs = weakref.WeakSet()
        old_refs = weakref.WeakSet()
        original_strings_data = self.vocab.strings.to_bytes()
        StringStore = self.vocab.strings.__class__
        recent_strings = StringStore().from_bytes(original_strings_data)
        nr_seen = 0
        for doc in docs:
            yield doc
            for word in doc:
                recent_strings.add(word.text)
            recent_refs.add(doc)
            if nr_seen < 10000:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
                # All the docs in the 'old' set have expired, so the only
                # difference between the backup strings and the current
                # string-store should be obsolete. We therefore swap out the
                # old strings data.
                old_refs, recent_refs = recent_refs, old_refs
                self.vocab.strings._reset_and_load(recent_strings)
                recent_strings = StringStore().from_bytes(original_strings_data)
                nr_seen = 0

    def to_disk(self, path, disable=tuple()):
        """Save the current state to a directory. If a model is loaded, this
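The core of the change is the block above in `Language.pipe`: two `weakref.WeakSet`s split the streamed docs into an "old" batch and a "recent" batch, and the string store is only reset once every doc in the old batch has been garbage-collected, so no live `Doc` can still point at a string that gets dropped. A minimal standalone sketch of that pattern, assuming toy `TinyStringStore`/`TinyDoc` classes and a `flush_every` parameter that are illustrative rather than spaCy API:

import weakref


class TinyStringStore(object):
    """Toy stand-in for spaCy's StringStore: a plain set of interned strings."""
    def __init__(self, strings=()):
        self.strings = set(strings)

    def add(self, string):
        self.strings.add(string)

    def _reset_and_load(self, other):
        # Forget everything and keep only the recently seen strings.
        self.strings = set(other.strings)


class TinyDoc(object):
    """Toy document that interns its words into the shared store."""
    def __init__(self, text, store):
        self.words = text.split()
        for word in self.words:
            store.add(word)


def pipe(texts, store, base_strings=(), flush_every=10000):
    recent_refs = weakref.WeakSet()
    old_refs = weakref.WeakSet()
    recent_strings = TinyStringStore(base_strings)
    nr_seen = 0
    for text in texts:
        doc = TinyDoc(text, store)
        yield doc
        for word in doc.words:
            recent_strings.add(word)
        recent_refs.add(doc)
        if nr_seen < flush_every:
            old_refs.add(doc)
            nr_seen += 1
        elif len(old_refs) == 0:
            # Every doc in the "old" batch has been garbage-collected, so no
            # live object can still use a string seen only in that batch:
            # swap the batches and rebuild the store from the recent strings.
            old_refs, recent_refs = recent_refs, old_refs
            store._reset_and_load(recent_strings)
            recent_strings = TinyStringStore(base_strings)
            nr_seen = 0

As long as the caller does not keep the yielded docs alive, the store holds roughly two batches' worth of strings at any time instead of growing with the whole stream.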
@@ -21,11 +21,9 @@ ctypedef union Utf8Str:

cdef class StringStore:
    cdef Pool mem
    cdef bint is_frozen

    cdef vector[hash_t] keys
    cdef public PreshMap _map
    cdef public PreshMap _oov

    cdef const Utf8Str* intern_unicode(self, unicode py_string)
    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
@@ -86,8 +86,6 @@ cdef class StringStore:
        """
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
        self.is_frozen = freeze
        if strings is not None:
            for string in strings:
                self.add(string)
@@ -243,21 +241,12 @@ cdef class StringStore:
            self.add(word)
        return self

    def set_frozen(self, bint is_frozen):
        # TODO
        self.is_frozen = is_frozen

    def flush_oov(self):
        self._oov = PreshMap()

    def _reset_and_load(self, strings, freeze=False):
    def _reset_and_load(self, strings):
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
        self.keys.clear()
        for string in strings:
            self.add(string)
        self.is_frozen = freeze

    cdef const Utf8Str* intern_unicode(self, unicode py_string):
        # 0 means missing, but we don't bother offsetting the index.
@@ -272,18 +261,6 @@ cdef class StringStore:
        cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
        if value is not NULL:
            return value
        value = <Utf8Str*>self._oov.get(key)
        if value is not NULL:
            return value
        if self.is_frozen:
            # OOV store uses 32 bit hashes. Pretty ugly :(
            key32 = hash32_utf8(utf8_string, length)
            # Important: Make the OOV store own the memory. That way it's trivial
            # to flush them all.
            value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
            self._oov.set(key32, value)
            return NULL

        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
        self._map.set(key, value)
        self.keys.push_back(key)
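The hunk above shrinks `_intern_utf8` from 18 lines to 6, which suggests the frozen/OOV branch is being dropped; the surviving path is simply: hash the UTF-8 bytes, look the hash up in the map, and allocate and record the key only on a miss. A rough pure-Python sketch of that shape, assuming a dict in place of `PreshMap` and a 64-bit BLAKE2 digest in place of the C-level hash function (both are illustrative substitutions, not spaCy internals):

import hashlib


def hash_utf8(utf8_bytes):
    # Stand-in for the 64-bit hash used by the real store.
    return int.from_bytes(hashlib.blake2b(utf8_bytes, digest_size=8).digest(), 'little')


class SketchStringStore(object):
    def __init__(self):
        self._map = {}   # hash -> string, playing the role of PreshMap
        self.keys = []   # insertion-ordered hashes, mirroring the keys vector

    def intern(self, py_string):
        utf8 = py_string.encode('utf8')
        key = hash_utf8(utf8)
        value = self._map.get(key)
        if value is not None:
            return value             # already interned: reuse the entry
        self._map[key] = py_string   # miss: store the string and remember its key
        self.keys.append(key)
        return py_string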
@@ -57,9 +57,9 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    for head in doc:
        for child in head.lefts:
            assert child.head is head
            assert child.head == head
        for child in head.rights:
            assert child.head is head
            assert child.head == head


def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
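The test change above follows from the `Doc` and `Token` hunks below: with the `_py_tokens` cache gone, `doc[i]` builds a fresh `Token` wrapper on every access, so identity checks (`is`) can no longer be relied on and the assertions switch to equality (`==`). A generic illustration of the same trade-off, using a toy `Cell` wrapper in place of `Token`:

class Cell(object):
    """Wrapper built on demand over a shared array, like a Token over its slot."""
    def __init__(self, array, i):
        self.array = array
        self.i = i

    def __eq__(self, other):
        return self.array is other.array and self.i == other.i


data = [10, 20, 30]
assert Cell(data, 0) == Cell(data, 0)      # equal: same underlying slot
assert Cell(data, 0) is not Cell(data, 0)  # not identical: a new wrapper each time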
@@ -54,6 +54,8 @@ cdef class Doc:

    cdef public object noun_chunks_iterator

    cdef object __weakref__

    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1

    cpdef np.ndarray to_array(self, object features)
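The new `cdef object __weakref__` slot is what allows `Language.pipe` to keep docs in a `weakref.WeakSet` in the first place: like a Python class that defines `__slots__`, a Cython extension type only supports weak references if it reserves a `__weakref__` slot. A plain-Python illustration of the same rule (toy class names, nothing spaCy-specific):

import weakref


class WithoutSlot(object):
    __slots__ = ('value',)                 # no __weakref__ slot


class WithSlot(object):
    __slots__ = ('value', '__weakref__')   # weak references allowed


ref = weakref.ref(WithSlot())              # fine
try:
    weakref.ref(WithoutSlot())
except TypeError as error:
    print(error)  # cannot create weak reference to 'WithoutSlot' object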
@@ -140,7 +140,6 @@ cdef class Doc:
        self.user_span_hooks = {}
        self.tensor = numpy.zeros((0,), dtype='float32')
        self.user_data = {}
        self._py_tokens = []
        self._vector = None
        self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
        cdef unicode orth
@@ -209,10 +208,7 @@
        if i < 0:
            i = self.length + i
        bounds_check(i, self.length, PADDING)
        if self._py_tokens[i] is not None:
            return self._py_tokens[i]
        else:
            return Token.cinit(self.vocab, &self.c[i], i, self)
        return Token.cinit(self.vocab, &self.c[i], i, self)

    def __iter__(self):
        """Iterate over `Token` objects, from which the annotations can be
@@ -226,10 +222,7 @@
        """
        cdef int i
        for i in range(self.length):
            if self._py_tokens[i] is not None:
                yield self._py_tokens[i]
            else:
                yield Token.cinit(self.vocab, &self.c[i], i, self)
            yield Token.cinit(self.vocab, &self.c[i], i, self)

    def __len__(self):
        """The number of tokens in the document.
@@ -535,7 +528,6 @@ cdef class Doc:
        self.length += 1
        # Set morphological attributes, e.g. by lemma, if possible
        self.vocab.morphology.assign_untagged(t)
        self._py_tokens.append(None)
        return t.idx + t.lex.length + t.spacy

    @cython.boundscheck(False)
@@ -841,7 +833,6 @@ cdef class Doc:
        # Set the left/right children, left/right edges
        set_children_from_heads(self.c, self.length)
        # Clear the cached Python objects
        self._py_tokens = [None] * self.length
        # Return the merged Python object
        return self[start]
@@ -19,10 +19,7 @@ cdef class Token:
        if offset < 0 or offset >= doc.length:
            msg = "Attempt to access token at %d, max length %d"
            raise IndexError(msg % (offset, doc.length))
        if doc._py_tokens[offset] != None:
            return doc._py_tokens[offset]
        cdef Token self = Token.__new__(Token, vocab, doc, offset)
        doc._py_tokens[offset] = self
        return self

    #cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):