Merge pull request #1424 from explosion/feature/streaming-data-memory-growth

💫 Fix streaming data memory growth (!!)
Matthew Honnibal 2017-10-16 23:08:18 +02:00 committed by GitHub
commit fc797a58de
7 changed files with 33 additions and 42 deletions

View File

@@ -8,6 +8,7 @@ import random
import ujson
from collections import OrderedDict
import itertools
import weakref
from .tokenizer import Tokenizer
from .vocab import Vocab
@@ -510,8 +511,33 @@ class Language(object):
            else:
                # Apply the function, but yield the doc
                docs = _pipe(proc, docs)
        # Track weakrefs of "recent" documents, so that we can see when they
        # expire from memory. When they do, we know we don't need old strings.
        # This way, we avoid maintaining an unbounded growth in string entries
        # in the string store.
        recent_refs = weakref.WeakSet()
        old_refs = weakref.WeakSet()
        original_strings_data = self.vocab.strings.to_bytes()
        StringStore = self.vocab.strings.__class__
        recent_strings = StringStore().from_bytes(original_strings_data)
        nr_seen = 0
        for doc in docs:
            yield doc
            for word in doc:
                recent_strings.add(word.text)
            recent_refs.add(doc)
            if nr_seen < 10000:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
                # All the docs in the 'old' set have expired, so the only
                # difference between the backup strings and the current
                # string-store should be obsolete. We therefore swap out the
                # old strings data.
                old_refs, recent_refs = recent_refs, old_refs
                self.vocab.strings._reset_and_load(recent_strings)
                recent_strings = StringStore().from_bytes(original_strings_data)
                nr_seen = 0

    def to_disk(self, path, disable=tuple()):
        """Save the current state to a directory. If a model is loaded, this

View File

@@ -21,11 +21,9 @@ ctypedef union Utf8Str:
cdef class StringStore:
    cdef Pool mem
    cdef bint is_frozen
    cdef vector[hash_t] keys
    cdef public PreshMap _map
    cdef public PreshMap _oov
    cdef const Utf8Str* intern_unicode(self, unicode py_string)
    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)

View File

@@ -86,8 +86,6 @@ cdef class StringStore:
        """
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
        self.is_frozen = freeze
        if strings is not None:
            for string in strings:
                self.add(string)
@@ -243,21 +241,12 @@
            self.add(word)
        return self

    def set_frozen(self, bint is_frozen):
        # TODO
        self.is_frozen = is_frozen

    def flush_oov(self):
        self._oov = PreshMap()

    def _reset_and_load(self, strings, freeze=False):
    def _reset_and_load(self, strings):
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
        self.keys.clear()
        for string in strings:
            self.add(string)
        self.is_frozen = freeze

    cdef const Utf8Str* intern_unicode(self, unicode py_string):
        # 0 means missing, but we don't bother offsetting the index.
@@ -272,18 +261,6 @@
        cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
        if value is not NULL:
            return value
        value = <Utf8Str*>self._oov.get(key)
        if value is not NULL:
            return value
        if self.is_frozen:
            # OOV store uses 32 bit hashes. Pretty ugly :(
            key32 = hash32_utf8(utf8_string, length)
            # Important: Make the OOV store own the memory. That way it's trivial
            # to flush them all.
            value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
            self._oov.set(key32, value)
            return NULL
        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
        self._map.set(key, value)
        self.keys.push_back(key)
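To make the simplified reset path easier to follow, here is a rough pure-Python analogue of `_reset_and_load()` (a hypothetical `MiniStringStore`, not the real Cython class; spaCy hashes with MurmurHash rather than Python's built-in `hash`). The store is simply rebuilt from an iterable of strings, with no separate OOV map and no frozen flag left to maintain:

class MiniStringStore(object):
    def __init__(self, strings=None):
        self._map = {}  # stands in for the PreshMap: hash -> string
        if strings is not None:
            for string in strings:
                self.add(string)

    def add(self, string):
        key = hash(string)  # spaCy uses a 64-bit MurmurHash here
        self._map[key] = string
        return key

    def _reset_and_load(self, strings):
        # Throw away every existing entry and re-intern only `strings`.
        self._map = {}
        for string in strings:
            self.add(string)

This is the hook `Language.pipe()` calls above: once every doc in the old window has expired, `_reset_and_load(recent_strings)` shrinks the live store back to the baseline strings plus whatever the recent docs actually used.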

View File

@@ -57,9 +57,9 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
    for head in doc:
        for child in head.lefts:
            assert child.head is head
            assert child.head == head
        for child in head.rights:
            assert child.head is head
            assert child.head == head


def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):

View File

@@ -54,6 +54,8 @@ cdef class Doc:
    cdef public object noun_chunks_iterator

    cdef object __weakref__

    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1

    cpdef np.ndarray to_array(self, object features)
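An aside on why the new `cdef object __weakref__` slot is needed (general Cython behaviour rather than anything specific to this diff): extension types do not support weak references unless they declare this slot, so without it the `weakref.WeakSet()` bookkeeping added to `Language.pipe()` would fail with a `TypeError`. A tiny illustration with hypothetical classes:

import weakref

cdef class WithoutSlot:
    cdef public int value

cdef class WithSlot:
    cdef public int value
    cdef object __weakref__  # enables weak references to instances

def demo():
    obj = WithSlot()
    ref = weakref.ref(obj)  # fine: the slot is declared
    try:
        weakref.ref(WithoutSlot())  # TypeError: cannot create weak reference
    except TypeError as err:
        print(err)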

View File

@@ -140,7 +140,6 @@ cdef class Doc:
        self.user_span_hooks = {}
        self.tensor = numpy.zeros((0,), dtype='float32')
        self.user_data = {}
        self._py_tokens = []
        self._vector = None
        self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
        cdef unicode orth
@@ -209,10 +208,7 @@
        if i < 0:
            i = self.length + i
        bounds_check(i, self.length, PADDING)
        if self._py_tokens[i] is not None:
            return self._py_tokens[i]
        else:
            return Token.cinit(self.vocab, &self.c[i], i, self)
        return Token.cinit(self.vocab, &self.c[i], i, self)

    def __iter__(self):
        """Iterate over `Token` objects, from which the annotations can be
@@ -226,10 +222,7 @@
        """
        cdef int i
        for i in range(self.length):
            if self._py_tokens[i] is not None:
                yield self._py_tokens[i]
            else:
                yield Token.cinit(self.vocab, &self.c[i], i, self)
            yield Token.cinit(self.vocab, &self.c[i], i, self)

    def __len__(self):
        """The number of tokens in the document.
@@ -535,7 +528,6 @@
        self.length += 1
        # Set morphological attributes, e.g. by lemma, if possible
        self.vocab.morphology.assign_untagged(t)
        self._py_tokens.append(None)
        return t.idx + t.lex.length + t.spacy

    @cython.boundscheck(False)
@@ -841,7 +833,6 @@
        # Set the left/right children, left/right edges
        set_children_from_heads(self.c, self.length)
        # Clear the cached Python objects
        self._py_tokens = [None] * self.length
        # Return the merged Python object
        return self[start]

View File

@@ -19,10 +19,7 @@ cdef class Token:
        if offset < 0 or offset >= doc.length:
            msg = "Attempt to access token at %d, max length %d"
            raise IndexError(msg % (offset, doc.length))
        if doc._py_tokens[offset] != None:
            return doc._py_tokens[offset]
        cdef Token self = Token.__new__(Token, vocab, doc, offset)
        doc._py_tokens[offset] = self
        return self
#cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
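A note on the behavioural consequence of dropping the `_py_tokens` cache (a reading of the diff, not part of it): `Doc.__getitem__`, `Doc.__iter__` and `Token.cinit` now build a fresh `Token` wrapper on every access, so two lookups of the same position are no longer guaranteed to return the same Python object. That is why the navigation test above switched from `is` to `==`. A small sketch, assuming any loaded pipeline bound to `nlp`:

doc = nlp(u'hello world')
first = doc[0]
again = doc[0]
# `first is again` may now be False: each access creates a new wrapper object.
# `first == again` is still True, which is what the updated test relies on.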