Merge pull request #1424 from explosion/feature/streaming-data-memory-growth

💫 Fix streaming data memory growth (!!)
Authored by Matthew Honnibal on 2017-10-16 23:08:18 +02:00, committed by GitHub
commit fc797a58de
7 changed files with 33 additions and 42 deletions

View File

@@ -8,6 +8,7 @@ import random
 import ujson
 from collections import OrderedDict
 import itertools
+import weakref
 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -510,8 +511,33 @@ class Language(object):
             else:
                 # Apply the function, but yield the doc
                 docs = _pipe(proc, docs)
+        # Track weakrefs of "recent" documents, so that we can see when they
+        # expire from memory. When they do, we know we don't need old strings.
+        # This way, we avoid maintaining an unbounded growth in string entries
+        # in the string store.
+        recent_refs = weakref.WeakSet()
+        old_refs = weakref.WeakSet()
+        original_strings_data = self.vocab.strings.to_bytes()
+        StringStore = self.vocab.strings.__class__
+        recent_strings = StringStore().from_bytes(original_strings_data)
+        nr_seen = 0
         for doc in docs:
             yield doc
+            for word in doc:
+                recent_strings.add(word.text)
+            recent_refs.add(doc)
+            if nr_seen < 10000:
+                old_refs.add(doc)
+                nr_seen += 1
+            elif len(old_refs) == 0:
+                # All the docs in the 'old' set have expired, so the only
+                # difference between the backup strings and the current
+                # string-store should be obsolete. We therefore swap out the
+                # old strings data.
+                old_refs, recent_refs = recent_refs, old_refs
+                self.vocab.strings._reset_and_load(recent_strings)
+                recent_strings = StringStore().from_bytes(original_strings_data)
+                nr_seen = 0

     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this

View File

@@ -21,11 +21,9 @@ ctypedef union Utf8Str:
 cdef class StringStore:
     cdef Pool mem
-    cdef bint is_frozen
     cdef vector[hash_t] keys
     cdef public PreshMap _map
-    cdef public PreshMap _oov

     cdef const Utf8Str* intern_unicode(self, unicode py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)

View File

@@ -86,8 +86,6 @@ cdef class StringStore:
         """
         self.mem = Pool()
         self._map = PreshMap()
-        self._oov = PreshMap()
-        self.is_frozen = freeze
         if strings is not None:
             for string in strings:
                 self.add(string)
@@ -243,21 +241,12 @@ cdef class StringStore:
             self.add(word)
         return self

-    def set_frozen(self, bint is_frozen):
-        # TODO
-        self.is_frozen = is_frozen
-
-    def flush_oov(self):
-        self._oov = PreshMap()
-
-    def _reset_and_load(self, strings, freeze=False):
+    def _reset_and_load(self, strings):
         self.mem = Pool()
         self._map = PreshMap()
-        self._oov = PreshMap()
         self.keys.clear()
         for string in strings:
             self.add(string)
-        self.is_frozen = freeze

     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
@@ -272,18 +261,6 @@ cdef class StringStore:
         cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
         if value is not NULL:
             return value
-        value = <Utf8Str*>self._oov.get(key)
-        if value is not NULL:
-            return value
-        if self.is_frozen:
-            # OOV store uses 32 bit hashes. Pretty ugly :(
-            key32 = hash32_utf8(utf8_string, length)
-            # Important: Make the OOV store own the memory. That way it's trivial
-            # to flush them all.
-            value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
-            self._oov.set(key32, value)
-            return NULL
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
         self.keys.push_back(key)
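
With the frozen/OOV machinery gone, the only reset path left is _reset_and_load, which rebuilds the store in place from an iterable of strings. Roughly how Language.pipe above uses it, as a sketch built only from calls that appear in this diff (variable names are illustrative, and _reset_and_load is an internal, underscore-prefixed helper rather than public API):

    from spacy.strings import StringStore

    strings = StringStore()
    strings.add(u'hello')
    strings.add(u'world')

    # Snapshot the current contents, as Language.pipe does before streaming.
    snapshot = strings.to_bytes()

    # Simulate streaming: many one-off strings get interned over time.
    for i in range(100000):
        strings.add(u'token-%d' % i)

    # Rebuild a small store from the snapshot, keep whatever recent documents
    # still need, then load it back into the live store in place.
    still_needed = StringStore().from_bytes(snapshot)
    still_needed.add(u'token-99999')
    strings._reset_and_load(still_needed)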

View File

@@ -57,9 +57,9 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
     for head in doc:
         for child in head.lefts:
-            assert child.head is head
+            assert child.head == head
         for child in head.rights:
-            assert child.head is head
+            assert child.head == head


 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
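
The test change above is a direct consequence of dropping the Doc's token cache (see the Doc and Token hunks below): indexing a Doc now builds a fresh Token wrapper around the underlying C data on every access, so identity checks between two lookups of the same position are no longer reliable, while equality still is. A minimal sketch, assuming only a blank English tokenizer pipeline:

    from spacy.lang.en import English

    nlp = English()                    # tokenizer-only pipeline is enough here
    doc = nlp(u'flush the cache')

    first = doc[0]
    again = doc[0]
    assert first == again              # same position in the same doc: equal
    # 'first is again' is no longer guaranteed: each lookup may construct a
    # new Token object over the same TokenC data, hence the 'is' -> '==' above.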

View File

@@ -54,6 +54,8 @@ cdef class Doc:
     cdef public object noun_chunks_iterator

+    cdef object __weakref__
+
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
     cpdef np.ndarray to_array(self, object features)
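
The new __weakref__ slot is what makes the WeakSet bookkeeping in Language.pipe possible at all: Cython extension types only support weak references if the slot is declared. A small sketch of the behaviour this enables (again using a plain English() tokenizer pipeline purely for illustration):

    import gc
    import weakref

    from spacy.lang.en import English

    nlp = English()
    doc = nlp(u'short-lived document')

    ref = weakref.ref(doc)     # would raise TypeError without a __weakref__ slot
    assert ref() is doc

    del doc                    # drop the last strong reference...
    gc.collect()
    assert ref() is None       # ...and the weak reference reports expiry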

View File

@@ -140,7 +140,6 @@ cdef class Doc:
         self.user_span_hooks = {}
         self.tensor = numpy.zeros((0,), dtype='float32')
         self.user_data = {}
-        self._py_tokens = []
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef unicode orth
@@ -209,10 +208,7 @@ cdef class Doc:
         if i < 0:
             i = self.length + i
         bounds_check(i, self.length, PADDING)
-        if self._py_tokens[i] is not None:
-            return self._py_tokens[i]
-        else:
-            return Token.cinit(self.vocab, &self.c[i], i, self)
+        return Token.cinit(self.vocab, &self.c[i], i, self)

     def __iter__(self):
         """Iterate over `Token` objects, from which the annotations can be
@@ -226,10 +222,7 @@ cdef class Doc:
         """
         cdef int i
         for i in range(self.length):
-            if self._py_tokens[i] is not None:
-                yield self._py_tokens[i]
-            else:
-                yield Token.cinit(self.vocab, &self.c[i], i, self)
+            yield Token.cinit(self.vocab, &self.c[i], i, self)

     def __len__(self):
         """The number of tokens in the document.
@@ -535,7 +528,6 @@ cdef class Doc:
         self.length += 1
         # Set morphological attributes, e.g. by lemma, if possible
         self.vocab.morphology.assign_untagged(t)
-        self._py_tokens.append(None)
         return t.idx + t.lex.length + t.spacy

     @cython.boundscheck(False)
@@ -841,7 +833,6 @@ cdef class Doc:
         # Set the left/right children, left/right edges
         set_children_from_heads(self.c, self.length)
         # Clear the cached Python objects
-        self._py_tokens = [None] * self.length
         # Return the merged Python object
         return self[start]

View File

@@ -19,10 +19,7 @@ cdef class Token:
         if offset < 0 or offset >= doc.length:
             msg = "Attempt to access token at %d, max length %d"
             raise IndexError(msg % (offset, doc.length))
-        if doc._py_tokens[offset] != None:
-            return doc._py_tokens[offset]
         cdef Token self = Token.__new__(Token, vocab, doc, offset)
-        doc._py_tokens[offset] = self
         return self

     #cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
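
Taken together, the upshot is that Language.pipe can now run over an effectively unbounded stream without the string store growing without bound, provided the caller lets old Doc objects be garbage-collected. A rough usage sketch (the blank English() pipeline and the texts() generator are illustrative; the 10,000-document generation size comes from the pipe code above):

    from spacy.lang.en import English

    nlp = English()

    def texts():
        # Stands in for an unbounded stream read from disk or a socket, with
        # plenty of one-off strings that would otherwise accumulate forever.
        for i in range(1000000):
            yield u'stream item %d with one-off token token%d' % (i, i)

    n_tokens = 0
    for doc in nlp.pipe(texts(), batch_size=1000):
        n_tokens += len(doc)
        # Important: don't keep the docs. The swap in pipe() only fires once a
        # whole generation of old docs has expired, so retaining every doc
        # would keep the string store growing as before.
    print(n_tokens)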