Try to fix StringStore clean up (see #1506)

Roman Domrachev 2017-11-11 03:11:27 +03:00
parent ee97fd3cb4
commit 3c600adf23
4 changed files with 56 additions and 12 deletions

spacy/language.py

@@ -12,6 +12,7 @@ from copy import copy
 from thinc.neural import Model
 from thinc.neural.optimizers import Adam
+from .strings import StringStore
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
@@ -547,27 +548,20 @@ class Language(object):
         # in the string store.
         recent_refs = weakref.WeakSet()
         old_refs = weakref.WeakSet()
-        original_strings_data = self.vocab.strings.to_bytes()
-        StringStore = self.vocab.strings.__class__
-        recent_strings = StringStore().from_bytes(original_strings_data)
+        # Remember the strings we start with, so that we can restore them
+        # carefully after iterating.
+        original_strings_data = list(self.vocab.strings)
         nr_seen = 0
         for doc in docs:
             yield doc
-            for word in doc:
-                recent_strings.add(word.text)
             recent_refs.add(doc)
             if nr_seen < 10000:
                 old_refs.add(doc)
                 nr_seen += 1
             elif len(old_refs) == 0:
-                # All the docs in the 'old' set have expired, so the only
-                # difference between the backup strings and the current
-                # string-store should be obsolete. We therefore swap out the
-                # old strings data.
-                old_refs, recent_refs = recent_refs, old_refs
-                self.vocab.strings._reset_and_load(recent_strings)
-                recent_strings = StringStore().from_bytes(original_strings_data)
+                self.vocab.strings._cleanup_stale_strings()
                 nr_seen = 0
+        self.vocab.strings._reset_and_load(original_strings_data)

     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this

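The reworked pipe() above snapshots the store's strings as a list, yields documents, and once every Doc from a window of 10,000 has been garbage-collected it calls _cleanup_stale_strings(); the original strings are restored with _reset_and_load() when the stream is exhausted. A minimal sketch of how a caller would exercise this path (not part of the commit; the sentence text and counts are arbitrary):

# Illustrative only: stream enough texts through nlp.pipe() so that the
# periodic string-store cleanup added above has a chance to trigger
# (the window size is 10,000 docs).
from spacy.lang.en import English

nlp = English()
texts = ('Sample sentence number {}.'.format(i) for i in range(30000))
for doc in nlp.pipe(texts):
    # Let each Doc go out of scope so its weakref expires; once a full old
    # window has expired, strings that were never hit can be purged.
    assert doc.text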
spacy/strings.pxd

@@ -1,5 +1,6 @@
 from libc.stdint cimport int64_t
 from libcpp.vector cimport vector
+from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
@@ -23,6 +24,7 @@ cdef class StringStore:
     cdef Pool mem
     cdef vector[hash_t] keys
+    cdef set[hash_t] hits
     cdef public PreshMap _map

     cdef const Utf8Str* intern_unicode(self, unicode py_string)

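The only change to the declarations is the new hits member, a C++ std::set keyed on hash_t and kept alongside the existing keys vector, so the store can remember which entries were actually used since the last cleanup. A rough pure-Python analogue of the state this declaration adds (illustrative only; the real store keeps murmur hashes in C++ containers and UTF-8 data in a memory pool):

# Toy stand-in for the StringStore's bookkeeping: 'keys' records every
# interned key in insertion order, 'hits' records the keys used since the
# last cleanup, and '_map' holds the actual strings.
class ToyStringStore(object):
    def __init__(self):
        self.keys = []
        self.hits = set()
        self._map = {}

    def add(self, text):
        key = hash(text)            # stands in for the 64-bit murmur hash
        if key not in self._map:
            self._map[key] = text
            self.keys.append(key)
        self.hits.add(key)
        return key

    def __getitem__(self, key):
        self.hits.add(key)          # mirrors the hits.insert() calls in strings.pyx
        return self._map[key]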
spacy/strings.pyx

@@ -4,6 +4,7 @@ from __future__ import unicode_literals, absolute_import
 cimport cython
 from libc.string cimport memcpy
+from libcpp.set cimport set
 from libc.stdint cimport uint32_t
 from murmurhash.mrmr cimport hash64, hash32
 import ujson
@@ -111,6 +112,7 @@ cdef class StringStore:
                 return SYMBOLS_BY_INT[string_or_id]
             else:
                 key = string_or_id
+                self.hits.insert(key)
                 utf8str = <Utf8Str*>self._map.get(key)
                 if utf8str is NULL:
                     raise KeyError(string_or_id)
@@ -168,6 +170,7 @@ cdef class StringStore:
         if key < len(SYMBOLS_BY_INT):
             return True
         else:
+            self.hits.insert(key)
             return self._map.get(key) is not NULL

     def __iter__(self):
@@ -179,6 +182,7 @@ cdef class StringStore:
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
+            self.hits.insert(key)
             utf8str = <Utf8Str*>self._map.get(key)
             yield decode_Utf8Str(utf8str)
         # TODO: Iterate OOV here?
@@ -241,9 +245,24 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self.keys.clear()
+        self.hits.clear()
         for string in strings:
             self.add(string)

+    def _cleanup_stale_strings(self):
+        if self.hits.size() == 0:
+            # If we don't have any hits, just skip the cleanup.
+            return
+
+        cdef vector[hash_t] tmp
+        for i in range(self.keys.size()):
+            key = self.keys[i]
+            if self.hits.count(key) != 0:
+                tmp.push_back(key)
+
+        self.keys.swap(tmp)
+        self.hits.clear()
+
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
@@ -259,5 +278,6 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, <unsigned char*>utf8_string, length)
         self._map.set(key, value)
+        self.hits.insert(key)
         self.keys.push_back(key)
         return value

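Taken together, the strings.pyx changes mark a key as hit on every lookup (__getitem__), membership check (__contains__), iteration (__iter__), and interning, and _cleanup_stale_strings() then rebuilds self.keys from only the hit entries before clearing the hit set. Note that only the keys vector is swapped; self._map and the allocated UTF-8 data are left untouched, which is presumably part of why the commit message says "Try to fix". A sketch of the same mark-and-sweep step, continuing the ToyStringStore analogue above (illustrative, not the Cython implementation):

# Continuing the ToyStringStore sketch above: drop keys that were not hit
# since the last cleanup, then reset the hit set for the next window.
def cleanup_stale_strings(store):
    if not store.hits:
        # Nothing was used since the last cleanup, so there is nothing to do.
        return
    store.keys = [key for key in store.keys if key in store.hits]
    # As in the diff above, only the key list shrinks; the backing map still
    # holds every string, so the string data itself is not freed here.
    store.hits = set()

After such a cleanup, iterating the toy store's keys skips everything that was not hit, matching how __iter__ walks self.keys in the real store.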
spacy/tests/regression/test_issue1506.py

@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import random
+import string
+import itertools
+
+from ...compat import izip
+from ...lang.en import English
+
+
+def test_issue1506():
+    nlp = English()
+
+    def string_generator():
+        for (_, t) in izip(range(10001), itertools.repeat("It's sentence produced by that bug.")):
+            yield t
+
+        for (_, t) in izip(range(10001), itertools.repeat("I erase lemmas.")):
+            yield t
+
+        for (_, t) in izip(range(10001), itertools.repeat("It's sentence produced by that bug.")):
+            yield t
+
+    for d in nlp.pipe(string_generator()):
+        for t in d:
+            str(t.lemma_)
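The regression test streams three blocks of just over 10,000 sentences through nlp.pipe() so the cleanup window is crossed, then reads lemma_ for every token; there are no value assertions, so it passes simply by completing without an error, presumably the lookup failure reported in #1506 once strings the pipeline still needed had been cleaned away. Assuming the file lives at the path given above (inferred from its relative imports), it could be run on its own roughly like this:

# Illustrative only: invoke just this regression test via pytest; the path
# is an assumption, not stated in the commit.
import pytest

pytest.main(['-k', 'test_issue1506', 'spacy/tests/regression'])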