From 3c600adf23e2ed08bd91bbd5dec3972f8f723492 Mon Sep 17 00:00:00 2001
From: Roman Domrachev
Date: Sat, 11 Nov 2017 03:11:27 +0300
Subject: [PATCH] Try to fix StringStore clean up (see #1506)

---
 spacy/language.py                        | 18 +++++----------
 spacy/strings.pxd                        |  2 ++
 spacy/strings.pyx                        | 20 +++++++++++++++++
 spacy/tests/regression/test_issue1506.py | 28 ++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 12 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue1506.py

diff --git a/spacy/language.py b/spacy/language.py
index 92ed6133c..c4a282793 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -12,6 +12,7 @@ from copy import copy
 from thinc.neural import Model
 from thinc.neural.optimizers import Adam
 
+from .strings import StringStore
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
@@ -547,27 +548,20 @@ class Language(object):
         # in the string store.
         recent_refs = weakref.WeakSet()
         old_refs = weakref.WeakSet()
-        original_strings_data = self.vocab.strings.to_bytes()
-        StringStore = self.vocab.strings.__class__
-        recent_strings = StringStore().from_bytes(original_strings_data)
+        # Snapshot the strings currently in the store so that, after the
+        # stream is exhausted, the original contents can be restored.
+        original_strings_data = list(self.vocab.strings)
         nr_seen = 0
         for doc in docs:
             yield doc
-            for word in doc:
-                recent_strings.add(word.text)
             recent_refs.add(doc)
             if nr_seen < 10000:
                 old_refs.add(doc)
                 nr_seen += 1
             elif len(old_refs) == 0:
-                # All the docs in the 'old' set have expired, so the only
-                # difference between the backup strings and the current
-                # string-store should be obsolete. We therefore swap out the
-                # old strings data.
-                old_refs, recent_refs = recent_refs, old_refs
-                self.vocab.strings._reset_and_load(recent_strings)
-                recent_strings = StringStore().from_bytes(original_strings_data)
+                self.vocab.strings._cleanup_stale_strings()
                 nr_seen = 0
+        self.vocab.strings._reset_and_load(original_strings_data)
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 4f987baed..e436fb33b 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -1,5 +1,6 @@
 from libc.stdint cimport int64_t
 from libcpp.vector cimport vector
+from libcpp.set cimport set
 
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
@@ -23,6 +24,7 @@ cdef class StringStore:
     cdef Pool mem
 
     cdef vector[hash_t] keys
+    cdef set[hash_t] hits
     cdef public PreshMap _map
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 647f140bb..b8628cef1 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -4,6 +4,7 @@ from __future__ import unicode_literals, absolute_import
 
 cimport cython
 from libc.string cimport memcpy
+from libcpp.set cimport set
 from libc.stdint cimport uint32_t
 from murmurhash.mrmr cimport hash64, hash32
 import ujson
@@ -111,6 +112,7 @@ cdef class StringStore:
             return SYMBOLS_BY_INT[string_or_id]
         else:
             key = string_or_id
+            self.hits.insert(key)
             utf8str = self._map.get(key)
             if utf8str is NULL:
                 raise KeyError(string_or_id)
@@ -168,6 +170,7 @@ cdef class StringStore:
         if key < len(SYMBOLS_BY_INT):
             return True
         else:
+            self.hits.insert(key)
             return self._map.get(key) is not NULL
 
     def __iter__(self):
@@ -179,6 +182,7 @@ cdef class StringStore:
         cdef hash_t key
         for i in range(self.keys.size()):
             key = self.keys[i]
+            self.hits.insert(key)
             utf8str = self._map.get(key)
             yield decode_Utf8Str(utf8str)
         # TODO: Iterate OOV here?
@@ -241,9 +245,24 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self.keys.clear()
+        self.hits.clear()
         for string in strings:
             self.add(string)
 
+    def _cleanup_stale_strings(self):
+        if self.hits.size() == 0:
+            # No strings were marked as used since the last reset, so
+            # skip the cleanup rather than wipe the whole store.
+            return
+
+        cdef vector[hash_t] tmp
+        for i in range(self.keys.size()):
+            key = self.keys[i]
+            if self.hits.count(key) != 0:
+                tmp.push_back(key)
+
+        self.keys.swap(tmp)
+        self.hits.clear()
+
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
         cdef bytes byte_string = py_string.encode('utf8')
@@ -259,5 +278,6 @@ cdef class StringStore:
             return value
         value = _allocate(self.mem, utf8_string, length)
         self._map.set(key, value)
+        self.hits.insert(key)
         self.keys.push_back(key)
         return value
diff --git a/spacy/tests/regression/test_issue1506.py b/spacy/tests/regression/test_issue1506.py
new file mode 100644
index 000000000..1a4ba6399
--- /dev/null
+++ b/spacy/tests/regression/test_issue1506.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import random
+import string
+
+import itertools
+from ...compat import izip
+
+from ...lang.en import English
+
+
+def test_issue1506():
+    nlp = English()
+
+    def string_generator():
+        for (_, t) in izip(range(10001), itertools.repeat("It's sentence produced by that bug.")):
+            yield t
+
+        for (_, t) in izip(range(10001), itertools.repeat("I erase lemmas.")):
+            yield t
+
+        for (_, t) in izip(range(10001), itertools.repeat("It's sentence produced by that bug.")):
+            yield t
+
+    for d in nlp.pipe(string_generator()):
+        for t in d:
+            str(t.lemma_)