Try to fix StringStore clean up (see #1506)

2017-11-11 03:11:27 +03:00 · 2017-11-11 03:11:27 +03:00 · 3c600adf23
parent ee97fd3cb4
commit 3c600adf23
4 changed files with 56 additions and 12 deletions
--- a/spacy/language.py
+++ b/spacy/language.py
@ -12,6 +12,7 @@ from copy import copy
 from thinc.neural import Model
 from thinc.neural.optimizers import Adam
 from .strings import StringStore
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
@ -547,27 +548,20 @@ class Language(object):
        # in the string store.
        recent_refs = weakref.WeakSet()
        old_refs = weakref.WeakSet()
-        original_strings_data = self.vocab.strings.to_bytes()
+        # Snapshot the strings already in the store so that they can be
-        StringStore = self.vocab.strings.__class__
+        # restored once iteration over the docs has finished.
-        recent_strings = StringStore().from_bytes(original_strings_data)
+        original_strings_data = list(self.vocab.strings)
        nr_seen = 0
        for doc in docs:
            yield doc
            for word in doc:
                recent_strings.add(word.text)
            recent_refs.add(doc)
            if nr_seen < 10000:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
-                # All the docs in the 'old' set have expired, so the only
+                self.vocab.strings._cleanup_stale_strings()
                # difference between the backup strings and the current
                # string-store should be obsolete. We therefore swap out the
                # old strings data.
                old_refs, recent_refs = recent_refs, old_refs
                self.vocab.strings._reset_and_load(recent_strings)
                recent_strings = StringStore().from_bytes(original_strings_data)
                nr_seen = 0
        self.vocab.strings._reset_and_load(original_strings_data)
    def to_disk(self, path, disable=tuple()):
        """Save the current state to a directory.  If a model is loaded, this
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -1,5 +1,6 @@
 from libc.stdint cimport int64_t
 from libcpp.vector cimport vector
 from libcpp.set cimport set
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
@ -23,6 +24,7 @@ cdef class StringStore:
    cdef Pool mem
    cdef vector[hash_t] keys
    cdef set[hash_t] hits
    cdef public PreshMap _map
    cdef const Utf8Str* intern_unicode(self, unicode py_string)
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -4,6 +4,7 @@ from __future__ import unicode_literals, absolute_import
 cimport cython
 from libc.string cimport memcpy
 from libcpp.set cimport set
 from libc.stdint cimport uint32_t
 from murmurhash.mrmr cimport hash64, hash32
 import ujson
@ -111,6 +112,7 @@ cdef class StringStore:
            return SYMBOLS_BY_INT[string_or_id]
        else:
            key = string_or_id
            self.hits.insert(key)
            utf8str = <Utf8Str*>self._map.get(key)
            if utf8str is NULL:
                raise KeyError(string_or_id)
@ -168,6 +170,7 @@ cdef class StringStore:
        if key < len(SYMBOLS_BY_INT):
            return True
        else:
            self.hits.insert(key)
            return self._map.get(key) is not NULL
    def __iter__(self):
@ -179,6 +182,7 @@ cdef class StringStore:
        cdef hash_t key
        for i in range(self.keys.size()):
            key = self.keys[i]
            self.hits.insert(key)
            utf8str = <Utf8Str*>self._map.get(key)
            yield decode_Utf8Str(utf8str)
        # TODO: Iterate OOV here?
@ -241,9 +245,24 @@ cdef class StringStore:
        self.mem = Pool()
        self._map = PreshMap()
        self.keys.clear()
        self.hits.clear()
        for string in strings:
            self.add(string)
    def _cleanup_stale_strings(self):
        """Drop from `self.keys` every key that is absent from `self.hits`,
        then clear `self.hits` to start a fresh tracking window.

        `self.hits` is filled by other methods of this class whenever a key
        is looked up or interned (e.g. `intern_unicode`). If no hits were
        recorded at all, the store is left untouched.

        NOTE(review): only `self.keys` is pruned here. Entries in `self._map`
        are not removed, so a lookup by hash can still resolve a string that
        iterating over the store no longer yields. Confirm this is intended.
        """
        # Typed locals keep the loop in C, matching the `cdef hash_t key`
        # declaration used by the iteration code of this class.
        cdef vector[hash_t] live_keys
        cdef hash_t key
        cdef size_t i
        if self.hits.size() == 0:
            # Nothing was hit, so there is no usage information to prune by.
            return
        for i in range(self.keys.size()):
            key = self.keys[i]
            if self.hits.count(key) != 0:
                live_keys.push_back(key)
        # Swap the surviving keys in; the old contents go away with the local.
        self.keys.swap(live_keys)
        self.hits.clear()
    cdef const Utf8Str* intern_unicode(self, unicode py_string):
        # 0 means missing, but we don't bother offsetting the index.
        cdef bytes byte_string = py_string.encode('utf8')
@ -259,5 +278,6 @@ cdef class StringStore:
            return value
        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
        self._map.set(key, value)
        self.hits.insert(key)
        self.keys.push_back(key)
        return value
--- a/spacy/tests/regression/test_issue1506.py
+++ b/spacy/tests/regression/test_issue1506.py
@ -0,0 +1,28 @@
 # coding: utf8
 from __future__ import unicode_literals
 import random
 import string
 import itertools
 from compat import izip
 from ...lang.en import English
 def test_issue1506():
    """Regression test for issue #1506 (smoke test, no explicit asserts).
    Streams three runs of 10001 identical texts through `nlp.pipe` and
    reads every token's lemma; the test passes if nothing raises.
    NOTE(review): 10001 presumably exceeds the 10000-doc window after which
    `Language.pipe` cleans up the string store. Confirm against `pipe`.
    """
    nlp = English()
    batch_size = 10001
    buggy_text = "It's sentence produced by that bug."
    other_text = "I erase lemmas."
    def string_generator():
        # Same text first and last, with a different one in between.
        for text in (buggy_text, other_text, buggy_text):
            for _ in range(batch_size):
                yield text
    for doc in nlp.pipe(string_generator()):
        for token in doc:
            # Reading the lemma forces a string-store lookup.
            str(token.lemma_)