mirror of https://github.com/explosion/spaCy.git
Try to fix StringStore clean up (see #1506)
This commit is contained in:
parent
ee97fd3cb4
commit
3c600adf23
|
@ -12,6 +12,7 @@ from copy import copy
|
||||||
from thinc.neural import Model
|
from thinc.neural import Model
|
||||||
from thinc.neural.optimizers import Adam
|
from thinc.neural.optimizers import Adam
|
||||||
|
|
||||||
|
from .strings import StringStore
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .vocab import Vocab
|
from .vocab import Vocab
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
|
@ -547,27 +548,20 @@ class Language(object):
|
||||||
# in the string store.
|
# in the string store.
|
||||||
recent_refs = weakref.WeakSet()
|
recent_refs = weakref.WeakSet()
|
||||||
old_refs = weakref.WeakSet()
|
old_refs = weakref.WeakSet()
|
||||||
original_strings_data = self.vocab.strings.to_bytes()
|
# Remember whatever the string store currently holds — after iterating we should
|
||||||
StringStore = self.vocab.strings.__class__
|
# carefully restore it.
|
||||||
recent_strings = StringStore().from_bytes(original_strings_data)
|
original_strings_data = list(self.vocab.strings)
|
||||||
nr_seen = 0
|
nr_seen = 0
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
yield doc
|
yield doc
|
||||||
for word in doc:
|
|
||||||
recent_strings.add(word.text)
|
|
||||||
recent_refs.add(doc)
|
recent_refs.add(doc)
|
||||||
if nr_seen < 10000:
|
if nr_seen < 10000:
|
||||||
old_refs.add(doc)
|
old_refs.add(doc)
|
||||||
nr_seen += 1
|
nr_seen += 1
|
||||||
elif len(old_refs) == 0:
|
elif len(old_refs) == 0:
|
||||||
# All the docs in the 'old' set have expired, so the only
|
self.vocab.strings._cleanup_stale_strings()
|
||||||
# difference between the backup strings and the current
|
|
||||||
# string-store should be obsolete. We therefore swap out the
|
|
||||||
# old strings data.
|
|
||||||
old_refs, recent_refs = recent_refs, old_refs
|
|
||||||
self.vocab.strings._reset_and_load(recent_strings)
|
|
||||||
recent_strings = StringStore().from_bytes(original_strings_data)
|
|
||||||
nr_seen = 0
|
nr_seen = 0
|
||||||
|
self.vocab.strings._reset_and_load(original_strings_data)
|
||||||
|
|
||||||
def to_disk(self, path, disable=tuple()):
|
def to_disk(self, path, disable=tuple()):
|
||||||
"""Save the current state to a directory. If a model is loaded, this
|
"""Save the current state to a directory. If a model is loaded, this
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
from libc.stdint cimport int64_t
|
from libc.stdint cimport int64_t
|
||||||
from libcpp.vector cimport vector
|
from libcpp.vector cimport vector
|
||||||
|
from libcpp.set cimport set
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
from preshed.maps cimport PreshMap
|
from preshed.maps cimport PreshMap
|
||||||
|
@ -23,6 +24,7 @@ cdef class StringStore:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
|
|
||||||
cdef vector[hash_t] keys
|
cdef vector[hash_t] keys
|
||||||
|
cdef set[hash_t] hits
|
||||||
cdef public PreshMap _map
|
cdef public PreshMap _map
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
cdef const Utf8Str* intern_unicode(self, unicode py_string)
|
||||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals, absolute_import
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
from libc.string cimport memcpy
|
from libc.string cimport memcpy
|
||||||
|
from libcpp.set cimport set
|
||||||
from libc.stdint cimport uint32_t
|
from libc.stdint cimport uint32_t
|
||||||
from murmurhash.mrmr cimport hash64, hash32
|
from murmurhash.mrmr cimport hash64, hash32
|
||||||
import ujson
|
import ujson
|
||||||
|
@ -111,6 +112,7 @@ cdef class StringStore:
|
||||||
return SYMBOLS_BY_INT[string_or_id]
|
return SYMBOLS_BY_INT[string_or_id]
|
||||||
else:
|
else:
|
||||||
key = string_or_id
|
key = string_or_id
|
||||||
|
self.hits.insert(key)
|
||||||
utf8str = <Utf8Str*>self._map.get(key)
|
utf8str = <Utf8Str*>self._map.get(key)
|
||||||
if utf8str is NULL:
|
if utf8str is NULL:
|
||||||
raise KeyError(string_or_id)
|
raise KeyError(string_or_id)
|
||||||
|
@ -168,6 +170,7 @@ cdef class StringStore:
|
||||||
if key < len(SYMBOLS_BY_INT):
|
if key < len(SYMBOLS_BY_INT):
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
|
self.hits.insert(key)
|
||||||
return self._map.get(key) is not NULL
|
return self._map.get(key) is not NULL
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
|
@ -179,6 +182,7 @@ cdef class StringStore:
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
for i in range(self.keys.size()):
|
for i in range(self.keys.size()):
|
||||||
key = self.keys[i]
|
key = self.keys[i]
|
||||||
|
self.hits.insert(key)
|
||||||
utf8str = <Utf8Str*>self._map.get(key)
|
utf8str = <Utf8Str*>self._map.get(key)
|
||||||
yield decode_Utf8Str(utf8str)
|
yield decode_Utf8Str(utf8str)
|
||||||
# TODO: Iterate OOV here?
|
# TODO: Iterate OOV here?
|
||||||
|
@ -241,9 +245,24 @@ cdef class StringStore:
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._map = PreshMap()
|
self._map = PreshMap()
|
||||||
self.keys.clear()
|
self.keys.clear()
|
||||||
|
self.hits.clear()
|
||||||
for string in strings:
|
for string in strings:
|
||||||
self.add(string)
|
self.add(string)
|
||||||
|
|
||||||
|
def _cleanup_stale_strings(self):
    """Prune `self.keys` down to the keys recorded in `self.hits`, then
    reset the hit set.

    Does nothing when no hits have been recorded. Only the `keys` vector
    is rewritten here; `_map` and `mem` are not modified by this method.
    """
    cdef vector[hash_t] live_keys

    if self.hits.size() == 0:
        # Nothing has been recorded as used, so skip the cleanup entirely.
        return

    for i in range(self.keys.size()):
        key = self.keys[i]
        if self.hits.count(key) == 0:
            # Never hit since the last reset: leave it out.
            continue
        live_keys.push_back(key)

    # Install the surviving keys (original order is kept) and start a fresh
    # round of hit tracking.
    self.keys.swap(live_keys)
    self.hits.clear()
|
||||||
|
|
||||||
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
cdef const Utf8Str* intern_unicode(self, unicode py_string):
|
||||||
# 0 means missing, but we don't bother offsetting the index.
|
# 0 means missing, but we don't bother offsetting the index.
|
||||||
cdef bytes byte_string = py_string.encode('utf8')
|
cdef bytes byte_string = py_string.encode('utf8')
|
||||||
|
@ -259,5 +278,6 @@ cdef class StringStore:
|
||||||
return value
|
return value
|
||||||
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
|
||||||
self._map.set(key, value)
|
self._map.set(key, value)
|
||||||
|
self.hits.insert(key)
|
||||||
self.keys.push_back(key)
|
self.keys.push_back(key)
|
||||||
return value
|
return value
|
||||||
|
|
|
@ -0,0 +1,28 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import random
|
||||||
|
import string
|
||||||
|
|
||||||
|
import itertools
|
||||||
|
from compat import izip
|
||||||
|
|
||||||
|
from ...lang.en import English
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue1506():
    """Regression test for issue #1506.

    Streams three consecutive batches of identical texts through
    ``nlp.pipe`` and reads the lemma of every token in every resulting doc.
    The test makes no explicit assertion: it passes when resolving the
    lemmas raises no exception.
    """
    nlp = English()

    # 10001 is one more than the 10000-document window that ``Language.pipe``
    # counts before it considers cleaning up the string store.
    n_repeats = 10001
    texts = (
        "It's sentence produced by that bug.",
        "I erase lemmas.",
        "It's sentence produced by that bug.",
    )

    def string_generator():
        # Yield each text ``n_repeats`` times, one batch after another.
        for text in texts:
            for _ in range(n_repeats):
                yield text

    for doc in nlp.pipe(string_generator()):
        for token in doc:
            # Resolving the lemma looks the string up in the string store.
            str(token.lemma_)
|
Loading…
Reference in New Issue