diff --git a/spacy/language.py b/spacy/language.py
index 047c94a37..7fd56ed56 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -8,6 +8,7 @@ import random
 import ujson
 from collections import OrderedDict
 import itertools
+import weakref
 
 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -510,8 +511,33 @@ class Language(object):
             else:
                 # Apply the function, but yield the doc
                 docs = _pipe(proc, docs)
+        # Track weakrefs of "recent" documents, so that we can see when they
+        # expire from memory. When they do, we know we don't need old strings.
+        # This way, we avoid maintaining an unbounded growth in string entries
+        # in the string store.
+        recent_refs = weakref.WeakSet()
+        old_refs = weakref.WeakSet()
+        original_strings_data = self.vocab.strings.to_bytes()
+        StringStore = self.vocab.strings.__class__
+        recent_strings = StringStore().from_bytes(original_strings_data)
+        nr_seen = 0
         for doc in docs:
             yield doc
+            for word in doc:
+                recent_strings.add(word.text)
+            recent_refs.add(doc)
+            if nr_seen < 10000:
+                old_refs.add(doc)
+                nr_seen += 1
+            elif len(old_refs) == 0:
+                # All the docs in the 'old' set have expired, so the only
+                # difference between the backup strings and the current
+                # string-store should be obsolete. We therefore swap out the
+                # old strings data.
+                old_refs, recent_refs = recent_refs, old_refs
+                self.vocab.strings._reset_and_load(recent_strings)
+                recent_strings = StringStore().from_bytes(original_strings_data)
+                nr_seen = 0
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 0ad403cf1..4f987baed 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -21,11 +21,9 @@ ctypedef union Utf8Str:
 
 cdef class StringStore:
     cdef Pool mem
-    cdef bint is_frozen
 
     cdef vector[hash_t] keys
     cdef public PreshMap _map
-    cdef public PreshMap _oov
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 6f676c79a..e6926a75d 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -86,8 +86,6 @@ cdef class StringStore:
         """
         self.mem = Pool()
         self._map = PreshMap()
-        self._oov = PreshMap()
-        self.is_frozen = freeze
         if strings is not None:
             for string in strings:
                 self.add(string)
@@ -243,21 +241,12 @@
             self.add(word)
         return self
 
-    def set_frozen(self, bint is_frozen):
-        # TODO
-        self.is_frozen = is_frozen
-
-    def flush_oov(self):
-        self._oov = PreshMap()
-
-    def _reset_and_load(self, strings, freeze=False):
+    def _reset_and_load(self, strings):
         self.mem = Pool()
         self._map = PreshMap()
-        self._oov = PreshMap()
         self.keys.clear()
         for string in strings:
             self.add(string)
-        self.is_frozen = freeze
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
@@ -272,18 +261,6 @@
         cdef Utf8Str* value = self._map.get(key)
         if value is not NULL:
             return value
-        value = self._oov.get(key)
-        if value is not NULL:
-            return value
-        if self.is_frozen:
-            # OOV store uses 32 bit hashes. Pretty ugly :(
-            key32 = hash32_utf8(utf8_string, length)
-            # Important: Make the OOV store own the memory. That way it's trivial
-            # to flush them all.
-            value = _allocate(self._oov.mem, utf8_string, length)
-            self._oov.set(key32, value)
-            return NULL
-
         value = _allocate(self.mem, utf8_string, length)
         self._map.set(key, value)
         self.keys.push_back(key)
diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py
index 4d909f0d6..da59b0b59 100644
--- a/spacy/tests/parser/test_parse_navigate.py
+++ b/spacy/tests/parser/test_parse_navigate.py
@@ -57,9 +57,9 @@
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
     for head in doc:
         for child in head.lefts:
-            assert child.head is head
+            assert child.head == head
         for child in head.rights:
-            assert child.head is head
+            assert child.head == head
 
 
 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index ad2b9876d..f34c455c6 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -54,6 +54,8 @@ cdef class Doc:
 
     cdef public object noun_chunks_iterator
 
+    cdef object __weakref__
+
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
 
     cpdef np.ndarray to_array(self, object features)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 05d393d2b..bf48cf4f5 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -140,7 +140,6 @@ cdef class Doc:
         self.user_span_hooks = {}
         self.tensor = numpy.zeros((0,), dtype='float32')
         self.user_data = {}
-        self._py_tokens = []
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef unicode orth
@@ -209,10 +208,7 @@
        if i < 0:
            i = self.length + i
        bounds_check(i, self.length, PADDING)
-       if self._py_tokens[i] is not None:
-           return self._py_tokens[i]
-       else:
-           return Token.cinit(self.vocab, &self.c[i], i, self)
+       return Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __iter__(self):
         """Iterate over `Token` objects, from which the annotations can be
@@ -226,10 +222,7 @@
         """
         cdef int i
         for i in range(self.length):
-            if self._py_tokens[i] is not None:
-                yield self._py_tokens[i]
-            else:
-                yield Token.cinit(self.vocab, &self.c[i], i, self)
+            yield Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __len__(self):
         """The number of tokens in the document.
@@ -535,7 +528,6 @@
         self.length += 1
         # Set morphological attributes, e.g. by lemma, if possible
         self.vocab.morphology.assign_untagged(t)
-        self._py_tokens.append(None)
         return t.idx + t.lex.length + t.spacy
 
     @cython.boundscheck(False)
@@ -841,7 +833,6 @@
         # Set the left/right children, left/right edges
         set_children_from_heads(self.c, self.length)
         # Clear the cached Python objects
-        self._py_tokens = [None] * self.length
         # Return the merged Python object
         return self[start]
 
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index f63a0490c..b408e04eb 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -19,10 +19,7 @@ cdef class Token:
         if offset < 0 or offset >= doc.length:
             msg = "Attempt to access token at %d, max length %d"
             raise IndexError(msg % (offset, doc.length))
-        if doc._py_tokens[offset] != None:
-            return doc._py_tokens[offset]
         cdef Token self = Token.__new__(Token, vocab, doc, offset)
-        doc._py_tokens[offset] = self
         return self
 
     #cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):
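Note on the approach: the `Language.pipe` change works because `Doc` is made weak-referenceable (the `cdef object __weakref__` slot added in `doc.pxd`), so a `weakref.WeakSet` can observe when the caller has dropped every document from an earlier batch; only once the whole "old" batch has expired is it safe to discard strings that may have been interned for those documents. The sketch below is a minimal, self-contained illustration of that expiry pattern in plain Python — the `Doc` stand-in, `pipe` generator, `texts`, and `batch_size` are hypothetical names for illustration, not spaCy API:

    import weakref

    class Doc(object):
        # Stand-in for spacy.tokens.Doc. With __slots__, a class must list
        # '__weakref__' explicitly to be usable in a WeakSet -- the same
        # requirement the 'cdef object __weakref__' line above satisfies
        # for the Cython Doc.
        __slots__ = ('words', '__weakref__')

        def __init__(self, words):
            self.words = words

    def pipe(texts, batch_size=3):
        recent_refs = weakref.WeakSet()  # docs yielded since the last reset
        old_refs = weakref.WeakSet()     # an earlier batch we wait on
        nr_seen = 0
        for text in texts:
            doc = Doc(text.split())
            yield doc
            recent_refs.add(doc)
            if nr_seen < batch_size:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
                # Every doc in the old batch has been garbage-collected, so
                # cache entries kept alive only for those docs are provably
                # unreachable; this is the point where the real code calls
                # StringStore._reset_and_load().
                old_refs, recent_refs = recent_refs, old_refs
                print('old batch expired; safe to reset caches')
                nr_seen = 0

    for doc in pipe(['a b', 'c d', 'e f', 'g h', 'i j']):
        pass  # consumer keeps no references, so each doc expires promptly

The test change from `is` to `==` follows from the same refactor: with the `_py_tokens` cache removed, `doc[i]` constructs a fresh `Token` wrapper on every access, so two lookups of the same position compare equal but are no longer the identical Python object.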