From 59c216196cb0a502ca9214318a17efa4934b1268 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:22:11 +0200
Subject: [PATCH 1/7] Allow weakrefs on Doc objects

---
 spacy/tokens/doc.pxd | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index ad2b9876d..f34c455c6 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -54,6 +54,8 @@ cdef class Doc:
 
     cdef public object noun_chunks_iterator
 
+    cdef object __weakref__
+
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
 
     cpdef np.ndarray to_array(self, object features)

From 5c14f3f033232b9329183148e706c0884d9d043f Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:22:40 +0200
Subject: [PATCH 2/7] Create a rolling buffer for the StringStore in
 Language.pipe()

---
 spacy/language.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/spacy/language.py b/spacy/language.py
index 047c94a37..f092c9806 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -8,6 +8,7 @@ import random
 import ujson
 from collections import OrderedDict
 import itertools
+import weakref
 
 from .tokenizer import Tokenizer
 from .vocab import Vocab
@@ -510,8 +511,33 @@ class Language(object):
             else:
                 # Apply the function, but yield the doc
                 docs = _pipe(proc, docs)
+        # Track weakrefs of "recent" documents, so that we can see when they
+        # expire from memory. When they do, we know we don't need old strings.
+        # This way, we avoid unbounded growth in the number of entries in the
+        # string store.
+        recent_refs = weakref.WeakSet()
+        old_refs = weakref.WeakSet()
+        original_strings_data = self.vocab.strings.to_bytes()
+        StringStore = self.vocab.strings.__class__
+        recent_strings = StringStore().from_bytes(original_strings_data)
+        nr_seen = 0
         for doc in docs:
             yield doc
+            for word in doc:
+                recent_strings.add(word.text)
+            recent_refs.add(doc)
+            if nr_seen < 1000:
+                old_refs.add(doc)
+                nr_seen += 1
+            elif len(old_refs) == 0:
+                # All the docs in the 'old' set have expired, so the only
+                # difference between the backup strings and the current
+                # string store should be obsolete strings. We can therefore
+                # swap out the old strings data.
+                old_refs, recent_refs = recent_refs, old_refs
+                self.vocab.strings._reset_and_load(recent_strings)
+                recent_strings = StringStore().from_bytes(original_strings_data)
+                nr_seen = 0
 
     def to_disk(self, path, disable=tuple()):
         """Save the current state to a directory. If a model is loaded, this
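The scheme in PATCH 2/7 is a two-generation expiry check: a string may only be discarded once no live Doc could still look it up, and the weakly referenced "old" generation of docs is the probe for that condition. Only when every doc in old_refs has been garbage-collected is it safe to reset the store to the recently seen strings plus the original baseline. Here is a minimal sketch of the same scheme outside spaCy (pipe_with_rolling_strings, the plain-set store, and the doc.words attribute are hypothetical stand-ins, not spaCy API):

    import weakref

    def pipe_with_rolling_strings(docs, store, baseline, buffer_size=1000):
        # `store` is a mutable set of interned strings; `baseline` is a
        # snapshot of the strings that must survive every reset.
        recent_refs = weakref.WeakSet()    # docs yielded since the last swap
        old_refs = weakref.WeakSet()       # the previous generation of docs
        recent_strings = set(baseline)     # strings recent docs may need
        nr_seen = 0
        for doc in docs:
            yield doc
            recent_strings.update(doc.words)
            recent_refs.add(doc)
            if nr_seen < buffer_size:
                old_refs.add(doc)
                nr_seen += 1
            elif len(old_refs) == 0:
                # Every doc in the old generation has been collected, so
                # nothing alive can still need the strings dropped here.
                old_refs, recent_refs = recent_refs, old_refs
                store.clear()
                store.update(recent_strings)
                recent_strings = set(baseline)
                nr_seen = 0

Note the swap: the docs seen since the last reset become the new "old" generation, so the check never discards a string that a still-living doc might need.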
From 3e037054c88476e11ca6c0bc2e0ee2ce32d0997e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:23:10 +0200
Subject: [PATCH 3/7] Remove obsolete is_frozen functionality from StringStore

---
 spacy/strings.pxd |  2 --
 spacy/strings.pyx | 21 +--------------------
 2 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 0ad403cf1..4f987baed 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -21,11 +21,9 @@ ctypedef union Utf8Str:
 
 cdef class StringStore:
     cdef Pool mem
-    cdef bint is_frozen
 
     cdef vector[hash_t] keys
     cdef public PreshMap _map
-    cdef public PreshMap _oov
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string)
     cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 6f676c79a..29a706996 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -86,8 +86,6 @@ cdef class StringStore:
         """
         self.mem = Pool()
         self._map = PreshMap()
-        self._oov = PreshMap()
-        self.is_frozen = freeze
         if strings is not None:
             for string in strings:
                 self.add(string)
@@ -243,21 +241,12 @@ cdef class StringStore:
             self.add(word)
         return self
 
-    def set_frozen(self, bint is_frozen):
-        # TODO
-        self.is_frozen = is_frozen
-
-    def flush_oov(self):
-        self._oov = PreshMap()
-
-    def _reset_and_load(self, strings, freeze=False):
+    def _reset_and_load(self, strings):
         self.mem = Pool()
         self._map = PreshMap()
-        self._oov = PreshMap()
         self.keys.clear()
         for string in strings:
             self.add(string)
-        self.is_frozen = freeze
 
     cdef const Utf8Str* intern_unicode(self, unicode py_string):
         # 0 means missing, but we don't bother offsetting the index.
@@ -275,14 +264,6 @@ cdef class StringStore:
         value = self._oov.get(key)
         if value is not NULL:
             return value
-        if self.is_frozen:
-            # OOV store uses 32 bit hashes. Pretty ugly :(
-            key32 = hash32_utf8(utf8_string, length)
-            # Important: Make the OOV store own the memory. That way it's trivial
-            # to flush them all.
-            value = _allocate(self._oov.mem, utf8_string, length)
-            self._oov.set(key32, value)
-            return NULL
 
         value = _allocate(self.mem, utf8_string, length)
         self._map.set(key, value)
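After PATCH 3/7, resetting the store is trivial, which is exactly what the rolling buffer in PATCH 2/7 needs from _reset_and_load(). One leftover self._oov lookup survives in _intern_utf8() here; PATCH 5/7 below removes it. As a Python-only toy model of the simplified store (hypothetical: the real StringStore interns UTF-8 bytes in a Cython Pool and keys them with a 64-bit hash, not Python's built-in hash()):

    class ToyStringStore(object):
        def __init__(self, strings=None):
            self._map = {}                   # key -> string, like the PreshMap
            if strings is not None:
                for string in strings:
                    self.add(string)

        def add(self, string):
            # Intern a string and return its key.
            key = hash(string)
            self._map.setdefault(key, string)
            return key

        def _reset_and_load(self, strings):
            # With the freeze flag and the OOV side-store gone, a reset is
            # just: drop everything, then re-intern the strings to keep.
            self._map = {}
            for string in strings:
                self.add(string)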
From a002264fec3f49e85f530bf8cb3d16be0a049071 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:34:21 +0200
Subject: [PATCH 4/7] Remove caching of Token in Doc, as it caused a cycle

---
 spacy/tokens/doc.pyx   | 13 ++-----------
 spacy/tokens/token.pxd |  3 ---
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 05d393d2b..bf48cf4f5 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -140,7 +140,6 @@ cdef class Doc:
         self.user_span_hooks = {}
         self.tensor = numpy.zeros((0,), dtype='float32')
         self.user_data = {}
-        self._py_tokens = []
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef unicode orth
@@ -209,10 +208,7 @@ cdef class Doc:
         if i < 0:
             i = self.length + i
         bounds_check(i, self.length, PADDING)
-        if self._py_tokens[i] is not None:
-            return self._py_tokens[i]
-        else:
-            return Token.cinit(self.vocab, &self.c[i], i, self)
+        return Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __iter__(self):
         """Iterate over `Token` objects, from which the annotations can be
@@ -226,10 +222,7 @@ cdef class Doc:
         """
         cdef int i
         for i in range(self.length):
-            if self._py_tokens[i] is not None:
-                yield self._py_tokens[i]
-            else:
-                yield Token.cinit(self.vocab, &self.c[i], i, self)
+            yield Token.cinit(self.vocab, &self.c[i], i, self)
 
     def __len__(self):
         """The number of tokens in the document.
@@ -535,7 +528,6 @@ cdef class Doc:
         self.length += 1
         # Set morphological attributes, e.g. by lemma, if possible
         self.vocab.morphology.assign_untagged(t)
-        self._py_tokens.append(None)
         return t.idx + t.lex.length + t.spacy
 
     @cython.boundscheck(False)
@@ -841,7 +833,6 @@ cdef class Doc:
         # Set the left/right children, left/right edges
         set_children_from_heads(self.c, self.length)
         # Clear the cached Python objects
-        self._py_tokens = [None] * self.length
         # Return the merged Python object
         return self[start]
 
diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd
index f63a0490c..b408e04eb 100644
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@@ -19,10 +19,7 @@ cdef class Token:
         if offset < 0 or offset >= doc.length:
             msg = "Attempt to access token at %d, max length %d"
             raise IndexError(msg % (offset, doc.length))
-        if doc._py_tokens[offset] != None:
-            return doc._py_tokens[offset]
         cdef Token self = Token.__new__(Token, vocab, doc, offset)
-        doc._py_tokens[offset] = self
         return self
 
     #cdef inline TokenC struct_from_attrs(Vocab vocab, attrs):

From 66e2eb8f397c82505d5b44c1b52071fcda2a5a1c Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:34:41 +0200
Subject: [PATCH 5/7] Clean up remnant of frozen functionality in StringStore

---
 spacy/strings.pyx | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 29a706996..e6926a75d 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -261,10 +261,6 @@ cdef class StringStore:
         cdef Utf8Str* value = self._map.get(key)
         if value is not NULL:
             return value
-        value = self._oov.get(key)
-        if value is not NULL:
-            return value
-
         value = _allocate(self.mem, utf8_string, length)
         self._map.set(key, value)
         self.keys.push_back(key)

From 2bc06e4b222c7f38505235b30105bca1d15bf286 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:38:29 +0200
Subject: [PATCH 6/7] Bump rolling buffer size to 10k

---
 spacy/language.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/language.py b/spacy/language.py
index f092c9806..7fd56ed56 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -526,7 +526,7 @@ class Language(object):
             for word in doc:
                 recent_strings.add(word.text)
             recent_refs.add(doc)
-            if nr_seen < 1000:
+            if nr_seen < 10000:
                 old_refs.add(doc)
                 nr_seen += 1
             elif len(old_refs) == 0:
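PATCH 4/7 is what makes the weakref bookkeeping from PATCH 2/7 reliable. A Doc that stores its own Token wrappers, while every Token points back at its Doc, forms a reference cycle; CPython reclaims cycles only in the cyclic garbage collector, not by reference counting, so such docs would linger in old_refs and the len(old_refs) == 0 condition might never fire at a useful time. A small demonstration of the difference (CachingDoc and PlainDoc are hypothetical stand-ins for Doc before and after this patch):

    import gc
    import weakref

    class Token(object):
        def __init__(self, doc, i):
            self.doc = doc                  # Token -> Doc back-reference
            self.i = i

    class CachingDoc(object):
        # Stores its Token wrappers, like Doc before this patch.
        def __init__(self, n):
            self._py_tokens = [Token(self, i) for i in range(n)]

    class PlainDoc(object):
        # Builds Token wrappers on demand, like Doc after this patch.
        def __init__(self, n):
            self.length = n

        def __getitem__(self, i):
            return Token(self, i)           # fresh wrapper, no stored cycle

    refs = weakref.WeakSet()
    doc = CachingDoc(3)
    refs.add(doc)
    del doc
    print(len(refs))    # 1: the Doc<->Token cycle keeps the doc alive
    gc.collect()
    print(len(refs))    # 0: only the cyclic collector could reclaim it

    doc = PlainDoc(3)
    refs.add(doc)
    del doc
    print(len(refs))    # 0: reference counting frees it immediately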
From 41744771611305363484f046b0271b5f0ea071aa Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 16 Oct 2017 19:50:35 +0200
Subject: [PATCH 7/7] Fix equality check in test

---
 spacy/tests/parser/test_parse_navigate.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py
index 4d909f0d6..da59b0b59 100644
--- a/spacy/tests/parser/test_parse_navigate.py
+++ b/spacy/tests/parser/test_parse_navigate.py
@@ -57,9 +57,9 @@ def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
     doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
     for head in doc:
         for child in head.lefts:
-            assert child.head is head
+            assert child.head == head
         for child in head.rights:
-            assert child.head is head
+            assert child.head == head
 
 
 def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
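The test fix follows directly from PATCH 4/7: with the cache gone, indexing a Doc builds a fresh Token wrapper on every access, so two lookups of the same position are equal but no longer identical, and child.head is head can fail even when the parse is correct. Roughly (assuming a loaded pipeline nlp; illustration only):

    doc = nlp(u'The dog chased the cat')
    first = doc[1]
    second = doc[1]
    assert first == second        # same underlying token data
    assert first is not second    # distinct wrapper objects per access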