From 42b117e561fca0f156d3ea2aacdff7757dd0b150 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 26 Jan 2021 00:40:18 +1100 Subject: [PATCH] Fix Doc.copy bugs (#6809) * Don't let the Doc own LexemeC, to fix Doc.copy * Copy doc.spans * Copy doc.spans --- spacy/tokens/_dict_proxies.py | 3 +++ spacy/tokens/doc.pyx | 1 + spacy/vocab.pyx | 12 ++++++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/_dict_proxies.py b/spacy/tokens/_dict_proxies.py index b10f6d484..7b2d2d5b5 100644 --- a/spacy/tokens/_dict_proxies.py +++ b/spacy/tokens/_dict_proxies.py @@ -33,6 +33,9 @@ class SpanGroups(UserDict): def _make_span_group(self, name: str, spans: Iterable["Span"]) -> SpanGroup: return SpanGroup(self.doc_ref(), name=name, spans=spans) + def copy(self) -> "SpanGroups": + return SpanGroups(self.doc_ref()).from_bytes(self.to_bytes()) + def to_bytes(self) -> bytes: # We don't need to serialize this as a dict, because the groups # know their names. diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 32f8c91fa..489de2201 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1187,6 +1187,7 @@ cdef class Doc: other.user_span_hooks = dict(self.user_span_hooks) other.length = self.length other.max_length = self.max_length + other.spans = self.spans.copy() buff_size = other.max_length + (PADDING*2) assert buff_size > 0 tokens = other.mem.alloc(buff_size, sizeof(TokenC)) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 8359d8452..bcbf300e9 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -161,8 +161,16 @@ cdef class Vocab: return self._new_lexeme(mem, self.strings[orth]) cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: - if len(string) < 3 or self.length < 10000: - mem = self.mem + # I think this heuristic is bad, and the Vocab should always + # own the lexemes. It avoids weird bugs this way, as it's how the thing + # was originally supposed to work. 
The best solution to the growing + # memory use is to periodically reset the vocab, which is an action + # that should be up to the user to do (so we don't need to keep track + # of the doc ownership). + # TODO: Change the C API so that the mem isn't passed in here. + mem = self.mem + #if len(string) < 3 or self.length < 10000: + # mem = self.mem cdef bint is_oov = mem is not self.mem lex = mem.alloc(1, sizeof(LexemeC)) lex.orth = self.strings.add(string)