From cae077b583c06819cc68489a4bc1243244345086 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 16 Feb 2015 15:20:31 -0500 Subject: [PATCH] * Work on fixing orphaned Token objects bug --- spacy/tokens.pxd | 12 ++++++------ spacy/tokens.pyx | 28 ++++++---------------------- 2 files changed, 12 insertions(+), 28 deletions(-) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index a146a7c8c..1b482f597 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -61,20 +61,20 @@ cdef class Token: cdef bint _owns_c_data - cdef list _py + cdef Tokens _seq cdef tuple _tag_strings cdef tuple _dep_strings @staticmethod cdef inline Token cinit(Vocab vocab, unicode string, const TokenC* token, int offset, int array_len, - list py_tokens, tuple tag_strings, tuple dep_strings): + Tokens parent_seq, tuple tag_strings, tuple dep_strings): if offset < 0 or offset >= array_len: msg = "Attempt to access token at %d, max length %d" raise IndexError(msg % (offset, array_len)) - if py_tokens[offset] is not None: - return py_tokens[offset] + if parent_seq._py_tokens[offset] is not None: + return parent_seq._py_tokens[offset] cdef Token self = Token.__new__(Token, vocab, string) @@ -82,10 +82,10 @@ cdef class Token: self.i = offset self.array_len = array_len - self._py = py_tokens + self._seq = parent_seq self._tag_strings = tag_strings self._dep_strings = dep_strings - py_tokens[offset] = self + self._seq._py_tokens[offset] = self return self cdef int take_ownership_of_c_data(self) except -1 diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index fc1e14871..58513722f 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -19,7 +19,6 @@ cimport cython from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.string cimport memcpy -import sys DEF PADDING = 5 @@ -95,21 +94,6 @@ cdef class Tokens: self._tag_strings = tuple() # These will be set by the POS tagger and parser self._dep_strings = tuple() # The strings are arbitrary and model-specific. - def __dealloc__(self): - # The Token object initially only gets a view of the underlying C - # data --- it doesn't own it. But, if we have Token objects that are - # going to outlive this instance, those objects need a copy of the C - # data. - cdef Token token - if self._py_tokens is not None: - for token in self._py_tokens: - if token is not None: - # Why 3? 1 for the entry in the _py_tokens list, - # and 1 for this reference. If we have _another_ ref, then - # the token will live, and needs to own its data. - if sys.getrefcount(token) >= 3: - token.take_ownership_of_c_data() - def __getitem__(self, object i): """Retrieve a token. @@ -124,7 +108,7 @@ cdef class Tokens: bounds_check(i, self.length, PADDING) return Token.cinit(self.vocab, self._string, &self.data[i], i, self.length, - self._py_tokens, self._tag_strings, self._dep_strings) + self, self._tag_strings, self._dep_strings) def __iter__(self): """Iterate over the tokens. @@ -135,7 +119,7 @@ cdef class Tokens: for i in range(self.length): yield Token.cinit(self.vocab, self._string, &self.data[i], i, self.length, - self._py_tokens, self._tag_strings, self._dep_strings) + self, self._tag_strings, self._dep_strings) def __len__(self): return self.length @@ -277,7 +261,7 @@ cdef class Token: def nbor(self, int i=1): return Token.cinit(self.vocab, self._string, self.c, self.i, self.array_len, - self._py, self._tag_strings, self._dep_strings) + self._seq, self._tag_strings, self._dep_strings) property string: def __get__(self): @@ -378,7 +362,7 @@ cdef class Token: elif ptr + ptr.head == self.c: yield Token.cinit(self.vocab, self._string, ptr, ptr - (self.c - self.i), self.array_len, - self._py, self._tag_strings, self._dep_strings) + self._seq, self._tag_strings, self._dep_strings) ptr += 1 else: ptr += 1 @@ -397,7 +381,7 @@ cdef class Token: elif ptr + ptr.head == self.c: yield Token.cinit(self.vocab, self._string, ptr, ptr - (self.c - self.i), self.array_len, - self._py, self._tag_strings, self._dep_strings) + self._seq, self._tag_strings, self._dep_strings) ptr -= 1 else: ptr -= 1 @@ -407,7 +391,7 @@ cdef class Token: """The token predicted by the parser to be the head of the current token.""" return Token.cinit(self.vocab, self._string, self.c + self.c.head, self.i + self.c.head, self.array_len, - self._py, self._tag_strings, self._dep_strings) + self._seq, self._tag_strings, self._dep_strings) property whitespace_: def __get__(self):