From cae077b583c06819cc68489a4bc1243244345086 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Mon, 16 Feb 2015 15:20:31 -0500
Subject: [PATCH] * Work on fixing orphaned Token objects bug

---
 spacy/tokens.pxd | 12 ++++++------
 spacy/tokens.pyx | 28 ++++++----------------------
 2 files changed, 12 insertions(+), 28 deletions(-)

diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index a146a7c8c..1b482f597 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -61,20 +61,20 @@ cdef class Token:
     cdef bint _owns_c_data
 
     
-    cdef list _py
+    cdef Tokens _seq
     cdef tuple _tag_strings
     cdef tuple _dep_strings
 
     @staticmethod
     cdef inline Token cinit(Vocab vocab, unicode string,
                             const TokenC* token, int offset, int array_len,
-                            list py_tokens, tuple tag_strings, tuple dep_strings):
+                            Tokens parent_seq, tuple tag_strings, tuple dep_strings):
         if offset < 0 or offset >= array_len:
 
             msg = "Attempt to access token at %d, max length %d"
             raise IndexError(msg % (offset, array_len))
-        if py_tokens[offset] is not None:
-            return py_tokens[offset]
+        if parent_seq._py_tokens[offset] is not None:
+            return parent_seq._py_tokens[offset]
 
         cdef Token self = Token.__new__(Token, vocab, string)
 
@@ -82,10 +82,10 @@ cdef class Token:
         self.i = offset
         self.array_len = array_len
 
-        self._py = py_tokens
+        self._seq = parent_seq
         self._tag_strings = tag_strings
         self._dep_strings = dep_strings
-        py_tokens[offset] = self
+        self._seq._py_tokens[offset] = self
         return self
 
     cdef int take_ownership_of_c_data(self) except -1
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index fc1e14871..58513722f 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -19,7 +19,6 @@ cimport cython
 
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from libc.string cimport memcpy
-import sys
 
 DEF PADDING = 5
 
@@ -95,21 +94,6 @@ cdef class Tokens:
         self._tag_strings = tuple() # These will be set by the POS tagger and parser
         self._dep_strings = tuple() # The strings are arbitrary and model-specific.
 
-    def __dealloc__(self):
-        # The Token object initially only gets a view of the underlying C
-        # data --- it doesn't own it. But, if we have Token objects that are
-        # going to outlive this instance, those objects need a copy of the C
-        # data.
-        cdef Token token
-        if self._py_tokens is not None:
-            for token in self._py_tokens:
-                if token is not None:
-                    # Why 3? 1 for the entry in the _py_tokens list,
-                    # and 1 for this reference. If we have _another_ ref, then
-                    # the token will live, and needs to own its data.
-                    if sys.getrefcount(token) >= 3:
-                        token.take_ownership_of_c_data()
-
     def __getitem__(self, object i):
         """Retrieve a token.
         
@@ -124,7 +108,7 @@ cdef class Tokens:
         bounds_check(i, self.length, PADDING)
         return Token.cinit(self.vocab, self._string,
                            &self.data[i], i, self.length,
-                           self._py_tokens, self._tag_strings, self._dep_strings)
+                           self, self._tag_strings, self._dep_strings)
 
     def __iter__(self):
         """Iterate over the tokens.
@@ -135,7 +119,7 @@ cdef class Tokens:
         for i in range(self.length):
             yield Token.cinit(self.vocab, self._string,
                               &self.data[i], i, self.length,
-                              self._py_tokens, self._tag_strings, self._dep_strings)
+                              self, self._tag_strings, self._dep_strings)
 
     def __len__(self):
         return self.length
@@ -277,7 +261,7 @@ cdef class Token:
     def nbor(self, int i=1):
         return Token.cinit(self.vocab, self._string,
                            self.c, self.i, self.array_len,
-                           self._py, self._tag_strings, self._dep_strings)
+                           self._seq, self._tag_strings, self._dep_strings)
 
     property string:
         def __get__(self):
@@ -378,7 +362,7 @@ cdef class Token:
                 elif ptr + ptr.head == self.c:
                     yield Token.cinit(self.vocab, self._string,
                                       ptr, ptr - (self.c - self.i), self.array_len,
-                                      self._py, self._tag_strings, self._dep_strings)
+                                      self._seq, self._tag_strings, self._dep_strings)
                     ptr += 1
                 else:
                     ptr += 1
@@ -397,7 +381,7 @@ cdef class Token:
                 elif ptr + ptr.head == self.c:
                     yield Token.cinit(self.vocab, self._string,
                                       ptr, ptr - (self.c - self.i), self.array_len,
-                                      self._py, self._tag_strings, self._dep_strings)
+                                      self._seq, self._tag_strings, self._dep_strings)
                     ptr -= 1
                 else:
                     ptr -= 1
@@ -407,7 +391,7 @@ cdef class Token:
             """The token predicted by the parser to be the head of the current token."""
             return Token.cinit(self.vocab, self._string,
                                self.c + self.c.head, self.i + self.c.head, self.array_len,
-                               self._py, self._tag_strings, self._dep_strings)
+                               self._seq, self._tag_strings, self._dep_strings)
 
     property whitespace_:
         def __get__(self):