* Pass ownership of C data to Token instances if the Tokens object is being garbage-collected but Token instances are staying alive.

Matthew Honnibal 2015-02-11 18:05:06 -05:00
parent db3f26a51b
commit 7572e31f5e
3 changed files with 205 additions and 150 deletions
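
Context for the change: a Token is a thin view over an entry in a TokenC array that its parent Tokens object owns, so a Token that outlives its parent would be left reading freed memory. The fix is to detect, at Tokens deallocation time, which Token views are still referenced from outside, and hand each of them a private copy of the C data. Below is a pure-Python sketch of that pattern; ArrayOwner, View, and take_ownership are illustrative names, not spaCy's API, and in pure Python the buffer would survive by refcounting anyway, which is exactly what manually managed C memory does not do.

import sys

class View:
    """A lightweight view into storage owned by someone else."""
    def __init__(self, buf, i):
        self._buf = buf            # borrowed: the owner's storage
        self._i = i
        self._owns_data = False

    def take_ownership(self):
        # Analogue of take_ownership_of_c_data: copy the shared storage
        # so the view no longer depends on the owner being alive.
        self._buf = list(self._buf)
        self._owns_data = True

    @property
    def value(self):
        return self._buf[self._i]

class ArrayOwner:
    """Owns the storage and hands copies to views that will outlive it."""
    def __init__(self, data):
        self._data = list(data)
        self._views = []

    def view(self, i):
        v = View(self._data, i)
        self._views.append(v)
        return v

    def __del__(self):
        # Analogue of Tokens.__dealloc__. The baseline here is 3 references:
        # the _views entry, the loop variable, and getrefcount's own argument.
        # Anything above that means a caller still holds the view.
        for v in self._views:
            if sys.getrefcount(v) >= 4:
                v.take_ownership()

owner = ArrayOwner([10, 20, 30])
keeper = owner.view(1)
del owner                  # keeper survives, now backed by its own copy
assert keeper.value == 20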

View File

@@ -58,6 +58,7 @@ cdef class Token:
     cdef const TokenC* c
     cdef readonly int i
     cdef int array_len
+    cdef bint _owns_c_data
     cdef list _py
@@ -86,3 +87,5 @@ cdef class Token:
         self._dep_strings = dep_strings
         py_tokens[offset] = self
         return self
+
+    cdef int take_ownership_of_c_data(self) except -1

View File

@ -17,6 +17,9 @@ import numpy
cimport cython cimport cython
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.string cimport memcpy
import sys
DEF PADDING = 5 DEF PADDING = 5
@@ -92,6 +95,21 @@ cdef class Tokens:
         self._tag_strings = tuple()  # These will be set by the POS tagger and parser
         self._dep_strings = tuple()  # The strings are arbitrary and model-specific.

+    def __dealloc__(self):
+        # The Token object initially only gets a view of the underlying C
+        # data --- it doesn't own it. But, if we have Token objects that are
+        # going to outlive this instance, those objects need a copy of the C
+        # data.
+        cdef Token token
+        if self._py_tokens is not None:
+            for token in self._py_tokens:
+                if token is not None:
+                    # Why 3? 1 for the entry in the _py_tokens list,
+                    # and 1 for this reference. If we have _another_ ref, then
+                    # the token will live, and needs to own its data.
+                    if sys.getrefcount(token) >= 3:
+                        token.take_ownership_of_c_data()
+
     def __getitem__(self, object i):
         """Retrieve a token.
@@ -139,8 +157,6 @@ cdef class Tokens:
             self._py_tokens.append(None)
         return idx + t.lex.length

-
-
     @cython.boundscheck(False)
     cpdef long[:,:] to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy ndarray
@@ -234,196 +250,208 @@ cdef class Tokens:
 cdef class Token:
-    """An individual token."""
+    """An individual token --- i.e. a word, a punctuation symbol, etc. Created
+    via Tokens.__getitem__ and Tokens.__iter__.
+    """
     def __cinit__(self, Vocab vocab, unicode string):
         self.vocab = vocab
         self._string = string

+    def __dealloc__(self):
+        if self._owns_c_data:
+            # Cast through const, if we own the data
+            PyMem_Free(<void*>self.c)
+
     def __len__(self):
         return self.c.lex.length

+    def __unicode__(self):
+        return self.string
+
+    cdef int take_ownership_of_c_data(self) except -1:
+        owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
+        memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len)
+        self.c = owned_data
+        self._owns_c_data = True
+
     def nbor(self, int i=1):
         return Token.cinit(self.vocab, self._string,
                            self.c, self.i, self.array_len,
                            self._py, self._tag_strings, self._dep_strings)

-    @property
-    def string(self):
+    property string:
+        def __get__(self):
             cdef int next_idx = (self.c + 1).idx
             if next_idx < self.c.idx:
                 next_idx = self.c.idx + self.c.lex.length
             return self._string[self.c.idx:next_idx]

-    @property
-    def prob(self):
+    property prob:
+        def __get__(self):
             return self.c.lex.prob

-    @property
-    def idx(self):
+    property idx:
+        def __get__(self):
             return self.c.idx

-    @property
-    def cluster(self):
+    property cluster:
+        def __get__(self):
             return self.c.lex.cluster

-    @property
-    def cluster(self):
-        return self.c.lex.cluster
-
-    @property
-    def orth(self):
+    property orth:
+        def __get__(self):
             return self.c.lex.orth

-    @property
-    def lower(self):
+    property lower:
+        def __get__(self):
             return self.c.lex.lower

-    @property
-    def norm(self):
+    property norm:
+        def __get__(self):
             return self.c.lex.norm

-    @property
-    def shape(self):
+    property shape:
+        def __get__(self):
             return self.c.lex.shape

-    @property
-    def prefix(self):
+    property prefix:
+        def __get__(self):
             return self.c.lex.prefix

-    @property
-    def suffix(self):
+    property suffix:
+        def __get__(self):
             return self.c.lex.suffix

-    @property
-    def lemma(self):
+    property lemma:
+        def __get__(self):
             return self.c.lemma

-    @property
-    def pos(self):
+    property pos:
+        def __get__(self):
             return self.c.pos

-    @property
-    def tag(self):
+    property tag:
+        def __get__(self):
             return self.c.tag

-    @property
-    def dep(self):
+    property dep:
+        def __get__(self):
             return self.c.dep

-    @property
-    def repvec(self):
+    property repvec:
+        def __get__(self):
             return numpy.asarray(<float[:300,]> self.c.lex.repvec)

-    @property
-    def n_lefts(self):
+    property n_lefts:
+        def __get__(self):
             cdef int n = 0
             cdef const TokenC* ptr = self.c - self.i
             while ptr != self.c:
                 if ptr + ptr.head == self.c:
                     n += 1
                 ptr += 1
             return n

-    @property
-    def n_rights(self):
+    property n_rights:
+        def __get__(self):
             cdef int n = 0
             cdef const TokenC* ptr = self.c + (self.array_len - self.i)
             while ptr != self.c:
                 if ptr + ptr.head == self.c:
                     n += 1
                 ptr -= 1
             return n

-    @property
-    def lefts(self):
+    property lefts:
+        def __get__(self):
             """The leftward immediate children of the word, in the syntactic
             dependency parse.
             """
             cdef const TokenC* ptr = self.c - self.i
             while ptr < self.c:
                 # If this head is still to the right of us, we can skip to it
                 # No token that's between this token and this head could be our
                 # child.
                 if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
                     ptr += ptr.head
                 elif ptr + ptr.head == self.c:
                     yield Token.cinit(self.vocab, self._string,
                                       ptr, ptr - (self.c - self.i), self.array_len,
                                       self._py, self._tag_strings, self._dep_strings)
                     ptr += 1
                 else:
                     ptr += 1

-    @property
-    def rights(self):
+    property rights:
+        def __get__(self):
             """The rightward immediate children of the word, in the syntactic
             dependency parse."""
             cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
             while ptr > self.c:
                 # If this head is still to the right of us, we can skip to it
                 # No token that's between this token and this head could be our
                 # child.
                 if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
                     ptr += ptr.head
                 elif ptr + ptr.head == self.c:
                     yield Token.cinit(self.vocab, self._string,
                                       ptr, ptr - (self.c - self.i), self.array_len,
                                       self._py, self._tag_strings, self._dep_strings)
                     ptr -= 1
                 else:
                     ptr -= 1

-    @property
-    def head(self):
+    property head:
+        def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
             return Token.cinit(self.vocab, self._string,
                                self.c + self.c.head, self.i + self.c.head, self.array_len,
                                self._py, self._tag_strings, self._dep_strings)

-    @property
-    def whitespace_(self):
+    property whitespace_:
+        def __get__(self):
             return self.string[self.c.lex.length:]

-    @property
-    def orth_(self):
+    property orth_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.orth]

-    @property
-    def lower_(self):
+    property lower_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.lower]

-    @property
-    def norm_(self):
+    property norm_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.norm]

-    @property
-    def shape_(self):
+    property shape_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.shape]

-    @property
-    def prefix_(self):
+    property prefix_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.prefix]

-    @property
-    def suffix_(self):
+    property suffix_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

-    @property
-    def lemma_(self):
+    property lemma_:
+        def __get__(self):
             return self.vocab.strings[self.c.lemma]

-    @property
-    def pos_(self):
+    property pos_:
+        def __get__(self):
             return _pos_id_to_string[self.c.pos]

-    @property
-    def tag_(self):
+    property tag_:
+        def __get__(self):
             return self._tag_strings[self.c.tag]

-    @property
-    def dep_(self):
+    property dep_:
+        def __get__(self):
             return self._dep_strings[self.c.dep]


 _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
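
The lefts and rights generators above lean on the TokenC memory layout: a document's tokens sit in one contiguous array, and each entry stores its syntactic head as a relative offset, so ptr + ptr.head lands on the head's entry. Below is a small pure-Python model of the same walk, minus the skip-ahead optimization in the Cython version; the example sentence and offsets are made up for illustration.

# Heads stored as relative offsets, as in the TokenC array.
# Sentence: "An orphan token" -- 'An' and 'orphan' both attach to 'token'.
head_offsets = [2, 1, 0]   # token i's head sits at index i + head_offsets[i]

def lefts(i):
    """Indices of i's leftward children, mirroring Token.lefts."""
    for j in range(i):                 # walk the array left-to-right up to i
        if j + head_offsets[j] == i:
            yield j

def rights(i):
    """Indices of i's rightward children, mirroring Token.rights."""
    for j in range(len(head_offsets) - 1, i, -1):  # walk right-to-left down to i
        if j + head_offsets[j] == i:
            yield j

print(list(lefts(2)))    # [0, 1]
print(list(rights(2)))   # []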

View File

@@ -0,0 +1,24 @@
+from __future__ import unicode_literals
+
+import pytest
+import gc
+
+from spacy.en import English
+
+
+def get_orphan_token(text, i):
+    nlp = English()
+    tokens = nlp(text)
+    gc.collect()
+    token = tokens[i]
+    del tokens
+    return token
+
+
+def test_orphan():
+    orphan = get_orphan_token('An orphan token', 1)
+    gc.collect()
+    dummy = get_orphan_token('Load and flush the memory', 0)
+    dummy = get_orphan_token('Load again...', 0)
+    assert orphan.orth_ == 'orphan'
+    assert orphan.pos_ == 'ADJ'
+    assert orphan.head.orth_ == 'token'
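
The helper deliberately drops every reference to the Tokens object before returning, so its __dealloc__ runs while one Token is still externally owned; the refcount check then triggers take_ownership_of_c_data, and the assertions at the end read from the token's private copy after the parent's array is gone. The same guarantee, seen from user code (a sketch assuming a working spacy.en install of this vintage):

import gc
from spacy.en import English

nlp = English()
token = nlp(u'An orphan token')[1]   # keep a single Token; the Tokens object
gc.collect()                         # that produced it is reclaimed here
assert token.orth_ == u'orphan'      # still safe: the token owns its own C data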