* Pass ownership of C data to Token instances if Tokens object is being garbage-collected, but Token instances are staying alive.

Matthew Honnibal 2015-02-11 18:05:06 -05:00
parent db3f26a51b
commit 7572e31f5e
3 changed files with 205 additions and 150 deletions
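In outline, the change works like this: Tokens owns one flat C array of TokenC structs and hands out Token objects that merely point into it; when the Tokens object is deallocated, any Token still referenced from user code copies the slice of data it needs and marks itself as owning it. Below is a minimal pure-Python sketch of that copy-on-orphan pattern, not spaCy's API: the TokenContainer/TokenView names are invented, and the list-backed buffer stands in for the malloc'd TokenC array, so the copy is only illustrative (in pure Python the views would keep the list alive anyway).

import sys


class TokenView:
    """A lightweight view into a buffer owned by a parent container (hypothetical)."""

    def __init__(self, buf, i):
        self._buf = buf            # borrowed: the container owns this
        self._i = i
        self._owns_data = False

    def take_ownership(self):
        # Copy out the data this view needs so it no longer depends on the
        # container's buffer. In the commit this is PyMem_Malloc + memcpy of
        # the TokenC array; a plain list copy stands in for it here.
        self._buf = list(self._buf)
        self._owns_data = True

    @property
    def value(self):
        return self._buf[self._i]


class TokenContainer:
    def __init__(self, values):
        self._buf = list(values)
        self._views = [None] * len(values)

    def __getitem__(self, i):
        # Views are created lazily and cached, like Tokens._py_tokens.
        if self._views[i] is None:
            self._views[i] = TokenView(self._buf, i)
        return self._views[i]

    def __del__(self):
        # Counterpart of Tokens.__dealloc__: a view referenced only by the
        # self._views entry, the loop variable, and getrefcount's own argument
        # reports a refcount of 3; anything higher means an outside reference
        # is keeping it alive, so it must copy the data before the buffer goes
        # away. (The exact threshold depends on how many references the
        # surrounding code itself holds at the call site.)
        for view in self._views:
            if view is not None and sys.getrefcount(view) > 3:
                view.take_ownership()


# Usage: the view outlives its container and still works after the copy.
container = TokenContainer([10, 20, 30])
survivor = container[1]
del container
print(survivor.value)   # 20

The real code below does the equivalent in Tokens.__dealloc__, using sys.getrefcount to detect surviving Token objects and PyMem_Malloc plus memcpy in Token.take_ownership_of_c_data to make the copy.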


@@ -58,6 +58,7 @@ cdef class Token:
     cdef const TokenC* c
     cdef readonly int i
     cdef int array_len
+    cdef bint _owns_c_data
     cdef list _py
@@ -86,3 +87,5 @@ cdef class Token:
         self._dep_strings = dep_strings
         py_tokens[offset] = self
         return self
+
+    cdef int take_ownership_of_c_data(self) except -1


@@ -17,6 +17,9 @@ import numpy
 cimport cython
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
+from libc.string cimport memcpy
+import sys

 DEF PADDING = 5
@@ -92,6 +95,21 @@ cdef class Tokens:
         self._tag_strings = tuple() # These will be set by the POS tagger and parser
         self._dep_strings = tuple() # The strings are arbitrary and model-specific.

+    def __dealloc__(self):
+        # The Token object initially only gets a view of the underlying C
+        # data --- it doesn't own it. But, if we have Token objects that are
+        # going to outlive this instance, those objects need a copy of the C
+        # data.
+        cdef Token token
+        if self._py_tokens is not None:
+            for token in self._py_tokens:
+                if token is not None:
+                    # Why 3? 1 for the entry in the _py_tokens list,
+                    # and 1 for this reference. If we have _another_ ref, then
+                    # the token will live, and needs to own its data.
+                    if sys.getrefcount(token) >= 3:
+                        token.take_ownership_of_c_data()
+
     def __getitem__(self, object i):
         """Retrieve a token.
@@ -139,8 +157,6 @@ cdef class Tokens:
         self._py_tokens.append(None)
         return idx + t.lex.length

     @cython.boundscheck(False)
     cpdef long[:,:] to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy ndarray
@@ -234,88 +250,100 @@ cdef class Tokens:

 cdef class Token:
-    """An individual token."""
+    """An individual token --- i.e. a word, a punctuation symbol, etc. Created
+    via Tokens.__getitem__ and Tokens.__iter__.
+    """
     def __cinit__(self, Vocab vocab, unicode string):
         self.vocab = vocab
         self._string = string

+    def __dealloc__(self):
+        if self._owns_c_data:
+            # Cast through const, if we own the data
+            PyMem_Free(<void*>self.c)
+
     def __len__(self):
         return self.c.lex.length

+    def __unicode__(self):
+        return self.string
+
+    cdef int take_ownership_of_c_data(self) except -1:
+        owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
+        memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len)
+        self.c = owned_data
+        self._owns_c_data = True
+
     def nbor(self, int i=1):
         return Token.cinit(self.vocab, self._string,
                            self.c, self.i, self.array_len,
                            self._py, self._tag_strings, self._dep_strings)

-    @property
-    def string(self):
+    property string:
+        def __get__(self):
             cdef int next_idx = (self.c + 1).idx
             if next_idx < self.c.idx:
                 next_idx = self.c.idx + self.c.lex.length
             return self._string[self.c.idx:next_idx]

-    @property
-    def prob(self):
+    property prob:
+        def __get__(self):
             return self.c.lex.prob

-    @property
-    def idx(self):
+    property idx:
+        def __get__(self):
             return self.c.idx

-    @property
-    def cluster(self):
+    property cluster:
+        def __get__(self):
             return self.c.lex.cluster

-    @property
-    def cluster(self):
-        return self.c.lex.cluster
-
-    @property
-    def orth(self):
+    property orth:
+        def __get__(self):
             return self.c.lex.orth

-    @property
-    def lower(self):
+    property lower:
+        def __get__(self):
             return self.c.lex.lower

-    @property
-    def norm(self):
+    property norm:
+        def __get__(self):
             return self.c.lex.norm

-    @property
-    def shape(self):
+    property shape:
+        def __get__(self):
             return self.c.lex.shape

-    @property
-    def prefix(self):
+    property prefix:
+        def __get__(self):
             return self.c.lex.prefix

-    @property
-    def suffix(self):
+    property suffix:
+        def __get__(self):
             return self.c.lex.suffix

-    @property
-    def lemma(self):
+    property lemma:
+        def __get__(self):
             return self.c.lemma

-    @property
-    def pos(self):
+    property pos:
+        def __get__(self):
             return self.c.pos

-    @property
-    def tag(self):
+    property tag:
+        def __get__(self):
             return self.c.tag

-    @property
-    def dep(self):
+    property dep:
+        def __get__(self):
             return self.c.dep

-    @property
-    def repvec(self):
+    property repvec:
+        def __get__(self):
             return numpy.asarray(<float[:300,]> self.c.lex.repvec)

-    @property
-    def n_lefts(self):
+    property n_lefts:
+        def __get__(self):
             cdef int n = 0
             cdef const TokenC* ptr = self.c - self.i
             while ptr != self.c:
@@ -324,8 +352,8 @@ cdef class Token:
                 ptr += 1
             return n

-    @property
-    def n_rights(self):
+    property n_rights:
+        def __get__(self):
             cdef int n = 0
             cdef const TokenC* ptr = self.c + (self.array_len - self.i)
             while ptr != self.c:
@@ -334,8 +362,8 @@ cdef class Token:
                 ptr -= 1
             return n

-    @property
-    def lefts(self):
+    property lefts:
+        def __get__(self):
             """The leftward immediate children of the word, in the syntactic
             dependency parse.
             """
@@ -355,8 +383,8 @@ cdef class Token:
             else:
                 ptr += 1

-    @property
-    def rights(self):
+    property rights:
+        def __get__(self):
             """The rightward immediate children of the word, in the syntactic
             dependency parse."""
             cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
@@ -374,55 +402,55 @@ cdef class Token:
             else:
                 ptr -= 1

-    @property
-    def head(self):
+    property head:
+        def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
             return Token.cinit(self.vocab, self._string,
                                self.c + self.c.head, self.i + self.c.head, self.array_len,
                                self._py, self._tag_strings, self._dep_strings)

-    @property
-    def whitespace_(self):
+    property whitespace_:
+        def __get__(self):
             return self.string[self.c.lex.length:]

-    @property
-    def orth_(self):
+    property orth_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.orth]

-    @property
-    def lower_(self):
+    property lower_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.lower]

-    @property
-    def norm_(self):
+    property norm_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.norm]

-    @property
-    def shape_(self):
+    property shape_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.shape]

-    @property
-    def prefix_(self):
+    property prefix_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.prefix]

-    @property
-    def suffix_(self):
+    property suffix_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

-    @property
-    def lemma_(self):
+    property lemma_:
+        def __get__(self):
             return self.vocab.strings[self.c.lemma]

-    @property
-    def pos_(self):
+    property pos_:
+        def __get__(self):
             return _pos_id_to_string[self.c.pos]

-    @property
-    def tag_(self):
+    property tag_:
+        def __get__(self):
             return self._tag_strings[self.c.tag]

-    @property
-    def dep_(self):
+    property dep_:
+        def __get__(self):
             return self._dep_strings[self.c.dep]


@@ -0,0 +1,24 @@
+from __future__ import unicode_literals
+import pytest
+import gc
+
+from spacy.en import English
+
+
+def get_orphan_token(text, i):
+    nlp = English()
+    tokens = nlp(text)
+    gc.collect()
+    token = tokens[i]
+    del tokens
+    return token
+
+
+def test_orphan():
+    orphan = get_orphan_token('An orphan token', 1)
+    gc.collect()
+    dummy = get_orphan_token('Load and flush the memory', 0)
+    dummy = get_orphan_token('Load again...', 0)
+    assert orphan.orth_ == 'orphan'
+    assert orphan.pos_ == 'ADJ'
+    assert orphan.head.orth_ == 'token'
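
A note on the sys.getrefcount(token) >= 3 check in Tokens.__dealloc__ above: getrefcount always reports one extra reference for its own argument, so the baseline at any call site is that argument plus whatever references the surrounding code itself holds (here the _py_tokens entry and the loop variable). A small standalone sketch of that behaviour in plain CPython; the printed values are typical, not guaranteed across implementations:

import sys

x = object()
print(sys.getrefcount(x))        # typically 2: the name `x` plus getrefcount's own argument

holder = [x]
print(sys.getrefcount(x))        # typically 3: `x`, the list entry, and the argument

for item in holder:
    # Inside a loop over the list, the loop variable adds one more reference.
    print(sys.getrefcount(item))  # typically 4: `x`, the list entry, `item`, and the argument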