* Pass ownership of C data to Token instances if Tokens object is being garbage-collected, but Token instances are staying alive.

Matthew Honnibal 2015-02-11 18:05:06 -05:00
parent db3f26a51b
commit 7572e31f5e
3 changed files with 205 additions and 150 deletions
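In outline, the change works like this: Tokens owns one flat C array of TokenC structs and hands out Token objects that merely point into it; when the Tokens object is deallocated, any Token still referenced from user code copies the slice of data it needs and marks itself as owning it. Below is a minimal pure-Python sketch of that copy-on-orphan pattern, not spaCy's API: the TokenContainer/TokenView names are invented, and the list-backed buffer stands in for the malloc'd TokenC array, so the copy is only illustrative (in pure Python the views would keep the list alive anyway).

import sys


class TokenView:
    """A lightweight view into a buffer owned by a parent container (hypothetical)."""

    def __init__(self, buf, i):
        self._buf = buf            # borrowed: the container owns this
        self._i = i
        self._owns_data = False

    def take_ownership(self):
        # Copy out the data this view needs so it no longer depends on the
        # container's buffer. In the commit this is PyMem_Malloc + memcpy of
        # the TokenC array; a plain list copy stands in for it here.
        self._buf = list(self._buf)
        self._owns_data = True

    @property
    def value(self):
        return self._buf[self._i]


class TokenContainer:
    def __init__(self, values):
        self._buf = list(values)
        self._views = [None] * len(values)

    def __getitem__(self, i):
        # Views are created lazily and cached, like Tokens._py_tokens.
        if self._views[i] is None:
            self._views[i] = TokenView(self._buf, i)
        return self._views[i]

    def __del__(self):
        # Counterpart of Tokens.__dealloc__: a view referenced only by the
        # self._views entry, the loop variable, and getrefcount's own argument
        # reports a refcount of 3; anything higher means an outside reference
        # is keeping it alive, so it must copy the data before the buffer goes
        # away. (The exact threshold depends on how many references the
        # surrounding code itself holds at the call site.)
        for view in self._views:
            if view is not None and sys.getrefcount(view) > 3:
                view.take_ownership()


# Usage: the view outlives its container and still works after the copy.
container = TokenContainer([10, 20, 30])
survivor = container[1]
del container
print(survivor.value)   # 20

The real code below does the equivalent in Tokens.__dealloc__, using sys.getrefcount to detect surviving Token objects and PyMem_Malloc plus memcpy in Token.take_ownership_of_c_data to make the copy.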


@@ -58,6 +58,7 @@ cdef class Token:
     cdef const TokenC* c
     cdef readonly int i
     cdef int array_len
+    cdef bint _owns_c_data
     cdef list _py
@@ -86,3 +87,5 @@ cdef class Token:
         self._dep_strings = dep_strings
         py_tokens[offset] = self
         return self
+
+    cdef int take_ownership_of_c_data(self) except -1


@@ -17,6 +17,9 @@ import numpy
 cimport cython
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
+from libc.string cimport memcpy
+import sys

 DEF PADDING = 5
@@ -92,6 +95,21 @@ cdef class Tokens:
         self._tag_strings = tuple() # These will be set by the POS tagger and parser
         self._dep_strings = tuple() # The strings are arbitrary and model-specific.

+    def __dealloc__(self):
+        # The Token object initially only gets a view of the underlying C
+        # data --- it doesn't own it. But, if we have Token objects that are
+        # going to outlive this instance, those objects need a copy of the C
+        # data.
+        cdef Token token
+        if self._py_tokens is not None:
+            for token in self._py_tokens:
+                if token is not None:
+                    # Why 3? 1 for the entry in the _py_tokens list,
+                    # and 1 for this reference. If we have _another_ ref, then
+                    # the token will live, and needs to own its data.
+                    if sys.getrefcount(token) >= 3:
+                        token.take_ownership_of_c_data()
+
     def __getitem__(self, object i):
         """Retrieve a token.
@@ -139,8 +157,6 @@ cdef class Tokens:
         self._py_tokens.append(None)
         return idx + t.lex.length

     @cython.boundscheck(False)
     cpdef long[:,:] to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy ndarray
@@ -234,88 +250,100 @@ cdef class Tokens:

 cdef class Token:
-    """An individual token."""
+    """An individual token --- i.e. a word, a punctuation symbol, etc. Created
+    via Tokens.__getitem__ and Tokens.__iter__.
+    """
     def __cinit__(self, Vocab vocab, unicode string):
         self.vocab = vocab
         self._string = string

+    def __dealloc__(self):
+        if self._owns_c_data:
+            # Cast through const, if we own the data
+            PyMem_Free(<void*>self.c)
+
     def __len__(self):
         return self.c.lex.length

+    def __unicode__(self):
+        return self.string
+
+    cdef int take_ownership_of_c_data(self) except -1:
+        owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
+        memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len)
+        self.c = owned_data
+        self._owns_c_data = True
+
     def nbor(self, int i=1):
         return Token.cinit(self.vocab, self._string,
                            self.c, self.i, self.array_len,
                            self._py, self._tag_strings, self._dep_strings)

-    @property
-    def string(self):
+    property string:
+        def __get__(self):
             cdef int next_idx = (self.c + 1).idx
             if next_idx < self.c.idx:
                 next_idx = self.c.idx + self.c.lex.length
             return self._string[self.c.idx:next_idx]

-    @property
-    def prob(self):
+    property prob:
+        def __get__(self):
             return self.c.lex.prob

-    @property
-    def idx(self):
+    property idx:
+        def __get__(self):
             return self.c.idx

-    @property
-    def cluster(self):
+    property cluster:
+        def __get__(self):
             return self.c.lex.cluster

-    @property
-    def cluster(self):
-        return self.c.lex.cluster
-
-    @property
-    def orth(self):
+    property orth:
+        def __get__(self):
             return self.c.lex.orth

-    @property
-    def lower(self):
+    property lower:
+        def __get__(self):
             return self.c.lex.lower

-    @property
-    def norm(self):
+    property norm:
+        def __get__(self):
             return self.c.lex.norm

-    @property
-    def shape(self):
+    property shape:
+        def __get__(self):
             return self.c.lex.shape

-    @property
-    def prefix(self):
+    property prefix:
+        def __get__(self):
             return self.c.lex.prefix

-    @property
-    def suffix(self):
+    property suffix:
+        def __get__(self):
             return self.c.lex.suffix

-    @property
-    def lemma(self):
+    property lemma:
+        def __get__(self):
             return self.c.lemma

-    @property
-    def pos(self):
+    property pos:
+        def __get__(self):
             return self.c.pos

-    @property
-    def tag(self):
+    property tag:
+        def __get__(self):
             return self.c.tag

-    @property
-    def dep(self):
+    property dep:
+        def __get__(self):
             return self.c.dep

-    @property
-    def repvec(self):
+    property repvec:
+        def __get__(self):
             return numpy.asarray(<float[:300,]> self.c.lex.repvec)

-    @property
-    def n_lefts(self):
+    property n_lefts:
+        def __get__(self):
             cdef int n = 0
             cdef const TokenC* ptr = self.c - self.i
             while ptr != self.c:
@@ -324,8 +352,8 @@ cdef class Token:
                 ptr += 1
             return n

-    @property
-    def n_rights(self):
+    property n_rights:
+        def __get__(self):
             cdef int n = 0
             cdef const TokenC* ptr = self.c + (self.array_len - self.i)
             while ptr != self.c:
@@ -334,8 +362,8 @@ cdef class Token:
                 ptr -= 1
             return n

-    @property
-    def lefts(self):
+    property lefts:
+        def __get__(self):
             """The leftward immediate children of the word, in the syntactic
             dependency parse.
             """
@@ -355,8 +383,8 @@ cdef class Token:
             else:
                 ptr += 1

-    @property
-    def rights(self):
+    property rights:
+        def __get__(self):
             """The rightward immediate children of the word, in the syntactic
             dependency parse."""
             cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
@@ -374,55 +402,55 @@ cdef class Token:
             else:
                 ptr -= 1

-    @property
-    def head(self):
+    property head:
+        def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
             return Token.cinit(self.vocab, self._string,
                                self.c + self.c.head, self.i + self.c.head, self.array_len,
                                self._py, self._tag_strings, self._dep_strings)

-    @property
-    def whitespace_(self):
+    property whitespace_:
+        def __get__(self):
             return self.string[self.c.lex.length:]

-    @property
-    def orth_(self):
+    property orth_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.orth]

-    @property
-    def lower_(self):
+    property lower_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.lower]

-    @property
-    def norm_(self):
+    property norm_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.norm]

-    @property
-    def shape_(self):
+    property shape_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.shape]

-    @property
-    def prefix_(self):
+    property prefix_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.prefix]

-    @property
-    def suffix_(self):
+    property suffix_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

-    @property
-    def lemma_(self):
+    property lemma_:
+        def __get__(self):
             return self.vocab.strings[self.c.lemma]

-    @property
-    def pos_(self):
+    property pos_:
+        def __get__(self):
             return _pos_id_to_string[self.c.pos]

-    @property
-    def tag_(self):
+    property tag_:
+        def __get__(self):
             return self._tag_strings[self.c.tag]

-    @property
-    def dep_(self):
+    property dep_:
+        def __get__(self):
             return self._dep_strings[self.c.dep]


@@ -0,0 +1,24 @@
+from __future__ import unicode_literals
+import pytest
+import gc
+
+from spacy.en import English
+
+
+def get_orphan_token(text, i):
+    nlp = English()
+    tokens = nlp(text)
+    gc.collect()
+    token = tokens[i]
+    del tokens
+    return token
+
+
+def test_orphan():
+    orphan = get_orphan_token('An orphan token', 1)
+    gc.collect()
+    dummy = get_orphan_token('Load and flush the memory', 0)
+    dummy = get_orphan_token('Load again...', 0)
+    assert orphan.orth_ == 'orphan'
+    assert orphan.pos_ == 'ADJ'
+    assert orphan.head.orth_ == 'token'
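
A note on the sys.getrefcount(token) >= 3 check in Tokens.__dealloc__ above: getrefcount always reports one extra reference for its own argument, so the baseline at any call site is that argument plus whatever references the surrounding code itself holds (here the _py_tokens entry and the loop variable). A small standalone sketch of that behaviour in plain CPython; the printed values are typical, not guaranteed across implementations:

import sys

x = object()
print(sys.getrefcount(x))        # typically 2: the name `x` plus getrefcount's own argument

holder = [x]
print(sys.getrefcount(x))        # typically 3: `x`, the list entry, and the argument

for item in holder:
    # Inside a loop over the list, the loop variable adds one more reference.
    print(sys.getrefcount(item))  # typically 4: `x`, the list entry, `item`, and the argument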