* Pass ownership of C data to Token instances if the Tokens object is being garbage-collected but Token instances are staying alive.

Matthew Honnibal 2015-02-11 18:05:06 -05:00
parent db3f26a51b
commit 7572e31f5e
3 changed files with 205 additions and 150 deletions
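
Context for the change: a Token is a thin view over an entry in a TokenC array that its parent Tokens object owns, so a Token that outlives its parent would be left reading freed memory. The fix is to detect, at Tokens deallocation time, which Token views are still referenced from outside, and hand each of them a private copy of the C data. Below is a pure-Python sketch of that pattern; ArrayOwner, View, and take_ownership are illustrative names, not spaCy's API, and in pure Python the buffer would survive by refcounting anyway, which is exactly what manually managed C memory does not do.

import sys

class View:
    """A lightweight view into storage owned by someone else."""
    def __init__(self, buf, i):
        self._buf = buf            # borrowed: the owner's storage
        self._i = i
        self._owns_data = False

    def take_ownership(self):
        # Analogue of take_ownership_of_c_data: copy the shared storage
        # so the view no longer depends on the owner being alive.
        self._buf = list(self._buf)
        self._owns_data = True

    @property
    def value(self):
        return self._buf[self._i]

class ArrayOwner:
    """Owns the storage and hands copies to views that will outlive it."""
    def __init__(self, data):
        self._data = list(data)
        self._views = []

    def view(self, i):
        v = View(self._data, i)
        self._views.append(v)
        return v

    def __del__(self):
        # Analogue of Tokens.__dealloc__. The baseline here is 3 references:
        # the _views entry, the loop variable, and getrefcount's own argument.
        # Anything above that means a caller still holds the view.
        for v in self._views:
            if sys.getrefcount(v) >= 4:
                v.take_ownership()

owner = ArrayOwner([10, 20, 30])
keeper = owner.view(1)
del owner                  # keeper survives, now backed by its own copy
assert keeper.value == 20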

View File

@@ -58,6 +58,7 @@ cdef class Token:
     cdef const TokenC* c
     cdef readonly int i
     cdef int array_len
+    cdef bint _owns_c_data
     cdef list _py
@@ -86,3 +87,5 @@ cdef class Token:
         self._dep_strings = dep_strings
         py_tokens[offset] = self
         return self
+
+    cdef int take_ownership_of_c_data(self) except -1

View File

@ -17,6 +17,9 @@ import numpy
cimport cython cimport cython
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.string cimport memcpy
import sys
DEF PADDING = 5 DEF PADDING = 5
@@ -92,6 +95,21 @@ cdef class Tokens:
         self._tag_strings = tuple()  # These will be set by the POS tagger and parser
         self._dep_strings = tuple()  # The strings are arbitrary and model-specific.

+    def __dealloc__(self):
+        # The Token object initially only gets a view of the underlying C
+        # data --- it doesn't own it. But, if we have Token objects that are
+        # going to outlive this instance, those objects need a copy of the C
+        # data.
+        cdef Token token
+        if self._py_tokens is not None:
+            for token in self._py_tokens:
+                if token is not None:
+                    # Why 3? 1 for the entry in the _py_tokens list,
+                    # and 1 for this reference. If we have _another_ ref, then
+                    # the token will live, and needs to own its data.
+                    if sys.getrefcount(token) >= 3:
+                        token.take_ownership_of_c_data()
+
     def __getitem__(self, object i):
         """Retrieve a token.
@@ -139,8 +157,6 @@ cdef class Tokens:
             self._py_tokens.append(None)
         return idx + t.lex.length

-
-
     @cython.boundscheck(False)
     cpdef long[:,:] to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy ndarray
@@ -234,196 +250,208 @@ cdef class Tokens:
 cdef class Token:
-    """An individual token."""
+    """An individual token --- i.e. a word, a punctuation symbol, etc. Created
+    via Tokens.__getitem__ and Tokens.__iter__.
+    """
     def __cinit__(self, Vocab vocab, unicode string):
         self.vocab = vocab
         self._string = string

+    def __dealloc__(self):
+        if self._owns_c_data:
+            # Cast through const, if we own the data
+            PyMem_Free(<void*>self.c)
+
     def __len__(self):
         return self.c.lex.length

+    def __unicode__(self):
+        return self.string
+
+    cdef int take_ownership_of_c_data(self) except -1:
+        owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
+        memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len)
+        self.c = owned_data
+        self._owns_c_data = True
+
     def nbor(self, int i=1):
         return Token.cinit(self.vocab, self._string,
                            self.c, self.i, self.array_len,
                            self._py, self._tag_strings, self._dep_strings)

-    @property
-    def string(self):
+    property string:
+        def __get__(self):
             cdef int next_idx = (self.c + 1).idx
             if next_idx < self.c.idx:
                 next_idx = self.c.idx + self.c.lex.length
             return self._string[self.c.idx:next_idx]

-    @property
-    def prob(self):
+    property prob:
+        def __get__(self):
             return self.c.lex.prob

-    @property
-    def idx(self):
+    property idx:
+        def __get__(self):
             return self.c.idx

-    @property
-    def cluster(self):
+    property cluster:
+        def __get__(self):
             return self.c.lex.cluster

-    @property
-    def cluster(self):
-        return self.c.lex.cluster
-
-    @property
-    def orth(self):
+    property orth:
+        def __get__(self):
             return self.c.lex.orth

-    @property
-    def lower(self):
+    property lower:
+        def __get__(self):
             return self.c.lex.lower

-    @property
-    def norm(self):
+    property norm:
+        def __get__(self):
             return self.c.lex.norm

-    @property
-    def shape(self):
+    property shape:
+        def __get__(self):
             return self.c.lex.shape

-    @property
-    def prefix(self):
+    property prefix:
+        def __get__(self):
             return self.c.lex.prefix

-    @property
-    def suffix(self):
+    property suffix:
+        def __get__(self):
             return self.c.lex.suffix

-    @property
-    def lemma(self):
+    property lemma:
+        def __get__(self):
             return self.c.lemma

-    @property
-    def pos(self):
+    property pos:
+        def __get__(self):
             return self.c.pos

-    @property
-    def tag(self):
+    property tag:
+        def __get__(self):
             return self.c.tag

-    @property
-    def dep(self):
+    property dep:
+        def __get__(self):
             return self.c.dep

-    @property
-    def repvec(self):
+    property repvec:
+        def __get__(self):
             return numpy.asarray(<float[:300,]> self.c.lex.repvec)

-    @property
-    def n_lefts(self):
+    property n_lefts:
+        def __get__(self):
             cdef int n = 0
             cdef const TokenC* ptr = self.c - self.i
             while ptr != self.c:
                 if ptr + ptr.head == self.c:
                     n += 1
                 ptr += 1
             return n

-    @property
-    def n_rights(self):
+    property n_rights:
+        def __get__(self):
             cdef int n = 0
             cdef const TokenC* ptr = self.c + (self.array_len - self.i)
             while ptr != self.c:
                 if ptr + ptr.head == self.c:
                     n += 1
                 ptr -= 1
             return n

-    @property
-    def lefts(self):
+    property lefts:
+        def __get__(self):
             """The leftward immediate children of the word, in the syntactic
             dependency parse.
             """
             cdef const TokenC* ptr = self.c - self.i
             while ptr < self.c:
                 # If this head is still to the right of us, we can skip to it
                 # No token that's between this token and this head could be our
                 # child.
                 if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
                     ptr += ptr.head
                 elif ptr + ptr.head == self.c:
                     yield Token.cinit(self.vocab, self._string,
                                       ptr, ptr - (self.c - self.i), self.array_len,
                                       self._py, self._tag_strings, self._dep_strings)
                     ptr += 1
                 else:
                     ptr += 1

-    @property
-    def rights(self):
+    property rights:
+        def __get__(self):
             """The rightward immediate children of the word, in the syntactic
             dependency parse."""
             cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
             while ptr > self.c:
                 # If this head is still to the right of us, we can skip to it
                 # No token that's between this token and this head could be our
                 # child.
                 if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
                     ptr += ptr.head
                 elif ptr + ptr.head == self.c:
                     yield Token.cinit(self.vocab, self._string,
                                       ptr, ptr - (self.c - self.i), self.array_len,
                                       self._py, self._tag_strings, self._dep_strings)
                     ptr -= 1
                 else:
                     ptr -= 1

-    @property
-    def head(self):
+    property head:
+        def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
             return Token.cinit(self.vocab, self._string,
                                self.c + self.c.head, self.i + self.c.head, self.array_len,
                                self._py, self._tag_strings, self._dep_strings)

-    @property
-    def whitespace_(self):
+    property whitespace_:
+        def __get__(self):
             return self.string[self.c.lex.length:]

-    @property
-    def orth_(self):
+    property orth_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.orth]

-    @property
-    def lower_(self):
+    property lower_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.lower]

-    @property
-    def norm_(self):
+    property norm_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.norm]

-    @property
-    def shape_(self):
+    property shape_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.shape]

-    @property
-    def prefix_(self):
+    property prefix_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.prefix]

-    @property
-    def suffix_(self):
+    property suffix_:
+        def __get__(self):
             return self.vocab.strings[self.c.lex.suffix]

-    @property
-    def lemma_(self):
+    property lemma_:
+        def __get__(self):
             return self.vocab.strings[self.c.lemma]

-    @property
-    def pos_(self):
+    property pos_:
+        def __get__(self):
             return _pos_id_to_string[self.c.pos]

-    @property
-    def tag_(self):
+    property tag_:
+        def __get__(self):
             return self._tag_strings[self.c.tag]

-    @property
-    def dep_(self):
+    property dep_:
+        def __get__(self):
             return self._dep_strings[self.c.dep]


 _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
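
The lefts and rights generators above lean on the TokenC memory layout: a document's tokens sit in one contiguous array, and each entry stores its syntactic head as a relative offset, so ptr + ptr.head lands on the head's entry. Below is a small pure-Python model of the same walk, minus the skip-ahead optimization in the Cython version; the example sentence and offsets are made up for illustration.

# Heads stored as relative offsets, as in the TokenC array.
# Sentence: "An orphan token" -- 'An' and 'orphan' both attach to 'token'.
head_offsets = [2, 1, 0]   # token i's head sits at index i + head_offsets[i]

def lefts(i):
    """Indices of i's leftward children, mirroring Token.lefts."""
    for j in range(i):                 # walk the array left-to-right up to i
        if j + head_offsets[j] == i:
            yield j

def rights(i):
    """Indices of i's rightward children, mirroring Token.rights."""
    for j in range(len(head_offsets) - 1, i, -1):  # walk right-to-left down to i
        if j + head_offsets[j] == i:
            yield j

print(list(lefts(2)))    # [0, 1]
print(list(rights(2)))   # []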

View File

@@ -0,0 +1,24 @@
+from __future__ import unicode_literals
+
+import pytest
+import gc
+
+from spacy.en import English
+
+
+def get_orphan_token(text, i):
+    nlp = English()
+    tokens = nlp(text)
+    gc.collect()
+    token = tokens[i]
+    del tokens
+    return token
+
+
+def test_orphan():
+    orphan = get_orphan_token('An orphan token', 1)
+    gc.collect()
+    dummy = get_orphan_token('Load and flush the memory', 0)
+    dummy = get_orphan_token('Load again...', 0)
+    assert orphan.orth_ == 'orphan'
+    assert orphan.pos_ == 'ADJ'
+    assert orphan.head.orth_ == 'token'
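
The helper deliberately drops every reference to the Tokens object before returning, so its __dealloc__ runs while one Token is still externally owned; the refcount check then triggers take_ownership_of_c_data, and the assertions at the end read from the token's private copy after the parent's array is gone. The same guarantee, seen from user code (a sketch assuming a working spacy.en install of this vintage):

import gc
from spacy.en import English

nlp = English()
token = nlp(u'An orphan token')[1]   # keep a single Token; the Tokens object
gc.collect()                         # that produced it is reclaimed here
assert token.orth_ == u'orphan'      # still safe: the token owns its own C data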