* Pass ownership of C data to Token instances if Tokens object is being garbage-collected, but Token instances are staying alive.

This commit is contained in:
Matthew Honnibal 2015-02-11 18:05:06 -05:00
parent db3f26a51b
commit 7572e31f5e
3 changed files with 205 additions and 150 deletions

View File

@ -58,6 +58,7 @@ cdef class Token:
cdef const TokenC* c
cdef readonly int i
cdef int array_len
cdef bint _owns_c_data
cdef list _py
@ -86,3 +87,5 @@ cdef class Token:
self._dep_strings = dep_strings
py_tokens[offset] = self
return self
cdef int take_ownership_of_c_data(self) except -1

View File

@ -17,6 +17,9 @@ import numpy
cimport cython
from cpython.mem cimport PyMem_Malloc, PyMem_Free
from libc.string cimport memcpy
import sys
DEF PADDING = 5
@ -92,6 +95,21 @@ cdef class Tokens:
self._tag_strings = tuple() # These will be set by the POS tagger and parser
self._dep_strings = tuple() # The strings are arbitrary and model-specific.
def __dealloc__(self):
    # Token objects handed out by this container start life as views onto
    # the container's C data; they do not own it. When the container is
    # being torn down, any Token that is still referenced from elsewhere
    # must be given its own copy of that data first.
    cdef Token tok
    if self._py_tokens is not None:
        for tok in self._py_tokens:
            # Threshold of 3: one reference is the slot in _py_tokens and
            # one is the loop variable here. Anything beyond those two
            # means the token is referenced elsewhere, so it will outlive
            # this object and must own its data.
            if tok is not None and sys.getrefcount(tok) >= 3:
                tok.take_ownership_of_c_data()
def __getitem__(self, object i):
"""Retrieve a token.
@ -139,8 +157,6 @@ cdef class Tokens:
self._py_tokens.append(None)
return idx + t.lex.length
@cython.boundscheck(False)
cpdef long[:,:] to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
@ -234,196 +250,208 @@ cdef class Tokens:
cdef class Token:
"""An individual token."""
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
via Tokens.__getitem__ and Tokens.__iter__.
"""
def __cinit__(self, Vocab vocab, unicode string):
self.vocab = vocab
self._string = string
def __dealloc__(self):
    # Only a token that took a private copy of the TokenC array (see
    # take_ownership_of_c_data) may free it; otherwise self.c is a borrowed
    # view and must be left alone.
    if self._owns_c_data:
        # self.c points at element self.i of the owned array, so step back
        # to the start of the allocation. The cast drops the const
        # qualifier, which is fine because we own the memory.
        PyMem_Free(<void*>(self.c - self.i))

def __len__(self):
    """Return the length recorded on the token's lexeme."""
    return self.c.lex.length

def __unicode__(self):
    """Return the token's text, as produced by the `string` property."""
    return self.string

cdef int take_ownership_of_c_data(self) except -1:
    """Replace the borrowed view of the TokenC array with a private copy.

    The whole array of `array_len` elements is copied, starting from the
    first element (`self.c - self.i`), and `self.c` is re-pointed at the
    same position `self.i` inside the copy. This keeps pointer arithmetic
    relative to `self.c` and `self.i` (array start, neighbours, heads)
    inside the owned buffer.

    Returns 0 on success. Raises MemoryError if the allocation fails.
    Calling it again on a token that already owns its data is a no-op.
    """
    cdef TokenC* owned_data
    if self._owns_c_data:
        # Already holding a private copy; copying again would leak it.
        return 0
    owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
    if owned_data is NULL:
        raise MemoryError()
    # Copy from the start of the array, not from the current token:
    # starting at self.c would read self.i elements past the end of the
    # source and drop every element to the left of this token.
    memcpy(owned_data, self.c - self.i, sizeof(TokenC) * self.array_len)
    # NOTE(review): the `string` getter reads (self.c + 1).idx; for the
    # final token that is one element past this copy. Confirm whether
    # array_len is meant to include trailing padding.
    self.c = owned_data + self.i
    self._owns_c_data = True
    return 0
def nbor(self, int i=1):
    """Return the token `i` positions away from this one.

    `i` defaults to 1 (the next token); negative values look to the left.
    Raises IndexError if the requested position lies outside the array.
    """
    cdef int target = self.i + i
    if target < 0 or target >= self.array_len:
        raise IndexError(
            "Neighbour offset %d from token %d is outside the array "
            "(length %d)" % (i, self.i, self.array_len))
    # Apply the offset to both the C pointer and the index; passing
    # self.c / self.i unchanged would just rebuild the current token.
    return Token.cinit(self.vocab, self._string,
                       self.c + i, target, self.array_len,
                       self._py, self._tag_strings, self._dep_strings)
@property
def string(self):
cdef int next_idx = (self.c + 1).idx
if next_idx < self.c.idx:
next_idx = self.c.idx + self.c.lex.length
return self._string[self.c.idx:next_idx]
property string:
def __get__(self):
cdef int next_idx = (self.c + 1).idx
if next_idx < self.c.idx:
next_idx = self.c.idx + self.c.lex.length
return self._string[self.c.idx:next_idx]
@property
def prob(self):
return self.c.lex.prob
property prob:
def __get__(self):
return self.c.lex.prob
@property
def idx(self):
return self.c.idx
property idx:
def __get__(self):
return self.c.idx
@property
def cluster(self):
return self.c.lex.cluster
property cluster:
def __get__(self):
return self.c.lex.cluster
@property
def cluster(self):
return self.c.lex.cluster
property orth:
def __get__(self):
return self.c.lex.orth
@property
def orth(self):
return self.c.lex.orth
property lower:
def __get__(self):
return self.c.lex.lower
@property
def lower(self):
return self.c.lex.lower
property norm:
def __get__(self):
return self.c.lex.norm
@property
def norm(self):
return self.c.lex.norm
property shape:
def __get__(self):
return self.c.lex.shape
@property
def shape(self):
return self.c.lex.shape
property prefix:
def __get__(self):
return self.c.lex.prefix
@property
def prefix(self):
return self.c.lex.prefix
property suffix:
def __get__(self):
return self.c.lex.suffix
@property
def suffix(self):
return self.c.lex.suffix
property lemma:
def __get__(self):
return self.c.lemma
@property
def lemma(self):
return self.c.lemma
property pos:
def __get__(self):
return self.c.pos
@property
def pos(self):
return self.c.pos
property tag:
def __get__(self):
return self.c.tag
@property
def tag(self):
return self.c.tag
property dep:
def __get__(self):
return self.c.dep
@property
def dep(self):
return self.c.dep
property repvec:
def __get__(self):
return numpy.asarray(<float[:300,]> self.c.lex.repvec)
@property
def repvec(self):
return numpy.asarray(<float[:300,]> self.c.lex.repvec)
@property
def n_lefts(self):
cdef int n = 0
cdef const TokenC* ptr = self.c - self.i
while ptr != self.c:
if ptr + ptr.head == self.c:
n += 1
ptr += 1
return n
@property
def n_rights(self):
cdef int n = 0
cdef const TokenC* ptr = self.c + (self.array_len - self.i)
while ptr != self.c:
if ptr + ptr.head == self.c:
n += 1
ptr -= 1
return n
@property
def lefts(self):
"""The leftward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c - self.i
while ptr < self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
ptr += ptr.head
elif ptr + ptr.head == self.c:
yield Token.cinit(self.vocab, self._string,
ptr, ptr - (self.c - self.i), self.array_len,
self._py, self._tag_strings, self._dep_strings)
ptr += 1
else:
property n_lefts:
    def __get__(self):
        # Count the elements before this token whose head offset lands
        # exactly on this token. The scan runs from the start of the array
        # (self.c - self.i) up to, but not including, self.c.
        cdef int count = 0
        cdef const TokenC* cursor = self.c - self.i
        while cursor != self.c:
            if cursor + cursor.head == self.c:
                count += 1
            cursor += 1
        return count
@property
def rights(self):
"""The rightward immediate children of the word, in the syntactic
dependency parse."""
cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
while ptr > self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
ptr += ptr.head
elif ptr + ptr.head == self.c:
yield Token.cinit(self.vocab, self._string,
ptr, ptr - (self.c - self.i), self.array_len,
self._py, self._tag_strings, self._dep_strings)
ptr -= 1
else:
property n_rights:
    def __get__(self):
        # Count the elements after this token whose head offset lands
        # exactly on this token, scanning leftwards from the last element.
        cdef int n = 0
        # Start at the last element, index array_len - 1, the same starting
        # point the `rights` getter uses. Starting at
        # self.c + (array_len - self.i) would be one past the end, and the
        # first ptr.head read would be out of bounds.
        cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
        while ptr > self.c:
            if ptr + ptr.head == self.c:
                n += 1
            ptr -= 1
        return n
@property
def head(self):
"""The token predicted by the parser to be the head of the current token."""
return Token.cinit(self.vocab, self._string,
self.c + self.c.head, self.i + self.c.head, self.array_len,
self._py, self._tag_strings, self._dep_strings)
property lefts:
def __get__(self):
"""The leftward immediate children of the word, in the syntactic
dependency parse.
"""
cdef const TokenC* ptr = self.c - self.i
while ptr < self.c:
# If this head is still to the right of us, we can skip to it
# No token that's between this token and this head could be our
# child.
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
ptr += ptr.head
@property
def whitespace_(self):
return self.string[self.c.lex.length:]
elif ptr + ptr.head == self.c:
yield Token.cinit(self.vocab, self._string,
ptr, ptr - (self.c - self.i), self.array_len,
self._py, self._tag_strings, self._dep_strings)
ptr += 1
else:
ptr += 1
@property
def orth_(self):
return self.vocab.strings[self.c.lex.orth]
property rights:
def __get__(self):
"""The rightward immediate children of the word, in the syntactic
dependency parse.

This is a generator: it yields one Token per element to the right of
this token whose head offset points back at this token. Elements are
visited from the end of the array towards this token.
"""
# Begin at the last element of the array: (self.c - self.i) is the array
# start, and array_len - 1 is the final index.
cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
while ptr > self.c:
# If the head of the current element lies to its left but is still to the
# right of this token, jump straight to that head. No element between the
# current one and its head could be a child of this token.
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
ptr += ptr.head
elif ptr + ptr.head == self.c:
# The head offset lands on this token, so this is an immediate child.
# Its index is recovered as the distance from the array start.
yield Token.cinit(self.vocab, self._string,
ptr, ptr - (self.c - self.i), self.array_len,
self._py, self._tag_strings, self._dep_strings)
ptr -= 1
else:
# Not a child and no safe jump available: step one element left.
ptr -= 1
@property
def lower_(self):
return self.vocab.strings[self.c.lex.lower]
property head:
def __get__(self):
"""The token predicted by the parser to be the head of the current token."""
# self.c.head is used as a relative offset: it is added to both the C
# pointer and the index to locate the head.
return Token.cinit(self.vocab, self._string,
self.c + self.c.head, self.i + self.c.head, self.array_len,
self._py, self._tag_strings, self._dep_strings)
@property
def norm_(self):
return self.vocab.strings[self.c.lex.norm]
property whitespace_:
def __get__(self):
return self.string[self.c.lex.length:]
@property
def shape_(self):
return self.vocab.strings[self.c.lex.shape]
property orth_:
def __get__(self):
return self.vocab.strings[self.c.lex.orth]
@property
def prefix_(self):
return self.vocab.strings[self.c.lex.prefix]
property lower_:
def __get__(self):
return self.vocab.strings[self.c.lex.lower]
@property
def suffix_(self):
return self.vocab.strings[self.c.lex.suffix]
property norm_:
def __get__(self):
return self.vocab.strings[self.c.lex.norm]
@property
def lemma_(self):
return self.vocab.strings[self.c.lemma]
property shape_:
def __get__(self):
return self.vocab.strings[self.c.lex.shape]
@property
def pos_(self):
return _pos_id_to_string[self.c.pos]
property prefix_:
def __get__(self):
return self.vocab.strings[self.c.lex.prefix]
@property
def tag_(self):
return self._tag_strings[self.c.tag]
property suffix_:
def __get__(self):
return self.vocab.strings[self.c.lex.suffix]
@property
def dep_(self):
return self._dep_strings[self.c.dep]
property lemma_:
def __get__(self):
return self.vocab.strings[self.c.lemma]
property pos_:
def __get__(self):
return _pos_id_to_string[self.c.pos]
property tag_:
def __get__(self):
return self._tag_strings[self.c.tag]
property dep_:
def __get__(self):
return self._dep_strings[self.c.dep]
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}

View File

@ -0,0 +1,24 @@
from __future__ import unicode_literals
import pytest
import gc
from spacy.en import English
def get_orphan_token(text, i):
    """Return token `i` of `text` after its parent container is released.

    The container is dropped before returning, so the returned token is
    the only remaining handle on the parsed data.
    """
    pipeline = English()
    doc = pipeline(text)
    # Collect before indexing so earlier garbage is gone by the time the
    # token object is created.
    gc.collect()
    orphan = doc[i]
    del doc
    return orphan
def test_orphan():
    """A token must stay readable after its container has been collected."""
    survivor = get_orphan_token('An orphan token', 1)
    gc.collect()
    # Churn memory with two more parses so a dangling pointer in `survivor`
    # would be likely to read overwritten data.
    filler = get_orphan_token('Load and flush the memory', 0)
    filler = get_orphan_token('Load again...', 0)
    assert survivor.orth_ == 'orphan'
    assert survivor.pos_ == 'ADJ'
    assert survivor.head.orth_ == 'token'