mirror of https://github.com/explosion/spaCy.git
* Pass ownership of C data to Token instances if Tokens object is being garbage-collected, but Token instances are staying alive.
This commit is contained in:
parent
db3f26a51b
commit
7572e31f5e
|
@ -58,6 +58,7 @@ cdef class Token:
|
||||||
cdef const TokenC* c
|
cdef const TokenC* c
|
||||||
cdef readonly int i
|
cdef readonly int i
|
||||||
cdef int array_len
|
cdef int array_len
|
||||||
|
cdef bint _owns_c_data
|
||||||
|
|
||||||
|
|
||||||
cdef list _py
|
cdef list _py
|
||||||
|
@ -86,3 +87,5 @@ cdef class Token:
|
||||||
self._dep_strings = dep_strings
|
self._dep_strings = dep_strings
|
||||||
py_tokens[offset] = self
|
py_tokens[offset] = self
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
cdef int take_ownership_of_c_data(self) except -1
|
||||||
|
|
328
spacy/tokens.pyx
328
spacy/tokens.pyx
|
@ -17,6 +17,9 @@ import numpy
|
||||||
|
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
|
from cpython.mem cimport PyMem_Malloc, PyMem_Free
|
||||||
|
from libc.string cimport memcpy
|
||||||
|
import sys
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
|
||||||
|
@ -92,6 +95,21 @@ cdef class Tokens:
|
||||||
self._tag_strings = tuple() # These will be set by the POS tagger and parser
|
self._tag_strings = tuple() # These will be set by the POS tagger and parser
|
||||||
self._dep_strings = tuple() # The strings are arbitrary and model-specific.
|
self._dep_strings = tuple() # The strings are arbitrary and model-specific.
|
||||||
|
|
||||||
|
def __dealloc__(self):
    # Token objects start out with a borrowed view of this object's C
    # data; they do not own it. Any Token that is going to outlive this
    # instance therefore has to be handed its own copy before we go away.
    cdef Token tok
    if self._py_tokens is None:
        return
    for tok in self._py_tokens:
        if tok is None:
            continue
        # Reference accounting: one reference is held by the _py_tokens
        # list and one by the loop variable. A count of 3 or more means
        # somebody else still holds the token, so it will survive us and
        # must own its data.
        if sys.getrefcount(tok) >= 3:
            tok.take_ownership_of_c_data()
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Retrieve a token.
|
"""Retrieve a token.
|
||||||
|
|
||||||
|
@ -139,8 +157,6 @@ cdef class Tokens:
|
||||||
self._py_tokens.append(None)
|
self._py_tokens.append(None)
|
||||||
return idx + t.lex.length
|
return idx + t.lex.length
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
cpdef long[:,:] to_array(self, object py_attr_ids):
|
cpdef long[:,:] to_array(self, object py_attr_ids):
|
||||||
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
|
"""Given a list of M attribute IDs, export the tokens to a numpy ndarray
|
||||||
|
@ -234,196 +250,208 @@ cdef class Tokens:
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token."""
|
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||||
|
via Tokens.__getitem__ and Tokens.__iter__.
|
||||||
|
"""
|
||||||
def __cinit__(self, Vocab vocab, unicode string):
|
def __cinit__(self, Vocab vocab, unicode string):
|
||||||
self.vocab = vocab
|
self.vocab = vocab
|
||||||
self._string = string
|
self._string = string
|
||||||
|
|
||||||
|
def __dealloc__(self):
    # A Token normally only borrows its TokenC data, in which case there is
    # nothing to release here. After take_ownership_of_c_data() has run,
    # self.c points at element self.i of a private copy, so the start of
    # that allocation is self.c - self.i.
    if self._owns_c_data:
        # Cast through const, if we own the data
        PyMem_Free(<void*>(self.c - self.i))

def __len__(self):
    return self.c.lex.length

def __unicode__(self):
    return self.string

cdef int take_ownership_of_c_data(self) except -1:
    """Replace the borrowed view of the TokenC array with a private copy.

    self.c points at this token's own entry, i.e. element self.i of the
    array; the rest of the class (n_lefts, lefts, rights, head) addresses
    neighbours relative to it and treats self.c - self.i as the first
    element. The copy therefore has to start at self.c - self.i, and self.c
    has to keep pointing at element self.i of the new array, otherwise those
    relative accesses would fall outside the allocation.

    Returns 0 on success. Raises MemoryError if the copy cannot be
    allocated. Calling it again once the data is owned is a no-op.
    """
    cdef const TokenC* array_start
    cdef TokenC* owned_data
    if self._owns_c_data:
        # Already holding a private copy; copying again would leak it.
        return 0
    array_start = self.c - self.i
    owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
    if owned_data is NULL:
        raise MemoryError()
    memcpy(owned_data, array_start, sizeof(TokenC) * self.array_len)
    self.c = owned_data + self.i
    self._owns_c_data = True
    return 0
|
||||||
|
|
||||||
def nbor(self, int i=1):
    """Return the token `i` positions away from this one.

    `i` defaults to 1 (the following token); negative values address
    tokens to the left. Raises IndexError if the requested position lies
    outside the token array.
    """
    cdef int target = self.i + i
    if target < 0 or target >= self.array_len:
        raise IndexError(target)
    # Offset both the C pointer and the index by i, the same way the `head`
    # property offsets them by self.c.head.
    return Token.cinit(self.vocab, self._string,
                       self.c + i, target, self.array_len,
                       self._py, self._tag_strings, self._dep_strings)
|
||||||
|
|
||||||
@property
|
property string:
|
||||||
def string(self):
|
def __get__(self):
|
||||||
cdef int next_idx = (self.c + 1).idx
|
cdef int next_idx = (self.c + 1).idx
|
||||||
if next_idx < self.c.idx:
|
if next_idx < self.c.idx:
|
||||||
next_idx = self.c.idx + self.c.lex.length
|
next_idx = self.c.idx + self.c.lex.length
|
||||||
return self._string[self.c.idx:next_idx]
|
return self._string[self.c.idx:next_idx]
|
||||||
|
|
||||||
@property
|
property prob:
|
||||||
def prob(self):
|
def __get__(self):
|
||||||
return self.c.lex.prob
|
return self.c.lex.prob
|
||||||
|
|
||||||
@property
|
property idx:
|
||||||
def idx(self):
|
def __get__(self):
|
||||||
return self.c.idx
|
return self.c.idx
|
||||||
|
|
||||||
@property
|
property cluster:
|
||||||
def cluster(self):
|
def __get__(self):
|
||||||
return self.c.lex.cluster
|
return self.c.lex.cluster
|
||||||
|
|
||||||
@property
|
property orth:
|
||||||
def cluster(self):
|
def __get__(self):
|
||||||
return self.c.lex.cluster
|
return self.c.lex.orth
|
||||||
|
|
||||||
@property
|
property lower:
|
||||||
def orth(self):
|
def __get__(self):
|
||||||
return self.c.lex.orth
|
return self.c.lex.lower
|
||||||
|
|
||||||
@property
|
property norm:
|
||||||
def lower(self):
|
def __get__(self):
|
||||||
return self.c.lex.lower
|
return self.c.lex.norm
|
||||||
|
|
||||||
@property
|
property shape:
|
||||||
def norm(self):
|
def __get__(self):
|
||||||
return self.c.lex.norm
|
return self.c.lex.shape
|
||||||
|
|
||||||
@property
|
property prefix:
|
||||||
def shape(self):
|
def __get__(self):
|
||||||
return self.c.lex.shape
|
return self.c.lex.prefix
|
||||||
|
|
||||||
@property
|
property suffix:
|
||||||
def prefix(self):
|
def __get__(self):
|
||||||
return self.c.lex.prefix
|
return self.c.lex.suffix
|
||||||
|
|
||||||
@property
|
property lemma:
|
||||||
def suffix(self):
|
def __get__(self):
|
||||||
return self.c.lex.suffix
|
return self.c.lemma
|
||||||
|
|
||||||
@property
|
property pos:
|
||||||
def lemma(self):
|
def __get__(self):
|
||||||
return self.c.lemma
|
return self.c.pos
|
||||||
|
|
||||||
@property
|
property tag:
|
||||||
def pos(self):
|
def __get__(self):
|
||||||
return self.c.pos
|
return self.c.tag
|
||||||
|
|
||||||
@property
|
property dep:
|
||||||
def tag(self):
|
def __get__(self):
|
||||||
return self.c.tag
|
return self.c.dep
|
||||||
|
|
||||||
@property
|
property repvec:
|
||||||
def dep(self):
|
def __get__(self):
|
||||||
return self.c.dep
|
return numpy.asarray(<float[:300,]> self.c.lex.repvec)
|
||||||
|
|
||||||
@property
|
property n_lefts:
|
||||||
def repvec(self):
|
def __get__(self):
|
||||||
return numpy.asarray(<float[:300,]> self.c.lex.repvec)
|
cdef int n = 0
|
||||||
|
cdef const TokenC* ptr = self.c - self.i
|
||||||
@property
|
while ptr != self.c:
|
||||||
def n_lefts(self):
|
if ptr + ptr.head == self.c:
|
||||||
cdef int n = 0
|
n += 1
|
||||||
cdef const TokenC* ptr = self.c - self.i
|
|
||||||
while ptr != self.c:
|
|
||||||
if ptr + ptr.head == self.c:
|
|
||||||
n += 1
|
|
||||||
ptr += 1
|
|
||||||
return n
|
|
||||||
|
|
||||||
@property
|
|
||||||
def n_rights(self):
|
|
||||||
cdef int n = 0
|
|
||||||
cdef const TokenC* ptr = self.c + (self.array_len - self.i)
|
|
||||||
while ptr != self.c:
|
|
||||||
if ptr + ptr.head == self.c:
|
|
||||||
n += 1
|
|
||||||
ptr -= 1
|
|
||||||
return n
|
|
||||||
|
|
||||||
@property
|
|
||||||
def lefts(self):
|
|
||||||
"""The leftward immediate children of the word, in the syntactic
|
|
||||||
dependency parse.
|
|
||||||
"""
|
|
||||||
cdef const TokenC* ptr = self.c - self.i
|
|
||||||
while ptr < self.c:
|
|
||||||
# If this head is still to the right of us, we can skip to it
|
|
||||||
# No token that's between this token and this head could be our
|
|
||||||
# child.
|
|
||||||
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
|
|
||||||
ptr += ptr.head
|
|
||||||
|
|
||||||
elif ptr + ptr.head == self.c:
|
|
||||||
yield Token.cinit(self.vocab, self._string,
|
|
||||||
ptr, ptr - (self.c - self.i), self.array_len,
|
|
||||||
self._py, self._tag_strings, self._dep_strings)
|
|
||||||
ptr += 1
|
|
||||||
else:
|
|
||||||
ptr += 1
|
ptr += 1
|
||||||
|
return n
|
||||||
|
|
||||||
@property
|
property n_rights:
|
||||||
def rights(self):
|
def __get__(self):
|
||||||
"""The rightward immediate children of the word, in the syntactic
|
cdef int n = 0
|
||||||
dependency parse."""
|
cdef const TokenC* ptr = self.c + (self.array_len - self.i)
|
||||||
cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
|
while ptr != self.c:
|
||||||
while ptr > self.c:
|
if ptr + ptr.head == self.c:
|
||||||
# If this head is still to the right of us, we can skip to it
|
n += 1
|
||||||
# No token that's between this token and this head could be our
|
|
||||||
# child.
|
|
||||||
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
|
|
||||||
ptr += ptr.head
|
|
||||||
elif ptr + ptr.head == self.c:
|
|
||||||
yield Token.cinit(self.vocab, self._string,
|
|
||||||
ptr, ptr - (self.c - self.i), self.array_len,
|
|
||||||
self._py, self._tag_strings, self._dep_strings)
|
|
||||||
ptr -= 1
|
|
||||||
else:
|
|
||||||
ptr -= 1
|
ptr -= 1
|
||||||
|
return n
|
||||||
|
|
||||||
@property
|
property lefts:
|
||||||
def head(self):
|
def __get__(self):
|
||||||
"""The token predicted by the parser to be the head of the current token."""
|
"""The leftward immediate children of the word, in the syntactic
|
||||||
return Token.cinit(self.vocab, self._string,
|
dependency parse.
|
||||||
self.c + self.c.head, self.i + self.c.head, self.array_len,
|
"""
|
||||||
self._py, self._tag_strings, self._dep_strings)
|
cdef const TokenC* ptr = self.c - self.i
|
||||||
|
while ptr < self.c:
|
||||||
|
# If this head is still to the right of us, we can skip to it
|
||||||
|
# No token that's between this token and this head could be our
|
||||||
|
# child.
|
||||||
|
if (ptr.head >= 1) and (ptr + ptr.head) < self.c:
|
||||||
|
ptr += ptr.head
|
||||||
|
|
||||||
@property
|
elif ptr + ptr.head == self.c:
|
||||||
def whitespace_(self):
|
yield Token.cinit(self.vocab, self._string,
|
||||||
return self.string[self.c.lex.length:]
|
ptr, ptr - (self.c - self.i), self.array_len,
|
||||||
|
self._py, self._tag_strings, self._dep_strings)
|
||||||
|
ptr += 1
|
||||||
|
else:
|
||||||
|
ptr += 1
|
||||||
|
|
||||||
@property
|
property rights:
|
||||||
def orth_(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.orth]
|
"""The rightward immediate children of the word, in the syntactic
|
||||||
|
dependency parse."""
|
||||||
|
cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1)
|
||||||
|
while ptr > self.c:
|
||||||
|
# If this head is still to the right of us, we can skip to it
|
||||||
|
# No token that's between this token and this head could be our
|
||||||
|
# child.
|
||||||
|
if (ptr.head < 0) and ((ptr + ptr.head) > self.c):
|
||||||
|
ptr += ptr.head
|
||||||
|
elif ptr + ptr.head == self.c:
|
||||||
|
yield Token.cinit(self.vocab, self._string,
|
||||||
|
ptr, ptr - (self.c - self.i), self.array_len,
|
||||||
|
self._py, self._tag_strings, self._dep_strings)
|
||||||
|
ptr -= 1
|
||||||
|
else:
|
||||||
|
ptr -= 1
|
||||||
|
|
||||||
@property
|
property head:
|
||||||
def lower_(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.lower]
|
"""The token predicted by the parser to be the head of the current token."""
|
||||||
|
return Token.cinit(self.vocab, self._string,
|
||||||
|
self.c + self.c.head, self.i + self.c.head, self.array_len,
|
||||||
|
self._py, self._tag_strings, self._dep_strings)
|
||||||
|
|
||||||
@property
|
property whitespace_:
|
||||||
def norm_(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.norm]
|
return self.string[self.c.lex.length:]
|
||||||
|
|
||||||
@property
|
property orth_:
|
||||||
def shape_(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.shape]
|
return self.vocab.strings[self.c.lex.orth]
|
||||||
|
|
||||||
@property
|
property lower_:
|
||||||
def prefix_(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.prefix]
|
return self.vocab.strings[self.c.lex.lower]
|
||||||
|
|
||||||
@property
|
property norm_:
|
||||||
def suffix_(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lex.suffix]
|
return self.vocab.strings[self.c.lex.norm]
|
||||||
|
|
||||||
@property
|
property shape_:
|
||||||
def lemma_(self):
|
def __get__(self):
|
||||||
return self.vocab.strings[self.c.lemma]
|
return self.vocab.strings[self.c.lex.shape]
|
||||||
|
|
||||||
@property
|
property prefix_:
|
||||||
def pos_(self):
|
def __get__(self):
|
||||||
return _pos_id_to_string[self.c.pos]
|
return self.vocab.strings[self.c.lex.prefix]
|
||||||
|
|
||||||
@property
|
property suffix_:
|
||||||
def tag_(self):
|
def __get__(self):
|
||||||
return self._tag_strings[self.c.tag]
|
return self.vocab.strings[self.c.lex.suffix]
|
||||||
|
|
||||||
@property
|
property lemma_:
|
||||||
def dep_(self):
|
def __get__(self):
|
||||||
return self._dep_strings[self.c.dep]
|
return self.vocab.strings[self.c.lemma]
|
||||||
|
|
||||||
|
property pos_:
|
||||||
|
def __get__(self):
|
||||||
|
return _pos_id_to_string[self.c.pos]
|
||||||
|
|
||||||
|
property tag_:
|
||||||
|
def __get__(self):
|
||||||
|
return self._tag_strings[self.c.tag]
|
||||||
|
|
||||||
|
property dep_:
|
||||||
|
def __get__(self):
|
||||||
|
return self._dep_strings[self.c.dep]
|
||||||
|
|
||||||
|
|
||||||
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
|
_pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()}
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import pytest
|
||||||
|
import gc
|
||||||
|
|
||||||
|
from spacy.en import English
|
||||||
|
|
||||||
|
|
||||||
|
def get_orphan_token(text, i):
    """Parse `text` and return token `i` after dropping the parent container.

    The reference this function holds to the parsed token sequence is
    deleted before returning, so the caller receives a token without
    holding the sequence it was taken from.
    """
    pipeline = English()
    parsed = pipeline(text)
    gc.collect()
    orphan = parsed[i]
    # Drop our reference to the container while the token is still held.
    del parsed
    return orphan
|
||||||
|
|
||||||
|
|
||||||
|
def test_orphan():
    """A token must stay readable after its parent container is gone."""
    survivor = get_orphan_token('An orphan token', 1)
    gc.collect()
    # Run two more parses and rebind the same name, so the first result is
    # released again before the assertions run.
    filler = get_orphan_token('Load and flush the memory', 0)
    filler = get_orphan_token('Load again...', 0)
    assert survivor.orth_ == 'orphan'
    assert survivor.pos_ == 'ADJ'
    assert survivor.head.orth_ == 'token'
|
Loading…
Reference in New Issue