mirror of https://github.com/explosion/spaCy.git
Add Span.to_array method
This commit is contained in:
parent
19c495f451
commit
482bba1722
|
@ -15,5 +15,5 @@ cdef class Span:
|
|||
cdef public _vector
|
||||
cdef public _vector_norm
|
||||
|
||||
|
||||
cpdef int _recalculate_indices(self) except -1
|
||||
cpdef np.ndarray to_array(self, object features)
|
||||
|
|
|
@ -7,7 +7,7 @@ import numpy
|
|||
import numpy.linalg
|
||||
from libc.math cimport sqrt
|
||||
|
||||
from .doc cimport token_by_start, token_by_end
|
||||
from .doc cimport token_by_start, token_by_end, get_token_attr
|
||||
from ..structs cimport TokenC, LexemeC
|
||||
from ..typedefs cimport flags_t, attr_t, hash_t
|
||||
from ..attrs cimport attr_id_t
|
||||
|
@ -135,6 +135,28 @@ cdef class Span:
|
|||
return 0.0
|
||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||
The values will be 32-bit integers.
|
||||
|
||||
attr_ids (list[int]): A list of attribute ID ints.
|
||||
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
||||
per word, and one column per attribute indicated in the input
|
||||
`attr_ids`.
|
||||
"""
|
||||
cdef int i, j
|
||||
cdef attr_id_t feature
|
||||
cdef np.ndarray[attr_t, ndim=2] output
|
||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||
# dict iteration.
|
||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
|
||||
for i in range(self.start, self.end):
|
||||
for j, feature in enumerate(attr_ids):
|
||||
output[i, j] = get_token_attr(&self.doc.c[i], feature)
|
||||
return output
|
||||
|
||||
cpdef int _recalculate_indices(self) except -1:
|
||||
if self.end > self.doc.length \
|
||||
or self.doc.c[self.start].idx != self.start_char \
|
||||
|
|
Loading…
Reference in New Issue