Support strings for attribute list in doc.to_array

This commit is contained in:
Ramanan Balakrishnan 2017-10-19 19:37:14 +05:30
parent 7b9b1be44c
commit b3ab124fc5
No known key found for this signature in database
GPG Key ID: 57283041B6B6D1D1
2 changed files with 39 additions and 8 deletions

View File

@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab):
assert feats_array[0][0] != feats_array[0][1]
def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab):
    """Check that `Doc.to_array` gives the same values for string attribute
    names as for the corresponding integer attribute IDs."""
    doc = en_tokenizer("An example sentence")
    lexeme = doc.vocab["example"]
    # The ORTH and SHAPE values of this lexeme are expected to differ.
    assert lexeme.orth != lexeme.shape
    by_id = doc.to_array((ORTH, SHAPE))
    by_name = doc.to_array(("ORTH", "SHAPE"))
    # Compare the first token's row, one attribute column at a time.
    for column in range(2):
        assert by_name[0][column] == by_id[0][column]
def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):
    """Check that passing a single attribute ID (not wrapped in a sequence)
    to `Doc.to_array` produces a one-dimensional array."""
    doc = en_tokenizer("An example sentence")
    lexeme = doc.vocab["example"]
    # The ORTH and SHAPE values of this lexeme are expected to differ.
    assert lexeme.orth != lexeme.shape
    orth_values = doc.to_array(ORTH)
    # One entry per token: the text is expected to tokenize into three tokens.
    assert orth_values.shape == (3,)
def test_doc_array_tag(en_tokenizer):
text = "A nice sentence."
pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']

View File

@ -21,7 +21,7 @@ from .token cimport Token
from .printers import parse_tree
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..typedefs cimport attr_t, flags_t
from ..attrs import intify_attrs
from ..attrs import intify_attrs, IDS
from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@ -536,11 +536,15 @@ cdef class Doc:
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
The values will be 32-bit integers.
"""Export given token attributes to a numpy `ndarray`.
attr_ids (list[int]): A list of attribute ID ints.
If `attr_ids` is a sequence of M attributes, the output array will
be of shape `(N, M)`, where N is the length of the `Doc`
(in tokens). If `attr_ids` is a single attribute, the output shape will
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
or string name (e.g. 'LEMMA' or 'lemma').
attr_ids (list[]): A list of attributes (int IDs or string names).
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input
`attr_ids`.
@ -555,11 +559,18 @@ cdef class Doc:
cdef attr_id_t feature
cdef np.ndarray[attr_t, ndim=2] output
cdef np.ndarray[attr_t, ndim=1] output_1D
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
# Handle scalar/list inputs of strings/ints for py_attr_ids
if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ):
py_attr_ids = [ py_attr_ids ]
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
py_attr_ids_input = []
for py_attr_id in py_attr_ids:
if( type(py_attr_id) is int ):
py_attr_ids_input.append(py_attr_id)
else:
py_attr_ids_input.append(IDS[py_attr_id.upper()])
# Make an array from the attributes --- otherwise our inner loop is Python
# dict iteration.
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.uint64)
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
for i in range(self.length):
for j, feature in enumerate(attr_ids):