mirror of https://github.com/explosion/spaCy.git
Support strings for attribute list in doc.to_array
This commit is contained in:
parent
7b9b1be44c
commit
b3ab124fc5
|
@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab):
|
|||
assert feats_array[0][0] != feats_array[0][1]
|
||||
|
||||
|
||||
def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab):
|
||||
text = "An example sentence"
|
||||
tokens = en_tokenizer(text)
|
||||
example = tokens.vocab["example"]
|
||||
assert example.orth != example.shape
|
||||
feats_array = tokens.to_array((ORTH, SHAPE))
|
||||
feats_array_stringy = tokens.to_array(("ORTH", "SHAPE"))
|
||||
assert feats_array_stringy[0][0] == feats_array[0][0]
|
||||
assert feats_array_stringy[0][1] == feats_array[0][1]
|
||||
|
||||
|
||||
def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):
|
||||
text = "An example sentence"
|
||||
tokens = en_tokenizer(text)
|
||||
example = tokens.vocab["example"]
|
||||
assert example.orth != example.shape
|
||||
feats_array = tokens.to_array(ORTH)
|
||||
assert feats_array.shape == (3,)
|
||||
|
||||
|
||||
def test_doc_array_tag(en_tokenizer):
|
||||
text = "A nice sentence."
|
||||
pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']
|
||||
|
|
|
@ -21,7 +21,7 @@ from .token cimport Token
|
|||
from .printers import parse_tree
|
||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||
from ..typedefs cimport attr_t, flags_t
|
||||
from ..attrs import intify_attrs
|
||||
from ..attrs import intify_attrs, IDS
|
||||
from ..attrs cimport attr_id_t
|
||||
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
|
||||
from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
|
||||
|
@ -536,11 +536,15 @@ cdef class Doc:
|
|||
|
||||
@cython.boundscheck(False)
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||
The values will be 32-bit integers.
|
||||
"""Export given token attributes to a numpy `ndarray`.
|
||||
|
||||
attr_ids (list[int]): A list of attribute ID ints.
|
||||
If `attr_ids` is a sequence of M attributes, the output array will
|
||||
be of shape `(N, M)`, where N is the length of the `Doc`
|
||||
(in tokens). If `attr_ids` is a single attribute, the output shape will
|
||||
be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
|
||||
or string name (e.g. 'LEMMA' or 'lemma').
|
||||
|
||||
attr_ids (list[]): A list of attributes (int IDs or string names).
|
||||
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
||||
per word, and one column per attribute indicated in the input
|
||||
`attr_ids`.
|
||||
|
@ -555,11 +559,18 @@ cdef class Doc:
|
|||
cdef attr_id_t feature
|
||||
cdef np.ndarray[attr_t, ndim=2] output
|
||||
cdef np.ndarray[attr_t, ndim=1] output_1D
|
||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||
# dict iteration.
|
||||
# Handle scalar/list inputs of strings/ints for py_attr_ids
|
||||
if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ):
|
||||
py_attr_ids = [ py_attr_ids ]
|
||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
|
||||
py_attr_ids_input = []
|
||||
for py_attr_id in py_attr_ids:
|
||||
if( type(py_attr_id) is int ):
|
||||
py_attr_ids_input.append(py_attr_id)
|
||||
else:
|
||||
py_attr_ids_input.append(IDS[py_attr_id.upper()])
|
||||
# Make an array from the attributes --- otherwise our inner loop is Python
|
||||
# dict iteration.
|
||||
cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.uint64)
|
||||
output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
|
||||
for i in range(self.length):
|
||||
for j, feature in enumerate(attr_ids):
|
||||
|
|
Loading…
Reference in New Issue