Support strings for attribute list in doc.to_array

2017-10-19 19:37:14 +05:30 · 2017-10-19 19:37:14 +05:30 · b3ab124fc5
parent 7b9b1be44c
commit b3ab124fc5
2 changed files with 39 additions and 8 deletions
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab):
    assert feats_array[0][0] != feats_array[0][1]


+def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab):
+    """String attribute names must export the same array as integer IDs.
+
+    `Doc.to_array` upper-cases string names before looking them up, so
+    'ORTH' and 'orth' both have to resolve to the ORTH attribute, and
+    integer IDs and string names may be mixed in one sequence.
+    """
+    text = "An example sentence"
+    tokens = en_tokenizer(text)
+    example = tokens.vocab["example"]
+    # Precondition: the two attribute values differ for this lexeme, so the
+    # exported columns are distinguishable from each other.
+    assert example.orth != example.shape
+    feats_array = tokens.to_array((ORTH, SHAPE))
+    # Upper-case names, lower-case names, and a mixed int/string sequence.
+    for attrs in (("ORTH", "SHAPE"), ("orth", "shape"), (ORTH, "SHAPE")):
+        feats_array_stringy = tokens.to_array(attrs)
+        assert feats_array_stringy.shape == feats_array.shape
+        # Compare every token row, not only the first one.
+        assert (feats_array_stringy == feats_array).all()
+
+
+def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):
+    """A single attribute ID (not wrapped in a sequence) exports a 1-D array.
+
+    The three-token sentence used here must yield an array of shape (3,).
+    """
+    doc = en_tokenizer("An example sentence")
+    lexeme = doc.vocab["example"]
+    assert lexeme.orth != lexeme.shape
+    # Pass the bare attribute ID rather than a list or tuple of IDs.
+    orth_values = doc.to_array(ORTH)
+    assert orth_values.shape == (3,)
+
+
 def test_doc_array_tag(en_tokenizer):
    text = "A nice sentence."
    pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -21,7 +21,7 @@ from .token cimport Token
 from .printers import parse_tree
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
-from ..attrs import intify_attrs
+from ..attrs import intify_attrs, IDS
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@ -536,11 +536,15 @@ cdef class Doc:

    @cython.boundscheck(False)
    cpdef np.ndarray to_array(self, object py_attr_ids):
-        """Given a list of M attribute IDs, export the tokens to a numpy
-        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
-        The values will be 32-bit integers.
+        """Export given token attributes to a numpy `ndarray`.

-        attr_ids (list[int]): A list of attribute ID ints.
+	If `attr_ids` is a sequence of M attributes, the output array will
+	be of shape `(N, M)`, where N is the length of the `Doc`
+	(in tokens). If `attr_ids` is a single attribute, the output shape will
+	be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
+	or string name (e.g. 'LEMMA' or 'lemma').
+
+        attr_ids (list[]): A list of attributes (int IDs or string names).
        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
            per word, and one column per attribute indicated in the input
            `attr_ids`.
@ -555,11 +559,18 @@ cdef class Doc:
        cdef attr_id_t feature
        cdef np.ndarray[attr_t, ndim=2] output
        cdef np.ndarray[attr_t, ndim=1] output_1D
-        # Make an array from the attributes --- otherwise our inner loop is Python
-        # dict iteration.
+        # Handle scalar/list inputs of strings/ints for py_attr_ids
        if( type(py_attr_ids) is not list and type(py_attr_ids) is not tuple ):
            py_attr_ids = [ py_attr_ids ]
-        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        py_attr_ids_input = []
+        for py_attr_id in py_attr_ids:
+            if( type(py_attr_id) is int ):
+                py_attr_ids_input.append(py_attr_id)
+            else:
+                py_attr_ids_input.append(IDS[py_attr_id.upper()])
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids_input, dtype=numpy.uint64)
        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
        for i in range(self.length):
            for j, feature in enumerate(attr_ids):