From 69bb0222041f8d43febc7648c04b903804a6b299 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal@gmail.com>
Date: Thu, 4 Dec 2014 20:46:55 +1100
Subject: [PATCH] * Update get_array and add count_by method

---
 spacy/tokens.pxd |  5 ++---
 spacy/tokens.pyx | 39 ++++++++++++++++++++++++---------------
 2 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index f91aa16ba..90356b74e 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -2,13 +2,12 @@ import numpy as np
 cimport numpy as np
 
 from cymem.cymem cimport Pool
+from thinc.typedefs cimport atom_t
 
 from .lexeme cimport Lexeme
 from .typedefs cimport flags_t
 from .utf8string cimport StringStore
 
-from thinc.typedefs cimport atom_t
-
 
 cdef class Tokens:
     cdef Pool mem
@@ -30,7 +29,7 @@ cdef class Tokens:
     cdef int push_back(self, int i, const Lexeme* lexeme) except -1
     cpdef int set_tag(self, int i, int tag_type, int tag) except -1
 
-    cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features)
+    cpdef np.ndarray[long, ndim=2] get_array(self, list features)
 
 
 cdef class Token:
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 7f79dcda9..7fdfa8e1e 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -1,7 +1,13 @@
 # cython: profile=True
+from preshed.maps cimport PreshMap
+from preshed.counter cimport PreshCounter
+
 from .lexeme cimport *
 cimport cython
 
+import numpy as np
+cimport numpy as np
+
 POS = 0
 ENTITY = 0
 
@@ -19,20 +25,10 @@ cdef class Tokens:
     """A sequence of references to Lexeme objects.
 
     The Tokens class provides fast and memory-efficient access to lexical features,
-    and can efficiently export the data to a numpy array.  Specific languages
-    create their own Tokens subclasses, to provide more convenient access to
-    language-specific features.
+    and can efficiently export the data to a numpy array.
 
     >>> from spacy.en import EN
     >>> tokens = EN.tokenize('An example sentence.')
-    >>> tokens.string(0)
-    'An'
-    >>> tokens.prob(0) > tokens.prob(1)
-    True
-    >>> tokens.can_noun(0)
-    False
-    >>> tokens.can_noun(1)
-    True
     """
     def __init__(self, StringStore string_store, string_length=0):
         self._string_store = string_store
@@ -104,15 +100,28 @@ cdef class Tokens:
         elif tag_type == ENTITY:
             self.ner[i] = tag
 
-    cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features):
+    @cython.boundscheck(False)
+    cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
         cdef int i, j
-        cdef np.ndarray[atom_t, ndim=2] output
-        output = np.ndarray(shape=(self.length, len(features)), dtype=int)
+        cdef attr_id_t feature
+        cdef np.ndarray[long, ndim=2] output
+        output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
         for i in range(self.length):
-            for j, feature in enumerate(features):
+            for j, feature in enumerate(attr_ids):
                 output[i, j] = get_attr(self.lex[i], feature)
         return output
 
+    def count_by(self, attr_id_t attr_id):
+        cdef int i
+        cdef attr_t attr
+        cdef size_t count
+
+        cdef PreshCounter counts = PreshCounter(2 ** 8)
+        for i in range(self.length):
+            attr = get_attr(self.lex[i], attr_id)
+            counts.inc(attr, 1)
+        return dict(counts)
+
     def _realloc(self, new_size):
         self.max_length = new_size
         n = new_size + (PADDING * 2)