* Improve array features in tokens

2014-10-22 12:55:42 +11:00 · 2014-10-22 12:55:42 +11:00 · 7018b53d3a
parent 43d5964e13
commit 7018b53d3a
2 changed files with 53 additions and 0 deletions
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@ -1,5 +1,6 @@
 from spacy.lexeme cimport LexemeC
 from libcpp.vector cimport vector
+from thinc.typedefs cimport atom_t


 cdef class Tokens:
@ -9,6 +10,12 @@ cdef class Tokens:

    cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
    cdef int push_back(self, int i, LexemeC* lexeme) except -1
+    cdef int int_array(self, atom_t* atoms, int i, int* indices, int n_idx,
+                       int* features, int n_feat) except -1
+    cdef int string_array(self, atom_t* atoms, int i, int* indices, int n_idx,
+                          int* features, int n_feat) except -1
+    cdef int bool_array(self, atom_t* atoms, int i, int* indices, int n_idx,
+                        int* features, int n_feat) except -1

    cpdef int id(self, size_t i) except -1
    cpdef float prob(self, size_t i) except 1
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -2,6 +2,9 @@
 from .word cimport Lexeme

 from .lexeme cimport *
+cimport numpy
+cimport cython
+import numpy


 cdef class Tokens:
@ -38,6 +41,8 @@ cdef class Tokens:
        del self.pos

    def __getitem__(self, i):  # Lexeme wrapper for the i-th entry of self.lex
+        if i >= self.lex.size():  # only the upper bound is checked here
+            raise IndexError
        return Lexeme(<size_t>self.lex.at(i))  # entry is cast to size_t for Lexeme

    def __len__(self):
@ -48,6 +53,45 @@ cdef class Tokens:
        self.idx.push_back(idx)
        return idx + lexeme.ints[<int>LexInt_length]

+    cdef int int_array(self, atom_t* output, int i, int* indices, int n_idx,
+                       int* features, int n_feat) except -1:
+        # Copy integer lexeme attributes into ``output``, starting at slot i.
+        #
+        # For each of the n_feat ids in ``features`` (outer loop) and each of
+        # the n_idx token positions in ``indices`` (inner loop), one slot is
+        # written: the token's ``ints[feat_id]`` value, or 0 when the position
+        # lies outside [0, number of tokens). The caller must supply room for
+        # n_feat * n_idx slots from i onwards; nothing here checks that.
+        #
+        # Returns the slot index following the last one written. ``except -1``
+        # mirrors the declaration in tokens.pxd, as extend() already does.
+        cdef int feat_id, idx
+        cdef int length = self.lex.size()
+        for feat_id in features[:n_feat]:
+            for idx in indices[:n_idx]:
+                if idx < 0 or idx >= length:
+                    output[i] = 0
+                else:
+                    output[i] = self.lex[0][idx].ints[<int>feat_id]
+                i += 1
+        return i
+
+    cdef int string_array(self, atom_t* output, int i, int* indices, int n_idx,
+                          int* features, int n_feat) except -1:
+        # Copy string-valued lexeme attributes into ``output`` from slot i on.
+        #
+        # For each of the n_feat ids in ``features`` (outer loop) and each of
+        # the n_idx token positions in ``indices`` (inner loop), one slot is
+        # written: the token's ``strings[feat_id]`` entry cast to atom_t, or 0
+        # when the position lies outside [0, number of tokens). The caller
+        # must supply room for n_feat * n_idx slots from i onwards; nothing
+        # here checks that.
+        #
+        # Returns the slot index following the last one written. ``except -1``
+        # mirrors the declaration in tokens.pxd, as extend() already does.
+        cdef int feat_id, idx
+        cdef int length = self.lex.size()
+        for feat_id in features[:n_feat]:
+            for idx in indices[:n_idx]:
+                if idx < 0 or idx >= length:
+                    output[i] = 0
+                else:
+                    output[i] = <atom_t>self.lex[0][idx].strings[<int>feat_id]
+                i += 1
+        return i
+
+    cdef int bool_array(self, atom_t* output, int i, int* indices, int n_idx,
+                        int* features, int n_feat) except -1:
+        # Copy flag checks on lexemes into ``output``, starting at slot i.
+        #
+        # For each of the n_feat ids in ``features`` (outer loop) and each of
+        # the n_idx token positions in ``indices`` (inner loop), one slot is
+        # written: the result of lexeme_check_dist_flag(lexeme, feat_id), or 0
+        # when the position lies outside [0, number of tokens). The caller
+        # must supply room for n_feat * n_idx slots from i onwards; nothing
+        # here checks that.
+        # NOTE(review): lexeme_check_dist_flag presumably comes from the
+        # ``from .lexeme cimport *`` line -- confirm.
+        #
+        # Returns the slot index following the last one written. ``except -1``
+        # mirrors the declaration in tokens.pxd, as extend() already does.
+        cdef int feat_id, idx
+        cdef int length = self.lex.size()
+        for feat_id in features[:n_feat]:
+            for idx in indices[:n_idx]:
+                if idx < 0 or idx >= length:
+                    output[i] = 0
+                else:
+                    output[i] = lexeme_check_dist_flag(self.lex[0][idx], feat_id)
+                i += 1
+        return i
+
    cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
        cdef int i
        if lexemes == NULL:
@ -89,6 +133,8 @@ cdef class Tokens:
    # methods, which requires them to know the IDs.

    cpdef unicode string(self, size_t i):  # text for position i, via orig()
+        if i >= self.lex.size():  # reject positions at or past self.lex.size()
+            raise IndexError
        return self.orig(i)  # the lookup itself is delegated to orig()

    cpdef unicode orig(self, size_t i):