* Improve array features in tokens

2014-10-22 12:55:42 +11:00 · 2014-10-22 12:55:42 +11:00 · 7018b53d3a
parent 43d5964e13
commit 7018b53d3a
2 changed files with 53 additions and 0 deletions
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@ -1,5 +1,6 @@
 from spacy.lexeme cimport LexemeC
 from libcpp.vector cimport vector
 from thinc.typedefs cimport atom_t
 cdef class Tokens:
@ -9,6 +10,12 @@ cdef class Tokens:
    cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
    cdef int push_back(self, int i, LexemeC* lexeme) except -1
    # Feature-array fillers. Each one writes a value for every
    # (feature, index) pair into `atoms`, starting at offset `i`, with
    # features in the outer loop and indices in the inner loop. An index
    # that falls outside the token sequence produces 0. The return value is
    # the offset just past the last slot written; -1 is reserved by the
    # `except -1` clause to signal a Python exception.
    cdef int int_array(self, atom_t* atoms, int i, int* indices, int n_idx,
                       int* features, int n_feat) except -1
    cdef int string_array(self, atom_t* atoms, int i, int* indices, int n_idx,
                          int* features, int n_feat) except -1
    cdef int bool_array(self, atom_t* atoms, int i, int* indices, int n_idx,
                        int* features, int n_feat) except -1
    cpdef int id(self, size_t i) except -1
    cpdef float prob(self, size_t i) except 1
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -2,6 +2,9 @@
 from .word cimport Lexeme
 from .lexeme cimport *
 cimport numpy
 cimport cython
 import numpy
 cdef class Tokens:
@ -38,6 +41,8 @@ cdef class Tokens:
        del self.pos
    def __getitem__(self, i):
        """Return a Lexeme wrapper for the token at position `i`.

        Negative positions count back from the end of the sequence, as for
        built-in Python sequences. Raises IndexError when `i` is out of
        range in either direction.
        """
        length = self.lex.size()
        # Normalise a negative index before the bounds check; otherwise it
        # would slip past the upper-bound test and fail in the size_t
        # conversion with OverflowError rather than IndexError.
        if i < 0:
            i += length
        if i < 0 or i >= length:
            raise IndexError
        # The stored element is handed to Lexeme as an integer address.
        return Lexeme(<size_t>self.lex.at(i))
    def __len__(self):
@ -48,6 +53,45 @@ cdef class Tokens:
        self.idx.push_back(idx)
        return idx + lexeme.ints[<int>LexInt_length]
    cdef int int_array(self, atom_t* output, int i, int* indices, int n_idx,
                       int* features, int n_feat) except -1:
        """Fill `output` with integer lexeme features.

        For each of the `n_feat` feature ids in `features` (outer loop) and
        each of the `n_idx` token positions in `indices` (inner loop), one
        slot of `output` is written, starting at offset `i`. A position
        outside the token sequence yields 0; otherwise the value is the
        lexeme's `ints[feat_id]` entry.

        No bounds check is made on `output`: it must have room for
        n_feat * n_idx values from offset `i`.

        Returns the offset just past the last slot written. The
        `except -1` clause matches the declaration in tokens.pxd.
        """
        cdef int feat_id, idx
        cdef int length = self.lex.size()
        for feat_id in features[:n_feat]:
            for idx in indices[:n_idx]:
                if idx < 0 or idx >= length:
                    # Out-of-range context position: emit the zero atom.
                    output[i] = 0
                else:
                    output[i] = self.lex[0][idx].ints[<int>feat_id]
                i += 1
        return i
    cdef int string_array(self, atom_t* output, int i, int* indices, int n_idx,
                          int* features, int n_feat) except -1:
        """Fill `output` with string-valued lexeme features as atoms.

        For each of the `n_feat` feature ids in `features` (outer loop) and
        each of the `n_idx` token positions in `indices` (inner loop), one
        slot of `output` is written, starting at offset `i`. A position
        outside the token sequence yields 0; otherwise the value is the
        lexeme's `strings[feat_id]` entry cast to `atom_t`.

        No bounds check is made on `output`: it must have room for
        n_feat * n_idx values from offset `i`.

        Returns the offset just past the last slot written. The
        `except -1` clause matches the declaration in tokens.pxd.
        """
        cdef int feat_id, idx
        cdef int length = self.lex.size()
        for feat_id in features[:n_feat]:
            for idx in indices[:n_idx]:
                if idx < 0 or idx >= length:
                    # Out-of-range context position: emit the zero atom.
                    output[i] = 0
                else:
                    output[i] = <atom_t>self.lex[0][idx].strings[<int>feat_id]
                i += 1
        return i
    cdef int bool_array(self, atom_t* output, int i, int* indices, int n_idx,
                        int* features, int n_feat) except -1:
        """Fill `output` with per-lexeme flag checks.

        For each of the `n_feat` feature ids in `features` (outer loop) and
        each of the `n_idx` token positions in `indices` (inner loop), one
        slot of `output` is written, starting at offset `i`. A position
        outside the token sequence yields 0; otherwise the value is
        whatever `lexeme_check_dist_flag(lexeme, feat_id)` returns for
        that token's lexeme.

        No bounds check is made on `output`: it must have room for
        n_feat * n_idx values from offset `i`.

        Returns the offset just past the last slot written. The
        `except -1` clause matches the declaration in tokens.pxd.
        """
        cdef int feat_id, idx
        cdef int length = self.lex.size()
        for feat_id in features[:n_feat]:
            for idx in indices[:n_idx]:
                if idx < 0 or idx >= length:
                    # Out-of-range context position: emit the zero atom.
                    output[i] = 0
                else:
                    output[i] = lexeme_check_dist_flag(self.lex[0][idx], feat_id)
                i += 1
        return i
    cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
        cdef int i
        if lexemes == NULL:
@ -89,6 +133,8 @@ cdef class Tokens:
    # methods, which requires them to know the IDs.
    cpdef unicode string(self, size_t i):
        """Return `self.orig(i)` for token `i`.

        Raises IndexError when `i` is not a valid token position.
        """
        if i < self.lex.size():
            return self.orig(i)
        raise IndexError
    cpdef unicode orig(self, size_t i):