From 7018b53d3ad3a9fbb7d2443d0fc377d28a3502b2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 22 Oct 2014 12:55:42 +1100 Subject: [PATCH] * Improve array features in tokens --- spacy/tokens.pxd | 7 +++++++ spacy/tokens.pyx | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index 7c1f77644..bc5c5fe1d 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -1,5 +1,6 @@ from spacy.lexeme cimport LexemeC from libcpp.vector cimport vector +from thinc.typedefs cimport atom_t cdef class Tokens: @@ -9,6 +10,12 @@ cdef class Tokens: cdef int extend(self, int i, LexemeC** lexemes, int n) except -1 cdef int push_back(self, int i, LexemeC* lexeme) except -1 + cdef int int_array(self, atom_t* atoms, int i, int* indices, int n_idx, + int* features, int n_feat) except -1 + cdef int string_array(self, atom_t* atoms, int i, int* indices, int n_idx, + int* features, int n_feat) except -1 + cdef int bool_array(self, atom_t* atoms, int i, int* indices, int n_idx, + int* features, int n_feat) except -1 cpdef int id(self, size_t i) except -1 cpdef float prob(self, size_t i) except 1 diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 0a3a075b1..58c26d7f9 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -2,6 +2,9 @@ from .word cimport Lexeme from .lexeme cimport * +cimport numpy +cimport cython +import numpy cdef class Tokens: @@ -38,6 +41,8 @@ cdef class Tokens: del self.pos def __getitem__(self, i): + if i >= self.lex.size(): + raise IndexError return Lexeme(self.lex.at(i)) def __len__(self): @@ -48,6 +53,45 @@ cdef class Tokens: self.idx.push_back(idx) return idx + lexeme.ints[LexInt_length] + cdef int int_array(self, atom_t* output, int i, int* indices, int n_idx, + int* features, int n_feat): + cdef int feat_id, idx + cdef int length = self.lex.size() + for feat_id in features[:n_feat]: + for idx in indices[:n_idx]: + if idx < 0 or idx >= length: + output[i] = 0 + else: + output[i] = self.lex[0][idx].ints[feat_id] + i += 1 + return i + + cdef int string_array(self, atom_t* output, int i, int* indices, int n_idx, + int* features, int n_feat): + cdef int feat_id, idx + cdef int length = self.lex.size() + for feat_id in features[:n_feat]: + for idx in indices[:n_idx]: + if idx < 0 or idx >= length: + output[i] = 0 + else: + output[i] = self.lex[0][idx].strings[feat_id] + i += 1 + return i + + cdef int bool_array(self, atom_t* output, int i, int* indices, int n_idx, + int* features, int n_feat): + cdef int feat_id, idx + cdef int length = self.lex.size() + for feat_id in features[:n_feat]: + for idx in indices[:n_idx]: + if idx < 0 or idx >= length: + output[i] = 0 + else: + output[i] = lexeme_check_dist_flag(self.lex[0][idx], feat_id) + i += 1 + return i + cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1: cdef int i if lexemes == NULL: @@ -89,6 +133,8 @@ cdef class Tokens: # methods, which requires them to know the IDs. cpdef unicode string(self, size_t i): + if i >= self.lex.size(): + raise IndexError return self.orig(i) cpdef unicode orig(self, size_t i):