* Improve array features in tokens

This commit is contained in:
Matthew Honnibal 2014-10-22 12:55:42 +11:00
parent 43d5964e13
commit 7018b53d3a
2 changed files with 53 additions and 0 deletions

View File

@ -1,5 +1,6 @@
from spacy.lexeme cimport LexemeC from spacy.lexeme cimport LexemeC
from libcpp.vector cimport vector from libcpp.vector cimport vector
from thinc.typedefs cimport atom_t
cdef class Tokens: cdef class Tokens:
@ -9,6 +10,12 @@ cdef class Tokens:
cdef int extend(self, int i, LexemeC** lexemes, int n) except -1 cdef int extend(self, int i, LexemeC** lexemes, int n) except -1
cdef int push_back(self, int i, LexemeC* lexeme) except -1 cdef int push_back(self, int i, LexemeC* lexeme) except -1
cdef int int_array(self, atom_t* atoms, int i, int* indices, int n_idx,
int* features, int n_feat) except -1
cdef int string_array(self, atom_t* atoms, int i, int* indices, int n_idx,
int* features, int n_feat) except -1
cdef int bool_array(self, atom_t* atoms, int i, int* indices, int n_idx,
int* features, int n_feat) except -1
cpdef int id(self, size_t i) except -1 cpdef int id(self, size_t i) except -1
cpdef float prob(self, size_t i) except 1 cpdef float prob(self, size_t i) except 1

View File

@ -2,6 +2,9 @@
from .word cimport Lexeme from .word cimport Lexeme
from .lexeme cimport * from .lexeme cimport *
cimport numpy
cimport cython
import numpy
cdef class Tokens: cdef class Tokens:
@ -38,6 +41,8 @@ cdef class Tokens:
del self.pos del self.pos
def __getitem__(self, i):
    """Return the token at position `i`, wrapped in a `Lexeme`.

    Raises:
        IndexError: if `i` is negative or not below `self.lex.size()`.
    """
    # Reject negative positions explicitly: otherwise they would only fail
    # later, when `i` is coerced to an unsigned size for `at()`, and would
    # surface as an OverflowError rather than the IndexError that callers
    # of a sequence-style accessor expect.
    if i < 0 or i >= self.lex.size():
        raise IndexError
    return Lexeme(<size_t>self.lex.at(i))
def __len__(self): def __len__(self):
@ -48,6 +53,45 @@ cdef class Tokens:
self.idx.push_back(idx) self.idx.push_back(idx)
return idx + lexeme.ints[<int>LexInt_length] return idx + lexeme.ints[<int>LexInt_length]
cdef int int_array(self, atom_t* output, int i, int* indices, int n_idx,
                   int* features, int n_feat) except -1:
    """Write integer lexeme features for a set of token positions.

    Iterates over `features[:n_feat]` in the outer loop and over
    `indices[:n_idx]` in the inner loop, writing one value to `output` per
    (feature, position) pair, starting at offset `i`. A position that is
    negative or not below `self.lex.size()` contributes 0; otherwise the
    value is the token's `ints[feat_id]` entry.

    No bounds checking is done on `output` or on the feature ids; the
    caller must supply a buffer with room for `n_feat * n_idx` values from
    offset `i` onwards.

    Returns:
        The offset one past the last slot written, i.e. the starting `i`
        plus `n_feat * n_idx`.

    The `except -1` clause mirrors the declaration in the accompanying
    .pxd so the definition and declaration share one signature.
    """
    cdef int feat_id, idx
    cdef int length = self.lex.size()
    for feat_id in features[:n_feat]:
        for idx in indices[:n_idx]:
            if idx < 0 or idx >= length:
                # Out-of-range context position: pad with zero.
                output[i] = 0
            else:
                output[i] = self.lex[0][idx].ints[<int>feat_id]
            i += 1
    return i
cdef int string_array(self, atom_t* output, int i, int* indices, int n_idx,
                      int* features, int n_feat) except -1:
    """Write string lexeme features, cast to `atom_t`, for token positions.

    Iterates over `features[:n_feat]` in the outer loop and over
    `indices[:n_idx]` in the inner loop, writing one value to `output` per
    (feature, position) pair, starting at offset `i`. A position that is
    negative or not below `self.lex.size()` contributes 0; otherwise the
    value is the token's `strings[feat_id]` entry cast to `atom_t`.

    No bounds checking is done on `output` or on the feature ids; the
    caller must supply a buffer with room for `n_feat * n_idx` values from
    offset `i` onwards.

    Returns:
        The offset one past the last slot written, i.e. the starting `i`
        plus `n_feat * n_idx`.

    The `except -1` clause mirrors the declaration in the accompanying
    .pxd so the definition and declaration share one signature.
    """
    cdef int feat_id, idx
    cdef int length = self.lex.size()
    for feat_id in features[:n_feat]:
        for idx in indices[:n_idx]:
            if idx < 0 or idx >= length:
                # Out-of-range context position: pad with zero.
                output[i] = 0
            else:
                output[i] = <atom_t>self.lex[0][idx].strings[<int>feat_id]
            i += 1
    return i
cdef int bool_array(self, atom_t* output, int i, int* indices, int n_idx,
                    int* features, int n_feat) except -1:
    """Write flag-valued lexeme features for a set of token positions.

    Iterates over `features[:n_feat]` in the outer loop and over
    `indices[:n_idx]` in the inner loop, writing one value to `output` per
    (feature, position) pair, starting at offset `i`. A position that is
    negative or not below `self.lex.size()` contributes 0; otherwise the
    value is the result of `lexeme_check_dist_flag` for that token and
    feature id.

    No bounds checking is done on `output` or on the feature ids; the
    caller must supply a buffer with room for `n_feat * n_idx` values from
    offset `i` onwards.

    Returns:
        The offset one past the last slot written, i.e. the starting `i`
        plus `n_feat * n_idx`.

    The `except -1` clause mirrors the declaration in the accompanying
    .pxd so the definition and declaration share one signature.
    """
    cdef int feat_id, idx
    cdef int length = self.lex.size()
    for feat_id in features[:n_feat]:
        for idx in indices[:n_idx]:
            if idx < 0 or idx >= length:
                # Out-of-range context position: pad with zero.
                output[i] = 0
            else:
                output[i] = lexeme_check_dist_flag(self.lex[0][idx], feat_id)
            i += 1
    return i
cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1: cdef int extend(self, int idx, LexemeC** lexemes, int n) except -1:
cdef int i cdef int i
if lexemes == NULL: if lexemes == NULL:
@ -89,6 +133,8 @@ cdef class Tokens:
# methods, which requires them to know the IDs. # methods, which requires them to know the IDs.
cpdef unicode string(self, size_t i):
    """Return the string for token `i` by delegating to `orig`.

    Raises:
        IndexError: if `i` is not below `self.lex.size()`.
    """
    if i < self.lex.size():
        return self.orig(i)
    raise IndexError
cpdef unicode orig(self, size_t i): cpdef unicode orig(self, size_t i):