* Add as_array and count_by method

This commit is contained in:
Matthew Honnibal 2014-12-04 20:46:55 +11:00
parent e1b1f45cc9
commit 69bb022204
2 changed files with 26 additions and 18 deletions

View File

@ -2,13 +2,12 @@ import numpy as np
cimport numpy as np cimport numpy as np
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .typedefs cimport flags_t from .typedefs cimport flags_t
from .utf8string cimport StringStore from .utf8string cimport StringStore
from thinc.typedefs cimport atom_t
cdef class Tokens: cdef class Tokens:
cdef Pool mem cdef Pool mem
@ -30,7 +29,7 @@ cdef class Tokens:
cdef int push_back(self, int i, const Lexeme* lexeme) except -1 cdef int push_back(self, int i, const Lexeme* lexeme) except -1
cpdef int set_tag(self, int i, int tag_type, int tag) except -1 cpdef int set_tag(self, int i, int tag_type, int tag) except -1
cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features) cpdef np.ndarray[long, ndim=2] get_array(self, list features)
cdef class Token: cdef class Token:

View File

@ -1,7 +1,13 @@
# cython: profile=True # cython: profile=True
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
from .lexeme cimport * from .lexeme cimport *
cimport cython cimport cython
import numpy as np
cimport numpy as np
POS = 0 POS = 0
ENTITY = 0 ENTITY = 0
@ -19,20 +25,10 @@ cdef class Tokens:
"""A sequence of references to Lexeme objects. """A sequence of references to Lexeme objects.
The Tokens class provides fast and memory-efficient access to lexical features, The Tokens class provides fast and memory-efficient access to lexical features,
and can efficiently export the data to a numpy array. Specific languages and can efficiently export the data to a numpy array.
create their own Tokens subclasses, to provide more convenient access to
language-specific features.
>>> from spacy.en import EN >>> from spacy.en import EN
>>> tokens = EN.tokenize('An example sentence.') >>> tokens = EN.tokenize('An example sentence.')
>>> tokens.string(0)
'An'
>>> tokens.prob(0) > tokens.prob(1)
True
>>> tokens.can_noun(0)
False
>>> tokens.can_noun(1)
True
""" """
def __init__(self, StringStore string_store, string_length=0): def __init__(self, StringStore string_store, string_length=0):
self._string_store = string_store self._string_store = string_store
@ -104,15 +100,28 @@ cdef class Tokens:
elif tag_type == ENTITY: elif tag_type == ENTITY:
self.ner[i] = tag self.ner[i] = tag
cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features): @cython.boundscheck(False)
cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
cdef int i, j cdef int i, j
cdef np.ndarray[atom_t, ndim=2] output cdef attr_id_t feature
output = np.ndarray(shape=(self.length, len(features)), dtype=int) cdef np.ndarray[long, ndim=2] output
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
for i in range(self.length): for i in range(self.length):
for j, feature in enumerate(features): for j, feature in enumerate(attr_ids):
output[i, j] = get_attr(self.lex[i], feature) output[i, j] = get_attr(self.lex[i], feature)
return output return output
def count_by(self, attr_id_t attr_id):
    """Tally the values of one lexical attribute across the sequence.

    For every position ``i`` in ``range(self.length)`` the attribute
    selected by ``attr_id`` is read from ``self.lex[i]`` with ``get_attr``
    and the counter entry for that value is incremented by one.

    Args:
        attr_id: Identifier of the attribute to read from each lexeme.

    Returns:
        The result of ``dict(counts)`` for the populated ``PreshCounter``;
        presumably a mapping from attribute value to occurrence count.
        NOTE(review): how ``PreshCounter`` iterates (and whether every key
        is emitted) is defined in preshed, not here; confirm.
    """
    cdef int i
    cdef attr_t attr
    # The constructor argument is presumably an initial size hint for the
    # underlying table; verify against preshed.
    cdef PreshCounter counts = PreshCounter(2 ** 8)
    for i in range(self.length):
        attr = get_attr(self.lex[i], attr_id)
        counts.inc(attr, 1)
    return dict(counts)
def _realloc(self, new_size): def _realloc(self, new_size):
self.max_length = new_size self.max_length = new_size
n = new_size + (PADDING * 2) n = new_size + (PADDING * 2)