mirror of https://github.com/explosion/spaCy.git
* Add as_array and count_by method
This commit is contained in:
parent
e1b1f45cc9
commit
69bb022204
|
@ -2,13 +2,12 @@ import numpy as np
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
from thinc.typedefs cimport atom_t
|
||||||
|
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .typedefs cimport flags_t
|
from .typedefs cimport flags_t
|
||||||
from .utf8string cimport StringStore
|
from .utf8string cimport StringStore
|
||||||
|
|
||||||
from thinc.typedefs cimport atom_t
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokens:
|
cdef class Tokens:
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
|
@ -30,7 +29,7 @@ cdef class Tokens:
|
||||||
cdef int push_back(self, int i, const Lexeme* lexeme) except -1
|
cdef int push_back(self, int i, const Lexeme* lexeme) except -1
|
||||||
cpdef int set_tag(self, int i, int tag_type, int tag) except -1
|
cpdef int set_tag(self, int i, int tag_type, int tag) except -1
|
||||||
|
|
||||||
cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features)
|
cpdef np.ndarray[long, ndim=2] get_array(self, list features)
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
|
|
|
@ -1,7 +1,13 @@
|
||||||
# cython: profile=True
|
# cython: profile=True
|
||||||
|
from preshed.maps cimport PreshMap
|
||||||
|
from preshed.counter cimport PreshCounter
|
||||||
|
|
||||||
from .lexeme cimport *
|
from .lexeme cimport *
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
cimport numpy as np
|
||||||
|
|
||||||
POS = 0
|
POS = 0
|
||||||
ENTITY = 0
|
ENTITY = 0
|
||||||
|
|
||||||
|
@ -19,20 +25,10 @@ cdef class Tokens:
|
||||||
"""A sequence of references to Lexeme objects.
|
"""A sequence of references to Lexeme objects.
|
||||||
|
|
||||||
The Tokens class provides fast and memory-efficient access to lexical features,
|
The Tokens class provides fast and memory-efficient access to lexical features,
|
||||||
and can efficiently export the data to a numpy array. Specific languages
|
and can efficiently export the data to a numpy array.
|
||||||
create their own Tokens subclasses, to provide more convenient access to
|
|
||||||
language-specific features.
|
|
||||||
|
|
||||||
>>> from spacy.en import EN
|
>>> from spacy.en import EN
|
||||||
>>> tokens = EN.tokenize('An example sentence.')
|
>>> tokens = EN.tokenize('An example sentence.')
|
||||||
>>> tokens.string(0)
|
|
||||||
'An'
|
|
||||||
>>> tokens.prob(0) > tokens.prob(1)
|
|
||||||
True
|
|
||||||
>>> tokens.can_noun(0)
|
|
||||||
False
|
|
||||||
>>> tokens.can_noun(1)
|
|
||||||
True
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, StringStore string_store, string_length=0):
|
def __init__(self, StringStore string_store, string_length=0):
|
||||||
self._string_store = string_store
|
self._string_store = string_store
|
||||||
|
@ -104,15 +100,28 @@ cdef class Tokens:
|
||||||
elif tag_type == ENTITY:
|
elif tag_type == ENTITY:
|
||||||
self.ner[i] = tag
|
self.ner[i] = tag
|
||||||
|
|
||||||
cpdef np.ndarray[atom_t, ndim=2] get_array(self, list features):
|
@cython.boundscheck(False)
|
||||||
|
cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
cdef np.ndarray[atom_t, ndim=2] output
|
cdef attr_id_t feature
|
||||||
output = np.ndarray(shape=(self.length, len(features)), dtype=int)
|
cdef np.ndarray[long, ndim=2] output
|
||||||
|
output = np.ndarray(shape=(self.length, len(attr_ids)), dtype=int)
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
for j, feature in enumerate(features):
|
for j, feature in enumerate(attr_ids):
|
||||||
output[i, j] = get_attr(self.lex[i], feature)
|
output[i, j] = get_attr(self.lex[i], feature)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
def count_by(self, attr_id_t attr_id):
|
||||||
|
cdef int i
|
||||||
|
cdef attr_t attr
|
||||||
|
cdef size_t count
|
||||||
|
|
||||||
|
cdef PreshCounter counts = PreshCounter(2 ** 8)
|
||||||
|
for i in range(self.length):
|
||||||
|
attr = get_attr(self.lex[i], attr_id)
|
||||||
|
counts.inc(attr, 1)
|
||||||
|
return dict(counts)
|
||||||
|
|
||||||
def _realloc(self, new_size):
|
def _realloc(self, new_size):
|
||||||
self.max_length = new_size
|
self.max_length = new_size
|
||||||
n = new_size + (PADDING * 2)
|
n = new_size + (PADDING * 2)
|
||||||
|
|
Loading…
Reference in New Issue