2014-12-16 11:44:43 +00:00
|
|
|
from libc.stdint cimport uint32_t
|
|
|
|
|
2015-01-19 08:59:55 +00:00
|
|
|
from numpy cimport ndarray
|
2015-01-24 06:29:04 +00:00
|
|
|
cimport numpy
|
2014-12-02 12:48:05 +00:00
|
|
|
|
2014-10-22 14:57:59 +00:00
|
|
|
from cymem.cymem cimport Pool
|
2014-12-21 20:25:43 +00:00
|
|
|
from thinc.typedefs cimport atom_t
|
2014-10-22 14:57:59 +00:00
|
|
|
|
2015-01-25 05:31:07 +00:00
|
|
|
from .typedefs cimport flags_t, attr_id_t, attr_t
|
|
|
|
from .parts_of_speech cimport univ_pos_t
|
2015-01-11 23:26:22 +00:00
|
|
|
from .structs cimport Morphology, TokenC, LexemeC
|
2014-12-21 20:25:43 +00:00
|
|
|
from .vocab cimport Vocab
|
2014-12-19 20:03:26 +00:00
|
|
|
from .strings cimport StringStore
|
2014-12-05 04:56:14 +00:00
|
|
|
|
|
|
|
|
2015-01-11 23:26:22 +00:00
|
|
|
# Single-identifier alias for a pointer to a const LexemeC struct.
# It is listed as one of the members of the LexemeOrToken fused type in this file.
ctypedef const LexemeC* const_Lexeme_ptr
|
2014-12-09 05:50:01 +00:00
|
|
|
# Single-identifier alias for a (mutable) pointer to a TokenC struct.
# It is listed as one of the members of the LexemeOrToken fused type in this file.
ctypedef TokenC* TokenC_ptr
|
|
|
|
|
|
|
|
# Fused type covering the two pointer aliases declared above, so that one
# signature can be specialised for either a const LexemeC pointer or a
# TokenC pointer.
ctypedef fused LexemeOrToken:
    const_Lexeme_ptr
    TokenC_ptr
|
|
|
|
|
|
|
|
|
2015-01-11 23:26:22 +00:00
|
|
|
# Forward declaration of a C-level, GIL-free function: takes a const LexemeC
# pointer and an attribute id and returns an attr_t.  The implementation is
# not part of this declaration file; presumably it returns the lexeme's value
# for the attribute selected by `feat_name` -- confirm in the implementation.
cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil
|
2014-12-24 06:42:00 +00:00
|
|
|
# Forward declaration of a C-level, GIL-free function: takes a const TokenC
# pointer and an attribute id and returns an attr_t.  The implementation is
# not part of this declaration file.
# NOTE(review): the pointer parameter is named `lex` although its type is
# TokenC, not LexemeC -- looks like a carry-over from get_lex_attr.
cdef attr_t get_token_attr(const TokenC* lex, attr_id_t feat_name) nogil
|
|
|
|
|
2015-01-11 23:26:22 +00:00
|
|
|
cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
    # Return True when bit number `flag_id` is set in `lexeme.flags`,
    # False otherwise.  Safe to call without the GIL.
    #
    # The mask is built from a flags_t-typed one rather than the bare literal
    # `1`: an untyped literal makes the shift happen at C int width, which
    # overflows (undefined behaviour in C) for flag ids at or beyond that
    # width.  Assumes LexemeC.flags is itself a flags_t -- confirm in the
    # struct definition.
    cdef flags_t one = 1
    # Compare against zero explicitly so the result is a clean 0/1 truth value.
    return (lexeme.flags & (one << flag_id)) != 0
|
|
|
|
|
|
|
|
|
2014-09-15 01:22:40 +00:00
|
|
|
cdef class Tokens:
    # C-level declaration of the Tokens extension type.  Attribute order is
    # kept exactly as originally declared.
    cdef:
        # cymem Pool and Vocab instances held by the object.
        Pool mem
        Vocab vocab

        # Pointer to TokenC structs; `length` / `max_length` below are
        # presumably the used and allocated sizes of this array -- confirm
        # in the implementation.
        TokenC* data

        # Python-level objects stored on the instance.
        list _py_tokens
        unicode _string
        list _tag_strings
        list _dep_strings

        # Boolean flags, readable and writable from Python (`public`).
        public bint is_tagged
        public bint is_parsed

        int length
        int max_length

    # C-only method; specialised over the LexemeOrToken fused type.
    # Returns an int, with -1 reserved to signal a raised exception.
    cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1

    # Callable from both C and Python; returns a 2-D memoryview of C longs.
    cpdef long[:,:] to_array(self, object features)
|
2014-12-02 12:48:05 +00:00
|
|
|
|
2014-10-23 13:59:17 +00:00
|
|
|
|
|
|
|
cdef class Token:
    # C-level declaration of the Token extension type.  Every attribute is
    # declared `readonly`: readable from Python, assignable only from Cython
    # code.  Attribute order is kept exactly as originally declared.
    cdef readonly:
        # The Tokens object this token refers to, and an integer `i`
        # (presumably the token's position within it -- confirm in the
        # implementation).
        Tokens _seq
        int i

        # attr_t-valued attributes.
        attr_t idx
        attr_t cluster
        attr_t length
        attr_t orth
        attr_t lower
        attr_t norm
        attr_t shape
        attr_t prefix
        attr_t suffix

        # Floating-point attributes.
        float prob
        float sentiment

        attr_t flags
        attr_t lemma

        # `pos` uses the univ_pos_t type from .parts_of_speech; `tag` and
        # `dep` are plain attr_t values.
        univ_pos_t pos
        attr_t tag
        attr_t dep

        # Object-typed attributes: a numpy ndarray and a unicode string.
        ndarray repvec
        unicode string
|