# Mirror of https://github.com/explosion/spaCy.git — Cython source, 60 lines, 1.3 KiB
from libc.stdint cimport uint64_t
|
|
|
|
# Declare these typedefs before the cimport below to avoid a circular-import
# problem.
ctypedef int ClusterID        # type of Distribution.cluster
ctypedef uint64_t StringHash  # unsigned 64-bit value used for every hash field
ctypedef size_t Lexeme_addr   # presumably a Lexeme pointer held as an integer -- TODO confirm
ctypedef char Bits8           # flag byte; NOTE(review): signedness of plain `char` is platform-defined
ctypedef uint64_t Bits64      # 64-bit field, used for Distribution.tagdict
|
|
|
|
|
|
from spacy.spacy cimport Language
|
|
|
|
|
|
# Orthographic record that a Lexeme points to through `Lexeme.orth`.
# The member order is part of the C layout; do not reorder.
cdef struct Orthography:
    # Hashed string views (names match the LAST3 / SHAPE / NORM members of
    # StringAttr).
    StringHash last3
    StringHash shape
    StringHash norm

    # Non-hash attributes.
    size_t length      # presumably the length of the word string -- TODO confirm
    Py_UNICODE first   # presumably the first character of the word -- TODO confirm
    Bits8 flags        # flag byte; the bit meanings are not defined in this file
|
|
|
|
|
|
# Distributional record that a Lexeme points to through `Lexeme.dist`.
# The member order is part of the C layout; do not reorder.
cdef struct Distribution:
    double prob        # presumably a (log-)probability of the word -- TODO confirm scale
    ClusterID cluster  # integer cluster identifier
    Bits64 tagdict     # 64-bit field; presumably one bit per tag -- TODO confirm
    Bits8 flags        # flag byte; the bit meanings are not defined in this file
|
|
|
|
|
|
# A lexicon entry: two string hashes plus pointers to the optional
# Distribution / Orthography records and to the next Lexeme in the chain.
# BLANK_WORD initializes this struct positionally, so the member order
# (sic, lex, dist, orth, tail) must not change.
cdef struct Lexeme:
    StringHash sic  # Hash of the original string
    StringHash lex  # Hash of the word, with punctuation and clitics split off

    Distribution* dist  # Distribution info, lazy loaded
    Orthography* orth   # Extra orthographic views
    Lexeme* tail        # Lexemes are linked lists, to deal with sub-tokens
|
|
|
|
|
|
# Lexeme whose two hashes are 0 and whose dist / orth / tail pointers are NULL.
# The arguments are positional and follow the struct's member order:
# sic, lex, dist, orth, tail.
cdef Lexeme BLANK_WORD = Lexeme(0, 0, NULL, NULL, NULL)
|
|
|
|
|
|
# Selector type of the `attr` parameter of attr_of(). Each member has a
# same-named `<name>_of` accessor declared in this file. No explicit values
# are given, so the members number 0..5 in declaration order.
cdef enum StringAttr:
    SIC
    LEX
    NORM
    SHAPE
    LAST3
    LENGTH
|
|
|
|
|
|
# Attribute accessors, declared `cpdef` so they are callable from both Python
# and C. Only the signatures are given here.
#
# `lex_id` is a size_t; presumably it is a Lexeme address (cf. Lexeme_addr) --
# TODO confirm against the implementation.
#
# `except 0` tells Cython that a return value of 0 signals a raised
# exception, so callers propagate an error whenever 0 comes back.
cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0

cpdef StringHash sic_of(size_t lex_id) except 0
cpdef StringHash lex_of(size_t lex_id) except 0
cpdef StringHash norm_of(size_t lex_id) except 0
cpdef StringHash shape_of(size_t lex_id) except 0
cpdef StringHash last3_of(size_t lex_id) except 0
# NOTE(review): unlike its siblings, length_of has no `except` clause, and it
# returns StringHash although Orthography.length is a size_t -- confirm that
# both are intended before changing either.
cpdef StringHash length_of(size_t lex_id)
|