2014-09-15 01:22:40 +00:00
|
|
|
from cpython.ref cimport Py_INCREF
|
2014-09-17 21:09:24 +00:00
|
|
|
from cymem.cymem cimport Pool
|
2014-09-15 01:22:40 +00:00
|
|
|
|
2014-09-10 18:41:37 +00:00
|
|
|
|
2014-10-09 02:50:05 +00:00
|
|
|
cdef LexemeC* lexeme_init(Pool mem, size_t i, unicode string, double prob,
                          size_t cluster, list views, set flags):
    """Allocate a LexemeC from `mem` and populate it.

    Args:
        mem: Pool that the struct and its `views` array are allocated from.
        i: Value stored in `lexeme.i`.
        string: Text stored (UTF-8 encoded, via intern_and_encode) in
            `lexeme.string`; its encoded byte length goes to `lexeme.length`.
        prob: Value stored in `lexeme.prob`.
        cluster: Value stored in `lexeme.cluster`.
        views: Unicode strings; each one is encoded and stored, in order, in
            `lexeme.views`.
        flags: Bit positions to switch on in `lexeme.flags`.

    Returns:
        Pointer to the newly allocated lexeme.
    """
    cdef size_t view_length = 0
    cdef size_t view_id = 0
    cdef unicode view
    cdef LexemeC* lexeme = <LexemeC*>mem.alloc(1, sizeof(LexemeC))
    lexeme.i = i
    lexeme.cluster = cluster
    lexeme.prob = prob
    lexeme.string = intern_and_encode(string, &lexeme.length)
    # lexeme_pack walks `views` until it reaches a NULL entry, so reserve one
    # slot beyond the real views for that sentinel. This also keeps the
    # request non-empty when `views` is an empty list.
    lexeme.views = <char**>mem.alloc(len(views) + 1, sizeof(char*))
    for view in views:
        # The per-view byte length is not stored anywhere; `view_length` is
        # only a scratch target for intern_and_encode's out-parameter.
        lexeme.views[view_id] = intern_and_encode(view, &view_length)
        view_id += 1
    # Write the terminator explicitly rather than relying on the allocator.
    lexeme.views[view_id] = NULL
    # NOTE(review): `flags` is only OR-ed into, never assigned; this assumes
    # mem.alloc hands back zeroed memory -- confirm against the Pool docs.
    for active_flag in flags:
        lexeme.flags |= (1 << active_flag)
    return lexeme
|
|
|
|
|
|
|
|
|
2014-09-14 23:31:44 +00:00
|
|
|
cdef char* intern_and_encode(unicode string, size_t* length):
    """Encode `string` as UTF-8, intern the bytes, and return a C pointer.

    Args:
        string: Text to encode.
        length: Out-parameter; receives the length in bytes of the encoded
            string.

    Returns:
        A `char*` view of the interned byte string.
    """
    cdef bytes interned = intern(string.encode('utf8'))
    # An extra reference is taken here and never released in this function;
    # presumably this is what keeps the buffer behind the returned pointer
    # alive after the local goes out of scope.
    Py_INCREF(interned)
    length[0] = len(interned)
    return <char*>interned
|
|
|
|
|
|
|
|
|
|
|
|
cdef bint lexeme_check_flag(LexemeC* lexeme, size_t flag_id):
    """Return True if bit number `flag_id` is set in `lexeme.flags`.

    Args:
        lexeme: Lexeme whose flag bits are inspected.
        flag_id: Zero-based bit position to test.
    """
    # Build the mask as size_t: shifting the plain literal `1` is done in C
    # `int`, which is undefined once flag_id reaches the width of int.
    cdef size_t mask = (<size_t>1) << flag_id
    # Compare against zero so the truth value does not depend on how a wide
    # integer would be narrowed into the bint return slot.
    return (lexeme.flags & mask) != 0
|
|
|
|
|
|
|
|
|
|
|
|
cdef unicode lexeme_string_view(LexemeC* lexeme, size_t view_id):
    """Return the view stored at index `view_id`, decoded from UTF-8.

    Args:
        lexeme: Lexeme whose `views` array is read.
        view_id: Index into `lexeme.views`; no bounds check is performed here.
    """
    cdef char* raw_view = lexeme.views[view_id]
    return raw_view.decode('utf8')
|
2014-10-09 03:10:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
cdef dict lexeme_pack(LexemeC* lexeme):
    """Copy the fields of `lexeme` into a new Python dict.

    The dict has the keys 'i', 'length', 'prob', 'cluster', 'string',
    'views' and 'flags'. 'string' and every entry of 'views' are decoded
    from UTF-8.

    Args:
        lexeme: Lexeme to read. Its `views` array is scanned until a NULL
            entry is found, so it must be NULL-terminated.

    Returns:
        The populated dict.
    """
    cdef list decoded_views = []
    cdef size_t slot = 0
    cdef dict packed = {
        'i': lexeme.i,
        'length': lexeme.length,
        'prob': lexeme.prob,
        'cluster': lexeme.cluster,
        'string': lexeme.string.decode('utf8'),
        'views': decoded_views,
        'flags': lexeme.flags,
    }
    # `decoded_views` is the same list object stored under 'views', so the
    # appends below fill in the packed entry.
    while lexeme.views[slot] != NULL:
        decoded_views.append(lexeme.views[slot].decode('utf8'))
        slot += 1
    return packed
|
|
|
|
|
|
|
|
|
|
|
|
cdef int lexeme_unpack(LexemeC* lex, dict p) except -1:
    """Fill `lex` from a dict with the keys produced by lexeme_pack.

    Args:
        lex: Destination lexeme. Entries are written into `lex.views` without
            allocating it here.
            NOTE(review): presumably the caller has already allocated
            `lex.views` with room for every entry of p['views'] -- confirm.
        p: Dict with the keys 'i', 'length', 'prob', 'cluster', 'string',
            'views' and 'flags'.

    Returns:
        0 on success; -1 signals that a Python exception was raised.
    """
    # Scratch target for intern_and_encode's out-parameter; `lex.length` is
    # taken from p['length'] instead.
    cdef size_t scratch_length
    cdef size_t slot = 0
    lex.i = p['i']
    lex.length = p['length']
    lex.prob = p['prob']
    lex.cluster = p['cluster']
    lex.string = intern_and_encode(p['string'], &scratch_length)
    for view in p['views']:
        lex.views[slot] = intern_and_encode(view, &scratch_length)
        slot += 1
    lex.flags = p['flags']
    return 0
|