2014-11-05 08:55:10 +00:00
|
|
|
from murmurhash.mrmr cimport hash64
|
|
|
|
from .lexeme cimport *
|
|
|
|
|
|
|
|
|
2014-11-05 09:45:29 +00:00
|
|
|
cdef class Slots:
    """Holder for nine Token objects, one per position of a context window.

    fill_slots loads P4..P1 from the four positions before index ``i``,
    N0 from index ``i`` itself, and N1..N4 from the four positions after it.
    """
    def __init__(self):
        # Preceding positions, furthest first.
        self.P4, self.P3, self.P2, self.P1 = Token(), Token(), Token(), Token()
        # Current position.
        self.N0 = Token()
        # Following positions, nearest first.
        self.N1, self.N2, self.N3, self.N4 = Token(), Token(), Token(), Token()
|
2014-11-05 09:45:29 +00:00
|
|
|
|
|
|
|
|
|
|
|
cdef void _number_token(Token t, int* n_fields):
    # Assign a distinct, consecutive integer to every field of ``t``.
    #
    # Numbering starts at the value currently stored in ``n_fields[0]``; on
    # return ``n_fields[0]`` holds the next unused number, so successive calls
    # on different tokens hand out non-overlapping ranges.
    cdef int i = n_fields[0]

    # Lexical attributes.
    t.sic = i; i += 1
    t.cluster = i; i += 1
    t.norm = i; i += 1
    t.shape = i; i += 1
    # fill_token stores t.asciied and _flatten_token writes
    # context[ids.asciied], so this field needs its own index too. Without it
    # every token's asciied value was written through an unnumbered id.
    t.asciied = i; i += 1
    t.prefix = i; i += 1
    t.suffix = i; i += 1
    t.length = i; i += 1

    # Type attributes.
    t.postype = i; i += 1
    t.nertype = i; i += 1
    t.sensetype = i; i += 1

    # Orthographic flags.
    t.is_alpha = i; i += 1
    t.is_ascii = i; i += 1
    t.is_digit = i; i += 1
    t.is_lower = i; i += 1
    t.is_punct = i; i += 1
    t.is_space = i; i += 1
    t.is_title = i; i += 1
    t.is_upper = i; i += 1

    t.like_number = i; i += 1
    t.like_url = i; i += 1

    t.oft_lower = i; i += 1
    t.oft_title = i; i += 1
    t.oft_upper = i; i += 1

    # List-membership flags.
    t.in_males = i; i += 1
    t.in_females = i; i += 1
    t.in_surnames = i; i += 1
    t.in_places = i; i += 1
    t.in_games = i; i += 1
    t.in_celebs = i; i += 1
    t.in_names = i; i += 1

    # Annotation values.
    t.pos = i; i += 1
    t.sense = i; i += 1
    t.ner = i; i += 1

    # Report the next free index back to the caller.
    n_fields[0] = i
|
|
|
|
|
|
|
|
|
2014-11-05 09:45:29 +00:00
|
|
|
cdef int fill_token(Token t, Lexeme* lex, atom_t pos, atom_t ner):
    # Populate ``t`` from the lexeme ``lex`` and the supplied ``pos`` / ``ner``
    # values. No value is returned explicitly.
    #
    # NOTE(review): t.is_ascii is not assigned here, although other functions
    # in this file number and flatten an is_ascii field - confirm whether a
    # matching flag should be read from lex.flags.

    # Lexical attributes copied straight from the lexeme.
    t.sic = lex.sic
    t.cluster = lex.cluster
    t.shape = lex.shape
    t.asciied = lex.asciied
    t.prefix = lex.prefix
    t.suffix = lex.suffix
    t.length = lex.length
    t.postype = lex.postype

    # norm is kept only when prob is non-zero and at least -10; otherwise the
    # shape is used in its place.
    if lex.prob != 0 and lex.prob >= -10:
        t.norm = lex.norm
    else:
        t.norm = lex.shape

    # Fields that are always reset to zero here.
    t.nertype = 0
    t.sensetype = 0
    t.sense = 0

    # Each flag field receives the masked bit of lex.flags (non-zero when the
    # bit is set, not normalised to 1).
    t.is_alpha = lex.flags & (1 << IS_ALPHA)
    t.is_digit = lex.flags & (1 << IS_DIGIT)
    t.is_lower = lex.flags & (1 << IS_LOWER)
    t.is_punct = lex.flags & (1 << IS_PUNCT)
    t.is_space = lex.flags & (1 << IS_SPACE)
    t.is_title = lex.flags & (1 << IS_TITLE)
    t.is_upper = lex.flags & (1 << IS_UPPER)

    t.like_url = lex.flags & (1 << LIKE_URL)
    t.like_number = lex.flags & (1 << LIKE_NUMBER)

    t.oft_lower = lex.flags & (1 << OFT_LOWER)
    t.oft_title = lex.flags & (1 << OFT_TITLE)
    t.oft_upper = lex.flags & (1 << OFT_UPPER)

    t.in_males = lex.flags & (1 << IN_MALES)
    t.in_females = lex.flags & (1 << IN_FEMALES)
    t.in_surnames = lex.flags & (1 << IN_SURNAMES)
    t.in_places = lex.flags & (1 << IN_PLACES)
    t.in_games = lex.flags & (1 << IN_GAMES)
    t.in_celebs = lex.flags & (1 << IN_CELEBS)
    t.in_names = lex.flags & (1 << IN_NAMES)

    # Values supplied by the caller.
    t.pos = pos
    t.ner = ner
|
|
|
|
|
|
|
|
|
2014-11-05 09:45:29 +00:00
|
|
|
cdef int _flatten_token(atom_t* context, Token ids, Token vals) except -1:
    # Copy every field of ``vals`` into the flat ``context`` array.
    #
    # ``ids`` carries, in each field, the array index at which the same-named
    # field of ``vals`` is stored. -1 is reserved as the error return value.

    # Lexical attributes.
    context[ids.sic] = vals.sic
    context[ids.cluster] = vals.cluster
    context[ids.norm] = vals.norm
    context[ids.shape] = vals.shape
    context[ids.asciied] = vals.asciied
    context[ids.prefix] = vals.prefix
    context[ids.suffix] = vals.suffix
    context[ids.length] = vals.length

    # Type attributes.
    context[ids.postype] = vals.postype
    context[ids.nertype] = vals.nertype
    context[ids.sensetype] = vals.sensetype

    # Orthographic flags.
    context[ids.is_alpha] = vals.is_alpha
    context[ids.is_ascii] = vals.is_ascii
    context[ids.is_digit] = vals.is_digit
    context[ids.is_lower] = vals.is_lower
    context[ids.is_punct] = vals.is_punct
    # is_space is numbered by _number_token and set by fill_token; it was
    # missing here, which left its slot in ``context`` unwritten.
    context[ids.is_space] = vals.is_space
    context[ids.is_title] = vals.is_title
    context[ids.is_upper] = vals.is_upper

    context[ids.like_url] = vals.like_url
    context[ids.like_number] = vals.like_number

    context[ids.oft_lower] = vals.oft_lower
    context[ids.oft_title] = vals.oft_title
    context[ids.oft_upper] = vals.oft_upper

    # List-membership flags.
    context[ids.in_males] = vals.in_males
    context[ids.in_females] = vals.in_females
    context[ids.in_surnames] = vals.in_surnames
    context[ids.in_places] = vals.in_places
    context[ids.in_games] = vals.in_games
    context[ids.in_celebs] = vals.in_celebs
    context[ids.in_names] = vals.in_names

    # Annotation values.
    context[ids.pos] = vals.pos
    context[ids.sense] = vals.sense
    context[ids.ner] = vals.ner
|
|
|
|
|
|
|
|
|
2014-11-05 09:45:29 +00:00
|
|
|
cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0:
    # Load the nine slots of ``s`` from positions i-4 .. i+4 of ``tokens``,
    # reading the parallel ``lex``, ``pos`` and ``ner`` sequences at each
    # position. Always returns 1; 0 is reserved as the error return value.
    #
    # NOTE(review): no bounds checks are made here - presumably the sequences
    # in ``tokens`` are valid over the whole i-4 .. i+4 range; confirm against
    # the callers.
    cdef int j = i - 4

    # Preceding positions, furthest first.
    fill_token(s.P4, tokens.lex[j], tokens.pos[j], tokens.ner[j])
    j += 1
    fill_token(s.P3, tokens.lex[j], tokens.pos[j], tokens.ner[j])
    j += 1
    fill_token(s.P2, tokens.lex[j], tokens.pos[j], tokens.ner[j])
    j += 1
    fill_token(s.P1, tokens.lex[j], tokens.pos[j], tokens.ner[j])
    j += 1

    # Current position (j == i).
    fill_token(s.N0, tokens.lex[j], tokens.pos[j], tokens.ner[j])
    j += 1

    # Following positions, nearest first.
    fill_token(s.N1, tokens.lex[j], tokens.pos[j], tokens.ner[j])
    j += 1
    fill_token(s.N2, tokens.lex[j], tokens.pos[j], tokens.ner[j])
    j += 1
    fill_token(s.N3, tokens.lex[j], tokens.pos[j], tokens.ner[j])
    j += 1
    fill_token(s.N4, tokens.lex[j], tokens.pos[j], tokens.ner[j])

    return 1
|
2014-11-05 08:55:10 +00:00
|
|
|
|
|
|
|
|
2014-11-05 09:45:29 +00:00
|
|
|
cdef int fill_flat(atom_t* context, Slots s) except -1:
    # Write all nine tokens of ``s`` into the flat ``context`` array, using
    # the matching token of FIELD_IDS to supply the array index of each field.
    # -1 is reserved as the error return value.
    _flatten_token(context, FIELD_IDS.P4, s.P4)
    _flatten_token(context, FIELD_IDS.P3, s.P3)
    _flatten_token(context, FIELD_IDS.P2, s.P2)
    _flatten_token(context, FIELD_IDS.P1, s.P1)
    _flatten_token(context, FIELD_IDS.N0, s.N0)
    _flatten_token(context, FIELD_IDS.N1, s.N1)
    _flatten_token(context, FIELD_IDS.N2, s.N2)
    # The N3 ids must be paired with the N3 values; this previously passed
    # s.N4, so the N3 section of the context duplicated N4.
    _flatten_token(context, FIELD_IDS.N3, s.N3)
    _flatten_token(context, FIELD_IDS.N4, s.N4)
|
2014-11-05 08:55:10 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Module-level field numbering. FIELD_IDS is a Slots whose Token fields hold
# context-array indices rather than values. The calls below must stay in this
# order, because each one continues numbering where the previous one stopped;
# afterwards N_FIELDS is the total number of indices handed out.
N_FIELDS = 0
FIELD_IDS = Slots()

# Preceding positions.
_number_token(FIELD_IDS.P4, &N_FIELDS)
_number_token(FIELD_IDS.P3, &N_FIELDS)
_number_token(FIELD_IDS.P2, &N_FIELDS)
_number_token(FIELD_IDS.P1, &N_FIELDS)

# Current position.
_number_token(FIELD_IDS.N0, &N_FIELDS)

# Following positions.
_number_token(FIELD_IDS.N1, &N_FIELDS)
_number_token(FIELD_IDS.N2, &N_FIELDS)
_number_token(FIELD_IDS.N3, &N_FIELDS)
_number_token(FIELD_IDS.N4, &N_FIELDS)
|