From 0a8c84625d9dfce50b11bb17dc65cdbdcb1d6a36 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 5 Nov 2014 19:55:10 +1100 Subject: [PATCH] * Moving feature context stuff to a generalized place --- spacy/context.pxd | 43 ++++++++++++++++++++++ spacy/context.pyx | 93 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 spacy/context.pxd create mode 100644 spacy/context.pyx diff --git a/spacy/context.pxd b/spacy/context.pxd new file mode 100644 index 000000000..9d7cf000f --- /dev/null +++ b/spacy/context.pxd @@ -0,0 +1,43 @@ +from thinc.typedefs cimport atom_t +from .typedefs cimport hash_t +from .tokens cimport Tokens +from .lexeme cimport Lexeme + + +cdef struct Token: + atom_t i + atom_t c + atom_t w + atom_t shape + atom_t pref + atom_t suff + atom_t oft_title + atom_t oft_upper + atom_t is_alpha + atom_t is_digit + atom_t is_title + atom_t is_upper + + atom_t url + atom_t num + + atom_t postype + atom_t pos + atom_t ner + + +cdef struct Slots: + Token P2 + Token P1 + Token N0 + Token N1 + Token N2 + + +cdef Slots FIELD_IDS +cdef int N_FIELDS + + +cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0 + +cdef int fill_flat(atom_t* context, Slots* s) except -1 diff --git a/spacy/context.pyx b/spacy/context.pyx new file mode 100644 index 000000000..465dd32c3 --- /dev/null +++ b/spacy/context.pyx @@ -0,0 +1,93 @@ +from murmurhash.mrmr cimport hash64 +from .lexeme cimport * + + +cdef void _number_token(Token* t, int* n_fields): + cdef int i = n_fields[0] + t.i = i; i += 1 + t.c = i; i += 1 + t.w = i; i += 1 + t.shape = i; i += 1 + t.pref = i; i += 1 + t.suff = i; i += 1 + t.oft_title = i; i += 1 + t.oft_upper = i; i += 1 + t.is_alpha = i; i += 1 + t.is_digit = i; i += 1 + t.is_title = i; i += 1 + t.is_upper = i; i += 1 + + t.url = i; i += 1 + t.num = i; i += 1 + + t.postype = i; i += 1 + t.pos = i; i += 1 + t.ner = i; i += 1 + + n_fields[0] = i + + +cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner): + t.i = lex.sic + t.c = lex.cluster + t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape + t.shape = lex.shape + t.pref = lex.prefix + t.suff = lex.suffix + + t.oft_title = lex.flags & (1 << OFT_TITLE) + t.oft_upper = lex.flags & (1 << OFT_UPPER) + t.is_alpha = lex.flags & (1 << IS_ALPHA) + t.is_digit = lex.flags & (1 << IS_DIGIT) + t.is_title = lex.flags & (1 << IS_TITLE) + t.is_upper = lex.flags & (1 << IS_UPPER) + t.url = lex.flags & (1 << LIKE_URL) + t.num = lex.flags & (1 << LIKE_NUMBER) + t.postype = lex.postype + t.pos = pos + t.ner = ner + + +cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1: + context[ids.i] = vals.i + context[ids.c] = vals.c + context[ids.w] = vals.w + context[ids.shape] = vals.shape + context[ids.pref] = vals.pref + context[ids.suff] = vals.suff + context[ids.oft_title] = vals.oft_title + context[ids.oft_upper] = vals.oft_upper + context[ids.is_alpha] = vals.is_alpha + context[ids.is_digit] = vals.is_digit + context[ids.is_title] = vals.is_title + context[ids.is_upper] = vals.is_upper + context[ids.url] = vals.url + context[ids.num] = vals.num + context[ids.postype] = vals.postype + context[ids.pos] = vals.pos + context[ids.ner] = vals.ner + + +cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0: + fill_token(&s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2]) + fill_token(&s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1]) + fill_token(&s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i]) + fill_token(&s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1]) + fill_token(&s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2]) + return hash64(s, sizeof(Slots), 0) + + +cdef int fill_flat(atom_t* context, Slots* s) except -1: + _flatten_token(context, &FIELD_IDS.P2, &s.P2) + _flatten_token(context, &FIELD_IDS.P1, &s.P1) + _flatten_token(context, &FIELD_IDS.N0, &s.N0) + _flatten_token(context, &FIELD_IDS.N1, &s.N1) + _flatten_token(context, &FIELD_IDS.N2, &s.N2) + + +N_FIELDS = 0 +_number_token(&FIELD_IDS.P2, &N_FIELDS) +_number_token(&FIELD_IDS.P1, &N_FIELDS) +_number_token(&FIELD_IDS.N0, &N_FIELDS) +_number_token(&FIELD_IDS.N1, &N_FIELDS) +_number_token(&FIELD_IDS.N2, &N_FIELDS)