From f5c4f2eb52c3618c4dda056c0171b21b1b7a0e63 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 7 Dec 2014 15:28:22 +1100
Subject: [PATCH] * Revise context, focussing on POS tagging for now

---
 spacy/context.pxd |  97 ++++++++++++++------------------
 spacy/context.pyx | 138 +++++-----------------------------------------
 2 files changed, 54 insertions(+), 181 deletions(-)

diff --git a/spacy/context.pxd b/spacy/context.pxd
index 8f798d347..3dd842b6e 100644
--- a/spacy/context.pxd
+++ b/spacy/context.pxd
@@ -1,66 +1,49 @@
 from thinc.typedefs cimport atom_t
-from .typedefs cimport hash_t
-from .tokens cimport Tokens
-from .lexeme cimport Lexeme
+from .tokens cimport TokenC
 
 
-cdef class Token:
-    cdef readonly atom_t sic
-    cdef readonly atom_t cluster
-    cdef readonly atom_t norm
-    cdef readonly atom_t shape
-    cdef readonly atom_t asciied
-    cdef readonly atom_t prefix
-    cdef readonly atom_t suffix
-    cdef readonly atom_t length
+cpdef enum:
+    P2_sic
+    P2_cluster
+    P2_shape
+    P2_prefix
+    P2_suffix
+    P2_pos
+    P2_sense
 
-    cdef readonly atom_t postype
-    cdef readonly atom_t nertype
-    cdef readonly atom_t sensetype
+    P1_sic
+    P1_cluster
+    P1_shape
+    P1_prefix
+    P1_suffix
+    P1_pos
+    P1_sense
 
-    cdef readonly atom_t is_alpha
-    cdef readonly atom_t is_ascii
-    cdef readonly atom_t is_digit
-    cdef readonly atom_t is_lower
-    cdef readonly atom_t is_punct
-    cdef readonly atom_t is_space
-    cdef readonly atom_t is_title
-    cdef readonly atom_t is_upper
-    cdef readonly atom_t like_url
-    cdef readonly atom_t like_number
-    cdef readonly atom_t oft_lower
-    cdef readonly atom_t oft_title
-    cdef readonly atom_t oft_upper
+    W_sic
+    W_cluster
+    W_shape
+    W_prefix
+    W_suffix
+    W_pos
+    W_sense
 
-    cdef readonly atom_t in_males
-    cdef readonly atom_t in_females
-    cdef readonly atom_t in_surnames
-    cdef readonly atom_t in_places
-    cdef readonly atom_t in_games
-    cdef readonly atom_t in_celebs
-    cdef readonly atom_t in_names
+    N1_sic
+    N1_cluster
+    N1_shape
+    N1_prefix
+    N1_suffix
+    N1_pos
+    N1_sense
 
-    cdef readonly atom_t pos
-    cdef readonly atom_t sense
-    cdef readonly atom_t ner
+    N2_sic
+    N2_cluster
+    N2_shape
+    N2_prefix
+    N2_suffix
+    N2_pos
+    N2_sense
+
+    N_FIELDS
 
 
-cdef class Slots:
-    cdef readonly Token P4
-    cdef readonly Token P3
-    cdef readonly Token P2
-    cdef readonly Token P1
-    cdef readonly Token N0
-    cdef readonly Token N1
-    cdef readonly Token N2
-    cdef readonly Token N3
-    cdef readonly Token N4
-
-
-cdef int N_FIELDS
-
-
-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1
-
-
-cpdef Slots FIELD_IDS
+cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1
diff --git a/spacy/context.pyx b/spacy/context.pyx
index aeb78ae5c..c81daef2c 100644
--- a/spacy/context.pyx
+++ b/spacy/context.pyx
@@ -1,126 +1,16 @@
-from murmurhash.mrmr cimport hash64
-from .lexeme cimport *
+cdef int fill_context(atom_t[N_FIELDS] context, const int i, TokenC* tokens) except -1:
+    _fill_from_token(&context[P2_sic], &tokens[i-2])
+    _fill_from_token(&context[P1_sic], &tokens[i-1])
+    _fill_from_token(&context[W_sic], &tokens[i])
+    _fill_from_token(&context[N1_sic], &tokens[i+1])
+    _fill_from_token(&context[N2_sic], &tokens[i+2])
 
 
-cdef class Slots:
-    def __init__(self):
-        self.P4 = Token()
-        self.P3 = Token()
-        self.P2 = Token()
-        self.P1 = Token()
-        self.N0 = Token()
-        self.N1 = Token()
-        self.N2 = Token()
-        self.N3 = Token()
-        self.N4 = Token()
-
-
-cdef void _number_token(Token t, int* n_fields):
-    cdef int i = n_fields[0]
-    t.sic = i; i += 1
-    t.cluster = i; i += 1
-    t.norm = i; i += 1
-    t.shape = i; i += 1
-    t.prefix = i; i += 1
-    t.suffix = i; i += 1
-    t.length = i; i += 1
-
-    t.postype = i; i += 1
-    t.nertype = i; i += 1
-    t.sensetype = i; i += 1
-
-    t.is_alpha = i; i += 1
-    t.is_ascii = i; i += 1
-    t.is_digit = i; i += 1
-    t.is_lower = i; i += 1
-    t.is_punct = i; i += 1
-    t.is_space = i; i += 1
-    t.is_title = i; i += 1
-    t.is_upper = i; i += 1
-
-    t.like_number = i; i += 1
-    t.like_url = i; i += 1
-
-    t.oft_lower = i; i += 1
-    t.oft_title = i; i += 1
-    t.oft_upper = i; i += 1
-
-    t.in_males = i; i += 1
-    t.in_females = i; i += 1
-    t.in_surnames = i; i += 1
-    t.in_places = i; i += 1
-    t.in_games = i; i += 1
-    t.in_celebs = i; i += 1
-    t.in_names = i; i += 1
-
-    t.pos = i; i += 1
-    t.sense = i; i += 1
-    t.ner = i; i += 1
-
-    n_fields[0] = i
-
-
-cdef int _fill_token(atom_t* c, Token t, Lexeme* lex, atom_t pos, atom_t ner):
-    c[t.sic] = lex.sic
-    c[t.cluster] = lex.cluster
-    c[t.norm] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
-    c[t.shape] = lex.shape
-    c[t.asciied] = lex.asciied
-    c[t.prefix] = lex.prefix
-    c[t.suffix] = lex.suffix
-    c[t.length] = lex.length
-
-    c[t.postype] = lex.postype
-    c[t.nertype] = 0
-    c[t.sensetype] = 0
-
-    c[t.is_alpha] = lex.flags & (1 << IS_ALPHA)
-    c[t.is_digit] = lex.flags & (1 << IS_DIGIT)
-    c[t.is_lower] = lex.flags & (1 << IS_LOWER)
-    c[t.is_punct] = lex.flags & (1 << IS_PUNCT)
-    c[t.is_space] = lex.flags & (1 << IS_SPACE)
-    c[t.is_title] = lex.flags & (1 << IS_TITLE)
-    c[t.is_upper] = lex.flags & (1 << IS_UPPER)
-    c[t.like_url] = lex.flags & (1 << LIKE_URL)
-    c[t.like_number] = lex.flags & (1 << LIKE_NUMBER)
-    c[t.oft_lower] = lex.flags & (1 << OFT_LOWER)
-    c[t.oft_title] = lex.flags & (1 << OFT_TITLE)
-    c[t.oft_upper] = lex.flags & (1 << OFT_UPPER)
-
-    c[t.in_males] = lex.flags & (1 << IN_MALES)
-    c[t.in_females] = lex.flags & (1 << IN_FEMALES)
-    c[t.in_surnames] = lex.flags & (1 << IN_SURNAMES)
-    c[t.in_places] = lex.flags & (1 << IN_PLACES)
-    c[t.in_games] = lex.flags & (1 << IN_GAMES)
-    c[t.in_celebs] = lex.flags & (1 << IN_CELEBS)
-    c[t.in_names] = lex.flags & (1 << IN_NAMES)
-
-    c[t.pos] = pos
-    c[t.sense] = 0
-    c[t.ner] = ner
-
-
-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
-    _fill_token(context, FIELD_IDS.P4, tokens.lex[i-4], tokens.pos[i-4], tokens.ner[i-4])
-    _fill_token(context, FIELD_IDS.P3, tokens.lex[i-3], tokens.pos[i-3], tokens.ner[i-3])
-    _fill_token(context, FIELD_IDS.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
-    _fill_token(context, FIELD_IDS.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
-    _fill_token(context, FIELD_IDS.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
-    _fill_token(context, FIELD_IDS.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
-    _fill_token(context, FIELD_IDS.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
-    _fill_token(context, FIELD_IDS.N3, tokens.lex[i+3], tokens.pos[i+3], tokens.ner[i+3])
-    _fill_token(context, FIELD_IDS.N4, tokens.lex[i+4], tokens.pos[i+4], tokens.ner[i+4])
-    return 1
-
-
-N_FIELDS = 0
-FIELD_IDS = Slots()
-_number_token(FIELD_IDS.P4, &N_FIELDS)
-_number_token(FIELD_IDS.P3, &N_FIELDS)
-_number_token(FIELD_IDS.P2, &N_FIELDS)
-_number_token(FIELD_IDS.P1, &N_FIELDS)
-_number_token(FIELD_IDS.N0, &N_FIELDS)
-_number_token(FIELD_IDS.N1, &N_FIELDS)
-_number_token(FIELD_IDS.N2, &N_FIELDS)
-_number_token(FIELD_IDS.N3, &N_FIELDS)
-_number_token(FIELD_IDS.N4, &N_FIELDS)
+cdef inline void _fill_from_token(atom_t[N_FIELDS] context, const TokenC* t) nogil:
+    context[0] = t.lex.sic
+    context[1] = t.lex.cluster
+    context[2] = t.lex.shape
+    context[3] = t.lex.prefix
+    context[4] = t.lex.suffix
+    context[5] = t.pos
+    context[6] = t.sense