From e1b1f45cc942d59bf68513176669f9021695bae7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 4 Dec 2014 20:46:20 +1100 Subject: [PATCH] * Add STEM attribute to lexeme --- spacy/lexeme.pxd | 36 +++++++++++++++++++++++++++++++++--- spacy/lexeme.pyx | 32 ++------------------------------ 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index a998aeedb..ef0e8fb12 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -72,7 +72,8 @@ cpdef enum attr_id_t: ID SIC - NORM + STEM + DENSE SHAPE ASCIIED PREFIX @@ -89,7 +90,8 @@ cdef struct Lexeme: attr_t id attr_t sic - attr_t norm + attr_t stem + attr_t dense attr_t shape attr_t asciied attr_t prefix @@ -116,4 +118,32 @@ cdef inline bint check_flag(const Lexeme* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) -cdef attr_t get_attr(const Lexeme* lex, attr_id_t attr_id) +cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil: + if feat_name < (sizeof(flags_t) * 8): + return check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == SIC: + return lex.sic + elif feat_name == DENSE: + return lex.dense + elif feat_name == STEM: + return lex.stem + elif feat_name == SHAPE: + return lex.shape + elif feat_name == ASCIIED: + return lex.asciied + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + elif feat_name == POS_TYPE: + return lex.pos_type + elif feat_name == SENSE_TYPE: + return lex.sense_type + else: + return 0 diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 2090ece50..5c8d7a60e 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -27,38 +27,10 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed, lex.prefix = string_store[string[:1]] lex.suffix = string_store[string[-3:]] - lex.norm = lex.sic # TODO lex.shape = string_store[orth.word_shape(string)] + lex.dense = lex.sic if lex.prob >= -10 else lex.shape + lex.stem = string_store[props.get('stem', string)] lex.asciied = string_store[orth.asciied(string)] lex.flags = props.get('flags', 0) return lex - - -cdef attr_t get_attr(const Lexeme* lex, attr_id_t feat_name): - if feat_name < (sizeof(flags_t) * 8): - return check_flag(lex, feat_name) - elif feat_name == ID: - return lex.id - elif feat_name == SIC: - return lex.sic - elif feat_name == NORM: - return lex.norm - elif feat_name == SHAPE: - return lex.shape - elif feat_name == ASCIIED: - return lex.asciied - elif feat_name == PREFIX: - return lex.prefix - elif feat_name == SUFFIX: - return lex.suffix - elif feat_name == LENGTH: - return lex.length - elif feat_name == CLUSTER: - return lex.cluster - elif feat_name == POS_TYPE: - return lex.pos_type - elif feat_name == SENSE_TYPE: - return lex.sense_type - else: - raise StandardError('Feature ID: %d not found' % feat_name)