From 8e4c69ee8c2bee75e55a364d69ff9ce39ca60b65 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 27 Jul 2015 01:50:06 +0200 Subject: [PATCH] * Add is_oov property, and fix up handling of attributes --- spacy/attrs.pxd | 15 ++------------- spacy/en/__init__.py | 4 ++-- spacy/en/attrs.pxd | 4 +++- spacy/lexeme.pyx | 4 ++++ spacy/tokens/token.pyx | 7 ++++--- spacy/vocab.pyx | 4 ++-- 6 files changed, 17 insertions(+), 21 deletions(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 578f2040b..2c3e2849d 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -12,19 +12,8 @@ cpdef enum attr_id_t: LIKE_NUM LIKE_EMAIL IS_STOP - FLAG0 - FLAG1 - FLAG2 - FLAG3 - FLAG4 - FLAG5 - FLAG6 - FLAG7 - FLAG8 - FLAG9 - FLAG10 - FLAG11 - FLAG12 + IS_OOV + FLAG13 FLAG14 FLAG15 diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 4eccb1d37..52850ab2c 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -25,9 +25,9 @@ from ..util import read_lang_data from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB -def get_lex_props(string, oov_prob=-30): +def get_lex_props(string, oov_prob=-30, is_oov=False): return { - 'flags': get_flags(string), + 'flags': get_flags(string, is_oov=is_oov), 'length': len(string), 'orth': string, 'lower': string.lower(), diff --git a/spacy/en/attrs.pxd b/spacy/en/attrs.pxd index fcf12ca82..25488220c 100644 --- a/spacy/en/attrs.pxd +++ b/spacy/en/attrs.pxd @@ -1,4 +1,4 @@ -from ..attrs cimport FLAG11, FLAG12, FLAG13, FLAG14 +from ..attrs cimport FLAG13, FLAG14 from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21 from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28 from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32 @@ -10,6 +10,7 @@ from ..attrs cimport IS_PUNCT as _IS_PUNCT from ..attrs cimport IS_SPACE as _IS_SPACE from ..attrs cimport IS_TITLE as _IS_TITLE from ..attrs cimport IS_UPPER as _IS_UPPER +from ..attrs cimport IS_OOV as _IS_OOV from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL from ..attrs cimport LIKE_URL as _LIKE_URL from ..attrs cimport LIKE_NUM as _LIKE_NUM @@ -43,6 +44,7 @@ cpdef enum: LIKE_NUM = _LIKE_NUM LIKE_EMAIL = _LIKE_EMAIL IS_STOP = _IS_STOP + IS_OOV = _IS_OOV ORTH = _ORTH SHAPE = _SHAPE diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index fc6261049..07f151114 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -11,6 +11,7 @@ import numpy from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP +from .attrs cimport IS_OOV memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) @@ -51,6 +52,9 @@ cdef class Lexeme: cdef flags_t one = 1 return self.flags & (one << flag_id) + property is_oov: + def __get__(self): return self.check_flag(IS_OOV) + property is_alpha: def __get__(self): return self.check_flag(IS_ALPHA) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 6e7ac7bf6..6aa000f05 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -18,8 +18,7 @@ from ..parts_of_speech cimport CONJ, PUNCT from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP - - +from ..attrs cimport IS_OOV cdef class Token: @@ -286,7 +285,9 @@ cdef class Token: def __get__(self): return self.vocab.strings[self.c.dep] - + property is_oov: + def __get__(self): return check_flag(self.c.lex, IS_OOV) + property is_alpha: def __get__(self): return check_flag(self.c.lex, IS_ALPHA) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 86ae7d338..ac2e11e11 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -95,7 +95,7 @@ cdef class Vocab: if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string, self.oov_prob) + props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) if is_oov: lex.id = 0 @@ -119,7 +119,7 @@ cdef class Vocab: if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string) + props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) if is_oov: lex.id = 0