From 6bb96c122d25db2104cf815dca5c09d28e4fdf8d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 26 Jul 2015 16:37:16 +0200 Subject: [PATCH] * Host IS_ flags in attrs.pxd, and add properties for them on Token and Lexeme objects --- spacy/attrs.pxd | 12 ++++++++ spacy/en/attrs.pxd | 63 +++++++++++++++++------------------------- spacy/en/attrs.pyx | 3 +- spacy/lexeme.pxd | 2 +- spacy/lexeme.pyx | 38 +++++++++++++++++++++++-- spacy/orth.pxd | 1 + spacy/orth.pyx | 5 ++++ spacy/tokens/token.pyx | 36 ++++++++++++++++++++++++ spacy/vocab.pyx | 2 ++ 9 files changed, 120 insertions(+), 42 deletions(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 515c33e26..578f2040b 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -1,5 +1,17 @@ # Reserve 64 values for flag features cpdef enum attr_id_t: + IS_ALPHA + IS_ASCII + IS_DIGIT + IS_LOWER + IS_PUNCT + IS_SPACE + IS_TITLE + IS_UPPER + LIKE_URL + LIKE_NUM + LIKE_EMAIL + IS_STOP FLAG0 FLAG1 FLAG2 diff --git a/spacy/en/attrs.pxd b/spacy/en/attrs.pxd index 4d19fd119..fcf12ca82 100644 --- a/spacy/en/attrs.pxd +++ b/spacy/en/attrs.pxd @@ -1,8 +1,19 @@ -from ..attrs cimport FLAG0, FLAG1, FLAG2, FLAG3, FLAG4, FLAG5, FLAG6, FLAG7 -from ..attrs cimport FLAG8, FLAG9, FLAG10, FLAG11, FLAG12, FLAG13, FLAG14 +from ..attrs cimport FLAG11, FLAG12, FLAG13, FLAG14 from ..attrs cimport FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21 from ..attrs cimport FLAG22, FLAG23, FLAG24, FLAG25, FLAG26, FLAG27, FLAG28 from ..attrs cimport FLAG29, FLAG30, FLAG31, FLAG32 +from ..attrs cimport IS_ALPHA as _IS_ALPHA +from ..attrs cimport IS_DIGIT as _IS_DIGIT +from ..attrs cimport IS_ASCII as _IS_ASCII +from ..attrs cimport IS_LOWER as _IS_LOWER +from ..attrs cimport IS_PUNCT as _IS_PUNCT +from ..attrs cimport IS_SPACE as _IS_SPACE +from ..attrs cimport IS_TITLE as _IS_TITLE +from ..attrs cimport IS_UPPER as _IS_UPPER +from ..attrs cimport LIKE_EMAIL as _LIKE_EMAIL +from ..attrs cimport LIKE_URL as _LIKE_URL +from ..attrs cimport LIKE_NUM as _LIKE_NUM +from ..attrs cimport IS_STOP as _IS_STOP from ..attrs cimport ORTH as _ORTH from ..attrs cimport SHAPE as _SHAPE from ..attrs cimport LOWER as _LOWER @@ -20,43 +31,19 @@ from ..attrs cimport ENT_TYPE as _ENT_TYPE cpdef enum: - IS_ALPHA = FLAG0 - IS_ASCII = FLAG1 - IS_DIGIT = FLAG2 - IS_LOWER = FLAG3 - IS_PUNCT = FLAG4 - IS_SPACE = FLAG5 - IS_TITLE = FLAG6 - IS_UPPER = FLAG7 - LIKE_URL = FLAG8 - LIKE_NUM = FLAG9 - IS_STOP = FLAG10 + IS_ALPHA = _IS_ALPHA + IS_ASCII = _IS_ASCII + IS_DIGIT = _IS_DIGIT + IS_LOWER = _IS_LOWER + IS_PUNCT = _IS_PUNCT + IS_SPACE = _IS_SPACE + IS_TITLE = _IS_TITLE + IS_UPPER = _IS_UPPER + LIKE_URL = _LIKE_URL + LIKE_NUM = _LIKE_NUM + LIKE_EMAIL = _LIKE_EMAIL + IS_STOP = _IS_STOP - EMO_POS = FLAG11 - EMO_NEG = FLAG12 - - EMO_ANGER = FLAG13 - EMO_APATE = FLAG14 - EMO_DISGUST = FLAG15 - EMO_FEAR = FLAG16 - EMO_JOY = FLAG17 - EMO_SAD = FLAG18 - EMO_SURPRISE = FLAG19 - EMO_TRUST = FLAG20 - - CLR_NONE = FLAG21 - CLR_BLACK = FLAG22 - CLR_BLUE = FLAG23 - CLR_BROWN = FLAG24 - CLR_GREEN = FLAG25 - CLR_GREY = FLAG26 - CLR_ORANGE = FLAG27 - CLR_PURPLE = FLAG28 - CLR_PINK = FLAG29 - CLR_RED = FLAG30 - CLR_WHITE = FLAG31 - CLR_YELLOW = FLAG32 - ORTH = _ORTH SHAPE = _SHAPE LOWER = _LOWER diff --git a/spacy/en/attrs.pyx b/spacy/en/attrs.pyx index a33e51e23..b74dcfd2a 100644 --- a/spacy/en/attrs.pyx +++ b/spacy/en/attrs.pyx @@ -1,6 +1,6 @@ # cython: embedsignature=True from ..orth cimport is_alpha, is_ascii, is_digit, is_lower, is_punct, is_space -from ..orth cimport is_title, is_upper, like_url, like_number +from ..orth cimport is_title, is_upper, like_url, like_number, like_email from ..typedefs cimport flags_t @@ -16,4 +16,5 @@ def get_flags(unicode string): flags |= is_upper(string) << IS_UPPER flags |= like_url(string) << LIKE_URL flags |= like_number(string) << LIKE_NUM + flags |= like_email(string) << LIKE_EMAIL return flags diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index a8deb3c52..e0c99b3e6 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -72,7 +72,7 @@ cdef class Lexeme: py.sentiment = ptr.sentiment return py - cpdef bint check(self, attr_id_t flag_id) except -1 + cpdef bint check_flag(self, attr_id_t flag_id) except -1 cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 2ee0a7714..fc6261049 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -9,6 +9,9 @@ from .orth cimport word_shape from .typedefs cimport attr_t, flags_t import numpy +from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE +from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP + memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) @@ -44,5 +47,36 @@ cdef class Lexeme: def has_repvec(self): return self.l2_norm != 0 - cpdef bint check(self, attr_id_t flag_id) except -1: - return self.flags & (1 << flag_id) + cpdef bint check_flag(self, attr_id_t flag_id) except -1: + cdef flags_t one = 1 + return self.flags & (one << flag_id) + + property is_alpha: + def __get__(self): return self.check_flag(IS_ALPHA) + + property is_ascii: + def __get__(self): return self.check_flag(IS_ASCII) + + property is_digit: + def __get__(self): return self.check_flag(IS_DIGIT) + + property is_lower: + def __get__(self): return self.check_flag(IS_LOWER) + + property is_title: + def __get__(self): return self.check_flag(IS_TITLE) + + property is_punct: + def __get__(self): return self.check_flag(IS_PUNCT) + + property is_space: + def __get__(self): return self.check_flag(IS_SPACE) + + property like_url: + def __get__(self): return self.check_flag(LIKE_URL) + + property like_num: + def __get__(self): return self.check_flag(LIKE_NUM) + + property like_email: + def __get__(self): return self.check_flag(LIKE_EMAIL) diff --git a/spacy/orth.pxd b/spacy/orth.pxd index 20ac8545a..1b990b043 100644 --- a/spacy/orth.pxd +++ b/spacy/orth.pxd @@ -6,6 +6,7 @@ cpdef bint is_ascii(unicode string) cpdef bint is_title(unicode string) cpdef bint is_lower(unicode string) cpdef bint is_upper(unicode string) +cpdef bint like_email(unicode string) cpdef bint like_url(unicode string) cpdef bint like_number(unicode string) cpdef unicode word_shape(unicode string) diff --git a/spacy/orth.pyx b/spacy/orth.pyx index b2273f665..6ffac839b 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -111,6 +111,11 @@ cpdef bint like_number(unicode string): return False +_like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match +cpdef bint like_email(unicode string): + return _like_email(string) + + cpdef unicode word_shape(unicode string): if len(string) >= 100: return 'LONG' diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index bbc12d549..6e7ac7bf6 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -16,6 +16,11 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST from ..attrs cimport POS, LEMMA, TAG, DEP from ..parts_of_speech cimport CONJ, PUNCT +from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE +from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP + + + cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. Created @@ -281,5 +286,36 @@ cdef class Token: def __get__(self): return self.vocab.strings[self.c.dep] + + property is_alpha: + def __get__(self): return check_flag(self.c.lex, IS_ALPHA) + + property is_ascii: + def __get__(self): return check_flag(self.c.lex, IS_ASCII) + + property is_digit: + def __get__(self): return check_flag(self.c.lex, IS_DIGIT) + + property is_lower: + def __get__(self): return check_flag(self.c.lex, IS_LOWER) + + property is_title: + def __get__(self): return check_flag(self.c.lex, IS_TITLE) + + property is_punct: + def __get__(self): return check_flag(self.c.lex, IS_PUNCT) + + property is_space: + def __get__(self): return check_flag(self.c.lex, IS_SPACE) + + property like_url: + def __get__(self): return check_flag(self.c.lex, LIKE_URL) + + property like_num: + def __get__(self): return check_flag(self.c.lex, LIKE_NUM) + + property like_email: + def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) + _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3dfa1d658..a2441df27 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -38,6 +38,8 @@ cdef class Vocab: ''' def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True, pos_tags=None, oov_prob=-30): + if oov_prob is None: + oov_prob = -30 self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap()