diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index d0f476dcd..61a00ba1b 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -14,7 +14,7 @@ cpdef enum attr_id_t: LIKE_EMAIL IS_STOP IS_OOV - + FLAG14 = 14 FLAG15 FLAG16 @@ -85,3 +85,11 @@ cpdef enum attr_id_t: HEAD SPACY PROB + +# Move these up to FLAG14--FLAG17 once we finish the functionality and +# are ready to regenerate the model +#IS_BRACKET +#IS_QUOTE +#IS_LEFT_PUNCT +#IS_RIGHT_PUNCT + diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 3595fbf22..146f3ab26 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -13,7 +13,6 @@ IDS = { "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, "IS_OOV": IS_OOV, - "FLAG14": FLAG14, "FLAG15": FLAG15, "FLAG16": FLAG16, diff --git a/spacy/language.py b/spacy/language.py index 36a56413a..e85854735 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -82,6 +82,22 @@ class Language(object): def is_title(string): return orth.is_title(string) + @staticmethod + def is_bracket(string): + return orth.is_bracket(string) + + @staticmethod + def is_quote(string): + return orth.is_quote(string) + + @staticmethod + def is_left_punct(string): + return orth.is_left_punct(string) + + @staticmethod + def is_right_punct(string): + return orth.is_right_punct(string) + @staticmethod def is_upper(string): return orth.is_upper(string) @@ -121,6 +137,10 @@ class Language(object): attrs.IS_SPACE: cls.is_space, attrs.IS_TITLE: cls.is_title, attrs.IS_UPPER: cls.is_upper, + attrs.FLAG14: cls.is_bracket, + attrs.FLAG15: cls.is_quote, + attrs.FLAG16: cls.is_left_punct, + attrs.FLAG17: cls.is_right_punct, attrs.LIKE_URL: cls.like_url, attrs.LIKE_NUM: cls.like_num, attrs.LIKE_EMAIL: cls.like_email, diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 845b29314..1aec4a018 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -18,6 +18,10 @@ import numpy from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, 
LIKE_EMAIL, IS_STOP +from .attrs cimport FLAG14 as IS_BRACKET +from .attrs cimport FLAG15 as IS_QUOTE +from .attrs cimport FLAG16 as IS_LEFT_PUNCT +from .attrs cimport FLAG17 as IS_RIGHT_PUNCT from .attrs cimport IS_OOV @@ -183,6 +187,23 @@ cdef class Lexeme: def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x) + property is_bracket: + def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET) + def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x) + + property is_quote: + def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE) + def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x) + + property is_left_punct: + def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) + def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) + + property is_right_punct: + def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) + def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) + + property like_url: def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL) def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 8543be1f1..25b8494bd 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -15,7 +15,6 @@ from libcpp.vector cimport vector from murmurhash.mrmr cimport hash64 from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE -from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 882e06bf2..9d6495edf 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -1,4 +1,5 @@ # -*- coding: utf8 -*- +# cython: infer_types=True from __future__ import unicode_literals import unicodedata @@ -48,6 +49,21 @@ cpdef bint 
is_ascii(unicode string): else: return True +cpdef bint is_bracket(unicode string): + return False + +cpdef bint is_quote(unicode string): + if string in ('"', "'"): + return True + else: + return False + +cpdef bint is_left_punct(unicode string): + return False + +cpdef bint is_right_punct(unicode string): + return False + cpdef bint is_title(unicode string): return string.istitle() diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 0c60f6f67..942d8aa9c 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -14,7 +14,7 @@ cpdef enum symbol_t: IS_STOP IS_OOV - FLAG14 + FLAG14 = 14 FLAG15 FLAG16 FLAG17 @@ -419,3 +419,10 @@ cpdef enum symbol_t: rcmod root xcomp + +# Move these up to FLAG14--FLAG17 once we finish the functionality +# and are ready to regenerate the model. +#IS_BRACKET +#IS_QUOTE +#IS_LEFT_PUNCT +#IS_RIGHT_PUNCT diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 31b01db98..712bef9a3 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -13,7 +13,6 @@ IDS = { "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, "IS_OOV": IS_OOV, - "FLAG14": FLAG14, "FLAG15": FLAG15, "FLAG16": FLAG16, diff --git a/spacy/tests/vocab/test_flag_features.py b/spacy/tests/vocab/test_flag_features.py index 9c544b972..880704e28 100644 --- a/spacy/tests/vocab/test_flag_features.py +++ b/spacy/tests/vocab/test_flag_features.py @@ -41,3 +41,18 @@ def test_is_digit(words): assert not is_digit(words[7]) assert not is_digit(words[8]) assert not is_digit(words[9]) + + +def test_is_quote(words): + pass + + +def test_is_bracket(words): + pass + + +def test_is_left_punct(words): + pass + +def test_is_right_punct(words): + pass diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 95515b9c3..342bcf409 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -18,6 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP from ..parts_of_speech cimport CONJ, PUNCT from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE +from ..attrs 
cimport FLAG14 as IS_BRACKET +from ..attrs cimport FLAG15 as IS_QUOTE +from ..attrs cimport FLAG16 as IS_LEFT_PUNCT +from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV @@ -362,6 +366,18 @@ cdef class Token: property is_space: def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_SPACE) + + property is_bracket: + def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_BRACKET) + + property is_quote: + def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_QUOTE) + + property is_left_punct: + def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT) + + property is_right_punct: + def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT) property like_url: def __get__(self): return Lexeme.c_check_flag(self.c.lex, LIKE_URL)