mirror of https://github.com/explosion/spaCy.git
* Merge master into rethinc2
This commit is contained in:
commit
1ef84a0557
|
@ -14,7 +14,7 @@ cpdef enum attr_id_t:
|
|||
LIKE_EMAIL
|
||||
IS_STOP
|
||||
IS_OOV
|
||||
|
||||
|
||||
FLAG14 = 14
|
||||
FLAG15
|
||||
FLAG16
|
||||
|
@ -85,3 +85,11 @@ cpdef enum attr_id_t:
|
|||
HEAD
|
||||
SPACY
|
||||
PROB
|
||||
|
||||
# Move these up to FLAG14--FLAG17 once we finish the functionality and
|
||||
# are ready to regenerate the model
|
||||
#IS_BRACKET
|
||||
#IS_QUOTE
|
||||
#IS_LEFT_PUNCT
|
||||
#IS_RIGHT_PUNCT
|
||||
|
||||
|
|
|
@ -13,7 +13,6 @@ IDS = {
|
|||
"LIKE_EMAIL": LIKE_EMAIL,
|
||||
"IS_STOP": IS_STOP,
|
||||
"IS_OOV": IS_OOV,
|
||||
|
||||
"FLAG14": FLAG14,
|
||||
"FLAG15": FLAG15,
|
||||
"FLAG16": FLAG16,
|
||||
|
|
|
@ -82,6 +82,22 @@ class Language(object):
|
|||
def is_title(string):
|
||||
return orth.is_title(string)
|
||||
|
||||
@staticmethod
def is_bracket(string):
    """Return whether `string` is a bracket, as decided by `orth.is_bracket`."""
    return orth.is_bracket(string)
|
||||
|
||||
@staticmethod
def is_quote(string):
    """Return whether `string` is a quote mark, as decided by `orth.is_quote`."""
    return orth.is_quote(string)
|
||||
|
||||
@staticmethod
def is_left_punct(string):
    """Return whether `string` is left punctuation, as decided by `orth.is_left_punct`."""
    return orth.is_left_punct(string)
|
||||
|
||||
@staticmethod
def is_right_punct(string):
    """Return whether `string` is right punctuation, as decided by `orth.is_right_punct`."""
    return orth.is_right_punct(string)
|
||||
|
||||
@staticmethod
def is_upper(string):
    """Return whether `string` is upper-case, as decided by `orth.is_upper`."""
    return orth.is_upper(string)
|
||||
|
@ -121,6 +137,10 @@ class Language(object):
|
|||
attrs.IS_SPACE: cls.is_space,
|
||||
attrs.IS_TITLE: cls.is_title,
|
||||
attrs.IS_UPPER: cls.is_upper,
|
||||
attrs.FLAG14: cls.is_bracket,
|
||||
attrs.FLAG15: cls.is_quote,
|
||||
attrs.FLAG16: cls.is_left_punct,
|
||||
attrs.FLAG17: cls.is_right_punct,
|
||||
attrs.LIKE_URL: cls.like_url,
|
||||
attrs.LIKE_NUM: cls.like_num,
|
||||
attrs.LIKE_EMAIL: cls.like_email,
|
||||
|
|
|
@ -18,6 +18,10 @@ import numpy
|
|||
|
||||
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
from .attrs cimport FLAG14 as IS_BRACKET
|
||||
from .attrs cimport FLAG15 as IS_QUOTE
|
||||
from .attrs cimport FLAG16 as IS_LEFT_PUNCT
|
||||
from .attrs cimport FLAG17 as IS_RIGHT_PUNCT
|
||||
from .attrs cimport IS_OOV
|
||||
|
||||
|
||||
|
@ -183,6 +187,23 @@ cdef class Lexeme:
|
|||
def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE)
|
||||
def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x)
|
||||
|
||||
property is_bracket:
    # Boolean view of the IS_BRACKET flag on the underlying lexeme struct.
    def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_BRACKET)

    def __set__(self, bint x):
        Lexeme.c_set_flag(self.c, IS_BRACKET, x)
|
||||
|
||||
property is_quote:
    # Boolean view of the IS_QUOTE flag on the underlying lexeme struct.
    def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_QUOTE)

    def __set__(self, bint x):
        Lexeme.c_set_flag(self.c, IS_QUOTE, x)
|
||||
|
||||
property is_left_punct:
    # Boolean view of the IS_LEFT_PUNCT flag on the underlying lexeme struct.
    def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT)

    def __set__(self, bint x):
        Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x)
|
||||
|
||||
property is_right_punct:
    # Boolean view of the IS_RIGHT_PUNCT flag on the underlying lexeme struct.
    def __get__(self):
        return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)

    def __set__(self, bint x):
        Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
|
||||
|
||||
|
||||
property like_url:
    # Boolean view of the LIKE_URL flag on the underlying lexeme struct.
    def __get__(self):
        return Lexeme.c_check_flag(self.c, LIKE_URL)

    def __set__(self, bint x):
        Lexeme.c_set_flag(self.c, LIKE_URL, x)
|
||||
|
|
|
@ -15,7 +15,6 @@ from libcpp.vector cimport vector
|
|||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE
|
||||
from .attrs cimport FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25
|
||||
from .tokens.doc cimport get_token_attr
|
||||
from .tokens.doc cimport Doc
|
||||
from .vocab cimport Vocab
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
# -*- coding: utf8 -*-
|
||||
# cython: infer_types=True
|
||||
from __future__ import unicode_literals
|
||||
import unicodedata
|
||||
|
||||
|
@ -48,6 +49,21 @@ cpdef bint is_ascii(unicode string):
|
|||
else:
|
||||
return True
|
||||
|
||||
cpdef bint is_bracket(unicode string):
    """Return True if `string` is exactly one ASCII bracket character.

    Recognised brackets are the round, square, curly and angle pairs:
    ( ) [ ] { } < >. Any other value, including the empty string and
    multi-character strings such as "()", yields False.

    Previously this was a stub that returned False for every input.
    """
    return string in ('(', ')', '[', ']', '{', '}', '<', '>')
|
||||
|
||||
cpdef bint is_quote(unicode string):
    """Return True if `string` is a straight double or single quote mark.

    Only the two ASCII quote characters are recognised; every other
    value yields False.
    """
    return string in ('"', "'")
|
||||
|
||||
cpdef bint is_left_punct(unicode string):
    """Return True if `string` is a single opening punctuation character.

    Recognised characters are the opening ASCII brackets ( [ { < and the
    straight quotes " and '. Straight quotes have no direction of their
    own, so they count as both left and right punctuation. Any other
    value, including the empty string, yields False.

    Previously this was a stub that returned False for every input.
    """
    return string in ('(', '[', '{', '<', '"', "'")
|
||||
|
||||
cpdef bint is_right_punct(unicode string):
    """Return True if `string` is a single closing punctuation character.

    Recognised characters are the closing ASCII brackets ) ] } > and the
    straight quotes " and '. Straight quotes have no direction of their
    own, so they count as both left and right punctuation. Any other
    value, including the empty string, yields False.

    Previously this was a stub that returned False for every input.
    """
    return string in (')', ']', '}', '>', '"', "'")
|
||||
|
||||
|
||||
cpdef bint is_title(unicode string):
    """Return the result of `string.istitle()`."""
    return string.istitle()
|
||||
|
|
|
@ -14,7 +14,7 @@ cpdef enum symbol_t:
|
|||
IS_STOP
|
||||
IS_OOV
|
||||
|
||||
FLAG14
|
||||
FLAG14 = 14
|
||||
FLAG15
|
||||
FLAG16
|
||||
FLAG17
|
||||
|
@ -419,3 +419,10 @@ cpdef enum symbol_t:
|
|||
rcmod
|
||||
root
|
||||
xcomp
|
||||
|
||||
# Move these up to FLAG14--FLAG17 once we finish the functionality
|
||||
# and are ready to regenerate the model.
|
||||
#IS_BRACKET
|
||||
#IS_QUOTE
|
||||
#IS_LEFT_PUNCT
|
||||
#IS_RIGHT_PUNCT
|
||||
|
|
|
@ -13,7 +13,6 @@ IDS = {
|
|||
"LIKE_EMAIL": LIKE_EMAIL,
|
||||
"IS_STOP": IS_STOP,
|
||||
"IS_OOV": IS_OOV,
|
||||
|
||||
"FLAG14": FLAG14,
|
||||
"FLAG15": FLAG15,
|
||||
"FLAG16": FLAG16,
|
||||
|
|
|
@ -41,3 +41,18 @@ def test_is_digit(words):
|
|||
assert not is_digit(words[7])
|
||||
assert not is_digit(words[8])
|
||||
assert not is_digit(words[9])
|
||||
|
||||
|
||||
def test_is_quote(words):
    # Placeholder: contains no assertions yet, so it passes unconditionally.
    pass
|
||||
|
||||
|
||||
def test_is_bracket(words):
    # Placeholder: contains no assertions yet, so it passes unconditionally.
    pass
|
||||
|
||||
|
||||
def test_is_left_bracket(words):
    # Placeholder: contains no assertions yet, so it passes unconditionally.
    # NOTE(review): the corresponding predicate appears to be named
    # is_left_punct, not is_left_bracket; confirm the intended test name.
    pass
|
||||
|
||||
def test_is_right_bracket(words):
    # Placeholder: contains no assertions yet, so it passes unconditionally.
    # NOTE(review): the corresponding predicate appears to be named
    # is_right_punct, not is_right_bracket; confirm the intended test name.
    pass
|
||||
|
|
|
@ -18,6 +18,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP
|
|||
from ..parts_of_speech cimport CONJ, PUNCT
|
||||
|
||||
from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
|
||||
from ..attrs cimport FLAG14 as IS_BRACKET
|
||||
from ..attrs cimport FLAG15 as IS_QUOTE
|
||||
from ..attrs cimport FLAG16 as IS_LEFT_PUNCT
|
||||
from ..attrs cimport FLAG17 as IS_RIGHT_PUNCT
|
||||
from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
||||
from ..attrs cimport IS_OOV
|
||||
|
||||
|
@ -362,6 +366,18 @@ cdef class Token:
|
|||
|
||||
property is_space:
    # Read-only: reports the IS_SPACE flag of the token's lexeme.
    def __get__(self):
        return Lexeme.c_check_flag(self.c.lex, IS_SPACE)
|
||||
|
||||
property is_bracket:
    # Read-only: reports the IS_BRACKET flag of the token's lexeme.
    def __get__(self):
        return Lexeme.c_check_flag(self.c.lex, IS_BRACKET)
|
||||
|
||||
property is_quote:
    # Read-only: reports the IS_QUOTE flag of the token's lexeme.
    def __get__(self):
        return Lexeme.c_check_flag(self.c.lex, IS_QUOTE)
|
||||
|
||||
property is_left_punct:
    # Read-only: reports the IS_LEFT_PUNCT flag of the token's lexeme.
    def __get__(self):
        return Lexeme.c_check_flag(self.c.lex, IS_LEFT_PUNCT)
|
||||
|
||||
property is_right_punct:
    # Read-only: reports the IS_RIGHT_PUNCT flag of the token's lexeme.
    def __get__(self):
        return Lexeme.c_check_flag(self.c.lex, IS_RIGHT_PUNCT)
|
||||
|
||||
property like_url:
    # Read-only: reports the LIKE_URL flag of the token's lexeme.
    def __get__(self):
        return Lexeme.c_check_flag(self.c.lex, LIKE_URL)
|
||||
|
|
Loading…
Reference in New Issue