mirror of https://github.com/explosion/spaCy.git
Re-add basic like_num check to global lex_attrs
This commit is contained in:
parent
88adeee548
commit
2829a024ef
|
@ -41,6 +41,20 @@ def is_ascii(text):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def like_num(text):
    """Return True if `text` looks like a number.

    Recognised forms are plain digit strings, digit strings containing
    ',' or '.' separators (e.g. "1,000.50"), and simple fractions of two
    digit strings (e.g. "1/2"). A single leading '+' or '-' sign is
    ignored, so "-5" and "+3.14" also count as number-like.

    text: the token string to inspect.
    Returns a bool.
    """
    # can be overwritten by lang with list of number words
    # Drop one leading sign so signed numbers are treated like unsigned ones.
    if text.startswith(('+', '-')):
        text = text[1:]
    # Thousands/decimal separators do not stop a token from being numeric.
    text = text.replace(',', '').replace('.', '')
    if text.isdigit():
        return True
    # Exactly one slash: accept "<digits>/<digits>" as a fraction.
    if text.count('/') == 1:
        num, denom = text.split('/')
        if num.isdigit() and denom.isdigit():
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def is_bracket(text):
    """Return True if `text` is exactly one of the bracket characters
    ( ) [ ] { } < >, and False otherwise.
    """
    bracket_chars = (
        '(', ')',
        '[', ']',
        '{', '}',
        '<', '>',
    )
    if text in bracket_chars:
        return True
    return False
|
||||||
|
@ -119,7 +133,6 @@ LEX_ATTRS = {
|
||||||
attrs.CLUSTER: lambda string: 0,
|
attrs.CLUSTER: lambda string: 0,
|
||||||
attrs.IS_ALPHA: lambda string: string.isalpha(),
|
attrs.IS_ALPHA: lambda string: string.isalpha(),
|
||||||
attrs.IS_DIGIT: lambda string: string.isdigit(),
|
attrs.IS_DIGIT: lambda string: string.isdigit(),
|
||||||
attrs.LIKE_NUM: lambda string: string.isdigit(), # overwritten by lang
|
|
||||||
attrs.IS_LOWER: lambda string: string.islower(),
|
attrs.IS_LOWER: lambda string: string.islower(),
|
||||||
attrs.IS_SPACE: lambda string: string.isspace(),
|
attrs.IS_SPACE: lambda string: string.isspace(),
|
||||||
attrs.IS_TITLE: lambda string: string.istitle(),
|
attrs.IS_TITLE: lambda string: string.istitle(),
|
||||||
|
@ -127,6 +140,7 @@ LEX_ATTRS = {
|
||||||
attrs.LIKE_EMAIL: lambda string: _like_email(string),
|
attrs.LIKE_EMAIL: lambda string: _like_email(string),
|
||||||
attrs.IS_STOP: lambda string: False,
|
attrs.IS_STOP: lambda string: False,
|
||||||
attrs.IS_OOV: lambda string: True,
|
attrs.IS_OOV: lambda string: True,
|
||||||
|
attrs.LIKE_NUM: like_num,
|
||||||
attrs.IS_PUNCT: is_punct,
|
attrs.IS_PUNCT: is_punct,
|
||||||
attrs.IS_ASCII: is_ascii,
|
attrs.IS_ASCII: is_ascii,
|
||||||
attrs.SHAPE: word_shape,
|
attrs.SHAPE: word_shape,
|
||||||
|
|
Loading…
Reference in New Issue