From 2829a024ef4fe5a45993580826e4261028c65138 Mon Sep 17 00:00:00 2001
From: ines
Date: Tue, 9 May 2017 01:15:23 +0200
Subject: [PATCH] Re-add basic like_num check to global lex_attrs

---
 spacy/lang/lex_attrs.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 87d0a0df6..7cdbbc46e 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -41,6 +41,20 @@ def is_ascii(text):
     return True
 
 
+def like_num(text):
+    # can be overwritten by lang with list of number words
+    text = text.replace(',', '').replace('.', '')
+    if text.isdigit():
+        return True
+    if text.count('/') == 1:
+        num, denom = text.split('/')
+        if num.isdigit() and denom.isdigit():
+            return True
+    return False
+
+
+
+
 def is_bracket(text):
     brackets = ('(',')','[',']','{','}','<','>')
     return text in brackets
@@ -119,7 +133,6 @@ LEX_ATTRS = {
     attrs.CLUSTER: lambda string: 0,
     attrs.IS_ALPHA: lambda string: string.isalpha(),
     attrs.IS_DIGIT: lambda string: string.isdigit(),
-    attrs.LIKE_NUM: lambda string: string.isdigit(), # overwritten by lang
     attrs.IS_LOWER: lambda string: string.islower(),
     attrs.IS_SPACE: lambda string: string.isspace(),
     attrs.IS_TITLE: lambda string: string.istitle(),
@@ -127,6 +140,7 @@ LEX_ATTRS = {
     attrs.LIKE_EMAIL: lambda string: _like_email(string),
     attrs.IS_STOP: lambda string: False,
     attrs.IS_OOV: lambda string: True,
+    attrs.LIKE_NUM: like_num,
     attrs.IS_PUNCT: is_punct,
     attrs.IS_ASCII: is_ascii,
     attrs.SHAPE: word_shape,