diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 893ec0845..d4e8a38c5 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -21,7 +21,7 @@ IDS = {
     "IS_QUOTE": IS_QUOTE,
     "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
     "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
-    "FLAG18": FLAG18,
+    "IS_CURRENCY": IS_CURRENCY,
     "FLAG19": FLAG19,
     "FLAG20": FLAG20,
     "FLAG21": FLAG21,
diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index c3bb4a8ff..f1279f035 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -69,6 +69,23 @@ def is_right_punct(text):
     return text in right_punct
 
 
+def is_currency(text):
+    """Return True if text consists solely of Unicode currency symbols.
+
+    Language data can override this getter, for example with a list of
+    currency words such as "dollar" or "euro".
+
+    text: The string to check.
+    Returns: True if text is non-empty and every character has Unicode
+        general category 'Sc' (Symbol, currency), otherwise False.
+    """
+    if not text:
+        # The all() below is vacuously True for an empty string, but an
+        # empty string is not a currency symbol.
+        return False
+    return all(unicodedata.category(char) == 'Sc' for char in text)
+
+
 def like_email(text):
     return bool(_like_email(text))
 
@@ -164,5 +181,6 @@ LEX_ATTRS = {
     attrs.IS_QUOTE: is_quote,
     attrs.IS_LEFT_PUNCT: is_left_punct,
     attrs.IS_RIGHT_PUNCT: is_right_punct,
+    attrs.IS_CURRENCY: is_currency,
     attrs.LIKE_URL: like_url
 }