From d1a221a37419c76f69c4a3a7f64cc4afb0eea3c8 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 31 May 2021 17:03:40 +0900
Subject: [PATCH] Add all symbols in Unicode Currency Symbols block (#8212)

* Add all symbols in Unicode Currency Symbols block

In #8102 it came up that the rupee symbol was treated differently from
dollar / euro / yen symbols. This adds many symbols not already included.

* Fix test

* Fix training test
---
 spacy/lang/char_classes.py            | 5 ++++-
 spacy/tests/training/test_training.py | 4 ++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index 6fbc45817..9e5441a4f 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -260,7 +260,10 @@ _units = (
     "кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб"
     "كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب"
 )
-_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴"
+_currency = (
+    r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ "
+    r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿"
+)
 
 # These expressions contain various unicode variations, including characters
 # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py
index 321c08c1e..0ea5f0fcc 100644
--- a/spacy/tests/training/test_training.py
+++ b/spacy/tests/training/test_training.py
@@ -336,8 +336,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
 
 
 def test_gold_biluo_4791(en_vocab, en_tokenizer):
-    doc = en_tokenizer("I'll return the ₹54 amount")
-    gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
+    doc = en_tokenizer("I'll return the A54 amount")
+    gold_words = ["I", "'ll", "return", "the", "A", "54", "amount"]
     gold_spaces = [False, True, True, True, False, True, False]
     entities = [(16, 19, "MONEY")]
     example = Example.from_dict(