diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 6fbc45817..9e5441a4f 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -260,7 +260,10 @@ _units = ( "кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб" "كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب" ) -_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴" +_currency = ( + r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ " + r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿" +) # These expressions contain various unicode variations, including characters # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language diff --git a/spacy/tests/training/test_training.py b/spacy/tests/training/test_training.py index 321c08c1e..0ea5f0fcc 100644 --- a/spacy/tests/training/test_training.py +++ b/spacy/tests/training/test_training.py @@ -336,8 +336,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer): def test_gold_biluo_4791(en_vocab, en_tokenizer): - doc = en_tokenizer("I'll return the ₹54 amount") - gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"] + doc = en_tokenizer("I'll return the A54 amount") + gold_words = ["I", "'ll", "return", "the", "A", "54", "amount"] gold_spaces = [False, True, True, True, False, True, False] entities = [(16, 19, "MONEY")] example = Example.from_dict(