mirror of https://github.com/explosion/spaCy.git
Add all symbols in Unicode Currency Symbols block (#8212)
* Add all symbols in Unicode Currency Symbols block. In #8102 it came up that the rupee symbol was treated differently from the dollar / euro / yen symbols. This adds many symbols not already included. * Fix test * Fix training test
This commit is contained in:
parent
fc37715cfb
commit
d1a221a374
|
@ -260,7 +260,10 @@ _units = (
|
|||
"кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб"
|
||||
"كم كم² كم³ م م² م³ سم سم² سم³ مم مم² مم³ كم غرام جرام جم كغ ملغ كوب اكواب"
|
||||
)
|
||||
_currency = r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴"
|
||||
_currency = (
|
||||
r"\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼ ₴ ₠ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨ ₩ ₪ ₫ € ₭ ₮ ₯ ₰ "
|
||||
r"₱ ₲ ₳ ₴ ₵ ₶ ₷ ₸ ₹ ₺ ₻ ₼ ₽ ₾ ₿"
|
||||
)
|
||||
|
||||
# These expressions contain various unicode variations, including characters
|
||||
# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
|
||||
|
|
|
@ -336,8 +336,8 @@ def test_gold_biluo_additional_whitespace(en_vocab, en_tokenizer):
|
|||
|
||||
|
||||
def test_gold_biluo_4791(en_vocab, en_tokenizer):
|
||||
doc = en_tokenizer("I'll return the ₹54 amount")
|
||||
gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
|
||||
doc = en_tokenizer("I'll return the A54 amount")
|
||||
gold_words = ["I", "'ll", "return", "the", "A", "54", "amount"]
|
||||
gold_spaces = [False, True, True, True, False, True, False]
|
||||
entities = [(16, 19, "MONEY")]
|
||||
example = Example.from_dict(
|
||||
|
|
Loading…
Reference in New Issue