From b97d98783a211f485ac7bbd69dee292f31cd0849 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 2 Sep 2020 13:06:16 +0200
Subject: [PATCH] Fix Hungarian % tokenization (#6013)

---
 spacy/lang/hu/punctuation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py
index 597f01b65..f827cd677 100644
--- a/spacy/lang/hu/punctuation.py
+++ b/spacy/lang/hu/punctuation.py
@@ -7,6 +7,7 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
 
 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
+_units = UNITS.replace("%", "")
 
 _prefixes = (
     LIST_PUNCT
@@ -26,7 +27,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=UNITS),
+        r"(?<=[0-9])(?:{u})".format(u=_units),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),