mirror of https://github.com/explosion/spaCy.git
Fix Hungarian % tokenization (#6013)
This commit is contained in:
parent
70238543c8
commit
b97d98783a
|
@ -7,6 +7,7 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")
|
||||||
|
|
||||||
_currency = r"\$¢£€¥฿"
|
_currency = r"\$¢£€¥฿"
|
||||||
_quotes = CONCAT_QUOTES.replace("'", "")
|
_quotes = CONCAT_QUOTES.replace("'", "")
|
||||||
|
_units = UNITS.replace("%", "")
|
||||||
|
|
||||||
_prefixes = (
|
_prefixes = (
|
||||||
LIST_PUNCT
|
LIST_PUNCT
|
||||||
|
@ -26,7 +27,7 @@ _suffixes = (
|
||||||
r"(?<=[0-9])\+",
|
r"(?<=[0-9])\+",
|
||||||
r"(?<=°[FfCcKk])\.",
|
r"(?<=°[FfCcKk])\.",
|
||||||
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=_units),
|
||||||
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
|
||||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
|
||||||
),
|
),
|
||||||
|
|
Loading…
Reference in New Issue