Fix default punctuation rules for hindi text (#3625 explosion)

This commit is contained in:
yash 2019-07-11 15:00:51 +05:30
parent d5311b3c42
commit 815f8d13dd
1 changed files with 3 additions and 1 deletions

View File

@ -9,6 +9,8 @@ _bengali = r"\u0980-\u09FF"
_hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F"
_hindi = r"\u0900-\u097F"
# Latin standard
_latin_u_standard = r"A-Z"
_latin_l_standard = r"a-z"
@ -193,7 +195,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
_uncased = _bengali + _hebrew + _persian + _sinhala
_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
ALPHA_LOWER = group_chars(_lower + _uncased)