From bee79619278a5e055ac744798a6ea0001951c94b Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Sat, 14 Sep 2019 14:23:06 +0200 Subject: [PATCH] Add Kannada, Tamil, and Telugu unicode blocks (#4288) Add Kannada, Tamil, and Telugu unicode blocks to uncased character classes so that period is recognized as a suffix during tokenization. (I'm sure a few symbols in the code blocks should not be ALPHA, but this is mainly relevant for suffix detection and seems to be an improvement in practice.) --- spacy/lang/char_classes.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 9f6c3266e..131bdcd51 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -11,6 +11,12 @@ _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F" _hindi = r"\u0900-\u097F" +_kannada = r"\u0C80-\u0CFF" + +_tamil = r"\u0B80-\u0BFF" + +_telugu = r"\u0C00-\u0C7F" + # Latin standard _latin_u_standard = r"A-Z" _latin_l_standard = r"a-z" @@ -195,7 +201,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ" _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower -_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi +_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased)