From b4d526c357a606775e870c2dbe2a794140517d5d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 30 Nov 2021 22:36:39 +0000 Subject: [PATCH] Add Japanese kana characters to default exceptions (fix #9693) (#9742) This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included; because their actual use is rare I omitted them for now, though they may be worth adding later. The omitted blocks are: - Kana Supplement - Kana Extended (A and B) - Small Kana Extension --- spacy/lang/char_classes.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 9e5441a4f..b15bb3cf3 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF" _hangul_jamo = r"\u1100-\u11FF" _hangul = _hangul_syllables + _hangul_jamo +_hiragana = r"\u3040-\u309F" +_katakana = r"\u30A0-\u30FFー" +_kana = _hiragana + _katakana + # letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh _latin_u_extendedA = ( r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C" @@ -244,6 +248,7 @@ _uncased = ( + _tamil + _telugu + _hangul + + _kana + _cjk )