mirror of https://github.com/explosion/spaCy.git
This includes the main kana, or phonetic characters, used in Japanese. There are some supplemental kana blocks in Unicode outside the BMP that could also be included; because their actual use is rare I omitted them for now, though maybe they should be added. The omitted blocks are: - Kana Supplement - Kana Extended (A and B) - Small Kana Extension
This commit is contained in:
parent
58e29776bd
commit
b4d526c357
|
@ -45,6 +45,10 @@ _hangul_syllables = r"\uAC00-\uD7AF"
|
||||||
_hangul_jamo = r"\u1100-\u11FF"
|
_hangul_jamo = r"\u1100-\u11FF"
|
||||||
_hangul = _hangul_syllables + _hangul_jamo
|
_hangul = _hangul_syllables + _hangul_jamo
|
||||||
|
|
||||||
|
_hiragana = r"\u3040-\u309F"
|
||||||
|
_katakana = r"\u30A0-\u30FFー"
|
||||||
|
_kana = _hiragana + _katakana
|
||||||
|
|
||||||
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
|
# letters with diacritics - Catalan, Czech, Latin, Latvian, Lithuanian, Polish, Slovak, Turkish, Welsh
|
||||||
_latin_u_extendedA = (
|
_latin_u_extendedA = (
|
||||||
r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
|
r"\u0100\u0102\u0104\u0106\u0108\u010A\u010C\u010E\u0110\u0112\u0114\u0116\u0118\u011A\u011C"
|
||||||
|
@ -244,6 +248,7 @@ _uncased = (
|
||||||
+ _tamil
|
+ _tamil
|
||||||
+ _telugu
|
+ _telugu
|
||||||
+ _hangul
|
+ _hangul
|
||||||
|
+ _kana
|
||||||
+ _cjk
|
+ _cjk
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue