From 2bda582135b5687609c4816182aa4cfae5d865f3 Mon Sep 17 00:00:00 2001 From: Ali Zarezade Date: Tue, 23 Jan 2018 13:20:36 +0330 Subject: [PATCH] Add Persian characters and symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add Persian characters and the following: - ٪ used instead of % - ؟ used instead of ? - ﷼ used instead of $ - ، used instead of , - ؛ used instead of ; --- spacy/lang/char_classes.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 68d8eecc7..f91fa43c8 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -15,12 +15,13 @@ _hebrew = r'[\p{L}&&\p{Hebrew}]' _latin_lower = r'[\p{Ll}&&\p{Latin}]' _latin_upper = r'[\p{Lu}&&\p{Latin}]' _latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]' +_persian = r'[\p{L}&&\p{Arabic}]' _russian_lower = r'[ёа-я]' _russian_upper = r'[ЁА-Я]' _upper = [_latin_upper, _russian_upper] _lower = [_latin_lower, _russian_lower] -_uncased = [_bengali, _hebrew] +_uncased = [_bengali, _hebrew, _persian] ALPHA = merge_char_classes(_upper + _lower + _uncased) ALPHA_LOWER = merge_char_classes(_lower + _uncased) @@ -29,14 +30,14 @@ ALPHA_UPPER = merge_char_classes(_upper + _uncased) _units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft ' 'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb ' - 'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм ' + 'TB T G M K % ٪ км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм ' 'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб') -_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽' +_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼' # These expressions contain various unicode variations, including characters # used in Chinese (see #1333, #1340, #1351) – unless there are cross-language # conflicts, spaCy's base tokenizer should handle all of those by default -_punct = r'… …… , : ; \! \? 
¿ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · ।' +_punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛' _quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' _hyphens = '- – — -- --- —— ~'