2017-05-08 21:59:33 +00:00
|
|
|
|
# coding: utf8
|
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
|
|
import regex as re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Make VERSION1 the default behaviour of the `regex` module. The character
# classes defined in this module use nested sets and the `&&` / `||` set
# operators, which the `regex` package only supports in VERSION1 mode.
re.DEFAULT_VERSION = re.VERSION1
|
|
|
|
|
def merge_char_classes(classes):
    """Combine several character classes into a single set union.

    classes: iterable of character-class pattern strings.
    Returns one pattern of the form ``[a||b||c]``, i.e. the classes joined
    with the ``||`` union operator and wrapped in an outer set.
    """
    union = '||'.join(classes)
    return '[{}]'.format(union)
|
|
|
|
|
def split_chars(char):
    """Turn a space-separated string of entries into a list.

    Leading and trailing whitespace is stripped first; the remainder is
    split on single spaces, so consecutive spaces yield empty entries.
    """
    trimmed = char.strip()
    return trimmed.split(' ')
|
|
|
|
|
def merge_chars(char):
    """Turn a space-separated string of entries into a ``|`` alternation.

    Leading and trailing whitespace is stripped first; every remaining
    single space becomes one ``|`` separator.
    """
    entries = char.strip().split(' ')
    return '|'.join(entries)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Letters (\p{L}) restricted to the Bengali script via set intersection (&&).
_bengali = r'[\p{L}&&\p{Bengali}]'
|
|
|
|
|
# Letters (\p{L}) restricted to the Hebrew script via set intersection (&&).
_hebrew = r'[\p{L}&&\p{Hebrew}]'
|
|
|
|
|
# Lowercase letters (\p{Ll}) of the Latin script.
_latin_lower = r'[\p{Ll}&&\p{Latin}]'
|
|
|
|
|
# Uppercase letters (\p{Lu}) of the Latin script.
_latin_upper = r'[\p{Lu}&&\p{Latin}]'
|
|
|
|
|
# Latin-script letters that are either lowercase or uppercase: the nested set
# is the union (||) of \p{Ll} and \p{Lu}, intersected (&&) with \p{Latin}.
_latin = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
|
2018-01-23 09:50:36 +00:00
|
|
|
|
# Letters (\p{L}) of the Arabic script.
# NOTE(review): the pattern covers the whole Arabic script, not only the
# letters used in Persian - confirm that this breadth is intended.
_persian = r'[\p{L}&&\p{Arabic}]'
|
2017-11-21 19:23:59 +00:00
|
|
|
|
# Lowercase Russian Cyrillic letters. 'ё' is listed explicitly because its
# code point lies outside the contiguous а-я range.
_russian_lower = r'[ёа-я]'
|
|
|
|
|
# Uppercase Russian Cyrillic letters. 'Ё' is listed explicitly because its
# code point lies outside the contiguous А-Я range.
_russian_upper = r'[ЁА-Я]'
|
2017-05-08 21:59:33 +00:00
|
|
|
|
|
2017-11-21 19:23:59 +00:00
|
|
|
|
# Character classes matching uppercase letters (Latin and Russian).
_upper = [_latin_upper, _russian_upper]
|
|
|
|
|
# Character classes matching lowercase letters (Latin and Russian).
_lower = [_latin_lower, _russian_lower]
|
2018-01-23 09:50:36 +00:00
|
|
|
|
# Character classes for the scripts handled without an upper/lower split
# (Bengali, Hebrew, Arabic-script).
_uncased = [_bengali, _hebrew, _persian]
|
2017-05-08 21:59:33 +00:00
|
|
|
|
|
|
|
|
|
# Single character class matching any supported letter: the union of the
# uppercase, lowercase and uncased classes.
ALPHA = merge_char_classes(_upper + _lower + _uncased)
|
|
|
|
|
# Single character class matching lowercase letters; the uncased scripts are
# included here as well.
ALPHA_LOWER = merge_char_classes(_lower + _uncased)
|
|
|
|
|
# Single character class matching uppercase letters; the uncased scripts are
# included here as well.
ALPHA_UPPER = merge_char_classes(_upper + _uncased)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Measurement units (Latin and Cyrillic spellings) as one space-separated
# string. The literal is built by implicit concatenation of adjacent string
# parts, so every part except the last must end with a space to keep the
# entries separated; nothing other than string parts may appear between the
# parentheses.
_units = ('km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft '
          'kg g mg µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb '
          'TB T G M K % км км² км³ м м² м³ дм дм² дм³ см см² см³ мм мм² мм³ нм '
          'кг г мг м/с км/ч кПа Па мбар Кб КБ кб Мб МБ мб Гб ГБ гб Тб ТБ тб')
|
2018-01-23 09:50:36 +00:00
|
|
|
|
# Currency symbols as one space-separated raw string. '$' is backslash-escaped
# because the entries are also joined with '|' into a regular expression.
_currency = r'\$ £ € ¥ ฿ US\$ C\$ A\$ ₽ ﷼'
|
2017-10-14 10:52:59 +00:00
|
|
|
|
|
|
|
|
|
# These expressions contain various unicode variations, including characters
|
|
|
|
|
# used in Chinese (see #1333, #1340, #1351) – unless there are cross-language
|
|
|
|
|
# conflicts, spaCy's base tokenizer should handle all of those by default
|
2018-01-23 14:41:33 +00:00
|
|
|
|
_punct = r'… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪'
|
2017-10-14 10:52:59 +00:00
|
|
|
|
_quotes = r'\' \'\' " ” “ `` ` ‘ ´ ‘‘ ’’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉'
|
2017-09-26 14:38:08 +00:00
|
|
|
|
# Hyphen and dash variants (including doubled/tripled forms and '~') as one
# space-separated string.
_hyphens = '- – — -- --- —— ~'
|
2017-10-14 10:52:59 +00:00
|
|
|
|
|
|
|
|
|
# Various symbols like dingbats, but also emoji
|
|
|
|
|
# Details: https://www.compart.com/en/unicode/category/So
|
2017-05-27 15:57:10 +00:00
|
|
|
|
# Character class for the Unicode general category So ("Symbol, other").
_other_symbols = r'[\p{So}]'
|
2017-05-08 21:59:33 +00:00
|
|
|
|
|
2017-10-14 10:52:59 +00:00
|
|
|
|
|
2017-05-08 21:59:33 +00:00
|
|
|
|
# Unit entries joined with '|' into a single regex alternation.
UNITS = merge_chars(_units)
|
|
|
|
|
# Currency entries joined with '|' into a single regex alternation.
CURRENCY = merge_chars(_currency)
|
|
|
|
|
# Quote entries joined with '|' into a single regex alternation.
QUOTES = merge_chars(_quotes)
|
|
|
|
|
# Punctuation entries joined with '|' into a single regex alternation.
PUNCT = merge_chars(_punct)
|
|
|
|
|
# Hyphen entries joined with '|' into a single regex alternation, in the order
# they are listed in _hyphens.
HYPHENS = merge_chars(_hyphens)
|
2017-05-27 15:57:10 +00:00
|
|
|
|
# Pattern for symbol/icon characters; already a character class, so it is used
# as-is rather than merged.
ICONS = _other_symbols
|
2017-05-08 21:59:33 +00:00
|
|
|
|
|
|
|
|
|
# Unit entries as a list of individual pattern strings.
LIST_UNITS = split_chars(_units)
|
|
|
|
|
# Currency entries as a list of individual pattern strings.
LIST_CURRENCY = split_chars(_currency)
|
|
|
|
|
# Quote entries as a list of individual pattern strings.
LIST_QUOTES = split_chars(_quotes)
|
|
|
|
|
# Punctuation entries as a list of individual pattern strings.
LIST_PUNCT = split_chars(_punct)
|
|
|
|
|
# Hyphen entries as a list of individual pattern strings.
LIST_HYPHENS = split_chars(_hyphens)
|
|
|
|
|
# Ellipsis patterns: a run of two or more literal dots, or the single
# horizontal-ellipsis character.
LIST_ELLIPSES = [r'\.\.+', '…']
|
2017-05-27 15:57:10 +00:00
|
|
|
|
# Symbol/icon pattern wrapped in a one-element list, matching the shape of the
# other LIST_* constants.
LIST_ICONS = [_other_symbols]
|