spaCy/spacy/lang/tt/punctuation.py

20 lines
914 B
Python

# coding: utf8
from __future__ import unicode_literals
from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, QUOTES, HYPHENS
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
_hyphens_no_dash = HYPHENS.replace('-', '').strip('|').replace('||', '')
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}])[,!?/\(\)]+(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}{q}])[:<>=](?=[{a}])'.format(a=ALPHA, q=QUOTES),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])'.format(a=ALPHA, q=QUOTES),
r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA,
h=_hyphens_no_dash),
r'(?<=[0-9])-(?=[0-9])'])
TOKENIZER_INFIXES = _infixes