diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py
index ef52996c8..5d5a6fac5 100644
--- a/spacy/lang/id/tokenizer_exceptions.py
+++ b/spacy/lang/id/tokenizer_exceptions.py
@@ -5,7 +5,6 @@ import regex as re
 
 from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
 from ..tokenizer_exceptions import URL_PATTERN
-from ..char_classes import ALPHA
 from ...symbols import ORTH
 
 
@@ -39,18 +38,16 @@ kuartal lintas m macro makro me mem meng micro mid mikro mini multi neo nge no
 non on pan pasca pe pem poli poly post pra pre pro re se self serba seri sub
 super t trans ultra un x""".split()
 
-_hyphen_infix = """me-kan me-kannya men-kan men-kannya meng-kannya ke-an
-ke-annya di-kan di-kannya de-isasi ber-an berke-an""".split()
+_hyphen_infix = """ber-an berke-an de-isasi di-kan di-kannya di-nya ke-an
+ke-annya me-kan me-kannya men-kan men-kannya meng-kannya pe-an pen-an
+per-an per-i se-an se-nya ter-i ter-kan ter-kannya""".split()
 
 _hyphen_suffix = """el"""
 
-_regular_exp = ['^{p}-.*$'.format(p=prefix) for prefix in _hyphen_prefix]
-_regular_exp += ['^{p}-.*$'.format(p=prefix.title()) for prefix in _hyphen_prefix]
-_regular_exp += ['^{p}-.*$'.format(p=prefix.upper()) for prefix in _hyphen_prefix]
-_regular_exp += ['^{0}-.*-{1}$'.format(*infix.title().split('-')) for infix in _hyphen_infix]
-_regular_exp += ['^.*-{s}$'.format(s=suffix) for suffix in _hyphen_suffix]
+_regular_exp = ['^{p}-[A-Za-z0-9]+$'.format(p=prefix) for prefix in _hyphen_prefix]
+_regular_exp += ['^{0}-[A-Za-z0-9]+-{1}$'.format(*infix.split('-')) for infix in _hyphen_infix]
+_regular_exp += ['^[A-Za-z0-9]+-{s}$'.format(s=suffix) for suffix in _hyphen_suffix]
 _regular_exp.append(URL_PATTERN)
-
 
 TOKENIZER_EXCEPTIONS = dict(_exc)
 TOKEN_MATCH = re.compile('|'.join('(?:{})'.format(m) for m in _regular_exp), re.IGNORECASE).match