From c1f3fe99feeb926b2b8a502110cb9e83cc593ca8 Mon Sep 17 00:00:00 2001
From: Jim Geovedi
Date: Mon, 24 Jul 2017 13:57:21 +0700
Subject: [PATCH] updated punctuation rules

---
NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Line boundaries were rebuilt from the hunk
header (-2,33 +2,49) and the diffstat (28 insertions, 12 deletions).
Indentation and the exact position of the seven blank context lines are a
best guess; verify against commit c1f3fe99 before applying.

NOTE(review): HTML_SUFFIX is an empty pattern in this copy. Presumably its
value was lost together with the line breaks; confirm against the repository.

NOTE(review): the last infix rule interpolates (?={c}{a}) with CURRENCY
ungrouped and ALPHA outside a character class, unlike every other rule in
this patch ((?:{c}), [{a}]). As written, ALPHA attaches only to the final
currency alternative. Looks unintended; confirm the desired lookahead.

NOTE(review): `_units + '...'` and `_currency + r'...'` only stay
space-separated if the imported strings end with whitespace; verify in
char_classes.

 spacy/lang/id/punctuation.py | 40 ++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/spacy/lang/id/punctuation.py b/spacy/lang/id/punctuation.py
index 68af3e126..85da34074 100644
--- a/spacy/lang/id/punctuation.py
+++ b/spacy/lang/id/punctuation.py
@@ -2,33 +2,49 @@
 from __future__ import unicode_literals
 
 from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
-from ..char_classes import merge_chars, split_chars, _currency
+from ..char_classes import merge_chars, split_chars, _currency, _units
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
-from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+from ..char_classes import QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
+
+_units = (_units + 's bit Gbps Mbps mbps Kbps kbps ƒ ppi px '
+          'Hz kHz MHz GHz mAh '
+          'ratus rb ribu ribuan '
+          'juta jt jutaan mill?iar million bil[l]?iun bilyun billion '
+          )
+_currency = (_currency + r'Rp IDR RMB SGD S\$ ')
+_months = ('Januari Februari Maret April Mei Juni Juli Agustus September Oktober November Desember '
+           'January February March May June July August October December '
+           'Jan Feb Mar Jun Jul Aug Sept Oct Okt Nov Des ')
 
-_units = ('s bit Gbps Mbps mbps Kbps kbps ƒ '
-          'kHz MHz GHz mAh ')
-_currency = r'Rp IDR RMB SGD S\$' + _currency
 
 
 UNITS = merge_chars(_units)
 CURRENCY = merge_chars(_currency)
 HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>'
 HTML_SUFFIX = r''
+MONTHS = merge_chars(_months)
+LIST_CURRENCY = split_chars(_currency)
 
-_prefixes = TOKENIZER_PREFIXES + split_chars(_currency) + [HTML_PREFIX] + ['ke-']
-_suffixes = TOKENIZER_SUFFIXES + [
-    r'(?<=[0-9])(?:{})'.format(CURRENCY),
-    r'(?<=[0-9])(?:{})'.format(UNITS),
+_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['[Kk]e-', '/', '—']
+
+_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '—'] + [
+    r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
+    r'(?<=[0-9])(?:{u})'.format(u=UNITS),
     r'(?<=[0-9])%',
     r'(?<=[0-9{a}]{h})(?:[\.,:-])'.format(a=ALPHA, h=HTML_SUFFIX),
     r'(?<=[0-9{a}])(?:{h})'.format(a=ALPHA, h=HTML_SUFFIX),
 ]
 
 _infixes = TOKENIZER_INFIXES + [
-    r'(?<=[0-9])([\\/])(?=[0-9%-])',
-    r'(?<=[0-9])(%)(?=[{a}0-9/])'.format(a=ALPHA),
-    r'(?<=[0-9]{u})\/(?=[0-9])'.format(u=UNITS),
+    r'(?<=[0-9])[\\/](?=[0-9%-])',
+    r'(?<=[0-9])%(?=[{a}0-9/])'.format(a=ALPHA),
+    r'(?<={u})[\/-](?=[0-9])'.format(u=UNITS),
+    r'(?<={m})[\/-](?=[0-9])'.format(m=MONTHS),
+    r'(?<=[0-9\)][\.,])"(?=[0-9])',
+    r'(?<=[{a}\)][\.,\'])["—](?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}])-(?=[0-9])'.format(a=ALPHA),
+    r'(?<=[0-9])-(?=[{a}])'.format(a=ALPHA),
+    r'(?<=[{a}])[\/-](?={c}{a})'.format(a=ALPHA, c=CURRENCY),
 ]
 
 TOKENIZER_PREFIXES = _prefixes