mirror of https://github.com/explosion/spaCy.git
update punctuation rules
This commit is contained in:
parent
62443d495a
commit
edec51b1b1
|
@ -19,15 +19,15 @@ _months = ('Januari Februari Maret April Mei Juni Juli Agustus September Oktober
|
||||||
|
|
||||||
UNITS = merge_chars(_units)
|
UNITS = merge_chars(_units)
|
||||||
CURRENCY = merge_chars(_currency)
|
CURRENCY = merge_chars(_currency)
|
||||||
HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>'
|
HTML_PREFIX = r'<(b|strong|i|em|p|span|div|br)\s?/>|<a([^>]+)>'
|
||||||
HTML_SUFFIX = r'</(b|strong|i|em|p|span|div)>'
|
HTML_SUFFIX = r'</(b|strong|i|em|p|span|div|a)>'
|
||||||
MONTHS = merge_chars(_months)
|
MONTHS = merge_chars(_months)
|
||||||
LIST_CURRENCY = split_chars(_currency)
|
LIST_CURRENCY = split_chars(_currency)
|
||||||
|
|
||||||
|
|
||||||
_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['[Kk]e-', '/', '—']
|
_prefixes = TOKENIZER_PREFIXES + LIST_CURRENCY + [HTML_PREFIX] + ['/', '—']
|
||||||
|
|
||||||
_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '—', '-'] + [
|
_suffixes = TOKENIZER_SUFFIXES + [r'\-[Nn]ya', '-[KkMm]u' '-el', '[—-]'] + [
|
||||||
r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
|
r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
|
||||||
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
|
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
|
||||||
r'(?<=[0-9])%',
|
r'(?<=[0-9])%',
|
||||||
|
|
Loading…
Reference in New Issue