mirror of https://github.com/explosion/spaCy.git
Reorganise global punctuation rules
This commit is contained in:
parent
a91278cb32
commit
014bda0ae3
|
@ -1,106 +1,32 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import regex as re
|
||||
re.DEFAULT_VERSION = re.VERSION1
|
||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||
from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
|
||||
from .char_classes import CURRENCY, UNITS
|
||||
|
||||
|
||||
_UNITS = """
|
||||
km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg
|
||||
µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb
|
||||
TB T G M K
|
||||
"""
|
||||
_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||
LIST_CURRENCY)
|
||||
|
||||
|
||||
_CURRENCY = r"""
|
||||
\$ £ € ¥ ฿ US\$ C\$ A\$
|
||||
"""
|
||||
_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
|
||||
[r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{})'.format(CURRENCY),
|
||||
r'(?<=[0-9])(?:{})'.format(UNITS),
|
||||
r'(?<=[0-9{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES),
|
||||
r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
|
||||
|
||||
|
||||
_QUOTES = r"""
|
||||
' '' " ” “ `` ` ‘ ´ ‚ , „ » «
|
||||
"""
|
||||
_infixes = (LIST_ELLIPSES +
|
||||
[r'(?<=[0-9])[+\-\*^](?=[0-9-])',
|
||||
r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||
r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA)])
|
||||
|
||||
|
||||
_PUNCT = r"""
|
||||
… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &
|
||||
"""
|
||||
|
||||
|
||||
_HYPHENS = r"""
|
||||
- – — -- ---
|
||||
"""
|
||||
|
||||
|
||||
LIST_ELLIPSES = [
|
||||
r'\.\.+',
|
||||
"…"
|
||||
]
|
||||
|
||||
|
||||
LIST_CURRENCY = list(_CURRENCY.strip().split())
|
||||
LIST_QUOTES = list(_QUOTES.strip().split())
|
||||
LIST_PUNCT = list(_PUNCT.strip().split())
|
||||
LIST_HYPHENS = list(_HYPHENS.strip().split())
|
||||
|
||||
|
||||
BENGALI = r'[\p{L}&&\p{Bengali}]'
|
||||
HEBREW = r'[\p{L}&&\p{Hebrew}]'
|
||||
LATIN_LOWER = r'[\p{Ll}&&\p{Latin}]'
|
||||
LATIN_UPPER = r'[\p{Lu}&&\p{Latin}]'
|
||||
LATIN = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
|
||||
|
||||
|
||||
ALPHA_LOWER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_LOWER]))
|
||||
ALPHA_UPPER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_UPPER]))
|
||||
ALPHA = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN]))
|
||||
|
||||
|
||||
QUOTES = _QUOTES.strip().replace(' ', '|')
|
||||
CURRENCY = _CURRENCY.strip().replace(' ', '|')
|
||||
UNITS = _UNITS.strip().replace(' ', '|').replace('\n', '|')
|
||||
HYPHENS = _HYPHENS.strip().replace(' ', '|')
|
||||
|
||||
|
||||
|
||||
# Prefixes
|
||||
|
||||
TOKENIZER_PREFIXES = (
|
||||
['§', '%', '=', r'\+'] +
|
||||
LIST_PUNCT +
|
||||
LIST_ELLIPSES +
|
||||
LIST_QUOTES +
|
||||
LIST_CURRENCY
|
||||
)
|
||||
|
||||
|
||||
# Suffixes
|
||||
|
||||
TOKENIZER_SUFFIXES = (
|
||||
LIST_PUNCT +
|
||||
LIST_ELLIPSES +
|
||||
LIST_QUOTES +
|
||||
[
|
||||
r'(?<=[0-9])\+',
|
||||
r'(?<=°[FfCcKk])\.',
|
||||
r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
|
||||
r'(?<=[0-9])(?:{u})'.format(u=UNITS),
|
||||
r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
|
||||
r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
|
||||
"'s", "'S", "’s", "’S"
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
# Infixes
|
||||
|
||||
TOKENIZER_INFIXES = (
|
||||
LIST_ELLIPSES +
|
||||
[
|
||||
r'(?<=[0-9])[+\-\*^](?=[0-9-])',
|
||||
r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
|
||||
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
|
||||
r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
|
||||
r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA)
|
||||
]
|
||||
)
|
||||
TOKENIZER_PREFIXES = _prefixes
|
||||
TOKENIZER_SUFFIXES = _suffixes
|
||||
TOKENIZER_INFIXES = _infixes
|
||||
|
|
Loading…
Reference in New Issue