Reorganise global punctuation rules

2017-05-09 00:00:46 +02:00 · 2017-05-09 00:00:46 +02:00 · 014bda0ae3
parent a91278cb32
commit 014bda0ae3
1 changed file with 21 additions and 95 deletions
--- a/spacy/lang/punctuation.py
+++ b/spacy/lang/punctuation.py
@@ -1,106 +1,32 @@
 # coding: utf8
 from __future__ import unicode_literals
-import regex as re
+from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
-re.DEFAULT_VERSION = re.VERSION1
+from .char_classes import ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS, QUOTES
 from .char_classes import CURRENCY, UNITS
-_UNITS = """
+_prefixes = (['§', '%', '=', r'\+'] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-km km² km³ m m² m³ dm dm² dm³ cm cm² cm³ mm mm² mm³ ha µm nm yd in ft kg g mg
+             LIST_CURRENCY)
 µg t lb oz m/s km/h kmh mph hPa Pa mbar mb MB kb KB gb GB tb
 TB T G M K
 """
-_CURRENCY = r"""
+_suffixes = (["'s", "'S", "’s", "’S"] + LIST_PUNCT + LIST_ELLIPSES + LIST_QUOTES +
-\$ £ € ¥ ฿ US\$ C\$ A\$
+             [r'(?<=[0-9])\+',
-"""
+              r'(?<=°[FfCcKk])\.',
              r'(?<=[0-9])(?:{})'.format(CURRENCY),
              r'(?<=[0-9])(?:{})'.format(UNITS),
              r'(?<=[0-9{}{}(?:{})])\.'.format(ALPHA_LOWER, r'%²\-\)\]\+', QUOTES),
              r'(?<=[{a}][{a}])\.'.format(a=ALPHA_UPPER)])
-_QUOTES = r"""
+_infixes = (LIST_ELLIPSES +
-' '' " ” “ `` ` ‘ ´ ‚ , „ » «
+            [r'(?<=[0-9])[+\-\*^](?=[0-9-])',
-"""
+             r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
             r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
             r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA)])
-_PUNCT = r"""
+TOKENIZER_PREFIXES = _prefixes
-… , : ; \! \? ¿ ¡ \( \) \[ \] \{ \} < > _ # \* &
+TOKENIZER_SUFFIXES = _suffixes
-"""
+TOKENIZER_INFIXES = _infixes
 _HYPHENS = r"""
 - – — -- ---
 """
 LIST_ELLIPSES = [
    r'\.\.+',
    "…"
 ]
 LIST_CURRENCY = list(_CURRENCY.strip().split())
 LIST_QUOTES = list(_QUOTES.strip().split())
 LIST_PUNCT = list(_PUNCT.strip().split())
 LIST_HYPHENS = list(_HYPHENS.strip().split())
 BENGALI = r'[\p{L}&&\p{Bengali}]'
 HEBREW = r'[\p{L}&&\p{Hebrew}]'
 LATIN_LOWER = r'[\p{Ll}&&\p{Latin}]'
 LATIN_UPPER = r'[\p{Lu}&&\p{Latin}]'
 LATIN = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]'
 ALPHA_LOWER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_LOWER]))
 ALPHA_UPPER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_UPPER]))
 ALPHA = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN]))
 QUOTES = _QUOTES.strip().replace(' ', '|')
 CURRENCY = _CURRENCY.strip().replace(' ', '|')
 UNITS = _UNITS.strip().replace(' ', '|').replace('\n', '|')
 HYPHENS = _HYPHENS.strip().replace(' ', '|')
 # Prefixes
 TOKENIZER_PREFIXES = (
    ['§', '%', '=', r'\+'] +
    LIST_PUNCT +
    LIST_ELLIPSES +
    LIST_QUOTES +
    LIST_CURRENCY
 )
 # Suffixes
 TOKENIZER_SUFFIXES = (
    LIST_PUNCT +
    LIST_ELLIPSES +
    LIST_QUOTES +
    [
        r'(?<=[0-9])\+',
        r'(?<=°[FfCcKk])\.',
        r'(?<=[0-9])(?:{c})'.format(c=CURRENCY),
        r'(?<=[0-9])(?:{u})'.format(u=UNITS),
        r'(?<=[0-9{al}{p}(?:{q})])\.'.format(al=ALPHA_LOWER, p=r'%²\-\)\]\+', q=QUOTES),
        r'(?<=[{au}][{au}])\.'.format(au=ALPHA_UPPER),
        "'s", "'S", "’s", "’S"
    ]
 )
 # Infixes
 TOKENIZER_INFIXES = (
    LIST_ELLIPSES +
    [
        r'(?<=[0-9])[+\-\*^](?=[0-9-])',
        r'(?<=[{al}])\.(?=[{au}])'.format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
        r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=ALPHA, h=HYPHENS),
        r'(?<=[{a}"])[:<>=/](?=[{a}])'.format(a=ALPHA)
    ]
 )