spaCy/spacy/lang/tokenizer_exceptions.py

import re

from .char_classes import ALPHA_LOWER
from ..symbols import ORTH, POS, TAG, LEMMA, SPACE


# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# and https://gist.github.com/dperini/729294 (Diego Perini, MIT License)
# A few mods to this regex to account for use cases represented in test_urls
URL_PATTERN = (
    # fmt: off
    r"^"
    # protocol identifier (mods: make optional and expand schemes)
    # (see: https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml)
    r"(?:(?:[\w\+\-\.]{2,})://)?"
    # mailto:user or user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    # MH: Do we really need this? Seems excessive, and seems to have caused
    # Issue #957
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host & domain names
    # mods: match is case-sensitive, so include [A-Z]
      "(?:"
        "(?:"
          "[A-Za-z0-9\u00a1-\uffff]"
          "[A-Za-z0-9\u00a1-\uffff_-]{0,62}"
        ")?"
        "[A-Za-z0-9\u00a1-\uffff]\."
      ")+"
    # TLD identifier
    # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match
    # strings like "lower.Upper", which can be split on "." by infixes in some
    # languages
    r"(?:[" + ALPHA_LOWER + "]{2,63})"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:[/?#]\S*)?"
    r"$"
    # fmt: on
).strip()

TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match


BASE_EXCEPTIONS = {}


for exc_data in [
    {ORTH: " ", POS: SPACE, TAG: "_SP"},
    {ORTH: "\t", POS: SPACE, TAG: "_SP"},
    {ORTH: "\\t", POS: SPACE, TAG: "_SP"},
    {ORTH: "\n", POS: SPACE, TAG: "_SP"},
    {ORTH: "\\n", POS: SPACE, TAG: "_SP"},
    {ORTH: "\u2014"},
    {ORTH: "\u00a0", POS: SPACE, LEMMA: "  ", TAG: "_SP"},
]:
    BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]


for orth in [
    "'",
    '\\")',
    "<space>",
    "''",
    "C++",
    "a.",
    "b.",
    "c.",
    "d.",
    "e.",
    "f.",
    "g.",
    "h.",
    "i.",
    "j.",
    "k.",
    "l.",
    "m.",
    "n.",
    "o.",
    "p.",
    "q.",
    "r.",
    "s.",
    "t.",
    "u.",
    "v.",
    "w.",
    "x.",
    "y.",
    "z.",
    "ä.",
    "ö.",
    "ü.",
]:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]


emoticons = set(
    r"""
:)
:-)
:))
:-))
:)))
:-)))
(:
(-:
=)
(=
:]
:-]
[:
[-:
:o)
(o:
:}
:-}
8)
8-)
(-8
;)
;-)
(;
(-;
:(
:-(
:((
:-((
:(((
:-(((
):
)-:
=(
>:(
:')
:'-)
:'(
:'-(
:/
:-/
=/
=|
:|
:-|
:1
:P
:-P
:p
:-p
:O
:-O
:o
:-o
:0
:-0
:()
>:o
:*
:-*
:3
:-3
=3
:>
:->
:X
:-X
:x
:-x
:D
:-D
;D
;-D
=D
xD
XD
xDD
XDD
8D
8-D

^_^
^__^
^___^
>.<
>.>
<.<
._.
;_;
-_-
-__-
v.v
V.V
v_v
V_V
o_o
o_O
O_o
O_O
0_o
o_0
0_0
o.O
O.o
O.O
o.o
0.0
o.0
0.o
@_@
<3
<33
<333
</3
(^_^)
(-_-)
(._.)
(>_<)
(*_*)
(¬_¬)
ಠ_ಠ
ಠ︵ಠ
(ಠ_ಠ)
¯\(ツ)/¯
(╯°□°）╯︵┻━┻
><(((*>
""".split()
)


for orth in emoticons:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]