mirror of https://github.com/explosion/spaCy.git
214 lines
3.1 KiB
Python
214 lines
3.1 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals
|
|
|
|
# The use of this module turns out to be important, to avoid pathological
|
|
# back-tracking. See Issue #957
|
|
import regex as re
|
|
|
|
from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT
|
|
|
|
|
|
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
|
|
# A few minor mods to this regex to account for use cases represented in test_urls
|
|
URL_PATTERN = (
    r"^"
    # lookahead: the match must start on a word character, in order to support
    # the prefix tokenization (see prefix test cases in test_urls).
    r"(?=[\w])"
    # protocol identifier (optional, so bare hosts like "google.com" match)
    r"(?:(?:https?|ftp|mailto)://)?"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    # MH: Do we really need this? Seems excessive, and seems to have caused
    # Issue #957
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?"
    # query parameters -- "(?:" fixes the original "(:?" typo, which created an
    # unintended capturing group matching an optional literal colon; the set of
    # accepted strings is unchanged because "\S*" already subsumes the colon.
    r"\??(?:\S*)?"
    # lookbehind: the match must end on a word character or "/", in order to
    # support the suffix tokenization (see suffix test cases in test_urls).
    r"(?<=[\w/])"
    r"$"
)
# NOTE: the previous trailing .strip() was a no-op (the literal has no
# surrounding whitespace) and has been dropped.

# Pre-bound match function used to recognise URL-like tokens in one call.
# re.UNICODE is passed explicitly so \w and \S cover the Unicode ranges used
# in the character classes above (relevant under Python 2 semantics).
TOKEN_MATCH = re.compile(URL_PATTERN, re.UNICODE).match
|
|
|
|
|
|
|
|
# Tokenizer exceptions shared across languages, keyed by the exception's
# surface string (ORTH). Each value is a list of token attribute dicts, one
# dict per token the string should produce. Seeded here with whitespace-like
# strings (including the two-character literal escapes "\t" and "\n"), the
# em-dash, and the non-breaking space.
BASE_EXCEPTIONS = {
    exc[ORTH]: [dict(exc)]
    for exc in (
        {ORTH: " ", POS: SPACE},
        {ORTH: "\t", POS: SPACE},
        {ORTH: "\\t", POS: SPACE},
        {ORTH: "\n", POS: SPACE},
        {ORTH: "\\n", POS: SPACE},
        {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
        {ORTH: "\u00a0", POS: SPACE, LEMMA: " "},
    )
}
|
|
|
|
|
|
# Strings that must stay a single token: stray quote/bracket combinations,
# "C++", and single-letter abbreviations ending in a period.
BASE_EXCEPTIONS.update(
    (token, [{ORTH: token}])
    for token in (
        "'", "\\\")", "<space>", "''", "C++",
        "a.", "b.", "c.", "d.", "e.", "f.", "g.", "h.", "i.", "j.",
        "k.", "l.", "m.", "n.", "o.", "p.", "q.", "r.", "s.", "t.",
        "u.", "v.", "w.", "x.", "y.", "z.", "ä.", "ö.", "ü.",
    )
)
|
|
|
|
|
|
emoticons = set("""
|
|
:)
|
|
:-)
|
|
:))
|
|
:-))
|
|
:)))
|
|
:-)))
|
|
(:
|
|
(-:
|
|
=)
|
|
(=
|
|
")
|
|
:]
|
|
:-]
|
|
[:
|
|
[-:
|
|
:o)
|
|
(o:
|
|
:}
|
|
:-}
|
|
8)
|
|
8-)
|
|
(-8
|
|
;)
|
|
;-)
|
|
(;
|
|
(-;
|
|
:(
|
|
:-(
|
|
:((
|
|
:-((
|
|
:(((
|
|
:-(((
|
|
):
|
|
)-:
|
|
=(
|
|
>:(
|
|
:')
|
|
:'-)
|
|
:'(
|
|
:'-(
|
|
:/
|
|
:-/
|
|
=/
|
|
=|
|
|
:|
|
|
:-|
|
|
:1
|
|
:P
|
|
:-P
|
|
:p
|
|
:-p
|
|
:O
|
|
:-O
|
|
:o
|
|
:-o
|
|
:0
|
|
:-0
|
|
:()
|
|
>:o
|
|
:*
|
|
:-*
|
|
:3
|
|
:-3
|
|
=3
|
|
:>
|
|
:->
|
|
:X
|
|
:-X
|
|
:x
|
|
:-x
|
|
:D
|
|
:-D
|
|
;D
|
|
;-D
|
|
=D
|
|
xD
|
|
XD
|
|
xDD
|
|
XDD
|
|
8D
|
|
8-D
|
|
|
|
^_^
|
|
^__^
|
|
^___^
|
|
>.<
|
|
>.>
|
|
<.<
|
|
._.
|
|
;_;
|
|
-_-
|
|
-__-
|
|
v.v
|
|
V.V
|
|
v_v
|
|
V_V
|
|
o_o
|
|
o_O
|
|
O_o
|
|
O_O
|
|
0_o
|
|
o_0
|
|
0_0
|
|
o.O
|
|
O.o
|
|
O.O
|
|
o.o
|
|
0.0
|
|
o.0
|
|
0.o
|
|
@_@
|
|
<3
|
|
<33
|
|
<333
|
|
</3
|
|
(^_^)
|
|
(-_-)
|
|
(._.)
|
|
(>_<)
|
|
(*_*)
|
|
(¬_¬)
|
|
ಠ_ಠ
|
|
ಠ︵ಠ
|
|
(ಠ_ಠ)
|
|
¯\(ツ)/¯
|
|
(╯°□°)╯︵┻━┻
|
|
><(((*>
|
|
""".split())
|
|
|
|
|
|
# Register every emoticon as a one-token exception carrying only its surface
# form (ORTH).
BASE_EXCEPTIONS.update(
    (emoticon, [{ORTH: emoticon}]) for emoticon in emoticons
)
|