spaCy/spacy/lang/tokenizer_exceptions.py

# coding: utf8
from __future__ import unicode_literals
# The use of this module turns out to be important, to avoid pathological
# back-tracking. See Issue #957
import regex
from ..symbols import ORTH, POS, LEMMA, SPACE, PUNCT
# URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
# A few minor mods to this regex to account for use cases represented in test_urls
URL_PATTERN = (
    r"^"
    # in order to support the prefix tokenization (see prefix test cases in test_urls).
    r"(?=[\w])"
    # protocol identifier
    r"(?:(?:https?|ftp|mailto)://)?"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    # MH: Do we really need this? Seems excessive, and seems to have caused
    # Issue #957
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?"
    # query parameters
    r"\??(?:\S*)?"
    # in order to support the suffix tokenization (see suffix test cases in test_urls).
    r"(?<=[\w/])"
    r"$"
).strip()
TOKEN_MATCH = regex.compile(URL_PATTERN, regex.UNICODE).match
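
# A quick illustration of what TOKEN_MATCH accepts (doctest-style examples
# added for clarity; they are not part of spaCy's original module):
#
#     >>> TOKEN_MATCH("http://example.com/path?q=1") is not None
#     True
#     >>> TOKEN_MATCH("example.com") is not None
#     True
#     >>> TOKEN_MATCH("http://10.0.0.1") is None   # private ranges are excluded
#     True
#     >>> TOKEN_MATCH("not a url") is None
#     True
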
BASE_EXCEPTIONS = {}

for exc_data in [
    {ORTH: " ", POS: SPACE},
    {ORTH: "\t", POS: SPACE},
    {ORTH: "\\t", POS: SPACE},
    {ORTH: "\n", POS: SPACE},
    {ORTH: "\\n", POS: SPACE},
    {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
    {ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]:
    BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)]

for orth in [
    "'", "\\\")", "<space>", "''", "C++", "a.", "b.", "c.", "d.", "e.", "f.",
    "g.", "h.", "i.", "j.", "k.", "l.", "m.", "n.", "o.", "p.", "q.", "r.",
    "s.", "t.", "u.", "v.", "w.", "x.", "y.", "z.", "ä.", "ö.", "ü."]:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
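
# Each exception maps a surface form to a list of token attribute dicts, one
# dict per token the text should be split into, for example (illustrative
# only, not part of the original file):
#
#     BASE_EXCEPTIONS["\n"] == [{ORTH: "\n", POS: SPACE}]
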
emoticons = set("""
:)
:-)
:))
:-))
:)))
:-)))
(:
(-:
=)
(=
")
:]
:-]
[:
[-:
:o)
(o:
:}
:-}
8)
8-)
(-8
;)
;-)
(;
(-;
:(
:-(
:((
:-((
:(((
:-(((
):
)-:
=(
>:(
:')
:'-)
:'(
:'-(
:/
:-/
=/
=|
:|
:-|
:1
:P
:-P
:p
:-p
:O
:-O
:o
:-o
:0
:-0
:()
>:o
:*
:-*
:3
:-3
=3
:>
:->
:X
:-X
:x
:-x
:D
:-D
;D
;-D
=D
xD
XD
xDD
XDD
8D
8-D
^_^
^__^
^___^
>.<
>.>
<.<
._.
;_;
-_-
-__-
v.v
V.V
v_v
V_V
o_o
o_O
O_o
O_O
0_o
o_0
0_0
o.O
O.o
O.O
o.o
0.0
o.0
0.o
@_@
<3
<33
<333
</3
(^_^)
(-_-)
(._.)
(>_<)
(*_*)
(¬_¬)
ಠ_ಠ
(ಠ_ಠ)
¯\(ツ)/¯
(╯°□°）╯︵┻━┻
><(((*>
""".split())
for orth in emoticons:
    BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
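
# A minimal sketch of how these exports are typically wired into a language
# class elsewhere in spaCy (hedged illustration: the Language and
# TOKENIZER_EXCEPTIONS names below are assumed context, not defined here):
#
#     from ...util import update_exc
#
#     class Defaults(Language.Defaults):
#         # merge the shared exceptions with language-specific ones
#         tokenizer_exceptions = update_exc(BASE_EXCEPTIONS,
#                                           TOKENIZER_EXCEPTIONS)
#         # keep whole-string URLs as single tokens
#         token_match = TOKEN_MATCH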