diff --git a/spacy/language_data/tokenizer_exceptions.py b/spacy/language_data/tokenizer_exceptions.py
index 1208e1219..b84adb2c4 100644
--- a/spacy/language_data/tokenizer_exceptions.py
+++ b/spacy/language_data/tokenizer_exceptions.py
@@ -1,5 +1,7 @@
 from __future__ import unicode_literals
 
+# The use of this module turns out to be important, to avoid pathological
+# back-tracking. See Issue #957
 import regex
 
 # URL validation regex courtesy of: https://mathiasbynens.be/demo/url-regex
@@ -23,6 +25,8 @@ _URL_PATTERN = (
     # excludes reserved space >= 224.0.0.0
     # excludes network & broadcast addresses
     # (first & last IP address of each class)
+    # MH: Do we really need this? Seems excessive, and seems to have caused
+    # Issue #957
     r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
     r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
     r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
diff --git a/spacy/tests/regression/test_issue913.py b/spacy/tests/regression/test_issue957.py
similarity index 100%
rename from spacy/tests/regression/test_issue913.py
rename to spacy/tests/regression/test_issue957.py