diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 13140a230..c903448b0 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -37,13 +37,13 @@ URL_PATTERN = ( r"|" # host & domain names # mods: match is case-sensitive, so include [A-Z] - "(?:" # noqa - "(?:" - "[A-Za-z0-9\u00a1-\uffff]" - "[A-Za-z0-9\u00a1-\uffff_-]{0,62}" - ")?" - "[A-Za-z0-9\u00a1-\uffff]\." - ")+" + r"(?:" # noqa + r"(?:" + r"[A-Za-z0-9\u00a1-\uffff]" + r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}" + r")?" + r"[A-Za-z0-9\u00a1-\uffff]\." + r")+" # TLD identifier # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match # strings like "lower.Upper", which can be split on "." by infixes in some