From 7d7b65ffd42c56ce3a0aa73b18196eb20a1dcc24 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 26 Aug 2020 04:00:49 +0200 Subject: [PATCH] Fix raw strings in URL pattern (#5972) Add missing raw string specifiers. --- spacy/lang/tokenizer_exceptions.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 13140a230..c903448b0 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -37,13 +37,13 @@ URL_PATTERN = ( r"|" # host & domain names # mods: match is case-sensitive, so include [A-Z] - "(?:" # noqa - "(?:" - "[A-Za-z0-9\u00a1-\uffff]" - "[A-Za-z0-9\u00a1-\uffff_-]{0,62}" - ")?" - "[A-Za-z0-9\u00a1-\uffff]\." - ")+" + r"(?:" # noqa + r"(?:" + r"[A-Za-z0-9\u00a1-\uffff]" + r"[A-Za-z0-9\u00a1-\uffff_-]{0,62}" + r")?" + r"[A-Za-z0-9\u00a1-\uffff]\." + r")+" # TLD identifier # mods: use ALPHA_LOWER instead of a wider range so that this doesn't match # strings like "lower.Upper", which can be split on "." by infixes in some