From 81d3a1edb11a914e8854193137900d3cd80ad3dd Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 27 Jul 2021 12:07:01 +0200
Subject: [PATCH] Use tokenizer URL_MATCH pattern in LIKE_URL (#8765)

---
 spacy/lang/lex_attrs.py        | 3 +++
 spacy/tests/lang/test_attrs.py | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 12016c273..6ed981a06 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -3,6 +3,7 @@ import unicodedata
 import re
 
 from .. import attrs
+from .tokenizer_exceptions import URL_MATCH
 
 
 _like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match
@@ -109,6 +110,8 @@ def like_url(text: str) -> bool:
         return True
     if tld.isalpha() and tld in _tlds:
         return True
+    if URL_MATCH(text):
+        return True
     return False
 
 
diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py
index b39109455..6a7a404fd 100644
--- a/spacy/tests/lang/test_attrs.py
+++ b/spacy/tests/lang/test_attrs.py
@@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match):
         ("www.google.com", True),
         ("google.com", True),
         ("sydney.com", True),
-        ("2girls1cup.org", True),
+        ("1abc2def.org", True),
         ("http://stupid", True),
         ("www.hi", True),
+        ("example.com/example", True),
         ("dog", False),
         ("1.2", False),
         ("1.a", False),
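
Note (not part of the patch itself): a short, illustrative sketch of how like_url
behaves once the URL_MATCH fallback above is in place. The import path comes from
the file touched by this patch; the expected results mirror the test cases in
spacy/tests/lang/test_attrs.py, and the standalone script around them is only an
assumption for demonstration.

    from spacy.lang.lex_attrs import like_url

    # A URL with a path is now recognized via the URL_MATCH fallback; the
    # TLD allowlist check alone does not cover it, which is why this exact
    # case is added to the tests in this patch.
    print(like_url("example.com/example"))  # True
    print(like_url("www.google.com"))       # True (www. prefix check)
    print(like_url("dog"))                  # False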