diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py index 12016c273..6ed981a06 100644 --- a/spacy/lang/lex_attrs.py +++ b/spacy/lang/lex_attrs.py @@ -3,6 +3,7 @@ import unicodedata import re from .. import attrs +from .tokenizer_exceptions import URL_MATCH _like_email = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)").match @@ -109,6 +110,8 @@ def like_url(text: str) -> bool: return True if tld.isalpha() and tld in _tlds: return True + if URL_MATCH(text): + return True return False diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index b39109455..6a7a404fd 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -58,9 +58,10 @@ def test_lex_attrs_is_currency(text, match): ("www.google.com", True), ("google.com", True), ("sydney.com", True), - ("2girls1cup.org", True), + ("1abc2def.org", True), ("http://stupid", True), ("www.hi", True), + ("example.com/example", True), ("dog", False), ("1.2", False), ("1.a", False),