diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index 556afc2c9..0be714dfe 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -4,13 +4,17 @@ from __future__ import unicode_literals
 import pytest
 
 
-URLS = [
+URLS_BASIC = [
     "http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region&region=top-news&WT.nav=top-news&_r=0",
-    "www.google.com?q=google",
     "www.red-stars.com",
-    "http://foo.com/blah_(wikipedia)#cite-1",
     "mailto:foo.bar@baz.com",
-    "mailto:foo-bar@baz-co.com"
+
+]
+
+URLS_FULL = URLS_BASIC + [
+    "mailto:foo-bar@baz-co.com",
+    "www.google.com?q=google",
+    "http://foo.com/blah_(wikipedia)#cite-1"
 ]
 
 
@@ -25,32 +29,14 @@ SUFFIXES = [
     '"', ":", ">"]
 
 
-@pytest.mark.parametrize("url", URLS)
+@pytest.mark.parametrize("url", URLS_BASIC)
 def test_tokenizer_handles_simple_url(tokenizer, url):
     tokens = tokenizer(url)
     assert len(tokens) == 1
     assert tokens[0].text == url
 
 
-@pytest.mark.parametrize("prefix", PREFIXES)
-@pytest.mark.parametrize("url", URLS)
-def test_tokenizer_handles_prefixed_url(tokenizer, prefix, url):
-    tokens = tokenizer(prefix + url)
-    assert len(tokens) == 2
-    assert tokens[0].text == prefix
-    assert tokens[1].text == url
-
-
-@pytest.mark.parametrize("suffix", SUFFIXES)
-@pytest.mark.parametrize("url", URLS)
-def test_tokenizer_handles_suffixed_url(tokenizer, url, suffix):
-    tokens = tokenizer(url + suffix)
-    assert len(tokens) == 2
-    assert tokens[0].text == url
-    assert tokens[1].text == suffix
-
-
-@pytest.mark.parametrize("url", URLS)
+@pytest.mark.parametrize("url", URLS_BASIC)
 def test_tokenizer_handles_simple_surround_url(tokenizer, url):
     tokens = tokenizer("(" + url + ")")
     assert len(tokens) == 3
@@ -61,8 +47,28 @@ def test_tokenizer_handles_simple_surround_url(tokenizer, url):
 
 @pytest.mark.slow
 @pytest.mark.parametrize("prefix", PREFIXES)
+@pytest.mark.parametrize("url", URLS_FULL)
+def test_tokenizer_handles_prefixed_url(tokenizer, prefix, url):
+    tokens = tokenizer(prefix + url)
+    assert len(tokens) == 2
+    assert tokens[0].text == prefix
+    assert tokens[1].text == url
+
+
+@pytest.mark.slow
 @pytest.mark.parametrize("suffix", SUFFIXES)
-@pytest.mark.parametrize("url", URLS)
+@pytest.mark.parametrize("url", URLS_FULL)
+def test_tokenizer_handles_suffixed_url(tokenizer, url, suffix):
+    tokens = tokenizer(url + suffix)
+    assert len(tokens) == 2
+    assert tokens[0].text == url
+    assert tokens[1].text == suffix
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("prefix", PREFIXES)
+@pytest.mark.parametrize("suffix", SUFFIXES)
+@pytest.mark.parametrize("url", URLS_FULL)
 def test_tokenizer_handles_surround_url(tokenizer, prefix, suffix, url):
     tokens = tokenizer(prefix + url + suffix)
     assert len(tokens) == 3
@@ -74,7 +80,7 @@
 @pytest.mark.slow
 @pytest.mark.parametrize("prefix1", PREFIXES)
 @pytest.mark.parametrize("prefix2", PREFIXES)
-@pytest.mark.parametrize("url", URLS)
+@pytest.mark.parametrize("url", URLS_FULL)
 def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url):
     tokens = tokenizer(prefix1 + prefix2 + url)
     assert len(tokens) == 3
@@ -86,7 +92,7 @@
 @pytest.mark.slow
 @pytest.mark.parametrize("suffix1", SUFFIXES)
 @pytest.mark.parametrize("suffix2", SUFFIXES)
-@pytest.mark.parametrize("url", URLS)
+@pytest.mark.parametrize("url", URLS_FULL)
 def test_tokenizer_handles_two_prefix_url(tokenizer, suffix1, suffix2, url):
     tokens = tokenizer(url + suffix1 + suffix2)
     assert len(tokens) == 3
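The patch hinges on @pytest.mark.slow: the exhaustive prefix/suffix cross-products now run over URLS_FULL only when slow tests are enabled, while the fast default run covers URLS_BASIC. As a minimal sketch of how such a marker can be made opt-in (assuming a conftest-based --slow flag; spaCy's actual conftest.py may wire this up differently), a hook like the following would skip marked tests unless the flag is passed:

    # conftest.py -- hypothetical minimal opt-in hook, for illustration only
    import pytest

    def pytest_addoption(parser):
        # Register an opt-in flag; slow tests are skipped unless it is given.
        parser.addoption("--slow", action="store_true", default=False,
                         help="include tests marked with @pytest.mark.slow")

    def pytest_runtest_setup(item):
        # Skip any test carrying the "slow" marker when the flag is absent.
        if "slow" in item.keywords and not item.config.getoption("--slow"):
            pytest.skip("need --slow option to run")

With a hook like this, "pytest spacy/tests/tokenizer/test_urls.py" exercises only the URLS_BASIC cases, and adding --slow (or whatever mechanism the real conftest uses) restores the full combinatorial run.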