From 637f78503666f3551c199ad5b264b03d7d61bcd0 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 5 Jan 2017 16:25:38 +0100
Subject: [PATCH] Add general sanity tests for all tokenizers

---
 spacy/tests/tokenizer/conftest.py       | 23 +++++++
 spacy/tests/tokenizer/test_tokenizer.py | 80 +++++++++++++++++++++++++
 2 files changed, 103 insertions(+)
 create mode 100644 spacy/tests/tokenizer/conftest.py
 create mode 100644 spacy/tests/tokenizer/test_tokenizer.py

diff --git a/spacy/tests/tokenizer/conftest.py b/spacy/tests/tokenizer/conftest.py
new file mode 100644
index 000000000..c8e340208
--- /dev/null
+++ b/spacy/tests/tokenizer/conftest.py
@@ -0,0 +1,23 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+from ...en import English
+from ...de import German
+from ...es import Spanish
+from ...it import Italian
+from ...fr import French
+from ...pt import Portuguese
+from ...nl import Dutch
+from ...sv import Swedish
+from ...hu import Hungarian
+
+
+LANGUAGES = [English, German, Spanish, Italian, French, Dutch, Swedish, Hungarian]
+
+
+@pytest.fixture(params=LANGUAGES)
+def tokenizer(request):
+    lang = request.param
+    return lang.Defaults.create_tokenizer()
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
new file mode 100644
index 000000000..49bfdcb26
--- /dev/null
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -0,0 +1,80 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_no_word(tokenizer):
+    tokens = tokenizer("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.parametrize('text', ["lorem"])
+def test_tokenizer_handles_single_word(tokenizer, text):
+    tokens = tokenizer(text)
+    assert tokens[0].text == text
+
+
+@pytest.mark.parametrize('text', ["lorem ipsum"])
+def test_tokenizer_handles_two_words(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 2
+    assert tokens[0].text != tokens[1].text
+
+
+@pytest.mark.parametrize('text', ["lorem  ipsum"])
+def test_tokenizer_splits_double_space(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[1].text == " "
+
+
+@pytest.mark.parametrize('text', ["lorem\nipsum"])
+def test_tokenizer_splits_newline(tokenizer, text):
+    tokens = tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[1].text == "\n"
+
+
+def test_tokenizer_handles_punct(tokenizer):
+    text = "Lorem, ipsum."
+    tokens = tokenizer(text)
+    assert len(tokens) == 4
+    assert tokens[0].text == "Lorem"
+    assert tokens[1].text == ","
+    assert tokens[2].text == "ipsum"
+    assert tokens[1].text != "Lorem"
+
+
+def test_tokenizer_handles_digits(tokenizer):
+    exceptions = ["hu"]
+    text = "Lorem ipsum: 1984."
+    tokens = tokenizer(text)
+
+    if tokens[0].lang_ not in exceptions:
+        assert len(tokens) == 5
+        assert tokens[0].text == "Lorem"
+        assert tokens[3].text == "1984"
+
+
+def test_tokenizer_handles_long_text(tokenizer):
+    text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit
+
+Cras egestas orci non porttitor maximus.
+Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.
+
+Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.
+
+"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo."""
+
+    tokens = tokenizer(text)
+    assert len(tokens) > 5
+
+
+def test_tokenizer_suspected_freeing_strings(tokenizer):
+    text1 = "Lorem dolor sit amet, consectetur adipiscing elit."
+    text2 = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+    tokens1 = tokenizer(text1)
+    tokens2 = tokenizer(text2)
+    assert tokens1[0].text == "Lorem"
+    assert tokens2[0].text == "Lorem"
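
Because the tokenizer fixture in conftest.py is declared with @pytest.fixture(params=LANGUAGES), pytest instantiates the fixture once per language, so every test in test_tokenizer.py runs against each listed tokenizer. As a rough sketch of what one parametrized case boils down to (illustrative only, assuming the spaCy 1.x layout this patch targets, where the relative "from ...en import English" resolves to spacy.en):

    # Rough equivalent of test_tokenizer_handles_punct for English only (illustrative).
    from spacy.en import English

    tokenizer = English.Defaults.create_tokenizer()
    tokens = tokenizer("Lorem, ipsum.")
    assert len(tokens) == 4
    assert tokens[0].text == "Lorem"
    assert tokens[1].text == ","

Running pytest over spacy/tests/tokenizer/ would then collect each of these tests once per language in LANGUAGES.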