From 55b46d7cf64dcb9b0206c8b5aab5468de1236280 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 5 Jan 2017 18:11:25 +0100
Subject: [PATCH] Add tokenizer tests for German

---
 spacy/tests/de/__init__.py                    |   0
 spacy/tests/de/conftest.py                    |  11 ++
 spacy/tests/de/tokenizer/__init__.py          |   0
 spacy/tests/de/tokenizer/test_exceptions.py   |  27 ++++
 .../de/tokenizer/test_prefix_suffix_infix.py  | 116 ++++++++++++++++++
 spacy/tests/de/tokenizer/test_text.py         |  40 ++++++
 6 files changed, 194 insertions(+)
 create mode 100644 spacy/tests/de/__init__.py
 create mode 100644 spacy/tests/de/conftest.py
 create mode 100644 spacy/tests/de/tokenizer/__init__.py
 create mode 100644 spacy/tests/de/tokenizer/test_exceptions.py
 create mode 100644 spacy/tests/de/tokenizer/test_prefix_suffix_infix.py
 create mode 100644 spacy/tests/de/tokenizer/test_text.py

diff --git a/spacy/tests/de/__init__.py b/spacy/tests/de/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/de/conftest.py b/spacy/tests/de/conftest.py
new file mode 100644
index 000000000..c6b8be26e
--- /dev/null
+++ b/spacy/tests/de/conftest.py
@@ -0,0 +1,11 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+from ...de import German
+
+
+@pytest.fixture
+def de_tokenizer():
+    return German.Defaults.create_tokenizer()
diff --git a/spacy/tests/de/tokenizer/__init__.py b/spacy/tests/de/tokenizer/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/spacy/tests/de/tokenizer/test_exceptions.py b/spacy/tests/de/tokenizer/test_exceptions.py
new file mode 100644
index 000000000..13da3dc33
--- /dev/null
+++ b/spacy/tests/de/tokenizer/test_exceptions.py
@@ -0,0 +1,27 @@
+# coding: utf-8
+"""Test that tokenizer exceptions and emoticons are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"])
+def test_tokenizer_splits_contractions(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."])
+def test_tokenizer_handles_abbr(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 1
+
+
+def test_tokenizer_handles_exc_in_text(de_tokenizer):
+    text = "Ich bin z.Zt. im Urlaub."
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 6
+    assert tokens[2].text == "z.Zt."
+    assert tokens[2].lemma_ == "zur Zeit"
diff --git a/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..dcf4f4ef0
--- /dev/null
+++ b/spacy/tests/de/tokenizer/test_prefix_suffix_infix.py
@@ -0,0 +1,116 @@
+# coding: utf-8
+"""Test that tokenizer prefixes, suffixes and infixes are handled correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["(unter)"])
+def test_tokenizer_splits_no_special(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["unter'm"])
+def test_tokenizer_splits_no_punct(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(unter'm"])
+def test_tokenizer_splits_prefix_punct(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["unter'm)"])
+def test_tokenizer_splits_suffix_punct(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(unter'm)"])
+def test_tokenizer_splits_even_wrap(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["(unter'm?)"])
+def test_tokenizer_splits_uneven_wrap(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 5
+
+
+@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
+def test_tokenizer_splits_prefix_interact(de_tokenizer, text, length):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text', ["z.B.)"])
+def test_tokenizer_splits_suffix_interact(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 2
+
+
+@pytest.mark.parametrize('text', ["(z.B.)"])
+def test_tokenizer_splits_even_wrap_interact(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["(z.B.?)"])
+def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 4
+
+
+@pytest.mark.parametrize('text', ["blau-rot"])
+def test_tokenizer_splits_hyphens(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
+def test_tokenizer_splits_numeric_range(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"])
+def test_tokenizer_splits_period_infix(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"])
+def test_tokenizer_splits_comma_infix(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+    assert tokens[0].text == text.split(",")[0]
+    assert tokens[1].text == ","
+    assert tokens[2].text == text.split(",")[1]
+
+
+@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"])
+def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 3
+
+
+def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
+    tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
+    assert len(tokens) == 12
+    assert tokens[0].text == "Viele"
+    assert tokens[1].text == "Regeln"
+    assert tokens[2].text == "--"
+    assert tokens[3].text == "wie"
+    assert tokens[4].text == "die"
+    assert tokens[5].text == "Bindestrich"
+    assert tokens[6].text == "-"
+    assert tokens[7].text == "Regeln"
+    assert tokens[8].text == "--"
+    assert tokens[9].text == "sind"
+    assert tokens[10].text == "kompliziert"
diff --git a/spacy/tests/de/tokenizer/test_text.py b/spacy/tests/de/tokenizer/test_text.py
new file mode 100644
index 000000000..a5cbd5383
--- /dev/null
+++ b/spacy/tests/de/tokenizer/test_text.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+"""Test that longer and mixed texts are tokenized correctly."""
+
+
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_tokenizer_handles_long_text(de_tokenizer):
+    text = """Die Verwandlung
+
+Als Gregor Samsa eines Morgens aus unruhigen Träumen erwachte, fand er sich in seinem Bett zu einem ungeheueren Ungeziefer verwandelt.
+
+Er lag auf seinem panzerartig harten Rücken und sah, wenn er den Kopf ein wenig hob, seinen gewölbten, braunen, von bogenförmigen Versteifungen geteilten Bauch, auf dessen Höhe sich die Bettdecke, zum gänzlichen Niedergleiten bereit, kaum noch erhalten konnte. Seine vielen, im Vergleich zu seinem sonstigen Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
+
+»Was ist mit mir geschehen?«, dachte er."""
+
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 104
+
+
+@pytest.mark.parametrize('text,length', [
+    ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
+    ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
+    ("Kraftfahrzeug-Haftpflichtversicherung", 3),
+    ("Vakuum-Mittelfrequenz-Induktionsofen", 5)
+    ])
+def test_tokenizer_handles_long_words(de_tokenizer, text, length):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == length
+
+
+@pytest.mark.parametrize('text,length', [
+    ("»Was ist mit mir geschehen?«, dachte er.", 12),
+    ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15)
+    ])
+def test_tokenizer_handles_examples(de_tokenizer, text, length):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == length
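
Note (not part of the patch): a minimal sketch of how the tokenizer exercised by these tests can be used on its own, assuming the spaCy 1.x API that conftest.py above relies on (German.Defaults.create_tokenizer). The example sentence and expected tokens are taken from test_tokenizer_handles_exc_in_text; the test suite itself can be run with a standard pytest invocation such as "python -m pytest spacy/tests/de".

    # coding: utf-8
    from __future__ import unicode_literals

    from spacy.de import German

    # Build a bare German tokenizer without loading any models - the same call
    # the de_tokenizer fixture in conftest.py makes.
    tokenizer = German.Defaults.create_tokenizer()

    doc = tokenizer("Ich bin z.Zt. im Urlaub.")
    print([token.text for token in doc])
    # Expected, per test_tokenizer_handles_exc_in_text:
    # ['Ich', 'bin', 'z.Zt.', 'im', 'Urlaub', '.']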