From ece30c28a88e55f29a22fbe5f93a9988cec44836 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 16 Sep 2017 20:40:15 +0200
Subject: [PATCH] Don't split hyphenated words in German

This way, the tokenizer matches the tokenization in German treebanks.
---
 spacy/lang/de/__init__.py                          |  2 ++
 spacy/lang/de/punctuation.py                       | 20 ++++++++++++++++
 .../tests/lang/de/test_prefix_suffix_infix.py      | 24 +++++++++----------
 spacy/tests/lang/de/test_text.py                   | 14 +++++------
 4 files changed, 40 insertions(+), 20 deletions(-)
 create mode 100644 spacy/lang/de/punctuation.py

diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py
index b8a7580a0..1c64541e6 100644
--- a/spacy/lang/de/__init__.py
+++ b/spacy/lang/de/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
 from .lemmatizer import LOOKUP
@@ -23,6 +24,7 @@ class GermanDefaults(Language.Defaults):
     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
                                          NORM_EXCEPTIONS, BASE_NORMS)
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    infixes = tuple(TOKENIZER_INFIXES)
     tag_map = dict(TAG_MAP)
     stop_words = set(STOP_WORDS)
     syntax_iterators = dict(SYNTAX_ITERATORS)
diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py
new file mode 100644
index 000000000..7024ed118
--- /dev/null
+++ b/spacy/lang/de/punctuation.py
@@ -0,0 +1,20 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..char_classes import LIST_ELLIPSES, LIST_ICONS
+from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
+
+
+_quotes = QUOTES.replace("'", '')
+
+_infixes = (LIST_ELLIPSES + LIST_ICONS +
+            [r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
+             r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])'.format(a=ALPHA, q=_quotes),
+             r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA),
+             r'(?<=[0-9])-(?=[0-9])'])
+
+
+TOKENIZER_INFIXES = _infixes
diff --git a/spacy/tests/lang/de/test_prefix_suffix_infix.py b/spacy/tests/lang/de/test_prefix_suffix_infix.py
index dcf4f4ef0..bdc68037e 100644
--- a/spacy/tests/lang/de/test_prefix_suffix_infix.py
+++ b/spacy/tests/lang/de/test_prefix_suffix_infix.py
@@ -67,12 +67,6 @@ def test_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text):
     assert len(tokens) == 4
 
 
-@pytest.mark.parametrize('text', ["blau-rot"])
-def test_tokenizer_splits_hyphens(de_tokenizer, text):
-    tokens = de_tokenizer(text)
-    assert len(tokens) == 3
-
-
 @pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"])
 def test_tokenizer_splits_numeric_range(de_tokenizer, text):
     tokens = de_tokenizer(text)
@@ -100,17 +94,21 @@ def test_tokenizer_splits_ellipsis_infix(de_tokenizer, text):
     assert len(tokens) == 3
 
 
+@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt'])
+def test_tokenizer_keeps_hyphens(de_tokenizer, text):
+    tokens = de_tokenizer(text)
+    assert len(tokens) == 1
+
+
 def test_tokenizer_splits_double_hyphen_infix(de_tokenizer):
     tokens = de_tokenizer("Viele Regeln--wie die Bindestrich-Regeln--sind kompliziert.")
-    assert len(tokens) == 12
+    assert len(tokens) == 10
     assert tokens[0].text == "Viele"
     assert tokens[1].text == "Regeln"
     assert tokens[2].text == "--"
     assert tokens[3].text == "wie"
     assert tokens[4].text == "die"
-    assert tokens[5].text == "Bindestrich"
-    assert tokens[6].text == "-"
-    assert tokens[7].text == "Regeln"
-    assert tokens[8].text == "--"
-    assert tokens[9].text == "sind"
-    assert tokens[10].text == "kompliziert"
+    assert tokens[5].text == "Bindestrich-Regeln"
+    assert tokens[6].text == "--"
+    assert tokens[7].text == "sind"
+    assert tokens[8].text == "kompliziert"
diff --git a/spacy/tests/lang/de/test_text.py b/spacy/tests/lang/de/test_text.py
index 84fa6f2a5..34180b982 100644
--- a/spacy/tests/lang/de/test_text.py
+++ b/spacy/tests/lang/de/test_text.py
@@ -25,15 +25,15 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen.
     assert len(tokens) == 109
 
 
-@pytest.mark.parametrize('text,length', [
-    ("Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", 1),
-    ("Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", 1),
-    ("Kraftfahrzeug-Haftpflichtversicherung", 3),
-    ("Vakuum-Mittelfrequenz-Induktionsofen", 5)
+@pytest.mark.parametrize('text', [
+    "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten",
+    "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz",
+    "Kraftfahrzeug-Haftpflichtversicherung",
+    "Vakuum-Mittelfrequenz-Induktionsofen"
 ])
-def test_tokenizer_handles_long_words(de_tokenizer, text, length):
+def test_tokenizer_handles_long_words(de_tokenizer, text):
     tokens = de_tokenizer(text)
-    assert len(tokens) == length
+    assert len(tokens) == 1
 
 
 @pytest.mark.parametrize('text,length', [
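
Note for reviewers (not part of the patch): a minimal sketch of the behaviour
change, assuming a checkout with this patch applied. It uses the blank
`German` language class from `spacy.lang.de`, whose tokenizer picks up
`GermanDefaults.infixes`:

    from spacy.lang.de import German

    nlp = German()
    # Hyphenated compounds now stay together, as in the German treebanks
    assert [t.text for t in nlp('Islam-Konferenz')] == ['Islam-Konferenz']
    # Double hyphens between letters are still split by the infix rules
    assert [t.text for t in nlp('blau--rot')] == ['blau', '--', 'rot']
    # A hyphen between digits is still treated as an infix (numeric ranges)
    assert [t.text for t in nlp('0.1-13.5')] == ['0.1', '-', '13.5']

The plain letter-hyphen-letter infix rule is simply absent from the new
German infixes, while `(?<=[0-9])-(?=[0-9])` keeps splitting numeric ranges.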