From 35100c8bddc830998558e30f4a0b000f6d78e7f0 Mon Sep 17 00:00:00 2001
From: Michael Wallin
Date: Sat, 4 Feb 2017 16:21:34 +0200
Subject: [PATCH 1/2] [issue 805] Add regression test and the required fixture

---
 spacy/tests/conftest.py                 |  5 +++++
 spacy/tests/regression/test_issue805.py | 15 +++++++++++++++
 2 files changed, 20 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue805.py

diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 3d9e0adcc..d21d7d313 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -68,6 +68,11 @@ def fi_tokenizer():
     return Finnish.Defaults.create_tokenizer()
 
 
+@pytest.fixture
+def sv_tokenizer():
+    return Swedish.Defaults.create_tokenizer()
+
+
 @pytest.fixture
 def stringstore():
     return StringStore()

diff --git a/spacy/tests/regression/test_issue805.py b/spacy/tests/regression/test_issue805.py
new file mode 100644
index 000000000..f23aff426
--- /dev/null
+++ b/spacy/tests/regression/test_issue805.py
@@ -0,0 +1,15 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+SV_TOKEN_EXCEPTION_TESTS = [
+    ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']),
+    ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar'])
+]
+
+@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS)
+def test_issue805(sv_tokenizer, text, expected_tokens):
+    tokens = sv_tokenizer(text)
+    token_list = [token.text for token in tokens if not token.is_space]
+    assert expected_tokens == token_list

From d25556bf8093ea69dc8c64bd769015a930bc0497 Mon Sep 17 00:00:00 2001
From: Michael Wallin
Date: Sat, 4 Feb 2017 16:22:21 +0200
Subject: [PATCH 2/2] [issue 805] Fix issue

---
 spacy/sv/language_data.py        | 6 ++++--
 spacy/sv/tokenizer_exceptions.py | 1 -
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/spacy/sv/language_data.py b/spacy/sv/language_data.py
index a4a657c33..324351d06 100644
--- a/spacy/sv/language_data.py
+++ b/spacy/sv/language_data.py
@@ -5,12 +5,14 @@ from .. import language_data as base
 from ..language_data import update_exc, strings_to_exc
 
 from .stop_words import STOP_WORDS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 
 
 STOP_WORDS = set(STOP_WORDS)
 
-
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
 
 

diff --git a/spacy/sv/tokenizer_exceptions.py b/spacy/sv/tokenizer_exceptions.py
index d8d4e8823..b9705c39c 100644
--- a/spacy/sv/tokenizer_exceptions.py
+++ b/spacy/sv/tokenizer_exceptions.py
@@ -107,7 +107,6 @@ ORTH_ONLY = [
     "p.g.a.",
     "ref.",
     "resp.",
-    "s.",
     "s.a.s.",
     "s.k.",
     "st.",
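
A quick way to exercise the fix end to end is to build the tokenizer the same way the new `sv_tokenizer` fixture does. A minimal sketch, assuming a spaCy 1.x source checkout with both patches applied, where `spacy.sv` exposes the `Swedish` language class:

    # Sketch only, mirroring the regression test above; assumes a spaCy 1.x
    # tree with these patches applied and spacy.sv exposing Swedish.
    from spacy.sv import Swedish

    tokenizer = Swedish.Defaults.create_tokenizer()
    doc = tokenizer('Smörsåsen används bl.a. till fisk')
    print([t.text for t in doc if not t.is_space])
    # Expected after the fix: ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']
    # Before the fix, language_data.py built TOKENIZER_EXCEPTIONS from the
    # shared emoticons and base abbreviations only, so the Swedish ORTH_ONLY
    # entries such as 'bl.a.' were never merged into the exception table.

The second patch wires all three sources into one table: the Swedish `TOKENIZER_EXCEPTIONS` dict is copied, then the `ORTH_ONLY` strings, the shared emoticons, and the base abbreviations are merged in via `update_exc`.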