From 6aa241bcec8c9bacac5c9a793fc8a255a291cac5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=B8ren=20Lind=20Kristiansen?=
Date: Fri, 24 Nov 2017 15:03:24 +0100
Subject: [PATCH] Add day of month tokenizer exceptions for Danish.

---
Notes (not part of the commit message; skipped by `git am`):
  * Registers "1." through "31." as tokenizer exceptions so that a
    day-of-month number keeps its trailing period and stays one token.
  * The post-image hashes on the `index` lines are those of the originally
    submitted revision; apply with plain `git am` / `git apply`.

 spacy/lang/da/tokenizer_exceptions.py  | 4 ++++
 spacy/tests/lang/da/test_exceptions.py | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index 773bf1512..21ab27d7f 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -117,6 +117,10 @@ for orth in [
     "øv.", "øvr.", "årg.", "årh.", ""]:
     _exc[orth] = [{ORTH: orth}]
 
+# Days of the month ("1." .. "31."): keep the period attached to the number.
+for day in range(1, 31 + 1):
+    _exc["%d." % day] = [{ORTH: "%d." % day}]
+
 _custom_base_exc = {
     "i.": [
         {ORTH: "i", LEMMA: "i", NORM: "i"},
diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py
index d836a6b5c..71e34fc5c 100644
--- a/spacy/tests/lang/da/test_exceptions.py
+++ b/spacy/tests/lang/da/test_exceptions.py
@@ -14,6 +14,12 @@ def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text):
     tokens = da_tokenizer(text)
     assert len(tokens) == 2
 
+@pytest.mark.parametrize('text', ["1.", "10.", "31."])
+def test_da_tokenizer_handles_dates(da_tokenizer, text):
+    tokens = da_tokenizer(text)
+    assert len(tokens) == 1
+    assert tokens[0].text == text
+
 def test_da_tokenizer_handles_exc_in_text(da_tokenizer):
     text = "Det er bl.a. ikke meningen"
     tokens = da_tokenizer(text)