From 26aee70d95792b904c64cb30b3e2625ff750e9eb Mon Sep 17 00:00:00 2001
From: Søren Lind Kristiansen
Date: Fri, 12 Jul 2019 15:20:42 +0200
Subject: [PATCH] Make Danish tokenizer split on forward slash

---
 spacy/lang/da/punctuation.py            |  3 ++-
 spacy/lang/da/tokenizer_exceptions.py   |  8 ++++++++
 spacy/tests/lang/da/test_exceptions.py  | 24 ++++++++++++++++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/spacy/lang/da/punctuation.py b/spacy/lang/da/punctuation.py
index cf18ca36b..b6b852c55 100644
--- a/spacy/lang/da/punctuation.py
+++ b/spacy/lang/da/punctuation.py
@@ -14,10 +14,11 @@ _infixes = (
     + LIST_ICONS
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )

diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index be7528b23..d669fb981 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -52,6 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -64,6 +65,8 @@ for orth in [
     "mik.",
     "pers.",
     "A.D.",
+    "A/B",
+    "a/s",
     "A/S",
     "B.C.",
     "BK.",
@@ -79,7 +82,9 @@ for orth in [
     "Kprs.",
     "L.A.",
     "Ll.",
+    "m/k",
     "m/s",
+    "m/sek.",
     "M/S",
     "Mag.",
     "Mr.",
@@ -90,6 +95,7 @@ for orth in [
     "Sdr.",
     "Skt.",
     "Spl.",
+    "TCP/IP",
     "Vg.",
 ]:
     _exc[orth] = [{ORTH: orth}]
@@ -141,6 +147,7 @@ for orth in [
     "brolægn.",
     "bto.",
     "bygn.",
+    "c/o",
     "ca.",
     "cand.",
     "d.d.",
@@ -293,6 +300,7 @@ for orth in [
     "kgl.",
     "kl.",
     "kld.",
+    "km/t",
     "knsp.",
     "komm.",
     "kons.",
diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py
index 87ea5acd6..a522ab5e8 100644
--- a/spacy/tests/lang/da/test_exceptions.py
+++ b/spacy/tests/lang/da/test_exceptions.py
@@ -43,3 +43,27 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
 def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
     tokens = da_tokenizer(text)
     assert tokens[0].norm_ == norm
+
+
+@pytest.mark.parametrize(
+    "text,n_tokens",
+    [
+        ("Godt og/eller skidt", 3),
+        ("Kør 4 km/t på vejen", 5),
+        ("Det blæser 12 m/s.", 5),
+        ("Det blæser 12 m/sek. på havnen", 6),
+        ("Windows 8/Windows 10", 5),
+        ("Billeten virker til bus/tog/metro", 8),
+        ("26/02/2019", 1),
+        ("Kristiansen c/o Madsen", 3),
+        ("Sprogteknologi a/s", 2),
+        ("De boede i A/B Bellevue", 5),
+        ("Rotorhastigheden er 3400 o/m.", 5),
+        ("Jeg købte billet t/r.", 5),
+        ("Murerarbejdsmand m/k søges", 3),
+        ("Netværket kører over TCP/IP", 4),
+    ],
+)
+def test_da_tokenizer_slash(da_tokenizer, text, n_tokens):
+    tokens = da_tokenizer(text)
+    assert len(tokens) == n_tokens
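
Note for reviewers (not part of the patch): below is a minimal sketch of the intended behaviour, mirroring the new test cases above. It assumes a spaCy 2.x checkout with this patch applied; `Danish` is the language class shipped in `spacy/lang/da`.

    # Minimal sketch (assumes spaCy 2.x with this patch applied).
    from spacy.lang.da import Danish

    nlp = Danish()  # bare language class: tokenizer only, no pipeline

    # The new infix rule splits on "/" between a letter/digit and a letter.
    print([t.text for t in nlp("Windows 8/Windows 10")])
    # ['Windows', '8', '/', 'Windows', '10']

    # The added tokenizer exceptions keep common slash abbreviations whole;
    # exceptions are matched before infix splitting, so "km/t" survives.
    print([t.text for t in nlp("Kør 4 km/t på vejen")])
    # ['Kør', '4', 'km/t', 'på', 'vejen']

    # Dates stay a single token: the infix pattern requires a letter after "/".
    print([t.text for t in nlp("26/02/2019")])
    # ['26/02/2019']

The design relies on the exception entries taking precedence over the infix rule, which is why pairs like "a/s" vs. "bus/tog/metro" can be handled differently by the same tokenizer.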