From 1914c488d34c0c0e8b8ac160c2a3a0f34d7fa3c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emil=20Stenstr=C3=B6m?= Date: Sun, 5 Aug 2018 14:14:30 +0200 Subject: [PATCH] Swedish: Exceptions for single letter words ending sentence (#2615) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Exceptions for single letter words ending sentence Sentences ending in "i." (as in "... peka i.") or "m." (as in "...än 2000 m.") should be tokenized as two separate tokens. * Add test --- spacy/lang/sv/tokenizer_exceptions.py | 8 +++++++- spacy/tests/lang/sv/test_tokenizer.py | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index 4f84ef9b5..aa03e61cb 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA +from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA, PUNCT _exc = {} @@ -78,5 +78,11 @@ for orth in [ "s.k.", "st.", "s:t", "t.ex.", "t.o.m.", "ung.", "äv.", "övers."]: _exc[orth] = [{ORTH: orth}] +# Sentences ending in "i." (as in "... peka i."), "m." (as in "...än 2000 m."), +# should be tokenized as two separate tokens. +for orth in ["i", "m"]: + _exc[orth + "."] = [ + {ORTH: orth, LEMMA: orth, NORM: orth}, + {ORTH: ".", TAG: PUNCT}] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/tests/lang/sv/test_tokenizer.py b/spacy/tests/lang/sv/test_tokenizer.py index dbb3e1dd1..7f81e250e 100644 --- a/spacy/tests/lang/sv/test_tokenizer.py +++ b/spacy/tests/lang/sv/test_tokenizer.py @@ -6,7 +6,8 @@ import pytest SV_TOKEN_EXCEPTION_TESTS = [ ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']), - ('Jag kommer först kl. 
13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']), + ('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."]) ]