From fdf4776262b978a710a73a95e67c8dca87f855e7 Mon Sep 17 00:00:00 2001 From: Magnus Burton Date: Thu, 22 Dec 2016 22:45:18 +0100 Subject: [PATCH 1/2] Added Swedish abbreviations --- spacy/sv/tokenizer_exceptions.py | 76 ++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 spacy/sv/tokenizer_exceptions.py diff --git a/spacy/sv/tokenizer_exceptions.py b/spacy/sv/tokenizer_exceptions.py new file mode 100644 index 000000000..ab2691eda --- /dev/null +++ b/spacy/sv/tokenizer_exceptions.py @@ -0,0 +1,76 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +TOKENIZER_EXCEPTIONS = { + +} + + +ORTH_ONLY = [ + "ang.", + "anm.", + "bil.", + "bl.a.", + "ca", + "cm", + "dl", + "dvs.", + "e.Kr.", + "el.", + "e.d.", + "eng.", + "etc.", + "exkl.", + "f.d.", + "fid.", + "f.Kr.", + "forts.", + "fr.o.m.", + "f.ö.", + "förf.", + "ha", + "hg", + "inkl.", + "i sht", + "i st", + "jmf", + "jur.", + "kcal", + "kg", + "kl.", + "km", + "kr.", + "l", + "lat.", + "m", + "m.a.o.", + "max.", + "m.fl.", + "min.", + "mm", + "m.m.", + "ngn", + "ngt", + "nr", + "obs.", + "o.d.", + "osv.", + "p.g.a.", + "ref.", + "resp.", + "s.", + "s.a.s.", + "s.k.", + "st.", + "s:t", + "t.ex.", + "t.o.m.", + "tfn", + "ung.", + "äv.", + "övers." +] From 7f411fd01c931b73f717b114934662ebb2739555 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 23 Dec 2016 14:30:06 +0100 Subject: [PATCH 2/2] Remove exceptions containing whitespace / no special chars --- spacy/sv/tokenizer_exceptions.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/spacy/sv/tokenizer_exceptions.py b/spacy/sv/tokenizer_exceptions.py index ab2691eda..6cf144b44 100644 --- a/spacy/sv/tokenizer_exceptions.py +++ b/spacy/sv/tokenizer_exceptions.py @@ -15,9 +15,6 @@ ORTH_ONLY = [ "anm.", "bil.", "bl.a.", - "ca", - "cm", - "dl", "dvs.", "e.Kr.", "el.", @@ -32,30 +29,16 @@ ORTH_ONLY = [ "fr.o.m.", "f.ö.", "förf.", - "ha", - "hg", "inkl.", - "i sht", - "i st", - "jmf", "jur.", - "kcal", - "kg", "kl.", - "km", "kr.", - "l", "lat.", - "m", "m.a.o.", "max.", "m.fl.", "min.", - "mm", "m.m.", - "ngn", - "ngt", - "nr", "obs.", "o.d.", "osv.", @@ -69,7 +52,6 @@ ORTH_ONLY = [ "s:t", "t.ex.", "t.o.m.", - "tfn", "ung.", "äv.", "övers."