diff --git a/spacy/sv/tokenizer_exceptions.py b/spacy/sv/tokenizer_exceptions.py index e0b1f20c3..2b01857a2 100644 --- a/spacy/sv/tokenizer_exceptions.py +++ b/spacy/sv/tokenizer_exceptions.py @@ -4,6 +4,9 @@ from __future__ import unicode_literals from ..symbols import * from ..language_data import PRON_LEMMA + +EXC = {} + # Verbs for verb_data in [ @@ -25,7 +28,8 @@ for verb_data in [ {ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"} ] -TOKENIZER_EXCEPTIONS = { + +ABBREVIATIONS = { "jan.": [ {ORTH: "jan.", LEMMA: "januari"} ], @@ -149,6 +153,10 @@ TOKENIZER_EXCEPTIONS = { } +TOKENIZER_EXCEPTIONS = dict(EXC) +TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS) + + ORTH_ONLY = [ "ang.", "anm.",