From 69eab727d7fbb9c766ed42a0fa07ba6c2851cebd Mon Sep 17 00:00:00 2001
From: Magnus Burton
Date: Thu, 19 Jan 2017 14:08:52 +0100
Subject: [PATCH] Added loops to handle contractions with verbs

---
 spacy/sv/tokenizer_exceptions.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/spacy/sv/tokenizer_exceptions.py b/spacy/sv/tokenizer_exceptions.py
index 43732612f..a7fc33b18 100644
--- a/spacy/sv/tokenizer_exceptions.py
+++ b/spacy/sv/tokenizer_exceptions.py
@@ -4,6 +4,31 @@ from __future__ import unicode_literals
 from ..symbols import *
 from ..language_data import PRON_LEMMA
 
+# Verbs
+
+# EXC must exist before the loop below fills it; it is not defined
+# anywhere visible in this module, which would raise NameError on import.
+EXC = {}
+
+for verb_data in [
+    {ORTH: "driver"},
+    {ORTH: "kör"},
+    {ORTH: "hörr", LEMMA: "hör"},
+    {ORTH: "fattar"},
+    {ORTH: "hajar", LEMMA: "förstår"},
+    {ORTH: "lever"},
+    {ORTH: "serr", LEMMA: "ser"},
+    {ORTH: "fixar"}
+]:
+    # Also register the title-cased variant (e.g. "Fattaru").
+    verb_data_tc = dict(verb_data)
+    verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
+
+    for data in [verb_data, verb_data_tc]:
+        EXC[data[ORTH] + "u"] = [
+            dict(data),
+            {ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"}
+        ]
 
 TOKENIZER_EXCEPTIONS = {
     "jan.": [