spaCy/spacy/lang/da/tokenizer_exceptions.py

# encoding: utf8
# Tokenizer exceptions for Danish: abbreviations that should be kept as
# single tokens instead of being split at the trailing period.
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, NORM

_exc = {}

# Month and place-name abbreviations with a full lemma/norm form.
for exc_data in [
    {ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
    {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"},
    {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"},
    {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"},
    {ORTH: "Apr.", LEMMA: "april", NORM: "april"},
    {ORTH: "Maj.", LEMMA: "maj", NORM: "maj"},
    {ORTH: "Jun.", LEMMA: "juni", NORM: "juni"},
    {ORTH: "Jul.", LEMMA: "juli", NORM: "juli"},
    {ORTH: "Aug.", LEMMA: "august", NORM: "august"},
    {ORTH: "Sep.", LEMMA: "september", NORM: "september"},
    {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"},
    {ORTH: "Nov.", LEMMA: "november", NORM: "november"},
    {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]:
    _exc[exc_data[ORTH]] = [exc_data]

# Abbreviations kept as single tokens without a special lemma/norm form.
for orth in [
    "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.",
    "if.", "iflg.", "m.a.o.", "mht.", "min.", "osv.", "pga.", "resp.", "self.",
    "t.o.m.", "vha."]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = _exc
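
For context, a minimal usage sketch of how these exceptions behave at runtime, assuming a spaCy v2-era install where the Danish language class is importable as spacy.lang.da.Danish (the sample sentence is made up for illustration). Strings such as "Kbh." and "f.eks." match the ORTH keys above, so the tokenizer keeps them as single tokens instead of splitting off the trailing period.

    from spacy.lang.da import Danish

    nlp = Danish()  # blank Danish pipeline; loads TOKENIZER_EXCEPTIONS
    doc = nlp("Vi ses i Kbh. i januar, f.eks. i uge 2.")
    print([t.text for t in doc])  # "Kbh." and "f.eks." remain single tokens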