Add missing EXC variable and combine tokenizer exceptions

This commit is contained in:
ines 2017-02-05 11:42:52 +01:00
parent 30a52d576b
commit 6715615d55
1 changed file with 9 additions and 1 deletion

View File

@@ -4,6 +4,9 @@ from __future__ import unicode_literals
from ..symbols import * from ..symbols import *
from ..language_data import PRON_LEMMA from ..language_data import PRON_LEMMA
EXC = {}
# Verbs # Verbs
for verb_data in [ for verb_data in [
@@ -25,7 +28,8 @@ for verb_data in [
{ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"} {ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"}
] ]
TOKENIZER_EXCEPTIONS = {
ABBREVIATIONS = {
"jan.": [ "jan.": [
{ORTH: "jan.", LEMMA: "januari"} {ORTH: "jan.", LEMMA: "januari"}
], ],
@@ -149,6 +153,10 @@ TOKENIZER_EXCEPTIONS = {
} }
TOKENIZER_EXCEPTIONS = dict(EXC)
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
ORTH_ONLY = [ ORTH_ONLY = [
"ang.", "ang.",
"anm.", "anm.",