Add missing EXC variable and combine tokenizer exceptions

This commit is contained in:
ines 2017-02-05 11:42:52 +01:00
parent 30a52d576b
commit 6715615d55
1 changed file with 9 additions and 1 deletion

View File

@ -4,6 +4,9 @@ from __future__ import unicode_literals
from ..symbols import *
from ..language_data import PRON_LEMMA
# Starts out empty; its contents are copied into TOKENIZER_EXCEPTIONS
# further down in this module.
# NOTE(review): presumably populated by the verb loop below -- the loop body
# is not visible in this view, confirm against the full file.
EXC = {}
# Verbs
for verb_data in [
@ -25,7 +28,8 @@ for verb_data in [
{ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"}
]
TOKENIZER_EXCEPTIONS = {
ABBREVIATIONS = {
"jan.": [
{ORTH: "jan.", LEMMA: "januari"}
],
@ -149,6 +153,10 @@ TOKENIZER_EXCEPTIONS = {
}
# Build the combined table from a shallow copy of EXC, so EXC itself is not
# mutated, then layer ABBREVIATIONS on top. If a key appears in both, the
# ABBREVIATIONS entry replaces the one copied from EXC.
TOKENIZER_EXCEPTIONS = dict(EXC)
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
ORTH_ONLY = [
"ang.",
"anm.",