mirror of https://github.com/explosion/spaCy.git
Add missing EXC variable and combine tokenizer exceptions
This commit is contained in:
parent
30a52d576b
commit
6715615d55
|
@ -4,6 +4,9 @@ from __future__ import unicode_literals
|
||||||
from ..symbols import *
|
from ..symbols import *
|
||||||
from ..language_data import PRON_LEMMA
|
from ..language_data import PRON_LEMMA
|
||||||
|
|
||||||
|
|
||||||
|
EXC = {}
|
||||||
|
|
||||||
# Verbs
|
# Verbs
|
||||||
|
|
||||||
for verb_data in [
|
for verb_data in [
|
||||||
|
@ -25,7 +28,8 @@ for verb_data in [
|
||||||
{ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"}
|
{ORTH: "u", LEMMA: PRON_LEMMA, NORM: "du"}
|
||||||
]
|
]
|
||||||
|
|
||||||
TOKENIZER_EXCEPTIONS = {
|
|
||||||
|
ABBREVIATIONS = {
|
||||||
"jan.": [
|
"jan.": [
|
||||||
{ORTH: "jan.", LEMMA: "januari"}
|
{ORTH: "jan.", LEMMA: "januari"}
|
||||||
],
|
],
|
||||||
|
@ -149,6 +153,10 @@ TOKENIZER_EXCEPTIONS = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
TOKENIZER_EXCEPTIONS = dict(EXC)
|
||||||
|
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
|
||||||
|
|
||||||
|
|
||||||
ORTH_ONLY = [
|
ORTH_ONLY = [
|
||||||
"ang.",
|
"ang.",
|
||||||
"anm.",
|
"anm.",
|
||||||
|
|
Loading…
Reference in New Issue