Improved tokenization for UD_Norwegian-Bokmaal

Adriane Boyd 2019-12-02 13:48:27 +01:00
parent 494ec23adb
commit 79737adb90
3 changed files with 109 additions and 38 deletions

View File

@@ -2,6 +2,8 @@
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from .syntax_iterators import SYNTAX_ITERATORS
@@ -21,6 +23,9 @@ class NorwegianDefaults(Language.Defaults):
        Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    prefixes = TOKENIZER_PREFIXES
    infixes = TOKENIZER_INFIXES
    suffixes = TOKENIZER_SUFFIXES
    stop_words = STOP_WORDS
    morph_rules = MORPH_RULES
    tag_map = TAG_MAP
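
For reference, a minimal sketch of how the Defaults attributes above feed the tokenizer. This assumes the spaCy v2.x API of this commit; spacy.blank() and the compile_*_regex helpers are real spacy.util functions, and the sample strings are illustrative only.

import spacy
from spacy.util import compile_prefix_regex, compile_suffix_regex

# spacy.blank("nb") builds its Tokenizer from NorwegianDefaults, so the
# prefixes/infixes/suffixes assigned above are picked up automatically.
nlp = spacy.blank("nb")

# The same compilation step the tokenizer performs internally:
prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

print(prefix_re.search("§12"))    # "§" is split off as a prefix
print(suffix_re.search("boka'"))  # the trailing "'" is split off as a suffix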

View File

@@ -1,16 +1,33 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES
from ..char_classes import CURRENCY, PUNCT, UNITS, LIST_CURRENCY
# Punctuation stolen from Danish
# Punctuation adapted from Danish
_quotes = CONCAT_QUOTES.replace("'", "")
_list_punct = [x for x in LIST_PUNCT if x != "#"]
_list_icons = [x for x in LIST_ICONS if x != "°"]
_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
_list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
_prefixes = (
["§", "%", "=", "", "", r"\+(?![0-9])"]
+ _list_punct
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_CURRENCY
+ LIST_ICONS
)
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + _list_icons
    + [
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
@@ -21,13 +38,26 @@ _infixes = (
    ]
)
_suffixes = [
    suffix
    for suffix in TOKENIZER_SUFFIXES
    if suffix not in ["'s", "'S", "’s", "’S", r"\'"]
]
_suffixes += [r"(?<=[^sSxXzZ])\'"]
_suffixes = (
    LIST_PUNCT
    + LIST_ELLIPSES
    + _list_quotes
    + _list_icons
    + ["—", "–"]
    + [
        r"(?<=[0-9])\+",
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
        r"(?<=[{al}{e}{p}(?:{q})])\.".format(
            al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
    + [r"(?<=[^sSxXzZ])'"]
)
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes
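
The most user-visible change here is the final suffix entry: a trailing apostrophe is now split off a token unless it follows s, x, or z, where Norwegian orthography uses it as a genitive marker (e.g. "Anders' bil"). A small standalone check of that rule with plain re; the example words are illustrative:

import re

# The new suffix rule, anchored at the end of a token candidate:
apostrophe_suffix = re.compile(r"(?<=[^sSxXzZ])'$")

print(bool(apostrophe_suffix.search("boka'")))    # True: apostrophe split off
print(bool(apostrophe_suffix.search("Anders'")))  # False: genitive apostrophe kept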

View File

@@ -24,57 +24,80 @@ for exc_data in [
for orth in [
"adm.dir.",
"a.m.",
"andelsnr",
"Ap.",
"Aq.",
"Ca.",
"Chr.",
"Co.",
"Co.",
"Dr.",
"F.eks.",
"Fr.p.",
"Frp.",
"Grl.",
"Kr.",
"Kr.F.",
"Kr.F.s",
"Mr.",
"Mrs.",
"Pb.",
"Pr.",
"Sp.",
"Sp.",
"St.",
"a.m.",
"ad.",
"adm.dir.",
"andelsnr",
"b.c.",
"bl.a.",
"bla.",
"bm.",
"bnr.",
"bto.",
"c.c.",
"ca.",
"cand.mag.",
"c.c.",
"co.",
"d.d.",
"dept.",
"d.m.",
"dr.philos.",
"dvs.",
"d.y.",
"E. coli",
"dept.",
"dr.",
"dr.med.",
"dr.philos.",
"dr.psychol.",
"dvs.",
"e.Kr.",
"e.l.",
"eg.",
"ekskl.",
"e.Kr.",
"el.",
"e.l.",
"et.",
"etc.",
"etg.",
"ev.",
"evt.",
"f.",
"f.Kr.",
"f.eks.",
"f.o.m.",
"fhv.",
"fk.",
"f.Kr.",
"f.o.m.",
"foreg.",
"fork.",
"fv.",
"fvt.",
"g.",
"gt.",
"gl.",
"gno.",
"gnr.",
"grl.",
"gt.",
"h.r.adv.",
"hhv.",
"hoh.",
"hr.",
"h.r.adv.",
"ifb.",
"ifm.",
"iht.",
@@ -83,39 +106,45 @@ for orth in [
"jf.",
"jr.",
"jun.",
"juris.",
"kfr.",
"kgl.",
"kgl.res.",
"kl.",
"komm.",
"kr.",
"kst.",
"lat.",
"lø.",
"m.a.o.",
"m.fl.",
"m.m.",
"m.v.",
"ma.",
"mag.art.",
"m.a.o.",
"md.",
"mfl.",
"mht.",
"mill.",
"min.",
"m.m.",
"mnd.",
"moh.",
"Mr.",
"mrd.",
"muh.",
"mv.",
"mva.",
"n.å.",
"ndf.",
"no.",
"nov.",
"nr.",
"nto.",
"nyno.",
"n.å.",
"o.a.",
"o.l.",
"off.",
"ofl.",
"okt.",
"o.l.",
"on.",
"op.",
"org.",
@@ -123,14 +152,15 @@ for orth in [
"ovf.",
"p.",
"p.a.",
"Pb.",
"p.g.a.",
"p.m.",
"p.t.",
"pga.",
"ph.d.",
"pkt.",
"p.m.",
"pr.",
"pst.",
"p.t.",
"pt.",
"red.anm.",
"ref.",
"res.",
@@ -139,6 +169,10 @@ for orth in [
"rv.",
"s.",
"s.d.",
"s.k.",
"s.k.",
"s.u.",
"s.å.",
"sen.",
"sep.",
"siviling.",
@@ -148,16 +182,17 @@ for orth in [
"sr.",
"sst.",
"st.",
"stip.",
"stk.",
"st.meld.",
"st.prp.",
"stip.",
"stk.",
"stud.",
"s.u.",
"sv.",
"sø.",
"s.å.",
"såk.",
"sø.",
"t.h.",
"t.o.m.",
"t.v.",
"temp.",
"ti.",
"tils.",
@@ -165,7 +200,6 @@ for orth in [
"tl;dr",
"tlf.",
"to.",
"t.o.m.",
"ult.",
"utg.",
"v.",
@@ -179,8 +213,10 @@ for orth in [
"vol.",
"vs.",
"vsa.",
"©NTB",
"årg.",
"årh.",
"§§",
]:
    _exc[orth] = [{ORTH: orth}]
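
Each entry above maps the abbreviation to a single-token analysis via ORTH, so the tokenizer no longer splits it at its internal or trailing periods. A quick way to see the exceptions in effect, assuming a spaCy install that includes this commit; the sentence is illustrative:

import spacy

nlp = spacy.blank("nb")
doc = nlp("Vi kan f.eks. reise t.o.m. fredag.")
print([t.text for t in doc])
# ['Vi', 'kan', 'f.eks.', 'reise', 't.o.m.', 'fredag', '.']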