Improved tokenization for UD_Norwegian-Bokmaal

This commit is contained in:
Adriane Boyd 2019-12-02 13:48:27 +01:00
parent 494ec23adb
commit 79737adb90
3 changed files with 109 additions and 38 deletions

View File

@ -2,6 +2,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from .punctuation import TOKENIZER_SUFFIXES
from .stop_words import STOP_WORDS from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES from .morph_rules import MORPH_RULES
from .syntax_iterators import SYNTAX_ITERATORS from .syntax_iterators import SYNTAX_ITERATORS
@ -21,6 +23,9 @@ class NorwegianDefaults(Language.Defaults):
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
) )
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
prefixes = TOKENIZER_PREFIXES
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS stop_words = STOP_WORDS
morph_rules = MORPH_RULES morph_rules = MORPH_RULES
tag_map = TAG_MAP tag_map = TAG_MAP

View File

@ -1,16 +1,33 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_SUFFIXES from ..char_classes import CURRENCY, PUNCT, UNITS, LIST_CURRENCY
# Punctuation stolen from Danish
# Punctuation adapted from Danish
_quotes = CONCAT_QUOTES.replace("'", "") _quotes = CONCAT_QUOTES.replace("'", "")
_list_punct = [x for x in LIST_PUNCT if x != "#"]
_list_icons = [x for x in LIST_ICONS if x != "°"]
_list_icons = [x.replace("\\u00B0", "") for x in _list_icons]
_list_quotes = [x for x in LIST_QUOTES if x != "\\'"]
_prefixes = (
["§", "%", "=", "—", "–", r"\+(?![0-9])"]
+ _list_punct
+ LIST_ELLIPSES
+ LIST_QUOTES
+ LIST_CURRENCY
+ LIST_ICONS
)
_infixes = ( _infixes = (
LIST_ELLIPSES LIST_ELLIPSES
+ LIST_ICONS + _list_icons
+ [ + [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
@ -21,13 +38,26 @@ _infixes = (
] ]
) )
_suffixes = [ _suffixes = (
suffix LIST_PUNCT
for suffix in TOKENIZER_SUFFIXES + LIST_ELLIPSES
if suffix not in ["'s", "'S", "s", "S", r"\'"] + _list_quotes
] + _list_icons
_suffixes += [r"(?<=[^sSxXzZ])\'"] + ["", ""]
+ [
r"(?<=[0-9])\+",
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{p}(?:{q})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
]
+ [r"(?<=[^sSxXzZ])'"]
)
TOKENIZER_PREFIXES = _prefixes
TOKENIZER_INFIXES = _infixes TOKENIZER_INFIXES = _infixes
TOKENIZER_SUFFIXES = _suffixes TOKENIZER_SUFFIXES = _suffixes

View File

@ -24,57 +24,80 @@ for exc_data in [
for orth in [ for orth in [
"adm.dir.", "Ap.",
"a.m.",
"andelsnr",
"Aq.", "Aq.",
"Ca.",
"Chr.",
"Co.",
"Co.",
"Dr.",
"F.eks.",
"Fr.p.",
"Frp.",
"Grl.",
"Kr.",
"Kr.F.",
"Kr.F.s",
"Mr.",
"Mrs.",
"Pb.",
"Pr.",
"Sp.",
"Sp.",
"St.",
"a.m.",
"ad.",
"adm.dir.",
"andelsnr",
"b.c.", "b.c.",
"bl.a.", "bl.a.",
"bla.", "bla.",
"bm.", "bm.",
"bnr.", "bnr.",
"bto.", "bto.",
"c.c.",
"ca.", "ca.",
"cand.mag.", "cand.mag.",
"c.c.",
"co.", "co.",
"d.d.", "d.d.",
"dept.",
"d.m.", "d.m.",
"dr.philos.",
"dvs.",
"d.y.", "d.y.",
"E. coli", "dept.",
"dr.",
"dr.med.",
"dr.philos.",
"dr.psychol.",
"dvs.",
"e.Kr.",
"e.l.",
"eg.", "eg.",
"ekskl.", "ekskl.",
"e.Kr.",
"el.", "el.",
"e.l.",
"et.", "et.",
"etc.", "etc.",
"etg.", "etg.",
"ev.", "ev.",
"evt.", "evt.",
"f.", "f.",
"f.Kr.",
"f.eks.", "f.eks.",
"f.o.m.",
"fhv.", "fhv.",
"fk.", "fk.",
"f.Kr.",
"f.o.m.",
"foreg.", "foreg.",
"fork.", "fork.",
"fv.", "fv.",
"fvt.", "fvt.",
"g.", "g.",
"gt.",
"gl.", "gl.",
"gno.", "gno.",
"gnr.", "gnr.",
"grl.", "grl.",
"gt.",
"h.r.adv.",
"hhv.", "hhv.",
"hoh.", "hoh.",
"hr.", "hr.",
"h.r.adv.",
"ifb.", "ifb.",
"ifm.", "ifm.",
"iht.", "iht.",
@ -83,39 +106,45 @@ for orth in [
"jf.", "jf.",
"jr.", "jr.",
"jun.", "jun.",
"juris.",
"kfr.", "kfr.",
"kgl.",
"kgl.res.", "kgl.res.",
"kl.", "kl.",
"komm.", "komm.",
"kr.", "kr.",
"kst.", "kst.",
"lat.",
"lø.", "lø.",
"m.a.o.",
"m.fl.",
"m.m.",
"m.v.",
"ma.", "ma.",
"mag.art.", "mag.art.",
"m.a.o.",
"md.", "md.",
"mfl.", "mfl.",
"mht.",
"mill.", "mill.",
"min.", "min.",
"m.m.",
"mnd.", "mnd.",
"moh.", "moh.",
"Mr.", "mrd.",
"muh.", "muh.",
"mv.", "mv.",
"mva.", "mva.",
"n.å.",
"ndf.", "ndf.",
"no.", "no.",
"nov.", "nov.",
"nr.", "nr.",
"nto.", "nto.",
"nyno.", "nyno.",
"n.å.",
"o.a.", "o.a.",
"o.l.",
"off.", "off.",
"ofl.", "ofl.",
"okt.", "okt.",
"o.l.",
"on.", "on.",
"op.", "op.",
"org.", "org.",
@ -123,14 +152,15 @@ for orth in [
"ovf.", "ovf.",
"p.", "p.",
"p.a.", "p.a.",
"Pb.", "p.g.a.",
"p.m.",
"p.t.",
"pga.", "pga.",
"ph.d.", "ph.d.",
"pkt.", "pkt.",
"p.m.",
"pr.", "pr.",
"pst.", "pst.",
"p.t.", "pt.",
"red.anm.", "red.anm.",
"ref.", "ref.",
"res.", "res.",
@ -139,6 +169,10 @@ for orth in [
"rv.", "rv.",
"s.", "s.",
"s.d.", "s.d.",
"s.k.",
"s.k.",
"s.u.",
"s.å.",
"sen.", "sen.",
"sep.", "sep.",
"siviling.", "siviling.",
@ -148,16 +182,17 @@ for orth in [
"sr.", "sr.",
"sst.", "sst.",
"st.", "st.",
"stip.",
"stk.",
"st.meld.", "st.meld.",
"st.prp.", "st.prp.",
"stip.",
"stk.",
"stud.", "stud.",
"s.u.",
"sv.", "sv.",
"sø.",
"s.å.",
"såk.", "såk.",
"sø.",
"t.h.",
"t.o.m.",
"t.v.",
"temp.", "temp.",
"ti.", "ti.",
"tils.", "tils.",
@ -165,7 +200,6 @@ for orth in [
"tl;dr", "tl;dr",
"tlf.", "tlf.",
"to.", "to.",
"t.o.m.",
"ult.", "ult.",
"utg.", "utg.",
"v.", "v.",
@ -179,8 +213,10 @@ for orth in [
"vol.", "vol.",
"vs.", "vs.",
"vsa.", "vsa.",
"©NTB",
"årg.", "årg.",
"årh.", "årh.",
"§§",
]: ]:
_exc[orth] = [{ORTH: orth}] _exc[orth] = [{ORTH: orth}]