diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 086761f82..e6c58b7de 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES +from .punctuation import TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES from .syntax_iterators import SYNTAX_ITERATORS @@ -21,6 +23,9 @@ class NorwegianDefaults(Language.Defaults): Language.Defaults.lex_attr_getters[NORM], BASE_NORMS ) tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + prefixes = TOKENIZER_PREFIXES + infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS morph_rules = MORPH_RULES tag_map = TAG_MAP diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index b49aa9838..7672809ec 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -1,16 +1,33 @@ # coding: utf8 from __future__ import unicode_literals -from ..char_classes import LIST_ELLIPSES, LIST_ICONS +from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER -from ..punctuation import TOKENIZER_SUFFIXES +from ..char_classes import CURRENCY, PUNCT, UNITS, LIST_CURRENCY -# Punctuation stolen from Danish + +# Punctuation adapted from Danish _quotes = CONCAT_QUOTES.replace("'", "") +_list_punct = [x for x in LIST_PUNCT if x != "#"] +_list_icons = [x for x in LIST_ICONS if x != "°"] +_list_icons = [x.replace("\\u00B0", "") for x in _list_icons] +_list_quotes = [x for x in LIST_QUOTES if x != "\\'"] + + +_prefixes = ( + ["§", "%", "=", "—", "–", r"\+(?![0-9])"] + + _list_punct + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_CURRENCY + + LIST_ICONS +) + + _infixes = ( LIST_ELLIPSES - + LIST_ICONS + + _list_icons + [ r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER), r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA), @@ -21,13 +38,26 @@ _infixes = ( ] ) -_suffixes = [ - suffix - for suffix in TOKENIZER_SUFFIXES - if suffix not in ["'s", "'S", "’s", "’S", r"\'"] -] -_suffixes += [r"(?<=[^sSxXzZ])\'"] +_suffixes = ( + LIST_PUNCT + + LIST_ELLIPSES + + _list_quotes + + _list_icons + + ["—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=_quotes, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] + + [r"(?<=[^sSxXzZ])'"] +) +TOKENIZER_PREFIXES = _prefixes TOKENIZER_INFIXES = _infixes TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index 92ac09841..3f4aa79f6 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -24,57 +24,80 @@ for exc_data in [ for orth in [ - "adm.dir.", - "a.m.", - "andelsnr", + "Ap.", "Aq.", + "Ca.", + "Chr.", + "Co.", + "Co.", + "Dr.", + "F.eks.", + "Fr.p.", + "Frp.", + "Grl.", + "Kr.", + "Kr.F.", + "Kr.F.s", + "Mr.", + "Mrs.", + "Pb.", + "Pr.", + "Sp.", + "Sp.", + "St.", + "a.m.", + "ad.", + "adm.dir.", + "andelsnr", "b.c.", "bl.a.", "bla.", "bm.", "bnr.", "bto.", + "c.c.", "ca.", "cand.mag.", - "c.c.", "co.", "d.d.", - "dept.", "d.m.", - "dr.philos.", - "dvs.", "d.y.", - "E. coli", + "dept.", + "dr.", + "dr.med.", + "dr.philos.", + "dr.psychol.", + "dvs.", + "e.Kr.", + "e.l.", "eg.", "ekskl.", - "e.Kr.", "el.", - "e.l.", "et.", "etc.", "etg.", "ev.", "evt.", "f.", + "f.Kr.", "f.eks.", + "f.o.m.", "fhv.", "fk.", - "f.Kr.", - "f.o.m.", "foreg.", "fork.", "fv.", "fvt.", "g.", - "gt.", "gl.", "gno.", "gnr.", "grl.", + "gt.", + "h.r.adv.", "hhv.", "hoh.", "hr.", - "h.r.adv.", "ifb.", "ifm.", "iht.", @@ -83,39 +106,45 @@ for orth in [ "jf.", "jr.", "jun.", + "juris.", "kfr.", + "kgl.", "kgl.res.", "kl.", "komm.", "kr.", "kst.", + "lat.", "lø.", + "m.a.o.", + "m.fl.", + "m.m.", + "m.v.", "ma.", "mag.art.", - "m.a.o.", "md.", "mfl.", + "mht.", "mill.", "min.", - "m.m.", "mnd.", "moh.", - "Mr.", + "mrd.", "muh.", "mv.", "mva.", + "n.å.", "ndf.", "no.", "nov.", "nr.", "nto.", "nyno.", - "n.å.", "o.a.", + "o.l.", "off.", "ofl.", "okt.", - "o.l.", "on.", "op.", "org.", @@ -123,14 +152,15 @@ for orth in [ "ovf.", "p.", "p.a.", - "Pb.", + "p.g.a.", + "p.m.", + "p.t.", "pga.", "ph.d.", "pkt.", - "p.m.", "pr.", "pst.", - "p.t.", + "pt.", "red.anm.", "ref.", "res.", @@ -139,6 +169,10 @@ for orth in [ "rv.", "s.", "s.d.", + "s.k.", + "s.k.", + "s.u.", + "s.å.", "sen.", "sep.", "siviling.", @@ -148,16 +182,17 @@ for orth in [ "sr.", "sst.", "st.", - "stip.", - "stk.", "st.meld.", "st.prp.", + "stip.", + "stk.", "stud.", - "s.u.", "sv.", - "sø.", - "s.å.", "såk.", + "sø.", + "t.h.", + "t.o.m.", + "t.v.", "temp.", "ti.", "tils.", @@ -165,7 +200,6 @@ for orth in [ "tl;dr", "tlf.", "to.", - "t.o.m.", "ult.", "utg.", "v.", @@ -179,8 +213,10 @@ for orth in [ "vol.", "vs.", "vsa.", + "©NTB", "årg.", "årh.", + "§§", ]: _exc[orth] = [{ORTH: orth}]