spaCy/spacy/nl/language_data.py

844 lines
15 KiB
Python
Raw Normal View History

# encoding: utf8
from __future__ import unicode_literals
import re
# Dutch stop words, taken from
# http://www.damienvanholten.com/downloads/dutch-stop-words.txt
# The words are whitespace-separated and grouped by initial letter purely for
# readability; str.split() discards the layout before the set is built.
STOP_WORDS = set(
    """
    aan af al alles als altijd andere
    ben bij
    daar dan dat de der deze die dit doch doen door dus
    een eens en er
    ge geen geweest
    haar had heb hebben heeft hem het hier hij hoe hun
    iemand iets ik in is
    ja je
    kan kon kunnen
    maar me meer men met mij mijn moet
    na naar niet niets nog nu
    of om omdat ons ook op over
    reeds
    te tegen toch toen tot
    u uit uw
    van veel voor
    want waren was wat we wel werd wezen wie wij wil worden
    zal ze zei zelf zich zij zijn zo zonder zou
    """.split()
)
# Literal prefix strings, one per line of the raw block below, each passed
# through re.escape so it can be used safely inside a regular expression.
# Built as a real list rather than a bare map(): on Python 3 map() returns a
# one-shot iterator, which would be empty after its first traversal.
TOKENIZER_PREFIXES = [re.escape(prefix) for prefix in r'''
,
"
(
[
{
*
<
>
$
£
'
``
`
#
US$
C$
A$
a-
....
...
»
_
§
'''.strip().split('\n')]
# Regular-expression fragments for token suffixes, one pattern per line of the
# raw block below. The "(?<=[0-9])unit" entries match a unit abbreviation only
# when it directly follows a digit.
# The square-metre and cubic-metre entries previously consisted of the bare
# lookbehind "(?<=[0-9])" with no unit after it; they are restored to "m²" and
# "m³" to complete the km/m/cm/mm series.
TOKENIZER_SUFFIXES = r'''
,
\"
\)
\]
\}
\*
\!
\?
%
\$
>
:
;
'
«
_
''
's
'S
s
S
°
\.\.
\.\.\.
\.\.\.\.
(?<=[a-zäöüßÖÄÜ)\]"'´«‘’%\)²“”])\.
\-\-
´
(?<=[0-9])km²
(?<=[0-9])m²
(?<=[0-9])cm²
(?<=[0-9])mm²
(?<=[0-9])km³
(?<=[0-9])m³
(?<=[0-9])cm³
(?<=[0-9])mm³
(?<=[0-9])ha
(?<=[0-9])km
(?<=[0-9])m
(?<=[0-9])cm
(?<=[0-9])mm
(?<=[0-9])µm
(?<=[0-9])nm
(?<=[0-9])yd
(?<=[0-9])in
(?<=[0-9])ft
(?<=[0-9])kg
(?<=[0-9])g
(?<=[0-9])mg
(?<=[0-9])µg
(?<=[0-9])t
(?<=[0-9])lb
(?<=[0-9])oz
(?<=[0-9])m/s
(?<=[0-9])km/h
(?<=[0-9])mph
(?<=[0-9])°C
(?<=[0-9])°K
(?<=[0-9])°F
(?<=[0-9])hPa
(?<=[0-9])Pa
(?<=[0-9])mbar
(?<=[0-9])mb
(?<=[0-9])T
(?<=[0-9])G
(?<=[0-9])M
(?<=[0-9])K
(?<=[0-9])kb
'''.strip().split('\n')
# Regular-expression fragments for infixes. Apart from the literal ellipsis,
# each pattern uses lookbehind/lookahead so that only the punctuation
# character itself (".", ":", ">", "<" or "=") is matched when it sits
# directly between the listed character classes.
TOKENIZER_INFIXES = [
    r'\.\.\.',
    r'(?<=[a-z])\.(?=[A-Z])',
    r'(?<=[a-zöäüßA-ZÖÄÜ"]):(?=[a-zöäüßA-ZÖÄÜ])',
    r'(?<=[a-zöäüßA-ZÖÄÜ"])>(?=[a-zöäüßA-ZÖÄÜ])',
    r'(?<=[a-zöäüßA-ZÖÄÜ"])<(?=[a-zöäüßA-ZÖÄÜ])',
    r'(?<=[a-zöäüßA-ZÖÄÜ"])=(?=[a-zöäüßA-ZÖÄÜ])',
]
# TODO: Add tokenizer exceptions for Dutch; the mapping is empty for now.
TOKENIZER_EXCEPTIONS = {}
# (tag, pos) pairs for Dutch, in the order they appear in TAG_MAP. Each tag
# string is paired with the value stored under its "pos" key.
_TAG_POS_PAIRS = (
    ("VNW(pers,pron,nomin,red,3p,ev,masc)", "PRON"),
    ("VNW(pers,pron,obl,vol,3,ev,masc)", "PRON"),
    ("N(soort,ev,basis,gen)", "NOUN"),
    ("WW(pv,tgw,mv)", "VERB"),
    ("VNW(pers,pron,obl,vol,2v,ev)", "PRON"),
    ("LID(onbep,stan,agr)", "DET"),
    ("VNW(pers,pron,stan,nadr,2v,mv)", "PRON"),
    ("VNW(onbep,pron,stan,vol,3o,ev)", "PRON"),
    ("LID(bep,dial)", "DET"),
    ("VNW(pers,pron,nomin,red,1,ev)", "PRON"),
    ("WW(inf,nom,zonder,zonder-n)", "VERB"),
    ("VNW(pr,pron,obl,vol,1,ev)", "PRON"),
    ("SPEC(enof)", "X"),
    ("VNW(onbep,det,stan,nom,met-e,mv-n)", "PRON"),
    ("VNW(onbep,det,stan,nom,met-e,zonder-n)", "PRON"),
    ("VNW(vb,det,stan,prenom,zonder,evon)", "PRON"),
    ("VNW(bez,det,stan,vol,1,mv,prenom,zonder,evon)", "PRON"),
    ("VNW(onbep,grad,stan,nom,met-e,zonder-n,sup)", "PRON"),
    ("TW(hoofd,nom,mv-n,basis)", "NUM"),
    ("VNW(onbep,pron,dial)", "PRON"),
    ("VNW(aanw,det,stan,nom,met-e,mv-n)", "PRON"),
    ("N(soort,ev,dim,onz,stan)", "NOUN"),
    ("VNW(aanw,pron,gen,vol,3o,ev)", "PRON"),
    ("VNW(bez,det,stan,vol,3,mv,prenom,zonder,agr)", "PRON"),
    ("VNW(onbep,grad,stan,vrij,zonder,basis)", "PRON"),
    ("VNW(bez,det,stan,vol,1,ev,prenom,zonder,agr)", "PRON"),
    ("WW(pv,tgw,ev)", "VERB"),
    ("ADJ(vrij,comp,zonder)", "ADJ"),
    ("VZ(fin)", "ADP"),
    ("VNW(onbep,grad,stan,prenom,met-e,agr,sup)", "PRON"),
    ("WW(inf,vrij,zonder)", "VERB"),
    ("ADJ(nom,basis,zonder,zonder-n)", "ADJ"),
    ("VNW(pers,pron,obl,vol,3,getal,fem)", "PRON"),
    ("VNW(refl,pron,obl,red,3,getal)", "PRON"),
    ("VNW(onbep,grad,stan,prenom,zonder,agr,comp)", "PRON"),
    ("VNW(recip,pron,gen,vol,persoon,mv)", "PRON"),
    ("ADJ(prenom,basis,met-e,bijz)", "ADJ"),
    ("N(soort,ev,basis,onz,stan)", "NOUN"),
    ("VNW(bez,det,stan,vol,3,ev,prenom,zonder,agr)", "PRON"),
    ("WW(pv,verl,ev)", "VERB"),
    ("TW(rang,prenom,stan)", "ADJ"),
    ("VNW(pr,pron,obl,vol,1,mv)", "PRON"),
    ("ADJ(nom,sup,zonder,zonder-n)", "ADJ"),
    ("VNW(pr,pron,obl,red,1,ev)", "PRON"),
    ("VNW(aanw,det,dat,nom,met-e,zonder-n)", "PRON"),
    ("WW(pv,conj,ev)", "VERB"),
    ("SPEC(afk)", "X"),
    ("TW(rang,nom,zonder-n)", "ADJ"),
    ("VNW(onbep,det,gen,prenom,met-e,mv)", "PRON"),
    ("VNW(vb,pron,gen,vol,3p,mv)", "PRON"),
    ("VNW(betr,pron,stan,vol,3,ev)", "PRON"),
    ("VNW(pers,pron,nomin,red,1,mv)", "PRON"),
    ("VNW(vb,pron,stan,vol,3o,ev)", "PRON"),
    ("WW(pv,verl,mv)", "VERB"),
    ("TW(hoofd,prenom,stan)", "NUM"),
    ("VNW(aanw,det,stan,prenom,met-e,rest)", "PRON"),
    ("VNW(vb,det,stan,prenom,met-e,rest)", "PRON"),
    ("VNW(pers,pron,nomin,vol,3p,mv)", "PRON"),
    ("VNW(pr,pron,obl,vol,2,getal)", "PRON"),
    ("ADJ(prenom,basis,zonder)", "ADJ"),
    ("TSW()", "INTJ"),
    ("VNW(betr,det,stan,nom,zonder,zonder-n)", "PRON"),
    ("VZ(init)", "ADP"),
    ("VNW(pers,pron,nomin,nadr,3v,ev,fem)", "PRON"),
    ("ADJ(vrij,dim,zonder)", "ADJ"),
    ("TW(hoofd,dial)", "NUM"),
    ("VNW(onbep,grad,stan,prenom,met-e,agr,basis)", "PRON"),
    ("TW(hoofd,nom,zonder-n,dim)", "NUM"),
    ("ADJ(prenom,comp,zonder)", "ADJ"),
    ("WW(od,prenom,met-e)", "VERB"),
    ("VNW(bez,det,dial)", "PRON"),
    ("VNW(bez,det,stan,red,3,ev,prenom,zonder,agr)", "PRON"),
    ("VNW(aanw,det,stan,prenom,zonder,agr)", "PRON"),
    ("N(soort,mv,basis)", "NOUN"),
    ("VNW(onbep,pron,gen,vol,3p,ev)", "PRON"),
    ("LID(onbep,dial)", "DET"),
    ("VNW(bez,det,stan,vol,2v,ev,prenom,zonder,agr)", "PRON"),
    ("N(soort,ev,basis,genus,stan)", "NOUN"),
    ("VNW(aanw,det,dial)", "PRON"),
    ("N(soort,ev,basis,dat)", "NOUN"),
    ("VNW(onbep,det,stan,prenom,zonder,agr)", "PRON"),
    ("LID(bep,gen,rest3)", "DET"),
    ("TSW(dial)", "INTJ"),
    ("ADJ(nom,basis,met-e,mv-n)", "ADJ"),
    ("VNW(onbep,grad,stan,prenom,met-e,mv,basis)", "PRON"),
    ("BW(dial)", "ADV"),
    ("ADJ(nom,comp,met-e,mv-n)", "ADJ"),
    ("LID(bep,stan,evon)", "DET"),
    ("WW(vd,nom,met-e,mv-n)", "VERB"),
    ("VNW(onbep,grad,stan,nom,zonder,zonder-n,sup)", "PRON"),
    ("VNW(pers,pron,obl,nadr,3p,mv)", "PRON"),
    ("WW(vd,prenom,met-e)", "VERB"),
    ("VNW(bez,det,stan,vol,3m,ev,prenom,met-e,rest)", "PRON"),
    ("VG(neven)", "CONJ"),
    ("VNW(pers,pron,nomin,vol,2b,getal)", "PRON"),
    ("WW(pv,verl,met-t)", "VERB"),
    ("VNW(recip,pron,obl,vol,persoon,mv)", "PRON"),
    ("ADJ(prenom,comp,met-e,stan)", "ADJ"),
    ("VNW(onbep,grad,stan,prenom,met-e,agr,comp)", "PRON"),
    ("ADJ(nom,comp,met-e,zonder-n,stan)", "ADJ"),
    ("SPEC(deeleigen)", "X"),
    ("VNW(vb,pron,stan,vol,3p,getal)", "PRON"),
    ("ADJ(postnom,basis,zonder)", "ADJ"),
    ("WW(od,nom,met-e,zonder-n)", "VERB"),
    ("VNW(vrag,pron,dial)", "PRON"),
    ("VNW(onbep,grad,stan,nom,met-e,zonder-n,basis)", "PRON"),
    ("VNW(bez,det,stan,vol,2,getal,prenom,zonder,agr)", "PRON"),
    ("VNW(onbep,det,dial)", "PRON"),
    ("TW(rang,dial)", "ADJ"),
    ("VNW(onbep,det,stan,prenom,zonder,evon)", "PRON"),
    ("N(soort,dial)", "NOUN"),
    ("VNW(excl,pron,stan,vol,3,getal)", "PRON"),
    ("WW(vd,vrij,zonder)", "VERB"),
    ("SPEC(vreemd)", "X"),
    ("VNW(aanw,adv-pron,stan,red,3,getal)", "PRON"),
    ("WW(vd,nom,met-e,zonder-n)", "VERB"),
    ("VNW(aanw,adv-pron,obl,vol,3o,getal)", "PRON"),
    ("VNW(aanw,det,stan,nom,met-e,zonder-n)", "PRON"),
    ("ADJ(dial)", "ADJ"),
    ("ADJ(vrij,sup,zonder)", "ADJ"),
    ("ADJ(nom,sup,met-e,mv-n)", "ADJ"),
    ("LID(bep,gen,evmo)", "DET"),
    ("VNW(onbep,grad,stan,nom,met-e,mv-n,basis)", "PRON"),
    ("VG(onder,dial)", "SCONJ"),
    ("ADJ(vrij,basis,zonder)", "ADJ"),
    ("ADJ(postnom,basis,met-s)", "ADJ"),
    ("VNW(aanw,pron,stan,vol,3,getal)", "PRON"),
    ("VG(onder)", "SCONJ"),
    ("WW(od,prenom,zonder)", "VERB"),
    ("VNW(pers,pron,nomin,red,3,ev,masc)", "PRON"),
    ("VNW(onbep,grad,stan,vrij,zonder,comp)", "PRON"),
    ("VNW(betr,pron,gen,vol,3o,getal)", "PRON"),
    ("VNW(aanw,det,stan,vrij,zonder)", "PRON"),
    ("LET()", "PUNCT"),
    ("VNW(pers,pron,nomin,vol,1,ev)", "PRON"),
    ("VNW(refl,pron,obl,nadr,3,getal)", "PRON"),
    ("VNW(pers,pron,nomin,red,2,getal)", "PRON"),
    ("N(soort,mv,dim)", "NOUN"),
    ("VNW(pers,pron,stan,red,3,ev,fem)", "PRON"),
    ("VNW(pers,pron,obl,nadr,3m,ev,masc)", "PRON"),
    ("VNW(onbep,adv-pron,obl,vol,3o,getal)", "PRON"),
    ("VNW(pers,pron,nomin,vol,2v,ev)", "PRON"),
    ("ADJ(nom,basis,met-e,zonder-n,stan)", "ADJ"),
    ("SPEC(symb)", "X"),
    ("VNW(aanw,pron,gen,vol,3m,ev)", "PRON"),
    ("VNW(refl,pron,dial)", "PRON"),
    ("VNW(onbep,det,stan,prenom,met-e,evz)", "PRON"),
    ("VNW(pers,pron,obl,red,3,ev,masc)", "PRON"),
    ("VNW(onbep,det,stan,nom,zonder,zonder-n)", "PRON"),
    ("VNW(onbep,det,stan,prenom,met-e,rest)", "PRON"),
    ("VNW(onbep,det,stan,prenom,met-e,mv)", "PRON"),
    ("VNW(pers,pron,nomin,red,2v,ev)", "PRON"),
    ("ADJ(prenom,basis,met-e,stan)", "ADJ"),
    ("VNW(bez,det,stan,red,1,ev,prenom,zonder,agr)", "PRON"),
    ("SPEC(afgebr)", "X"),
    ("VNW(onbep,pron,stan,vol,3p,ev)", "PRON"),
    ("VNW(onbep,grad,stan,nom,met-e,mv-n,sup)", "PRON"),
    ("VNW(onbep,det,stan,prenom,met-e,agr)", "PRON"),
    ("WW(pv,tgw,met-t)", "VERB"),
    ("VNW(aanw,det,stan,prenom,zonder,rest)", "PRON"),
    ("VNW(pers,pron,stan,red,3,ev,onz)", "PRON"),
    ("WW(vd,prenom,zonder)", "VERB"),
    ("VNW(pers,pron,nomin,vol,1,mv)", "PRON"),
    ("WW(od,nom,met-e,mv-n)", "VERB"),
    ("VNW(aanw,pron,stan,vol,3o,ev)", "PRON"),
    ("VNW(pers,pron,dial)", "PRON"),
    ("VNW(pr,pron,obl,red,2v,getal)", "PRON"),
    ("ADJ(nom,basis,zonder,mv-n)", "ADJ"),
    ("VNW(onbep,det,stan,vrij,zonder)", "PRON"),
    ("LID(bep,stan,rest)", "DET"),
    ("VNW(pers,pron,nomin,vol,3v,ev,fem)", "PRON"),
    ("VNW(pers,pron,nomin,vol,3,ev,masc)", "PRON"),
    ("VNW(pers,pron,stan,red,3,mv)", "PRON"),
    ("VNW(bez,det,stan,nadr,2v,mv,prenom,zonder,agr)", "PRON"),
    ("ADJ(nom,sup,met-e,zonder-n,stan)", "ADJ"),
    ("VNW(pers,pron,obl,vol,3p,mv)", "PRON"),
    ("VNW(bez,det,stan,vol,1,mv,prenom,met-e,rest)", "PRON"),
    ("VNW(onbep,grad,stan,vrij,zonder,sup)", "PRON"),
    ("VNW(bez,det,stan,red,2v,ev,prenom,zonder,agr)", "PRON"),
    ("TW(hoofd,vrij)", "NUM"),
    ("VNW(onbep,grad,stan,prenom,zonder,agr,basis)", "PRON"),
    ("VNW(aanw,det,stan,prenom,zonder,evon)", "PRON"),
    ("VNW(onbep,adv-pron,gen,red,3,getal)", "PRON"),
    ("VNW(pers,pron,nomin,vol,2,getal)", "PRON"),
    ("VNW(pr,pron,obl,nadr,1,ev)", "PRON"),
    ("VNW(pr,pron,obl,nadr,2v,getal)", "PRON"),
    ("VNW(vb,det,stan,nom,met-e,zonder-n)", "PRON"),
    ("VNW(betr,pron,stan,vol,persoon,getal)", "PRON"),
    ("TW(hoofd,nom,zonder-n,basis)", "NUM"),
    ("VNW(vb,pron,gen,vol,3m,ev)", "PRON"),
    ("WW(inf,prenom,zonder)", "VERB"),
    ("TW(rang,nom,mv-n)", "ADJ"),
    ("SPEC(meta)", "X"),
    ("LID(bep,dat,evmo)", "DET"),
    ("N(soort,ev,basis,zijd,stan)", "NOUN"),
    ("VNW(pers,pron,nomin,nadr,3m,ev,masc)", "PRON"),
    ("WW(od,vrij,zonder)", "VERB"),
    ("VNW(vb,adv-pron,obl,vol,3o,getal)", "PRON"),
    ("ADJ(prenom,sup,zonder)", "ADJ"),
    ("BW()", "ADV"),
    ("VZ(versm)", "ADP"),
    ("ADJ(prenom,sup,met-e,stan)", "ADJ"),
)

# Tag map for Dutch: every tag string maps to its own {"pos": ...} dict. A new
# inner dict is created per tag, so entries never share a mutable value.
TAG_MAP = {tag: {"pos": pos} for tag, pos in _TAG_POS_PAIRS}