spaCy/spacy/lang/hu/tokenizer_exceptions.py

# coding: utf8
from __future__ import unicode_literals

import re

from ..punctuation import ALPHA_LOWER, CURRENCY
from ..tokenizer_exceptions import URL_PATTERN
from ...symbols import ORTH


# Tokenizer special cases for Hungarian, keyed by the exact surface string.
# Every entry below (mostly abbreviations ending in a period, plus a few
# fixed strings such as "-e") is mapped to a one-element analysis whose ORTH
# is the string itself, i.e. the string is described as a single token.
#
# The list is kept in its existing order. A repeated "irod." entry was
# dropped: it only re-assigned the same key with an equal value.
#
# NOTE(review): "Ph.D" has no trailing period although "PH.D." and "ph.d."
# do -- confirm whether a "Ph.D." entry is intentionally absent.
_exc = {}

for orth in [
    "-e",
    "A.",
    "AG.",
    "AkH.",
    "Aö.",
    "B.",
    "B.CS.",
    "B.S.",
    "B.Sc.",
    "B.ú.é.k.",
    "BE.",
    "BEK.",
    "BSC.",
    "BSc.",
    "BTK.",
    "Bat.",
    "Be.",
    "Bek.",
    "Bfok.",
    "Bk.",
    "Bp.",
    "Bros.",
    "Bt.",
    "Btk.",
    "Btke.",
    "Btét.",
    "C.",
    "CSC.",
    "Cal.",
    "Cg.",
    "Cgf.",
    "Cgt.",
    "Cia.",
    "Co.",
    "Colo.",
    "Comp.",
    "Copr.",
    "Corp.",
    "Cos.",
    "Cs.",
    "Csc.",
    "Csop.",
    "Cstv.",
    "Ctv.",
    "Ctvr.",
    "D.",
    "DR.",
    "Dipl.",
    "Dr.",
    "Dsz.",
    "Dzs.",
    "E.",
    "EK.",
    "EU.",
    "F.",
    "Fla.",
    "Folyt.",
    "Fpk.",
    "Főszerk.",
    "G.",
    "GK.",
    "GM.",
    "Gfv.",
    "Gmk.",
    "Gr.",
    "Group.",
    "Gt.",
    "Gy.",
    "H.",
    "HKsz.",
    "Hmvh.",
    "I.",
    "Ifj.",
    "Inc.",
    "Inform.",
    "Int.",
    "J.",
    "Jr.",
    "Jv.",
    "K.",
    "K.m.f.",
    "KB.",
    "KER.",
    "KFT.",
    "KRT.",
    "Kb.",
    "Ker.",
    "Kft.",
    "Kg.",
    "Kht.",
    "Kkt.",
    "Kong.",
    "Korm.",
    "Kr.",
    "Kr.e.",
    "Kr.u.",
    "Krt.",
    "L.",
    "LB.",
    "Llc.",
    "Ltd.",
    "M.",
    "M.A.",
    "M.S.",
    "M.SC.",
    "M.Sc.",
    "MA.",
    "MH.",
    "MSC.",
    "MSc.",
    "Mass.",
    "Max.",
    "Mlle.",
    "Mme.",
    "Mo.",
    "Mr.",
    "Mrs.",
    "Ms.",
    "Mt.",
    "N.",
    "N.N.",
    "NB.",
    "NBr.",
    "Nat.",
    "No.",
    "Nr.",
    "Ny.",
    "Nyh.",
    "Nyr.",
    "Nyrt.",
    "O.",
    "OJ.",
    "Op.",
    "P.",
    "P.H.",
    "P.S.",
    "PH.D.",
    "PHD.",
    "PROF.",
    "Pf.",
    "Ph.D",
    "PhD.",
    "Pk.",
    "Pl.",
    "Plc.",
    "Pp.",
    "Proc.",
    "Prof.",
    "Ptk.",
    "R.",
    "RT.",
    "Rer.",
    "Rt.",
    "S.",
    "S.B.",
    "SZOLG.",
    "Salg.",
    "Sch.",
    "Spa.",
    "St.",
    "Sz.",
    "SzRt.",
    "Szerk.",
    "Szfv.",
    "Szjt.",
    "Szolg.",
    "Szt.",
    "Sztv.",
    "Szvt.",
    "Számv.",
    "T.",
    "TEL.",
    "Tel.",
    "Ty.",
    "Tyr.",
    "U.",
    "Ui.",
    "Ut.",
    "V.",
    "VB.",
    "Vcs.",
    "Vhr.",
    "Vht.",
    "Várm.",
    "W.",
    "X.",
    "X.Y.",
    "Y.",
    "Z.",
    "Zrt.",
    "Zs.",
    "a.C.",
    "ac.",
    "adj.",
    "adm.",
    "ag.",
    "agit.",
    "alez.",
    "alk.",
    "all.",
    "altbgy.",
    "an.",
    "ang.",
    "arch.",
    "at.",
    "atc.",
    "aug.",
    "b.a.",
    "b.s.",
    "b.sc.",
    "bek.",
    "belker.",
    "berend.",
    "biz.",
    "bizt.",
    "bo.",
    "bp.",
    "br.",
    "bsc.",
    "bt.",
    "btk.",
    "ca.",
    "cc.",
    "cca.",
    "cf.",
    "cif.",
    "co.",
    "corp.",
    "cos.",
    "cs.",
    "csc.",
    "csüt.",
    "cső.",
    "ctv.",
    "dbj.",
    "dd.",
    "ddr.",
    "de.",
    "dec.",
    "dikt.",
    "dipl.",
    "dj.",
    "dk.",
    "dl.",
    "dny.",
    "dolg.",
    "dr.",
    "du.",
    "dzs.",
    "ea.",
    "ed.",
    "eff.",
    "egyh.",
    "ell.",
    "elv.",
    "elvt.",
    "em.",
    "eng.",
    "eny.",
    "et.",
    "etc.",
    "ev.",
    "ezr.",
    "eü.",
    "f.h.",
    "f.é.",
    "fam.",
    "fb.",
    "febr.",
    "fej.",
    "felv.",
    "felügy.",
    "ff.",
    "ffi.",
    "fhdgy.",
    "fil.",
    "fiz.",
    "fm.",
    "foglalk.",
    "ford.",
    "fp.",
    "fr.",
    "frsz.",
    "fszla.",
    "fszt.",
    "ft.",
    "fuv.",
    "főig.",
    "főisk.",
    "főtörm.",
    "főv.",
    "gazd.",
    "gimn.",
    "gk.",
    "gkv.",
    "gmk.",
    "gondn.",
    "gr.",
    "grav.",
    "gy.",
    "gyak.",
    "gyártm.",
    "gör.",
    "hads.",
    "hallg.",
    "hdm.",
    "hdp.",
    "hds.",
    "hg.",
    "hiv.",
    "hk.",
    "hm.",
    "ho.",
    "honv.",
    "hp.",
    "hr.",
    "hrsz.",
    "hsz.",
    "ht.",
    "htb.",
    "hv.",
    "hőm.",
    "i.e.",
    "i.sz.",
    "id.",
    "ie.",
    "ifj.",
    "ig.",
    "igh.",
    "ill.",
    "imp.",
    "inc.",
    "ind.",
    "inform.",
    "inic.",
    "int.",
    "io.",
    "ip.",
    "ir.",
    "irod.",
    "isk.",
    "ism.",
    "izr.",
    "iá.",
    "jan.",
    "jav.",
    "jegyz.",
    "jgmk.",
    "jjv.",
    "jkv.",
    "jogh.",
    "jogt.",
    "jr.",
    "jvb.",
    "júl.",
    "jún.",
    "karb.",
    "kat.",
    "kath.",
    "kb.",
    "kcs.",
    "kd.",
    "ker.",
    "kf.",
    "kft.",
    "kht.",
    "kir.",
    "kirend.",
    "kisip.",
    "kiv.",
    "kk.",
    "kkt.",
    "klin.",
    "km.",
    "korm.",
    "kp.",
    "krt.",
    "kt.",
    "ktsg.",
    "kult.",
    "kv.",
    "kve.",
    "képv.",
    "kísérl.",
    "kóth.",
    "könyvt.",
    "körz.",
    "köv.",
    "közj.",
    "közl.",
    "közp.",
    "közt.",
    "kü.",
    "lat.",
    "ld.",
    "legs.",
    "lg.",
    "lgv.",
    "loc.",
    "lt.",
    "ltd.",
    "ltp.",
    "luth.",
    "m.a.",
    "m.s.",
    "m.sc.",
    "ma.",
    "mat.",
    "max.",
    "mb.",
    "med.",
    "megh.",
    "met.",
    "mf.",
    "mfszt.",
    "min.",
    "miss.",
    "mjr.",
    "mjv.",
    "mk.",
    "mlle.",
    "mme.",
    "mn.",
    "mozg.",
    "mr.",
    "mrs.",
    "ms.",
    "msc.",
    "má.",
    "máj.",
    "márc.",
    "mé.",
    "mélt.",
    "mü.",
    "műh.",
    "műsz.",
    "műv.",
    "művez.",
    "nagyker.",
    "nagys.",
    "nat.",
    "nb.",
    "neg.",
    "nk.",
    "no.",
    "nov.",
    "nu.",
    "ny.",
    "nyilv.",
    "nyrt.",
    "nyug.",
    "obj.",
    "okl.",
    "okt.",
    "old.",
    "olv.",
    "orsz.",
    "ort.",
    "ov.",
    "ovh.",
    "pf.",
    "pg.",
    "ph.d",
    "ph.d.",
    "phd.",
    "phil.",
    "pjt.",
    "pk.",
    "pl.",
    "plb.",
    "plc.",
    "pld.",
    "plur.",
    "pol.",
    "polg.",
    "poz.",
    "pp.",
    "proc.",
    "prof.",
    "prot.",
    "pság.",
    "ptk.",
    "pu.",
    "pü.",
    "r.k.",
    "rac.",
    "rad.",
    "red.",
    "ref.",
    "reg.",
    "rer.",
    "rev.",
    "rf.",
    "rkp.",
    "rkt.",
    "rt.",
    "rtg.",
    "röv.",
    "s.b.",
    "s.k.",
    "sa.",
    "sb.",
    "sel.",
    "sgt.",
    "sm.",
    "st.",
    "stat.",
    "stb.",
    "strat.",
    "stud.",
    "sz.",
    "szakm.",
    "szaksz.",
    "szakszerv.",
    "szd.",
    "szds.",
    "szept.",
    "szerk.",
    "szf.",
    "szimf.",
    "szjt.",
    "szkv.",
    "szla.",
    "szn.",
    "szolg.",
    "szt.",
    "szubj.",
    "szöv.",
    "szül.",
    "tanm.",
    "tb.",
    "tbk.",
    "tc.",
    "techn.",
    "tek.",
    "tel.",
    "tf.",
    "tgk.",
    "ti.",
    "tip.",
    "tisztv.",
    "titks.",
    "tk.",
    "tkp.",
    "tny.",
    "tp.",
    "tszf.",
    "tszk.",
    "tszkv.",
    "tv.",
    "tvr.",
    "ty.",
    "törv.",
    "tü.",
    "ua.",
    "ui.",
    "unit.",
    "uo.",
    "uv.",
    "vas.",
    "vb.",
    "vegy.",
    "vh.",
    "vhol.",
    "vhr.",
    "vill.",
    "vizsg.",
    "vk.",
    "vkf.",
    "vkny.",
    "vm.",
    "vol.",
    "vs.",
    "vsz.",
    "vv.",
    "vál.",
    "várm.",
    "vízv.",
    "vö.",
    "zrt.",
    "zs.",
    "Á.",
    "Áe.",
    "Áht.",
    "É.",
    "Épt.",
    "Ész.",
    "Új-Z.",
    "ÚjZ.",
    "Ún.",
    "á.",
    "ált.",
    "ápr.",
    "ásv.",
    "é.",
    "ék.",
    "ény.",
    "érk.",
    "évf.",
    "í.",
    "ó.",
    "össz.",
    "ötk.",
    "özv.",
    "ú.",
    "ú.n.",
    "úm.",
    "ún.",
    "út.",
    "üag.",
    "üd.",
    "üdv.",
    "üe.",
    "ümk.",
    "ütk.",
    "üv.",
    "ű.",
    "őrgy.",
    "őrpk.",
    "őrv.",
]:
    # One-element analysis: the whole string is its own single ORTH.
    _exc[orth] = [{ORTH: orth}]


# Regex fragments that are combined into `_nums` below; none of them is
# anchored on its own.

# Ordinal number or date-like string: zero or more runs of uppercase ASCII
# letters/digits, each followed by ".", "/" or "-", then digits with an
# optional trailing period.
_ord_num_or_date = r"([A-Z0-9]+[./-])*(\d+\.?)"
# Optionally signed integer, followed by any number of digit groups that are
# each introduced by "," or ".".
_num = r"[+\-]?\d+([,.]\d+)*"
# A single operator or bracket character that may sit between two numbers.
_ops = r"[=<>+\-\*/^()÷%²]"
# A hyphen followed by one or more characters from ALPHA_LOWER
# (presumably lowercase letters -- defined in ..punctuation, not shown here).
_suffixes = r"-[{al}]+".format(al=ALPHA_LOWER)
# A number, then any count of (operator, number) pairs, then an optional "%".
_numeric_exp = r"({n})(({o})({n}))*[%]?".format(n=_num, o=_ops)
# Digit groups separated by ":" with an optional ".digits" tail, e.g. "12:30".
_time_exp = r"\d+(:\d+)*(\.\d+)?"

# Any one of: numeric expression, time expression, ordinal/date, or CURRENCY
# (imported from ..punctuation), optionally followed by a hyphenated suffix.
_nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
    ne=_numeric_exp, t=_time_exp, on=_ord_num_or_date, c=CURRENCY, s=_suffixes
)


# Public module-level names: the special-case table built above, and a
# matcher for strings that are either a URL or a numeric expression.
TOKENIZER_EXCEPTIONS = _exc
# The alternation is wrapped in a non-capturing group so that "^" and "$"
# apply to BOTH alternatives. Written as "^(u)|(n)$", "|" binds looser than
# the anchors, leaving the URL branch without an end anchor of its own.
# The group is non-capturing, so capture-group numbering is unchanged.
# NOTE(review): if URL_PATTERN already carries its own anchors this is
# behaviour-preserving; confirm against ..tokenizer_exceptions.
TOKEN_MATCH = re.compile(
    r"^(?:({u})|({n}))$".format(u=URL_PATTERN, n=_nums)
).match
Use consistent unicode declarations 2017-03-12 12:07:28 +00:00			`# coding: utf8`
Refactored language data structure 2016-12-20 21:28:20 +00:00			`from __future__ import unicode_literals`
Added Hungarian resource files. 2016-12-08 11:06:36 +00:00
Replacing regex library with re to increase tokenization speed (#3218) * replace unicode categories with raw list of code points * simplifying ranges * fixing variable length quotes * removing redundant regular expression * small cleanup of regexp notations * quotes and alpha as ranges instead of alterations * removed most regexp dependencies and features * exponential backtracking - unit tests * rewrote expression with pathological backtracking * disabling double hyphen tests for now * test additional variants of repeating punctuation * remove regex and redundant backslashes from load_reddit script * small typo fixes * disable double punctuation test for russian * clean up old comments * format block code * final cleanup * naming consistency * french strings as unicode for python 2 support * french regular expression case insensitive 2019-02-01 07:05:22 +00:00			`import re`
Fixed Hungarian tokenizer for numbers 2017-01-14 14:51:59 +00:00
Fix relative imports 2017-05-08 20:29:04 +00:00			`from ..punctuation import ALPHA_LOWER, CURRENCY`
			`from ..tokenizer_exceptions import URL_PATTERN`
			`from ...symbols import ORTH`

Fixed Hungarian tokenizer for numbers 2017-01-14 14:51:59 +00:00
Reorganise Hungarian language data 2017-05-08 13:49:56 +00:00			`_exc = {}`
Fixed Hungarian tokenizer for numbers 2017-01-14 14:51:59 +00:00
Reorganise Hungarian language data 2017-05-08 13:49:56 +00:00			`for orth in [`
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 16:03:03 +00:00			`"-e",`
			`"A.",`
			`"AG.",`
			`"AkH.",`
			`"Aö.",`
			`"B.",`
			`"B.CS.",`
			`"B.S.",`
			`"B.Sc.",`
			`"B.ú.é.k.",`
			`"BE.",`
			`"BEK.",`
			`"BSC.",`
			`"BSc.",`
			`"BTK.",`
			`"Bat.",`
			`"Be.",`
			`"Bek.",`
			`"Bfok.",`
			`"Bk.",`
			`"Bp.",`
			`"Bros.",`
			`"Bt.",`
			`"Btk.",`
			`"Btke.",`
			`"Btét.",`
			`"C.",`
			`"CSC.",`
			`"Cal.",`
			`"Cg.",`
			`"Cgf.",`
			`"Cgt.",`
			`"Cia.",`
			`"Co.",`
			`"Colo.",`
			`"Comp.",`
			`"Copr.",`
			`"Corp.",`
			`"Cos.",`
			`"Cs.",`
			`"Csc.",`
			`"Csop.",`
			`"Cstv.",`
			`"Ctv.",`
			`"Ctvr.",`
			`"D.",`
			`"DR.",`
			`"Dipl.",`
			`"Dr.",`
			`"Dsz.",`
			`"Dzs.",`
			`"E.",`
			`"EK.",`
			`"EU.",`
			`"F.",`
			`"Fla.",`
			`"Folyt.",`
			`"Fpk.",`
			`"Főszerk.",`
			`"G.",`
			`"GK.",`
			`"GM.",`
			`"Gfv.",`
			`"Gmk.",`
			`"Gr.",`
			`"Group.",`
			`"Gt.",`
			`"Gy.",`
			`"H.",`
			`"HKsz.",`
			`"Hmvh.",`
			`"I.",`
			`"Ifj.",`
			`"Inc.",`
			`"Inform.",`
			`"Int.",`
			`"J.",`
			`"Jr.",`
			`"Jv.",`
			`"K.",`
			`"K.m.f.",`
			`"KB.",`
			`"KER.",`
			`"KFT.",`
			`"KRT.",`
			`"Kb.",`
			`"Ker.",`
			`"Kft.",`
			`"Kg.",`
			`"Kht.",`
			`"Kkt.",`
			`"Kong.",`
			`"Korm.",`
			`"Kr.",`
			`"Kr.e.",`
			`"Kr.u.",`
			`"Krt.",`
			`"L.",`
			`"LB.",`
			`"Llc.",`
			`"Ltd.",`
			`"M.",`
			`"M.A.",`
			`"M.S.",`
			`"M.SC.",`
			`"M.Sc.",`
			`"MA.",`
			`"MH.",`
			`"MSC.",`
			`"MSc.",`
			`"Mass.",`
			`"Max.",`
			`"Mlle.",`
			`"Mme.",`
			`"Mo.",`
			`"Mr.",`
			`"Mrs.",`
			`"Ms.",`
			`"Mt.",`
			`"N.",`
			`"N.N.",`
			`"NB.",`
			`"NBr.",`
			`"Nat.",`
			`"No.",`
			`"Nr.",`
			`"Ny.",`
			`"Nyh.",`
			`"Nyr.",`
			`"Nyrt.",`
			`"O.",`
			`"OJ.",`
			`"Op.",`
			`"P.",`
			`"P.H.",`
			`"P.S.",`
			`"PH.D.",`
			`"PHD.",`
			`"PROF.",`
			`"Pf.",`
			`"Ph.D",`
			`"PhD.",`
			`"Pk.",`
			`"Pl.",`
			`"Plc.",`
			`"Pp.",`
			`"Proc.",`
			`"Prof.",`
			`"Ptk.",`
			`"R.",`
			`"RT.",`
			`"Rer.",`
			`"Rt.",`
			`"S.",`
			`"S.B.",`
			`"SZOLG.",`
			`"Salg.",`
			`"Sch.",`
			`"Spa.",`
			`"St.",`
			`"Sz.",`
			`"SzRt.",`
			`"Szerk.",`
			`"Szfv.",`
			`"Szjt.",`
			`"Szolg.",`
			`"Szt.",`
			`"Sztv.",`
			`"Szvt.",`
			`"Számv.",`
			`"T.",`
			`"TEL.",`
			`"Tel.",`
			`"Ty.",`
			`"Tyr.",`
			`"U.",`
			`"Ui.",`
			`"Ut.",`
			`"V.",`
			`"VB.",`
			`"Vcs.",`
			`"Vhr.",`
			`"Vht.",`
			`"Várm.",`
			`"W.",`
			`"X.",`
			`"X.Y.",`
			`"Y.",`
			`"Z.",`
			`"Zrt.",`
			`"Zs.",`
			`"a.C.",`
			`"ac.",`
			`"adj.",`
			`"adm.",`
			`"ag.",`
			`"agit.",`
			`"alez.",`
			`"alk.",`
			`"all.",`
			`"altbgy.",`
			`"an.",`
			`"ang.",`
			`"arch.",`
			`"at.",`
			`"atc.",`
			`"aug.",`
			`"b.a.",`
			`"b.s.",`
			`"b.sc.",`
			`"bek.",`
			`"belker.",`
			`"berend.",`
			`"biz.",`
			`"bizt.",`
			`"bo.",`
			`"bp.",`
			`"br.",`
			`"bsc.",`
			`"bt.",`
			`"btk.",`
			`"ca.",`
			`"cc.",`
			`"cca.",`
			`"cf.",`
			`"cif.",`
			`"co.",`
			`"corp.",`
			`"cos.",`
			`"cs.",`
			`"csc.",`
			`"csüt.",`
			`"cső.",`
			`"ctv.",`
			`"dbj.",`
			`"dd.",`
			`"ddr.",`
			`"de.",`
			`"dec.",`
			`"dikt.",`
			`"dipl.",`
			`"dj.",`
			`"dk.",`
			`"dl.",`
			`"dny.",`
			`"dolg.",`
			`"dr.",`
			`"du.",`
			`"dzs.",`
			`"ea.",`
			`"ed.",`
			`"eff.",`
			`"egyh.",`
			`"ell.",`
			`"elv.",`
			`"elvt.",`
			`"em.",`
			`"eng.",`
			`"eny.",`
			`"et.",`
			`"etc.",`
			`"ev.",`
			`"ezr.",`
			`"eü.",`
			`"f.h.",`
			`"f.é.",`
			`"fam.",`
			`"fb.",`
			`"febr.",`
			`"fej.",`
			`"felv.",`
			`"felügy.",`
			`"ff.",`
			`"ffi.",`
			`"fhdgy.",`
			`"fil.",`
			`"fiz.",`
			`"fm.",`
			`"foglalk.",`
			`"ford.",`
			`"fp.",`
			`"fr.",`
			`"frsz.",`
			`"fszla.",`
			`"fszt.",`
			`"ft.",`
			`"fuv.",`
			`"főig.",`
			`"főisk.",`
			`"főtörm.",`
			`"főv.",`
			`"gazd.",`
			`"gimn.",`
			`"gk.",`
			`"gkv.",`
			`"gmk.",`
			`"gondn.",`
			`"gr.",`
			`"grav.",`
			`"gy.",`
			`"gyak.",`
			`"gyártm.",`
			`"gör.",`
			`"hads.",`
			`"hallg.",`
			`"hdm.",`
			`"hdp.",`
			`"hds.",`
			`"hg.",`
			`"hiv.",`
			`"hk.",`
			`"hm.",`
			`"ho.",`
			`"honv.",`
			`"hp.",`
			`"hr.",`
			`"hrsz.",`
			`"hsz.",`
			`"ht.",`
			`"htb.",`
			`"hv.",`
			`"hőm.",`
			`"i.e.",`
			`"i.sz.",`
			`"id.",`
			`"ie.",`
			`"ifj.",`
			`"ig.",`
			`"igh.",`
			`"ill.",`
			`"imp.",`
			`"inc.",`
			`"ind.",`
			`"inform.",`
			`"inic.",`
			`"int.",`
			`"io.",`
			`"ip.",`
			`"ir.",`
			`"irod.",`
			`"irod.",`
			`"isk.",`
			`"ism.",`
			`"izr.",`
			`"iá.",`
			`"jan.",`
			`"jav.",`
			`"jegyz.",`
			`"jgmk.",`
			`"jjv.",`
			`"jkv.",`
			`"jogh.",`
			`"jogt.",`
			`"jr.",`
			`"jvb.",`
			`"júl.",`
			`"jún.",`
			`"karb.",`
			`"kat.",`
			`"kath.",`
			`"kb.",`
			`"kcs.",`
			`"kd.",`
			`"ker.",`
			`"kf.",`
			`"kft.",`
			`"kht.",`
			`"kir.",`
			`"kirend.",`
			`"kisip.",`
			`"kiv.",`
			`"kk.",`
			`"kkt.",`
			`"klin.",`
			`"km.",`
			`"korm.",`
			`"kp.",`
			`"krt.",`
			`"kt.",`
			`"ktsg.",`
			`"kult.",`
			`"kv.",`
			`"kve.",`
			`"képv.",`
			`"kísérl.",`
			`"kóth.",`
			`"könyvt.",`
			`"körz.",`
			`"köv.",`
			`"közj.",`
			`"közl.",`
			`"közp.",`
			`"közt.",`
			`"kü.",`
			`"lat.",`
			`"ld.",`
			`"legs.",`
			`"lg.",`
			`"lgv.",`
			`"loc.",`
			`"lt.",`
			`"ltd.",`
			`"ltp.",`
			`"luth.",`
			`"m.a.",`
			`"m.s.",`
			`"m.sc.",`
			`"ma.",`
			`"mat.",`
			`"max.",`
			`"mb.",`
			`"med.",`
			`"megh.",`
			`"met.",`
			`"mf.",`
			`"mfszt.",`
			`"min.",`
			`"miss.",`
			`"mjr.",`
			`"mjv.",`
			`"mk.",`
			`"mlle.",`
			`"mme.",`
			`"mn.",`
			`"mozg.",`
			`"mr.",`
			`"mrs.",`
			`"ms.",`
			`"msc.",`
			`"má.",`
			`"máj.",`
			`"márc.",`
			`"mé.",`
			`"mélt.",`
			`"mü.",`
			`"műh.",`
			`"műsz.",`
			`"műv.",`
			`"művez.",`
			`"nagyker.",`
			`"nagys.",`
			`"nat.",`
			`"nb.",`
			`"neg.",`
			`"nk.",`
			`"no.",`
			`"nov.",`
			`"nu.",`
			`"ny.",`
			`"nyilv.",`
			`"nyrt.",`
			`"nyug.",`
			`"obj.",`
			`"okl.",`
			`"okt.",`
			`"old.",`
			`"olv.",`
			`"orsz.",`
			`"ort.",`
			`"ov.",`
			`"ovh.",`
			`"pf.",`
			`"pg.",`
			`"ph.d",`
			`"ph.d.",`
			`"phd.",`
			`"phil.",`
			`"pjt.",`
			`"pk.",`
			`"pl.",`
			`"plb.",`
			`"plc.",`
			`"pld.",`
			`"plur.",`
			`"pol.",`
			`"polg.",`
			`"poz.",`
			`"pp.",`
			`"proc.",`
			`"prof.",`
			`"prot.",`
			`"pság.",`
			`"ptk.",`
			`"pu.",`
			`"pü.",`
			`"r.k.",`
			`"rac.",`
			`"rad.",`
			`"red.",`
			`"ref.",`
			`"reg.",`
			`"rer.",`
			`"rev.",`
			`"rf.",`
			`"rkp.",`
			`"rkt.",`
			`"rt.",`
			`"rtg.",`
			`"röv.",`
			`"s.b.",`
			`"s.k.",`
			`"sa.",`
			`"sb.",`
			`"sel.",`
			`"sgt.",`
			`"sm.",`
			`"st.",`
			`"stat.",`
			`"stb.",`
			`"strat.",`
			`"stud.",`
			`"sz.",`
			`"szakm.",`
			`"szaksz.",`
			`"szakszerv.",`
			`"szd.",`
			`"szds.",`
			`"szept.",`
			`"szerk.",`
			`"szf.",`
			`"szimf.",`
			`"szjt.",`
			`"szkv.",`
			`"szla.",`
			`"szn.",`
			`"szolg.",`
			`"szt.",`
			`"szubj.",`
			`"szöv.",`
			`"szül.",`
			`"tanm.",`
			`"tb.",`
			`"tbk.",`
			`"tc.",`
			`"techn.",`
			`"tek.",`
			`"tel.",`
			`"tf.",`
			`"tgk.",`
			`"ti.",`
			`"tip.",`
			`"tisztv.",`
			`"titks.",`
			`"tk.",`
			`"tkp.",`
			`"tny.",`
			`"tp.",`
			`"tszf.",`
			`"tszk.",`
			`"tszkv.",`
			`"tv.",`
			`"tvr.",`
			`"ty.",`
			`"törv.",`
			`"tü.",`
			`"ua.",`
			`"ui.",`
			`"unit.",`
			`"uo.",`
			`"uv.",`
			`"vas.",`
			`"vb.",`
			`"vegy.",`
			`"vh.",`
			`"vhol.",`
			`"vhr.",`
			`"vill.",`
			`"vizsg.",`
			`"vk.",`
			`"vkf.",`
			`"vkny.",`
			`"vm.",`
			`"vol.",`
			`"vs.",`
			`"vsz.",`
			`"vv.",`
			`"vál.",`
			`"várm.",`
			`"vízv.",`
			`"vö.",`
			`"zrt.",`
			`"zs.",`
			`"Á.",`
			`"Áe.",`
			`"Áht.",`
			`"É.",`
			`"Épt.",`
			`"Ész.",`
			`"Új-Z.",`
			`"ÚjZ.",`
			`"Ún.",`
			`"á.",`
			`"ált.",`
			`"ápr.",`
			`"ásv.",`
			`"é.",`
			`"ék.",`
			`"ény.",`
			`"érk.",`
			`"évf.",`
			`"í.",`
			`"ó.",`
			`"össz.",`
			`"ötk.",`
			`"özv.",`
			`"ú.",`
			`"ú.n.",`
			`"úm.",`
			`"ún.",`
			`"út.",`
			`"üag.",`
			`"üd.",`
			`"üdv.",`
			`"üe.",`
			`"ümk.",`
			`"ütk.",`
			`"üv.",`
			`"ű.",`
			`"őrgy.",`
			`"őrpk.",`
			`"őrv.",`
			`]:`
Reorganise Hungarian language data 2017-05-08 13:49:56 +00:00			`_exc[orth] = [{ORTH: orth}]`
Fixed Hungarian tokenizer for numbers 2017-01-14 14:51:59 +00:00

Fix regex deprecation warnings 2019-02-21 10:56:47 +00:00			`_ord_num_or_date = r"([A-Z0-9]+[./-])*(\d+\.?)"`
			`_num = r"[+\-]?\d+([,.]\d+)*"`
			`_ops = r"[=<>+\-\*/^()÷%²]"`
			`_suffixes = r"-[{al}]+".format(al=ALPHA_LOWER)`
			`_numeric_exp = r"({n})(({o})({n}))*[%]?".format(n=_num, o=_ops)`
			`_time_exp = r"\d+(:\d+)*(\.\d+)?"`
Reorganise Hungarian language data 2017-05-08 13:49:56 +00:00
Fix regex deprecation warnings 2019-02-21 10:56:47 +00:00			`_nums = r"(({ne})\|({t})\|({on})\|({c}))({s})?".format(`
💫 Tidy up and auto-format .py files (#2983) <!--- Provide a general summary of your changes in the title. --> ## Description - [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files. - [x] Update flake8 config to exclude very large files (lemmatization tables etc.) - [x] Update code to be compatible with flake8 rules - [x] Fix various small bugs, inconsistencies and messy stuff in the language data - [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means) Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results. At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information. ### Types of change enhancement, code style ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 2018-11-30 16:03:03 +00:00			`ne=_numeric_exp, t=_time_exp, on=_ord_num_or_date, c=CURRENCY, s=_suffixes`
			`)`
Reorganise Hungarian language data 2017-05-08 13:49:56 +00:00

Don't copy exception dicts if not necessary and tidy up 2017-10-31 20:05:29 +00:00			`TOKENIZER_EXCEPTIONS = _exc`
Fix regex deprecation warnings 2019-02-21 10:56:47 +00:00			`TOKEN_MATCH = re.compile(r"^({u})\|({n})$".format(u=URL_PATTERN, n=_nums)).match`