spaCy/spacy/lang/id/tokenizer_exceptions.py

50 lines
1.7 KiB
Python
Raw Normal View History

2017-07-23 15:55:05 +00:00
# coding: utf8
from __future__ import unicode_literals
2017-07-26 12:13:47 +00:00
import regex as re
2017-07-24 07:11:51 +00:00
from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS
2017-07-26 12:13:47 +00:00
from ..tokenizer_exceptions import URL_PATTERN
2017-07-24 07:12:34 +00:00
from ...symbols import ORTH
2017-07-24 07:11:10 +00:00
2017-07-26 12:13:47 +00:00
2017-07-24 07:11:10 +00:00
_exc = {}

# Register every base exception under each casing a writer might use
# (original, Title, UPPER, lower), so the tokenizer matches the exception
# regardless of capitalisation. Each entry maps the surface form to a
# single token whose ORTH is the surface form itself.
for orth in ID_BASE_EXCEPTIONS:
    variants = [orth, orth.title(), orth.upper(), orth.lower()]
    # Hyphenated words additionally get per-part Title/UPPER casings,
    # e.g. "foo-bar" -> "Foo-Bar" and "FOO-BAR".
    if '-' in orth:
        parts = orth.split('-')
        variants.append('-'.join(part.title() for part in parts))
        variants.append('-'.join(part.upper() for part in parts))
    # Assignment order matches the original one-by-one assignments, so
    # duplicate variants (e.g. an all-lowercase word whose lower() equals
    # itself) overwrite identically and the resulting dict is unchanged.
    for variant in variants:
        _exc[variant] = [{ORTH: variant}]
2017-08-20 05:16:50 +00:00
# Abbreviations kept as single tokens: a shared English set (titles,
# Latin abbreviations) followed by Indonesian-specific titles, academic
# degrees and common shorthand. Each maps to one token with ORTH equal
# to the written form.
# NOTE: "Dr." is listed twice (English and Indonesian sections); the
# second assignment harmlessly overwrites the first.
for orth in [
    "'d", "a.m.", "Adm.", "Bros.", "co.", "Co.", "Corp.", "D.C.", "Dr.", "e.g.",
    "E.g.", "E.G.", "Gen.", "Gov.", "i.e.", "I.e.", "I.E.", "Inc.", "Jr.",
    "Ltd.", "Md.", "Messrs.", "Mo.", "Mont.", "Mr.", "Mrs.", "Ms.", "p.m.",
    "Ph.D.", "Rep.", "Rev.", "Sen.", "St.", "vs.",
    "B.A.", "B.Ch.E.", "B.Sc.", "Dr.", "Dra.", "Drs.", "Hj.", "Ka.", "Kp.",
    "M.Ag.", "M.Hum.", "M.Kes,", "M.Kom.", "M.M.", "M.P.", "M.Pd.", "M.Sc.",
    "M.Si.", "M.Sn.", "M.T.", "M.Th.", "No.", "Pjs.", "Plt.", "R.A.", "S.Ag.",
    "S.E.", "S.H.", "S.Hut.", "S.K.M.", "S.Kedg.", "S.Kedh.", "S.Kom.",
    "S.Pd.", "S.Pol.", "S.Psi.", "S.S.", "S.Sos.", "S.T.", "S.Tekp.", "S.Th.",
    "a.l.", "a.n.", "a.s.", "b.d.", "d.a.", "d.l.", "d/h", "dkk.", "dll.",
    "dr.", "drh.", "ds.", "dsb.", "dst.", "faks.", "fax.", "hlm.", "i/o",
    # Fixed: a missing comma between "p.p." and "pjs." caused implicit
    # string-literal concatenation, registering the bogus token
    # "p.p.pjs." and silently dropping both intended entries.
    "n.b.", "p.p.", "pjs.", "s.d.", "tel.", "u.p.",
]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = _exc