cpython/Doc/tools/toc2bkm.py

#! /usr/bin/env python

"""Convert a LaTeX .toc file to some PDFTeX magic to create that neat outline.

The output file has an extension of '.bkm' instead of '.out', since hyperref
already uses that extension.
"""

import getopt
import os
import re
import string
import sys


# Each entry in the table of contents is a tuple of:
#
#   Section Type,  Section #,  Title String,  Page #,  List of Sub-entries
#
# The return value of parse_toc() is a list of such tuples.

# Pattern for one "\contentsline" record of a .toc file.  Its capture groups
# are, in order: the section type, the section number (None when the optional
# "\numberline" part is absent), the raw title text, and the page number as a
# string of digits.
cline_re = r"""^
\\contentsline\ \{([a-z]*)}             # type of section in $1
\{(?:\\numberline\ \{([0-9.A-Z]+)})?     # section number
(.*)}                                   # title string
\{(\d+)}$"""                            # page number

# Compiled with re.VERBOSE, so the unescaped whitespace and the "#" comments
# inside the pattern text are ignored; only the escaped "\ " matches a space.
cline_rx = re.compile(cline_re, re.VERBOSE)

# Direction value meaning "the new entry nests one level below the previous
# entry" (as opposed to a positive count of levels to climb back out).
OUTER_TO_INNER = -1

# Maps (previous section type, new section type) to a direction.  A positive
# value is the number of nesting levels parse_toc() pops before appending the
# new entry; OUTER_TO_INNER makes it descend into the last entry's sub-list.
# Only single-step descents are listed, so a pair such as
# ('chapter', 'subsection') has no entry here.
_transition_map = {
    ('chapter', 'section'): OUTER_TO_INNER,
    ('section', 'subsection'): OUTER_TO_INNER,
    ('subsection', 'subsubsection'): OUTER_TO_INNER,
    ('subsubsection', 'subsection'): 1,
    ('subsection', 'section'): 1,
    ('section', 'chapter'): 1,
    ('subsection', 'chapter'): 2,
    ('subsubsection', 'section'): 2,
    ('subsubsection', 'chapter'): 3,
    }

# Section types parse_toc() will place in the outline; any other type is
# skipped unless it equals the current level.
INCLUDED_LEVELS = ("chapter", "section", "subsection", "subsubsection")


def parse_toc(fp, bigpart=None):
    """Parse a LaTeX .toc file into a nested outline.

    fp is an open file-like object providing readline(); bigpart names the
    outermost section type expected in the file and defaults to 'chapter'.

    Returns a list of entries.  Each entry is a tuple of (section type,
    section number or None, cleaned title, page number as an int, list of
    sub-entries), where the sub-entries have the same shape.

    Lines that do not match the contentsline pattern are echoed to
    sys.stderr, prefixed with their line number.

    Raises KeyError when the section type changes in a way that
    _transition_map does not list, and IndexError when a nested entry shows
    up before any entry exists at the enclosing level.
    """
    toc = top = []
    # stack[0] is always the list currently being appended to; the lists
    # enclosing it follow, ending with the top-level list.
    stack = [toc]
    level = bigpart or 'chapter'
    lineno = 0
    while 1:
        line = fp.readline()
        if not line:
            break
        lineno = lineno + 1
        m = cline_rx.match(line)
        if not m:
            # Report the unparsable line together with its position.
            sys.stderr.write("l.%s: %s" % (lineno, line))
            continue
        stype, snum, title, pageno = m.group(1, 2, 3, 4)
        if stype != level and stype not in INCLUDED_LEVELS:
            # we don't want paragraphs & subparagraphs
            continue
        entry = (stype, snum, clean_title(title), int(pageno), [])
        if stype != level:
            direction = _transition_map[(level, stype)]
            if direction == OUTER_TO_INNER:
                # Descend into the sub-entry list of the most recent entry.
                toc = toc[-1][-1]
                stack.insert(0, toc)
            else:
                # Climb back out by `direction` nesting levels.
                del stack[:direction]
                toc = stack[0]
            level = stype
        toc.append(entry)
    return top


# "\hackscore{...}" including its braced argument; replaced by "\_".
hackscore_rx = re.compile(r"\\hackscore\s*{[^}]*}")
# "\raisebox{...}" including its first braced argument; dropped entirely.
raisebox_rx = re.compile(r"\\raisebox\s*{[^}]*}")
# A control word (backslash plus letters) and the whitespace that follows it.
title_rx = re.compile(r"\\([a-zA-Z])+\s+")
# 256-character identity table, the same value string.maketrans("", "")
# produced.  Retained for importers; clean_title() no longer needs it.
title_trans = "".join(map(chr, range(256)))

def clean_title(title):
    r"""Strip LaTeX markup from a .toc title string.

    Every "\raisebox{...}" is removed and every "\hackscore{...}" becomes
    "\_".  Control words followed by whitespace (for example "\module ")
    are then deleted, except "\textunderscore", which is kept.  Finally all
    braces are removed.  Returns the cleaned string.
    """
    title = raisebox_rx.sub("", title)
    title = hackscore_rx.sub(r"\\_", title)
    pos = 0
    while 1:
        m = title_rx.search(title, pos)
        if not m:
            break
        start = m.start()
        if title[start:start+15] == "\\textunderscore":
            # Keep this macro; resume the search just past its backslash.
            pos = start + 1
        else:
            title = title[:start] + title[m.end():]
            # Rescan from the splice point: the text that followed the
            # deleted macro may itself begin with another macro.
            pos = start
    # Drop grouping braces without relying on the Python 2-only
    # string.translate() deletion argument.
    return title.replace("{", "").replace("}", "")


def write_toc(toc, fp):
    """Write each top-level entry of toc, with its sub-entries, to fp."""
    outermost_layer = 0
    for top_entry in toc:
        write_toc_entry(top_entry, fp, outermost_layer)

def write_toc_entry(entry, fp, layer):
    """Write the \\pdfoutline line for entry, then those of its sub-entries.

    entry is a (section type, section number, title, page number,
    sub-entries) tuple and fp is a writable file-like object.  layer is the
    nesting depth; it is only passed along (incremented) to the recursive
    calls and does not affect the text written.
    """
    _kind, number, heading, page, children = entry
    pieces = ["\\pdfoutline goto name{page%03d}" % page]
    if children:
        # A negative count is emitted whenever the entry has sub-entries.
        pieces.append("count -%d" % len(children))
    label = heading
    if number:
        label = "%s %s" % (number, heading)
    pieces.append("{%s}" % label)
    fp.write(" ".join(pieces) + "\n")
    for child in children:
        write_toc_entry(child, fp, layer + 1)


def process(ifn, ofn, bigpart=None):
    """Convert the .toc file named ifn into the bookmark file named ofn.

    bigpart is handed to parse_toc() as the outermost section type.  The
    output file is opened (and so created or truncated) only after the
    input has been parsed, and both files are closed before returning,
    even if parsing or writing raises.
    """
    with open(ifn) as infp:
        toc = parse_toc(infp, bigpart)
    with open(ofn, "w") as outfp:
        write_toc(toc, outfp)


def usage():
    """Write a one-line synopsis of the command line to sys.stderr."""
    prog = os.path.basename(sys.argv[0])
    sys.stderr.write("usage: %s [-c bigpart] file[.toc] ...\n" % prog)


def main():
    """Command-line entry point.

    Accepts an optional "-c bigpart" option naming the outermost section
    type, followed by one or more file names.  A name without an extension
    gets ".toc" appended; the output is written next to it with a ".bkm"
    extension.  With no file names, prints a usage message to sys.stderr
    and exits with status 2.
    """
    bigpart = None
    opts, args = getopt.getopt(sys.argv[1:], "c:")
    if opts:
        # "-c" is the only option accepted, so the first pair holds its value.
        bigpart = opts[0][1]
    if not args:
        usage()
        sys.exit(2)
    for filename in args:
        base, ext = os.path.splitext(filename)
        ext = ext or ".toc"
        process(base + ext, base + ".bkm", bigpart)


# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`#! /usr/bin/env python`

			`"""Convert a LaTeX .toc file to some PDFTeX magic to create that neat outline.`

			`The output file has an extension of '.bkm' instead of '.out', since hyperref`
Don't attempt to add paragraph and subparagraph sections to the PDF outline. Work around font-long bogosity caused by ' in docstring. 1998-10-07 14:12:20 +00:00			`already uses that extension.`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`"""`

Allow the user to specify the "biggest" section type from the command line; default is "chapter". Use 'python toc2bkm.py -c section' to use with Python HOWTO documents. 1998-03-07 15:34:50 +00:00			`import getopt`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`import os`
			`import re`
			`import string`
			`import sys`


			`# Ench item in an entry is a tuple of:`
			`#`
			`# Section #, Title String, Page #, List of Sub-entries`
Mostly spurious change to ensure that everyone's version of this picks up the execute bit.... problem discovered by Guido. 1998-05-14 20:07:10 +00:00			`#`
			`# The return value of parse_toc() is such a tuple.`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00
			`cline_re = r"""^`
			`\\contentsline\ \{([a-z]*)} # type of section in $1`
			`\{(?:\\numberline\ \{([0-9.A-Z]+)})? # section number`
			`(.*)} # title string`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`\{(\d+)}$""" # page number`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00
			`cline_rx = re.compile(cline_re, re.VERBOSE)`

			`OUTER_TO_INNER = -1`

			`_transition_map = {`
			`('chapter', 'section'): OUTER_TO_INNER,`
			`('section', 'subsection'): OUTER_TO_INNER,`
			`('subsection', 'subsubsection'): OUTER_TO_INNER,`
			`('subsubsection', 'subsection'): 1,`
			`('subsection', 'section'): 1,`
			`('section', 'chapter'): 1,`
			`('subsection', 'chapter'): 2,`
			`('subsubsection', 'section'): 2,`
			`('subsubsection', 'chapter'): 3,`
			`}`

Don't attempt to add paragraph and subparagraph sections to the PDF outline. Work around font-long bogosity caused by ' in docstring. 1998-10-07 14:12:20 +00:00			`INCLUDED_LEVELS = ("chapter", "section", "subsection", "subsubsection")`


Allow the user to specify the "biggest" section type from the command line; default is "chapter". Use 'python toc2bkm.py -c section' to use with Python HOWTO documents. 1998-03-07 15:34:50 +00:00			`def parse_toc(fp, bigpart=None):`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`toc = top = []`
			`stack = [toc]`
Allow the user to specify the "biggest" section type from the command line; default is "chapter". Use 'python toc2bkm.py -c section' to use with Python HOWTO documents. 1998-03-07 15:34:50 +00:00			`level = bigpart or 'chapter'`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`lineno = 0`
			`while 1:`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`line = fp.readline()`
			`if not line:`
			`break`
			`lineno = lineno + 1`
			`m = cline_rx.match(line)`
			`if m:`
			`stype, snum, title, pageno = m.group(1, 2, 3, 4)`
			`title = clean_title(title)`
			`entry = (stype, snum, title, string.atoi(pageno), [])`
			`if stype == level:`
			`toc.append(entry)`
			`else:`
Don't attempt to add paragraph and subparagraph sections to the PDF outline. Work around font-long bogosity caused by ' in docstring. 1998-10-07 14:12:20 +00:00			`if stype not in INCLUDED_LEVELS:`
			`# we don't want paragraphs & subparagraphs`
			`continue`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`direction = _transition_map[(level, stype)]`
			`if direction == OUTER_TO_INNER:`
			`toc = toc[-1][-1]`
			`stack.insert(0, toc)`
			`toc.append(entry)`
			`else:`
			`for i in range(direction):`
			`del stack[0]`
			`toc = stack[0]`
			`toc.append(entry)`
			`level = stype`
			`else:`
			`sys.stderr.write("l.%s: " + line)`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`return top`


clean_title(): Clean a little more carefully. Still does funny things with underscores. Might this be a fundamental PDF limitation? Hm, could still be a TeX thing. 1998-03-10 14:02:35 +00:00			`hackscore_rx = re.compile(r"\\hackscore\s*{[^}]*}")`
			`raisebox_rx = re.compile(r"\\raisebox\s*{[^}]*}")`
			`title_rx = re.compile(r"\\([a-zA-Z])+\s+")`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`title_trans = string.maketrans("", "")`

			`def clean_title(title):`
clean_title(): Clean a little more carefully. Still does funny things with underscores. Might this be a fundamental PDF limitation? Hm, could still be a TeX thing. 1998-03-10 14:02:35 +00:00			`title = raisebox_rx.sub("", title)`
			`title = hackscore_rx.sub(r"\\_", title)`
			`pos = 0`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`while 1:`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`m = title_rx.search(title, pos)`
			`if m:`
			`start = m.start()`
			`if title[start:start+15] != "\\textunderscore":`
			`title = title[:start] + title[m.end():]`
			`pos = start + 1`
			`else:`
			`break`
clean_title(): Clean a little more carefully. Still does funny things with underscores. Might this be a fundamental PDF limitation? Hm, could still be a TeX thing. 1998-03-10 14:02:35 +00:00			`title = string.translate(title, title_trans, "{}")`
			`return title`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00

			`def write_toc(toc, fp):`
			`for entry in toc:`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`write_toc_entry(entry, fp, 0)`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00
			`def write_toc_entry(entry, fp, layer):`
			`stype, snum, title, pageno, toc = entry`
Fix the broken PDF links, really this time! Appending a junk char to the end doesn't quite do it, or doesn't seem to at any rate. Instead, pad the page numbers to always be 3 characters wide, with leading zeros. 1998-04-15 17:50:22 +00:00			`s = "\\pdfoutline goto name{page%03d}" % pageno`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`if toc:`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`s = "%s count -%d" % (s, len(toc))`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`if snum:`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`title = "%s %s" % (snum, title)`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`s = "%s {%s}\n" % (s, title)`
			`fp.write(s)`
			`for entry in toc:`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`write_toc_entry(entry, fp, layer + 1)`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00

process(): New function that contains the "orchestration" of the actual work. main(): Just handle the command line and filename determination, calling process() to do the work. These changes make this more import-friendly. 1999-03-03 19:25:56 +00:00			`def process(ifn, ofn, bigpart=None):`
			`toc = parse_toc(open(ifn), bigpart)`
			`write_toc(toc, open(ofn, "w"))`


Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00			`def main():`
Allow the user to specify the "biggest" section type from the command line; default is "chapter". Use 'python toc2bkm.py -c section' to use with Python HOWTO documents. 1998-03-07 15:34:50 +00:00			`bigpart = None`
			`opts, args = getopt.getopt(sys.argv[1:], "c:")`
			`if opts:`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`bigpart = opts[0][1]`
Allow the user to specify the "biggest" section type from the command line; default is "chapter". Use 'python toc2bkm.py -c section' to use with Python HOWTO documents. 1998-03-07 15:34:50 +00:00			`if not args:`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`usage()`
			`sys.exit(2)`
Allow the user to specify the "biggest" section type from the command line; default is "chapter". Use 'python toc2bkm.py -c section' to use with Python HOWTO documents. 1998-03-07 15:34:50 +00:00			`for filename in args:`
Hush the nanny. 2000-10-07 12:50:05 +00:00			`base, ext = os.path.splitext(filename)`
			`ext = ext or ".toc"`
process(): New function that contains the "orchestration" of the actual work. main(): Just handle the command line and filename determination, calling process() to do the work. These changes make this more import-friendly. 1999-03-03 19:25:56 +00:00			`process(base + ext, base + ".bkm", bigpart)`
Utility to help create outline information for PDF. 1998-03-06 21:29:00 +00:00

			`if __name__ == "__main__":`
			`main()`