cpython/Doc/tools/sgmlconv/docfixer.py

#! /usr/bin/env python

"""Promote the IDs from <label/> elements to the enclosing section / chapter /
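whatever, then remove the <label/> elements.  This allows *ML style internal
linking rather than the bogus LaTeX model.

Besides label promotion, convert() applies several other clean-up
transformations to the document tree and writes the result out as an ESIS
stream.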

Note that <label/>s in <title> elements are promoted two steps, since the
<title> elements are artificially created from the section parameter, and the
label really refers to the sectioning construct.
"""
__version__ = '$Revision$'


import errno
import esistools
import re
import string
import sys
import xml.dom.core
import xml.dom.esis_builder


# Workaround to deal with invalid documents (multiple root elements).  This
# does not indicate a bug in the DOM implementation.
#
def get_documentElement(self):
    """Return the document's last element child wrapped as an Element.

    Every child of the underlying document node whose type is ELEMENT is
    wrapped in turn and the last wrapper wins; None is returned when the
    document has no element children at all.
    """
    found = None
    for child in self._node.children:
        if child.type != xml.dom.core.ELEMENT:
            continue
        # No early exit: with several root-level elements the final one
        # is the one that gets reported.
        found = xml.dom.core.Element(child, self, self)
    return found

xml.dom.core.Document.get_documentElement = get_documentElement


# Replace get_childNodes for the Document class; without this, children
# accessed from the Document object via .childNodes (no matter how many
# levels of access are used) will be given an ownerDocument of None.
#
def get_childNodes(self):
    """Return a NodeList over the document's raw children.

    The document itself is passed as both of the extra NodeList
    arguments, which is what lets nodes reached through .childNodes
    carry an owner document.
    """
    raw_children = self._node.children
    return xml.dom.core.NodeList(raw_children, self, self)

xml.dom.core.Document.get_childNodes = get_childNodes


def get_first_element(doc, gi):
    """Return the first direct element child of doc named gi, or None.

    Only the immediate children of doc are examined; non-element
    children are skipped without looking at their tag name.
    """
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        if child.tagName == gi:
            return child
    return None

def extract_first_element(doc, gi):
    """Detach and return the first direct element child of doc named gi.

    Returns None, leaving doc untouched, when there is no such child.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found


def simplify(doc):
    """Rationalize the root level of a multi-rooted document.

    Steps, in order:

    * remove the root-level <documentclass> element and remember its
      "classname" attribute (default "document");
    * remove the root-level <title> element and queue it for re-insertion;
    * rename the root-level <document> element to the document class;
    * remove every root-level <input> element and queue it as well;
    * re-insert the queued nodes, in their original order and each
      followed by a newline, at the start of the document element;
    * strip text nodes from the start of the document.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    node = extract_first_element(doc, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
    node = extract_first_element(doc, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
    node = get_first_element(doc, "document")
    if node is not None:
        node._node.name = documentclass
    while 1:
        node = extract_first_element(doc, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
        docelem = doc.documentElement
        # Insert back to front so that the queued nodes end up in their
        # original relative order at the head of the document element.
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    # Drop leading root-level text.  Stop when the document runs out of
    # children instead of dereferencing a missing first child, which is
    # what happens for an empty or text-only document.
    first = doc.firstChild
    while first is not None and first.nodeType == xml.dom.core.TEXT:
        doc.removeChild(first)
        first = doc.firstChild


def cleanup_root_text(doc):
    """Remove root-level text nodes, except those right after a COMMENT.

    A text node is kept only when the child immediately before it is an
    element whose tag name is "COMMENT"; every other direct text child of
    doc is removed.
    """
    doomed = []
    keep_next = 0
    for child in doc.childNodes:
        follows_comment = keep_next
        keep_next = 0
        if child.nodeType == xml.dom.core.ELEMENT:
            if child.tagName == "COMMENT":
                keep_next = 1
        elif child.nodeType == xml.dom.core.TEXT and not follows_comment:
            doomed.append(child)
    # Removal is deferred so the child list is not modified while it is
    # being walked.
    for child in doomed:
        doc.removeChild(child)


def rewrite_desc_entries(doc, argname_gi):
    """Wrap the siblings of each argname_gi element in a <description>.

    For every element named argname_gi found in doc, all children of its
    parent other than elements named argname_gi are moved into a newly
    created <description> element, which is then appended to the parent.
    An argument element with no children is removed; a non-empty one is
    kept and preceded by a newline-and-indent text node.
    """
    for argnode in doc.getElementsByTagName(argname_gi):
        owner = argnode.parentNode
        # Collect first and move afterwards, so the parent's child list
        # is not changed while it is being walked.
        movable = []
        for child in owner.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT \
               and child.tagName == argname_gi:
                continue
            movable.append(child)
        description = doc.createElement("description")
        for child in movable:
            owner.removeChild(child)
            description.appendChild(child)
        # NOTE(review): other functions in this file call createTextNode();
        # confirm that createText() is available on the document object.
        if not argnode.childNodes:
            # no arguments, remove the empty argument node
            owner.removeChild(argnode)
        else:
            # keep the argument node, preceded by newline & indent
            owner.insertBefore(doc.createText("\n  "), argnode)
        owner.appendChild(doc.createText("\n  "))
        owner.appendChild(description)
        owner.appendChild(doc.createText("\n"))

def handle_args(doc):
    """Apply rewrite_desc_entries() for <args> and then <constructor-args>."""
    for argname_gi in ("args", "constructor-args"):
        rewrite_desc_entries(doc, argname_gi)


def handle_appendix(doc):
    """Move everything after the <appendix/> marker into <back-matter>.

    The children of the document element are scanned in order.  The first
    element child that contains an <appendix> descendant has that marker
    removed (and the marker's parent normalized); every later child of
    the document element is then moved into a new <back-matter> element
    appended to the document element.  Nothing is changed when no child
    follows the one holding the marker.
    """
    # must be called after simplify() if document is multi-rooted to begin with
    docelem = doc.documentElement
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
        elif node.nodeType == xml.dom.core.ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                parent = appnodes[0].parentNode
                parent.removeChild(appnodes[0])
                parent.normalize()
    if nodes:
        # Explicit loops rather than map(): the calls are made purely for
        # their side effects and no result list is wanted.
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        # Skip leading whitespace-only text; a newline was just added.
        while nodes and nodes[0].nodeType == xml.dom.core.TEXT \
              and not string.strip(nodes[0].data):
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))


def handle_labels(doc):
    """Promote each <label id="..."> to an "id" attribute, then remove it.

    The id is set on the label's parent, or on the grandparent when the
    parent is a <title>.  Labels whose "id" attribute is empty or missing
    are left in place untouched.
    """
    for label in doc.getElementsByTagName("label"):
        label_id = label.getAttribute("id")
        if label_id:
            holder = label.parentNode
            target = holder
            if holder.tagName == "title":
                # the label really refers to the enclosing construct
                target = holder.parentNode
            target.setAttribute("id", label_id)
            # now, remove <label id="..."/> from its parent:
            holder.removeChild(label)


def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the trailing whitespace of selected elements.

    doc is walked breadth-first.  For each element whose tag name is a
    key of wsmap, a last child that is a text node has its trailing
    whitespace replaced by the mapped string.  In addition, a text node
    holding a newline and indent is inserted at the front of a <title>'s
    parent when that parent's first child is an element.

    wsmap maps tag names to the whitespace string to end the element
    with; it must support has_key() and indexing.
    """
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        if node.nodeType == xml.dom.core.ELEMENT \
           and wsmap.has_key(node.tagName):
            ws = wsmap[node.tagName]
            children = node.childNodes
            # An element without children has no trailing text to fix;
            # indexing its empty child list would raise IndexError.
            if children:
                # The last child is reached by reversing the list and
                # taking element 0; the order is restored right after.
                children.reverse()
                if children[0].nodeType == xml.dom.core.TEXT:
                    data = string.rstrip(children[0].data) + ws
                    children[0].data = data
                children.reverse()
            # hack to get the title in place:
            if node.tagName == "title" \
               and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:
                node.parentNode.insertBefore(doc.createText("\n  "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                queue.append(child)

def normalize(doc):
    """Call normalize() on every direct element child of doc."""
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        child.normalize()


def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    The tree under doc is walked breadth-first.  An element whose tag
    name is in element_names and whose only child is a text node ending
    in "()" has those two characters removed.  Elements named in
    element_names are not descended into; all other elements are.

    element_names is a sequence of tag names.
    """
    queue = []
    for node in doc.childNodes:
        if node.nodeType == xml.dom.core.ELEMENT:
            queue.append(node)
    while queue:
        node = queue[0]
        del queue[0]
        # Membership is tested directly on the given sequence; no
        # throwaway dictionary is needed for a handful of names.
        if node.tagName in element_names:
            children = node.childNodes
            if len(children) == 1 \
               and children[0].nodeType == xml.dom.core.TEXT:
                data = children[0].data
                if data[-2:] == "()":
                    children[0].data = data[:-2]
        else:
            for child in node.childNodes:
                if child.nodeType == xml.dom.core.ELEMENT:
                    queue.append(child)


def cleanup_synopses(doc):
    """Rename <modulesynopsis> elements to <synopsis> and reposition them.

    Every <modulesynopsis> is renamed.  One whose parent is a <section>
    is also detached and re-inserted before the node found at index 2 of
    the parent's child list, preceded by a newline-and-indent text node.
    """
    # Actually, this should build a "moduleinfo" element from various
    # parts of the meta-information in the section.  <moduleinfo> needs
    # some design work before we can really do anything real.
    for synopsis in doc.getElementsByTagName("modulesynopsis"):
        synopsis._node.name = "synopsis"
        section = synopsis.parentNode
        if section.tagName != "section":
            continue
        # The child list is fetched before the node is detached, exactly
        # as the insertion index below relies on.
        siblings = section.childNodes
        section.removeChild(synopsis)
        section.insertBefore(synopsis, siblings[2])
        section.insertBefore(doc.createTextNode("\n  "), synopsis)


# Attribute values matching this pattern are written as TOKEN, all
# others as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")

def write_esis(doc, ofp, knownempty):
    """Write the children of doc to ofp as ESIS records, recursively.

    doc        -- node whose childNodes are written
    ofp        -- object with a write() method
    knownempty -- callable taking a tag name; true for declared-empty
                  elements

    For an element this writes an "e" line if it is declared empty, one
    "A" line per attribute, then "(name", the element's children and
    ")name".  Text nodes are written as "-" lines, with the data passed
    through esistools.encode().

    Raises ValueError if a declared-empty element has children and
    RuntimeError for any node that is neither an element nor text.
    """
    for node in doc.childNodes:
        nodeType = node.nodeType
        if nodeType == xml.dom.core.ELEMENT:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
                    # Call form of raise: same meaning as the old
                    # "raise Class, value" statement, valid everywhere.
                    raise ValueError("declared-empty node has children")
                ofp.write("e\n")
            for k, v in node.attributes.items():
                value = v.value
                if _token_rx.match(value):
                    dtype = "TOKEN"
                else:
                    dtype = "CDATA"
                ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
        elif nodeType == xml.dom.core.TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        else:
            raise RuntimeError("unsupported node type: %s" % nodeType)


def convert(ifp, ofp):
    """Read an ESIS stream from ifp, clean up its tree, write ESIS to ofp.

    ifp -- object with a read() method; its whole content is read at once
    ofp -- object with a write() method receiving the resulting ESIS

    The input is fed to an esistools.ExtendedEsisBuilder and the document
    it builds is passed through the transformations below in the order
    shown.  An IOError whose error code is EPIPE during output is
    ignored; any other IOError propagates.
    """
    p = esistools.ExtendedEsisBuilder()
    p.feed(ifp.read())
    doc = p.document
    # Tree transformations; handle_appendix() must come after simplify()
    # (see the comment in handle_appendix()).
    normalize(doc)
    handle_args(doc)
    simplify(doc)
    handle_labels(doc)
    handle_appendix(doc)
    # Maps a tag name to the whitespace its content should end with.
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(doc, ["function", "method", "cfunction"])
    cleanup_synopses(doc)
    #
    # Build the "is this element declared empty?" predicate from the
    # names reported by the builder.
    d = {}
    for gi in p.get_empties():
        d[gi] = gi
    knownempty = d.has_key
    #
    try:
        write_esis(doc, ofp, knownempty)
    except IOError, (err, msg):
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other errors
        # should still be reported.
        if err != errno.EPIPE:
            raise


def main():
    """Command-line entry point: docfixer.py [infile [outfile]].

    With no arguments, input is read from stdin and output is written to
    stdout.  With one argument the input comes from the named file; with
    two, the output also goes to a named file.  Any other number of
    arguments prints a usage message to stderr and exits with status 2.
    """
    if len(sys.argv) == 1:
        ifp = sys.stdin
        ofp = sys.stdout
    elif len(sys.argv) == 2:
        ifp = open(sys.argv[1])
        ofp = sys.stdout
    elif len(sys.argv) == 3:
        ifp = open(sys.argv[1])
        ofp = open(sys.argv[2], "w")
    else:
        # No usage() helper is defined in this file, so the message is
        # written here directly.
        sys.stderr.write("usage: %s [infile [outfile]]\n" % sys.argv[0])
        sys.exit(2)
    try:
        convert(ifp, ofp)
    finally:
        # Close only the files opened here, never the standard streams.
        if ofp is not sys.stdout:
            ofp.close()
        if ifp is not sys.stdin:
            ifp.close()


if __name__ == "__main__":
    main()
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00			`#! /usr/bin/env python`

			`"""Promote the IDs from <label/> elements to the enclosing section / chapter /`
			`whatever, then remove the <label/> elements. This allows *ML style internal`
			`linking rather than the bogus LaTeX model.`

			`Note that <label/>s in <title> elements are promoted two steps, since the`
			`<title> elements are artificially created from the section parameter, and the`
			`label really refers to the sectioning construct.`
			`"""`
			`__version__ = '$Revision$'`


			`import errno`
Use esistools. Generate ESIS data instead of XML. Misc. changes to some transforms. 1998-12-01 19:03:01 +00:00			`import esistools`
			`import re`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00			`import string`
			`import sys`
			`import xml.dom.core`
			`import xml.dom.esis_builder`


			`# Workaround to deal with invalid documents (multiple root elements). This`
			`# does not indicate a bug in the DOM implementation.`
			`#`
			`def get_documentElement(self):`
			`docelem = None`
			`for n in self._node.children:`
			`if n.type == xml.dom.core.ELEMENT:`
			`docelem = xml.dom.core.Element(n, self, self)`
			`return docelem`

			`xml.dom.core.Document.get_documentElement = get_documentElement`


			`# Replace get_childNodes for the Document class; without this, children`
			`# accessed from the Document object via .childNodes (no matter how many`
			`# levels of access are used) will be given an ownerDocument of None.`
			`#`
			`def get_childNodes(self):`
			`return xml.dom.core.NodeList(self._node.children, self, self)`

			`xml.dom.core.Document.get_childNodes = get_childNodes`


			`def get_first_element(doc, gi):`
			`for n in doc.childNodes:`
			`if n.nodeType == xml.dom.core.ELEMENT and n.tagName == gi:`
			`return n`

			`def extract_first_element(doc, gi):`
			`node = get_first_element(doc, gi)`
			`if node is not None:`
			`doc.removeChild(node)`
			`return node`


			`def simplify(doc):`
			`# Try to rationalize the document a bit, since these things are simply`
			`# not valid SGML/XML documents as they stand, and need a little work.`
			`documentclass = "document"`
			`inputs = []`
			`node = extract_first_element(doc, "documentclass")`
			`if node is not None:`
			`documentclass = node.getAttribute("classname")`
			`node = extract_first_element(doc, "title")`
			`if node is not None:`
			`inputs.append(node)`
			`# update the name of the root element`
			`node = get_first_element(doc, "document")`
			`if node is not None:`
			`node._node.name = documentclass`
			`while 1:`
			`node = extract_first_element(doc, "input")`
			`if node is None:`
			`break`
			`inputs.append(node)`
			`if inputs:`
			`docelem = doc.documentElement`
			`inputs.reverse()`
			`for node in inputs:`
			`text = doc.createTextNode("\n")`
			`docelem.insertBefore(text, docelem.firstChild)`
			`docelem.insertBefore(node, text)`
			`docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)`
			`while doc.firstChild.nodeType == xml.dom.core.TEXT:`
			`doc.removeChild(doc.firstChild)`


			`def cleanup_root_text(doc):`
			`discards = []`
			`skip = 0`
			`for n in doc.childNodes:`
			`prevskip = skip`
			`skip = 0`
			`if n.nodeType == xml.dom.core.TEXT and not prevskip:`
			`discards.append(n)`
Use esistools. Generate ESIS data instead of XML. Misc. changes to some transforms. 1998-12-01 19:03:01 +00:00			`elif n.nodeType == xml.dom.core.ELEMENT and n.tagName == "COMMENT":`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00			`skip = 1`
			`for node in discards:`
			`doc.removeChild(node)`


			`def rewrite_desc_entries(doc, argname_gi):`
			`argnodes = doc.getElementsByTagName(argname_gi)`
			`for node in argnodes:`
			`parent = node.parentNode`
			`nodes = []`
			`for n in parent.childNodes:`
			`if n.nodeType != xml.dom.core.ELEMENT or n.tagName != argname_gi:`
			`nodes.append(n)`
			`desc = doc.createElement("description")`
			`for n in nodes:`
			`parent.removeChild(n)`
			`desc.appendChild(n)`
			`if node.childNodes:`
			`# keep the <args>...</args>, newline & indent`
			`parent.insertBefore(doc.createText("\n "), node)`
			`else:`
			`# no arguments, remove the <args/> node`
			`parent.removeChild(node)`
			`parent.appendChild(doc.createText("\n "))`
			`parent.appendChild(desc)`
			`parent.appendChild(doc.createText("\n"))`

			`def handle_args(doc):`
			`rewrite_desc_entries(doc, "args")`
			`rewrite_desc_entries(doc, "constructor-args")`


Use esistools. Generate ESIS data instead of XML. Misc. changes to some transforms. 1998-12-01 19:03:01 +00:00			`def handle_appendix(doc):`
			`# must be called after simplfy() if document is multi-rooted to begin with`
			`docelem = doc.documentElement`
			`toplevel = docelem.tagName == "manual" and "chapter" or "section"`
			`appendices = 0`
			`nodes = []`
			`for node in docelem.childNodes:`
			`if appendices:`
			`nodes.append(node)`
			`elif node.nodeType == xml.dom.core.ELEMENT:`
			`appnodes = node.getElementsByTagName("appendix")`
			`if appnodes:`
			`appendices = 1`
			`parent = appnodes[0].parentNode`
			`parent.removeChild(appnodes[0])`
			`parent.normalize()`
			`if nodes:`
			`map(docelem.removeChild, nodes)`
			`docelem.appendChild(doc.createTextNode("\n\n\n"))`
			`back = doc.createElement("back-matter")`
			`docelem.appendChild(back)`
			`back.appendChild(doc.createTextNode("\n"))`
			`while nodes and nodes[0].nodeType == xml.dom.core.TEXT \`
			`and not string.strip(nodes[0].data):`
			`del nodes[0]`
			`map(back.appendChild, nodes)`
			`docelem.appendChild(doc.createTextNode("\n"))`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00

			`def handle_labels(doc):`
			`labels = doc.getElementsByTagName("label")`
			`for label in labels:`
			`id = label.getAttribute("id")`
			`if not id:`
			`continue`
			`parent = label.parentNode`
			`if parent.tagName == "title":`
			`parent.parentNode.setAttribute("id", id)`
			`else:`
			`parent.setAttribute("id", id)`
			`# now, remove <label id="..."/> from parent:`
			`parent.removeChild(label)`


Add some additional cleanup transformations. 1998-11-23 23:10:35 +00:00			`def fixup_trailing_whitespace(doc, wsmap):`
			`queue = [doc]`
			`while queue:`
			`node = queue[0]`
			`del queue[0]`
			`if node.nodeType == xml.dom.core.ELEMENT \`
			`and wsmap.has_key(node.tagName):`
			`ws = wsmap[node.tagName]`
			`children = node.childNodes`
			`children.reverse()`
			`if children[0].nodeType == xml.dom.core.TEXT:`
			`data = string.rstrip(children[0].data) + ws`
			`children[0].data = data`
			`children.reverse()`
			`# hack to get the title in place:`
			`if node.tagName == "title" \`
			`and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:`
			`node.parentNode.insertBefore(doc.createText("\n "),`
			`node.parentNode.firstChild)`
			`for child in node.childNodes:`
			`if child.nodeType == xml.dom.core.ELEMENT:`
			`queue.append(child)`


			`def normalize(doc):`
			`for node in doc.childNodes:`
			`if node.nodeType == xml.dom.core.ELEMENT:`
			`node.normalize()`


			`def cleanup_trailing_parens(doc, element_names):`
			`d = {}`
			`for gi in element_names:`
			`d[gi] = gi`
			`rewrite_element = d.has_key`
			`queue = []`
			`for node in doc.childNodes:`
			`if node.nodeType == xml.dom.core.ELEMENT:`
			`queue.append(node)`
			`while queue:`
			`node = queue[0]`
			`del queue[0]`
			`if rewrite_element(node.tagName):`
			`children = node.childNodes`
			`if len(children) == 1 \`
			`and children[0].nodeType == xml.dom.core.TEXT:`
			`data = children[0].data`
			`if data[-2:] == "()":`
			`children[0].data = data[:-2]`
			`else:`
			`for child in node.childNodes:`
			`if child.nodeType == xml.dom.core.ELEMENT:`
			`queue.append(child)`


Added a transform to start cleaning up the modulesynopsis stuff a little; more thinking is needed about what we really want. 1998-12-10 05:07:09 +00:00			`def cleanup_synopses(doc):`
			`# Actually, this should build a "moduleinfo" element from various`
			`# parts of the meta-information in the section. <moduleinfo> needs`
			`# some design work before we can really do anything real.`
			`synopses = doc.getElementsByTagName("modulesynopsis")`
			`for node in synopses:`
			`node._node.name = "synopsis"`
			`parent = node.parentNode`
			`if parent.tagName == "section":`
			`children = parent.childNodes`
			`parent.removeChild(node)`
			`parent.insertBefore(node, children[2])`
			`text = doc.createTextNode("\n ")`
			`parent.insertBefore(text, node)`


Use esistools. Generate ESIS data instead of XML. Misc. changes to some transforms. 1998-12-01 19:03:01 +00:00			`_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")`

			`def write_esis(doc, ofp, knownempty):`
			`for node in doc.childNodes:`
			`nodeType = node.nodeType`
			`if nodeType == xml.dom.core.ELEMENT:`
			`gi = node.tagName`
			`if knownempty(gi):`
			`if node.hasChildNodes():`
			`raise ValueError, "declared-empty node has children"`
			`ofp.write("e\n")`
			`for k, v in node.attributes.items():`
			`value = v.value`
			`if _token_rx.match(value):`
			`dtype = "TOKEN"`
			`else:`
			`dtype = "CDATA"`
			`ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))`
			`ofp.write("(%s\n" % gi)`
			`write_esis(node, ofp, knownempty)`
			`ofp.write(")%s\n" % gi)`
			`elif nodeType == xml.dom.core.TEXT:`
			`ofp.write("-%s\n" % esistools.encode(node.data))`
			`else:`
			`raise RuntimeError, "unsupported node type: %s" % nodeType`


Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00			`def convert(ifp, ofp):`
Use esistools. Generate ESIS data instead of XML. Misc. changes to some transforms. 1998-12-01 19:03:01 +00:00			`p = esistools.ExtendedEsisBuilder()`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00			`p.feed(ifp.read())`
			`doc = p.document`
Add some additional cleanup transformations. 1998-11-23 23:10:35 +00:00			`normalize(doc)`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00			`handle_args(doc)`
			`simplify(doc)`
			`handle_labels(doc)`
Use esistools. Generate ESIS data instead of XML. Misc. changes to some transforms. 1998-12-01 19:03:01 +00:00			`handle_appendix(doc)`
Add some additional cleanup transformations. 1998-11-23 23:10:35 +00:00			`fixup_trailing_whitespace(doc, {`
			`"abstract": "\n",`
			`"title": "",`
			`"chapter": "\n\n",`
			`"section": "\n\n",`
			`"subsection": "\n\n",`
			`"subsubsection": "\n\n",`
			`"paragraph": "\n\n",`
			`"subparagraph": "\n\n",`
			`})`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00			`cleanup_root_text(doc)`
Add some additional cleanup transformations. 1998-11-23 23:10:35 +00:00			`cleanup_trailing_parens(doc, ["function", "method", "cfunction"])`
Added a transform to start cleaning up the modulesynopsis stuff a little; more thinking is needed about what we really want. 1998-12-10 05:07:09 +00:00			`cleanup_synopses(doc)`
Use esistools. Generate ESIS data instead of XML. Misc. changes to some transforms. 1998-12-01 19:03:01 +00:00			`#`
			`d = {}`
			`for gi in p.get_empties():`
			`d[gi] = gi`
			`knownempty = d.has_key`
			`#`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00			`try:`
Use esistools. Generate ESIS data instead of XML. Misc. changes to some transforms. 1998-12-01 19:03:01 +00:00			`write_esis(doc, ofp, knownempty)`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 17:02:03 +00:00			`except IOError, (err, msg):`
			`# Ignore EPIPE; it just means that whoever we're writing to stopped`
			`# reading. The rest of the output would be ignored. All other errors`
			`# should still be reported,`
			`if err != errno.EPIPE:`
			`raise`


			`def main():`
			`if len(sys.argv) == 1:`
			`ifp = sys.stdin`
			`ofp = sys.stdout`
			`elif len(sys.argv) == 2:`
			`ifp = open(sys.argv[1])`
			`ofp = sys.stdout`
			`elif len(sys.argv) == 3:`
			`ifp = open(sys.argv[1])`
			`ofp = open(sys.argv[2], "w")`
			`else:`
			`usage()`
			`sys.exit(2)`
			`convert(ifp, ofp)`


			`if __name__ == "__main__":`
			`main()`