cpython/Doc/tools/sgmlconv/docfixer.py

329 lines
10 KiB
Python
Raw Normal View History

#! /usr/bin/env python
"""Promote the IDs from <label/> elements to the enclosing section / chapter /
whatever, then remove the <label/> elements. This allows *ML style internal
linking rather than the bogus LaTeX model.
Note that <label/>s in <title> elements are promoted two steps, since the
<title> elements are artificially created from the section parameter, and the
label really refers to the sectioning construct.
"""
__version__ = '$Revision$'
import errno
import esistools
import re
import string
import sys
import xml.dom.core
import xml.dom.esis_builder
# Workaround to deal with invalid documents (multiple root elements).  This
# does not indicate a bug in the DOM implementation.
#
def get_documentElement(self):
    """Return the document element of a possibly multi-rooted document.

    Walks the raw children of the underlying node and wraps each
    element-type child in an Element; the wrapper for the last such child
    is returned.  Returns None when there is no element child at all.
    """
    found = None
    for raw in self._node.children:
        if raw.type != xml.dom.core.ELEMENT:
            continue
        # Later element children override earlier ones.
        found = xml.dom.core.Element(raw, self, self)
    return found


# Install the replacement accessor on the Document class.
xml.dom.core.Document.get_documentElement = get_documentElement
# Replace get_childNodes for the Document class; without this, children
# accessed from the Document object via .childNodes (no matter how many
# levels of access are used) will be given an ownerDocument of None.
#
def get_childNodes(self):
    """Return a NodeList over the raw children, owned by this document.

    The document itself is passed as both of the trailing NodeList
    arguments, which is what gives the children a usable owner document.
    """
    raw_children = self._node.children
    return xml.dom.core.NodeList(raw_children, self, self)


# Install the replacement accessor on the Document class.
xml.dom.core.Document.get_childNodes = get_childNodes
def get_first_element(doc, gi):
    """Return the first direct child element of `doc` named `gi`.

    Only immediate children are examined.  Returns None when no child
    element has that tag name.
    """
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        if child.tagName == gi:
            return child
    return None
def extract_first_element(doc, gi):
    """Detach and return the first direct child element of `doc` named `gi`.

    Returns None (and leaves `doc` untouched) when there is no such child.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found
def simplify(doc):
    """Fold the stray top-level elements of `doc` into its root element.

    The input is not a valid SGML/XML document as it stands, so:

    * a top-level <documentclass> is removed and its "classname" attribute
      becomes the name of the top-level <document> element (the name stays
      "document" when there is no <documentclass>);
    * a top-level <title> and every top-level <input> are removed and
      re-inserted, in that order, at the start of the document element,
      separated by newline text nodes;
    * leading text nodes directly under `doc` are dropped.
    """
    root_name = "document"
    hoisted = []

    classnode = extract_first_element(doc, "documentclass")
    if classnode is not None:
        root_name = classnode.getAttribute("classname")

    titlenode = extract_first_element(doc, "title")
    if titlenode is not None:
        hoisted.append(titlenode)

    # update the name of the root element
    rootnode = get_first_element(doc, "document")
    if rootnode is not None:
        rootnode._node.name = root_name

    # Pull out every top-level <input>, preserving document order.
    inputnode = extract_first_element(doc, "input")
    while inputnode is not None:
        hoisted.append(inputnode)
        inputnode = extract_first_element(doc, "input")

    if hoisted:
        docelem = doc.documentElement
        # Each node is pushed onto the front, so walk them back to front to
        # end up with the original order.
        hoisted.reverse()
        for moved in hoisted:
            newline = doc.createTextNode("\n")
            docelem.insertBefore(newline, docelem.firstChild)
            docelem.insertBefore(moved, newline)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)

    # Strip text nodes that precede the first non-text child of the document.
    while doc.firstChild.nodeType == xml.dom.core.TEXT:
        doc.removeChild(doc.firstChild)
def cleanup_root_text(doc):
    """Remove top-level text nodes, except those right after a COMMENT.

    A text node that is a direct child of `doc` is kept only when the
    child immediately before it is an element named "COMMENT"; every other
    top-level text node is removed.
    """
    doomed = []
    after_comment = 0
    for child in doc.childNodes:
        kind = child.nodeType
        if kind == xml.dom.core.TEXT:
            if not after_comment:
                doomed.append(child)
            after_comment = 0
        elif kind == xml.dom.core.ELEMENT and child.tagName == "COMMENT":
            after_comment = 1
        else:
            after_comment = 0
    # Remove only after the scan so the child list is not changed while
    # it is being iterated.
    for child in doomed:
        doc.removeChild(child)
def rewrite_desc_entries(doc, argname_gi):
    """Wrap the siblings of each `argname_gi` element in a <description>.

    For every element named `argname_gi` found in `doc`, all children of
    its parent other than `argname_gi` elements are moved, in order, into
    a new <description> element appended to the parent.  The `argname_gi`
    element itself is kept (preceded by newline-and-indent text) when it
    has children, and removed when it is empty.

    Text nodes are created with createTextNode(), the factory method used
    by the rest of this module.
    """
    argnodes = doc.getElementsByTagName(argname_gi)
    for node in argnodes:
        parent = node.parentNode
        # Collect first, move afterwards, so the child list is not
        # modified while it is being iterated.
        nodes = []
        for n in parent.childNodes:
            if n.nodeType != xml.dom.core.ELEMENT or n.tagName != argname_gi:
                nodes.append(n)
        desc = doc.createElement("description")
        for n in nodes:
            parent.removeChild(n)
            desc.appendChild(n)
        if node.childNodes:
            # keep the <args>...</args>, newline & indent
            parent.insertBefore(doc.createTextNode("\n "), node)
        else:
            # no arguments, remove the <args/> node
            parent.removeChild(node)
        parent.appendChild(doc.createTextNode("\n "))
        parent.appendChild(desc)
        parent.appendChild(doc.createTextNode("\n"))
def handle_args(doc):
    """Apply rewrite_desc_entries() for "args", then "constructor-args"."""
    for argname_gi in ("args", "constructor-args"):
        rewrite_desc_entries(doc, argname_gi)
def handle_appendix(doc):
    """Move everything after the <appendix> marker into <back-matter>.

    Must be called after simplify() if the document is multi-rooted to
    begin with.

    The children of the document element are scanned in order.  The first
    child element that contains an <appendix> descendant has the first
    such descendant removed (and that descendant's former parent
    normalized); every child of the document element after that one is
    then moved into a new <back-matter> element appended to the document
    element.  Leading whitespace-only text nodes among the moved children
    are dropped.  Nothing changes when no children follow the marker.
    """
    docelem = doc.documentElement
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
        elif node.nodeType == xml.dom.core.ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                parent = appnodes[0].parentNode
                parent.removeChild(appnodes[0])
                parent.normalize()
    if nodes:
        # Explicit loops: the calls are made purely for their side effects.
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        # Skip leading whitespace-only text; a newline was just added.
        while nodes and nodes[0].nodeType == xml.dom.core.TEXT \
              and not string.strip(nodes[0].data):
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))
def handle_labels(doc):
    """Promote <label id="..."/> IDs to the enclosing element.

    For each <label> with a non-empty "id" attribute, the ID is set on the
    label's parent -- or on the grandparent when the parent is a <title> --
    and the <label> is then removed from its parent.  Labels without an
    ID are left in place.
    """
    for label in doc.getElementsByTagName("label"):
        label_id = label.getAttribute("id")
        if label_id:
            holder = label.parentNode
            target = holder
            if holder.tagName == "title":
                # The label really refers to the construct owning the title.
                target = holder.parentNode
            target.setAttribute("id", label_id)
            # now, remove <label id="..."/> from its parent:
            holder.removeChild(label)
def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the trailing whitespace of selected elements.

    `wsmap` maps tag names to the whitespace string that should end the
    element's content.  The tree is walked breadth-first from `doc`; for
    each element whose tag name is a key of `wsmap`, a final text child
    has its trailing whitespace replaced by the mapped string.  Elements
    with no children, or whose last child is not text, are left alone.

    In addition, when a <title> is handled and the first child of its
    parent is an element, newline-and-indent text is inserted in front of
    that first child.
    """
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        if node.nodeType == xml.dom.core.ELEMENT \
           and wsmap.has_key(node.tagName):
            ws = wsmap[node.tagName]
            children = node.childNodes
            # An empty element has no trailing text to adjust; indexing
            # children[0] there would fail.
            if children:
                # Reverse so the last child is at index 0, then restore.
                children.reverse()
                if children[0].nodeType == xml.dom.core.TEXT:
                    data = string.rstrip(children[0].data) + ws
                    children[0].data = data
                children.reverse()
            # hack to get the title in place:
            if node.tagName == "title" \
               and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:
                node.parentNode.insertBefore(doc.createTextNode("\n "),
                                             node.parentNode.firstChild)
        # Descend into every child element, whether or not this node matched.
        for child in node.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                queue.append(child)
def normalize(doc):
    """Call normalize() on every element that is a direct child of `doc`."""
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        child.normalize()
def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    The tree is walked breadth-first from the element children of `doc`.
    An element whose tag name is in `element_names` is rewritten only if
    its sole child is a text node ending in "()"; such elements are never
    descended into.  All other elements have their child elements queued.
    """
    targets = {}
    for name in element_names:
        targets[name] = name
    is_target = targets.has_key

    pending = []
    for top in doc.childNodes:
        if top.nodeType == xml.dom.core.ELEMENT:
            pending.append(top)

    while pending:
        current = pending[0]
        del pending[0]
        if not is_target(current.tagName):
            for kid in current.childNodes:
                if kid.nodeType == xml.dom.core.ELEMENT:
                    pending.append(kid)
            continue
        kids = current.childNodes
        if len(kids) != 1 or kids[0].nodeType != xml.dom.core.TEXT:
            continue
        text = kids[0].data
        if text[-2:] == "()":
            kids[0].data = text[:-2]
def cleanup_synopses(doc):
    """Rename <modulesynopsis> to <synopsis> and reposition it in sections.

    Every <modulesynopsis> element is renamed to "synopsis".  When its
    parent is a <section>, it is removed and re-inserted before the entry
    at index 2 of the parent's child list, with newline-and-indent text
    placed in front of it.
    """
    # Actually, this should build a "moduleinfo" element from various
    # parts of the meta-information in the section.  <moduleinfo> needs
    # some design work before we can really do anything real.
    for synopsis in doc.getElementsByTagName("modulesynopsis"):
        synopsis._node.name = "synopsis"
        section = synopsis.parentNode
        if section.tagName != "section":
            continue
        # The child list is fetched before the removal and indexed after
        # it; keep this order.
        siblings = section.childNodes
        section.removeChild(synopsis)
        section.insertBefore(synopsis, siblings[2])
        indent = doc.createTextNode("\n ")
        section.insertBefore(indent, synopsis)
# Attribute values matching this pattern are written as TOKEN; all others
# are written as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")


def write_esis(doc, ofp, knownempty):
    """Write the children of `doc` to `ofp`, one record per line.

    Text nodes are written as "-<data>".  Elements are written as their
    attributes ("A<name> <type> <value>"), then "(<gi>", their content
    (recursively), and ")<gi>"; elements for which `knownempty(gi)` is
    true are preceded by an "e" line.  Text and attribute values are
    passed through esistools.encode().

    Raises ValueError if a declared-empty element has children, and
    RuntimeError for any node that is neither an element nor text.
    """
    for node in doc.childNodes:
        kind = node.nodeType
        if kind == xml.dom.core.TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
            continue
        if kind != xml.dom.core.ELEMENT:
            raise RuntimeError("unsupported node type: %s" % kind)
        gi = node.tagName
        if knownempty(gi):
            if node.hasChildNodes():
                raise ValueError("declared-empty node has children")
            ofp.write("e\n")
        for attrname, attr in node.attributes.items():
            value = attr.value
            dtype = "CDATA"
            if _token_rx.match(value):
                dtype = "TOKEN"
            ofp.write("A%s %s %s\n"
                      % (attrname, dtype, esistools.encode(value)))
        ofp.write("(%s\n" % gi)
        write_esis(node, ofp, knownempty)
        ofp.write(")%s\n" % gi)
def convert(ifp, ofp):
    """Read a document from `ifp`, fix it up, and write it to `ofp`.

    All of `ifp` is fed to an esistools.ExtendedEsisBuilder; the resulting
    document is run through the fix-up passes below, in this order, and
    then written with write_esis().  The set of declared-empty elements
    is taken from the builder's get_empties().

    An IOError whose errno is EPIPE while writing is ignored; any other
    IOError is re-raised unchanged.
    """
    p = esistools.ExtendedEsisBuilder()
    p.feed(ifp.read())
    doc = p.document
    normalize(doc)
    handle_args(doc)
    simplify(doc)
    handle_labels(doc)
    handle_appendix(doc)
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(doc, ["function", "method", "cfunction"])
    cleanup_synopses(doc)
    #
    d = {}
    for gi in p.get_empties():
        d[gi] = gi
    knownempty = d.has_key
    #
    try:
        write_esis(doc, ofp, knownempty)
    except IOError:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other
        # errors should still be reported.
        #
        # Read errno from the exception object rather than unpacking the
        # exception as a 2-tuple, so an IOError with a different number of
        # arguments is re-raised as itself and not as an unpacking error.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise
def main():
    """Command-line entry point: docfixer [input-file [output-file]].

    With no arguments, reads standard input and writes standard output;
    with one, reads the named file; with two, also writes the named file.
    Any other argument count prints a usage message to standard error and
    exits with status 2.  Files opened here are closed before returning.
    """
    argc = len(sys.argv)
    if not 1 <= argc <= 3:
        # No usage() helper is defined in this module, so report inline.
        sys.stderr.write("usage: %s [input-file [output-file]]\n"
                         % (sys.argv and sys.argv[0] or "docfixer"))
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    if argc >= 2:
        ifp = open(sys.argv[1])
    try:
        if argc == 3:
            ofp = open(sys.argv[2], "w")
        try:
            convert(ifp, ofp)
        finally:
            # Only close what was opened here, never the standard streams.
            if ofp is not sys.stdout:
                ofp.close()
    finally:
        if ifp is not sys.stdin:
            ifp.close()


if __name__ == "__main__":
    main()