#! /usr/bin/env python """Generate ESIS events based on a LaTeX source document and configuration data. """ __version__ = '$Revision$' import errno import re import string import StringIO import sys from esistools import encode DEBUG = 0 class Error(Exception): pass class LaTeXFormatError(Error): pass _begin_env_rx = re.compile(r"[\\]begin{([^}]*)}") _end_env_rx = re.compile(r"[\\]end{([^}]*)}") _begin_macro_rx = re.compile("[\\\\]([a-zA-Z]+[*]?)({|\\s*\n?)") _comment_rx = re.compile("%+ ?(.*)\n[ \t]*") _text_rx = re.compile(r"[^]%\\{}]+") _optional_rx = re.compile(r"\s*[[]([^]]*)[]]") # _parameter_rx is this complicated to allow {...} inside a parameter; # this is useful to match tabular layout specifications like {c|p{24pt}} _parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}") _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$") _start_group_rx = re.compile("[ \n]*{") _start_optional_rx = re.compile("[ \n]*[[]") ESCAPED_CHARS = "$%#^ {}&~" def dbgmsg(msg): if DEBUG: sys.stderr.write(msg + "\n") def pushing(name, point, depth): dbgmsg("%s<%s> at %s" % (" "*depth, name, point)) def popping(name, point, depth): dbgmsg("%s at %s" % (" "*depth, name, point)) class Conversion: def __init__(self, ifp, ofp, table=None, discards=(), autoclosing=()): self.ofp_stack = [ofp] self.pop_output() self.table = table self.discards = discards self.autoclosing = autoclosing self.line = string.join(map(string.rstrip, ifp.readlines()), "\n") self.err_write = sys.stderr.write self.preamble = 1 def push_output(self, ofp): self.ofp_stack.append(self.ofp) self.ofp = ofp self.write = ofp.write def pop_output(self): self.ofp = self.ofp_stack.pop() self.write = self.ofp.write def subconvert(self, endchar=None, depth=0): stack = [] line = self.line if DEBUG and endchar: self.err_write( "subconvert(%s)\n line = %s\n" % (`endchar`, `line[:20]`)) while line: if line[0] == endchar and not stack: if DEBUG: self.err_write("subconvert() --> %s\n" % `line[1:21]`) self.line = line return line m = _comment_rx.match(line) if m: text = m.group(1) if text: self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n" % encode(text)) line = line[m.end():] continue m = _begin_env_rx.match(line) if m: # re-write to use the macro handler line = r"\%s %s" % (m.group(1), line[m.end():]) continue m = _end_env_rx.match(line) if m: # end of environment envname = m.group(1) if envname == "document": # special magic for n in stack[1:]: if n not in self.autoclosing: raise LaTeXFormatError( "open element on stack: " + `n`) # should be more careful, but this is easier to code: stack = [] self.write(")document\n") elif stack and envname == stack[-1]: self.write(")%s\n" % envname) del stack[-1] popping(envname, "a", len(stack) + depth) else: self.err_write("stack: %s\n" % `stack`) raise LaTeXFormatError( "environment close for %s doesn't match" % envname) line = line[m.end():] continue m = _begin_macro_rx.match(line) if m: # start of macro macroname = m.group(1) if macroname == "verbatim": # really magic case! pos = string.find(line, "\\end{verbatim}") text = line[m.end(1):pos] self.write("(verbatim\n") self.write("-%s\n" % encode(text)) self.write(")verbatim\n") line = line[pos + len("\\end{verbatim}"):] continue numbered = 1 opened = 0 if macroname[-1] == "*": macroname = macroname[:-1] numbered = 0 if macroname in self.autoclosing and macroname in stack: while stack[-1] != macroname: top = stack.pop() if top and top not in self.discards: self.write(")%s\n-\\n\n" % top) popping(top, "b", len(stack) + depth) if macroname not in self.discards: self.write("-\\n\n)%s\n-\\n\n" % macroname) popping(macroname, "c", len(stack) + depth - 1) del stack[-1] # if macroname in self.discards: self.push_output(StringIO.StringIO()) else: self.push_output(self.ofp) # params, optional, empty, environ = self.start_macro(macroname) if not numbered: self.write("Anumbered TOKEN no\n") # rip off the macroname if params: if optional and len(params) == 1: line = line[m.end():] else: line = line[m.end(1):] elif empty: line = line[m.end(1):] else: line = line[m.end():] # # Very ugly special case to deal with \item[]. The catch # is that this needs to occur outside the for loop that # handles attribute parsing so we can 'continue' the outer # loop. # if optional and type(params[0]) is type(()): # the attribute name isn't used in this special case pushing(macroname, "a", depth + len(stack)) stack.append(macroname) self.write("(%s\n" % macroname) m = _start_optional_rx.match(line) if m: self.line = line[m.end():] line = self.subconvert("]", depth + len(stack)) line = "}" + line continue # handle attribute mappings here: for attrname in params: if optional: optional = 0 if type(attrname) is type(""): m = _optional_rx.match(line) if m: line = line[m.end():] self.write("A%s TOKEN %s\n" % (attrname, encode(m.group(1)))) elif type(attrname) is type(()): # This is a sub-element; but don't place the # element we found on the stack (\section-like) pushing(macroname, "b", len(stack) + depth) stack.append(macroname) self.write("(%s\n" % macroname) macroname = attrname[0] m = _start_group_rx.match(line) if m: line = line[m.end():] elif type(attrname) is type([]): # A normal subelement: ...... attrname = attrname[0] if not opened: opened = 1 self.write("(%s\n" % macroname) pushing(macroname, "c", len(stack) + depth) self.write("(%s\n" % attrname) pushing(attrname, "sub-elem", len(stack) + depth + 1) self.line = skip_white(line)[1:] line = self.subconvert("}", len(stack) + depth + 1)[1:] dbgmsg("subconvert() ==> " + `line[:20]`) popping(attrname, "sub-elem", len(stack) + depth + 1) self.write(")%s\n" % attrname) else: m = _parameter_rx.match(line) if not m: raise LaTeXFormatError( "could not extract parameter %s for %s: %s" % (attrname, macroname, `line[:100]`)) value = m.group(1) if _token_rx.match(value): dtype = "TOKEN" else: dtype = "CDATA" self.write("A%s %s %s\n" % (attrname, dtype, encode(value))) line = line[m.end():] if params and type(params[-1]) is type('') \ and (not empty) and not environ: # attempt to strip off next '{' m = _start_group_rx.match(line) if not m: raise LaTeXFormatError( "non-empty element '%s' has no content: %s" % (macroname, line[:12])) line = line[m.end():] if not opened: self.write("(%s\n" % macroname) pushing(macroname, "d", len(stack) + depth) if empty: line = "}" + line stack.append(macroname) self.pop_output() continue if line[0] == endchar and not stack: if DEBUG: self.err_write("subconvert() --> %s\n" % `line[1:21]`) self.line = line[1:] return self.line if line[0] == "}": # end of macro or group macroname = stack[-1] conversion = self.table.get(macroname) if macroname \ and macroname not in self.discards \ and type(conversion) is not type(""): # otherwise, it was just a bare group self.write(")%s\n" % stack[-1]) popping(macroname, "d", len(stack) + depth - 1) del stack[-1] line = line[1:] continue if line[0] == "{": pushing("", "e", len(stack) + depth) stack.append("") line = line[1:] continue if line[0] == "\\" and line[1] in ESCAPED_CHARS: self.write("-%s\n" % encode(line[1])) line = line[2:] continue if line[:2] == r"\\": self.write("(BREAK\n)BREAK\n") line = line[2:] continue m = _text_rx.match(line) if m: text = encode(m.group()) self.write("-%s\n" % text) line = line[m.end():] continue # special case because of \item[] if line[0] == "]": self.write("-]\n") line = line[1:] continue # avoid infinite loops extra = "" if len(line) > 100: extra = "..." raise LaTeXFormatError("could not identify markup: %s%s" % (`line[:100]`, extra)) while stack and stack[-1] in self.autoclosing: self.write("-\\n\n") self.write(")%s\n" % stack[-1]) popping(stack.pop(), "e", len(stack) + depth - 1) if stack: raise LaTeXFormatError("elements remain on stack: " + string.join(stack, ", ")) # otherwise we just ran out of input here... def convert(self): self.subconvert() def start_macro(self, name): conversion = self.table.get(name, ([], 0, 0, 0, 0)) params, optional, empty, environ, nocontent = conversion if empty: self.write("e\n") elif nocontent: empty = 1 return params, optional, empty, environ def convert(ifp, ofp, table={}, discards=(), autoclosing=()): c = Conversion(ifp, ofp, table, discards, autoclosing) try: c.convert() except IOError, (err, msg): if err != errno.EPIPE: raise def skip_white(line): while line and line[0] in " %\n\t": line = string.lstrip(line[1:]) return line def main(): if len(sys.argv) == 2: ifp = open(sys.argv[1]) ofp = sys.stdout elif len(sys.argv) == 3: ifp = open(sys.argv[1]) ofp = open(sys.argv[2], "w") else: usage() sys.exit(2) convert(ifp, ofp, { # entries have the form: # name: ([attribute names], is1stOptional, isEmpty, isEnv, nocontent) # attribute names can be: # "string" -- normal attribute # ("string",) -- sub-element with content of macro; like for \section # ["string"] -- sub-element "appendix": ([], 0, 1, 0, 0), "bifuncindex": (["name"], 0, 1, 0, 0), "catcode": ([], 0, 1, 0, 0), "cfuncdesc": (["type", "name", ("args",)], 0, 0, 1, 0), "chapter": ([("title",)], 0, 0, 0, 0), "chapter*": ([("title",)], 0, 0, 0, 0), "classdesc": (["name", ("args",)], 0, 0, 1, 0), "ctypedesc": (["name"], 0, 0, 1, 0), "cvardesc": (["type", "name"], 0, 0, 1, 0), "datadesc": (["name"], 0, 0, 1, 0), "declaremodule": (["id", "type", "name"], 1, 1, 0, 0), "deprecated": (["release"], 0, 0, 0, 0), "documentclass": (["classname"], 0, 1, 0, 0), "excdesc": (["name"], 0, 0, 1, 0), "funcdesc": (["name", ("args",)], 0, 0, 1, 0), "funcdescni": (["name", ("args",)], 0, 0, 1, 0), "funcline": (["name"], 0, 0, 0, 0), "funclineni": (["name"], 0, 0, 0, 0), "geq": ([], 0, 1, 0, 0), "hline": ([], 0, 1, 0, 0), "indexii": (["ie1", "ie2"], 0, 1, 0, 0), "indexiii": (["ie1", "ie2", "ie3"], 0, 1, 0, 0), "indexiv": (["ie1", "ie2", "ie3", "ie4"], 0, 1, 0, 0), "indexname": ([], 0, 0, 0, 0), "input": (["source"], 0, 1, 0, 0), "item": ([("leader",)], 1, 0, 0, 0), "label": (["id"], 0, 1, 0, 0), "labelwidth": ([], 0, 1, 0, 0), "large": ([], 0, 1, 0, 0), "LaTeX": ([], 0, 1, 0, 0), "leftmargin": ([], 0, 1, 0, 0), "leq": ([], 0, 1, 0, 0), "lineii": ([["entry"], ["entry"]], 0, 0, 0, 1), "lineiii": ([["entry"], ["entry"], ["entry"]], 0, 0, 0, 1), "lineiv": ([["entry"], ["entry"], ["entry"], ["entry"]], 0, 0, 0, 1), "localmoduletable": ([], 0, 1, 0, 0), "makeindex": ([], 0, 1, 0, 0), "makemodindex": ([], 0, 1, 0, 0), "maketitle": ([], 0, 1, 0, 0), "manpage": (["name", "section"], 0, 1, 0, 0), "memberdesc": (["class", "name"], 1, 0, 1, 0), "methoddesc": (["class", "name", ("args",)], 1, 0, 1, 0), "methoddescni": (["class", "name", ("args",)], 1, 0, 1, 0), "methodline": (["class", "name"], 1, 0, 0, 0), "methodlineni": (["class", "name"], 1, 0, 0, 0), "moduleauthor": (["name", "email"], 0, 1, 0, 0), "opcodedesc": (["name", "var"], 0, 0, 1, 0), "par": ([], 0, 1, 0, 0), "paragraph": ([("title",)], 0, 0, 0, 0), "renewcommand": (["macro"], 0, 0, 0, 0), "rfc": (["num"], 0, 1, 0, 0), "section": ([("title",)], 0, 0, 0, 0), "sectionauthor": (["name", "email"], 0, 1, 0, 0), "seemodule": (["ref", "name"], 1, 0, 0, 0), "stindex": (["type"], 0, 1, 0, 0), "subparagraph": ([("title",)], 0, 0, 0, 0), "subsection": ([("title",)], 0, 0, 0, 0), "subsubsection": ([("title",)], 0, 0, 0, 0), "list": (["bullet", "init"], 0, 0, 1, 0), "tableii": (["colspec", "style", ["entry"], ["entry"]], 0, 0, 1, 0), "tableiii": (["colspec", "style", ["entry"], ["entry"], ["entry"]], 0, 0, 1, 0), "tableiv": (["colspec", "style", ["entry"], ["entry"], ["entry"], ["entry"]], 0, 0, 1, 0), "version": ([], 0, 1, 0, 0), "versionadded": (["version"], 0, 1, 0, 0), "versionchanged": (["version"], 0, 1, 0, 0), "withsubitem": (["text"], 0, 0, 0, 0), # "ABC": ([], 0, 1, 0, 0), "ASCII": ([], 0, 1, 0, 0), "C": ([], 0, 1, 0, 0), "Cpp": ([], 0, 1, 0, 0), "EOF": ([], 0, 1, 0, 0), "e": ([], 0, 1, 0, 0), "ldots": ([], 0, 1, 0, 0), "NULL": ([], 0, 1, 0, 0), "POSIX": ([], 0, 1, 0, 0), "UNIX": ([], 0, 1, 0, 0), # # Things that will actually be going away! # "fi": ([], 0, 1, 0, 0), "ifhtml": ([], 0, 1, 0, 0), "makeindex": ([], 0, 1, 0, 0), "makemodindex": ([], 0, 1, 0, 0), "maketitle": ([], 0, 1, 0, 0), "noindent": ([], 0, 1, 0, 0), "protect": ([], 0, 1, 0, 0), "tableofcontents": ([], 0, 1, 0, 0), }, discards=["fi", "ifhtml", "makeindex", "makemodindex", "maketitle", "noindent", "tableofcontents"], autoclosing=["chapter", "section", "subsection", "subsubsection", "paragraph", "subparagraph", ]) if __name__ == "__main__": main()