tornado/website/markdown/html4.py

275 lines
9.4 KiB
Python
Raw Normal View History

2009-09-10 07:50:51 +00:00
# markdown/html4.py
#
# Add html4 serialization to older versions of ElementTree
# Taken from ElementTree 1.3 preview with slight modifications
#
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2007 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------
import markdown
# Local aliases for the ElementTree names used in this module, taken
# from the etree implementation exposed by the markdown package.
ElementTree = markdown.etree.ElementTree
QName = markdown.etree.QName
Comment = markdown.etree.Comment
PI = markdown.etree.PI
ProcessingInstruction = markdown.etree.ProcessingInstruction
# Tags that _serialize_html() writes without a closing tag.
# NOTE: every name needs its own comma; adjacent string literals are
# silently concatenated ("meta" "param" would become "metaparam").
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

try:
    # prefer a set for membership tests
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    # no `set` builtin available; keep the tuple
    pass
# Default prefixes for known namespace URIs.  _namespaces() consults
# this table when a URI has not been given a prefix yet.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
def _raise_serialization_error(text):
raise TypeError(
"cannot serialize %r (type %s)" % (text, type(text).__name__)
)
def _encode(text, encoding):
try:
return text.encode(encoding, "xmlcharrefreplace")
except (TypeError, AttributeError):
_raise_serialization_error(text)
def _escape_cdata(text, encoding):
# escape character data
try:
# it's worth avoiding do-nothing calls for strings that are
# shorter than 500 character, or so. assume that's, by far,
# the most common case in most applications.
if "&" in text:
text = text.replace("&", "&")
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
text = text.replace(">", "&gt;")
return text.encode(encoding, "xmlcharrefreplace")
except (TypeError, AttributeError):
_raise_serialization_error(text)
def _escape_attrib(text, encoding):
# escape attribute value
try:
if "&" in text:
text = text.replace("&", "&amp;")
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
text = text.replace(">", "&gt;")
if "\"" in text:
text = text.replace("\"", "&quot;")
if "\n" in text:
text = text.replace("\n", "&#10;")
return text.encode(encoding, "xmlcharrefreplace")
except (TypeError, AttributeError):
_raise_serialization_error(text)
def _escape_attrib_html(text, encoding):
# escape attribute value
try:
if "&" in text:
text = text.replace("&", "&amp;")
if ">" in text:
text = text.replace(">", "&gt;")
if "\"" in text:
text = text.replace("\"", "&quot;")
return text.encode(encoding, "xmlcharrefreplace")
except (TypeError, AttributeError):
_raise_serialization_error(text)
def _serialize_html(write, elem, encoding, qnames, namespaces):
    """Write the HTML serialization of *elem* and its subtree.

    write -- callable that receives each output fragment.
    elem -- element to serialize; its tag, text, tail, attributes
        (via items()) and children are read.
    encoding -- passed to the escape helpers and used to encode
        namespace prefixes.
    qnames -- maps tags and attribute names to their serialized form.
        A None value means the element has no tag of its own, so only
        its text and children are written.
    namespaces -- maps namespace URIs to prefixes that are written as
        xmlns attributes on this element, or a false value for none.
        Recursive calls for children always pass None.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        # switch from the raw tag to its serialized name
        tag = qnames[tag]
        if tag is None:
            # tagless element: emit the content only
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                items.sort() # lexical order
                for k, v in items:
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        # QName values are written as their serialized name
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
                if namespaces:
                    items = namespaces.items()
                    items.sort(key=lambda x: x[1]) # sort on prefix
                    # each pair is (uri, prefix), unpacked here as (v, k)
                    for v, k in items:
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                        ))
            write(">")
            # NOTE(review): from here on the tag is lower-cased, so the
            # closing tag is written in lower case while the opening
            # tag above kept its original case.
            tag = tag.lower()
            if text:
                if tag == "script" or tag == "style":
                    # script/style content is encoded but not escaped
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
            # tags listed in HTML_EMPTY get no closing tag
            if tag not in HTML_EMPTY:
                write("</" + tag + ">")
    # the tail is written for every kind of node, comments and PIs included
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
def write_html(root, f,
               # keyword arguments
               encoding="us-ascii",
               default_namespace=None):
    """Serialize the tree rooted at *root* as HTML to *f*.

    root -- root element of the tree; must not be None.
    f -- an object with a write() method, or otherwise a value that is
        passed to open() and opened in "wb" mode.
    encoding -- output encoding; a false value means "us-ascii".
    default_namespace -- passed through to _namespaces().

    A file opened here is closed before returning, even when
    serialization fails.  A file object supplied by the caller is left
    open.
    """
    assert root is not None
    opened_here = not hasattr(f, "write")
    if opened_here:
        f = open(f, "wb")
    try:
        write = f.write
        if not encoding:
            encoding = "us-ascii"
        qnames, namespaces = _namespaces(
            root, encoding, default_namespace
        )
        _serialize_html(
            write, root, encoding, qnames, namespaces
        )
    finally:
        # only close what this function opened
        if opened_here:
            f.close()
# --------------------------------------------------------------------
# serialization support
def _namespaces(elem, encoding, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    elem -- root of the tree to scan; every element's tag, attribute
        names, QName attribute values and QName text are visited.
    encoding -- used to encode each serialized name.
    default_namespace -- optional URI that is mapped to the empty
        prefix.

    Returns a (qnames, namespaces) tuple: qnames maps names to their
    encoded "prefix:local" (or plain local) form, and namespaces maps
    URIs to prefixes.

    Raises ValueError when a non-qualified name is used together with
    default_namespace, and TypeError (via _raise_serialization_error)
    for tags or names that cannot be serialized.
    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def encode(text):
        return text.encode(encoding)

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    # the "xml" prefix is never recorded in namespaces
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = encode("%s:%s" % (prefix, tag))
                else:
                    qnames[qname] = encode(tag) # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = encode(qname)
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator # cET compatibility

    for elem in iterate():
        tag = elem.tag
        if isinstance(tag, QName):
            # The membership test is nested on purpose: a QName tag
            # that is already registered must not fall through to the
            # error branch below.
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
def to_html_string(element, encoding=None):
    """Return the HTML serialization of *element* as a single string.

    The element is wrapped in an ElementTree and its root is handed to
    write_html() together with *encoding*; the fragments written are
    collected in memory and joined.
    """
    class _Sink:
        # bare holder for the write attribute that write_html() looks for
        pass

    fragments = []
    sink = _Sink()
    sink.write = fragments.append
    root = ElementTree(element).getroot()
    write_html(root, sink, encoding)
    return "".join(fragments)