275 lines
9.4 KiB
Python
275 lines
9.4 KiB
Python
|
# markdown/html4.py
|
||
|
#
|
||
|
# Add html4 serialization to older versions of ElementTree
|
||
|
# Taken from ElementTree 1.3 preview with slight modifications
|
||
|
#
|
||
|
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
|
||
|
#
|
||
|
# fredrik@pythonware.com
|
||
|
# http://www.pythonware.com
|
||
|
#
|
||
|
# --------------------------------------------------------------------
|
||
|
# The ElementTree toolkit is
|
||
|
#
|
||
|
# Copyright (c) 1999-2007 by Fredrik Lundh
|
||
|
#
|
||
|
# By obtaining, using, and/or copying this software and/or its
|
||
|
# associated documentation, you agree that you have read, understood,
|
||
|
# and will comply with the following terms and conditions:
|
||
|
#
|
||
|
# Permission to use, copy, modify, and distribute this software and
|
||
|
# its associated documentation for any purpose and without fee is
|
||
|
# hereby granted, provided that the above copyright notice appears in
|
||
|
# all copies, and that both that copyright notice and this permission
|
||
|
# notice appear in supporting documentation, and that the name of
|
||
|
# Secret Labs AB or the author not be used in advertising or publicity
|
||
|
# pertaining to distribution of the software without specific, written
|
||
|
# prior permission.
|
||
|
#
|
||
|
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
|
||
|
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
|
||
|
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
|
||
|
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
|
||
|
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||
|
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||
|
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
|
||
|
# OF THIS SOFTWARE.
|
||
|
# --------------------------------------------------------------------
|
||
|
|
||
|
|
||
|
import markdown
|
||
|
# Local aliases for the ElementTree primitives exposed by markdown.etree,
# so the serializer below can refer to them by their short names.
_etree = markdown.etree
ElementTree = _etree.ElementTree
QName = _etree.QName
Comment = _etree.Comment
PI = _etree.PI
ProcessingInstruction = _etree.ProcessingInstruction
|
||
|
|
||
|
# Element names for which the serializer writes no end tag.
# Each name must be followed by a comma: adjacent string literals are
# silently concatenated ("meta" "param" would become "metaparam").
HTML_EMPTY = (
    "area", "base", "basefont", "br", "col", "frame", "hr",
    "img", "input", "isindex", "link", "meta", "param",
)

# Use a set for membership tests when the ``set`` builtin is available;
# if looking it up raises NameError the tuple is kept as-is.
try:
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    pass
|
||
|
|
||
|
_namespace_map = {
|
||
|
# "well-known" namespace prefixes
|
||
|
"http://www.w3.org/XML/1998/namespace": "xml",
|
||
|
"http://www.w3.org/1999/xhtml": "html",
|
||
|
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
|
||
|
"http://schemas.xmlsoap.org/wsdl/": "wsdl",
|
||
|
# xml schema
|
||
|
"http://www.w3.org/2001/XMLSchema": "xs",
|
||
|
"http://www.w3.org/2001/XMLSchema-instance": "xsi",
|
||
|
# dublic core
|
||
|
"http://purl.org/dc/elements/1.1/": "dc",
|
||
|
}
|
||
|
|
||
|
|
||
|
def _raise_serialization_error(text):
|
||
|
raise TypeError(
|
||
|
"cannot serialize %r (type %s)" % (text, type(text).__name__)
|
||
|
)
|
||
|
|
||
|
def _encode(text, encoding):
|
||
|
try:
|
||
|
return text.encode(encoding, "xmlcharrefreplace")
|
||
|
except (TypeError, AttributeError):
|
||
|
_raise_serialization_error(text)
|
||
|
|
||
|
def _escape_cdata(text, encoding):
|
||
|
# escape character data
|
||
|
try:
|
||
|
# it's worth avoiding do-nothing calls for strings that are
|
||
|
# shorter than 500 character, or so. assume that's, by far,
|
||
|
# the most common case in most applications.
|
||
|
if "&" in text:
|
||
|
text = text.replace("&", "&")
|
||
|
if "<" in text:
|
||
|
text = text.replace("<", "<")
|
||
|
if ">" in text:
|
||
|
text = text.replace(">", ">")
|
||
|
return text.encode(encoding, "xmlcharrefreplace")
|
||
|
except (TypeError, AttributeError):
|
||
|
_raise_serialization_error(text)
|
||
|
|
||
|
|
||
|
def _escape_attrib(text, encoding):
|
||
|
# escape attribute value
|
||
|
try:
|
||
|
if "&" in text:
|
||
|
text = text.replace("&", "&")
|
||
|
if "<" in text:
|
||
|
text = text.replace("<", "<")
|
||
|
if ">" in text:
|
||
|
text = text.replace(">", ">")
|
||
|
if "\"" in text:
|
||
|
text = text.replace("\"", """)
|
||
|
if "\n" in text:
|
||
|
text = text.replace("\n", " ")
|
||
|
return text.encode(encoding, "xmlcharrefreplace")
|
||
|
except (TypeError, AttributeError):
|
||
|
_raise_serialization_error(text)
|
||
|
|
||
|
def _escape_attrib_html(text, encoding):
|
||
|
# escape attribute value
|
||
|
try:
|
||
|
if "&" in text:
|
||
|
text = text.replace("&", "&")
|
||
|
if ">" in text:
|
||
|
text = text.replace(">", ">")
|
||
|
if "\"" in text:
|
||
|
text = text.replace("\"", """)
|
||
|
return text.encode(encoding, "xmlcharrefreplace")
|
||
|
except (TypeError, AttributeError):
|
||
|
_raise_serialization_error(text)
|
||
|
|
||
|
|
||
|
def _serialize_html(write, elem, encoding, qnames, namespaces):
    """Recursively write *elem* and its subtree as HTML through *write*.

    write      -- callable that receives each serialized fragment
    elem       -- element to serialize
    encoding   -- encoding applied to text and attribute values
    qnames     -- mapping from tag/attribute names to their serialized
                  form; a tag mapped to None gets no markup of its own,
                  only its text and children are written
    namespaces -- mapping of namespace URI to prefix, written as xmlns
                  declarations on this element; the recursive calls for
                  child elements pass None
    """
    raw_tag = elem.tag
    text = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        name = qnames[raw_tag]
        if name is not None:
            write("<" + name)
            attributes = elem.items()
            if attributes or namespaces:
                # attributes are written in lexical order
                for key, value in sorted(attributes):
                    if isinstance(key, QName):
                        key = key.text
                    if isinstance(value, QName):
                        value = qnames[value.text]
                    else:
                        value = _escape_attrib_html(value, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[key], value))
                if namespaces:
                    # namespace declarations are ordered by prefix
                    declarations = sorted(
                        namespaces.items(), key=lambda pair: pair[1]
                    )
                    for uri, prefix in declarations:
                        if prefix:
                            prefix = ":" + prefix
                        write(" xmlns%s=\"%s\"" % (
                            prefix.encode(encoding),
                            _escape_attrib(uri, encoding),
                        ))
            write(">")
            lowered = name.lower()
            if text:
                if lowered in ("script", "style"):
                    # script/style content is encoded but not escaped
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for child in elem:
                _serialize_html(write, child, encoding, qnames, None)
            # elements listed in HTML_EMPTY get no end tag
            if lowered not in HTML_EMPTY:
                write("</" + lowered + ">")
        else:
            # no tag to emit: write the content only
            if text:
                write(_escape_cdata(text, encoding))
            for child in elem:
                _serialize_html(write, child, encoding, qnames, None)
    # the tail is written for every kind of node
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
|
||
|
|
||
|
def write_html(root, f,
               # keyword arguments
               encoding="us-ascii",
               default_namespace=None):
    """Serialize the tree rooted at *root* as HTML into *f*.

    root              -- root element; must not be None
    f                 -- an object with a ``write`` method, or otherwise
                         a value passed to ``open(f, "wb")``
    encoding          -- output encoding; a false value means "us-ascii"
    default_namespace -- namespace URI passed on to ``_namespaces``

    A file opened here from a path is closed before returning, also
    when serialization fails.  A caller-supplied file object is left
    open.
    """
    assert root is not None
    opened_here = not hasattr(f, "write")
    if opened_here:
        f = open(f, "wb")
    try:
        if not encoding:
            encoding = "us-ascii"
        qnames, namespaces = _namespaces(
            root, encoding, default_namespace
        )
        _serialize_html(
            f.write, root, encoding, qnames, namespaces
        )
    finally:
        # only close what this function opened
        if opened_here:
            f.close()
|
||
|
|
||
|
# --------------------------------------------------------------------
|
||
|
# serialization support
|
||
|
|
||
|
def _namespaces(elem, encoding, default_namespace=None):
    """Identify the qualified names and namespaces used in a tree.

    elem              -- root element of the tree to scan
    encoding          -- encoding applied to the serialized names
    default_namespace -- optional namespace URI mapped to the empty
                         prefix

    Returns ``(qnames, namespaces)``: *qnames* maps each tag, attribute
    name and QName value to its encoded ``prefix:local`` form (with
    ``None`` mapped to ``None``), and *namespaces* maps namespace URIs
    to prefixes.

    Raises ValueError when a non-qualified name is found while
    *default_namespace* is set, and TypeError (via
    ``_raise_serialization_error``) for a tag that cannot be serialized.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def encode(text):
        return text.encode(encoding)

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].split("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    # the "xml" prefix is never recorded for declaration
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = encode("%s:%s" % (prefix, tag))
                else:
                    qnames[qname] = encode(tag)  # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                    )
                qnames[qname] = encode(qname)
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator  # cET compatibility
    for node in iterate():
        tag = node.tag
        if isinstance(tag, QName):
            # The membership test is nested so that a QName tag already
            # recorded is skipped instead of falling through to the
            # "cannot serialize" branch below.
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
|
||
|
|
||
|
def to_html_string(element, encoding=None):
    """Serialize *element* as HTML and return the result as one string.

    The fragments produced by ``write_html`` are collected in a list
    through a minimal object exposing ``write`` and joined at the end.
    *encoding* is passed through to ``write_html``.
    """
    class _Sink:
        pass

    chunks = []
    sink = _Sink()
    # write_html only needs an object with a ``write`` attribute
    sink.write = chunks.append
    root = ElementTree(element).getroot()
    write_html(root, sink, encoding)
    return "".join(chunks)
|