215 lines
6.9 KiB
Python
215 lines
6.9 KiB
Python
|
|
||
|
"""
|
||
|
PRE-PROCESSORS
=============================================================================

Preprocessors work on source text before we start doing anything too
complicated.
|
||
|
"""
|
||
|
|
||
|
import re
|
||
|
import markdown
|
||
|
|
||
|
# Placeholders that stand in for stashed raw HTML are built from these two
# templates. markdown.STX / markdown.ETX come from the markdown package
# (presumably start/end-of-text marker characters -- confirm there).
HTML_PLACEHOLDER_PREFIX = "".join((markdown.STX, "wzxhzdk:"))
# "%d" is filled in with the integer index of the stashed segment.
HTML_PLACEHOLDER = "".join((HTML_PLACEHOLDER_PREFIX, "%d", markdown.ETX))
|
||
|
|
||
|
class Processor:
    """
    Common base for processors that can be bound to a Markdown instance.
    """

    def __init__(self, markdown_instance=None):
        """
        Optionally attach the owning Markdown instance.

        Keyword arguments:

        * markdown_instance: stored as `self.markdown` when it is truthy;
          when omitted (or falsy) the attribute is not created at all.

        """
        if not markdown_instance:
            return
        self.markdown = markdown_instance
|
||
|
|
||
|
class Preprocessor(Processor):
    """
    Base class for preprocessors, which operate on the document once it
    has been broken into lines.

    A concrete preprocessor supplies a "run" method that receives the list
    of document lines, changes it as needed, and hands back either that
    same list or a brand new one.

    Preprocessors must extend markdown.Preprocessor.

    """

    def run(self, lines):
        """
        Hook that every subclass is expected to override.

        Keyword arguments:

        * lines: the document as a list of strings split by newlines

        Returns : overriding implementations return the (possibly
        modified) list of lines; this base implementation does nothing
        and returns None.

        """
|
||
|
|
||
|
class HtmlStash:
    """
    Holds HTML segments that are pulled out of the document early on and
    represented in the text by place-holders until they are put back.
    """

    def __init__(self):
        """ Create an empty HtmlStash. """
        self.html_counter = 0    # index the next stored segment will get
        self.rawHtmlBlocks = []  # (html, safe) tuples, in storage order

    def store(self, html, safe=False):
        """
        Stash an HTML segment so it can be reinserted later, and hand
        back the placeholder string that must be put into the document
        in its place.

        Keyword arguments:

        * html: an html segment
        * safe: label an html segment as safe for safemode

        Returns : a placeholder string

        """
        index = self.html_counter
        self.rawHtmlBlocks.append((html, safe))
        self.html_counter = index + 1
        return HTML_PLACEHOLDER % index

    def reset(self):
        """ Forget every stored segment and restart numbering at zero. """
        self.rawHtmlBlocks = []
        self.html_counter = 0
|
||
|
|
||
|
|
||
|
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    # Templates, filled in with the opening tag name, that are tried in
    # order when searching a block for its closing tag.
    right_tag_patterns = ["</%s>", "%s>"]

    def _has_left_tag(self, block):
        """
        Tell whether `_get_left_tag` can extract a tag name from `block`.

        False for blocks such as "<", "< " or "<>", where nothing but
        whitespace remains after dropping the first character and
        blanking the first ">".

        """
        return bool(block[1:].replace(">", " ", 1).split())

    def _get_left_tag(self, block):
        """
        Return the lower-cased opening tag name of `block`.

        The first character is skipped, the first ">" is treated as
        whitespace, and the first whitespace-delimited token is returned.
        Raises IndexError when there is no such token; `run` checks
        `_has_left_tag` first.

        """
        return block[1:].replace(">", " ", 1).split()[0].lower()

    def _get_right_tag(self, left_tag, block):
        """
        Look for the closing tag that matches `left_tag` inside `block`.

        Each entry of `right_tag_patterns` is filled with `left_tag` and
        searched for from the right.  A hit at an index greater than 2
        yields the tag text (leading "<" and trailing ">" characters
        stripped) and the index just past that tag.

        When nothing is found, the result is the lower-cased tail of the
        right-stripped block (len(left_tag) + 1 characters, excluding the
        very last character) together with len(block).

        Returns : a (right_tag, data_index) tuple

        """
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = block.rfind(tag)
            if i > 2:
                # len(p) - 2 + len(left_tag) is the length of the filled-in
                # pattern, so the index points just past the closing tag.
                return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
        return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """
        Decide whether `right_tag` closes `left_tag`.

        "div" and tags starting with "?", "@" or "%" always count as
        closed; otherwise the right tag must be "/" + left_tag, both must
        be "--" (comments), or the right tag minus its first character
        must equal the left tag while that first character is not "<".

        """
        if left_tag == 'div' or left_tag[0] in ['?', '@', '%']:  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        if right_tag == "--" and left_tag == "--":
            return True
        return left_tag == right_tag[1:] and right_tag[0] != "<"

    def _is_oneliner(self, tag):
        """ Return True for tags that never have a closing tag ("hr"). """
        return tag in ('hr', 'hr/')

    def run(self, lines):
        """
        Replace raw HTML blocks in the document with stash placeholders.

        The lines are joined and re-split on blank lines.  Each resulting
        block that starts with "<" and has an extractable tag name is
        examined: complete block-level elements, comments and
        "?"/"@"/"%" constructs are handed to `self.markdown.htmlStash`
        and replaced by the placeholder it returns; incomplete ones are
        accumulated across blocks until their closing tag shows up.  All
        other blocks pass through unchanged.

        Keyword arguments:

        * lines: the document as a list of strings split by newlines

        Returns : the processed document as a list of lines

        """
        pending = "\n".join(lines).split("\n\n")
        new_blocks = []
        items = []       # pieces of an HTML block still awaiting its close
        left_tag = ''
        right_tag = ''
        in_tag = False   # True while collecting an unfinished HTML block

        while pending:
            block = pending.pop(0)
            # Drop up to two leading newlines left over from the split.
            if block.startswith("\n"):
                block = block[1:]
            if block.startswith("\n"):
                block = block[1:]

            if in_tag:
                items.append(block.strip())
                right_tag, data_index = self._get_right_tag(left_tag, block)
                if self._equal_tags(left_tag, right_tag):
                    # found the closing tag: stash everything collected
                    in_tag = False
                    new_blocks.append(
                        self.markdown.htmlStash.store('\n\n'.join(items)))
                    items = []
                continue

            # Blocks that do not open with "<", or that have nothing
            # usable after it (e.g. a lone "<"), are ordinary text.  The
            # second check keeps _get_left_tag and block[1] below from
            # raising IndexError on such input.
            if not (block.startswith("<") and self._has_left_tag(block)):
                new_blocks.append(block)
                continue

            left_tag = self._get_left_tag(block)
            right_tag, data_index = self._get_right_tag(left_tag, block)

            if data_index < len(block):
                # Text after the closing tag is re-queued as its own block.
                pending.insert(0, block[data_index:])
                block = block[:data_index]

            if not (markdown.isBlockLevel(left_tag)
                    or block[1] in ["!", "?", "@", "%"]):
                new_blocks.append(block)
                continue

            if self._is_oneliner(left_tag):
                new_blocks.append(block.strip())
                continue

            if block[1] == "!":
                # is a comment block
                left_tag = "--"
                right_tag, data_index = self._get_right_tag(left_tag, block)
                # keep checking conditions below and maybe just append

            complete = block.rstrip().endswith(">")
            if complete and self._equal_tags(left_tag, right_tag):
                new_blocks.append(
                    self.markdown.htmlStash.store(block.strip()))
            elif markdown.isBlockLevel(left_tag) \
                    or (left_tag == "--" and not complete):
                # Parentheses spell out the precedence the expression has
                # always had: any block-level tag that reaches this point
                # starts a multi-block collection, as does an unterminated
                # comment.
                items.append(block.strip())
                in_tag = True
            else:
                new_blocks.append(
                    self.markdown.htmlStash.store(block.strip()))

        if items:
            # Input ended inside an unfinished HTML block: stash what we have.
            new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items)))
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
|
||
|
|
||
|
|
||
|
class ReferencePreprocessor(Preprocessor):
    """ Strip reference definitions out of the text, keeping them for later. """

    # Up to three leading spaces, "[id]:", optional whitespace, a run of
    # non-space characters (the link target), then the rest of the line
    # (a potential title).
    RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)

    def run(self, lines):
        """
        Collect reference definitions into `self.markdown.references`.

        A matching line is removed from the output when it has no title,
        or a title wrapped in double quotes, single quotes or
        parentheses; the reference is stored under its lower-cased id as
        a (target, title) tuple.  A matching line whose trailing text is
        not wrapped that way is kept in the text and nothing is stored.

        Keyword arguments:

        * lines: the document as a list of strings split by newlines

        Returns : the list of lines that are not reference definitions

        """
        wrappers = (('"', '"'), ("'", "'"), ("(", ")"))
        kept = []
        for line in lines:
            match = self.RE.match(line)
            if match is None:
                kept.append(line)
                continue

            ref_id = match.group(2).strip().lower()
            title = match.group(4).strip()  # potential title

            if title:
                wrapped = (len(title) >= 2
                           and (title[0], title[-1]) in wrappers)
                if not wrapped:
                    kept.append(line)
                    continue
                title = title[1:-1]

            self.markdown.references[ref_id] = (match.group(3), title)

        return kept
|