diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py new file mode 100644 index 00000000000..363a6723a8b --- /dev/null +++ b/Lib/HTMLParser.py @@ -0,0 +1,432 @@ +"""A parser for HTML.""" + +# This file is based on sgmllib.py, but the API is slightly different. + +# XXX There should be a way to distinguish between PCDATA (parsed +# character data -- the normal case), RCDATA (replaceable character +# data -- only char and entity references and end tags are special) +# and CDATA (character data -- only end tags are special). + + +import re +import string + +# Regular expressions used for parsing + +interesting_normal = re.compile('[&<]') +interesting_cdata = re.compile(r'<(/|\Z)') +incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?') + +entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') +charref = re.compile('&#([0-9]+)[^0-9]') + +starttagopen = re.compile('<[a-zA-Z]') +piopen = re.compile(r'<\?') +piclose = re.compile('>') +endtagopen = re.compile(']*>') +commentopen = re.compile('