From 8846d7178b8caf1411ca6f458b78b9f46ba73abe Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Fri, 18 May 2001 14:50:52 +0000 Subject: [PATCH] A much improved HTML parser -- a replacement for sgmllib. The API is derived from but not quite compatible with that of sgmllib, so it's a new file. I suppose it needs documentation, and htmllib needs to be changed to use this instead of sgmllib, and sgmllib needs to be declared obsolete. But that can all be done later. This code was first published as part of TAL (part of Zope Page Templates), but that was strongly based on sgmllib anyway. Authors are Fred Drake and Guido van Rossum. --- Lib/HTMLParser.py | 432 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 432 insertions(+) create mode 100644 Lib/HTMLParser.py diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py new file mode 100644 index 00000000000..363a6723a8b --- /dev/null +++ b/Lib/HTMLParser.py @@ -0,0 +1,432 @@ +"""A parser for HTML.""" + +# This file is based on sgmllib.py, but the API is slightly different. + +# XXX There should be a way to distinguish between PCDATA (parsed +# character data -- the normal case), RCDATA (replaceable character +# data -- only char and entity references and end tags are special) +# and CDATA (character data -- only end tags are special). + + +import re +import string + +# Regular expressions used for parsing + +interesting_normal = re.compile('[&<]') +interesting_cdata = re.compile(r'<(/|\Z)') +incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?') + +entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') +charref = re.compile('&#([0-9]+)[^0-9]') + +starttagopen = re.compile('<[a-zA-Z]') +piopen = re.compile(r'<\?') +piclose = re.compile('>') +endtagopen = re.compile(']*>') +commentopen = re.compile('