From 669573726bb31d8862e98b3843549e5fe0b54d37 Mon Sep 17 00:00:00 2001 From: Fred Drake Date: Fri, 16 Mar 2001 20:04:57 +0000 Subject: [PATCH] Change RuntimeError to SGMLParseError, which subclasses RuntimeError for backward compatibility. Add support for SGML declaration syntax (<!...>) to some reasonable degree. This does not support everything allowed in SGML, but should work with "real" HTML (internal subset in a DOCTYPE is not handled). The content of the declaration is passed to the .handle_decl() method, which can be overridden by subclasses. --- Lib/sgmllib.py | 58 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 19580e25999..2ebe6981f5d 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -39,6 +39,14 @@ r'\s*([a-zA-Z_][-.a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?') +declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*') +declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*') + + +class SGMLParseError(RuntimeError): + """Exception raised for all parse errors.""" + pass + # SGML parser base class -- find tags and call handler functions. # Usage: p = SGMLParser(); p.feed(data); ...; p.close(). @@ -144,7 +152,12 @@ def goahead(self, end): self.handle_data(rawdata[i]) i = i+1 continue - i = match.end(0) + # This is some sort of declaration; in "HTML as + # deployed," this should only be the document type + # declaration ("<!DOCTYPE html...>"). + k = self.parse_declaration(i) + if k < 0: break + i = k continue elif rawdata[i] == '&': match = charref.match(rawdata, i) @@ -162,7 +175,7 @@ def goahead(self, end): if rawdata[i-1] != ';': i = i-1 continue else: - raise RuntimeError, 'neither < nor & ??' 
+ raise SGMLParseError('neither < nor & ??') # We get here only if incomplete matches but # nothing else match = incomplete.match(rawdata, i) @@ -186,7 +199,7 @@ def goahead(self, end): def parse_comment(self, i): rawdata = self.rawdata if rawdata[i:i+4] != '