From 1ad00717fb90ebfe1bb7727f9a89b1b06ebf9e5f Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 28 May 1998 22:48:53 +0000 Subject: [PATCH] Patch by Lars Marius Garshol: - Handle <? processing instructions >. - Allow . and - in entity names. Also fixed an oversight in the previous fix (in one place, [ \t\r\n] was used instead of string.whitespace). --- Lib/sgmllib.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py index 956341c8fea..681760854f9 100644 --- a/Lib/sgmllib.py +++ b/Lib/sgmllib.py @@ -20,12 +20,14 @@ '/([a-zA-Z][^<>]*)?|' '![^<>]*)?') -entityref = re.compile('&([a-zA-Z][a-zA-Z0-9]*)[^a-zA-Z0-9]') +entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#([0-9]+)[^0-9]') starttagopen = re.compile('<[>a-zA-Z]') shorttagopen = re.compile('<[a-zA-Z][a-zA-Z0-9]*/') shorttag = re.compile('<([a-zA-Z][a-zA-Z0-9]*)/([^/]*)/') +piopen = re.compile('<\?') +piclose = re.compile('>') endtagopen = re.compile('a-zA-Z]') endbracket = re.compile('[<>]') special = re.compile(']*>') @@ -33,7 +35,7 @@ commentclose = re.compile('--[%s]*>' % string.whitespace) tagfind = re.compile('[a-zA-Z][a-zA-Z0-9]*') attrfind = re.compile( - '[ \t\n\r]+([a-zA-Z_][-.a-zA-Z_0-9]*)' + '[%s]+([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace + ('([%s]*=[%s]*' % (string.whitespace, string.whitespace)) + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?') @@ -127,6 +129,15 @@ def goahead(self, end): if k < 0: break i = i+k continue + if piopen.match(rawdata, i): + if self.literal: + self.handle_data(rawdata[i]) + i = i+1 + continue + k = self.parse_pi(i) + if k < 0: break + i = i+k + continue match = special.match(rawdata, i) if match: if self.literal: @@ -184,6 +195,19 @@ def parse_comment(self, i): j = match.end(0) return j-i + # Internal -- parse processing instr, return length or -1 if not terminated + def parse_pi(self, i): + rawdata = self.rawdata + if rawdata[i:i+2] <> '