From 0c4fdbaee8c555527faa656777011570bce5ad5f Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Thu, 31 Aug 2000 22:57:55 +0000 Subject: [PATCH] closes bug #112468 (and all the other bugs that surfaced when I fixed a bug in the regression test harness...) --- Lib/sre_parse.py | 75 +++++++++++++++++++++++++++----------------- Lib/test/re_tests.py | 53 ++++++++++++++++++++++--------- 2 files changed, 85 insertions(+), 43 deletions(-) diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 16e49b620e4..a50191ec9b0 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -14,10 +14,6 @@ MAXREPEAT = 65535 -# FIXME: the following might change in 2.0 final. but for now, this -# seems to be the best way to be compatible with 1.5.2 -CHARMASK = 0xff - SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" @@ -181,9 +177,10 @@ def __next(self): char = char + c self.index = self.index + len(char) self.next = char - def match(self, char): + def match(self, char, skip=1): if char == self.next: - self.__next() + if skip: + self.__next() return 1 return 0 def get(self): @@ -230,16 +227,19 @@ def _class_escape(source, escape): return code try: if escape[1:2] == "x": - # FIXME: in 2.0, \xNN must have exactly two digits - while source.next in HEXDIGITS: + # hexadecimal escape (exactly two digits) + while source.next in HEXDIGITS and len(escape) < 4: escape = escape + source.get() escape = escape[2:] - return LITERAL, int(escape[-4:], 16) & CHARMASK + if len(escape) != 2: + raise error, "bogus escape: %s" % repr("\\" + escape) + return LITERAL, int(escape, 16) & 0xff elif str(escape[1:2]) in OCTDIGITS: - while source.next in OCTDIGITS: + # octal escape (up to three digits) + while source.next in OCTDIGITS and len(escape) < 5: escape = escape + source.get() escape = escape[1:] - return LITERAL, int(escape[-6:], 8) & CHARMASK + return LITERAL, int(escape, 8) & 0xff if len(escape) == 2: return LITERAL, ord(escape[1]) except ValueError: @@ -256,24 +256,32 @@ def _escape(source, escape, 
state): return code try: if escape[1:2] == "x": - while source.next in HEXDIGITS: + # hexadecimal escape + while source.next in HEXDIGITS and len(escape) < 4: escape = escape + source.get() escape = escape[2:] - return LITERAL, int(escape[-4:], 16) & CHARMASK + if len(escape) != 2: + raise error, "bogus escape: %s" % repr("\\" + escape) + return LITERAL, int(escape, 16) & 0xff + elif escape[1:2] == "0": + # octal escape + while source.next in OCTDIGITS and len(escape) < 5: + escape = escape + source.get() + return LITERAL, int(escape[1:], 8) & 0xff elif escape[1:2] in DIGITS: - while 1: - group = _group(escape, state.groups) - if group: - if (not source.next or - not _group(escape + source.next, state.groups)): - return GROUPREF, group + # octal escape *or* decimal group reference (sigh) + here = source.tell() + if source.next in DIGITS: + escape = escape + source.get() + if escape[2] in OCTDIGITS and source.next in OCTDIGITS: + # got three octal digits; this is an octal escape escape = escape + source.get() - elif source.next in OCTDIGITS: - escape = escape + source.get() - else: - break - escape = escape[1:] - return LITERAL, int(escape[-6:], 8) & CHARMASK + return LITERAL, int(escape[1:], 8) & 0xff + # got at least one decimal digit; this is a group reference + group = _group(escape, state.groups) + if group: + return GROUPREF, group + raise error, "bogus escape: %s" % repr(escape) if len(escape) == 2: return LITERAL, ord(escape[1]) except ValueError: @@ -290,7 +298,7 @@ def _parse_sub(source, state, nested=1): continue if not nested: break - if not source.next or source.match(")"): + if not source.next or source.match(")", 0): break else: raise error, "pattern not properly closed" @@ -395,7 +403,11 @@ def _parse(source, state): code2 = LITERAL, ord(this) if code1[0] != LITERAL or code2[0] != LITERAL: raise error, "illegal range" - set.append((RANGE, (code1[1], code2[1]))) + lo = code1[1] + hi = code2[1] + if hi < lo: + raise error, "illegal range" + 
set.append((RANGE, (lo, hi))) else: if code1[0] is IN: code1 = code1[1][0] @@ -505,6 +517,9 @@ def _parse(source, state): if source.next is None or source.next == ")": break source.get() + if not source.match(")"): + raise error, "unbalanced parenthesis" + continue elif source.next in ("=", "!", "<"): # lookahead assertions char = source.get() @@ -515,6 +530,8 @@ def _parse(source, state): dir = -1 # lookbehind char = source.get() p = _parse_sub(source, state) + if not source.match(")"): + raise error, "unbalanced parenthesis" if char == "=": subpattern.append((ASSERT, (dir, p))) else: @@ -532,6 +549,8 @@ def _parse(source, state): else: group = state.getgroup(name) p = _parse_sub(source, state) + if not source.match(")"): + raise error, "unbalanced parenthesis" subpattern.append((SUBPATTERN, (group, p))) else: while 1: @@ -625,7 +644,7 @@ def parse_template(source, pattern): break if not code: this = this[1:] - code = LITERAL, int(this[-6:], 8) & CHARMASK + code = LITERAL, int(this[-6:], 8) & 0xff a(code) else: try: diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py index 14a2cee36c1..7eac04cb8f4 100755 --- a/Lib/test/re_tests.py +++ b/Lib/test/re_tests.py @@ -16,16 +16,23 @@ # matching performs on large strings. 
benchmarks = [ + + # test common prefix + ('Python|Perl', 'Perl'), # Alternation + ('(Python|Perl)', 'Perl'), # Grouped alternation + + ('Python|Perl|Tcl', 'Perl'), # Alternation + ('(Python|Perl|Tcl)', 'Perl'), # Grouped alternation + + ('(Python)\\1', 'PythonPython'), # Backreference + ('([0a-z][a-z0-9]*,)+', 'a5,b7,c9,'), # Disable the fastmap optimization + ('([a-z][a-z0-9]*,)+', 'a5,b7,c9,'), # A few sets + ('Python', 'Python'), # Simple text literal ('.*Python', 'Python'), # Bad text literal ('.*Python.*', 'Python'), # Worse text literal ('.*(Python)', 'Python'), # Bad text literal with grouping - ('(Python|Perl|Tcl', 'Perl'), # Alternation - ('(Python|Perl|Tcl)', 'Perl'), # Grouped alternation - ('(Python)\\1', 'PythonPython'), # Backreference - ('([0a-z][a-z]*,)+', 'a5,b7,c9,'), # Disable the fastmap optimization - ('([a-z][a-z0-9]*,)+', 'a5,b7,c9,') # A few sets ] # Test suite (for verifying correctness) @@ -79,12 +86,17 @@ # Test various letter escapes (r'\a[\b]\f\n\r\t\v', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), (r'[\a][\b][\f][\n][\r][\t][\v]', '\a\b\f\n\r\t\v', SUCCEED, 'found', '\a\b\f\n\r\t\v'), - (r'\u', '', SYNTAX_ERROR), # A Perl escape + # NOTE: not an error under PCRE/PRE: + # (r'\u', '', SYNTAX_ERROR), # A Perl escape (r'\c\e\g\h\i\j\k\m\o\p\q\y\z', 'ceghijkmopqyz', SUCCEED, 'found', 'ceghijkmopqyz'), (r'\xff', '\377', SUCCEED, 'found', chr(255)), - (r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)), - (r'\x00f', '\017', SUCCEED, 'found', chr(15)), - (r'\x00fe', '\376', SUCCEED, 'found', chr(254)), + # new \x semantics + (r'\x00ffffffffffffff', '\377', FAIL, 'found', chr(255)), + (r'\x00f', '\017', FAIL, 'found', chr(15)), + (r'\x00fe', '\376', FAIL, 'found', chr(254)), + # (r'\x00ffffffffffffff', '\377', SUCCEED, 'found', chr(255)), + # (r'\x00f', '\017', SUCCEED, 'found', chr(15)), + # (r'\x00fe', '\376', SUCCEED, 'found', chr(254)), (r"^\w+=(\\[\000-\277]|[^\n\\])*", "SRC=eval.c g.c blah blah blah \\\\\n\tapes.c", 
SUCCEED, 'found', "SRC=eval.c g.c blah blah blah \\\\"), @@ -138,7 +150,8 @@ ('a[b-d]', 'aac', SUCCEED, 'found', 'ac'), ('a[-b]', 'a-', SUCCEED, 'found', 'a-'), ('a[\\-b]', 'a-', SUCCEED, 'found', 'a-'), - ('a[b-]', 'a-', SYNTAX_ERROR), + # NOTE: not an error under PCRE/PRE: + # ('a[b-]', 'a-', SYNTAX_ERROR), ('a[]b', '-', SYNTAX_ERROR), ('a[', '-', SYNTAX_ERROR), ('a\\', '-', SYNTAX_ERROR), @@ -543,7 +556,9 @@ # Check odd placement of embedded pattern modifiers - ('w(?i)', 'W', SYNTAX_ERROR), + # not an error under PCRE/PRE: + ('w(?i)', 'W', SUCCEED, 'found', 'W'), + # ('w(?i)', 'W', SYNTAX_ERROR), # Comments using the x embedded pattern modifier @@ -577,20 +592,28 @@ ('\\D+', '1234abc5678', SUCCEED, 'found', 'abc'), ('[\\D]+', '1234abc5678', SUCCEED, 'found', 'abc'), ('[\\da-fA-F]+', '123abc', SUCCEED, 'found', '123abc'), - ('[\\d-x]', '-', SYNTAX_ERROR), + # not an error under PCRE/PRE: + # ('[\\d-x]', '-', SYNTAX_ERROR), (r'([\s]*)([\S]*)([\s]*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '), (r'(\s*)(\S*)(\s*)', ' testing!1972', SUCCEED, 'g3+g2+g1', 'testing!1972 '), (r'\xff', '\377', SUCCEED, 'found', chr(255)), - (r'\x00ff', '\377', SUCCEED, 'found', chr(255)), + # new \x semantics + (r'\x00ff', '\377', FAIL, 'found', chr(255)), + # (r'\x00ff', '\377', SUCCEED, 'found', chr(255)), (r'\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), ('\t\n\v\r\f\a\g', '\t\n\v\r\f\ag', SUCCEED, 'found', '\t\n\v\r\f\ag'), (r'\t\n\v\r\f\a', '\t\n\v\r\f\a', SUCCEED, 'found', chr(9)+chr(10)+chr(11)+chr(13)+chr(12)+chr(7)), (r'[\t][\n][\v][\r][\f][\b]', '\t\n\v\r\f\b', SUCCEED, 'found', '\t\n\v\r\f\b'), - # additional regression tests (1.6 and later) + # + # post-1.5.2 additions # xmllib problem (r'(([a-z]+):)?([a-z]+)$', 'smil', SUCCEED, 'g1+"-"+g2+"-"+g3', 'None-None-smil'), - + # bug 111869 (PRE/PCRE fails on this one, SRE doesn't) + (r'.*d', 'abc\nabd', SUCCEED, 'found', 'abd'), + # bug 112468 + ('(', '', SYNTAX_ERROR), + ('[\\41]', '!', SUCCEED, 
'found', '!'), ]