From b25e1ad253a4d96aea31a7a3fb78522ea354f43a Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Thu, 22 Mar 2001 15:50:10 +0000 Subject: [PATCH] sre 2.1b2 update: - take locale into account for word boundary anchors (#410271) - restored 2.0's *? behaviour (#233283, #408936 and others) - speed up re.sub/re.subn --- Lib/sre.py | 22 +++++++++++- Lib/sre_compile.py | 16 +++++---- Lib/sre_constants.py | 20 +++++++++-- Lib/sre_parse.py | 63 +++++++++++++++++++++++------------ Lib/test/re_tests.py | 11 ++++++ Lib/test/test_sre.py | 2 ++ Modules/_sre.c | 74 ++++++++++++++++++++++++++++++++--------- Modules/sre_constants.h | 6 +++- 8 files changed, 165 insertions(+), 49 deletions(-) diff --git a/Lib/sre.py b/Lib/sre.py index 48d390a5ec5..6706fac8692 100644 --- a/Lib/sre.py +++ b/Lib/sre.py @@ -23,6 +23,8 @@ "U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE", "UNICODE", "error" ] +__version__ = "2.1b2" + # this module works under 1.5.2 and later. don't use string methods import string @@ -90,6 +92,7 @@ def compile(pattern, flags=0): def purge(): "Clear the regular expression cache" _cache.clear() + _cache_repl.clear() def template(pattern, flags=0): "Compile a template pattern, returning a pattern object" @@ -111,6 +114,8 @@ def escape(pattern): # internals _cache = {} +_cache_repl = {} + _MAXCACHE = 100 def _join(seq, sep): @@ -134,6 +139,21 @@ def _compile(*key): _cache[key] = p return p +def _compile_repl(*key): + # internal: compile replacement pattern + p = _cache_repl.get(key) + if p is not None: + return p + repl, pattern = key + try: + p = sre_parse.parse_template(repl, pattern) + except error, v: + raise error, v # invalid expression + if len(_cache_repl) >= _MAXCACHE: + _cache_repl.clear() + _cache_repl[key] = p + return p + def _expand(pattern, match, template): # internal: match.expand implementation hook template = sre_parse.parse_template(template, pattern) @@ -148,7 +168,7 @@ def _subn(pattern, template, string, count=0): if callable(template): filter = template else: - template = sre_parse.parse_template(template, pattern) + template = _compile_repl(template, pattern) def filter(match, template=template): return sre_parse.expand_template(template, match) n = i = 0 diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index ab2a2cc9ad2..44cb23e6a4a 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -105,9 +105,12 @@ def fixup(literal, flags=flags): elif op is AT: emit(OPCODES[op]) if flags & SRE_FLAG_MULTILINE: - emit(ATCODES[AT_MULTILINE.get(av, av)]) - else: - emit(ATCODES[av]) + av = AT_MULTILINE.get(av, av) + if flags & SRE_FLAG_LOCALE: + av = AT_LOCALE.get(av, av) + elif flags & SRE_FLAG_UNICODE: + av = AT_UNICODE.get(av, av) + emit(ATCODES[av]) elif op is BRANCH: emit(OPCODES[op]) tail = [] @@ -124,11 +127,10 @@ def fixup(literal, flags=flags): elif op is CATEGORY: emit(OPCODES[op]) if flags & SRE_FLAG_LOCALE: - emit(CHCODES[CH_LOCALE[av]]) + av = CH_LOCALE[av] elif flags & SRE_FLAG_UNICODE: - emit(CHCODES[CH_UNICODE[av]]) - else: - emit(CHCODES[av]) + av = CH_UNICODE[av] + emit(CHCODES[av]) elif op is GROUPREF: if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) diff --git a/Lib/sre_constants.py b/Lib/sre_constants.py index b429a33cbf0..bbe7880a1d5 100644 --- a/Lib/sre_constants.py +++ b/Lib/sre_constants.py @@ -11,7 +11,7 @@ # update when constants are added or removed -MAGIC = 20010115 +MAGIC = 20010320 # max code word in this release @@ -67,6 +67,10 @@ class error(Exception): AT_END = "at_end" AT_END_LINE = "at_end_line" AT_END_STRING = "at_end_string" +AT_LOC_BOUNDARY = "at_loc_boundary" +AT_LOC_NON_BOUNDARY = "at_loc_non_boundary" +AT_UNI_BOUNDARY = "at_uni_boundary" +AT_UNI_NON_BOUNDARY = "at_uni_non_boundary" # categories CATEGORY_DIGIT = "category_digit" @@ -119,7 +123,9 @@ class error(Exception): ATCODES = [ AT_BEGINNING, AT_BEGINNING_LINE, AT_BEGINNING_STRING, AT_BOUNDARY, - AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING + AT_NON_BOUNDARY, AT_END, AT_END_LINE, AT_END_STRING, + AT_LOC_BOUNDARY, AT_LOC_NON_BOUNDARY, AT_UNI_BOUNDARY, + AT_UNI_NON_BOUNDARY ] CHCODES = [ @@ -157,6 +163,16 @@ def makedict(list): AT_END: AT_END_LINE } +AT_LOCALE = { + AT_BOUNDARY: AT_LOC_BOUNDARY, + AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY +} + +AT_UNICODE = { + AT_BOUNDARY: AT_UNI_BOUNDARY, + AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY +} + CH_LOCALE = { CATEGORY_DIGIT: CATEGORY_DIGIT, CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT, diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 3840365b8ef..44626bd5e82 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -638,6 +638,16 @@ def parse_template(source, pattern): s = Tokenizer(source) p = [] a = p.append + def literal(literal, p=p): + if p and p[-1][0] is LITERAL: + p[-1] = LITERAL, p[-1][1] + literal + else: + p.append((LITERAL, literal)) + sep = source[:0] + if type(sep) is type(""): + char = chr + else: + char = unichr while 1: this = s.get() if this is None: @@ -681,33 +691,42 @@ def parse_template(source, pattern): break if not code: this = this[1:] - code = LITERAL, atoi(this[-6:], 8) & 0xff - a(code) + code = LITERAL, char(atoi(this[-6:], 8) & 0xff) + if code[0] is LITERAL: + literal(code[1]) + else: + a(code) else: try: - a(ESCAPES[this]) + this = char(ESCAPES[this][1]) except KeyError: - for c in this: - a((LITERAL, ord(c))) + pass + literal(this) else: - a((LITERAL, ord(this))) - return p + literal(this) + # convert template to groups and literals lists + i = 0 + groups = [] + literals = [] + for c, s in p: + if c is MARK: + groups.append((i, s)) + literals.append(None) + else: + literals.append(s) + i = i + 1 + return groups, literals def expand_template(template, match): - # XXX: this is sooooo slow. drop in the slicelist code instead - p = [] - a = p.append + g = match.group sep = match.string[:0] - if type(sep) is type(""): - char = chr - else: - char = unichr - for c, s in template: - if c is LITERAL: - a(char(s)) - elif c is MARK: - s = match.group(s) + groups, literals = template + literals = literals[:] + try: + for index, group in groups: + literals[index] = s = g(group) if s is None: - raise error, "empty group" - a(s) - return string.join(p, sep) + raise IndexError + except IndexError: + raise error, "empty group" + return string.join(literals, sep) diff --git a/Lib/test/re_tests.py b/Lib/test/re_tests.py index aacd916267c..7c5dc890d91 100755 --- a/Lib/test/re_tests.py +++ b/Lib/test/re_tests.py @@ -639,3 +639,14 @@ # bug 130748: ^* should be an error (nothing to repeat) (r'^*', '', SYNTAX_ERROR), ] + +try: + u = eval("u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'") +except SyntaxError: + pass +else: + tests.extend([ + # bug 410271: \b broken under locales + (r'\b.\b', 'a', SUCCEED, 'found', 'a'), + (r'(?u)\b.\b', u, SUCCEED, 'found', u), + ]) diff --git a/Lib/test/test_sre.py b/Lib/test/test_sre.py index 88c0d62e8db..031cda6c0b4 100644 --- a/Lib/test/test_sre.py +++ b/Lib/test/test_sre.py @@ -329,6 +329,8 @@ def bump_num(matchobj): u = unicode(s, "latin-1") except NameError: pass + except TypeError: + continue # skip unicode test strings else: result=obj.search(u) if result==None: diff --git a/Modules/_sre.c b/Modules/_sre.c index 63e4ef361ec..8811038d0f7 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -24,8 +24,9 @@ * 2000-10-24 fl really fixed assert_not; reset groups in findall * 2000-12-21 fl fixed memory leak in groupdict * 2001-01-02 fl properly reset pointer after failed assertion in MIN_UNTIL - * 2001-01-15 fl avoid recursion for MIN_UTIL; fixed uppercase literal bug + * 2001-01-15 fl avoid recursion for MIN_UNTIL; fixed uppercase literal bug * 2001-01-16 fl fixed memory leak in pattern destructor + * 2001-03-20 fl lots of fixes for 2.1b2 * * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * @@ -40,7 +41,7 @@ #ifndef SRE_RECURSIVE -char copyright[] = " SRE 2.1 Copyright (c) 1997-2001 by Secret Labs AB "; +char copyright[] = " SRE 2.1b2 Copyright (c) 1997-2001 by Secret Labs AB "; #include "Python.h" @@ -141,11 +142,6 @@ static char sre_char_lower[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127 }; -static unsigned int sre_lower(unsigned int ch) -{ - return ((ch) < 128 ? sre_char_lower[ch] : ch); -} - #define SRE_IS_DIGIT(ch)\ ((ch) < 128 ? (sre_char_info[(ch)] & SRE_DIGIT_MASK) : 0) #define SRE_IS_SPACE(ch)\ @@ -157,30 +153,39 @@ static unsigned int sre_lower(unsigned int ch) #define SRE_IS_WORD(ch)\ ((ch) < 128 ? (sre_char_info[(ch)] & SRE_WORD_MASK) : 0) +static unsigned int sre_lower(unsigned int ch) +{ + return ((ch) < 128 ? sre_char_lower[ch] : ch); +} + /* locale-specific character predicates */ -static unsigned int sre_lower_locale(unsigned int ch) -{ - return ((ch) < 256 ? tolower((ch)) : ch); -} #define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0) #define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0) #define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n') #define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0) #define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_') +static unsigned int sre_lower_locale(unsigned int ch) +{ + return ((ch) < 256 ? tolower((ch)) : ch); +} + /* unicode-specific character predicates */ #if defined(HAVE_UNICODE) -static unsigned int sre_lower_unicode(unsigned int ch) -{ - return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); -} + #define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch)) #define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch)) #define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) #define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch)) #define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_') + +static unsigned int sre_lower_unicode(unsigned int ch) +{ + return (unsigned int) Py_UNICODE_TOLOWER((Py_UNICODE)(ch)); +} + #endif LOCAL(int) @@ -418,6 +423,42 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at) this = ((void*) ptr < state->end) ? SRE_IS_WORD((int) ptr[0]) : 0; return this == that; + + case SRE_AT_LOC_BOUNDARY: + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_LOC_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_LOC_IS_WORD((int) ptr[0]) : 0; + return this != that; + + case SRE_AT_LOC_NON_BOUNDARY: + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_LOC_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_LOC_IS_WORD((int) ptr[0]) : 0; + return this == that; + + case SRE_AT_UNI_BOUNDARY: + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_UNI_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_UNI_IS_WORD((int) ptr[0]) : 0; + return this != that; + + case SRE_AT_UNI_NON_BOUNDARY: + if (state->beginning == state->end) + return 0; + that = ((void*) ptr > state->beginning) ? + SRE_UNI_IS_WORD((int) ptr[-1]) : 0; + this = ((void*) ptr < state->end) ? + SRE_UNI_IS_WORD((int) ptr[0]) : 0; + return this == that; } return 0; @@ -1037,7 +1078,8 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) /* see if the tail matches */ state->repeat = rp->prev; - if (rp->pattern[2] == 65535) { + /* FIXME: the following fix doesn't always work (#133283) */ + if (0 && rp->pattern[2] == 65535) { /* unbounded repeat */ for (;;) { i = SRE_MATCH(state, pattern, level + 1); diff --git a/Modules/sre_constants.h b/Modules/sre_constants.h index c6850ad69be..73bcb349711 100644 --- a/Modules/sre_constants.h +++ b/Modules/sre_constants.h @@ -11,7 +11,7 @@ * See the _sre.c file for information on usage and redistribution. */ -#define SRE_MAGIC 20010115 +#define SRE_MAGIC 20010320 #define SRE_OP_FAILURE 0 #define SRE_OP_SUCCESS 1 #define SRE_OP_ANY 2 @@ -49,6 +49,10 @@ #define SRE_AT_END 5 #define SRE_AT_END_LINE 6 #define SRE_AT_END_STRING 7 +#define SRE_AT_LOC_BOUNDARY 8 +#define SRE_AT_LOC_NON_BOUNDARY 9 +#define SRE_AT_UNI_BOUNDARY 10 +#define SRE_AT_UNI_NON_BOUNDARY 11 #define SRE_CATEGORY_DIGIT 0 #define SRE_CATEGORY_NOT_DIGIT 1 #define SRE_CATEGORY_SPACE 2