From 5ca1b711b5dfe3de292330130987e06f65612cb9 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Thu, 10 Jul 1997 21:00:31 +0000 Subject: [PATCH] Straight from Jeffrey Ollie's web page. --- Lib/re.py | 1137 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1137 insertions(+) create mode 100644 Lib/re.py diff --git a/Lib/re.py b/Lib/re.py new file mode 100644 index 00000000000..016cd68a5ec --- /dev/null +++ b/Lib/re.py @@ -0,0 +1,1137 @@ +#!/usr/bin/env python +# -*- mode: python -*- +# $Id$ + +import string +import reop + +error = 're error' + +# compilation flags + +IGNORECASE = I = 0x01 + +MULTILINE = M = 0x02 +DOTALL = S = 0x04 +VERBOSE = X = 0x08 + +# +# Initialize syntax table. This information should really come from the +# syntax table in regexpr.c rather than being duplicated here. +# + +syntax_table = {} + +for char in map(chr, range(0, 256)): + syntax_table[char] = [] + +for char in string.lowercase: + syntax_table[char].append('word') + +for char in string.uppercase: + syntax_table[char].append('word') + +for char in string.digits: + syntax_table[char].append('word') + syntax_table[char].append('digit') + +for char in string.whitespace: + syntax_table[char].append('whitespace') + +syntax_table['_'].append('word') + +# +# +# + +def match(pattern, string, flags=0): + return compile(pattern, flags).match(string) + +def search(pattern, string, flags=0): + return compile(pattern, flags).search(string) + +def sub(pattern, repl, string, count=0): + pass + +def subn(pattern, repl, string, count=0): + pass + +def split(string, pattern, maxsplit=0): + pass + +# +# +# + +class RegexObject: + def __init__(self, pattern, flags, code, num_regs, groupindex, callouts): + self.code = code + self.num_regs = num_regs + self.flags = flags + self.pattern = pattern + self.groupindex = groupindex + self.callouts = callouts + self.fastmap = build_fastmap(code) + if code[0].name == 'bol': + self.anchor = 1 + elif code[0].name == 'begbuf': + self.anchor = 2 + else: + self.anchor = 0 + self.buffer = assemble(code) + def match(self, string, pos=0): + regs = reop.match(self.buffer, + self.num_regs, + self.flags, + self.fastmap.can_be_null, + self.fastmap.fastmap(), + self.anchor, + string, + pos) + if regs is None: + return None + return MatchObject(self, + string, + pos, + regs) + def search(self, string, pos=0): + regs = reop.search(self.buffer, + self.num_regs, + self.flags, + self.fastmap.can_be_null, + self.fastmap.fastmap(), + self.anchor, + string, + pos) + if regs is None: + return None + return MatchObject(self, + string, + pos, + regs) + +class MatchObject: + def __init__(self, re, string, pos, regs): + self.re = re + self.string = string + self.pos = pos + self.regs = regs + def start(self, i): + if type(i) == type(''): + try: + i = self.re.groupindex[i] + except (KeyError, TypeError): + raise IndexError + return self.regs[i][0] + def end(self, i): + if type(i) == type(''): + try: + i = self.re.groupindex[i] + except (KeyError, TypeError): + raise IndexError + return self.regs[i][1] + def span(self, i): + if type(i) == type(''): + try: + i = self.re.groupindex[i] + except (KeyError, TypeError): + raise IndexError + return self.regs[i] + def group(i): + if type(i) == type(''): + try: + i = self.re.groupindex[i] + except (KeyError, TypeError): + raise IndexError + return self.string[self.regs[i][0]:self.regs[i][1]] + +# +# A set of classes to make assembly a bit easier, if a bit verbose. +# + +class Instruction: + def __init__(self, opcode, size=1): + self.opcode = opcode + self.size = size + def assemble(self, position, labels): + return self.opcode + def __repr__(self): + return '%-15s' % (self.name) + +class FunctionCallout(Instruction): + name = 'function' + def __init__(self, function): + self.function = function + Instruction.__init__(self, chr(22), 2 + len(self.function)) + def assemble(self, position, labels): + return self.opcode + chr(len(self.function)) + self.function + def __repr__(self): + return '%-15s %-10s' % (self.name, self.function) + +class End(Instruction): + name = 'end' + def __init__(self): + Instruction.__init__(self, chr(0)) + +class Bol(Instruction): + name = 'bol' + def __init__(self): + self.name = 'bol' + Instruction.__init__(self, chr(1)) + +class Eol(Instruction): + name = 'eol' + def __init__(self): + Instruction.__init__(self, chr(2)) + +class Set(Instruction): + name = 'set' + def __init__(self, set): + self.set = set + Instruction.__init__(self, chr(3), 33) + def assemble_set(self, position, labels): + result = self.opcode + temp = 0 + for i, c in map(lambda x: (x, chr(x)), range(256)): + if c in self.set[2]: + temp = temp | (1 << (i & 7)) + if (i % 8) == 7: + result = result + chr(temp) + temp = 0 + return result + def __repr__(self): + result = '%-15s' % (self.name) + self.set.sort() + for char in self.set: + result = result + `char` + return result + +class Exact(Instruction): + name = 'exact' + def __init__(self, char): + self.char = char + Instruction.__init__(self, chr(4), 2) + def assemble(self, position, labels): + return self.opcode + self.char + def __repr__(self): + return '%-15s %s' % (self.name, `self.char`) + +class AnyChar(Instruction): + name = 'anychar' + def __init__(self): + Instruction.__init__(self, chr(5)) + def assemble(self, position, labels): + return self.opcode + +class MemoryInstruction(Instruction): + def __init__(self, opcode, register): + self.register = register + Instruction.__init__(self, opcode, 2) + def assemble(self, position, labels): + return self.opcode + chr(self.register) + def __repr__(self): + return '%-15s %i' % (self.name, self.register) + +class StartMemory(MemoryInstruction): + name = 'start_memory' + def __init__(self, register): + MemoryInstruction.__init__(self, chr(6), register) + +class EndMemory(MemoryInstruction): + name = 'end_memory' + def __init__(self, register): + MemoryInstruction.__init__(self, chr(7), register) + +class MatchMemory(MemoryInstruction): + name = 'match_memory' + def __init__(self, register): + MemoryInstruction.__init__(self, chr(8), register) + +class JumpInstruction(Instruction): + def __init__(self, opcode, label): + self.label = label + Instruction.__init__(self, opcode, 3) + def compute_offset(self, start, dest): + return dest - (start + 3) + def pack_offset(self, offset): + if offset > 32767: + raise error, 'offset out of range (pos)' + elif offset < -32768: + raise error, 'offset out of range (neg)' + elif offset < 0: + offset = offset + 65536 + return chr(offset & 0xff) + chr((offset >> 8) & 0xff) + def assemble(self, position, labels): + return self.opcode + \ + self.pack_offset(self.compute_offset(position, + labels[self.label])) + def __repr__(self): + return '%-15s %i' % (self.name, self.label) + +class Jump(JumpInstruction): + name = 'jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(9), label) + +class StarJump(JumpInstruction): + name = 'star_jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(10), label) + +class FailureJump(JumpInstruction): + name = 'failure_jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(11), label) + +class UpdateFailureJump(JumpInstruction): + name = 'update_failure_jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(12), label) + +class DummyFailureJump(JumpInstruction): + name = 'update_failure_jump' + def __init__(self, label): + JumpInstruction.__init__(self, chr(13), label) + +class BegBuf(Instruction): + name = 'begbuf' + def __init__(self): + Instruction.__init__(self, chr(14)) + +class EndBuf(Instruction): + name = 'endbuf' + def __init__(self): + Instruction.__init__(self, chr(15)) + +class WordBeg(Instruction): + name = 'wordbeg' + def __init__(self): + Instruction.__init__(self, chr(16)) + +class WordEnd(Instruction): + name = 'wordend' + def __init__(self): + Instruction.__init__(self, chr(17)) + +class WordBound(Instruction): + name = 'wordbound' + def __init__(self): + Instruction.__init__(self, chr(18)) + +class NotWordBound(Instruction): + name = 'notwordbound' + def __init__(self): + Instruction.__init__(self, chr(18)) + +class SyntaxSpec(Instruction): + name = 'syntaxspec' + def __init__(self, syntax): + self.syntax = syntax + Instruction.__init__(self, chr(20), 2) + def assemble(self, postition, labels): + return self.opcode + chr(self.syntax) + +class SyntaxSpec(Instruction): + name = 'notsyntaxspec' + def __init__(self, syntax): + self.syntax = syntax + Instruction.__init__(self, chr(21), 2) + def assemble(self, postition, labels): + return self.opcode + chr(self.syntax) + +class Label(Instruction): + name = 'label' + def __init__(self, label): + self.label = label + Instruction.__init__(self, '', 0) + def __repr__(self): + return '%-15s %i' % (self.name, self.label) + +class OpenParen(Instruction): + name = '(' + def __init__(self, register): + self.register = register + Instruction.__init__(self, '', 0) + +class Alternation(Instruction): + name = '|' + def __init__(self): + Instruction.__init__(self, '', 0) + +# +# +# + +def assemble(instructions): + labels = {} + position = 0 + pass1 = [] + for instruction in instructions: + if instruction.name == 'label': + labels[instruction.label] = position + else: + pass1.append((position, instruction)) + position = position + instruction.size + pass2 = '' + for position, instruction in pass1: + pass2 = pass2 + instruction.assemble(position, labels) + return pass2 + +# +# +# + +def escape(pattern): + result = '' + for char in pattern: + if 'word' not in syntax_table[char]: + result = result + '\\' + char + else: + result = result + char + return result + +# +# +# + +def registers_used(instructions): + result = [] + for instruction in instructions: + if (instruction.name in ['set_memory', 'end_memory']) and \ + (instruction.register not in result): + result.append(instruction.register) + return result + +# +# +# + +class Fastmap: + def __init__(self): + self.map = ['\000']*256 + self.can_be_null = 0 + def add(self, char): + self.map[ord(char)] = '\001' + def fastmap(self): + return string.join(self.map, '') + def __getitem__(self, char): + return ord(self.map[ord(char)]) + def __repr__(self): + self.map.sort() + return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')' + +# +# +# + +def find_label(code, label): + line = 0 + for instruction in code: + if (instruction.name == 'label') and (instruction.label == label): + return line + 1 + line = line + 1 + +def build_fastmap_aux(code, pos, visited, fastmap): + if visited[pos]: + return + while 1: + instruction = code[pos] + visited[pos] = 1 + pos = pos + 1 + if instruction.name == 'end': + fastmap.can_be_null = 1 + return + elif instruction.name == 'syntaxspec': + for char in map(chr, range(256)): + if instruction.syntax in syntax_table[char]: + fastmap.add(char) + return + elif instruction.name == 'notsyntaxspec': + for char in map(chr, range(256)): + if instruction.syntax not in syntax_table[char]: + fastmap.add(char) + return + elif instruction.name == 'eol': + fastmap.add('\n') + if fastmap.can_be_null == 0: + fastmap.can_be_null = 2 + return + elif instruction.name == 'set': + for char in instruction.set: + fastmap.add(char) + return + elif instruction.name == 'exact': + fastmap.add(instruction.char) + elif instruction.name == 'anychar': + for char in map(chr, range(256)): + if char != '\n': + fastmap.add(char) + return + elif instruction.name == 'match_memory': + for char in map(chr, range(256)): + fastmap.add(char) + fastmap.can_be_null = 1 + return + elif instruction.name in ['jump', 'dummy_failure_jump', \ + 'update_failure_jump', 'star_jump']: + pos = find_label(code, instruction.label) + if visited[pos]: + return + visited[pos] = 1 + elif instruction.name == 'failure_jump': + build_fastmap_aux(code, + find_label(code, instruction.label), + visited, + fastmap) + elif instruction.name == 'function': + for char in map(chr, range(256)): + fastmap.add(char) + fastmap.can_be_null = 1 + return + +def build_fastmap(code, pos=0): + visited = [0] * len(code) + fastmap = Fastmap() + build_fastmap_aux(code, pos, visited, fastmap) + return fastmap + +# +# +# + +def compile(pattern, flags=0): + stack = [] + index = 0 + label = 0 + register = 1 + groupindex = {} + callouts = [] + while (index < len(pattern)): + char = pattern[index] + index = index + 1 + if char == '\\': + if index < len(pattern): + next = pattern[index] + index = index + 1 + if next == 't': + stack.append([Exact(chr(9))]) + + elif next == 'n': + stack.append([Exact(chr(10))]) + + elif next == 'r': + stack.append([Exact(chr(13))]) + + elif next == 'f': + stack.append([Exact(chr(12))]) + + elif next == 'a': + stack.append([Exact(chr(7))]) + + elif next == 'e': + stack.append([Exact(chr(27))]) + + elif next in '0123456789': + value = next + while (index < len(pattern)) and \ + (pattern[index] in string.digits): + value = value + pattern[index] + index = index + 1 + if (len(value) == 3) or \ + ((len(value) == 2) and (value[0] == '0')): + value = string.atoi(value, 8) + if value > 255: + raise error, 'octal char out of range' + stack.append([Exact(chr(value))]) + elif value == '0': + stack.append([Exact(chr(0))]) + elif len(value) > 3: + raise error, 'too many digits' + else: + value = string.atoi(value) + if value >= register: + raise error, ('cannot reference a register ' + 'not yet used') + elif value == 0: + raise error, ('register 0 cannot be used ' + 'during match') + stack.append([MatchMemory(value)]) + + elif next == 'x': + value = '' + while (index < len(pattern)) and \ + (pattern[index] in string.hexdigits): + value = value + pattern[index] + index = index + 1 + value = string.atoi(value, 16) + if value > 255: + raise error, 'hex char out of range' + stack.append([Exact(chr(value))]) + + elif next == 'c': + if index >= len(pattern): + raise error, '\\c at end of re' + elif pattern[index] in 'abcdefghijklmnopqrstuvwxyz': + stack.append(Exact(chr(ord(pattern[index]) - + ord('a') + 1))) + else: + stack.append(Exact(chr(ord(pattern[index]) ^ 64))) + index = index + 1 + + elif next == 'A': + stack.append([BegBuf()]) + + elif next == 'Z': + stack.append([EndBuf()]) + + elif next == 'b': + stack.append([WordBound()]) + + elif next == 'B': + stack.append([NotWordBound()]) + + elif next == 'w': + stack.append([SyntaxSpec('word')]) + + elif next == 'W': + stack.append([NotSyntaxSpec('word')]) + + elif next == 's': + stack.append([SyntaxSpec('whitespace')]) + + elif next == 'S': + stack.append([NotSyntaxSpec('whitespace')]) + + elif next == 'd': + stack.append([SyntaxSpec('digit')]) + + elif next == 'D': + stack.append([NotSyntaxSpec('digit')]) + + elif next in 'GluLUQE': + # some perl-isms that we don't support + raise error, '\\' + next + ' not supported' + + else: + stack.append([Exact(pattern[index])]) + + else: + raise error, 'backslash at the end of a string' + + elif char == '|': + if len(stack) == 0: + raise error, 'nothing to alternate' + expr = [] + while (len(stack) != 0) and \ + (stack[-1][0].name != '(') and \ + (stack[-1][0].name != '|'): + expr = stack[-1] + expr + del stack[-1] + stack.append([FailureJump(label)] + \ + expr + \ + [Jump(-1), + Label(label)]) + stack.append([('|',)]) + label = label + 1 + + elif char == '(': + if index >= len(pattern): + raise error, 'no matching close paren' + + elif pattern[index] == '?': + # Perl style (?...) extensions + index = index + 1 + if index >= len(pattern): + raise error, 'extension ends prematurely' + + elif pattern[index] == 'P': + # Python extensions + index = index + 1 + if index >= len(pattern): + raise error, 'extension ends prematurely' + + elif pattern[index] == '<': + # Handle Python symbolic group names (?<...>...) + index = index + 1 + end = string.find(pattern, '>', index) + if end == -1: + raise error, 'no end to symbolic group name' + name = pattern[index:end] + # XXX check syntax of name + index = end + 1 + groupindex[name] = register + stack.append([OpenParen(register)]) + register = register + 1 + + elif pattern[index] == '=': + # backreference to symbolic group name + if index >= len(pattern): + raise error, '(?P= at the end of the pattern' + start = index + 1 + end = string.find(pattern, ')', start) + if end == -1: + raise error, 'no ) to end symbolic group name' + name = pattern[start:end] + if name not in groupindex: + raise error, ('symbolic group name ' + name + \ + ' has not been used yet') + stack.append([MatchMemory(groupindex[name])]) + index = end + 1 + + elif pattern[index] == '!': + # function callout + if index >= len(pattern): + raise error, 'no function callout name' + start = index + 1 + end = string.find(pattern, ')', start) + if end == -1: + raise error, 'no ) to end function callout name' + name = pattern[start:end] + if name not in callouts: + raise error, ('function callout name not listed ' + 'in callouts dict') + stack.append([FunctionCallout(name)]) + + else: + raise error, 'unknown Python extension' + + elif pattern[index] == ':': + # grouping, but no registers + index = index + 1 + stack.append([('(', -1)]) + + elif pattern[index] == '#': + # comment + index = index + 1 + end = string.find(pattern, ')', index) + if end == -1: + raise error, 'no end to comment' + index = end + 1 + + elif pattern[index] == '=': + raise error, ('zero-width positive lookahead ' + 'assertion is unsupported') + + elif pattern[index] == '!': + raise error, ('zero-width negative lookahead ' + 'assertion is unsupported') + + elif pattern[index] in 'iImMsSxX': + while (index < len(pattern)) and (pattern[index] != ')'): + if pattern[index] == 'iI': + flags = flags | IGNORECASE + elif pattern[index] == 'mM': + flags = flags | MULTILINE + elif pattern[index] == 'sS': + flags = flags | DOTALL + elif pattern[index] in 'xX': + flags = flags | VERBOSE + else: + raise error, 'unknown flag' + index = index + 1 + index = index + 1 + + else: + raise error, 'unknown extension' + + else: + stack.append([OpenParen(register)]) + register = register + 1 + + elif char == ')': + # make one expression out of everything on the stack up to + # the marker left by the last parenthesis + expr = [] + while (len(stack) > 0) and (stack[-1][0].name != '('): + expr = stack[-1] + expr + del stack[-1] + + if len(stack) == 0: + raise error, 'too many close parens' + if len(expr) == 0: + raise error, 'nothing inside parens' + + # check to see if alternation used correctly + if (expr[-1].name == '|'): + raise error, 'alternation with nothing on the right' + + # remove markers left by alternation + expr = filter(lambda x: x.name != '|', expr) + + # clean up jumps inserted by alternation + need_label = 0 + for i in range(len(expr)): + if (expr[i].name == 'jump') and (expr[i].label == -1): + expr[i] = JumpOpcode(label) + need_label = 1 + if need_label: + expr.append(Label(label)) + label = label + 1 + + if stack[-1][0][1] > 0: + expr = [StartMemory(stack[-1][0].register)] + \ + expr + \ + [EndMemory(stack[-1][0].register)] + del stack[-1] + stack.append(expr) + + elif char == '{': + if len(stack) == 0: + raise error, 'no expression to repeat' + end = string.find(pattern, '}', index) + if end == -1: + raise error, ('no close curly bracket to match' + ' open curly bracket') + + fields = map(string.strip, + string.split(pattern[index:end], ',')) + index = end + 1 + + minimal = 0 + if (index < len(pattern)) and (pattern[index] == '?'): + minimal = 1 + index = index + 1 + + if len(fields) == 1: + # {n} or {n}? (there's really no difference) + try: + count = string.atoi(fields[0]) + except ValueError: + raise error, ('count must be an integer ' + 'inside curly braces') + if count > 65535: + raise error, 'repeat count out of range' + expr = [] + while count > 0: + expr = expr + stack[-1] + count = count - 1 + del stack[-1] + stack.append(expr) + + elif len(fields) == 2: + # {n,} or {n,m} + if fields[1] == '': + # {n,} + try: + min = string.atoi(fields[0]) + except ValueError: + raise error, ('minimum must be an integer ' + 'inside curly braces') + if min > 65535: + raise error, 'minimum repeat count out of range' + + expr = [] + while min > 0: + expr = expr + stack[-1] + min = min - 1 + registers = registers_used(stack[-1]) + if minimal: + expr = expr + \ + ([Jump(label + 1), + Label(label)] + \ + stack[-1] + \ + [Label(label + 1), + FailureJump(label, registers)]) + else: + expr = expr + \ + ([Label(label), + FailureJump(label + 1, registers)] + + stack[-1] + + [StarJump(label), + Label(label + 1)]) + + del stack[-1] + stack.append(expr) + label = label + 2 + + else: + # {n,m} + try: + min = string.atoi(fields[0]) + except ValueError: + raise error, ('minimum must be an integer ' + 'inside curly braces') + try: + max = string.atoi(fields[1]) + except ValueError: + raise error, ('maximum must be an integer ' + 'inside curly braces') + if min > 65535: + raise error, ('minumim repeat count out ' + 'of range') + if max > 65535: + raise error, ('maximum repeat count out ' + 'of range') + if min > max: + raise error, ('minimum repeat count must be ' + 'less than the maximum ' + 'repeat count') + expr = [] + while min > 0: + expr = expr + stack[-1] + min = min - 1 + max = max - 1 + if minimal: + while max > 0: + expr = expr + \ + [FailureJump(label), + Jump(label + 1), + Label(label)] + \ + stack[-1] + \ + [Label(label + 1)] + label = label + 2 + del stack[-1] + stack.append(expr) + else: + while max > 0: + expr = expr + \ + [FailureJump(label)] + \ + stack[-1] + max = max - 1 + del stack[-1] + stack.append(expr + [Label(label)]) + label = label + 1 + + else: + raise error, ('there need to be one or two fields ' + 'in a {} expression') + index = end + 1 + + elif char == '}': + raise error, 'unbalanced close curly brace' + + elif char == '*': + # Kleene closure + if len(stack) == 0: + raise error, 'the Kleene closure needs something to repeat' + registers = registers_used(stack[-1]) + if (index < len(pattern)) and (pattern[index] == '?'): + # non-greedy matching + expr = [JumpInstructions(label + 1), + Label(label)] + \ + stack[-1] + \ + [Label(label + 1), + FailureJump(label)] + index = index + 1 + else: + # greedy matching + expr = [Label(label), + FailureJump(label + 1)] + \ + stack[-1] + \ + [StarJump(label), + Label(label + 1)] + del stack[-1] + stack.append(expr) + label = label + 2 + + elif char == '+': + # positive closure + if len(stack) == 0: + raise error, 'the positive closure needs something to repeat' + registers = registers_used(stack[-1]) + if (index < len(pattern)) and (pattern[index] == '?'): + # non-greedy + expr = [Label(label)] + \ + stack[-1] + \ + [FailureJump(label)] + label = label + 1 + index = index + 1 + else: + # greedy + expr = [DummyFailureJump(label + 1), + Label(label), + FailureJump(label + 2), + Label(label + 1)] + \ + stack[-1] + \ + [StarJump(label), + Label(label + 2)] + label = label + 3 + del stack[-1] + stack.append(expr) + + elif char == '?': + if len(stack) == 0: + raise error, 'need something to be optional' + registers = registers_used(stack[-1]) + if (index < len(pattern)) and (pattern[index] == '?'): + # non-greedy matching + expr = [FailureJump(label), + Jump(label + 1), + Label(label)] + \ + stack[-1] + \ + [Label(label + 1)] + label = label + 2 + index = index + 1 + else: + # greedy matching + expr = [FailureJump(label)] + \ + stack[-1] + \ + [Label(label)] + label = label + 1 + del stack[-1] + stack.append(expr) + + elif char == '.': + if flags & DOTALL: + stack.append(Set(map(chr, range(256)))) + else: + stack.append([AnyChar()]) + + elif char == '^': + if flags & MULTILINE: + stack.append([Bol()]) + else: + stack.append([BegBuf()]) + + elif char == '$': + if flags & MULTILINE: + stack.append([Eol()]) + else: + stack.append([EndBuf()]) + + elif char == '#': + if flags & VERBOSE: + # comment + index = index + 1 + end = string.find(pattern, '\n', index) + if end == -1: + index = len(pattern) + else: + index = end + 1 + else: + stack.append([Exact(char)]) + + elif char in string.whitespace: + if flags & VERBOSE: + stack.append([Exact(char)]) + + elif char == '[': + if index >= len(pattern): + raise error, 'incomplete set' + negate = 0 + last = '' + set = [] + if pattern[index] == '^': + negate = 1 + index = index + 1 + if index >= len(pattern): + raise error, 'incomplete set' + if pattern[index] in ']-': + set.append(pattern[index]) + last = pattern[index] + index = index + 1 + while (index < len(pattern)) and (pattern[index] != ']'): + next = pattern[index] + index = index + 1 + if next == '-': + if (index >= len(pattern)) or (pattern[index] == ']'): + raise error, 'incomplete range in set' + if last > pattern[index]: + raise error, 'range arguments out of order in set' + for next in map(chr, \ + range(ord(last), \ + ord(pattern[index]) + 1)): + if next not in set: + set.append(next) + last = '' + index = index + 1 + + elif next == '\\': + # expand syntax meta-characters and add to set + if index >= len(pattern): + raise error, 'incomplete set' + elif (pattern[index] == ']'): + raise error, 'backslash at the end of a set' + elif pattern[index] == 'w': + for next in syntax_table.keys(): + if 'word' in syntax_table[next]: + set.append(next) + elif pattern[index] == 'W': + for next in syntax_table.keys(): + if 'word' not in syntax_table[next]: + set.append(next) + elif pattern[index] == 'd': + for next in syntax_table.keys(): + if 'digit' in syntax_table[next]: + set.append(next) + elif pattern[index] == 'D': + for next in syntax_table.keys(): + if 'digit' not in syntax_table[next]: + set.append(next) + elif pattern[index] == 's': + for next in syntax_table.keys(): + if 'whitespace' in syntax_table[next]: + set.append(next) + elif pattern[index] == 'S': + for next in syntax_table.keys(): + if 'whitespace' not in syntax_table[next]: + set.append(next) + else: + raise error, 'unknown meta in set' + last = '' + index = index + 1 + + else: + if next not in set: + set.append(next) + last = next + + if pattern[index] != ']': + raise error, 'incomplete set' + + index = index + 1 + + if negate: + notset = [] + for char in map(chr, range(256)): + if char not in set: + notset.append(char) + stack.append([Set(notset)]) + else: + stack.append([Set(set)]) + + else: + stack.append([Exact(char)]) + + code = [] + while len(stack) > 0: + if stack[-1][0].name == '(': + raise error, 'too many open parens' + code = stack[-1] + code + del stack[-1] + if len(code) == 0: + raise error, 'no code generated' + if (code[-1].name == '|'): + raise error, 'alternation with nothing on the right' + code = filter(lambda x: x.name != '|', code) + need_label = 0 + for i in range(len(code)): + if (code[i].name == 'jump') and (code[i].label == -1): + code[i] = Jump(label) + need_label = 1 + if need_label: + code.append(Label(label)) + label = label + 1 + code.append(End()) + return RegexObject(pattern, flags, code, register, groupindex, callouts) + +if __name__ == '__main__': + print compile('a(b)*') + print compile('a{3}') + print compile('(a){2}') + print compile('a{2,4}') + print compile('a|b') + print compile('a(b|c)') + print compile('a*') + print compile('a+') + print compile('a|b|c') + print compile('a(b|c)*') + print compile('\\n') + print compile('a(?# huh huh)b') + print compile('[a-c\\w]') + print compile('[[]') + print compile('[]]') + print compile('(a)') + print compile('\Q*\e') + print compile('a{0,}') +