# cpython/Lib/re.py

#!/usr/bin/env python
# -*- mode: python -*-
# $Id$

import string
import reop

# Exception raised by this module for every pattern that cannot be compiled
# or assembled.  It is a string exception: raise it as `raise error, message`.
error = 're error'

# compilation flags (bit values that may be OR-ed together)

IGNORECASE = 0x01
MULTILINE = 0x02
DOTALL = 0x04
VERBOSE = 0x08

# one-letter aliases for the flags above
I = IGNORECASE
M = MULTILINE
S = DOTALL
X = VERBOSE

#
# Initialize syntax table. This information should really come from the
# syntax table in regexpr.c rather than being duplicated here.
#

syntax_table = {}

for char in map(chr, range(0, 256)):
    syntax_table[char] = []

for char in string.lowercase:
    syntax_table[char].append('word')

for char in string.uppercase:
    syntax_table[char].append('word')

for char in string.digits:
    syntax_table[char].append('word')
    syntax_table[char].append('digit')

for char in string.whitespace:
    syntax_table[char].append('whitespace')

syntax_table['_'].append('word')

#
#
#

def valid_identifier(id):
    """Return 1 if id may be used as a symbolic group name, otherwise 0.

    A valid name is non-empty, starts with a 'word' character that is not
    a digit, and contains only 'word' characters (as classified by
    syntax_table).
    """
    if not id:
        return 0
    first_classes = syntax_table[id[0]]
    if ('digit' in first_classes) or ('word' not in first_classes):
        return 0
    for ch in id[1:]:
        if 'word' not in syntax_table[ch]:
            return 0
    return 1

#
#
#

def match(pattern, string, flags=0):
    """Compile pattern with flags and return the result of its match() on string."""
    regex = compile(pattern, flags)
    return regex.match(string)

def search(pattern, string, flags=0):
    """Compile pattern with flags and return the result of its search() on string."""
    regex = compile(pattern, flags)
    return regex.search(string)

def sub(pattern, repl, string, count=0):
    """Compile pattern (default flags) and delegate to the compiled object's sub()."""
    regex = compile(pattern)
    return regex.sub(repl, string, count)

def subn(pattern, repl, string, count=0):
    """Compile pattern (default flags) and delegate to the compiled object's subn()."""
    regex = compile(pattern)
    return regex.subn(repl, string, count)

def split(pattern, string, maxsplit=0):
    """Compile pattern (default flags) and delegate to the compiled object's split().

    The previous body called subn(string, maxsplit), which is a different
    operation with a different signature.
    """
    return compile(pattern).split(string, maxsplit)

#
#
#

class RegexObject:
    """A compiled pattern, as returned by compile().

    Holds the instruction list, the assembled byte-code buffer, the fastmap
    built from the instructions and the group bookkeeping.
    """
    def __init__(self, pattern, flags, code, num_regs, groupindex, callouts):
        self.pattern = pattern
        self.flags = flags
        self.code = code
        self.num_regs = num_regs
        self.groupindex = groupindex
        self.callouts = callouts
        self.fastmap = build_fastmap(code)
        # anchor: 1 if the code starts with 'bol', 2 if it starts with
        # 'begbuf', 0 otherwise
        first = code[0].name
        if first == 'bol':
            self.anchor = 1
        elif first == 'begbuf':
            self.anchor = 2
        else:
            self.anchor = 0
        self.buffer = assemble(code)
    def search(self, string, pos=0):
        """Run reop.search on string from pos; return a MatchObject or None."""
        fastmap = self.fastmap
        regs = reop.search(self.buffer, self.num_regs, self.flags,
                           fastmap.can_be_null, fastmap.fastmap(),
                           self.anchor, string, pos)
        if regs is not None:
            return MatchObject(self, string, pos, regs)
        return None
    def match(self, string, pos=0):
        """Run reop.match on string at pos; return a MatchObject or None."""
        fastmap = self.fastmap
        regs = reop.match(self.buffer, self.num_regs, self.flags,
                          fastmap.can_be_null, fastmap.fastmap(),
                          self.anchor, string, pos)
        if regs is not None:
            return MatchObject(self, string, pos, regs)
        return None
    def sub(self, repl, string, count=0):
        # not implemented; returns None
        pass
    def subn(self, repl, string, count=0):
        # not implemented; returns None
        pass
    def split(self, string, maxsplit=0):
        # not implemented; returns None
        pass

class MatchObject:
    def __init__(self, re, string, pos, regs):
	self.re = re
	self.string = string
	self.pos = pos
	self.regs = regs
    def start(self, g):
	if type(g) == type(''):
	    try:
		g = self.re.groupindex[g]
	    except (KeyError, TypeError):
		raise IndexError, ('group "' + g + '" is undefined')
	return self.regs[g][0]
    def end(self, g):
	if type(g) == type(''):
	    try:
		g = self.re.groupindex[g]
	    except (KeyError, TypeError):
		raise IndexError, ('group "' + g + '" is undefined')
	return self.regs[g][1]
    def span(self, g):
	if type(g) == type(''):
	    try:
		g = self.re.groupindex[g]
	    except (KeyError, TypeError):
		raise IndexError, ('group "' + g + '" is undefined')
	return self.regs[g]
    def group(self, *groups):
	if len(groups) == 0:
	    groups = range(1, self.re.num_regs)
	result = []
	for g in groups:
	    if type(g) == type(''):
		try:
		    g = self.re.groupindex[g]
		except (KeyError, TypeError):
		    raise IndexError, ('group "' + g + '" is undefined')
	    if (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
		result.append(None)
	    else:
		result.append(self.string[self.regs[g][0]:self.regs[g][1]])
	if len(result) > 1:
	    return tuple(result)
	elif len(result) == 1:
	    return result[0]
	else:
	    return ()

#
# A set of classes to make assembly a bit easier, if a bit verbose.
#

class Instruction:
    """Base class of all instructions.

    opcode is the string emitted for the instruction and size the number of
    bytes it occupies in the assembled buffer.  Subclasses supply `name`.
    """
    def __init__(self, opcode, size=1):
        self.size = size
        self.opcode = opcode
    def assemble(self, position, labels):
        # an instruction without operands is just its opcode
        return self.opcode
    def __repr__(self):
        return '%-15s' % self.name

class FunctionCallout(Instruction):
    """Callout instruction: opcode 22, a length byte, then the function name."""
    name = 'function'
    def __init__(self, function):
        self.function = function
        # opcode byte + length byte + the name itself
        Instruction.__init__(self, chr(22), len(function) + 2)
    def assemble(self, position, labels):
        length = chr(len(self.function))
        return self.opcode + length + self.function
    def __repr__(self):
        return '%-15s %-10s' % (self.name, self.function)

class End(Instruction):
    """Opcode 0; compile() appends it as the last instruction."""
    name = 'end'
    def __init__(self):
        Instruction.__init__(self, chr(0), 1)

class Bol(Instruction):
    """Opcode 1; compile() emits it for '^' when MULTILINE is set."""
    name = 'bol'
    def __init__(self):
        # the instance attribute duplicates the class attribute
        self.name = Bol.name
        Instruction.__init__(self, chr(1), 1)

class Eol(Instruction):
    """Opcode 2; compile() emits it for '$' when MULTILINE is set."""
    name = 'eol'
    def __init__(self):
        Instruction.__init__(self, chr(2), 1)

class Set(Instruction):
    """Character-set instruction: opcode 3 followed by a 32-byte bitmap.

    set is a sequence of single characters.  In the bitmap, character code
    i is represented by bit (i & 7) of byte (i >> 3).
    """
    name = 'set'
    def __init__(self, set):
        self.set = set
        Instruction.__init__(self, chr(3), 33)
    def assemble(self, position, labels):
        # One pass over the members instead of 256 membership scans.
        bitmap = [0] * 32
        for c in self.set:
            i = ord(c)
            bitmap[i >> 3] = bitmap[i >> 3] | (1 << (i & 7))
        return self.opcode + string.join(map(chr, bitmap), '')
    def __repr__(self):
        # Sort a copy: printing an instruction must not reorder the
        # caller's list.
        chars = list(self.set)
        chars.sort()
        return ('%-15s' % self.name) + string.join(chars, '')

class Exact(Instruction):
    """Literal-character instruction: opcode 4 followed by the character."""
    name = 'exact'
    def __init__(self, char):
        self.char = char
        Instruction.__init__(self, chr(4), 2)
    def assemble(self, position, labels):
        return self.opcode + self.char
    def __repr__(self):
        return '%-15s %s' % (self.name, repr(self.char))

class AnyChar(Instruction):
    """Opcode 5; compile() emits it for '.' when DOTALL is not set."""
    name = 'anychar'
    def __init__(self):
        Instruction.__init__(self, chr(5), 1)
    def assemble(self, position, labels):
        # no operands: the opcode alone
        return self.opcode

class MemoryInstruction(Instruction):
    """Base class of two-byte instructions: an opcode and a register number."""
    def __init__(self, opcode, register):
        self.register = register
        Instruction.__init__(self, opcode, 2)
    def assemble(self, position, labels):
        operand = chr(self.register)
        return self.opcode + operand
    def __repr__(self):
        return '%-15s %i' % (self.name, self.register)

class StartMemory(MemoryInstruction):
    """Opcode 6; compile() places it before the body of a capturing group."""
    name = 'start_memory'
    def __init__(self, register):
        opcode = chr(6)
        MemoryInstruction.__init__(self, opcode, register)

class EndMemory(MemoryInstruction):
    """Opcode 7; compile() places it after the body of a capturing group."""
    name = 'end_memory'
    def __init__(self, register):
        opcode = chr(7)
        MemoryInstruction.__init__(self, opcode, register)

class MatchMemory(MemoryInstruction):
    """Opcode 8; compile() emits it for a backreference to a register."""
    name = 'match_memory'
    def __init__(self, register):
        opcode = chr(8)
        MemoryInstruction.__init__(self, opcode, register)

class JumpInstruction(Instruction):
    def __init__(self, opcode, label):
	self.label = label
	Instruction.__init__(self, opcode, 3)
    def compute_offset(self, start, dest):
	return dest - (start + 3)
    def pack_offset(self, offset):
	if offset > 32767:
	    raise error, 'offset out of range (pos)'
	elif offset < -32768:
	    raise error, 'offset out of range (neg)'
	elif offset < 0:
	    offset = offset + 65536
	return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
    def assemble(self, position, labels):
	return self.opcode + \
	       self.pack_offset(self.compute_offset(position,
						    labels[self.label]))
    def __repr__(self):
	return '%-15s %i' % (self.name, self.label)

class Jump(JumpInstruction):
    """Jump with opcode 9."""
    name = 'jump'
    def __init__(self, label):
        opcode = chr(9)
        JumpInstruction.__init__(self, opcode, label)

class StarJump(JumpInstruction):
    """Jump with opcode 10; compile() uses it to close greedy loops."""
    name = 'star_jump'
    def __init__(self, label):
        opcode = chr(10)
        JumpInstruction.__init__(self, opcode, label)

class FailureJump(JumpInstruction):
    """Jump with opcode 11."""
    name = 'failure_jump'
    def __init__(self, label):
        opcode = chr(11)
        JumpInstruction.__init__(self, opcode, label)

class UpdateFailureJump(JumpInstruction):
    """Jump with opcode 12."""
    name = 'update_failure_jump'
    def __init__(self, label):
        opcode = chr(12)
        JumpInstruction.__init__(self, opcode, label)

class DummyFailureJump(JumpInstruction):
    """Jump with opcode 13; compile() emits it at the head of a greedy '+'."""
    # The name used to be a copy of UpdateFailureJump's; 'dummy_failure_jump'
    # is the name build_fastmap_aux() looks for.
    name = 'dummy_failure_jump'
    def __init__(self, label):
        JumpInstruction.__init__(self, chr(13), label)

class BegBuf(Instruction):
    """Opcode 14; compile() emits it for \\A and for '^' without MULTILINE."""
    name = 'begbuf'
    def __init__(self):
        Instruction.__init__(self, chr(14), 1)

class EndBuf(Instruction):
    """Opcode 15; compile() emits it for \\Z and for '$' without MULTILINE."""
    name = 'endbuf'
    def __init__(self):
        Instruction.__init__(self, chr(15), 1)

class WordBeg(Instruction):
    """Opcode 16 (not emitted by compile() in this file)."""
    name = 'wordbeg'
    def __init__(self):
        Instruction.__init__(self, chr(16), 1)

class WordEnd(Instruction):
    """Opcode 17 (not emitted by compile() in this file)."""
    name = 'wordend'
    def __init__(self):
        Instruction.__init__(self, chr(17), 1)

class WordBound(Instruction):
    """Opcode 18; compile() emits it for \\b outside a character class."""
    name = 'wordbound'
    def __init__(self):
        Instruction.__init__(self, chr(18), 1)

class NotWordBound(Instruction):
    """Opcode 19; compile() emits it for \\B outside a character class."""
    name = 'notwordbound'
    def __init__(self):
        # This used to be chr(18), which is WordBound's opcode, so \B was
        # assembled as \b.  19 is the slot left free between wordbound (18)
        # and syntaxspec (20).
        # NOTE(review): confirm 19 against the opcode table in regexpr.c.
        Instruction.__init__(self, chr(19))

class SyntaxSpec(Instruction):
    """Opcode 20 followed by one byte identifying a syntax class."""
    name = 'syntaxspec'
    def __init__(self, syntax):
        self.syntax = syntax
        Instruction.__init__(self, chr(20), 2)
    def assemble(self, postition, labels):
        # XXX
        # NOTE(review): compile() passes class names such as 'word' as
        # self.syntax, and chr() of a string raises TypeError; the numeric
        # code the matcher expects is not visible in this file.
        operand = chr(self.syntax)
        return self.opcode + operand

class NotSyntaxSpec(Instruction):
    """Opcode 21 followed by one byte identifying a syntax class."""
    name = 'notsyntaxspec'
    def __init__(self, syntax):
        self.syntax = syntax
        Instruction.__init__(self, chr(21), 2)
    def assemble(self, postition, labels):
        # XXX
        # NOTE(review): compile() passes class names such as 'word' as
        # self.syntax, and chr() of a string raises TypeError; the numeric
        # code the matcher expects is not visible in this file.
        operand = chr(self.syntax)
        return self.opcode + operand

class Label(Instruction):
    """Pseudo-instruction marking a jump target; it occupies no bytes."""
    name = 'label'
    def __init__(self, label):
        self.label = label
        Instruction.__init__(self, opcode='', size=0)
    def __repr__(self):
        return '%-15s %i' % (self.name, self.label)

class OpenParen(Instruction):
    name = '('
    def __init__(self, register):
	self.register = register
	Instruction.__init__(self, '', 0)
    def assemble(self, position, labels):
	raise error, 'unmatched open parenthesis'

class Alternation(Instruction):
    name = '|'
    def __init__(self):
	Instruction.__init__(self, '', 0)
    def assemble(self, position, labels):
	raise error, 'an alternation was not taken care of'

#
#
#

def assemble(instructions):
    """Turn a list of instructions into the assembled code string.

    The first pass assigns a byte offset to every instruction and records
    the offset of each label; the second pass asks each instruction for its
    encoding, handing it its own offset and the label table.
    """
    labels = {}
    placed = []
    offset = 0
    for ins in instructions:
        if ins.name != 'label':
            placed.append((offset, ins))
            offset = offset + ins.size
        else:
            # a label takes no room; it names the offset that follows it
            labels[ins.label] = offset
    assembled = ''
    for entry in placed:
        assembled = assembled + entry[1].assemble(entry[0], labels)
    return assembled

#
#
#

def escape(pattern):
    """Return pattern with a backslash before every non-'word' character."""
    escaped = ''
    for ch in pattern:
        if 'word' in syntax_table[ch]:
            escaped = escaped + ch
        else:
            escaped = escaped + '\\' + ch
    return escaped

#
#
#

def registers_used(instructions):
    """Return the registers referenced by the group instructions.

    The result lists each register once, in order of first appearance,
    taken from the 'start_memory' and 'end_memory' instructions.  (The old
    test looked for 'set_memory', a name no instruction carries, so
    start_memory instructions were never seen.)
    """
    result = []
    for instruction in instructions:
        if instruction.name in ['start_memory', 'end_memory']:
            if instruction.register not in result:
                result.append(instruction.register)
    return result

#
#
#

class Fastmap:
    """The set of characters with which a match can start.

    map has one entry per character code: '\\001' if the character was
    added, '\\000' otherwise.  can_be_null starts at 0 and is set by
    build_fastmap_aux().
    """
    def __init__(self):
        self.map = ['\000'] * 256
        self.can_be_null = 0
    def add(self, char):
        self.map[ord(char)] = '\001'
    def fastmap(self):
        """Return the map as a 256-character string."""
        return string.join(self.map, '')
    def __getitem__(self, char):
        return ord(self.map[ord(char)])
    def __repr__(self):
        # Sort a copy.  Sorting self.map itself, as this method used to do,
        # moved every '\001' to the end of the list and so destroyed the
        # map as a side effect of printing it.
        entries = self.map[:]
        entries.sort()
        return 'Fastmap(' + repr(self.can_be_null) + ', ' + repr(entries) + ')'

#
#
#

def find_label(code, label):
    """Return the index just past the Label numbered label in code.

    Returns None when code contains no such label.
    """
    for line in range(len(code)):
        instruction = code[line]
        if instruction.name == 'label':
            if instruction.label == label:
                return line + 1
    return None

def build_fastmap_aux(code, pos, visited, fastmap):
    """Record in fastmap every character a match starting at code[pos] can begin with.

    code is the instruction list, visited a list of flags parallel to code
    (updated in place so that no position is followed twice) and fastmap a
    Fastmap.  fastmap.can_be_null is set to 1 when the end of the code, a
    backreference or a function callout is reachable without consuming a
    character, and to 2 when only 'eol' was reached and it was still 0.
    """
    if visited[pos]:
        return
    while 1:
        instruction = code[pos]
        visited[pos] = 1
        pos = pos + 1
        name = instruction.name
        if name == 'end':
            fastmap.can_be_null = 1
            return
        elif name == 'syntaxspec':
            for char in map(chr, range(256)):
                if instruction.syntax in syntax_table[char]:
                    fastmap.add(char)
            return
        elif name == 'notsyntaxspec':
            for char in map(chr, range(256)):
                if instruction.syntax not in syntax_table[char]:
                    fastmap.add(char)
            return
        elif name == 'eol':
            fastmap.add('\n')
            if fastmap.can_be_null == 0:
                fastmap.can_be_null = 2
            return
        elif name == 'set':
            for char in instruction.set:
                fastmap.add(char)
            return
        elif name == 'exact':
            fastmap.add(instruction.char)
            # A literal consumes a character, so nothing after it can start
            # the match.  Without this return the scan ran on and added the
            # following literals as well.
            return
        elif name == 'anychar':
            for char in map(chr, range(256)):
                if char != '\n':
                    fastmap.add(char)
            return
        elif name == 'match_memory':
            # a backreference can start with anything, or be empty
            for char in map(chr, range(256)):
                fastmap.add(char)
            fastmap.can_be_null = 1
            return
        elif name in ['jump', 'dummy_failure_jump',
                      'update_failure_jump', 'star_jump']:
            # unconditional transfer: carry on at the target
            pos = find_label(code, instruction.label)
            if visited[pos]:
                return
            visited[pos] = 1
        elif name == 'failure_jump':
            # both the target and the fall-through path are possible
            build_fastmap_aux(code,
                              find_label(code, instruction.label),
                              visited,
                              fastmap)
        elif name == 'function':
            for char in map(chr, range(256)):
                fastmap.add(char)
            fastmap.can_be_null = 1
            return
        # any other instruction consumes nothing; keep scanning

def build_fastmap(code, pos=0):
    """Return a new Fastmap describing how a match of code can start at pos."""
    fastmap = Fastmap()
    visited = len(code) * [0]
    build_fastmap_aux(code, pos, visited, fastmap)
    return fastmap

#
#
#

# contexts in which expand_escape() can be asked to interpret an escape
[NORMAL, CHARCLASS, REPLACEMENT] = range(3)

# kinds of result returned by expand_escape().  NOT_SYNTAX is returned for
# \W, \S and \D and tested by compile(), but was missing from this list; it
# is added at the end so that the existing values do not change.
[CHAR, MEMORY_REFERENCE, SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY,
 BEGINNING_OF_BUFFER, END_OF_BUFFER, NOT_SYNTAX] = range(9)

def expand_escape(pattern, index, context=NORMAL):
    if index >= len(pattern):
	raise error, 'escape ends too soon'

    elif pattern[index] == 't':
	return CHAR, chr(9), index + 1

    elif pattern[index] == 'n':
	return CHAR, chr(10), index + 1

    elif pattern[index] == 'r':
	return CHAR, chr(13), index + 1

    elif pattern[index] == 'f':
	return CHAR, chr(12), index + 1

    elif pattern[index] == 'a':
	return CHAR, chr(7), index + 1

    elif pattern[index] == 'e':
	return CHAR, chr(27), index + 1

    elif pattern[index] == 'c':
	if index + 1 >= len(pattern):
	    raise error, '\\c must be followed by another character'
	elif pattern[index + 1] in 'abcdefghijklmnopqrstuvwxyz':
	    return CHAR, chr(ord(pattern[index + 1]) - ord('a') + 1), index + 2
	else:
	    return CHAR, chr(ord(pattern[index + 1]) ^ 64), index + 2

    elif pattern[index] == 'x':
	# CAUTION: this is the Python rule, not the Perl rule!
	end = index
	while (end < len(pattern)) and (pattern[end] in string.hexdigits):
	    end = end + 1
	if end == index:
	    raise error, "\\x must be followed by hex digit(s)"
	# let Python evaluate it, so we don't incorrectly 2nd-guess
	# what it's doing (and Python in turn passes it on to sscanf,
	# so that *it* doesn't incorrectly 2nd-guess what C does!)
	char = eval ('"' + pattern[index-2:end] + '"')
	assert len(char) == 1
	return CHAR, char, end

    elif pattern[index] == 'b':
	if context != NORMAL:
	    return CHAR, chr(8), index + 1
	else:
	    return WORD_BOUNDARY, '', index + 1

    elif pattern[index] == 'B':
	if context != NORMAL:
	    return CHAR, 'B', index + 1
	else:
	    return NOT_WORD_BOUNDARY, '', index + 1

    elif pattern[index] == 'A':
	if context != NORMAL:
	    return CHAR, 'A', index + 1
	else:
	    return BEGINNING_OF_BUFFER, '', index + 1

    elif pattern[index] == 'Z':
	if context != NORMAL:
	    return 'Z', index + 1
	else:
	    return END_OF_BUFFER, '', index + 1

    elif pattern[index] in 'GluLUQE':
	raise error, ('\\' + ch + ' is not allowed')

    elif pattern[index] == 'w':
	if context == NORMAL:
	    return SYNTAX, 'word', index + 1
	elif context == CHARCLASS:
	    set = []
	    for char in syntax_table.keys():
		if 'word' in syntax_table[char]:
		    set.append(char)
	    return SET, set, index + 1
	else:
	    return CHAR, 'w', index + 1

    elif pattern[index] == 'W':
	if context == NORMAL:
	    return NOT_SYNTAX, 'word', index + 1
	elif context == CHARCLASS:
	    set = []
	    for char in syntax_table.keys():
		if 'word' not in syntax_table[char]:
		    set.append(char)
	    return SET, set, index + 1
	else:
	    return CHAR, 'W', index + 1

    elif pattern[index] == 's':
	if context == NORMAL:
	    return SYNTAX, 'whitespace', index + 1
	elif context == CHARCLASS:
	    set = []
	    for char in syntax_table.keys():
		if 'whitespace' in syntax_table[char]:
		    set.append(char)
	    return SET, set, index + 1
	else:
	    return CHAR, 's', index + 1

    elif pattern[index] == 'S':
	if context == NORMAL:
	    return NOT_SYNTAX, 'whitespace', index + 1
	elif context == CHARCLASS:
	    set = []
	    for char in syntax_table.keys():
		if 'whitespace' not in syntax_table[char]:
		    set.append(char)
	    return SET, set, index + 1
	else:
	    return CHAR, 'S', index + 1

    elif pattern[index] == 'd':
	if context == NORMAL:
	    return SYNTAX, 'digit', index + 1
	elif context == CHARCLASS:
	    set = []
	    for char in syntax_table.keys():
		if 'digit' in syntax_table[char]:
		    set.append(char)
	    return SET, set, index + 1
	else:
	    return CHAR, 'd', index + 1

    elif pattern[index] == 'D':
	if context == NORMAL:
	    return NOT_SYNTAX, 'digit', index + 1
	elif context == CHARCLASS:
	    set = []
	    for char in syntax_table.keys():
		if 'digit' not in syntax_table[char]:
		    set.append(char)
	    return SET, set, index + 1
	else:
	    return CHAR, 'D', index + 1

    elif pattern[index] in '0123456789':
	end = index
	while (end < len(pattern)) and (pattern[end] in string.digits):
	    end = end + 1
	value = pattern[index:end]

	if (len(value) == 3) or ((len(value) == 2) and (value[0] == '0')):
	    # octal character value
	    value = string.atoi(value, 8)
	    if value > 255:
		raise error, 'octal char out of range'
	    return CHAR, chr(value), end

	elif value == '0':
	    return CHAR, chr(0), end

	elif len(value) > 3:
	    raise error, ('\\' + value + ' has too many digits')

	else:
	    # \1-\99 - reference a register
	    if context == CHARCLASS:
		raise error, ('cannot reference a register from '
			      'inside a character class')
	    value = string.atoi(value)
	    if value == 0:
		raise error, ('register 0 cannot be used '
			      'during match')
	    return MEMORY_REFERENCE, value, end

    else:
	return CHAR, pattern[index], index + 1

def compile(pattern, flags=0):
    stack = []
    index = 0
    label = 0
    register = 1
    groupindex = {}
    callouts = []
    while (index < len(pattern)):
	char = pattern[index]
	index = index + 1
	if char == '\\':
	    escape_type, value, index = expand_escape(pattern, index)

	    if escape_type == CHAR:
		stack.append([Exact(value)])

	    elif escape_type == MEMORY_REFERENCE:
		if value >= register:
		    raise error, ('cannot reference a register '
				  'not yet used')
		stack.append([MatchMemory(value)])

	    elif escape_type == BEGINNING_OF_BUFFER:
		stack.append([BegBuf()])

	    elif escape_type == END_OF_BUFFER:
		stack.append([EndBuf()])

	    elif escape_type == WORD_BOUNDARY:
		stack.append([WordBound()])

	    elif escape_type == NOT_WORD_BOUNDARY:
		stack.append([NotWordBound()])

	    elif escape_type == SYNTAX:
		stack.append([SyntaxSpec(value)])

	    elif escape_type == NOT_SYNTAX:
		stack.append([NotSyntaxSpec(value)])

	    elif escape_type == SET:
		raise error, 'cannot use set escape type here'

	    else:
		raise error, 'unknown escape type'

	elif char == '|':
	    if len(stack) == 0:
		raise error, 'alternate with nothing on the left'
	    if stack[-1][0].name == '(':
		raise error, 'alternate with nothing on the left in the group'
	    if stack[-1][0].name == '|':
		raise error, 'alternates with nothing inbetween them'
	    expr = []

	    while (len(stack) != 0) and \
		  (stack[-1][0].name != '(') and \
		  (stack[-1][0].name != '|'):
		expr = stack[-1] + expr
		del stack[-1]
	    stack.append([FailureJump(label)] + \
			 expr + \
			 [Jump(-1),
			  Label(label)])
	    stack.append([Alternation()])
	    label = label + 1

	elif char == '(':
	    if index >= len(pattern):
		raise error, 'no matching close paren'

	    elif pattern[index] == '?':
		# Perl style (?...) extensions
		index = index + 1
		if index >= len(pattern):
		    raise error, 'extension ends prematurely'

		elif pattern[index] == 'P':
		    # Python extensions
		    index = index + 1
		    if index >= len(pattern):
			raise error, 'extension ends prematurely'

		    elif pattern[index] == '<':
			# Handle Python symbolic group names (?P<...>...)
			index = index + 1
			end = string.find(pattern, '>', index)
			if end == -1:
			    raise error, 'no end to symbolic group name'
			name = pattern[index:end]
			if not valid_identifier(name):
			    raise error, ('symbolic group name must be a '
					  'valid identifier')
			index = end + 1
			groupindex[name] = register
			stack.append([OpenParen(register)])
			register = register + 1

		    elif pattern[index] == '=':
			# backreference to symbolic group name
			if index >= len(pattern):
			    raise error, '(?P= at the end of the pattern'
			start = index + 1
			end = string.find(pattern, ')', start)
			if end == -1:
			    raise error, 'no ) to end symbolic group name'
			name = pattern[start:end]
			if name not in groupindex.keys():
			    raise error, ('symbolic group name ' + name + \
					  ' has not been used yet')
			stack.append([MatchMemory(groupindex[name])])
			index = end + 1

		    elif pattern[index] == '!':
			# function callout
			if index >= len(pattern):
			    raise error, 'no function callout name'
			start = index + 1
			end = string.find(pattern, ')', start)
			if end == -1:
			    raise error, 'no ) to end function callout name'
			name = pattern[start:end]
			if name not in callouts:
			    raise error, ('function callout name not listed '
					  'in callouts dict')
			stack.append([FunctionCallout(name)])

		    else:
			raise error, ('unknown Python extension: ' + \
				      pattern[index])

		elif pattern[index] == ':':
		    # grouping, but no registers
		    index = index + 1
		    stack.append([OpenParen(-1)])

		elif pattern[index] == '#':
		    # comment
		    index = index + 1
		    end = string.find(pattern, ')', index)
		    if end == -1:
			raise error, 'no end to comment'
		    index = end + 1

		elif pattern[index] == '=':
		    raise error, ('zero-width positive lookahead '
				  'assertion is unsupported')

		elif pattern[index] == '!':
		    raise error, ('zero-width negative lookahead '
				  'assertion is unsupported')

		elif pattern[index] in 'iImMsSxX':
		    while (index < len(pattern)) and (pattern[index] != ')'):
			if pattern[index] in 'iI':
			    flags = flags | IGNORECASE
			elif pattern[index] in 'mM':
			    flags = flags | MULTILINE
			elif pattern[index] in 'sS':
			    flags = flags | DOTALL
			elif pattern[index] in 'xX':
			    flags = flags | VERBOSE
			else:
			    raise error, 'unknown flag'
			index = index + 1
		    index = index + 1

		else:
		    raise error, 'unknown extension'

	    else:
		stack.append([OpenParen(register)])
		register = register + 1

	elif char == ')':
	    # make one expression out of everything on the stack up to
	    # the marker left by the last parenthesis
	    expr = []
	    while (len(stack) > 0) and (stack[-1][0].name != '('):
		expr = stack[-1] + expr
		del stack[-1]

	    if len(stack) == 0:
		raise error, 'too many close parens'

	    if len(expr) == 0:
		raise error, 'nothing inside parens'

	    # check to see if alternation used correctly
	    if (expr[-1].name == '|'):
		raise error, 'alternate with nothing on the right'

	    # remove markers left by alternation
	    expr = filter(lambda x: x.name != '|', expr)

	    # clean up jumps inserted by alternation
	    need_label = 0
	    for i in range(len(expr)):
		if (expr[i].name == 'jump') and (expr[i].label == -1):
		    expr[i] = Jump(label)
		    need_label = 1
	    if need_label:
		expr.append(Label(label))
		label = label + 1

	    if stack[-1][0].register > 0:
		expr = [StartMemory(stack[-1][0].register)] + \
		       expr + \
		       [EndMemory(stack[-1][0].register)]
	    del stack[-1]
	    stack.append(expr)

	elif char == '{':
	    if len(stack) == 0:
		raise error, 'no expression to repeat'
	    end = string.find(pattern, '}', index)
	    if end == -1:
		raise error, ('no close curly bracket to match'
			      ' open curly bracket')

	    fields = map(string.strip,
			 string.split(pattern[index:end], ','))
	    index = end + 1

	    minimal = 0
	    if (index < len(pattern)) and (pattern[index] == '?'):
		minimal = 1
		index = index + 1

	    if len(fields) == 1:
		# {n} or {n}? (there's really no difference)
		try:
		    count = string.atoi(fields[0])
		except ValueError:
		    raise error, ('count must be an integer '
				  'inside curly braces')
		if count > 65535:
		    raise error, 'repeat count out of range'
		expr = []
		while count > 0:
		    expr = expr + stack[-1]
		    count = count - 1
		del stack[-1]
		stack.append(expr)

	    elif len(fields) == 2:
		# {n,} or {n,m}
		if fields[1] == '':
		    # {n,}
		    try:
			min = string.atoi(fields[0])
		    except ValueError:
			raise error, ('minimum must be an integer '
				      'inside curly braces')
		    if min > 65535:
			raise error, 'minimum repeat count out of range'

		    expr = []
		    while min > 0:
			expr = expr + stack[-1]
			min = min - 1
		    registers = registers_used(stack[-1])
		    if minimal:
			expr = expr + \
			       ([Jump(label + 1),
				 Label(label)] + \
				stack[-1] + \
				[Label(label + 1),
				 FailureJump(label, registers)])
		    else:
			expr = expr + \
			       ([Label(label),
				 FailureJump(label + 1, registers)] +
				stack[-1] +
				[StarJump(label),
				 Label(label + 1)])

		    del stack[-1]
		    stack.append(expr)
		    label = label + 2

		else:
		    # {n,m}
		    try:
			min = string.atoi(fields[0])
		    except ValueError:
			raise error, ('minimum must be an integer '
				      'inside curly braces')
		    try:
			max = string.atoi(fields[1])
		    except ValueError:
			raise error, ('maximum must be an integer '
				      'inside curly braces')
		    if min > 65535:
			raise error, ('minumim repeat count out '
				      'of range')
		    if max > 65535:
			raise error, ('maximum repeat count out '
				      'of range')
		    if min > max:
			raise error, ('minimum repeat count must be '
				      'less than the maximum '
				      'repeat count')
		    expr = []
		    while min > 0:
			expr = expr + stack[-1]
			min = min - 1
			max = max - 1
		    if minimal:
			while max > 0:
			    expr = expr + \
				   [FailureJump(label),
				    Jump(label + 1),
				    Label(label)] + \
				   stack[-1] + \
				   [Label(label + 1)]
			    label = label + 2
			del stack[-1]
			stack.append(expr)
		    else:
			while max > 0:
			    expr = expr + \
				   [FailureJump(label)] + \
				   stack[-1]
			    max = max - 1
			del stack[-1]
			stack.append(expr + [Label(label)])
			label = label + 1

	    else:
		raise error, ('there need to be one or two fields '
			      'in a {} expression')
	    index = end + 1

	elif char == '}':
	    raise error, 'unbalanced close curly brace'

	elif char == '*':
	    # Kleene closure
	    if len(stack) == 0:
		raise error, '* needs something to repeat'
	    if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
		raise error, '* needs something to repeat'
	    registers = registers_used(stack[-1])
	    if (index < len(pattern)) and (pattern[index] == '?'):
		# non-greedy matching
		expr = [JumpInstructions(label + 1),
			Label(label)] + \
		       stack[-1] + \
		       [Label(label + 1),
			FailureJump(label)]
		index = index + 1
	    else:
		# greedy matching
		expr = [Label(label),
			FailureJump(label + 1)] + \
		       stack[-1] + \
		       [StarJump(label),
			Label(label + 1)]
	    del stack[-1]
	    stack.append(expr)
	    label = label + 2

	elif char == '+':
	    # positive closure
	    if len(stack) == 0:
		raise error, '+ needs something to repeat'
	    if (stack[-1][0].name == '(') or (stack[-1][0].name == '|'):
		raise error, '+ needs something to repeat'
	    registers = registers_used(stack[-1])
	    if (index < len(pattern)) and (pattern[index] == '?'):
		# non-greedy
		expr = [Label(label)] + \
		       stack[-1] + \
		       [FailureJump(label)]
		label = label + 1
		index = index + 1
	    else:
		# greedy
		expr = [DummyFailureJump(label + 1),
			Label(label),
			FailureJump(label + 2),
			Label(label + 1)] + \
		       stack[-1] + \
		       [StarJump(label),
			Label(label + 2)]
		label = label + 3
	    del stack[-1]
	    stack.append(expr)

	elif char == '?':
	    if len(stack) == 0:
		raise error, 'need something to be optional'
	    registers = registers_used(stack[-1])
	    if (index < len(pattern)) and (pattern[index] == '?'):
		# non-greedy matching
		expr = [FailureJump(label),
			Jump(label + 1),
			Label(label)] + \
		       stack[-1] + \
		       [Label(label + 1)]
		label = label + 2
		index = index + 1
	    else:
		# greedy matching
		expr = [FailureJump(label)] + \
		       stack[-1] + \
		       [Label(label)]
		label = label + 1
	    del stack[-1]
	    stack.append(expr)

	elif char == '.':
	    if flags & DOTALL:
		stack.append(Set(map(chr, range(256))))
	    else:
		stack.append([AnyChar()])

	elif char == '^':
	    if flags & MULTILINE:
		stack.append([Bol()])
	    else:
		stack.append([BegBuf()])

	elif char == '$':
	    if flags & MULTILINE:
		stack.append([Eol()])
	    else:
		stack.append([EndBuf()])

	elif char == '#':
	    if flags & VERBOSE:
		# comment
		index = index + 1
		end = string.find(pattern, '\n', index)
		if end == -1:
		    index = len(pattern)
		else:
		    index = end + 1
	    else:
		stack.append([Exact(char)])

	elif char in string.whitespace:
	    if not (flags & VERBOSE):
		stack.append([Exact(char)])

	elif char == '[':
	    # compile character class

	    if index >= len(pattern):
		raise error, 'unclosed character class'

	    negate = 0
	    last = ''
	    set = []

	    if pattern[index] == '^':
		negate = 1
		index = index + 1
		if index >= len(pattern):
		    raise error, 'unclosed character class'

	    if pattern[index] == ']':
		set.append(']')
		index = index + 1
		if index >= len(pattern):
		    raise error, 'unclosed character class'

	    elif pattern[index] == '-':
		set.append('-')
		index = index + 1
		if index >= len(pattern):
		    raise error, 'unclosed character class'

	    while (index < len(pattern)) and (pattern[index] != ']'):
		next = pattern[index]
		index = index + 1
		if next == '-':
		    if index >= len(pattern):
			raise error, 'incomplete range in character class'

		    elif pattern[index] == ']':
			set.append('-')

		    else:
			if last == '':
			    raise error, ('improper use of range in '
					  'character class')

			start = last

			if pattern[index] == '\\':
			    escape_type,
			    value,
			    index = expand_escape(pattern,
						  index + 1,
						  CHARCLASS)

			    if escape_type == CHAR:
				end = value

			    else:
				raise error, ('illegal escape in character '
					      'class range')
			else:
			    end = pattern[index]
			    index = index + 1

			if start > end:
			    raise error, ('range arguments out of order '
					  'in character class')

			for char in map(chr, range(ord(start), ord(end) + 1)):
			    if char not in set:
				set.append(char)

			last = ''

		elif next == '\\':
		    # expand syntax meta-characters and add to set
		    if index >= len(pattern):
			raise error, 'incomplete set'

		    escape_type, value, index = expand_escape(pattern,
							      index,
							      CHARCLASS)

		    if escape_type == CHAR:
			set.append(value)
			last = value

		    elif escape_type == SET:
			for char in value:
			    if char not in set:
				set.append(char)
			last = ''

		    else:
			raise error, 'illegal escape type in character class'

		else:
		    if next not in set:
			set.append(next)
		    last = next

	    if (index >= len(pattern)) or ( pattern[index] != ']'):
		raise error, 'incomplete set'

	    index = index + 1

	    if negate:
		notset = []
		for char in map(chr, range(256)):
		    if char not in set:
			notset.append(char)
		if len(notset) == 0:
		    raise error, 'empty negated set'
		stack.append([Set(notset)])
	    else:
		if len(set) == 0:
		    raise error, 'empty set'
		stack.append([Set(set)])

	else:
	    stack.append([Exact(char)])

    code = []
    while len(stack) > 0:
	if stack[-1][0].name == '(':
	    raise error, 'too many open parens'
	code = stack[-1] + code
	del stack[-1]
    if len(code) == 0:
	raise error, 'no code generated'
    if (code[-1].name == '|'):
	raise error, 'alternate with nothing on the right'
    code = filter(lambda x: x.name != '|', code)
    need_label = 0
    for i in range(len(code)):
	if (code[i].name == 'jump') and (code[i].label == -1):
	    code[i] = Jump(label)
	    need_label = 1
    if need_label:
	code.append(Label(label))
	label = label + 1
    code.append(End())
    return RegexObject(pattern, flags, code, register, groupindex, callouts)

if __name__ == '__main__':
    print compile('a(b)*')
    print compile('a{3}')
    print compile('(a){2}')
    print compile('a{2,4}')
    print compile('a|b')
    print compile('a(b|c)')
    print compile('a*')
    print compile('a+')
    print compile('a|b|c')
    print compile('a(b|c)*')
    print compile('\\n')
    print compile('a(?# huh huh)b')
    print compile('[a-c\\w]')
    print compile('[[]')
    print compile('[]]')
    print compile('(<hello>a)')
    print compile('\Q*\e')
    print compile('a{0,}')