cpython/Lib/re.py

1509 lines
37 KiB
Python
Raw Normal View History

#!/usr/bin/env python
# -*- mode: python -*-
# $Id$
import string
import reop
# reop.error and re.error should be the same, since exceptions can be
# raised from either module.
error = reop.error # 're error'
from reop import NORMAL, CHARCLASS, REPLACEMENT
from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET
from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER
# compilation flags
IGNORECASE = I = 0x01
MULTILINE = M = 0x02
DOTALL = S = 0x04
VERBOSE = X = 0x08
1997-07-17 22:38:10 +00:00
repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?',
'{n,}', '{n,}?', '{n,m}', '{n,m}?']
#
#
#
1997-07-15 15:38:20 +00:00
def valid_identifier(id):
if len(id) == 0:
return 0
1997-07-17 22:38:10 +00:00
if (not reop.syntax_table[id[0]] & reop.word) or \
(reop.syntax_table[id[0]] & reop.digit):
1997-07-15 15:38:20 +00:00
return 0
for char in id[1:]:
1997-07-17 22:38:10 +00:00
if not reop.syntax_table[char] & reop.word:
1997-07-15 15:38:20 +00:00
return 0
return 1
#
#
#
1997-07-15 18:59:04 +00:00
_cache = {}
_MAXCACHE = 20
def _cachecompile(pattern, flags=0):
1997-07-15 18:59:04 +00:00
key = (pattern, flags)
try:
return _cache[key]
except KeyError:
pass
value = compile(pattern, flags)
if len(_cache) >= _MAXCACHE:
_cache.clear()
_cache[key] = value
return value
def match(pattern, string, flags=0):
1997-07-15 18:59:04 +00:00
return _cachecompile(pattern, flags).match(string)
def search(pattern, string, flags=0):
1997-07-15 18:59:04 +00:00
return _cachecompile(pattern, flags).search(string)
def sub(pattern, repl, string, count=0):
if type(pattern) == type(''):
pattern = _cachecompile(pattern)
return pattern.sub(repl, string, count)
def subn(pattern, repl, string, count=0):
if type(pattern) == type(''):
pattern = _cachecompile(pattern)
return pattern.subn(repl, string, count)
1997-07-11 20:48:25 +00:00
def split(pattern, string, maxsplit=0):
if type(pattern) == type(''):
pattern = _cachecompile(pattern)
return pattern.split(string, maxsplit)
#
#
#
def _expand(m, repl):
results = []
index = 0
size = len(repl)
while index < size:
found = string.find(repl, '\\', index)
if found < 0:
results.append(repl[index:])
break
if found > index:
results.append(repl[index:found])
escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT)
if escape_type == CHAR:
results.append(value)
elif escape_type == MEMORY_REFERENCE:
r = m.group(value)
if r is None:
raise error, ('group "' + str(value) + '" did not contribute '
'to the match')
results.append(m.group(value))
else:
raise error, "bad escape in replacement"
return string.join(results, '')
class RegexObject:
def __init__(self, pattern, flags, code, num_regs, groupindex):
self.code = code
self.num_regs = num_regs
self.flags = flags
self.pattern = pattern
self.groupindex = groupindex
self.fastmap = build_fastmap(code)
if code[0].name == 'bol':
self.anchor = 1
elif code[0].name == 'begbuf':
self.anchor = 2
else:
self.anchor = 0
self.buffer = assemble(code)
def search(self, string, pos=0):
regs = reop.search(self.buffer,
self.num_regs,
self.flags,
self.fastmap.can_be_null,
self.fastmap.fastmap(),
self.anchor,
string,
pos)
if regs is None:
return None
return MatchObject(self,
string,
pos,
regs)
1997-07-11 20:48:25 +00:00
def match(self, string, pos=0):
regs = reop.match(self.buffer,
self.num_regs,
self.flags,
self.fastmap.can_be_null,
self.fastmap.fastmap(),
self.anchor,
string,
pos)
if regs is None:
return None
1997-07-11 20:48:25 +00:00
return MatchObject(self,
string,
pos,
regs)
1997-07-11 20:48:25 +00:00
def sub(self, repl, string, count=0):
return self.subn(repl, string, count)[0]
def subn(self, repl, source, count=0):
if count < 0:
raise ValueError, "negative substibution count"
if count == 0:
import sys
count = sys.maxint
if type(repl) == type(''):
if '\\' in repl:
repl = lambda m, r=repl: _expand(m, r)
else:
repl = lambda m, r=repl: r
n = 0 # Number of matches
pos = 0 # Where to start searching
lastmatch = -1 # End of last match
results = [] # Substrings making up the result
end = len(source)
while n < count and pos <= end:
m = self.search(source, pos)
if not m:
break
i, j = m.span(0)
if i == j == lastmatch:
# Empty match adjacent to previous match
pos = pos + 1
results.append(source[lastmatch:pos])
continue
if pos < i:
results.append(source[pos:i])
results.append(repl(m))
pos = lastmatch = j
if i == j:
# Last match was empty; don't try here again
pos = pos + 1
results.append(source[lastmatch:pos])
n = n + 1
results.append(source[pos:])
return (string.join(results, ''), n)
def split(self, source, maxsplit=0):
if maxsplit < 0:
raise error, "negative split count"
if maxsplit == 0:
import sys
maxsplit = sys.maxint
n = 0
pos = 0
lastmatch = 0
results = []
end = len(source)
while n < maxsplit:
m = self.search(source, pos)
if not m:
break
i, j = m.span(0)
if i == j:
# Empty match
if pos >= end:
break
pos = pos+1
continue
results.append(source[lastmatch:i])
g = m.group()
if g:
results[len(results):] = list(g)
pos = lastmatch = j
results.append(source[lastmatch:])
return results
class MatchObject:
def __init__(self, re, string, pos, regs):
self.re = re
self.string = string
self.pos = pos
self.regs = regs
1997-07-11 20:48:25 +00:00
def start(self, g):
if type(g) == type(''):
try:
1997-07-11 20:48:25 +00:00
g = self.re.groupindex[g]
except (KeyError, TypeError):
1997-07-11 20:48:25 +00:00
raise IndexError, ('group "' + g + '" is undefined')
return self.regs[g][0]
1997-07-11 20:48:25 +00:00
def end(self, g):
if type(g) == type(''):
try:
1997-07-11 20:48:25 +00:00
g = self.re.groupindex[g]
except (KeyError, TypeError):
1997-07-11 20:48:25 +00:00
raise IndexError, ('group "' + g + '" is undefined')
return self.regs[g][1]
1997-07-11 20:48:25 +00:00
def span(self, g):
if type(g) == type(''):
try:
1997-07-11 20:48:25 +00:00
g = self.re.groupindex[g]
except (KeyError, TypeError):
1997-07-11 20:48:25 +00:00
raise IndexError, ('group "' + g + '" is undefined')
return self.regs[g]
1997-07-11 20:48:25 +00:00
def group(self, *groups):
if len(groups) == 0:
groups = range(1, self.re.num_regs)
use_all = 1
else:
use_all = 0
1997-07-11 20:48:25 +00:00
result = []
for g in groups:
if type(g) == type(''):
try:
g = self.re.groupindex[g]
except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined')
1997-07-15 14:38:13 +00:00
if (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
1997-07-11 20:48:25 +00:00
result.append(None)
else:
result.append(self.string[self.regs[g][0]:self.regs[g][1]])
if use_all or len(result) > 1:
1997-07-11 20:48:25 +00:00
return tuple(result)
elif len(result) == 1:
return result[0]
else:
return ()
#
# A set of classes to make assembly a bit easier, if a bit verbose.
#
class Instruction:
def __init__(self, opcode, size=1):
self.opcode = opcode
self.size = size
def assemble(self, position, labels):
return self.opcode
def __repr__(self):
return '%-15s' % (self.name)
class End(Instruction):
name = 'end'
def __init__(self):
Instruction.__init__(self, chr(0))
class Bol(Instruction):
name = 'bol'
def __init__(self):
self.name = 'bol'
Instruction.__init__(self, chr(1))
class Eol(Instruction):
name = 'eol'
def __init__(self):
Instruction.__init__(self, chr(2))
class Set(Instruction):
name = 'set'
1997-08-13 22:34:14 +00:00
def __init__(self, set, flags=0):
self.set = set
1997-08-13 22:34:14 +00:00
if flags & IGNORECASE: self.set=map(string.lower, self.set)
if len(set)==1:
# If only one element, use the "exact" opcode (it'll be faster)
Instruction.__init__(self, chr(4), 2)
else:
# Use the "set" opcode
Instruction.__init__(self, chr(3), 33)
def assemble(self, position, labels):
1997-08-13 22:34:14 +00:00
if len(self.set)==1:
# If only one character in set, generate an "exact" opcode
return self.opcode + self.set[0]
result = self.opcode
temp = 0
for i, c in map(lambda x: (x, chr(x)), range(256)):
if c in self.set:
temp = temp | (1 << (i & 7))
if (i % 8) == 7:
result = result + chr(temp)
temp = 0
return result
def __repr__(self):
result = '%-15s' % (self.name)
self.set.sort()
1997-08-13 22:34:14 +00:00
# XXX this should print more intelligently
for char in self.set:
result = result + char
return result
class Exact(Instruction):
name = 'exact'
1997-08-13 22:34:14 +00:00
def __init__(self, char, flags):
self.char = char
1997-08-13 22:34:14 +00:00
if flags & IGNORECASE: self.char=string.lower(self.char)
Instruction.__init__(self, chr(4), 2)
def assemble(self, position, labels):
return self.opcode + self.char
def __repr__(self):
return '%-15s %s' % (self.name, `self.char`)
class AnyChar(Instruction):
name = 'anychar'
def __init__(self):
Instruction.__init__(self, chr(5))
def assemble(self, position, labels):
return self.opcode
class MemoryInstruction(Instruction):
def __init__(self, opcode, register):
self.register = register
Instruction.__init__(self, opcode, 2)
def assemble(self, position, labels):
return self.opcode + chr(self.register)
def __repr__(self):
return '%-15s %i' % (self.name, self.register)
class StartMemory(MemoryInstruction):
name = 'start_memory'
def __init__(self, register):
MemoryInstruction.__init__(self, chr(6), register)
class EndMemory(MemoryInstruction):
name = 'end_memory'
def __init__(self, register):
MemoryInstruction.__init__(self, chr(7), register)
class MatchMemory(MemoryInstruction):
name = 'match_memory'
def __init__(self, register):
MemoryInstruction.__init__(self, chr(8), register)
class JumpInstruction(Instruction):
def __init__(self, opcode, label):
self.label = label
Instruction.__init__(self, opcode, 3)
def compute_offset(self, start, dest):
return dest - (start + 3)
def pack_offset(self, offset):
if offset > 32767:
raise error, 'offset out of range (pos)'
elif offset < -32768:
raise error, 'offset out of range (neg)'
elif offset < 0:
offset = offset + 65536
return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
def assemble(self, position, labels):
return self.opcode + \
self.pack_offset(self.compute_offset(position,
labels[self.label]))
def __repr__(self):
return '%-15s %i' % (self.name, self.label)
class Jump(JumpInstruction):
name = 'jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(9), label)
class StarJump(JumpInstruction):
name = 'star_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(10), label)
class FailureJump(JumpInstruction):
name = 'failure_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(11), label)
class UpdateFailureJump(JumpInstruction):
name = 'update_failure_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(12), label)
class DummyFailureJump(JumpInstruction):
name = 'dummy_failure_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(13), label)
class BegBuf(Instruction):
name = 'begbuf'
def __init__(self):
Instruction.__init__(self, chr(14))
class EndBuf(Instruction):
name = 'endbuf'
def __init__(self):
Instruction.__init__(self, chr(15))
class WordBeg(Instruction):
name = 'wordbeg'
def __init__(self):
Instruction.__init__(self, chr(16))
class WordEnd(Instruction):
name = 'wordend'
def __init__(self):
Instruction.__init__(self, chr(17))
class WordBound(Instruction):
name = 'wordbound'
def __init__(self):
Instruction.__init__(self, chr(18))
class NotWordBound(Instruction):
name = 'notwordbound'
def __init__(self):
Instruction.__init__(self, chr(18))
class SyntaxSpec(Instruction):
name = 'syntaxspec'
def __init__(self, syntax):
self.syntax = syntax
Instruction.__init__(self, chr(20), 2)
def assemble(self, postition, labels):
return self.opcode + chr(self.syntax)
1997-07-11 20:48:25 +00:00
class NotSyntaxSpec(Instruction):
name = 'notsyntaxspec'
def __init__(self, syntax):
self.syntax = syntax
Instruction.__init__(self, chr(21), 2)
def assemble(self, postition, labels):
return self.opcode + chr(self.syntax)
class Label(Instruction):
name = 'label'
def __init__(self, label):
self.label = label
Instruction.__init__(self, '', 0)
def __repr__(self):
return '%-15s %i' % (self.name, self.label)
class OpenParen(Instruction):
name = '('
def __init__(self, register):
self.register = register
Instruction.__init__(self, '', 0)
1997-07-11 21:10:17 +00:00
def assemble(self, position, labels):
raise error, 'unmatched open parenthesis'
class Alternation(Instruction):
name = '|'
def __init__(self):
Instruction.__init__(self, '', 0)
1997-07-11 21:10:17 +00:00
def assemble(self, position, labels):
raise error, 'an alternation was not taken care of'
#
#
#
def assemble(instructions):
labels = {}
position = 0
pass1 = []
for instruction in instructions:
if instruction.name == 'label':
labels[instruction.label] = position
else:
pass1.append((position, instruction))
position = position + instruction.size
pass2 = ''
for position, instruction in pass1:
pass2 = pass2 + instruction.assemble(position, labels)
return pass2
#
#
#
def escape(pattern):
1997-07-11 20:48:25 +00:00
result = []
for char in pattern:
1997-07-17 22:38:10 +00:00
if not reop.syntax_table[char] & reop.word:
1997-07-11 20:48:25 +00:00
result.append('\\')
result.append(char)
return string.join(result, '')
#
#
#
def registers_used(instructions):
result = []
for instruction in instructions:
if (instruction.name in ['set_memory', 'end_memory']) and \
(instruction.register not in result):
result.append(instruction.register)
return result
#
#
#
class Fastmap:
def __init__(self):
self.map = ['\000']*256
self.can_be_null = 0
def add(self, char):
self.map[ord(char)] = '\001'
def fastmap(self):
return string.join(self.map, '')
def __getitem__(self, char):
return ord(self.map[ord(char)])
def __repr__(self):
self.map.sort()
return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
#
#
#
def find_label(code, label):
line = 0
for instruction in code:
if (instruction.name == 'label') and (instruction.label == label):
return line + 1
line = line + 1
def build_fastmap_aux(code, pos, visited, fastmap):
if visited[pos]:
return
while 1:
instruction = code[pos]
visited[pos] = 1
pos = pos + 1
if instruction.name == 'end':
fastmap.can_be_null = 1
return
elif instruction.name == 'syntaxspec':
for char in map(chr, range(256)):
1997-07-17 22:38:10 +00:00
if reop.syntax_table[char] & instruction.syntax:
fastmap.add(char)
return
elif instruction.name == 'notsyntaxspec':
for char in map(chr, range(256)):
1997-07-17 22:38:10 +00:00
if not reop.syntax_table[char] & instruction.syntax:
fastmap.add(char)
return
elif instruction.name == 'eol':
fastmap.add('\n')
if fastmap.can_be_null == 0:
fastmap.can_be_null = 2
return
elif instruction.name == 'set':
for char in instruction.set:
fastmap.add(char)
return
elif instruction.name == 'exact':
fastmap.add(instruction.char)
elif instruction.name == 'anychar':
for char in map(chr, range(256)):
if char != '\n':
fastmap.add(char)
return
elif instruction.name == 'match_memory':
for char in map(chr, range(256)):
fastmap.add(char)
fastmap.can_be_null = 1
return
elif instruction.name in ['jump', 'dummy_failure_jump', \
'update_failure_jump', 'star_jump']:
pos = find_label(code, instruction.label)
if visited[pos]:
return
visited[pos] = 1
elif instruction.name == 'failure_jump':
build_fastmap_aux(code,
find_label(code, instruction.label),
visited,
fastmap)
def build_fastmap(code, pos=0):
visited = [0] * len(code)
fastmap = Fastmap()
build_fastmap_aux(code, pos, visited, fastmap)
return fastmap
#
#
#
#[NORMAL, CHARCLASS, REPLACEMENT] = range(3)
#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY,
# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9)
1997-07-15 14:38:13 +00:00
def expand_escape(pattern, index, context=NORMAL):
if index >= len(pattern):
raise error, 'escape ends too soon'
elif pattern[index] == 't':
return CHAR, chr(9), index + 1
elif pattern[index] == 'n':
return CHAR, chr(10), index + 1
elif pattern[index] == 'v':
return CHAR, chr(11), index + 1
1997-07-15 14:38:13 +00:00
elif pattern[index] == 'r':
return CHAR, chr(13), index + 1
elif pattern[index] == 'f':
return CHAR, chr(12), index + 1
elif pattern[index] == 'a':
return CHAR, chr(7), index + 1
elif pattern[index] == 'x':
# CAUTION: this is the Python rule, not the Perl rule!
end = index + 1 # Skip over the 'x' character
1997-07-15 14:38:13 +00:00
while (end < len(pattern)) and (pattern[end] in string.hexdigits):
end = end + 1
if end == index:
raise error, "\\x must be followed by hex digit(s)"
# let Python evaluate it, so we don't incorrectly 2nd-guess
# what it's doing (and Python in turn passes it on to sscanf,
# so that *it* doesn't incorrectly 2nd-guess what C does!)
char = eval ('"' + pattern[index-1:end] + '"')
1997-07-15 14:38:13 +00:00
assert len(char) == 1
return CHAR, char, end
elif pattern[index] == 'b':
if context != NORMAL:
return CHAR, chr(8), index + 1
else:
return WORD_BOUNDARY, '', index + 1
elif pattern[index] == 'B':
if context != NORMAL:
return CHAR, 'B', index + 1
else:
return NOT_WORD_BOUNDARY, '', index + 1
elif pattern[index] == 'A':
if context != NORMAL:
return CHAR, 'A', index + 1
else:
return BEGINNING_OF_BUFFER, '', index + 1
elif pattern[index] == 'Z':
if context != NORMAL:
return CHAR, 'Z', index + 1
1997-07-15 14:38:13 +00:00
else:
return END_OF_BUFFER, '', index + 1
elif pattern[index] in 'GluLUQE':
raise error, ('\\' + pattern[index] + ' is not allowed')
1997-07-15 14:38:13 +00:00
elif pattern[index] == 'w':
if context == NORMAL:
1997-07-17 22:38:10 +00:00
return SYNTAX, reop.word, index + 1
1997-07-15 14:38:13 +00:00
elif context == CHARCLASS:
set = []
1997-07-17 22:38:10 +00:00
for char in reop.syntax_table.keys():
if reop.syntax_table[char] & reop.word:
1997-07-15 14:38:13 +00:00
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'w', index + 1
elif pattern[index] == 'W':
if context == NORMAL:
1997-07-17 22:38:10 +00:00
return NOT_SYNTAX, reop.word, index + 1
1997-07-15 14:38:13 +00:00
elif context == CHARCLASS:
set = []
1997-07-17 22:38:10 +00:00
for char in reop.syntax_table.keys():
if not reop.syntax_table[char] & reop.word:
1997-07-15 14:38:13 +00:00
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'W', index + 1
elif pattern[index] == 's':
if context == NORMAL:
1997-07-17 22:38:10 +00:00
return SYNTAX, reop.whitespace, index + 1
1997-07-15 14:38:13 +00:00
elif context == CHARCLASS:
set = []
1997-07-17 22:38:10 +00:00
for char in reop.syntax_table.keys():
if reop.syntax_table[char] & reop.whitespace:
1997-07-15 14:38:13 +00:00
set.append(char)
return SET, set, index + 1
else:
return CHAR, 's', index + 1
elif pattern[index] == 'S':
if context == NORMAL:
1997-07-17 22:38:10 +00:00
return NOT_SYNTAX, reop.whitespace, index + 1
1997-07-15 14:38:13 +00:00
elif context == CHARCLASS:
set = []
1997-07-17 22:38:10 +00:00
for char in reop.syntax_table.keys():
if not reop.syntax_table[char] & reop.word:
1997-07-15 14:38:13 +00:00
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'S', index + 1
elif pattern[index] == 'd':
if context == NORMAL:
1997-07-17 22:38:10 +00:00
return SYNTAX, reop.digit, index + 1
1997-07-15 14:38:13 +00:00
elif context == CHARCLASS:
set = []
1997-07-17 22:38:10 +00:00
for char in reop.syntax_table.keys():
if reop.syntax_table[char] & reop.digit:
1997-07-15 14:38:13 +00:00
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'd', index + 1
elif pattern[index] == 'D':
if context == NORMAL:
1997-07-17 22:38:10 +00:00
return NOT_SYNTAX, reop.digit, index + 1
1997-07-15 14:38:13 +00:00
elif context == CHARCLASS:
set = []
1997-07-17 22:38:10 +00:00
for char in reop.syntax_table.keys():
if not reop.syntax_table[char] & reop.digit:
1997-07-15 14:38:13 +00:00
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'D', index + 1
elif pattern[index] in '0123456789':
1997-07-15 18:11:42 +00:00
if pattern[index] == '0':
if (index + 1 < len(pattern)) and \
(pattern[index + 1] in string.octdigits):
if (index + 2 < len(pattern)) and \
(pattern[index + 2] in string.octdigits):
value = string.atoi(pattern[index:index + 3], 8)
index = index + 3
1997-07-15 14:38:13 +00:00
1997-07-15 18:11:42 +00:00
else:
value = string.atoi(pattern[index:index + 2], 8)
index = index + 2
1997-07-15 14:38:13 +00:00
1997-07-15 18:11:42 +00:00
else:
value = 0
index = index + 1
if value > 255:
raise error, 'octal value out of range'
1997-07-15 14:38:13 +00:00
1997-07-15 18:11:42 +00:00
return CHAR, chr(value), index
1997-07-15 14:38:13 +00:00
else:
1997-07-15 18:11:42 +00:00
if (index + 1 < len(pattern)) and \
(pattern[index + 1] in string.digits):
if (index + 2 < len(pattern)) and \
(pattern[index + 2] in string.octdigits) and \
(pattern[index + 1] in string.octdigits) and \
(pattern[index] in string.octdigits):
value = string.atoi(pattern[index:index + 3], 8)
if value > 255:
raise error, 'octal value out of range'
return CHAR, chr(value), index + 3
else:
value = string.atoi(pattern[index:index + 2])
if (value < 1) or (value > 99):
raise error, 'memory reference out of range'
if context == CHARCLASS:
raise error, ('cannot reference a register from '
'inside a character class')
return MEMORY_REFERENCE, value, index + 2
else:
if context == CHARCLASS:
raise error, ('cannot reference a register from '
'inside a character class')
value = string.atoi(pattern[index])
return MEMORY_REFERENCE, value, index + 1
elif pattern[index] == 'g':
if context != REPLACEMENT:
return CHAR, 'g', index + 1
1997-07-15 14:38:13 +00:00
index = index + 1
if index >= len(pattern):
raise error, 'unfinished symbolic reference'
if pattern[index] != '<':
raise error, 'missing < in symbolic reference'
index = index + 1
end = string.find(pattern, '>', index)
if end == -1:
raise error, 'unfinished symbolic reference'
value = pattern[index:end]
if not valid_identifier(value):
raise error, 'illegal symbolic reference'
return MEMORY_REFERENCE, value, end + 1
1997-07-15 14:38:13 +00:00
else:
return CHAR, pattern[index], index + 1
def compile(pattern, flags=0):
stack = []
label = 0
register = 1
groupindex = {}
1997-07-17 22:38:10 +00:00
lastop = ''
1997-07-17 22:38:10 +00:00
# look for embedded pattern modifiers at the beginning of the pattern
index = 0
1997-07-17 22:38:10 +00:00
if len(pattern) >= 3 and \
(pattern[:2] == '(?') and \
(pattern[2] in 'iImMsSxX'):
index = 2
while (index < len(pattern)) and (pattern[index] != ')'):
if pattern[index] in 'iI':
flags = flags | IGNORECASE
elif pattern[index] in 'mM':
flags = flags | MULTILINE
elif pattern[index] in 'sS':
flags = flags | DOTALL
elif pattern[index] in 'xX':
flags = flags | VERBOSE
else:
raise error, 'unknown modifier'
index = index + 1
index = index + 1
# compile the rest of the pattern
while (index < len(pattern)):
char = pattern[index]
index = index + 1
if char == '\\':
1997-07-15 14:38:13 +00:00
escape_type, value, index = expand_escape(pattern, index)
1997-07-15 14:38:13 +00:00
if escape_type == CHAR:
1997-08-13 22:34:14 +00:00
stack.append([Exact(value, flags)])
1997-07-17 22:38:10 +00:00
lastop = '\\' + value
1997-07-15 14:38:13 +00:00
elif escape_type == MEMORY_REFERENCE:
if value >= register:
raise error, ('cannot reference a register '
'not yet used')
stack.append([MatchMemory(value)])
1997-07-17 22:38:10 +00:00
lastop = '\\1'
1997-07-15 14:38:13 +00:00
elif escape_type == BEGINNING_OF_BUFFER:
stack.append([BegBuf()])
1997-07-17 22:38:10 +00:00
lastop = '\\A'
1997-07-15 14:38:13 +00:00
elif escape_type == END_OF_BUFFER:
stack.append([EndBuf()])
1997-07-17 22:38:10 +00:00
lastop = '\\Z'
1997-07-15 14:38:13 +00:00
elif escape_type == WORD_BOUNDARY:
stack.append([WordBound()])
1997-07-17 22:38:10 +00:00
lastop = '\\b'
1997-07-15 14:38:13 +00:00
elif escape_type == NOT_WORD_BOUNDARY:
stack.append([NotWordBound()])
1997-07-17 22:38:10 +00:00
lastop = '\\B'
1997-07-15 14:38:13 +00:00
elif escape_type == SYNTAX:
stack.append([SyntaxSpec(value)])
1997-07-17 22:38:10 +00:00
if value == reop.word:
lastop = '\\w'
elif value == reop.whitespace:
lastop = '\\s'
elif value == reop.digit:
lastop = '\\d'
else:
lastop = '\\?'
1997-07-15 14:38:13 +00:00
elif escape_type == NOT_SYNTAX:
stack.append([NotSyntaxSpec(value)])
1997-07-17 22:38:10 +00:00
if value == reop.word:
lastop = '\\W'
elif value == reop.whitespace:
lastop = '\\S'
elif value == reop.digit:
lastop = '\\D'
else:
lastop = '\\?'
1997-07-15 14:38:13 +00:00
elif escape_type == SET:
raise error, 'cannot use set escape type here'
else:
1997-07-15 14:38:13 +00:00
raise error, 'unknown escape type'
elif char == '|':
expr = []
1997-07-15 14:38:13 +00:00
while (len(stack) != 0) and \
(stack[-1][0].name != '(') and \
(stack[-1][0].name != '|'):
expr = stack[-1] + expr
del stack[-1]
stack.append([FailureJump(label)] + \
expr + \
[Jump(-1),
Label(label)])
1997-07-11 20:48:25 +00:00
stack.append([Alternation()])
label = label + 1
1997-07-17 22:38:10 +00:00
lastop = '|'
elif char == '(':
if index >= len(pattern):
raise error, 'no matching close paren'
elif pattern[index] == '?':
# Perl style (?...) extensions
index = index + 1
if index >= len(pattern):
raise error, 'extension ends prematurely'
elif pattern[index] == 'P':
# Python extensions
index = index + 1
if index >= len(pattern):
raise error, 'extension ends prematurely'
elif pattern[index] == '<':
1997-07-15 15:38:20 +00:00
# Handle Python symbolic group names (?P<...>...)
index = index + 1
end = string.find(pattern, '>', index)
if end == -1:
raise error, 'no end to symbolic group name'
name = pattern[index:end]
1997-07-15 15:38:20 +00:00
if not valid_identifier(name):
raise error, ('symbolic group name must be a '
'valid identifier')
index = end + 1
groupindex[name] = register
stack.append([OpenParen(register)])
register = register + 1
1997-07-17 22:38:10 +00:00
lastop = '('
elif pattern[index] == '=':
# backreference to symbolic group name
if index >= len(pattern):
raise error, '(?P= at the end of the pattern'
start = index + 1
end = string.find(pattern, ')', start)
if end == -1:
raise error, 'no ) to end symbolic group name'
name = pattern[start:end]
1997-07-15 15:38:20 +00:00
if name not in groupindex.keys():
raise error, ('symbolic group name ' + name + \
' has not been used yet')
stack.append([MatchMemory(groupindex[name])])
index = end + 1
1997-07-17 22:38:10 +00:00
lastop = '(?P=)'
else:
1997-07-11 20:48:25 +00:00
raise error, ('unknown Python extension: ' + \
pattern[index])
elif pattern[index] == ':':
# grouping, but no registers
index = index + 1
1997-07-11 20:48:25 +00:00
stack.append([OpenParen(-1)])
1997-07-17 22:38:10 +00:00
lastop = '('
elif pattern[index] == '#':
# comment
index = index + 1
end = string.find(pattern, ')', index)
if end == -1:
raise error, 'no end to comment'
index = end + 1
1997-07-17 22:38:10 +00:00
# do not change lastop
elif pattern[index] == '=':
raise error, ('zero-width positive lookahead '
'assertion is unsupported')
elif pattern[index] == '!':
raise error, ('zero-width negative lookahead '
'assertion is unsupported')
elif pattern[index] in 'iImMsSxX':
1997-07-17 22:38:10 +00:00
raise error, ('embedded pattern modifiers are only '
'allowed at the beginning of the pattern')
else:
raise error, 'unknown extension'
else:
stack.append([OpenParen(register)])
register = register + 1
1997-07-17 22:38:10 +00:00
lastop = '('
elif char == ')':
# make one expression out of everything on the stack up to
# the marker left by the last parenthesis
expr = []
while (len(stack) > 0) and (stack[-1][0].name != '('):
expr = stack[-1] + expr
del stack[-1]
if len(stack) == 0:
raise error, 'too many close parens'
1997-07-15 14:38:13 +00:00
# remove markers left by alternation
expr = filter(lambda x: x.name != '|', expr)
# clean up jumps inserted by alternation
need_label = 0
for i in range(len(expr)):
if (expr[i].name == 'jump') and (expr[i].label == -1):
1997-07-15 14:38:13 +00:00
expr[i] = Jump(label)
need_label = 1
if need_label:
expr.append(Label(label))
label = label + 1
if stack[-1][0].register > 0:
expr = [StartMemory(stack[-1][0].register)] + \
expr + \
[EndMemory(stack[-1][0].register)]
del stack[-1]
stack.append(expr)
1997-07-17 22:38:10 +00:00
lastop = ')'
elif char == '{':
if len(stack) == 0:
raise error, 'no expression to repeat'
end = string.find(pattern, '}', index)
if end == -1:
raise error, ('no close curly bracket to match'
' open curly bracket')
fields = map(string.strip,
string.split(pattern[index:end], ','))
index = end + 1
minimal = 0
if (index < len(pattern)) and (pattern[index] == '?'):
minimal = 1
index = index + 1
if len(fields) == 1:
# {n} or {n}? (there's really no difference)
try:
count = string.atoi(fields[0])
except ValueError:
raise error, ('count must be an integer '
'inside curly braces')
if count > 65535:
raise error, 'repeat count out of range'
expr = []
while count > 0:
expr = expr + stack[-1]
count = count - 1
del stack[-1]
stack.append(expr)
1997-07-17 22:38:10 +00:00
if minimal:
lastop = '{n}?'
else:
lastop = '{n}'
elif len(fields) == 2:
# {n,} or {n,m}
if fields[1] == '':
# {n,}
try:
min = string.atoi(fields[0])
except ValueError:
raise error, ('minimum must be an integer '
'inside curly braces')
if min > 65535:
raise error, 'minimum repeat count out of range'
expr = []
while min > 0:
expr = expr + stack[-1]
min = min - 1
if minimal:
expr = expr + \
([Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1),
FailureJump(label)])
1997-07-17 22:38:10 +00:00
lastop = '{n,}?'
else:
expr = expr + \
([Label(label),
FailureJump(label + 1)] +
stack[-1] +
[StarJump(label),
Label(label + 1)])
1997-07-17 22:38:10 +00:00
lastop = '{n,}'
del stack[-1]
stack.append(expr)
label = label + 2
else:
# {n,m}
try:
min = string.atoi(fields[0])
except ValueError:
raise error, ('minimum must be an integer '
'inside curly braces')
try:
max = string.atoi(fields[1])
except ValueError:
raise error, ('maximum must be an integer '
'inside curly braces')
if min > 65535:
raise error, ('minumim repeat count out '
'of range')
if max > 65535:
raise error, ('maximum repeat count out '
'of range')
if min > max:
raise error, ('minimum repeat count must be '
'less than the maximum '
'repeat count')
expr = []
while min > 0:
expr = expr + stack[-1]
min = min - 1
max = max - 1
if minimal:
while max > 0:
expr = expr + \
[FailureJump(label),
Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1)]
1997-07-15 18:59:04 +00:00
max = max - 1
label = label + 2
del stack[-1]
stack.append(expr)
1997-07-17 22:38:10 +00:00
lastop = '{n,m}?'
else:
while max > 0:
expr = expr + \
[FailureJump(label)] + \
stack[-1]
max = max - 1
del stack[-1]
stack.append(expr + [Label(label)])
label = label + 1
1997-07-17 22:38:10 +00:00
lastop = '{n,m}'
else:
raise error, ('there need to be one or two fields '
'in a {} expression')
elif char == '}':
raise error, 'unbalanced close curly brace'
elif char == '*':
# Kleene closure
if len(stack) == 0:
1997-07-11 21:10:17 +00:00
raise error, '* needs something to repeat'
1997-07-17 22:38:10 +00:00
if lastop in ['(', '|']:
1997-07-11 21:10:17 +00:00
raise error, '* needs something to repeat'
1997-07-17 22:38:10 +00:00
if lastop in repetition_operators:
raise error, 'nested repetition operators'
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy matching
expr = [Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1),
FailureJump(label)]
index = index + 1
1997-07-17 22:38:10 +00:00
lastop = '*?'
else:
# greedy matching
expr = [Label(label),
FailureJump(label + 1)] + \
stack[-1] + \
[StarJump(label),
Label(label + 1)]
1997-07-17 22:38:10 +00:00
lastop = '*'
del stack[-1]
stack.append(expr)
label = label + 2
elif char == '+':
# positive closure
if len(stack) == 0:
1997-07-11 21:10:17 +00:00
raise error, '+ needs something to repeat'
1997-07-17 22:38:10 +00:00
if lastop in ['(', '|']:
1997-07-11 21:10:17 +00:00
raise error, '+ needs something to repeat'
1997-07-17 22:38:10 +00:00
if lastop in repetition_operators:
raise error, 'nested repetition operators'
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy
expr = [Label(label)] + \
stack[-1] + \
[FailureJump(label)]
label = label + 1
index = index + 1
1997-07-17 22:38:10 +00:00
lastop = '+?'
else:
# greedy
expr = [DummyFailureJump(label + 1),
Label(label),
FailureJump(label + 2),
Label(label + 1)] + \
stack[-1] + \
[StarJump(label),
Label(label + 2)]
label = label + 3
1997-07-17 22:38:10 +00:00
lastop = '+'
del stack[-1]
stack.append(expr)
elif char == '?':
if len(stack) == 0:
raise error, 'need something to be optional'
1997-07-17 22:38:10 +00:00
if len(stack) == 0:
raise error, '? needs something to repeat'
if lastop in ['(', '|']:
raise error, '? needs something to repeat'
if lastop in repetition_operators:
raise error, 'nested repetition operators'
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy matching
expr = [FailureJump(label),
Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1)]
label = label + 2
index = index + 1
1997-07-17 22:38:10 +00:00
lastop = '??'
else:
# greedy matching
expr = [FailureJump(label)] + \
stack[-1] + \
[Label(label)]
label = label + 1
1997-07-17 22:38:10 +00:00
lastop = '?'
del stack[-1]
stack.append(expr)
elif char == '.':
if flags & DOTALL:
1997-08-13 22:34:14 +00:00
stack.append([Set(map(chr, range(256)), flags)])
else:
stack.append([AnyChar()])
1997-07-17 22:38:10 +00:00
lastop = '.'
elif char == '^':
if flags & MULTILINE:
stack.append([Bol()])
else:
stack.append([BegBuf()])
1997-07-17 22:38:10 +00:00
lastop = '^'
elif char == '$':
if flags & MULTILINE:
stack.append([Eol()])
else:
stack.append([EndBuf()])
1997-07-17 22:38:10 +00:00
lastop = '$'
elif char == '#':
if flags & VERBOSE:
# comment
index = index + 1
end = string.find(pattern, '\n', index)
if end == -1:
index = len(pattern)
else:
index = end + 1
1997-07-17 22:38:10 +00:00
# do not change lastop
else:
1997-08-13 22:34:14 +00:00
stack.append([Exact(char, flags)])
1997-07-17 22:38:10 +00:00
lastop = '#'
elif char in string.whitespace:
1997-07-15 14:38:13 +00:00
if not (flags & VERBOSE):
1997-08-13 22:34:14 +00:00
stack.append([Exact(char, flags)])
1997-07-17 22:38:10 +00:00
lastop = char
elif char == '[':
1997-07-15 15:38:20 +00:00
# compile character class
if index >= len(pattern):
1997-07-15 15:38:20 +00:00
raise error, 'unclosed character class'
negate = 0
last = ''
set = []
1997-07-15 14:38:13 +00:00
if pattern[index] == '^':
negate = 1
index = index + 1
1997-07-15 15:38:20 +00:00
if index >= len(pattern):
raise error, 'unclosed character class'
1997-07-15 14:38:13 +00:00
1997-07-15 15:38:20 +00:00
if pattern[index] == ']':
set.append(']')
index = index + 1
if index >= len(pattern):
raise error, 'unclosed character class'
elif pattern[index] == '-':
set.append('-')
index = index + 1
if index >= len(pattern):
raise error, 'unclosed character class'
while (index < len(pattern)) and (pattern[index] != ']'):
next = pattern[index]
index = index + 1
if next == '-':
1997-07-15 15:38:20 +00:00
if index >= len(pattern):
raise error, 'incomplete range in character class'
1997-07-15 14:38:13 +00:00
1997-07-15 15:38:20 +00:00
elif pattern[index] == ']':
set.append('-')
else:
if last == '':
raise error, ('improper use of range in '
'character class')
1997-07-15 14:38:13 +00:00
1997-07-15 15:38:20 +00:00
start = last
if pattern[index] == '\\':
escape_type,
value,
index = expand_escape(pattern,
index + 1,
CHARCLASS)
if escape_type == CHAR:
end = value
else:
raise error, ('illegal escape in character '
'class range')
1997-07-15 14:38:13 +00:00
else:
1997-07-15 15:38:20 +00:00
end = pattern[index]
index = index + 1
if start > end:
raise error, ('range arguments out of order '
'in character class')
1997-07-15 14:38:13 +00:00
1997-07-15 15:38:20 +00:00
for char in map(chr, range(ord(start), ord(end) + 1)):
if char not in set:
set.append(char)
1997-07-15 14:38:13 +00:00
1997-07-15 15:38:20 +00:00
last = ''
elif next == '\\':
# expand syntax meta-characters and add to set
if index >= len(pattern):
raise error, 'incomplete set'
1997-07-15 14:38:13 +00:00
escape_type, value, index = expand_escape(pattern,
index,
CHARCLASS)
if escape_type == CHAR:
set.append(value)
last = value
elif escape_type == SET:
for char in value:
if char not in set:
set.append(char)
last = ''
else:
1997-07-15 14:38:13 +00:00
raise error, 'illegal escape type in character class'
else:
if next not in set:
set.append(next)
last = next
1997-07-15 14:38:13 +00:00
if (index >= len(pattern)) or ( pattern[index] != ']'):
raise error, 'incomplete set'
index = index + 1
if negate:
1997-08-13 22:34:14 +00:00
# If case is being ignored, then both upper- and lowercase
# versions of the letters must be excluded.
if flags & IGNORECASE: set=set+map(string.upper, set)
notset = []
for char in map(chr, range(256)):
if char not in set:
notset.append(char)
1997-07-15 14:38:13 +00:00
if len(notset) == 0:
raise error, 'empty negated set'
1997-08-13 22:34:14 +00:00
stack.append([Set(notset, flags)])
else:
1997-07-15 14:38:13 +00:00
if len(set) == 0:
raise error, 'empty set'
1997-08-13 22:34:14 +00:00
stack.append([Set(set, flags)])
1997-07-17 22:38:10 +00:00
lastop = '[]'
else:
1997-08-13 22:34:14 +00:00
stack.append([Exact(char, flags)])
1997-07-17 22:38:10 +00:00
lastop = char
code = []
while len(stack) > 0:
if stack[-1][0].name == '(':
raise error, 'too many open parens'
code = stack[-1] + code
del stack[-1]
if len(code) == 0:
raise error, 'no code generated'
code = filter(lambda x: x.name != '|', code)
need_label = 0
for i in range(len(code)):
if (code[i].name == 'jump') and (code[i].label == -1):
code[i] = Jump(label)
need_label = 1
if need_label:
code.append(Label(label))
label = label + 1
code.append(End())
1997-08-13 22:34:14 +00:00
# print code
return RegexObject(pattern, flags, code, register, groupindex)
# Replace expand_escape and _expand functions with their C equivalents.
# If you suspect bugs in the C versions, comment out the next two lines
1997-08-14 21:23:07 +00:00
expand_escape = reop.expand_escape
_expand = reop._expand