- fixed split

(test_sre still complains about split, but that's caused by
  the group reset bug, not split itself)

- added more mark slots
  (should be dynamically allocated, but room for 100 groups is
  better than 32, and checking for the upper limit is better
  than overwriting memory ;-)

- internal: renamed the cursor helper class

- internal: removed some bloat from sre_compile
This commit is contained in:
Fredrik Lundh 2000-06-29 16:57:40 +00:00
parent 69218178ec
commit be2211e940
4 changed files with 116 additions and 130 deletions

View File

@ -26,7 +26,7 @@
U = UNICODE = sre_compile.SRE_FLAG_UNICODE U = UNICODE = sre_compile.SRE_FLAG_UNICODE
# sre exception # sre exception
error = sre_parse.error error = sre_compile.error
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# public interface # public interface
@ -105,7 +105,7 @@ def filter(match, template=template):
n = i = 0 n = i = 0
s = [] s = []
append = s.append append = s.append
c = pattern.cursor(string) c = pattern.scanner(string)
while not count or n < count: while not count or n < count:
m = c.search() m = c.search()
if not m: if not m:
@ -127,16 +127,20 @@ def _split(pattern, string, maxsplit=0):
n = i = 0 n = i = 0
s = [] s = []
append = s.append append = s.append
c = pattern.cursor(string) extend = s.extend
c = pattern.scanner(string)
g = c.groups
while not maxsplit or n < maxsplit: while not maxsplit or n < maxsplit:
m = c.search() m = c.search()
if not m: if not m:
break break
j = m.start() b, e = m.span()
append(string[i:j]) if e == i:
i = m.end() continue
if i <= j: append(string[i:b])
break if g and b != e:
extend(m.groups())
i = e
n = n + 1 n = n + 1
if i < len(string): if i < len(string):
append(string[i:]) append(string[i:])

View File

@ -11,8 +11,7 @@
# other compatibility work. # other compatibility work.
# #
import array, string, sys import array
import _sre import _sre
from sre_constants import * from sre_constants import *
@ -24,123 +23,101 @@
else: else:
raise RuntimeError, "cannot find a useable array type" raise RuntimeError, "cannot find a useable array type"
# FIXME: <fl> should move some optimizations from the parser to here!
class Code:
def __init__(self):
self.data = []
def __len__(self):
return len(self.data)
def __getitem__(self, index):
return self.data[index]
def __setitem__(self, index, code):
self.data[index] = code
def append(self, code):
self.data.append(code)
def todata(self):
# print self.data
try:
return array.array(WORDSIZE, self.data).tostring()
except OverflowError:
print self.data
raise
def _compile(code, pattern, flags): def _compile(code, pattern, flags):
append = code.append emit = code.append
for op, av in pattern: for op, av in pattern:
if op is ANY: if op is ANY:
if flags & SRE_FLAG_DOTALL: if flags & SRE_FLAG_DOTALL:
append(OPCODES[op]) # any character at all! emit(OPCODES[op])
else: else:
append(OPCODES[CATEGORY]) emit(OPCODES[CATEGORY])
append(CHCODES[CATEGORY_NOT_LINEBREAK]) emit(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (SUCCESS, FAILURE): elif op in (SUCCESS, FAILURE):
append(OPCODES[op]) emit(OPCODES[op])
elif op is AT: elif op is AT:
append(OPCODES[op]) emit(OPCODES[op])
if flags & SRE_FLAG_MULTILINE: if flags & SRE_FLAG_MULTILINE:
append(ATCODES[AT_MULTILINE[av]]) emit(ATCODES[AT_MULTILINE[av]])
else: else:
append(ATCODES[av]) emit(ATCODES[av])
elif op is BRANCH: elif op is BRANCH:
append(OPCODES[op]) emit(OPCODES[op])
tail = [] tail = []
for av in av[1]: for av in av[1]:
skip = len(code); append(0) skip = len(code); emit(0)
_compile(code, av, flags) _compile(code, av, flags)
## append(OPCODES[SUCCESS]) emit(OPCODES[JUMP])
append(OPCODES[JUMP]) tail.append(len(code)); emit(0)
tail.append(len(code)); append(0)
code[skip] = len(code) - skip code[skip] = len(code) - skip
append(0) # end of branch emit(0) # end of branch
for tail in tail: for tail in tail:
code[tail] = len(code) - tail code[tail] = len(code) - tail
elif op is CALL: elif op is CALL:
append(OPCODES[op]) emit(OPCODES[op])
skip = len(code); append(0) skip = len(code); emit(0)
_compile(code, av, flags) _compile(code, av, flags)
append(OPCODES[SUCCESS]) emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
elif op is CATEGORY: elif op is CATEGORY:
append(OPCODES[op]) emit(OPCODES[op])
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]]) emit(CH_LOCALE[CHCODES[av]])
elif flags & SRE_FLAG_UNICODE: elif flags & SRE_FLAG_UNICODE:
append(CH_UNICODE[CHCODES[av]]) emit(CH_UNICODE[CHCODES[av]])
else: else:
append(CHCODES[av]) emit(CHCODES[av])
elif op is GROUP: elif op is GROUP:
if flags & SRE_FLAG_IGNORECASE: if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]]) emit(OPCODES[OP_IGNORE[op]])
else: else:
append(OPCODES[op]) emit(OPCODES[op])
append(av-1) emit(av-1)
elif op is IN: elif op is IN:
if flags & SRE_FLAG_IGNORECASE: if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]]) emit(OPCODES[OP_IGNORE[op]])
def fixup(literal, flags=flags): def fixup(literal, flags=flags):
return _sre.getlower(ord(literal), flags) return _sre.getlower(ord(literal), flags)
else: else:
append(OPCODES[op]) emit(OPCODES[op])
fixup = ord fixup = ord
skip = len(code); append(0) skip = len(code); emit(0)
for op, av in av: for op, av in av:
append(OPCODES[op]) emit(OPCODES[op])
if op is NEGATE: if op is NEGATE:
pass pass
elif op is LITERAL: elif op is LITERAL:
append(fixup(av)) emit(fixup(av))
elif op is RANGE: elif op is RANGE:
append(fixup(av[0])) emit(fixup(av[0]))
append(fixup(av[1])) emit(fixup(av[1]))
elif op is CATEGORY: elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]]) emit(CH_LOCALE[CHCODES[av]])
elif flags & SRE_FLAG_UNICODE: elif flags & SRE_FLAG_UNICODE:
append(CH_UNICODE[CHCODES[av]]) emit(CH_UNICODE[CHCODES[av]])
else: else:
append(CHCODES[av]) emit(CHCODES[av])
else: else:
raise ValueError, "unsupported set operator" raise error, "internal: unsupported set operator"
append(OPCODES[FAILURE]) emit(OPCODES[FAILURE])
code[skip] = len(code) - skip code[skip] = len(code) - skip
elif op in (LITERAL, NOT_LITERAL): elif op in (LITERAL, NOT_LITERAL):
if flags & SRE_FLAG_IGNORECASE: if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]]) emit(OPCODES[OP_IGNORE[op]])
else: else:
append(OPCODES[op]) emit(OPCODES[op])
append(ord(av)) emit(ord(av))
elif op is MARK: elif op is MARK:
append(OPCODES[op]) emit(OPCODES[op])
append(av) emit(av)
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
if flags & SRE_FLAG_TEMPLATE: if flags & SRE_FLAG_TEMPLATE:
append(OPCODES[REPEAT]) emit(OPCODES[REPEAT])
skip = len(code); append(0) skip = len(code); emit(0)
append(av[0]) emit(av[0])
append(av[1]) emit(av[1])
_compile(code, av[2], flags) _compile(code, av[2], flags)
append(OPCODES[SUCCESS]) emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
else: else:
lo, hi = av[2].getwidth() lo, hi = av[2].getwidth()
@ -149,35 +126,35 @@ def fixup(literal, flags=flags):
if 0 and lo == hi == 1 and op is MAX_REPEAT: if 0 and lo == hi == 1 and op is MAX_REPEAT:
# FIXME: <fl> need a better way to figure out when # FIXME: <fl> need a better way to figure out when
# it's safe to use this one (in the parser, probably) # it's safe to use this one (in the parser, probably)
append(OPCODES[MAX_REPEAT_ONE]) emit(OPCODES[MAX_REPEAT_ONE])
skip = len(code); append(0) skip = len(code); emit(0)
append(av[0]) emit(av[0])
append(av[1]) emit(av[1])
_compile(code, av[2], flags) _compile(code, av[2], flags)
append(OPCODES[SUCCESS]) emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
else: else:
append(OPCODES[op]) emit(OPCODES[op])
skip = len(code); append(0) skip = len(code); emit(0)
append(av[0]) emit(av[0])
append(av[1]) emit(av[1])
_compile(code, av[2], flags) _compile(code, av[2], flags)
append(OPCODES[SUCCESS]) emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip code[skip] = len(code) - skip
elif op is SUBPATTERN: elif op is SUBPATTERN:
group = av[0] group = av[0]
if group: if group:
append(OPCODES[MARK]) emit(OPCODES[MARK])
append((group-1)*2) emit((group-1)*2)
_compile(code, av[1], flags) _compile(code, av[1], flags)
if group: if group:
append(OPCODES[MARK]) emit(OPCODES[MARK])
append((group-1)*2+1) emit((group-1)*2+1)
else: else:
raise ValueError, ("unsupported operand type", op) raise ValueError, ("unsupported operand type", op)
def compile(p, flags=0): def compile(p, flags=0):
# convert pattern list to internal format # internal: convert pattern list to internal format
if type(p) in (type(""), type(u"")): if type(p) in (type(""), type(u"")):
import sre_parse import sre_parse
pattern = p pattern = p
@ -185,18 +162,14 @@ def compile(p, flags=0):
else: else:
pattern = None pattern = None
flags = p.pattern.flags | flags flags = p.pattern.flags | flags
code = Code() code = []
_compile(code, p.data, flags) _compile(code, p.data, flags)
code.append(OPCODES[SUCCESS]) code.append(OPCODES[SUCCESS])
data = code.todata() # FIXME: <fl> get rid of this limitation
if 0: # debugging assert p.pattern.groups <= 100,\
print "sorry, but this version only supports 100 named groups"
print "-" * 68
import sre_disasm
sre_disasm.disasm(data)
print "-" * 68
return _sre.compile( return _sre.compile(
pattern, flags, pattern, flags,
data, array.array(WORDSIZE, code).tostring(),
p.pattern.groups-1, p.pattern.groupdict p.pattern.groups-1, p.pattern.groupdict
) )

View File

@ -14,11 +14,12 @@
* 00-03-06 fl first alpha, sort of (0.5) * 00-03-06 fl first alpha, sort of (0.5)
* 00-03-14 fl removed most compatibility stuff (0.6) * 00-03-14 fl removed most compatibility stuff (0.6)
* 00-05-10 fl towards third alpha (0.8.2) * 00-05-10 fl towards third alpha (0.8.2)
* 00-05-13 fl added experimental cursor stuff (0.8.3) * 00-05-13 fl added experimental scanner stuff (0.8.3)
* 00-05-27 fl final bug hunt (0.8.4) * 00-05-27 fl final bug hunt (0.8.4)
* 00-06-21 fl less bugs, more taste (0.8.5) * 00-06-21 fl less bugs, more taste (0.8.5)
* 00-06-25 fl major changes to better deal with nested repeats (0.9) * 00-06-25 fl major changes to better deal with nested repeats (0.9)
* 00-06-28 fl fixed findall (0.9.1) * 00-06-28 fl fixed findall (0.9.1)
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
* *
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. * Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
* *
@ -384,7 +385,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
int i, count; int i, count;
/* FIXME: this is a hack! */ /* FIXME: this is a hack! */
void* mark_copy[64]; void* mark_copy[SRE_MARK_SIZE];
void* mark = NULL; void* mark = NULL;
TRACE(("%8d: enter\n", PTR(ptr))); TRACE(("%8d: enter\n", PTR(ptr)));
@ -954,7 +955,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
staticforward PyTypeObject Pattern_Type; staticforward PyTypeObject Pattern_Type;
staticforward PyTypeObject Match_Type; staticforward PyTypeObject Match_Type;
staticforward PyTypeObject Cursor_Type; staticforward PyTypeObject Scanner_Type;
static PyObject * static PyObject *
_compile(PyObject* self_, PyObject* args) _compile(PyObject* self_, PyObject* args)
@ -1074,7 +1075,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
state->lastmark = 0; state->lastmark = 0;
/* FIXME: dynamic! */ /* FIXME: dynamic! */
for (i = 0; i < 64; i++) for (i = 0; i < SRE_MARK_SIZE; i++)
state->mark[i] = NULL; state->mark[i] = NULL;
state->stack = NULL; state->stack = NULL;
@ -1176,15 +1177,15 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
} }
static PyObject* static PyObject*
pattern_cursor(PatternObject* pattern, PyObject* args) pattern_scanner(PatternObject* pattern, PyObject* args)
{ {
/* create search state object */ /* create search state object */
CursorObject* self; ScannerObject* self;
PyObject* string; PyObject* string;
/* create match object (with room for extra group marks) */ /* create match object (with room for extra group marks) */
self = PyObject_NEW(CursorObject, &Cursor_Type); self = PyObject_NEW(ScannerObject, &Scanner_Type);
if (self == NULL) if (self == NULL)
return NULL; return NULL;
@ -1431,7 +1432,7 @@ static PyMethodDef pattern_methods[] = {
{"split", (PyCFunction) pattern_split, 1}, {"split", (PyCFunction) pattern_split, 1},
{"findall", (PyCFunction) pattern_findall, 1}, {"findall", (PyCFunction) pattern_findall, 1},
/* experimental */ /* experimental */
{"cursor", (PyCFunction) pattern_cursor, 1}, {"scanner", (PyCFunction) pattern_scanner, 1},
{NULL, NULL} {NULL, NULL}
}; };
@ -1467,7 +1468,7 @@ pattern_getattr(PatternObject* self, char* name)
statichere PyTypeObject Pattern_Type = { statichere PyTypeObject Pattern_Type = {
PyObject_HEAD_INIT(NULL) PyObject_HEAD_INIT(NULL)
0, "Pattern", sizeof(PatternObject), 0, 0, "SRE_Pattern", sizeof(PatternObject), 0,
(destructor)pattern_dealloc, /*tp_dealloc*/ (destructor)pattern_dealloc, /*tp_dealloc*/
0, /*tp_print*/ 0, /*tp_print*/
(getattrfunc)pattern_getattr, /*tp_getattr*/ (getattrfunc)pattern_getattr, /*tp_getattr*/
@ -1761,7 +1762,7 @@ match_getattr(MatchObject* self, char* name)
statichere PyTypeObject Match_Type = { statichere PyTypeObject Match_Type = {
PyObject_HEAD_INIT(NULL) PyObject_HEAD_INIT(NULL)
0, "Match", 0, "SRE_Match",
sizeof(MatchObject), /* size of basic object */ sizeof(MatchObject), /* size of basic object */
sizeof(int), /* space for group item */ sizeof(int), /* space for group item */
(destructor)match_dealloc, /*tp_dealloc*/ (destructor)match_dealloc, /*tp_dealloc*/
@ -1770,10 +1771,10 @@ statichere PyTypeObject Match_Type = {
}; };
/* -------------------------------------------------------------------- */ /* -------------------------------------------------------------------- */
/* cursor methods (experimental) */ /* scanner methods (experimental) */
static void static void
cursor_dealloc(CursorObject* self) scanner_dealloc(ScannerObject* self)
{ {
state_fini(&self->state); state_fini(&self->state);
Py_DECREF(self->string); Py_DECREF(self->string);
@ -1782,7 +1783,7 @@ cursor_dealloc(CursorObject* self)
} }
static PyObject* static PyObject*
cursor_match(CursorObject* self, PyObject* args) scanner_match(ScannerObject* self, PyObject* args)
{ {
SRE_STATE* state = &self->state; SRE_STATE* state = &self->state;
PyObject* match; PyObject* match;
@ -1811,7 +1812,7 @@ cursor_match(CursorObject* self, PyObject* args)
static PyObject* static PyObject*
cursor_search(CursorObject* self, PyObject* args) scanner_search(ScannerObject* self, PyObject* args)
{ {
SRE_STATE* state = &self->state; SRE_STATE* state = &self->state;
PyObject* match; PyObject* match;
@ -1830,24 +1831,26 @@ cursor_search(CursorObject* self, PyObject* args)
match = pattern_new_match((PatternObject*) self->pattern, match = pattern_new_match((PatternObject*) self->pattern,
state, self->string, status); state, self->string, status);
if (status >= 0) if (status == 0 || state->ptr == state->start)
state->start = (void*) ((char*) state->ptr + state->charsize);
else
state->start = state->ptr; state->start = state->ptr;
return match; return match;
} }
static PyMethodDef cursor_methods[] = { static PyMethodDef scanner_methods[] = {
{"match", (PyCFunction) cursor_match, 0}, {"match", (PyCFunction) scanner_match, 0},
{"search", (PyCFunction) cursor_search, 0}, {"search", (PyCFunction) scanner_search, 0},
{NULL, NULL} {NULL, NULL}
}; };
static PyObject* static PyObject*
cursor_getattr(CursorObject* self, char* name) scanner_getattr(ScannerObject* self, char* name)
{ {
PyObject* res; PyObject* res;
res = Py_FindMethod(cursor_methods, (PyObject*) self, name); res = Py_FindMethod(scanner_methods, (PyObject*) self, name);
if (res) if (res)
return res; return res;
@ -1859,18 +1862,21 @@ cursor_getattr(CursorObject* self, char* name)
return self->pattern; return self->pattern;
} }
if (!strcmp(name, "groups"))
return Py_BuildValue("i", ((PatternObject*) self->pattern)->groups);
PyErr_SetString(PyExc_AttributeError, name); PyErr_SetString(PyExc_AttributeError, name);
return NULL; return NULL;
} }
statichere PyTypeObject Cursor_Type = { statichere PyTypeObject Scanner_Type = {
PyObject_HEAD_INIT(NULL) PyObject_HEAD_INIT(NULL)
0, "Cursor", 0, "SRE_Scanner",
sizeof(CursorObject), /* size of basic object */ sizeof(ScannerObject), /* size of basic object */
0, 0,
(destructor)cursor_dealloc, /*tp_dealloc*/ (destructor)scanner_dealloc, /*tp_dealloc*/
0, /*tp_print*/ 0, /*tp_print*/
(getattrfunc)cursor_getattr, /*tp_getattr*/ (getattrfunc)scanner_getattr, /*tp_getattr*/
}; };
static PyMethodDef _functions[] = { static PyMethodDef _functions[] = {
@ -1888,7 +1894,7 @@ init_sre()
{ {
/* Patch object types */ /* Patch object types */
Pattern_Type.ob_type = Match_Type.ob_type = Pattern_Type.ob_type = Match_Type.ob_type =
Cursor_Type.ob_type = &PyType_Type; Scanner_Type.ob_type = &PyType_Type;
Py_InitModule("_" MODULE, _functions); Py_InitModule("_" MODULE, _functions);
} }

View File

@ -46,6 +46,9 @@ typedef struct {
void* ptr; void* ptr;
} SRE_STACK; } SRE_STACK;
/* FIXME: <fl> shouldn't be a constant, really... */
#define SRE_MARK_SIZE 200
typedef struct { typedef struct {
/* string pointers */ /* string pointers */
void* ptr; /* current position (also end of current slice) */ void* ptr; /* current position (also end of current slice) */
@ -56,7 +59,7 @@ typedef struct {
int charsize; int charsize;
/* registers */ /* registers */
int lastmark; int lastmark;
void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */ void* mark[SRE_MARK_SIZE];
/* backtracking stack */ /* backtracking stack */
SRE_STACK* stack; SRE_STACK* stack;
int stacksize; int stacksize;
@ -66,11 +69,11 @@ typedef struct {
} SRE_STATE; } SRE_STATE;
typedef struct { typedef struct {
/* search helper */ /* scanner (internal helper object) */
PyObject_HEAD PyObject_HEAD
PyObject* pattern; PyObject* pattern;
PyObject* string; PyObject* string;
SRE_STATE state; SRE_STATE state;
} CursorObject; } ScannerObject;
#endif #endif