New pcre version from AMK

1997-12-08 17:15:20 +00:00 · 1997-12-08 17:15:20 +00:00 · 5070060d40
parent dfa6790bd6
commit 5070060d40
4 changed files with 1815 additions and 1227 deletions
--- a/Modules/pcre-internal.h
+++ b/Modules/pcre-internal.h
@ -3,7 +3,7 @@
 *************************************************/


-#define PCRE_VERSION       "0.95 23-Sep-1997"
+#define PCRE_VERSION       "1.01 19-Nov-1997"


 /* This is a library of functions to support regular expressions whose syntax
@ -34,38 +34,54 @@ computer system, and to redistribute it freely, subject to the following
 /* This header contains definitions that are shared between the different
 modules, but which are not relevant to the outside. */

+
+/* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
+define a macro for memmove() if USE_BCOPY is defined. */
+
+#ifdef USE_BCOPY
+#define memmove(a, b, c) bcopy(b, a, c)
+#endif
+ 
 /* Standard C headers plus the external interface definition */

 #include <ctype.h>
 #include <limits.h>
+#include <setjmp.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "pcre.h"

-/* Private options flags start at the most significant end of the byte. The
-public options defined in pcre.h start at the least significant end. Make sure
-they don't overlap! */
+/* Private options flags start at the most significant end of the two bytes.
+The public options defined in pcre.h start at the least significant end. Make
+sure they don't overlap! */

-#define PCRE_FIRSTSET  0x80          /* first_char is set */
-#define PCRE_STARTLINE 0x40          /* start after \n for multiline */
+#define PCRE_FIRSTSET           0x8000  /* first_char is set */
+#define PCRE_STARTLINE          0x4000  /* start after \n for multiline */
+#define PCRE_COMPILED_CASELESS  0x2000  /* like it says */

 /* Options for the "extra" block produced by pcre_study(). */

 #define PCRE_STUDY_CASELESS 0x01     /* study was caseless */
-#define PCRE_STUDY_MAPPED   0x20     /* a map of starting chars exists */
+#define PCRE_STUDY_MAPPED   0x02     /* a map of starting chars exists */

 /* Masks for identifying the public options: all permitted at compile time,
 only some permitted at run or study time. */

 #ifdef FOR_PYTHON
 #define PUBLIC_OPTIONS \
-  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE|PCRE_DOTALL)
+  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
+   PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_LOCALE)
 #else
 #define PUBLIC_OPTIONS \
-  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE)
+  (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
+   PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA)
 #endif
-#define PUBLIC_EXEC_OPTIONS (PCRE_CASELESS|PCRE_ANCHORED|PCRE_MULTILINE)
+#define PUBLIC_EXEC_OPTIONS \
+  (PCRE_CASELESS|PCRE_ANCHORED|PCRE_MULTILINE|PCRE_NOTBOL|PCRE_NOTEOL| \
+   PCRE_DOTALL|PCRE_DOLLAR_ENDONLY)
+
 #define PUBLIC_STUDY_OPTIONS (PCRE_CASELESS)

 /* Magic number to provide a small check against being handed junk. */
@ -79,26 +95,22 @@ typedef int BOOL;
 #define FALSE   0
 #define TRUE    1

-/* Flags for character classes - see also class_ops table below. */
-
-#define CLASS_DIGITS         0x01
-#define CLASS_NOT_DIGITS     0x02
-#define CLASS_WHITESPACE     0x04
-#define CLASS_NOT_WHITESPACE 0x08
-#define CLASS_WORD           0x10
-#define CLASS_NOT_WORD       0x20
-
 /* These are escaped items that aren't just an encoding of a particular data
 value such as \n. They must have non-zero values, as check_escape() returns
 their negation. Also, they must appear in the same order as in the opcode
 definitions below, up to ESC_Z. The final one must be ESC_REF as subsequent
 values are used for \1, \2, \3, etc. There is a test in the code for an escape
-greater than ESC_b and less than ESC_Z to detect the types that may be
+greater than ESC_b and less than ESC_X to detect the types that may be
 repeated. If any new escapes are put in-between that don't consume a character,
 that code will have to change. */

 enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w,
-       ESC_Z, ESC_REF };
+
+                    /* These are not Perl escapes, so can't appear in the */
+       ESC_X,       /* simple table-lookup because they must be conditional */
+                    /* on PCRE_EXTRA. */
+       ESC_Z,
+       ESC_REF };

 /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
 that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
@ -110,21 +122,28 @@ enum {
  /* Values corresponding to backslashed metacharacters */

  OP_SOD,            /* Start of data: \A */
-  OP_NOT_WORD_BOUNDARY,  /* \W */
-  OP_WORD_BOUNDARY,      /* \w */
+  OP_NOT_WORD_BOUNDARY,  /* \B */
+  OP_WORD_BOUNDARY,      /* \b */
  OP_NOT_DIGIT,          /* \D */
  OP_DIGIT,              /* \d */
  OP_NOT_WHITESPACE,     /* \S */
  OP_WHITESPACE,         /* \s */
  OP_NOT_WORDCHAR,       /* \W */
  OP_WORDCHAR,           /* \w */
+  OP_CUT,            /* The analogue of Prolog's "cut" operation (extension) */
  OP_EOD,            /* End of data: or \Z. This must always be the last
                        of the backslashed meta values. */

+  OP_NOT_WORD_BOUNDARY_L,  /* localized \B */
+  OP_WORD_BOUNDARY_L,      /* localized \b */
+  OP_NOT_WORDCHAR_L,       /* localized \W */
+  OP_WORDCHAR_L,           /* localized \w */
+
  OP_CIRC,           /* Start of line - varies with multiline switch */
  OP_DOLL,           /* End of line - varies with multiline switch */
  OP_ANY,            /* Match any character */
  OP_CHARS,          /* Match string of characters */
+  OP_NOT,            /* Match anything but the following char */

  OP_STAR,           /* The maximizing and minimizing versions of */
  OP_MINSTAR,        /* all these opcodes must come in pairs, with */
@ -132,9 +151,19 @@ enum {
  OP_MINPLUS,        /* This first set applies to single characters */
  OP_QUERY,
  OP_MINQUERY,
-  OP_UPTO,           /* From 0 to n matches. */
+  OP_UPTO,           /* From 0 to n matches */
  OP_MINUPTO,
-  OP_EXACT,          /* Exactly n matches. */
+  OP_EXACT,          /* Exactly n matches */
+
+  OP_NOTSTAR,        /* The maximizing and minimizing versions of */
+  OP_NOTMINSTAR,     /* all these opcodes must come in pairs, with */
+  OP_NOTPLUS,        /* the minimizing one second. */
+  OP_NOTMINPLUS,     /* This set applies to "not" single characters */
+  OP_NOTQUERY,
+  OP_NOTMINQUERY,
+  OP_NOTUPTO,        /* From 0 to n matches */
+  OP_NOTMINUPTO,
+  OP_NOTEXACT,       /* Exactly n matches */

  OP_TYPESTAR,       /* The maximizing and minimizing versions of */
  OP_TYPEMINSTAR,    /* all these opcodes must come in pairs, with */
@ -142,9 +171,9 @@ enum {
  OP_TYPEMINPLUS,    /* be in exactly the same order as those above. */
  OP_TYPEQUERY,      /* This set applies to character types such as \d */
  OP_TYPEMINQUERY,
-  OP_TYPEUPTO,
+  OP_TYPEUPTO,       /* From 0 to n matches */
  OP_TYPEMINUPTO,
-  OP_TYPEEXACT,
+  OP_TYPEEXACT,      /* Exactly n matches */

  OP_CRSTAR,         /* The maximizing and minimizing versions of */
  OP_CRMINSTAR,      /* all these opcodes must come in pairs, with */
@ -152,11 +181,11 @@ enum {
  OP_CRMINPLUS,      /* be in exactly the same order as those above. */
  OP_CRQUERY,        /* These are for character classes and back refs */
  OP_CRMINQUERY,
-  OP_CRRANGE,        /* These are different to the two seta above. */
+  OP_CRRANGE,        /* These are different to the three sets above. */
  OP_CRMINRANGE,

  OP_CLASS,          /* Match a character class */
-  OP_NEGCLASS,       /* Don't match a character class */
+  OP_CLASS_L,        /* Match a localized character class */
  OP_REF,            /* Match a back reference */

  OP_ALT,            /* Start of alternation */
@ -166,6 +195,7 @@ enum {

  OP_ASSERT,
  OP_ASSERT_NOT,
+  OP_ONCE,           /* Once matched, don't back up into the subpattern */

  OP_BRAZERO,        /* These two must remain together and in this */
  OP_BRAMINZERO,     /* order. */
@ -179,6 +209,35 @@ left after OP_BRA, i.e. 255 - OP_BRA. We actually set it somewhat lower. */

 #define EXTRACT_MAX  99

+/* The texts of compile-time error messages are defined as macros here so that
+they can be accessed by the POSIX wrapper and converted into error codes.  Yes,
+I could have used error codes in the first place, but didn't feel like changing
+just to accommodate the POSIX wrapper. */
+
+#define ERR1  "\\ at end of pattern"
+#define ERR2  "\\c at end of pattern"
+#define ERR3  "unrecognized character follows \\"
+#define ERR4  "numbers out of order in {} quantifier"
+#define ERR5  "number too big in {} quantifier"
+#define ERR6  "missing terminating ] for character class"
+#define ERR7  "invalid escape sequence in character class"
+#define ERR8  "range out of order in character class"
+#define ERR9  "nothing to repeat"
+#define ERR10 "operand of unlimited repeat could match the empty string"
+#define ERR11 "internal error: unexpected repeat"
+#define ERR12 "unrecognized character after (?"
+#define ERR13 "too many capturing parenthesized sub-patterns"
+#define ERR14 "missing )"
+#define ERR15 "back reference to non-existent subpattern"
+#define ERR16 "erroffset passed as NULL"
+#define ERR17 "unknown option bit(s) set"
+#define ERR18 "missing ) after comment"
+#define ERR19 "too many sets of parentheses"
+#define ERR20 "regular expression too large"
+#define ERR21 "failed to get memory"
+#define ERR22 "unmatched brackets"
+#define ERR23 "internal error: code overflow"
+
 /* All character handling must be done as unsigned characters. Otherwise there
 are problems with top-bit-set characters and functions such as isspace().
 However, we leave the interface to the outside world as char *, because that
@ -193,8 +252,9 @@ runs on as long as necessary after the end. */

 typedef struct real_pcre {
  unsigned int  magic_number;
-  unsigned char options;
+  unsigned short int options;
  unsigned char top_bracket;
+  unsigned char top_backref;
  unsigned char first_char;
  unsigned char code[1];
 } real_pcre;
@ -206,21 +266,29 @@ typedef struct real_pcre_extra {
  unsigned char start_bits[32];
 } real_pcre_extra;

-/* Global tables from pcre-chartables.c */
+/* Global tables from chartables.c */

 extern uschar pcre_lcc[];
-extern uschar pcre_ucc[];
+extern uschar pcre_fcc[];
+extern uschar pcre_cbits[];
 extern uschar pcre_ctypes[];

 /* Bit definitions for entries in pcre_ctypes[]. */

 #define ctype_space   0x01
-#define ctype_digit   0x02
-#define ctype_xdigit  0x04
-#define ctype_word    0x08   /* alphameric or '_' */
-#ifdef FOR_PYTHON
-#define ctype_odigit  0x10   /* Octal digits */
-#endif
+#define ctype_letter  0x02
+#define ctype_digit   0x04
+#define ctype_xdigit  0x08
+#define ctype_word    0x10   /* alphameric or '_' */
+#define ctype_odigit  0x20   /* octal digit */
 #define ctype_meta    0x80   /* regexp meta char or zero (end pattern) */

-/* End of pcre-internal.h */
+/* Offsets for the bitmap tables */
+
+#define cbit_digit    0
+#define cbit_letter  32
+#define cbit_word    64
+#define cbit_space   96
+#define cbit_length 128      /* Length of the cbits table */
+
+/* End of pcre-internal.h */
--- a/Modules/pcre.h
+++ b/Modules/pcre.h
@ -4,24 +4,32 @@

 /* Copyright (c) 1997 University of Cambridge */

-/* Have to include stdlib.h in order to ensure that size_t is defined;
-it is needed in there for malloc. */
+#ifndef _PCRE_H
+#define _PCRE_H

-#ifndef PCRE_H
-#define PCRE_H
-
-#include <stdlib.h>
 #ifdef FOR_PYTHON
 #include "Python.h"
 #endif

+/* Have to include stdlib.h in order to ensure that size_t is defined;
+it is needed here for malloc. */
+
+#include <stdlib.h>
+
 /* Options */

-#define PCRE_CASELESS     0x01
-#define PCRE_EXTENDED     0x02
-#define PCRE_ANCHORED     0x04
-#define PCRE_MULTILINE    0x08
-#define PCRE_DOTALL       0x10
+#define PCRE_CASELESS        0x0001
+#define PCRE_EXTENDED        0x0002
+#define PCRE_ANCHORED        0x0004
+#define PCRE_MULTILINE       0x0008
+#define PCRE_DOTALL          0x0010
+#define PCRE_DOLLAR_ENDONLY  0x0020
+#define PCRE_EXTRA           0x0040
+#define PCRE_NOTBOL          0x0080
+#define PCRE_NOTEOL          0x0100
+#ifdef FOR_PYTHON
+#define PCRE_LOCALE          0x0200
+#endif

 /* Exec-time error codes */

@ -31,6 +39,7 @@ it is needed in there for malloc. */
 #define PCRE_ERROR_BADOPTION      (-4)
 #define PCRE_ERROR_BADMAGIC       (-5)
 #define PCRE_ERROR_UNKNOWN_NODE   (-6)
+#define PCRE_ERROR_NOMEMORY       (-7)

 /* Types */

@ -46,14 +55,14 @@ extern void  (*pcre_free)(void *);
 /* Functions */

 #ifdef FOR_PYTHON
-extern pcre *pcre_compile(char *, int, char **, int *, PyObject *);
+extern pcre *pcre_compile(const char *, int, char **, int *, PyObject *);
 #else
-extern pcre *pcre_compile(char *, int, char **, int *);
+extern pcre *pcre_compile(const char *, int, char **, int *);
 #endif
-extern int pcre_exec(pcre *, pcre_extra *, char *, int, int, int *, int);
-extern int pcre_info(pcre *, int *, int *);
-extern pcre_extra *pcre_study(pcre *, int, char **);
+extern int pcre_exec(const pcre *, const pcre_extra *, const char *,
+  int, int, int *, int);
+extern int pcre_info(const pcre *, int *, int *);
+extern pcre_extra *pcre_study(const pcre *, int, char **);
 extern char *pcre_version(void);

-#endif /* ifndef PCRE_H */
-/* End of pcre.h */
+#endif /* End of pcre.h */
--- a/Modules/pcremodule.c
+++ b/Modules/pcremodule.c
@ -105,39 +105,47 @@ PyPcre_exec(self, args)
 	PcreObject *self;
 	PyObject *args;
 {
-        unsigned char *string;
-	int stringlen, pos=0, options=0, i, count;
-	int offsets[100*2]; /* XXX must this be fixed? */
+        char *string;
+	int stringlen, pos = 0, options=0, endpos = -1, i, count;
+	int offsets[100*2]; 
 	PyObject *list;

-	if (!PyArg_ParseTuple(args, "s#|ii", &string, &stringlen, &pos, &options))
+	if (!PyArg_ParseTuple(args, "s#|iiii", &string, &stringlen, &pos, &endpos, &options))
 		return NULL;
+	if (endpos == -1) {endpos = stringlen;}
 	count = pcre_exec(self->regex, self->regex_extra, 
-			  string+pos, stringlen-pos, options,
+			  (char *)string+pos, endpos - pos, options,
 			  offsets, sizeof(offsets)/sizeof(int) );
+	/* If an error occurred during the match, and an exception was raised,
+	   just return NULL and leave the exception alone.  The most likely
+	   problem to cause this would be running out of memory for
+	   the failure stack. */
+	if (PyErr_Occurred())
+	{
+		return NULL;
+	}
 	if (count==PCRE_ERROR_NOMATCH) {Py_INCREF(Py_None); return Py_None;}
 	if (count<0)
-	  {
-	    PyErr_SetObject(ErrorObject, Py_BuildValue("si", "Regex error", count));
-	    return NULL;
-	  }
+	{
+		PyErr_SetObject(ErrorObject, Py_BuildValue("si", "Regex error", count));
+		return NULL;
+	}
 	
 	list=PyList_New(self->num_groups+1);
 	if (list==NULL) return NULL;
-	/* XXX count can be >size_offset! */
 	for(i=0; i<=self->num_groups; i++)
-	  {
-	    PyObject *v;
-	    int start=offsets[i*2], end=offsets[i*2+1];
-	    /* If the group wasn't affected by the match, return -1, -1 */
-            if (start<0 || count<=i) 
-	      {start=end=-1;}
-	    else 
-	      {start += pos; end +=pos;}
-	    v=Py_BuildValue("ii", start, end);
-	    if (v==NULL) {Py_DECREF(list); return NULL;}
-	    PyList_SetItem(list, i, v);
-	  }
+	{
+		PyObject *v;
+		int start=offsets[i*2], end=offsets[i*2+1];
+		/* If the group wasn't affected by the match, return -1, -1 */
+		if (start<0 || count<=i) 
+		{start=end=-1;}
+		else 
+		{start += pos; end +=pos;}
+		v=Py_BuildValue("ii", start, end);
+		if (v==NULL) {Py_DECREF(list); return NULL;}
+		PyList_SetItem(list, i, v);
+	}
 	return list;
 }

@ -182,7 +190,7 @@ PyPcre_compile(self, args)
 {
 	PcreObject *rv;
 	PyObject *dictionary;
-	unsigned char *pattern, *newpattern;
+	char *pattern, *newpattern;
 	char *error;
 	int num_zeros, i, j;
 	
@ -192,282 +200,274 @@ PyPcre_compile(self, args)
 		return NULL;
 	rv = newPcreObject(args);
 	if ( rv == NULL )
-	    return NULL;
+		return NULL;

 	/* PCRE doesn't like having null bytes in its pattern, so we have to replace 
 	   any zeros in the string with the characters '\0'. */
 	num_zeros=1;
 	for(i=0; i<patternlen; i++) {
-	  if (pattern[i]==0) num_zeros++;
+		if (pattern[i]==0) num_zeros++;
 	}
 	newpattern=malloc(patternlen+num_zeros);
 	if (newpattern==NULL) {
-	  PyErr_SetString(PyExc_MemoryError, "can't allocate memory for new pattern");
-	  return NULL;
+		PyErr_SetString(PyExc_MemoryError, "can't allocate memory for new pattern");
+		return NULL;
 	}
 	for (i=j=0; i<patternlen; i++, j++)
-	  {
-	    if (pattern[i]!=0) newpattern[j]=pattern[i];
-	    else {
-	      newpattern[j++]='\\';
-	      newpattern[j]  ='0';
-	    }
-	  }
+	{
+		if (pattern[i]!=0) newpattern[j]=pattern[i];
+		else {
+			newpattern[j++]='\\';
+			newpattern[j]  ='0';
+		}
+	}
 	newpattern[j]='\0';

-	rv->regex = pcre_compile(newpattern, options, 
+	rv->regex = pcre_compile((char*)newpattern, options, 
 				 &error, &erroroffset, dictionary);
 	free(newpattern);
 	if (rv->regex==NULL) 
-	  {
-	    PyMem_DEL(rv);
-	    if (!PyErr_Occurred())
-	      {
-		PyErr_SetObject(ErrorObject, Py_BuildValue("si", error, erroroffset));
-	      }
-	    return NULL;
-	  }
+	{
+		PyMem_DEL(rv);
+		if (!PyErr_Occurred())
+		{
+			PyErr_SetObject(ErrorObject, Py_BuildValue("si", error, erroroffset));
+		}
+		return NULL;
+	}
 	rv->regex_extra=pcre_study(rv->regex, 0, &error);
 	if (rv->regex_extra==NULL && error!=NULL) 
-	  {
-	    PyMem_DEL(rv);
-	    PyErr_SetObject(ErrorObject, Py_BuildValue("si", error, 0));
-	    return NULL;
-	  }
+	{
+		PyMem_DEL(rv);
+		PyErr_SetObject(ErrorObject, Py_BuildValue("si", error, 0));
+		return NULL;
+	}
        rv->num_groups = pcre_info(rv->regex, NULL, NULL);
 	if (rv->num_groups<0) 
-	  {
-	    PyErr_SetObject(ErrorObject, Py_BuildValue("si", "Regex error", rv->num_groups));
-	    PyMem_DEL(rv);
-	    return NULL;
-	  }
+	{
+		PyErr_SetObject(ErrorObject, Py_BuildValue("si", "Regex error", rv->num_groups));
+		PyMem_DEL(rv);
+		return NULL;
+	}
 	return (PyObject *)rv;
 }

 static PyObject *
 PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
-     unsigned char *pattern;
-     int pattern_len, *indexptr, *typeptr;
+	unsigned char *pattern;
+	int pattern_len, *indexptr, *typeptr;
 {
-  unsigned char c;
-  int index = *indexptr;
+	unsigned char c;
+	int index = *indexptr;
  
-  if (pattern_len<=index)
-    {
-      PyErr_SetString(ErrorObject, "escape ends too soon");
-      return NULL;
-    }
-  c=pattern[index]; index++;
-  *typeptr=CHAR;
-
-  switch (c)
-    {
-    case('t'):
-      *indexptr=index;
-      return Py_BuildValue("c", (char)9);
-      break;
-    case('n'):
-      *indexptr = index;
-      return Py_BuildValue("c", (char)10);
-      break;
-    case('v'):
-      *indexptr = index;
-      return Py_BuildValue("c", (char)11);
-      break;
-    case('r'):
-      *indexptr = index;
-      return Py_BuildValue("c", (char)13);
-      break;
-    case('f'):
-      *indexptr = index;
-      return Py_BuildValue("c", (char)12);
-      break;
-    case('a'):
-      *indexptr = index;
-      return Py_BuildValue("c", (char)7);
-      break;
-    case('b'):
-      *indexptr=index;
-      return Py_BuildValue("c", (char)8);
-      break;
-
-    case('x'):
-      {
-	int end, length;
-	unsigned char *string;
-	PyObject *v;
-
-	end=index; 
-	while (end<pattern_len && 
-	       ( pcre_ctypes[ pattern[end] ] & ctype_xdigit ) )
-	  end++;
-	if (end==index)
-	  {
-	    PyErr_SetString(ErrorObject, "\\x must be followed by hex digits");
-	    return NULL;
-	  }
-	length=end-index;
-	string=malloc(length+4+1);
-	if (string==NULL)
-	  {
-	    PyErr_SetString(PyExc_MemoryError, "can't allocate memory for \\x string");
-	    return NULL;
-	  }
-	/* Create a string containing "\x<hexdigits>", which will be
-	   passed to eval() */
-	string[0]=string[length+3]='"';
-	string[1]='\\';
-	string[length+4]='\0';
-	memcpy(string+2, pattern+index-1, length+1);
-	v=PyRun_String((char *)string, Py_eval_input, 
-		       PyEval_GetGlobals(), PyEval_GetLocals());
-	free(string);
-	/* The evaluation raised an exception */
-	if (v==NULL) return NULL;
-	*indexptr = end;
-	return v;
-      }
-      break;
-
-    case('E'):    case('G'):    case('L'):    case('Q'):
-    case('U'):    case('l'):    case('u'):
-      {
-	char message[50];
-	sprintf(message, "\\%c is not allowed", c);
-	PyErr_SetString(ErrorObject, message);
-	return NULL;
-      }
-
-    case('g'):
-      {
-	int end, valid, i;
 	if (pattern_len<=index)
-	  {
-	    PyErr_SetString(ErrorObject, "unfinished symbolic reference");
-	    return NULL;
-	  }
-	if (pattern[index]!='<')
-	  {
-	    PyErr_SetString(ErrorObject, "missing < in symbolic reference");
-	    return NULL;
-	  }
-	index++;
-	end=index;
-	while (end<pattern_len && pattern[end]!='>')
-	  end++;
-	if (end==pattern_len)
-	  {
-	    PyErr_SetString(ErrorObject, "unfinished symbolic reference");
-	    return NULL;
-	  }
-	valid=1;
-	if (index==end		/* Zero-length name */
-	    || !(pcre_ctypes[pattern[index]] & ctype_word) /* First char. not alphanumeric */
-	    || (pcre_ctypes[pattern[index]] & ctype_digit) ) /* First char. a digit */
-	  valid=0;
+	{
+		PyErr_SetString(ErrorObject, "escape ends too soon");
+		return NULL;
+	}
+	c=pattern[index]; index++;
+	*typeptr=CHAR;

-	for(i=index+1; i<end; i++)
-	  {
-	    if (!(pcre_ctypes[pattern[i]] & ctype_word) )
-	      valid=0;
-	  }	
-	if (!valid)
-	  {
-	    /* XXX should include the text of the reference */
-	    PyErr_SetString(ErrorObject, "illegal symbolic reference");
-	    return NULL;
-	  }
-	    
-	*typeptr = MEMORY_REFERENCE;
-	*indexptr = end+1;
-	return Py_BuildValue("s#", pattern+index, end-index);
-      }
-    break;
+	switch (c)
+	{
+	case('t'):
+		*indexptr=index;
+		return Py_BuildValue("c", (char)9);
+		break;
+	case('n'):
+		*indexptr = index;
+		return Py_BuildValue("c", (char)10);
+		break;
+	case('v'):
+		*indexptr = index;
+		return Py_BuildValue("c", (char)11);
+		break;
+	case('r'):
+		*indexptr = index;
+		return Py_BuildValue("c", (char)13);
+		break;
+	case('f'):
+		*indexptr = index;
+		return Py_BuildValue("c", (char)12);
+		break;
+	case('a'):
+		*indexptr = index;
+		return Py_BuildValue("c", (char)7);
+		break;
+	case('b'):
+		*indexptr=index;
+		return Py_BuildValue("c", (char)8);
+		break;

-    case('0'):
-      {
-	/* \0 always indicates an octal escape, so we consume up to 3
-	   characters, as long as they're all octal digits */
-	int octval=0, i;
-	index--;
-	for(i=index;
-	    i<=index+2 && i<pattern_len 
-	      && (pcre_ctypes[ pattern[i] ] & ctype_odigit );
-	    i++)
-	  {
-	    octval = octval * 8 + pattern[i] - '0';
-	  }
-	if (octval>255)
-	  {
-	    PyErr_SetString(ErrorObject, "octal value out of range");
-	    return NULL;
-	  }
-	*indexptr = i;
-	return Py_BuildValue("c", (unsigned char)octval);
-      }
-      break;
-    case('1'):    case('2'):    case('3'):    case('4'):
-    case('5'):    case('6'):    case('7'):    case('8'):
-    case('9'):
-      {
-	/* Handle \?, where ? is from 1 through 9 */
-	int value=0;
-	index--;
-	/* If it's at least a two-digit reference, like \34, it might
-           either be a 3-digit octal escape (\123) or a 2-digit
-           decimal memory reference (\34) */
+	case('x'):
+	{
+		int x, ch, end;

-	if ( (index+1) <pattern_len && 
-	    (pcre_ctypes[ pattern[index+1] ] & ctype_digit) )
-	  {
-	    if ( (index+2) <pattern_len && 
-		(pcre_ctypes[ pattern[index+2] ] & ctype_odigit) &&
-		(pcre_ctypes[ pattern[index+1] ] & ctype_odigit) &&
-		(pcre_ctypes[ pattern[index  ] ] & ctype_odigit)
-		)
-	      {
-		/* 3 octal digits */
-		value= 8*8*(pattern[index  ]-'0') +
-		         8*(pattern[index+1]-'0') +
-		           (pattern[index+2]-'0');
-		if (value>255)
-		  {
-		    PyErr_SetString(ErrorObject, "octal value out of range");
-		    return NULL;
-		  }
-		*indexptr = index+3;
-		return Py_BuildValue("c", (unsigned char)value);
-	      }
-	    else
-	      {
-		/* 2-digit form, so it's a memory reference */
-		value= 10*(pattern[index  ]-'0') +
-		          (pattern[index+1]-'0');
-		if (value<1 || EXTRACT_MAX<=value)
-		  {
-		    PyErr_SetString(ErrorObject, "memory reference out of range");
-		    return NULL;
-		  }
-		*typeptr = MEMORY_REFERENCE;
-		*indexptr = index+2;
-		return Py_BuildValue("i", value);
-	      }
-	  }
-	else 
-	  {
-	    /* Single-digit form, like \2, so it's a memory reference */
-	    *typeptr = MEMORY_REFERENCE;
-	    *indexptr = index+1;
-	    return Py_BuildValue("i", pattern[index]-'0');
-	  }
-      }
-      break;
-
-    default:
-	*indexptr = index;
-	return Py_BuildValue("c", c);
+		x = 0; end = index;
+		while ( (end<pattern_len && pcre_ctypes[ pattern[end] ] & ctype_xdigit) != 0)
+		{
+			ch = pattern[end];
+			x = x * 16 + pcre_lcc[ch] -
+				(((pcre_ctypes[ch] & ctype_digit) != 0)? '0' : 'W');
+			x &= 255;
+			end++;
+		}
+		if (end==index)
+		{
+			PyErr_SetString(ErrorObject, "\\x must be followed by hex digits");
+			return NULL;
+		}
+		*indexptr = end;
+		return Py_BuildValue("c", (char)x);
+	}
 	break;
-    }
+
+	case('E'):    case('G'):    case('L'):    case('Q'):
+	case('U'):    case('l'):    case('u'):
+	{
+		char message[50];
+		sprintf(message, "\\%c is not allowed", c);
+		PyErr_SetString(ErrorObject, message);
+		return NULL;
+	}
+
+	case('g'):
+	{
+		int end, i;
+		if (pattern_len<=index)
+		{
+			PyErr_SetString(ErrorObject, "unfinished symbolic reference");
+			return NULL;
+		}
+		if (pattern[index]!='<')
+		{
+			PyErr_SetString(ErrorObject, "missing < in symbolic reference");
+			return NULL;
+		}
+		index++;
+		end=index;
+		while (end<pattern_len && pattern[end]!='>')
+			end++;
+		if (end==pattern_len)
+		{
+			PyErr_SetString(ErrorObject, "unfinished symbolic reference");
+			return NULL;
+		}
+
+		if (index==end)		/* Zero-length name */
+		{
+			/* XXX should include the text of the reference */
+			PyErr_SetString(ErrorObject, "zero-length symbolic reference");
+			return NULL;
+		}
+		if (!(pcre_ctypes[pattern[index]] & ctype_word) /* First char. not alphanumeric */
+		    || (pcre_ctypes[pattern[index]] & ctype_digit) ) /* First char. a digit */
+		{
+			/* XXX should include the text of the reference */
+			PyErr_SetString(ErrorObject, "first character of symbolic reference not a letter or _");
+			return NULL;
+		}
+
+		for(i=index+1; i<end; i++)
+		{
+			if (!(pcre_ctypes[pattern[i]] & ctype_word) )
+			{
+				/* XXX should include the text of the reference */
+				PyErr_SetString(ErrorObject, "illegal symbolic reference");
+				return NULL;
+			}
+		}	
+	    
+		*typeptr = MEMORY_REFERENCE;
+		*indexptr = end+1;
+		return Py_BuildValue("s#", pattern+index, end-index);
+	}
+	break;
+
+	case('0'):
+	{
+		/* \0 always indicates an octal escape, so we consume up to 3
+		   characters, as long as they're all octal digits */
+		int octval=0, i;
+		index--;
+		for(i=index;
+		    i<=index+2 && i<pattern_len 
+			    && (pcre_ctypes[ pattern[i] ] & ctype_odigit );
+		    i++)
+		{
+			octval = octval * 8 + pattern[i] - '0';
+		}
+		if (octval>255)
+		{
+			PyErr_SetString(ErrorObject, "octal value out of range");
+			return NULL;
+		}
+		*indexptr = i;
+		return Py_BuildValue("c", (unsigned char)octval);
+	}
+	break;
+	case('1'):    case('2'):    case('3'):    case('4'):
+	case('5'):    case('6'):    case('7'):    case('8'):
+	case('9'):
+	{
+		/* Handle \?, where ? is from 1 through 9 */
+		int value=0;
+		index--;
+		/* If it's at least a two-digit reference, like \34, it might
+		   either be a 3-digit octal escape (\123) or a 2-digit
+		   decimal memory reference (\34) */
+
+		if ( (index+1) <pattern_len && 
+		     (pcre_ctypes[ pattern[index+1] ] & ctype_digit) )
+		{
+			if ( (index+2) <pattern_len && 
+			     (pcre_ctypes[ pattern[index+2] ] & ctype_odigit) &&
+			     (pcre_ctypes[ pattern[index+1] ] & ctype_odigit) &&
+			     (pcre_ctypes[ pattern[index  ] ] & ctype_odigit)
+				)
+			{
+				/* 3 octal digits */
+				value= 8*8*(pattern[index  ]-'0') +
+					8*(pattern[index+1]-'0') +
+					(pattern[index+2]-'0');
+				if (value>255)
+				{
+					PyErr_SetString(ErrorObject, "octal value out of range");
+					return NULL;
+				}
+				*indexptr = index+3;
+				return Py_BuildValue("c", (unsigned char)value);
+			}
+			else
+			{
+				/* 2-digit form, so it's a memory reference */
+				value= 10*(pattern[index  ]-'0') +
+					(pattern[index+1]-'0');
+				if (value<1 || EXTRACT_MAX<=value)
+				{
+					PyErr_SetString(ErrorObject, "memory reference out of range");
+					return NULL;
+				}
+				*typeptr = MEMORY_REFERENCE;
+				*indexptr = index+2;
+				return Py_BuildValue("i", value);
+			}
+		}
+		else 
+		{
+			/* Single-digit form, like \2, so it's a memory reference */
+			*typeptr = MEMORY_REFERENCE;
+			*indexptr = index+1;
+			return Py_BuildValue("i", pattern[index]-'0');
+		}
+	}
+	break;
+
+	default:
+		*indexptr = index;
+		return Py_BuildValue("c", c);
+		break;
+	}
 }

 static PyObject *
@ -475,117 +475,127 @@ PyPcre_expand(self, args)
 	PyObject *self;
 	PyObject *args;
 {
-  PyObject *results, *match_obj;
-  PyObject *repl_obj, *newstring;
-  unsigned char *repl;
-  int size, total_len, i, start, pos;
+	PyObject *results, *match_obj;
+	PyObject *repl_obj, *newstring;
+	unsigned char *repl;
+	int size, total_len, i, start, pos;

-  if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj)) 
-    return NULL;
+	if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj)) 
+		return NULL;

-  repl=(unsigned char *)PyString_AsString(repl_obj);
-  size=PyString_Size(repl_obj);
-  results=PyList_New(0);
-  if (results==NULL) return NULL;
-  for(start=total_len=i=0; i<size; i++)
-    {
-      if (repl[i]=='\\')
+	repl=(unsigned char *)PyString_AsString(repl_obj);
+	size=PyString_Size(repl_obj);
+	results=PyList_New(0);
+	if (results==NULL) return NULL;
+	for(start=total_len=i=0; i<size; i++)
 	{
-	  PyObject *value;
-	  int escape_type;
+		if (repl[i]=='\\')
+		{
+			PyObject *value;
+			int escape_type;

-	  if (start!=i)
-	    {
-	      PyList_Append(results, 
-			    PyString_FromStringAndSize((char *)repl+start, i-start));
-	      total_len += i-start;
-	    }
-	  i++;
-	  value=PyPcre_expand_escape(repl, size, &i, &escape_type);
-	  if (value==NULL)
-	    {
-	      /* PyPcre_expand_escape triggered an exception of some sort,
-		 so just return */
-	      Py_DECREF(results);
-	      return NULL;
-	    }
-	  switch (escape_type)
-	    {
-	    case (CHAR):
-	      PyList_Append(results, value);
-	      total_len += PyString_Size(value);
-	      break;
-	    case(MEMORY_REFERENCE):
-	      {
-		PyObject *r, *tuple, *result;
-		r=PyObject_GetAttrString(match_obj, "group");
-		tuple=PyTuple_New(1);
-		Py_INCREF(value);
-		PyTuple_SetItem(tuple, 0, value);
-		result=PyEval_CallObject(r, tuple);
-		Py_DECREF(r); Py_DECREF(tuple);
-		if (result==NULL)
-		  {
-		    /* The group() method trigged an exception of some sort */
-		    Py_DECREF(results);
-		    return NULL;
-		  }
-		if (result==Py_None)
-		  {
-		    char message[50];
-		    sprintf(message, 
-			    "group %li did not contribute to the match",
-			    PyInt_AsLong(value));
-		    PyErr_SetString(ErrorObject, 
-				    message);
-		    Py_DECREF(result);
-		    Py_DECREF(results);
-		    return NULL;
-		  }
-		/* XXX typecheck that it's a string! */
-		PyList_Append(results, result);
-		total_len += PyString_Size(result);
-		Py_DECREF(result);
-	      }
-	      break;
-	    default:
-	      Py_DECREF(results);
-	      PyErr_SetString(ErrorObject, 
-			      "bad escape in replacement");
-	      return NULL;
-	    }
-	  start=i;
-	  i--; /* Decrement now, because the 'for' loop will increment it */
+			if (start!=i)
+			{
+				PyList_Append(results, 
+					      PyString_FromStringAndSize((char *)repl+start, i-start));
+				total_len += i-start;
+			}
+			i++;
+			value=PyPcre_expand_escape(repl, size, &i, &escape_type);
+			if (value==NULL)
+			{
+				/* PyPcre_expand_escape triggered an exception of some sort,
+				   so just return */
+				Py_DECREF(results);
+				return NULL;
+			}
+			switch (escape_type)
+			{
+			case (CHAR):
+				PyList_Append(results, value);
+				total_len += PyString_Size(value);
+				break;
+			case(MEMORY_REFERENCE):
+			{
+				PyObject *r, *tuple, *result;
+				r=PyObject_GetAttrString(match_obj, "group");
+				tuple=PyTuple_New(1);
+				Py_INCREF(value);
+				PyTuple_SetItem(tuple, 0, value);
+				result=PyEval_CallObject(r, tuple);
+				Py_DECREF(r); Py_DECREF(tuple);
+				if (result==NULL)
+				{
+					/* The group() method triggered an exception of some sort */
+					Py_DECREF(results);
+					Py_DECREF(value);
+					return NULL;
+				}
+				if (result==Py_None)
+				{
+					char message[50];
+					sprintf(message, 
+						"group did not contribute to the match");
+					PyErr_SetString(ErrorObject, 
+							message);
+					Py_DECREF(result);
+					Py_DECREF(value);
+					Py_DECREF(results);
+					return NULL;
+				}
+				/* typecheck that it's a string! */
+				if (!PyString_Check(result))
+				{
+					Py_DECREF(results);
+					Py_DECREF(result);
+					PyErr_SetString(ErrorObject, 
+							"group() must return a string value for replacement");
+					return NULL;
+				}
+				PyList_Append(results, result);
+				total_len += PyString_Size(result);
+				Py_DECREF(result);
+			}
+			break;
+			default:
+				Py_DECREF(results);
+				PyErr_SetString(ErrorObject, 
+						"bad escape in replacement");
+				return NULL;
+			}
+			Py_DECREF(value);
+			start=i;
+			i--; /* Decrement now, because the 'for' loop will increment it */
+		}
+	} /* end of for loop over repl */
+
+	if (start!=i)
+	{
+		PyList_Append(results, PyString_FromStringAndSize((char *)repl+start, i-start));
+		total_len += i-start;
 	}
-    } /* endif repl[i]!='\\' */

-  if (start!=i)
-    {
-      PyList_Append(results, PyString_FromStringAndSize((char *)repl+start, i-start));
-      total_len += i-start;
-    }
-
-  /* Whew!  Now we've constructed a list containing various pieces of
-     strings that will make up our final result.  So, iterate over 
-     the list concatenating them.  A new string measuring total_len
-     bytes is allocated and filled in. */
+	/* Whew!  Now we've constructed a list containing various pieces of
+	   strings that will make up our final result.  So, iterate over 
+	   the list concatenating them.  A new string measuring total_len
+	   bytes is allocated and filled in. */
     
-  newstring=PyString_FromStringAndSize(NULL, total_len);
-  if (newstring==NULL)
-    {
-      Py_DECREF(results);
-      return NULL;
-    }
+	newstring=PyString_FromStringAndSize(NULL, total_len);
+	if (newstring==NULL)
+	{
+		Py_DECREF(results);
+		return NULL;
+	}

-  repl=(unsigned char *)PyString_AsString(newstring);
-  for (pos=i=0; i<PyList_Size(results); i++)
-    {
-      PyObject *item=PyList_GetItem(results, i);
-      memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
-      pos += PyString_Size(item);
-    }
-  Py_DECREF(results);
-  return newstring;
+	repl=(unsigned char *)PyString_AsString(newstring);
+	for (pos=i=0; i<PyList_Size(results); i++)
+	{
+		PyObject *item=PyList_GetItem(results, i);
+		memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
+		pos += PyString_Size(item);
+	}
+	Py_DECREF(results);
+	return newstring;
 }


@ -642,67 +652,7 @@ initpcre()
 	insint(d, "MULTILINE", PCRE_MULTILINE);
 	insint(d, "DOTALL", PCRE_DOTALL);
 	insint(d, "VERBOSE", PCRE_EXTENDED);
-	
-	/* Insert the opcodes */
-	insint(d, "OP_END", OP_END);
-	insint(d, "OP_SOD", OP_SOD);
-	insint(d, "OP_NOT_WORD_BOUNDARY", OP_NOT_WORD_BOUNDARY);
-	insint(d, "OP_WORD_BOUNDARY", OP_WORD_BOUNDARY);
-	insint(d, "OP_NOT_DIGIT", OP_NOT_DIGIT);
-	insint(d, "OP_NOT_WHITESPACE", OP_NOT_WHITESPACE);
-	insint(d, "OP_WHITESPACE", OP_WHITESPACE);
-	insint(d, "OP_NOT_WORDCHAR", OP_NOT_WORDCHAR);
-	insint(d, "OP_WORDCHAR", OP_WORDCHAR);
-	insint(d, "OP_EOD", OP_EOD);
-	insint(d, "OP_CIRC", OP_CIRC);
-	insint(d, "OP_DOLL", OP_DOLL);
-	insint(d, "OP_ANY", OP_ANY);
-	insint(d, "OP_CHARS", OP_CHARS);
-
-	insint(d, "OP_STAR", OP_STAR);
-	insint(d, "OP_MINSTAR", OP_MINSTAR);
-	insint(d, "OP_PLUS", OP_PLUS);
-	insint(d, "OP_MINPLUS", OP_MINPLUS);
-	insint(d, "OP_QUERY", OP_QUERY);
-	insint(d, "OP_MINQUERY", OP_MINQUERY);
-	insint(d, "OP_UPTO", OP_UPTO);
-	insint(d, "OP_MINUPTO", OP_MINUPTO);
-	insint(d, "OP_EXACT", OP_EXACT);
-
-	insint(d, "OP_TYPESTAR", OP_TYPESTAR);
-	insint(d, "OP_TYPEMINSTAR", OP_TYPEMINSTAR);
-	insint(d, "OP_TYPEPLUS", OP_TYPEPLUS);
-	insint(d, "OP_TYPEMINPLUS", OP_TYPEMINPLUS);
-	insint(d, "OP_TYPEQUERY", OP_TYPEQUERY);
-	insint(d, "OP_TYPEMINQUERY", OP_TYPEMINQUERY);
-	insint(d, "OP_TYPEUPTO", OP_TYPEUPTO);
-	insint(d, "OP_TYPEMINUPTO", OP_TYPEMINUPTO);
-	insint(d, "OP_TYPEEXACT", OP_TYPEEXACT);
-
-	insint(d, "OP_CRSTAR", OP_CRSTAR);
-	insint(d, "OP_CRMINSTAR", OP_CRMINSTAR);
-	insint(d, "OP_CRPLUS", OP_CRPLUS);
-	insint(d, "OP_CRMINPLUS", OP_CRMINPLUS);
-	insint(d, "OP_CRQUERY", OP_CRQUERY);
-	insint(d, "OP_CRMINQUERY", OP_CRMINQUERY);
-	insint(d, "OP_CRRANGE", OP_CRRANGE);
-	insint(d, "OP_CRMINRANGE", OP_CRMINRANGE);
-
-	insint(d, "OP_CLASS", OP_CLASS);
-	insint(d, "OP_NEGCLASS", OP_NEGCLASS);
-	insint(d, "OP_REF", OP_REF);
-
-	insint(d, "OP_ALT", OP_ALT);
-	insint(d, "OP_KET", OP_KET);
-	insint(d, "OP_KETRMAX", OP_KETRMAX);
-	insint(d, "OP_KETRMIN", OP_KETRMIN);
-
-	insint(d, "OP_ASSERT", OP_ASSERT);
-	insint(d, "OP_ASSERT_NOT", OP_ASSERT_NOT);
-
-	insint(d, "OP_BRAZERO", OP_BRAZERO);
-	insint(d, "OP_BRAMINZERO", OP_BRAMINZERO);
-	insint(d, "OP_BRA", OP_BRA);
+	insint(d, "LOCALE", PCRE_LOCALE);
 	
 	/* Check for errors */
 	if (PyErr_Occurred())
--- a/Modules/pypcre.c
+++ b/Modules/pypcre.c