AMK's latest

2025-09-26 18:29:57 +00:00 · 1998-04-03 21:13:31 +00:00 · 1998-04-03 21:13:31 +00:00 · 042ff9eb3a
commit 042ff9eb3a
parent 104be4a4a7
4 changed files with 168 additions and 90 deletions
--- a/Modules/pcre-int.h
+++ b/Modules/pcre-int.h
@ -3,7 +3,7 @@
 *************************************************/
-#define PCRE_VERSION       "1.04 22-Dec-1997"
+#define PCRE_VERSION       "1.07 16-Feb-1998"
 /* This is a library of functions to support regular expressions whose syntax
@ -12,7 +12,7 @@ the file Tech.Notes for some information on the internals.
 Written by: Philip Hazel <ph10@cam.ac.uk>
-           Copyright (c) 1997 University of Cambridge
+           Copyright (c) 1998 University of Cambridge
 -----------------------------------------------------------------------------
 Permission is granted to anyone to use this software for any purpose on any
@ -192,6 +192,7 @@ enum {
  OP_CRMINRANGE,
  OP_CLASS,          /* Match a character class */
  OP_NEGCLASS,       /* Match a character class, specified negatively */
  OP_CLASS_L,        /* Match a character class */
  OP_REF,            /* Match a back reference */
--- a/Modules/pcre.h
+++ b/Modules/pcre.h
@ -2,7 +2,7 @@
 *       Perl-Compatible Regular Expressions      *
 *************************************************/
-/* Copyright (c) 1997 University of Cambridge */
+/* Copyright (c) 1998 University of Cambridge */
 #ifndef _PCRE_H
 #define _PCRE_H
@ -17,6 +17,12 @@ it is needed here for malloc. */
 #include <sys/types.h>
 #include <stdlib.h>
 /* Allow for C++ users */
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* Options */
 #define PCRE_CASELESS        0x0001
@ -68,4 +74,8 @@ extern int pcre_info(const pcre *, int *, int *);
 extern pcre_extra *pcre_study(const pcre *, int, const char **);
 extern const char *pcre_version(void);
 #ifdef __cplusplus
 }  /* extern "C" */
 #endif
 #endif /* End of pcre.h */
--- a/Modules/pcremodule.c
+++ b/Modules/pcremodule.c
@ -72,7 +72,7 @@ staticforward PyTypeObject Pcre_Type;
 #define NOT_WORD_BOUNDARY	6
 #define BEGINNING_OF_BUFFER	7
 #define END_OF_BUFFER		8
-
+#define STRING                  9
 static PcreObject *
 newPcreObject(arg)
@ -191,49 +191,20 @@ PyPcre_compile(self, args)
 {
 	PcreObject *rv;
 	PyObject *dictionary;
-	char *pattern, *newpattern;
+	char *pattern;
 	const char *error;
 	int num_zeros, i, j;
-	int patternlen, options, erroroffset;
+	int options, erroroffset;
-	if (!PyArg_ParseTuple(args, "s#iO!", &pattern, &patternlen, &options,
+	if (!PyArg_ParseTuple(args, "siO!", &pattern, &options,
 			      &PyDict_Type, &dictionary))
 		return NULL;
 	rv = newPcreObject(args);
 	if ( rv == NULL )
 		return NULL;
-	/* PCRE doesn't like having null bytes in its pattern, so we have to replace 
+	rv->regex = pcre_compile((char*)pattern, options, 
 	   any zeros in the string with the characters '\000'. This increases the size
 	   of the string by 3*num_zeros, plus 1 byte for the terminating \0.  */
 	num_zeros=1;      /* Start at 1; this will give 3 extra bytes of leeway */
 	for(i=0; i<patternlen; i++) {
 		if (pattern[i]==0) num_zeros++;
 	}
 	newpattern=malloc(patternlen + num_zeros*3 + 4); 
 	if (newpattern==NULL) {
 		PyErr_SetString(PyExc_MemoryError, "can't allocate memory for new pattern");
 		return NULL;
 	}
 	for (i=j=0; i<patternlen; i++, j++)
 	{
 		if (pattern[i]!=0) newpattern[j]=pattern[i];
 		else {
 			newpattern[j++] ='\\';
 			newpattern[j++] = '0';
 			newpattern[j++] = '0';
 			newpattern[j  ] = '0';
 		}
 	}
 	/* Keep purify happy; for pcre, one null byte is enough! */
 	newpattern[j++]='\0';
 	newpattern[j++]='\0';
 	newpattern[j++]='\0';
        newpattern[j]='\0';
 	rv->regex = pcre_compile((char*)newpattern, options, 
 				 &error, &erroroffset, dictionary);
 	free(newpattern);
 	if (rv->regex==NULL) 
 	{
 		PyMem_DEL(rv);
@ -312,6 +283,10 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
 		*indexptr=index;
 		return Py_BuildValue("c", (char)8);
 		break;
 	case('\\'):
 		*indexptr=index;
 		return Py_BuildValue("c", '\\');
 		break;
 	case('x'):
 	{
@ -348,6 +323,8 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
 	case('g'):
 	{
 		int end, i;
 		int group_num = 0, is_number=0;
 		if (pattern_len<=index)
 		{
 			PyErr_SetString(ErrorObject, "unfinished symbolic reference");
@ -374,16 +351,22 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
 			PyErr_SetString(ErrorObject, "zero-length symbolic reference");
 			return NULL;
 		}
-		if (!(pcre_ctypes[pattern[index]] & ctype_word) /* First char. not alphanumeric */
+		if ((pcre_ctypes[pattern[index]] & ctype_digit)) /* First char. a digit */
 		    || (pcre_ctypes[pattern[index]] & ctype_digit) ) /* First char. a digit */
 		{
-			/* XXX should include the text of the reference */
+		        is_number = 1;
-			PyErr_SetString(ErrorObject, "first character of symbolic reference not a letter or _");
+			group_num = pattern[index] - '0';
 			return NULL;
 		}
 		for(i=index+1; i<end; i++)
 		{
 		        if (is_number && 
 			    !(pcre_ctypes[pattern[i]] & ctype_digit) )
 			{
 				/* XXX should include the text of the reference */
 				PyErr_SetString(ErrorObject, "illegal non-digit character in \\g<...> starting with digit");
 				return NULL;			       
 			}
 			else {group_num = group_num * 10 + pattern[i] - '0';}
 			if (!(pcre_ctypes[pattern[i]] & ctype_word) )
 			{
 				/* XXX should include the text of the reference */
@ -394,6 +377,9 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
 		*typeptr = MEMORY_REFERENCE;
 		*indexptr = end+1;
 		/* If it's a number, return the integer value of the group */
 		if (is_number) return Py_BuildValue("i", group_num);
 		/* Otherwise, return a string containing the group name */
 		return Py_BuildValue("s#", pattern+index, end-index);
 	}
 	break;
@ -478,8 +464,11 @@ PyPcre_expand_escape(pattern, pattern_len, indexptr, typeptr)
 	break;
 	default:
 	  /* It's some unknown escape like \s, so return a string containing
 	     \s */
 		*typeptr = STRING;
 		*indexptr = index;
-		return Py_BuildValue("c", c);
+		return Py_BuildValue("s#", pattern+index-2, 2);
 		break;
 	}
 }
@ -571,6 +560,12 @@ PyPcre_expand(self, args)
 				Py_DECREF(result);
 			}
 			break;
 			case(STRING):
 			  {
 			    PyList_Append(results, value);
 			    total_len += PyString_Size(value);
 			    break;
 			  }
 			default:
 				Py_DECREF(results);
 				PyErr_SetString(ErrorObject, 
--- a/Modules/pypcre.c
+++ b/Modules/pypcre.c
@ -211,7 +211,7 @@ the file Tech.Notes for some information on the internals.
 Written by: Philip Hazel <ph10@cam.ac.uk>
-           Copyright (c) 1997 University of Cambridge
+           Copyright (c) 1998 University of Cambridge
 -----------------------------------------------------------------------------
 Permission is granted to anyone to use this software for any purpose on any
@ -409,6 +409,7 @@ do
      according to the repeat count. */
      case OP_CLASS:
      case OP_NEGCLASS:
        {
        tcode++;
        for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
@ -547,7 +548,7 @@ the file Tech.Notes for some information on the internals.
 Written by: Philip Hazel <ph10@cam.ac.uk>
-           Copyright (c) 1997 University of Cambridge
+           Copyright (c) 1998 University of Cambridge
 -----------------------------------------------------------------------------
 Permission is granted to anyone to use this software for any purpose on any
@ -586,18 +587,26 @@ the external pcre header. */
 #ifndef Py_eval_input
 /* For Python 1.4, graminit.h has to be explicitly included */
 #define Py_eval_input eval_input
 #endif /* FOR_PYTHON */
 /* Allow compilation as C++ source code, should anybody want to do that. */
 #ifdef __cplusplus
 #define class pcre_class
 #endif
 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
-static char rep_min[] = { 0, 0, 1, 1, 0, 0 };
+static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
-static char rep_max[] = { 0, 0, 0, 0, 1, 1 };
+static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
-/* Text forms of OP_ values and things, for debugging */
+/* Text forms of OP_ values and things, for debugging (not all used) */
 #ifdef DEBUG
 static const char *OP_names[] = { 
@ -610,7 +619,7 @@ static const char *OP_names[] = {
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{",
-  "class", "classL", "Ref",
+  "class", "negclass", "classL", "Ref",
  "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "Once",
  "Brazero", "Braminzero", "Bra"
 };
@ -621,7 +630,7 @@ are simple data values; negative values are for special things like \d and so
 on. Zero means further processing is needed (for things like \x), or the escape
 is invalid. */
-static short int escapes[] = {
+static const short int escapes[] = {
    0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
    0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
  '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
@ -636,8 +645,9 @@ static short int escapes[] = {
 /* Definition to allow mutual recursion */
-static BOOL compile_regex(int, int *, uschar **, const uschar **, 
+static BOOL 
-			   const char **, PyObject *);
+compile_regex(int, int *, uschar **, const uschar **, const char **,
 	      PyObject *); 
 /* Structure for passing "static" information around between the functions
 doing the matching, so that they are thread-safe. */
@ -866,12 +876,13 @@ do {
      /* Check a class or a back reference for a zero minimum */
      case OP_CLASS:
      case OP_NEGCLASS:
      case OP_REF:
      case OP_CLASS_L:
 	switch(*cc)
 	  {
 	  case (OP_REF):    cc += 2; break;
-	  case (OP_CLASS):  cc += 1+32; break;
+	  case (OP_CLASS): case (OP_NEGCLASS): cc += 1+32; break;
 	  case (OP_CLASS_L): cc += 1+1+32; break;
 	  }
@ -1017,15 +1028,17 @@ else
    {
      /* PYTHON: Try to compute an octal value for a character */
-      for(c=0, i=0; c!=-1 && ptr[i]!=0 && i<3; i++) 
+      for(c=0, i=0; ptr[i]!=0 && i<3; i++) 
 	{
 	  if (( pcre_ctypes[ ptr[i] ] & ctype_odigit) != 0)
 	    c = c * 8 + ptr[i]-'0';
 	  else
-	    c = -1; /* Non-octal character */
+	    break; /* Non-octal character--break out of the loop */
 	}
-      /* Aha!  There were 3 octal digits, so it must be a character */
+      /* It's a character if there were exactly 3 octal digits, or if
-      if (c != -1 && i == 3) 
+	 we're inside a character class and there was at least one
 	 octal digit. */
      if ( (i == 3) || (isclass && i!=0) )
 	{
 	  ptr += i-1;
 	  break;
@ -1278,11 +1291,14 @@ for (;; ptr++)
 	class_flag = NULL;
      }
-    /* If the first character is '^', set the negation flag */
+    /* If the first character is '^', set the negation flag, and use a
    different opcode. This only matters if caseless matching is specified at
    runtime. */
    if ((c = *(++ptr)) == '^')
      {
      negate_class = TRUE;
      if (*(code-1)==OP_CLASS) *(code-1) = OP_NEGCLASS;
      c = *(++ptr);
      }
    else negate_class = FALSE;
@ -1648,7 +1664,8 @@ for (;; ptr++)
    /* If previous was a character class or a back reference, we put the repeat
    stuff after it. */
-    else if (*previous == OP_CLASS || *previous==OP_CLASS_L || *previous == OP_REF)
+    else if (*previous == OP_CLASS || *previous == OP_NEGCLASS || 
 	     *previous==OP_CLASS_L || *previous == OP_REF)
      {
      if (repeat_min == 0 && repeat_max == -1)
        *code++ = OP_CRSTAR + repeat_type;
@ -2003,7 +2020,7 @@ for (;; ptr++)
    the next state. */
    previous[1] = length;
-    ptr--;
+    if (length < 255) ptr--;
    break;
    }
  }                   /* end of big loop */
@ -2832,6 +2849,7 @@ while (code < code_end)
    goto CLASS_REF_REPEAT;
    case OP_CLASS:
    case OP_NEGCLASS:
    case OP_CLASS_L:
      {
      int i, min, max;
@ -2840,11 +2858,14 @@ while (code < code_end)
 	{
 	  code++;
 	  printf("Locflag = %i ", *code++);
 	  printf("    [");
 	}
      else 
-        code++;
+	{
 	  if (*code++ == OP_CLASS) printf("    [");
 	  else printf("   ^[");
 	}
      printf("    [");
      for (i = 0; i < 256; i++)
        {
@ -3601,10 +3622,14 @@ for (;;)
    item to see if there is repeat information following. Then obey similar
    code to character type repeats - written out again for speed. If caseless
    matching was set at runtime but not at compile time, we have to check both
-    versions of a character. */
+    versions of a character, and we have to behave differently for positive and
    negative classes. This is the only time where OP_CLASS and OP_NEGCLASS are
    treated differently. */
    case OP_CLASS:
    case OP_NEGCLASS:
      {
      BOOL nasty_case = *ecode == OP_NEGCLASS && md->runtime_caseless;
      const uschar *data = ecode + 1;  /* Save for matching */
      ecode += 33;                     /* Advance past the item */
@ -3633,15 +3658,8 @@ for (;;)
        break;
        default:               /* No repeat follows */
-        if (eptr >= md->end_subject) FAIL;
+	  min = max = 1;
-        c = *eptr++;
+	  break;
        if ((data[c/8] & (1 << (c&7))) != 0) continue;    /* With main loop */
        if (md->runtime_caseless)
          {
          c = pcre_fcc[c];
          if ((data[c/8] & (1 << (c&7))) != 0) continue;  /* With main loop */
          }
        FAIL;
        }
      /* First, ensure the minimum number of matches are present. */
@ -3650,13 +3668,31 @@ for (;;)
        {
        if (eptr >= md->end_subject) FAIL;
        c = *eptr++;
-        if ((data[c/8] & (1 << (c&7))) != 0) continue;
+
-        if (md->runtime_caseless)
+        /* Either not runtime caseless, or it was a positive class. For
        runtime caseless, continue if either case is in the map. */
        if (!nasty_case)
          {
          c = pcre_fcc[c];
          if ((data[c/8] & (1 << (c&7))) != 0) continue;
          if (md->runtime_caseless)
            {
            c = pcre_fcc[c];
            if ((data[c/8] & (1 << (c&7))) != 0) continue;
            }
          }
-        FAIL;
+
        /* Runtime caseless and it was a negative class. Continue only if
        both cases are in the map. */
        else
          {
           if ((data[c/8] & (1 << (c&7))) == 0) FAIL;
           c = pcre_fcc[c];
           if ((data[c/8] & (1 << (c&7))) != 0) continue;
           }
 	FAIL;
        }
      /* If max == min we can continue with the main loop without the
@ -3674,12 +3710,30 @@ for (;;)
          if (match(eptr, ecode, offset_top, md)) SUCCEED;
          if (i >= max || eptr >= md->end_subject) FAIL;
          c = *eptr++;
-          if ((data[c/8] & (1 << (c&7))) != 0) continue;
+
-          if (md->runtime_caseless)
+          /* Either not runtime caseless, or it was a positive class. For
          runtime caseless, continue if either case is in the map. */
          if (!nasty_case)
            {
            c = pcre_fcc[c];
            if ((data[c/8] & (1 << (c&7))) != 0) continue;
            if (md->runtime_caseless)
              {
              c = pcre_fcc[c];
              if ((data[c/8] & (1 << (c&7))) != 0) continue;
              }
            }
          /* Runtime caseless and it was a negative class. Continue only if
          both cases are in the map. */
          else
             {
             if ((data[c/8] & (1 << (c&7))) == 0) return FALSE;
             c = pcre_fcc[c];
             if ((data[c/8] & (1 << (c&7))) != 0) continue;
             }
          FAIL;
          }
        /* Control never gets here */
@ -3694,12 +3748,30 @@ for (;;)
          {
          if (eptr >= md->end_subject) break;
          c = *eptr;
-          if ((data[c/8] & (1 << (c&7))) != 0) continue;
+
-          if (md->runtime_caseless)
+          /* Either not runtime caseless, or it was a positive class. For
          runtime caseless, continue if either case is in the map. */
          if (!nasty_case)
            {
            if ((data[c/8] & (1 << (c&7))) != 0) continue;
            if (md->runtime_caseless)
              {
              c = pcre_fcc[c];
              if ((data[c/8] & (1 << (c&7))) != 0) continue;
              }
            }
          /* Runtime caseless and it was a negative class. Continue only if
          both cases are in the map. */
          else
            {
            if ((data[c/8] & (1 << (c&7))) == 0) break;
            c = pcre_fcc[c];
            if ((data[c/8] & (1 << (c&7))) != 0) continue;
            }
          break;
          }
@ -4430,17 +4502,17 @@ pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
  /* The "volatile" directives are to make gcc -Wall stop complaining
     that these variables can be clobbered by the longjmp.  Hopefully
     they won't cost too much performance. */ 
-int resetcount, ocount;
+volatile int resetcount, ocount;
-int first_char = -1;
+volatile int first_char = -1;
 match_data match_block;
 const uschar *start_bits = NULL;
 const uschar *start_match = (const uschar *)subject + start_pos;
 const uschar *end_subject;
 const real_pcre *re = (const real_pcre *)external_re;
 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
-BOOL using_temporary_offsets = FALSE;
+volatile BOOL using_temporary_offsets = FALSE;
-BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
+volatile BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
-BOOL startline = (re->options & PCRE_STARTLINE) != 0;
+volatile BOOL startline = (re->options & PCRE_STARTLINE) != 0;
 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
@ -4480,7 +4552,7 @@ ocount = offsetcount & (-2);
 if (re->top_backref > 0 && re->top_backref >= ocount/2)
  {
  ocount = re->top_backref * 2 + 2;
-  match_block.offset_vector = (pcre_malloc)(ocount * sizeof(int));
+  match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
  if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
  using_temporary_offsets = TRUE;
  DPRINTF(("Got memory to hold back references\n"));
@ -4639,10 +4711,10 @@ do
  free_stack(&match_block);
  return rc;
  }  /* End of (if setjmp(match_block.error_env)...) */
  free_stack(&match_block);
  /* Return an error code; pcremodule.c will preserve the exception */
  if (PyErr_Occurred()) return PCRE_ERROR_NOMEMORY;
  free_stack(&match_block);
  }
 while (!anchored &&
       match_block.errorcode == PCRE_ERROR_NOMATCH &&