pcre/pcre.c

   1 /*************************************************
   2 *      Perl-Compatible Regular Expressions       *
   3 *************************************************/
   4
   5 /*
   6 This is a library of functions to support regular expressions whose syntax
   7 and semantics are as close as possible to those of the Perl 5 language. See
   8 the file Tech.Notes for some information on the internals.
   9
  10 Written by: Philip Hazel <ph10@cam.ac.uk>
  11
  12            Copyright (c) 1997-2000 University of Cambridge
  13
  14 -----------------------------------------------------------------------------
  15 Permission is granted to anyone to use this software for any purpose on any
  16 computer system, and to redistribute it freely, subject to the following
  17 restrictions:
  18
  19 1. This software is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  22
  23 2. The origin of this software must not be misrepresented, either by
  24    explicit claim or by omission.
  25
  26 3. Altered versions must be plainly marked as such, and must not be
  27    misrepresented as being the original software.
  28
  29 4. If PCRE is embedded in any software that is released under the GNU
  30    General Purpose Licence (GPL), then the terms of that licence shall
  31    supersede any condition above with which it is incompatible.
  32 -----------------------------------------------------------------------------
  33 */
  34
  35
  36 /* Define DEBUG to get debugging output on stdout. */
  37
  38 /* #define DEBUG */
  39
  40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
  41 inline, and there are *still* stupid compilers about that don't like indented
  42 pre-processor statements. I suppose it's only been 10 years... */
  43
  44 #ifdef DEBUG
  45 #define DPRINTF(p) printf p
  46 #else
  47 #define DPRINTF(p) /*nothing*/
  48 #endif
  49
  50 /* Include the internals header, which itself includes Standard C headers plus
  51 the external pcre header. */
  52
  53 #include "internal.h"
  54
  55
  56 /* Allow compilation as C++ source code, should anybody want to do that. */
  57
  58 #ifdef __cplusplus
  59 #define class pcre_class
  60 #endif
  61
  62
  63 /* Number of items on the nested bracket stacks at compile time. This should
  64 not be set greater than 200. */
  65
  66 #define BRASTACK_SIZE 200
  67
  68
  69 /* The number of bytes in a literal character string above which we can't add
  70 any more is different when UTF-8 characters may be encountered. */
  71
  72 #ifdef SUPPORT_UTF8
  73 #define MAXLIT 250
  74 #else
  75 #define MAXLIT 255
  76 #endif
  77
  78
  79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
  80
  81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
  82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
  83
  84 /* Text forms of OP_ values and things, for debugging (not all used) */
  85
  86 #ifdef DEBUG
  87 static const char *OP_names[] = {
  88   "End", "\\A", "\\B", "\\b", "\\D", "\\d",
  89   "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
  90   "Opt", "^", "$", "Any", "chars", "not",
  91   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  92   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  93   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  94   "*", "*?", "+", "+?", "?", "??", "{", "{",
  95   "class", "Ref", "Recurse",
  96   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
  97   "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
  98   "Brazero", "Braminzero", "Bra"
  99 };
 100 #endif
 101
 102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 103 are simple data values; negative values are for special things like \d and so
 104 on. Zero means further processing is needed (for things like \x), or the escape
 105 is invalid. */
 106
 107 static const short int escapes[] = {
 108     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
 109     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
 110   '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
 111     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
 112     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
 113     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
 114   '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */
 115     0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
 116     0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */
 117     0,      0, -ESC_z                                            /* x - z */
 118 };
 119
 120 /* Tables of names of POSIX character classes and their lengths. The list is
 121 terminated by a zero length entry. The first three must be alpha, upper, lower,
 122 as this is assumed for handling case independence. */
 123
 124 static const char *posix_names[] = {
 125   "alpha", "lower", "upper",
 126   "alnum", "ascii", "cntrl", "digit", "graph",
 127   "print", "punct", "space", "word",  "xdigit" };
 128
 129 static const uschar posix_name_lengths[] = {
 130   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 131
 132 /* Table of class bit maps for each POSIX class; up to three may be combined
 133 to form the class. */
 134
 135 static const int posix_class_maps[] = {
 136   cbit_lower, cbit_upper, -1,             /* alpha */
 137   cbit_lower, -1,         -1,             /* lower */
 138   cbit_upper, -1,         -1,             /* upper */
 139   cbit_digit, cbit_lower, cbit_upper,     /* alnum */
 140   cbit_print, cbit_cntrl, -1,             /* ascii */
 141   cbit_cntrl, -1,         -1,             /* cntrl */
 142   cbit_digit, -1,         -1,             /* digit */
 143   cbit_graph, -1,         -1,             /* graph */
 144   cbit_print, -1,         -1,             /* print */
 145   cbit_punct, -1,         -1,             /* punct */
 146   cbit_space, -1,         -1,             /* space */
 147   cbit_word,  -1,         -1,             /* word */
 148   cbit_xdigit,-1,         -1              /* xdigit */
 149 };
 150
 151
 152 /* Definition to allow mutual recursion */
 153
 154 static BOOL
 155   compile_regex(int, int, int *, uschar **, const uschar **, const char **,
 156     BOOL, int, int *, int *, compile_data *);
 157
 158 /* Structure for building a chain of data that actually lives on the
 159 stack, for holding the values of the subject pointer at the start of each
 160 subpattern, so as to detect when an empty string has been matched by a
 161 subpattern - to break infinite loops. */
 162
 163 typedef struct eptrblock {
 164   struct eptrblock *prev;
 165   const uschar *saved_eptr;
 166 } eptrblock;
 167
 168 /* Flag bits for the match() function */
 169
 170 #define match_condassert   0x01    /* Called to check a condition assertion */
 171 #define match_isgroup      0x02    /* Set if start of bracketed group */
 172
 173
 174
 175 /*************************************************
 176 *               Global variables                 *
 177 *************************************************/
 178
 179 /* PCRE is thread-clean and doesn't use any global variables in the normal
 180 sense. However, it calls memory allocation and free functions via the two
 181 indirections below, which are can be changed by the caller, but are shared
 182 between all threads. */
 183
 184 void *(*pcre_malloc)(size_t) = malloc;
 185 void  (*pcre_free)(void *) = free;
 186
 187
 188
 189 /*************************************************
 190 *    Macros and tables for character handling    *
 191 *************************************************/
 192
 193 /* When UTF-8 encoding is being used, a character is no longer just a single
 194 byte. The macros for character handling generate simple sequences when used in
 195 byte-mode, and more complicated ones for UTF-8 characters. */
 196
 197 #ifndef SUPPORT_UTF8
 198 #define GETCHARINC(c, eptr) c = *eptr++;
 199 #define GETCHARLEN(c, eptr, len) c = *eptr;
 200 #define BACKCHAR(eptr)
 201
 202 #else   /* SUPPORT_UTF8 */
 203
 204 /* Get the next UTF-8 character, advancing the pointer */
 205
 206 #define GETCHARINC(c, eptr) \
 207   c = *eptr++; \
 208   if (md->utf8 && (c & 0xc0) == 0xc0) \
 209     { \
 210     int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
 211     int s = 6 - a;                  /* Amount to shift next byte */  \
 212     c &= utf8_table3[a];            /* Low order bits from first byte */ \
 213     while (a-- > 0) \
 214       { \
 215       c |= (*eptr++ & 0x3f) << s; \
 216       s += 6; \
 217       } \
 218     }
 219
 220 /* Get the next UTF-8 character, not advancing the pointer, setting length */
 221
 222 #define GETCHARLEN(c, eptr, len) \
 223   c = *eptr; \
 224   len = 1; \
 225   if (md->utf8 && (c & 0xc0) == 0xc0) \
 226     { \
 227     int i; \
 228     int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
 229     int s = 6 - a;                  /* Amount to shift next byte */  \
 230     c &= utf8_table3[a];            /* Low order bits from first byte */ \
 231     for (i = 1; i <= a; i++) \
 232       { \
 233       c |= (eptr[i] & 0x3f) << s; \
 234       s += 6; \
 235       } \
 236     len += a; \
 237     }
 238
 239 /* If the pointer is not at the start of a character, move it back until
 240 it is. */
 241
 242 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
 243
 244 #endif
 245
 246
 247
 248 /*************************************************
 249 *             Default character tables           *
 250 *************************************************/
 251
 252 /* A default set of character tables is included in the PCRE binary. Its source
 253 is built by the maketables auxiliary program, which uses the default C ctypes
 254 functions, and put in the file chartables.c. These tables are used by PCRE
 255 whenever the caller of pcre_compile() does not provide an alternate set of
 256 tables. */
 257
 258 #include "chartables.c"
 259
 260
 261
 262 #ifdef SUPPORT_UTF8
 263 /*************************************************
 264 *           Tables for UTF-8 support             *
 265 *************************************************/
 266
 267 /* These are the breakpoints for different numbers of bytes in a UTF-8
 268 character. */
 269
 270 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
 271
 272 /* These are the indicator bits and the mask for the data bits to set in the
 273 first byte of a character, indexed by the number of additional bytes. */
 274
 275 static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
 276 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
 277
 278 /* Table of the number of extra characters, indexed by the first character
 279 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
 280 0x3d. */
 281
 282 static uschar utf8_table4[] = {
 283   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 284   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 285   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 286   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
 287
 288
 289 /*************************************************
 290 *       Convert character value to UTF-8         *
 291 *************************************************/
 292
 293 /* This function takes an integer value in the range 0 - 0x7fffffff
 294 and encodes it as a UTF-8 character in 0 to 6 bytes.
 295
 296 Arguments:
 297   cvalue     the character value
 298   buffer     pointer to buffer for result - at least 6 bytes long
 299
 300 Returns:     number of characters placed in the buffer
 301 */
 302
 303 static int
 304 ord2utf8(int cvalue, uschar *buffer)
 305 {
 306 register int i, j;
 307 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
 308   if (cvalue <= utf8_table1[i]) break;
 309 *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
 310 cvalue >>= 6 - i;
 311 for (j = 0; j < i; j++)
 312   {
 313   *buffer++ = 0x80 | (cvalue & 0x3f);
 314   cvalue >>= 6;
 315   }
 316 return i + 1;
 317 }
 318 #endif
 319
 320
 321
 322 /*************************************************
 323 *          Return version string                 *
 324 *************************************************/
 325
 326 #define STRING(a)  # a
 327 #define XSTRING(s) STRING(s)
 328
 329 const char *
 330 pcre_version(void)
 331 {
 332 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
 333 }
 334
 335
 336
 337
 338 /*************************************************
 339 * (Obsolete) Return info about compiled pattern  *
 340 *************************************************/
 341
 342 /* This is the original "info" function. It picks potentially useful data out
 343 of the private structure, but its interface was too rigid. It remains for
 344 backwards compatibility. The public options are passed back in an int - though
 345 the re->options field has been expanded to a long int, all the public options
 346 at the low end of it, and so even on 16-bit systems this will still be OK.
 347 Therefore, I haven't changed the API for pcre_info().
 348
 349 Arguments:
 350   external_re   points to compiled code
 351   optptr        where to pass back the options
 352   first_char    where to pass back the first character,
 353                 or -1 if multiline and all branches start ^,
 354                 or -2 otherwise
 355
 356 Returns:        number of capturing subpatterns
 357                 or negative values on error
 358 */
 359
 360 int
 361 pcre_info(const pcre *external_re, int *optptr, int *first_char)
 362 {
 363 const real_pcre *re = (const real_pcre *)external_re;
 364 if (re == NULL) return PCRE_ERROR_NULL;
 365 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 366 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
 367 if (first_char != NULL)
 368   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 369      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 370 return re->top_bracket;
 371 }
 372
 373
 374
 375 /*************************************************
 376 *        Return info about compiled pattern      *
 377 *************************************************/
 378
 379 /* This is a newer "info" function which has an extensible interface so
 380 that additional items can be added compatibly.
 381
 382 Arguments:
 383   external_re      points to compiled code
 384   external_study   points to study data, or NULL
 385   what             what information is required
 386   where            where to put the information
 387
 388 Returns:           0 if data returned, negative on error
 389 */
 390
 391 int
 392 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
 393   void *where)
 394 {
 395 const real_pcre *re = (const real_pcre *)external_re;
 396 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
 397
 398 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
 399 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
 400
 401 switch (what)
 402   {
 403   case PCRE_INFO_OPTIONS:
 404   *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
 405   break;
 406
 407   case PCRE_INFO_SIZE:
 408   *((size_t *)where) = re->size;
 409   break;
 410
 411   case PCRE_INFO_CAPTURECOUNT:
 412   *((int *)where) = re->top_bracket;
 413   break;
 414
 415   case PCRE_INFO_BACKREFMAX:
 416   *((int *)where) = re->top_backref;
 417   break;
 418
 419   case PCRE_INFO_FIRSTCHAR:
 420   *((int *)where) =
 421     ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
 422     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
 423   break;
 424
 425   case PCRE_INFO_FIRSTTABLE:
 426   *((const uschar **)where) =
 427     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
 428       study->start_bits : NULL;
 429   break;
 430
 431   case PCRE_INFO_LASTLITERAL:
 432   *((int *)where) =
 433     ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
 434   break;
 435
 436   default: return PCRE_ERROR_BADOPTION;
 437   }
 438
 439 return 0;
 440 }
 441
 442
 443
 444 #ifdef DEBUG
 445 /*************************************************
 446 *        Debugging function to print chars       *
 447 *************************************************/
 448
 449 /* Print a sequence of chars in printable format, stopping at the end of the
 450 subject if the requested.
 451
 452 Arguments:
 453   p           points to characters
 454   length      number to print
 455   is_subject  TRUE if printing from within md->start_subject
 456   md          pointer to matching data block, if is_subject is TRUE
 457
 458 Returns:     nothing
 459 */
 460
 461 static void
 462 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
 463 {
 464 int c;
 465 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
 466 while (length-- > 0)
 467   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
 468 }
 469 #endif
 470
 471
 472
 473
 474 /*************************************************
 475 *            Handle escapes                      *
 476 *************************************************/
 477
 478 /* This function is called when a \ has been encountered. It either returns a
 479 positive value for a simple escape such as \n, or a negative value which
 480 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
 481 a positive value greater than 255 may be returned. On entry, ptr is pointing at
 482 the \. On exit, it is on the final character of the escape sequence.
 483
 484 Arguments:
 485   ptrptr     points to the pattern position pointer
 486   errorptr   points to the pointer to the error message
 487   bracount   number of previous extracting brackets
 488   options    the options bits
 489   isclass    TRUE if inside a character class
 490   cd         pointer to char tables block
 491
 492 Returns:     zero or positive => a data character
 493              negative => a special escape sequence
 494              on error, errorptr is set
 495 */
 496
 497 static int
 498 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
 499   int options, BOOL isclass, compile_data *cd)
 500 {
 501 const uschar *ptr = *ptrptr;
 502 int c, i;
 503
 504 /* If backslash is at the end of the pattern, it's an error. */
 505
 506 c = *(++ptr);
 507 if (c == 0) *errorptr = ERR1;
 508
 509 /* Digits or letters may have special meaning; all others are literals. */
 510
 511 else if (c < '0' || c > 'z') {}
 512
 513 /* Do an initial lookup in a table. A non-zero result is something that can be
 514 returned immediately. Otherwise further processing may be required. */
 515
 516 else if ((i = escapes[c - '0']) != 0) c = i;
 517
 518 /* Escapes that need further processing, or are illegal. */
 519
 520 else
 521   {
 522   const uschar *oldptr;
 523   switch (c)
 524     {
 525     /* The handling of escape sequences consisting of a string of digits
 526     starting with one that is not zero is not straightforward. By experiment,
 527     the way Perl works seems to be as follows:
 528
 529     Outside a character class, the digits are read as a decimal number. If the
 530     number is less than 10, or if there are that many previous extracting
 531     left brackets, then it is a back reference. Otherwise, up to three octal
 532     digits are read to form an escaped byte. Thus \123 is likely to be octal
 533     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
 534     value is greater than 377, the least significant 8 bits are taken. Inside a
 535     character class, \ followed by a digit is always an octal number. */
 536
 537     case '1': case '2': case '3': case '4': case '5':
 538     case '6': case '7': case '8': case '9':
 539
 540     if (!isclass)
 541       {
 542       oldptr = ptr;
 543       c -= '0';
 544       while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
 545         c = c * 10 + *(++ptr) - '0';
 546       if (c < 10 || c <= bracount)
 547         {
 548         c = -(ESC_REF + c);
 549         break;
 550         }
 551       ptr = oldptr;      /* Put the pointer back and fall through */
 552       }
 553
 554     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
 555     generates a binary zero byte and treats the digit as a following literal.
 556     Thus we have to pull back the pointer by one. */
 557
 558     if ((c = *ptr) >= '8')
 559       {
 560       ptr--;
 561       c = 0;
 562       break;
 563       }
 564
 565     /* \0 always starts an octal number, but we may drop through to here with a
 566     larger first octal digit. */
 567
 568     case '0':
 569     c -= '0';
 570     while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
 571       ptr[1] != '8' && ptr[1] != '9')
 572         c = c * 8 + *(++ptr) - '0';
 573     c &= 255;     /* Take least significant 8 bits */
 574     break;
 575
 576     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
 577     which can be greater than 0xff, but only if the ddd are hex digits. */
 578
 579     case 'x':
 580 #ifdef SUPPORT_UTF8
 581     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
 582       {
 583       const uschar *pt = ptr + 2;
 584       register int count = 0;
 585       c = 0;
 586       while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
 587         {
 588         count++;
 589         c = c * 16 + cd->lcc[*pt] -
 590           (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
 591         pt++;
 592         }
 593       if (*pt == '}')
 594         {
 595         if (c < 0 || count > 8) *errorptr = ERR34;
 596         ptr = pt;
 597         break;
 598         }
 599       /* If the sequence of hex digits does not end with '}', then we don't
 600       recognize this construct; fall through to the normal \x handling. */
 601       }
 602 #endif
 603
 604     /* Read just a single hex char */
 605
 606     c = 0;
 607     while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
 608       {
 609       ptr++;
 610       c = c * 16 + cd->lcc[*ptr] -
 611         (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
 612       }
 613     break;
 614
 615     /* Other special escapes not starting with a digit are straightforward */
 616
 617     case 'c':
 618     c = *(++ptr);
 619     if (c == 0)
 620       {
 621       *errorptr = ERR2;
 622       return 0;
 623       }
 624
 625     /* A letter is upper-cased; then the 0x40 bit is flipped */
 626
 627     if (c >= 'a' && c <= 'z') c = cd->fcc[c];
 628     c ^= 0x40;
 629     break;
 630
 631     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
 632     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
 633     for Perl compatibility, it is a literal. This code looks a bit odd, but
 634     there used to be some cases other than the default, and there may be again
 635     in future, so I haven't "optimized" it. */
 636
 637     default:
 638     if ((options & PCRE_EXTRA) != 0) switch(c)
 639       {
 640       default:
 641       *errorptr = ERR3;
 642       break;
 643       }
 644     break;
 645     }
 646   }
 647
 648 *ptrptr = ptr;
 649 return c;
 650 }
 651
 652
 653
 654 /*************************************************
 655 *            Check for counted repeat            *
 656 *************************************************/
 657
 658 /* This function is called when a '{' is encountered in a place where it might
 659 start a quantifier. It looks ahead to see if it really is a quantifier or not.
 660 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
 661 where the ddds are digits.
 662
 663 Arguments:
 664   p         pointer to the first char after '{'
 665   cd        pointer to char tables block
 666
 667 Returns:    TRUE or FALSE
 668 */
 669
 670 static BOOL
 671 is_counted_repeat(const uschar *p, compile_data *cd)
 672 {
 673 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
 674 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
 675 if (*p == '}') return TRUE;
 676
 677 if (*p++ != ',') return FALSE;
 678 if (*p == '}') return TRUE;
 679
 680 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
 681 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
 682 return (*p == '}');
 683 }
 684
 685
 686
 687 /*************************************************
 688 *         Read repeat counts                     *
 689 *************************************************/
 690
 691 /* Read an item of the form {n,m} and return the values. This is called only
 692 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
 693 so the syntax is guaranteed to be correct, but we need to check the values.
 694
 695 Arguments:
 696   p          pointer to first char after '{'
 697   minp       pointer to int for min
 698   maxp       pointer to int for max
 699              returned as -1 if no max
 700   errorptr   points to pointer to error message
 701   cd         pointer to character tables clock
 702
 703 Returns:     pointer to '}' on success;
 704              current ptr on error, with errorptr set
 705 */
 706
 707 static const uschar *
 708 read_repeat_counts(const uschar *p, int *minp, int *maxp,
 709   const char **errorptr, compile_data *cd)
 710 {
 711 int min = 0;
 712 int max = -1;
 713
 714 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
 715
 716 if (*p == '}') max = min; else
 717   {
 718   if (*(++p) != '}')
 719     {
 720     max = 0;
 721     while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
 722     if (max < min)
 723       {
 724       *errorptr = ERR4;
 725       return p;
 726       }
 727     }
 728   }
 729
 730 /* Do paranoid checks, then fill in the required variables, and pass back the
 731 pointer to the terminating '}'. */
 732
 733 if (min > 65535 || max > 65535)
 734   *errorptr = ERR5;
 735 else
 736   {
 737   *minp = min;
 738   *maxp = max;
 739   }
 740 return p;
 741 }
 742
 743
 744
 745 /*************************************************
 746 *        Find the fixed length of a pattern      *
 747 *************************************************/
 748
 749 /* Scan a pattern and compute the fixed length of subject that will match it,
 750 if the length is fixed. This is needed for dealing with backward assertions.
 751
 752 Arguments:
 753   code     points to the start of the pattern (the bracket)
 754   options  the compiling options
 755
 756 Returns:   the fixed length, or -1 if there is no fixed length
 757 */
 758
 759 static int
 760 find_fixedlength(uschar *code, int options)
 761 {
 762 int length = -1;
 763
 764 register int branchlength = 0;
 765 register uschar *cc = code + 3;
 766
 767 /* Scan along the opcodes for this branch. If we get to the end of the
 768 branch, check the length against that of the other branches. */
 769
 770 for (;;)
 771   {
 772   int d;
 773   register int op = *cc;
 774   if (op >= OP_BRA) op = OP_BRA;
 775
 776   switch (op)
 777     {
 778     case OP_BRA:
 779     case OP_ONCE:
 780     case OP_COND:
 781     d = find_fixedlength(cc, options);
 782     if (d < 0) return -1;
 783     branchlength += d;
 784     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 785     cc += 3;
 786     break;
 787
 788     /* Reached end of a branch; if it's a ket it is the end of a nested
 789     call. If it's ALT it is an alternation in a nested call. If it is
 790     END it's the end of the outer call. All can be handled by the same code. */
 791
 792     case OP_ALT:
 793     case OP_KET:
 794     case OP_KETRMAX:
 795     case OP_KETRMIN:
 796     case OP_END:
 797     if (length < 0) length = branchlength;
 798       else if (length != branchlength) return -1;
 799     if (*cc != OP_ALT) return length;
 800     cc += 3;
 801     branchlength = 0;
 802     break;
 803
 804     /* Skip over assertive subpatterns */
 805
 806     case OP_ASSERT:
 807     case OP_ASSERT_NOT:
 808     case OP_ASSERTBACK:
 809     case OP_ASSERTBACK_NOT:
 810     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
 811     cc += 3;
 812     break;
 813
 814     /* Skip over things that don't match chars */
 815
 816     case OP_REVERSE:
 817     cc++;
 818     /* Fall through */
 819
 820     case OP_CREF:
 821     case OP_OPT:
 822     cc++;
 823     /* Fall through */
 824
 825     case OP_SOD:
 826     case OP_EOD:
 827     case OP_EODN:
 828     case OP_CIRC:
 829     case OP_DOLL:
 830     case OP_NOT_WORD_BOUNDARY:
 831     case OP_WORD_BOUNDARY:
 832     cc++;
 833     break;
 834
 835     /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
 836     This requires a scan of the string, unfortunately. We assume valid UTF-8
 837     strings, so all we do is reduce the length by one for byte whose bits are
 838     10xxxxxx. */
 839
 840     case OP_CHARS:
 841     branchlength += *(++cc);
 842 #ifdef SUPPORT_UTF8
 843     for (d = 1; d <= *cc; d++)
 844       if ((cc[d] & 0xc0) == 0x80) branchlength--;
 845 #endif
 846     cc += *cc + 1;
 847     break;
 848
 849     /* Handle exact repetitions */
 850
 851     case OP_EXACT:
 852     case OP_TYPEEXACT:
 853     branchlength += (cc[1] << 8) + cc[2];
 854     cc += 4;
 855     break;
 856
 857     /* Handle single-char matchers */
 858
 859     case OP_NOT_DIGIT:
 860     case OP_DIGIT:
 861     case OP_NOT_WHITESPACE:
 862     case OP_WHITESPACE:
 863     case OP_NOT_WORDCHAR:
 864     case OP_WORDCHAR:
 865     case OP_ANY:
 866     branchlength++;
 867     cc++;
 868     break;
 869
 870
 871     /* Check a class for variable quantification */
 872
 873     case OP_CLASS:
 874     cc += (*cc == OP_REF)? 2 : 33;
 875
 876     switch (*cc)
 877       {
 878       case OP_CRSTAR:
 879       case OP_CRMINSTAR:
 880       case OP_CRQUERY:
 881       case OP_CRMINQUERY:
 882       return -1;
 883
 884       case OP_CRRANGE:
 885       case OP_CRMINRANGE:
 886       if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
 887       branchlength += (cc[1] << 8) + cc[2];
 888       cc += 5;
 889       break;
 890
 891       default:
 892       branchlength++;
 893       }
 894     break;
 895
 896     /* Anything else is variable length */
 897
 898     default:
 899     return -1;
 900     }
 901   }
 902 /* Control never gets here */
 903 }
 904
 905
 906
 907
 908 /*************************************************
 909 *           Check for POSIX class syntax         *
 910 *************************************************/
 911
 912 /* This function is called when the sequence "[:" or "[." or "[=" is
 913 encountered in a character class. It checks whether this is followed by an
 914 optional ^ and then a sequence of letters, terminated by a matching ":]" or
 915 ".]" or "=]".
 916
 917 Argument:
 918   ptr      pointer to the initial [
 919   endptr   where to return the end pointer
 920   cd       pointer to compile data
 921
 922 Returns:   TRUE or FALSE
 923 */
 924
 925 static BOOL
 926 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
 927 {
 928 int terminator;          /* Don't combine these lines; the Solaris cc */
 929 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
 930 if (*(++ptr) == '^') ptr++;
 931 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
 932 if (*ptr == terminator && ptr[1] == ']')
 933   {
 934   *endptr = ptr;
 935   return TRUE;
 936   }
 937 return FALSE;
 938 }
 939
 940
 941
 942
 943 /*************************************************
 944 *          Check POSIX class name                *
 945 *************************************************/
 946
 947 /* This function is called to check the name given in a POSIX-style class entry
 948 such as [:alnum:].
 949
 950 Arguments:
 951   ptr        points to the first letter
 952   len        the length of the name
 953
 954 Returns:     a value representing the name, or -1 if unknown
 955 */
 956
 957 static int
 958 check_posix_name(const uschar *ptr, int len)
 959 {
 960 register int yield = 0;
 961 while (posix_name_lengths[yield] != 0)
 962   {
 963   if (len == posix_name_lengths[yield] &&
 964     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
 965   yield++;
 966   }
 967 return -1;
 968 }
 969
 970
 971
 972
 973 /*************************************************
 974 *           Compile one branch                   *
 975 *************************************************/
 976
 977 /* Scan the pattern, compiling it into the code vector.
 978
 979 Arguments:
 980   options      the option bits
 981   brackets     points to number of brackets used
 982   code         points to the pointer to the current code point
 983   ptrptr       points to the current pattern pointer
 984   errorptr     points to pointer to error message
 985   optchanged   set to the value of the last OP_OPT item compiled
 986   reqchar      set to the last literal character required, else -1
 987   countlits    set to count of mandatory literal characters
 988   cd           contains pointers to tables
 989
 990 Returns:       TRUE on success
 991                FALSE, with *errorptr set on error
 992 */
 993
 994 static BOOL
 995 compile_branch(int options, int *brackets, uschar **codeptr,
 996   const uschar **ptrptr, const char **errorptr, int *optchanged,
 997   int *reqchar, int *countlits, compile_data *cd)
 998 {
 999 int repeat_type, op_type;
1000 int repeat_min, repeat_max;
1001 int bravalue, length;
1002 int greedy_default, greedy_non_default;
1003 int prevreqchar;
1004 int condcount = 0;
1005 int subcountlits = 0;
1006 register int c;
1007 register uschar *code = *codeptr;
1008 uschar *tempcode;
1009 const uschar *ptr = *ptrptr;
1010 const uschar *tempptr;
1011 uschar *previous = NULL;
1012 uschar class[32];
1013
1014 /* Set up the default and non-default settings for greediness */
1015
1016 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017 greedy_non_default = greedy_default ^ 1;
1018
1019 /* Initialize no required char, and count of literals */
1020
1021 *reqchar = prevreqchar = -1;
1022 *countlits = 0;
1023
1024 /* Switch on next character until the end of the branch */
1025
1026 for (;; ptr++)
1027   {
1028   BOOL negate_class;
1029   int class_charcount;
1030   int class_lastchar;
1031   int newoptions;
1032   int condref;
1033   int subreqchar;
1034
1035   c = *ptr;
1036   if ((options & PCRE_EXTENDED) != 0)
1037     {
1038     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1039     if (c == '#')
1040       {
1041       /* The space before the ; is to avoid a warning on a silly compiler
1042       on the Macintosh. */
1043       while ((c = *(++ptr)) != 0 && c != '\n') ;
1044       continue;
1045       }
1046     }
1047
1048   switch(c)
1049     {
1050     /* The branch terminates at end of string, |, or ). */
1051
1052     case 0:
1053     case '|':
1054     case ')':
1055     *codeptr = code;
1056     *ptrptr = ptr;
1057     return TRUE;
1058
1059     /* Handle single-character metacharacters */
1060
1061     case '^':
1062     previous = NULL;
1063     *code++ = OP_CIRC;
1064     break;
1065
1066     case '$':
1067     previous = NULL;
1068     *code++ = OP_DOLL;
1069     break;
1070
1071     case '.':
1072     previous = code;
1073     *code++ = OP_ANY;
1074     break;
1075
1076     /* Character classes. These always build a 32-byte bitmap of the permitted
1077     characters, except in the special case where there is only one character.
1078     For negated classes, we build the map as usual, then invert it at the end.
1079     */
1080
1081     case '[':
1082     previous = code;
1083     *code++ = OP_CLASS;
1084
1085     /* If the first character is '^', set the negation flag and skip it. */
1086
1087     if ((c = *(++ptr)) == '^')
1088       {
1089       negate_class = TRUE;
1090       c = *(++ptr);
1091       }
1092     else negate_class = FALSE;
1093
1094     /* Keep a count of chars so that we can optimize the case of just a single
1095     character. */
1096
1097     class_charcount = 0;
1098     class_lastchar = -1;
1099
1100     /* Initialize the 32-char bit map to all zeros. We have to build the
1101     map in a temporary bit of store, in case the class contains only 1
1102     character, because in that case the compiled code doesn't use the
1103     bit map. */
1104
1105     memset(class, 0, 32 * sizeof(uschar));
1106
1107     /* Process characters until ] is reached. By writing this as a "do" it
1108     means that an initial ] is taken as a data character. */
1109
1110     do
1111       {
1112       if (c == 0)
1113         {
1114         *errorptr = ERR6;
1115         goto FAILED;
1116         }
1117
1118       /* Handle POSIX class names. Perl allows a negation extension of the
1119       form [:^name]. A square bracket that doesn't match the syntax is
1120       treated as a literal. We also recognize the POSIX constructions
1121       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1122       5.6 does. */
1123
1124       if (c == '[' &&
1125           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126           check_posix_syntax(ptr, &tempptr, cd))
1127         {
1128         BOOL local_negate = FALSE;
1129         int posix_class, i;
1130         register const uschar *cbits = cd->cbits;
1131
1132         if (ptr[1] != ':')
1133           {
1134           *errorptr = ERR31;
1135           goto FAILED;
1136           }
1137
1138         ptr += 2;
1139         if (*ptr == '^')
1140           {
1141           local_negate = TRUE;
1142           ptr++;
1143           }
1144
1145         posix_class = check_posix_name(ptr, tempptr - ptr);
1146         if (posix_class < 0)
1147           {
1148           *errorptr = ERR30;
1149           goto FAILED;
1150           }
1151
1152         /* If matching is caseless, upper and lower are converted to
1153         alpha. This relies on the fact that the class table starts with
1154         alpha, lower, upper as the first 3 entries. */
1155
1156         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1157           posix_class = 0;
1158
1159         /* Or into the map we are building up to 3 of the static class
1160         tables, or their negations. */
1161
1162         posix_class *= 3;
1163         for (i = 0; i < 3; i++)
1164           {
1165           int taboffset = posix_class_maps[posix_class + i];
1166           if (taboffset < 0) break;
1167           if (local_negate)
1168             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1169           else
1170             for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1171           }
1172
1173         ptr = tempptr + 1;
1174         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1175         continue;
1176         }
1177
1178       /* Backslash may introduce a single character, or it may introduce one
1179       of the specials, which just set a flag. Escaped items are checked for
1180       validity in the pre-compiling pass. The sequence \b is a special case.
1181       Inside a class (and only there) it is treated as backspace. Elsewhere
1182       it marks a word boundary. Other escapes have preset maps ready to
1183       or into the one we are building. We assume they have more than one
1184       character in them, so set class_count bigger than one. */
1185
1186       if (c == '\\')
1187         {
1188         c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1189         if (-c == ESC_b) c = '\b';
1190         else if (c < 0)
1191           {
1192           register const uschar *cbits = cd->cbits;
1193           class_charcount = 10;
1194           switch (-c)
1195             {
1196             case ESC_d:
1197             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1198             continue;
1199
1200             case ESC_D:
1201             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1202             continue;
1203
1204             case ESC_w:
1205             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1206             continue;
1207
1208             case ESC_W:
1209             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1210             continue;
1211
1212             case ESC_s:
1213             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1214             continue;
1215
1216             case ESC_S:
1217             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1218             continue;
1219
1220             default:
1221             *errorptr = ERR7;
1222             goto FAILED;
1223             }
1224           }
1225
1226         /* Fall through if single character, but don't at present allow
1227         chars > 255 in UTF-8 mode. */
1228
1229 #ifdef SUPPORT_UTF8
1230         if (c > 255)
1231           {
1232           *errorptr = ERR33;
1233           goto FAILED;
1234           }
1235 #endif
1236         }
1237
1238       /* A single character may be followed by '-' to form a range. However,
1239       Perl does not permit ']' to be the end of the range. A '-' character
1240       here is treated as a literal. */
1241
1242       if (ptr[1] == '-' && ptr[2] != ']')
1243         {
1244         int d;
1245         ptr += 2;
1246         d = *ptr;
1247
1248         if (d == 0)
1249           {
1250           *errorptr = ERR6;
1251           goto FAILED;
1252           }
1253
1254         /* The second part of a range can be a single-character escape, but
1255         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256         in such circumstances. */
1257
1258         if (d == '\\')
1259           {
1260           const uschar *oldptr = ptr;
1261           d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1262
1263 #ifdef SUPPORT_UTF8
1264           if (d > 255)
1265             {
1266             *errorptr = ERR33;
1267             goto FAILED;
1268             }
1269 #endif
1270           /* \b is backslash; any other special means the '-' was literal */
1271
1272           if (d < 0)
1273             {
1274             if (d == -ESC_b) d = '\b'; else
1275               {
1276               ptr = oldptr - 2;
1277               goto SINGLE_CHARACTER;  /* A few lines below */
1278               }
1279             }
1280           }
1281
1282         if (d < c)
1283           {
1284           *errorptr = ERR8;
1285           goto FAILED;
1286           }
1287
1288         for (; c <= d; c++)
1289           {
1290           class[c/8] |= (1 << (c&7));
1291           if ((options & PCRE_CASELESS) != 0)
1292             {
1293             int uc = cd->fcc[c];           /* flip case */
1294             class[uc/8] |= (1 << (uc&7));
1295             }
1296           class_charcount++;                /* in case a one-char range */
1297           class_lastchar = c;
1298           }
1299         continue;   /* Go get the next char in the class */
1300         }
1301
1302       /* Handle a lone single character - we can get here for a normal
1303       non-escape char, or after \ that introduces a single character. */
1304
1305       SINGLE_CHARACTER:
1306
1307       class [c/8] |= (1 << (c&7));
1308       if ((options & PCRE_CASELESS) != 0)
1309         {
1310         c = cd->fcc[c];   /* flip case */
1311         class[c/8] |= (1 << (c&7));
1312         }
1313       class_charcount++;
1314       class_lastchar = c;
1315       }
1316
1317     /* Loop until ']' reached; the check for end of string happens inside the
1318     loop. This "while" is the end of the "do" above. */
1319
1320     while ((c = *(++ptr)) != ']');
1321
1322     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1323     precisely one character. This doesn't need the whole 32-byte bit map.
1324     We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1325     it's negative. */
1326
1327     if (class_charcount == 1 && class_lastchar >= 0)
1328       {
1329       if (negate_class)
1330         {
1331         code[-1] = OP_NOT;
1332         }
1333       else
1334         {
1335         code[-1] = OP_CHARS;
1336         *code++ = 1;
1337         }
1338       *code++ = class_lastchar;
1339       }
1340
1341     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1342     the code vector. */
1343
1344     else
1345       {
1346       if (negate_class)
1347         for (c = 0; c < 32; c++) code[c] = ~class[c];
1348       else
1349         memcpy(code, class, 32);
1350       code += 32;
1351       }
1352     break;
1353
1354     /* Various kinds of repeat */
1355
1356     case '{':
1357     if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1358     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1359     if (*errorptr != NULL) goto FAILED;
1360     goto REPEAT;
1361
1362     case '*':
1363     repeat_min = 0;
1364     repeat_max = -1;
1365     goto REPEAT;
1366
1367     case '+':
1368     repeat_min = 1;
1369     repeat_max = -1;
1370     goto REPEAT;
1371
1372     case '?':
1373     repeat_min = 0;
1374     repeat_max = 1;
1375
1376     REPEAT:
1377     if (previous == NULL)
1378       {
1379       *errorptr = ERR9;
1380       goto FAILED;
1381       }
1382
1383     /* If the next character is '?' this is a minimizing repeat, by default,
1384     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1385     next character. */
1386
1387     if (ptr[1] == '?')
1388       { repeat_type = greedy_non_default; ptr++; }
1389     else repeat_type = greedy_default;
1390
1391     /* If previous was a string of characters, chop off the last one and use it
1392     as the subject of the repeat. If there was only one character, we can
1393     abolish the previous item altogether. A repeat with a zero minimum wipes
1394     out any reqchar setting, backing up to the previous value. We must also
1395     adjust the countlits value. */
1396
1397     if (*previous == OP_CHARS)
1398       {
1399       int len = previous[1];
1400
1401       if (repeat_min == 0) *reqchar = prevreqchar;
1402       *countlits += repeat_min - 1;
1403
1404       if (len == 1)
1405         {
1406         c = previous[2];
1407         code = previous;
1408         }
1409       else
1410         {
1411         c = previous[len+1];
1412         previous[1]--;
1413         code--;
1414         }
1415       op_type = 0;                 /* Use single-char op codes */
1416       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1417       }
1418
1419     /* If previous was a single negated character ([^a] or similar), we use
1420     one of the special opcodes, replacing it. The code is shared with single-
1421     character repeats by adding a suitable offset into repeat_type. */
1422
1423     else if ((int)*previous == OP_NOT)
1424       {
1425       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1426       c = previous[1];
1427       code = previous;
1428       goto OUTPUT_SINGLE_REPEAT;
1429       }
1430
1431     /* If previous was a character type match (\d or similar), abolish it and
1432     create a suitable repeat item. The code is shared with single-character
1433     repeats by adding a suitable offset into repeat_type. */
1434
1435     else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1436       {
1437       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1438       c = *previous;
1439       code = previous;
1440
1441       OUTPUT_SINGLE_REPEAT:
1442
1443       /* If the maximum is zero then the minimum must also be zero; Perl allows
1444       this case, so we do too - by simply omitting the item altogether. */
1445
1446       if (repeat_max == 0) goto END_REPEAT;
1447
1448       /* Combine the op_type with the repeat_type */
1449
1450       repeat_type += op_type;
1451
1452       /* A minimum of zero is handled either as the special case * or ?, or as
1453       an UPTO, with the maximum given. */
1454
1455       if (repeat_min == 0)
1456         {
1457         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1458           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1459         else
1460           {
1461           *code++ = OP_UPTO + repeat_type;
1462           *code++ = repeat_max >> 8;
1463           *code++ = (repeat_max & 255);
1464           }
1465         }
1466
1467       /* The case {1,} is handled as the special case + */
1468
1469       else if (repeat_min == 1 && repeat_max == -1)
1470         *code++ = OP_PLUS + repeat_type;
1471
1472       /* The case {n,n} is just an EXACT, while the general case {n,m} is
1473       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1474
1475       else
1476         {
1477         if (repeat_min != 1)
1478           {
1479           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
1480           *code++ = repeat_min >> 8;
1481           *code++ = (repeat_min & 255);
1482           }
1483
1484         /* If the mininum is 1 and the previous item was a character string,
1485         we either have to put back the item that got cancelled if the string
1486         length was 1, or add the character back onto the end of a longer
1487         string. For a character type nothing need be done; it will just get
1488         put back naturally. Note that the final character is always going to
1489         get added below. */
1490
1491         else if (*previous == OP_CHARS)
1492           {
1493           if (code == previous) code += 2; else previous[1]++;
1494           }
1495
1496         /*  For a single negated character we also have to put back the
1497         item that got cancelled. */
1498
1499         else if (*previous == OP_NOT) code++;
1500
1501         /* If the maximum is unlimited, insert an OP_STAR. */
1502
1503         if (repeat_max < 0)
1504           {
1505           *code++ = c;
1506           *code++ = OP_STAR + repeat_type;
1507           }
1508
1509         /* Else insert an UPTO if the max is greater than the min. */
1510
1511         else if (repeat_max != repeat_min)
1512           {
1513           *code++ = c;
1514           repeat_max -= repeat_min;
1515           *code++ = OP_UPTO + repeat_type;
1516           *code++ = repeat_max >> 8;
1517           *code++ = (repeat_max & 255);
1518           }
1519         }
1520
1521       /* The character or character type itself comes last in all cases. */
1522
1523       *code++ = c;
1524       }
1525
1526     /* If previous was a character class or a back reference, we put the repeat
1527     stuff after it, but just skip the item if the repeat was {0,0}. */
1528
1529     else if (*previous == OP_CLASS || *previous == OP_REF)
1530       {
1531       if (repeat_max == 0)
1532         {
1533         code = previous;
1534         goto END_REPEAT;
1535         }
1536       if (repeat_min == 0 && repeat_max == -1)
1537         *code++ = OP_CRSTAR + repeat_type;
1538       else if (repeat_min == 1 && repeat_max == -1)
1539         *code++ = OP_CRPLUS + repeat_type;
1540       else if (repeat_min == 0 && repeat_max == 1)
1541         *code++ = OP_CRQUERY + repeat_type;
1542       else
1543         {
1544         *code++ = OP_CRRANGE + repeat_type;
1545         *code++ = repeat_min >> 8;
1546         *code++ = repeat_min & 255;
1547         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
1548         *code++ = repeat_max >> 8;
1549         *code++ = repeat_max & 255;
1550         }
1551       }
1552
1553     /* If previous was a bracket group, we may have to replicate it in certain
1554     cases. */
1555
1556     else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1557              (int)*previous == OP_COND)
1558       {
1559       register int i;
1560       int ketoffset = 0;
1561       int len = code - previous;
1562       uschar *bralink = NULL;
1563
1564       /* If the maximum repeat count is unlimited, find the end of the bracket
1565       by scanning through from the start, and compute the offset back to it
1566       from the current code pointer. There may be an OP_OPT setting following
1567       the final KET, so we can't find the end just by going back from the code
1568       pointer. */
1569
1570       if (repeat_max == -1)
1571         {
1572         register uschar *ket = previous;
1573         do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1574         ketoffset = code - ket;
1575         }
1576
1577       /* The case of a zero minimum is special because of the need to stick
1578       OP_BRAZERO in front of it, and because the group appears once in the
1579       data, whereas in other cases it appears the minimum number of times. For
1580       this reason, it is simplest to treat this case separately, as otherwise
1581       the code gets far too mess. There are several special subcases when the
1582       minimum is zero. */
1583
1584       if (repeat_min == 0)
1585         {
1586         /* If we set up a required char from the bracket, we must back off
1587         to the previous value and reset the countlits value too. */
1588
1589         if (subcountlits > 0)
1590           {
1591           *reqchar = prevreqchar;
1592           *countlits -= subcountlits;
1593           }
1594
1595         /* If the maximum is also zero, we just omit the group from the output
1596         altogether. */
1597
1598         if (repeat_max == 0)
1599           {
1600           code = previous;
1601           goto END_REPEAT;
1602           }
1603
1604         /* If the maximum is 1 or unlimited, we just have to stick in the
1605         BRAZERO and do no more at this point. */
1606
1607         if (repeat_max <= 1)
1608           {
1609           memmove(previous+1, previous, len);
1610           code++;
1611           *previous++ = OP_BRAZERO + repeat_type;
1612           }
1613
1614         /* If the maximum is greater than 1 and limited, we have to replicate
1615         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1616         The first one has to be handled carefully because it's the original
1617         copy, which has to be moved up. The remainder can be handled by code
1618         that is common with the non-zero minimum case below. We just have to
1619         adjust the value or repeat_max, since one less copy is required. */
1620
1621         else
1622           {
1623           int offset;
1624           memmove(previous+4, previous, len);
1625           code += 4;
1626           *previous++ = OP_BRAZERO + repeat_type;
1627           *previous++ = OP_BRA;
1628
1629           /* We chain together the bracket offset fields that have to be
1630           filled in later when the ends of the brackets are reached. */
1631
1632           offset = (bralink == NULL)? 0 : previous - bralink;
1633           bralink = previous;
1634           *previous++ = offset >> 8;
1635           *previous++ = offset & 255;
1636           }
1637
1638         repeat_max--;
1639         }
1640
1641       /* If the minimum is greater than zero, replicate the group as many
1642       times as necessary, and adjust the maximum to the number of subsequent
1643       copies that we need. */
1644
1645       else
1646         {
1647         for (i = 1; i < repeat_min; i++)
1648           {
1649           memcpy(code, previous, len);
1650           code += len;
1651           }
1652         if (repeat_max > 0) repeat_max -= repeat_min;
1653         }
1654
1655       /* This code is common to both the zero and non-zero minimum cases. If
1656       the maximum is limited, it replicates the group in a nested fashion,
1657       remembering the bracket starts on a stack. In the case of a zero minimum,
1658       the first one was set up above. In all cases the repeat_max now specifies
1659       the number of additional copies needed. */
1660
1661       if (repeat_max >= 0)
1662         {
1663         for (i = repeat_max - 1; i >= 0; i--)
1664           {
1665           *code++ = OP_BRAZERO + repeat_type;
1666
1667           /* All but the final copy start a new nesting, maintaining the
1668           chain of brackets outstanding. */
1669
1670           if (i != 0)
1671             {
1672             int offset;
1673             *code++ = OP_BRA;
1674             offset = (bralink == NULL)? 0 : code - bralink;
1675             bralink = code;
1676             *code++ = offset >> 8;
1677             *code++ = offset & 255;
1678             }
1679
1680           memcpy(code, previous, len);
1681           code += len;
1682           }
1683
1684         /* Now chain through the pending brackets, and fill in their length
1685         fields (which are holding the chain links pro tem). */
1686
1687         while (bralink != NULL)
1688           {
1689           int oldlinkoffset;
1690           int offset = code - bralink + 1;
1691           uschar *bra = code - offset;
1692           oldlinkoffset = (bra[1] << 8) + bra[2];
1693           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1694           *code++ = OP_KET;
1695           *code++ = bra[1] = offset >> 8;
1696           *code++ = bra[2] = (offset & 255);
1697           }
1698         }
1699
1700       /* If the maximum is unlimited, set a repeater in the final copy. We
1701       can't just offset backwards from the current code point, because we
1702       don't know if there's been an options resetting after the ket. The
1703       correct offset was computed above. */
1704
1705       else code[-ketoffset] = OP_KETRMAX + repeat_type;
1706       }
1707
1708     /* Else there's some kind of shambles */
1709
1710     else
1711       {
1712       *errorptr = ERR11;
1713       goto FAILED;
1714       }
1715
1716     /* In all case we no longer have a previous item. */
1717
1718     END_REPEAT:
1719     previous = NULL;
1720     break;
1721
1722
1723     /* Start of nested bracket sub-expression, or comment or lookahead or
1724     lookbehind or option setting or condition. First deal with special things
1725     that can come after a bracket; all are introduced by ?, and the appearance
1726     of any of them means that this is not a referencing group. They were
1727     checked for validity in the first pass over the string, so we don't have to
1728     check for syntax errors here.  */
1729
1730     case '(':
1731     newoptions = options;
1732     condref = -1;
1733
1734     if (*(++ptr) == '?')
1735       {
1736       int set, unset;
1737       int *optset;
1738
1739       switch (*(++ptr))
1740         {
1741         case '#':                 /* Comment; skip to ket */
1742         ptr++;
1743         while (*ptr != ')') ptr++;
1744         continue;
1745
1746         case ':':                 /* Non-extracting bracket */
1747         bravalue = OP_BRA;
1748         ptr++;
1749         break;
1750
1751         case '(':
1752         bravalue = OP_COND;       /* Conditional group */
1753         if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1754           {
1755           condref = *ptr - '0';
1756           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1757           if (condref == 0)
1758             {
1759             *errorptr = ERR35;
1760             goto FAILED;
1761             }
1762           ptr++;
1763           }
1764         else ptr--;
1765         break;
1766
1767         case '=':                 /* Positive lookahead */
1768         bravalue = OP_ASSERT;
1769         ptr++;
1770         break;
1771
1772         case '!':                 /* Negative lookahead */
1773         bravalue = OP_ASSERT_NOT;
1774         ptr++;
1775         break;
1776
1777         case '<':                 /* Lookbehinds */
1778         switch (*(++ptr))
1779           {
1780           case '=':               /* Positive lookbehind */
1781           bravalue = OP_ASSERTBACK;
1782           ptr++;
1783           break;
1784
1785           case '!':               /* Negative lookbehind */
1786           bravalue = OP_ASSERTBACK_NOT;
1787           ptr++;
1788           break;
1789
1790           default:                /* Syntax error */
1791           *errorptr = ERR24;
1792           goto FAILED;
1793           }
1794         break;
1795
1796         case '>':                 /* One-time brackets */
1797         bravalue = OP_ONCE;
1798         ptr++;
1799         break;
1800
1801         case 'R':                 /* Pattern recursion */
1802         *code++ = OP_RECURSE;
1803         ptr++;
1804         continue;
1805
1806         default:                  /* Option setting */
1807         set = unset = 0;
1808         optset = &set;
1809
1810         while (*ptr != ')' && *ptr != ':')
1811           {
1812           switch (*ptr++)
1813             {
1814             case '-': optset = &unset; break;
1815
1816             case 'i': *optset |= PCRE_CASELESS; break;
1817             case 'm': *optset |= PCRE_MULTILINE; break;
1818             case 's': *optset |= PCRE_DOTALL; break;
1819             case 'x': *optset |= PCRE_EXTENDED; break;
1820             case 'U': *optset |= PCRE_UNGREEDY; break;
1821             case 'X': *optset |= PCRE_EXTRA; break;
1822
1823             default:
1824             *errorptr = ERR12;
1825             goto FAILED;
1826             }
1827           }
1828
1829         /* Set up the changed option bits, but don't change anything yet. */
1830
1831         newoptions = (options | set) & (~unset);
1832
1833         /* If the options ended with ')' this is not the start of a nested
1834         group with option changes, so the options change at this level. At top
1835         level there is nothing else to be done (the options will in fact have
1836         been set from the start of compiling as a result of the first pass) but
1837         at an inner level we must compile code to change the ims options if
1838         necessary, and pass the new setting back so that it can be put at the
1839         start of any following branches, and when this group ends, a resetting
1840         item can be compiled. */
1841
1842         if (*ptr == ')')
1843           {
1844           if ((options & PCRE_INGROUP) != 0 &&
1845               (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1846             {
1847             *code++ = OP_OPT;
1848             *code++ = *optchanged = newoptions & PCRE_IMS;
1849             }
1850           options = newoptions;  /* Change options at this level */
1851           previous = NULL;       /* This item can't be repeated */
1852           continue;              /* It is complete */
1853           }
1854
1855         /* If the options ended with ':' we are heading into a nested group
1856         with possible change of options. Such groups are non-capturing and are
1857         not assertions of any kind. All we need to do is skip over the ':';
1858         the newoptions value is handled below. */
1859
1860         bravalue = OP_BRA;
1861         ptr++;
1862         }
1863       }
1864
1865     /* Else we have a referencing group; adjust the opcode. */
1866
1867     else
1868       {
1869       if (++(*brackets) > EXTRACT_MAX)
1870         {
1871         *errorptr = ERR13;
1872         goto FAILED;
1873         }
1874       bravalue = OP_BRA + *brackets;
1875       }
1876
1877     /* Process nested bracketed re. Assertions may not be repeated, but other
1878     kinds can be. We copy code into a non-register variable in order to be able
1879     to pass its address because some compilers complain otherwise. Pass in a
1880     new setting for the ims options if they have changed. */
1881
1882     previous = (bravalue >= OP_ONCE)? code : NULL;
1883     *code = bravalue;
1884     tempcode = code;
1885
1886     if (!compile_regex(
1887          options | PCRE_INGROUP,       /* Set for all nested groups */
1888          ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1889            newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1890          brackets,                     /* Bracket level */
1891          &tempcode,                    /* Where to put code (updated) */
1892          &ptr,                         /* Input pointer (updated) */
1893          errorptr,                     /* Where to put an error message */
1894          (bravalue == OP_ASSERTBACK ||
1895           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1896          condref,                      /* Condition reference number */
1897          &subreqchar,                  /* For possible last char */
1898          &subcountlits,                /* For literal count */
1899          cd))                          /* Tables block */
1900       goto FAILED;
1901
1902     /* At the end of compiling, code is still pointing to the start of the
1903     group, while tempcode has been updated to point past the end of the group
1904     and any option resetting that may follow it. The pattern pointer (ptr)
1905     is on the bracket. */
1906
1907     /* If this is a conditional bracket, check that there are no more than
1908     two branches in the group. */
1909
1910     if (bravalue == OP_COND)
1911       {
1912       uschar *tc = code;
1913       condcount = 0;
1914
1915       do {
1916          condcount++;
1917          tc += (tc[1] << 8) | tc[2];
1918          }
1919       while (*tc != OP_KET);
1920
1921       if (condcount > 2)
1922         {
1923         *errorptr = ERR27;
1924         goto FAILED;
1925         }
1926       }
1927
1928     /* Handle updating of the required character. If the subpattern didn't
1929     set one, leave it as it was. Otherwise, update it for normal brackets of
1930     all kinds, forward assertions, and conditions with two branches. Don't
1931     update the literal count for forward assertions, however. If the bracket
1932     is followed by a quantifier with zero repeat, we have to back off. Hence
1933     the definition of prevreqchar and subcountlits outside the main loop so
1934     that they can be accessed for the back off. */
1935
1936     if (subreqchar > 0 &&
1937          (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1938          (bravalue == OP_COND && condcount == 2)))
1939       {
1940       prevreqchar = *reqchar;
1941       *reqchar = subreqchar;
1942       if (bravalue != OP_ASSERT) *countlits += subcountlits;
1943       }
1944
1945     /* Now update the main code pointer to the end of the group. */
1946
1947     code = tempcode;
1948
1949     /* Error if hit end of pattern */
1950
1951     if (*ptr != ')')
1952       {
1953       *errorptr = ERR14;
1954       goto FAILED;
1955       }
1956     break;
1957
1958     /* Check \ for being a real metacharacter; if not, fall through and handle
1959     it as a data character at the start of a string. Escape items are checked
1960     for validity in the pre-compiling pass. */
1961
1962     case '\\':
1963     tempptr = ptr;
1964     c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1965
1966     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1967     are arranged to be the negation of the corresponding OP_values. For the
1968     back references, the values are ESC_REF plus the reference number. Only
1969     back references and those types that consume a character may be repeated.
1970     We can test for values between ESC_b and ESC_Z for the latter; this may
1971     have to change if any new ones are ever created. */
1972
1973     if (c < 0)
1974       {
1975       if (-c >= ESC_REF)
1976         {
1977         previous = code;
1978         *code++ = OP_REF;
1979         *code++ = -c - ESC_REF;
1980         }
1981       else
1982         {
1983         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1984         *code++ = -c;
1985         }
1986       continue;
1987       }
1988
1989     /* Data character: reset and fall through */
1990
1991     ptr = tempptr;
1992     c = '\\';
1993
1994     /* Handle a run of data characters until a metacharacter is encountered.
1995     The first character is guaranteed not to be whitespace or # when the
1996     extended flag is set. */
1997
1998     NORMAL_CHAR:
1999     default:
2000     previous = code;
2001     *code = OP_CHARS;
2002     code += 2;
2003     length = 0;
2004
2005     do
2006       {
2007       if ((options & PCRE_EXTENDED) != 0)
2008         {
2009         if ((cd->ctypes[c] & ctype_space) != 0) continue;
2010         if (c == '#')
2011           {
2012           /* The space before the ; is to avoid a warning on a silly compiler
2013           on the Macintosh. */
2014           while ((c = *(++ptr)) != 0 && c != '\n') ;
2015           if (c == 0) break;
2016           continue;
2017           }
2018         }
2019
2020       /* Backslash may introduce a data char or a metacharacter. Escaped items
2021       are checked for validity in the pre-compiling pass. Stop the string
2022       before a metaitem. */
2023
2024       if (c == '\\')
2025         {
2026         tempptr = ptr;
2027         c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2028         if (c < 0) { ptr = tempptr; break; }
2029
2030         /* If a character is > 127 in UTF-8 mode, we have to turn it into
2031         two or more characters in the UTF-8 encoding. */
2032
2033 #ifdef SUPPORT_UTF8
2034         if (c > 127 && (options & PCRE_UTF8) != 0)
2035           {
2036           uschar buffer[8];
2037           int len = ord2utf8(c, buffer);
2038           for (c = 0; c < len; c++) *code++ = buffer[c];
2039           length += len;
2040           continue;
2041           }
2042 #endif
2043         }
2044
2045       /* Ordinary character or single-char escape */
2046
2047       *code++ = c;
2048       length++;
2049       }
2050
2051     /* This "while" is the end of the "do" above. */
2052
2053     while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2054
2055     /* Update the last character and the count of literals */
2056
2057     prevreqchar = (length > 1)? code[-2] : *reqchar;
2058     *reqchar = code[-1];
2059     *countlits += length;
2060
2061     /* Compute the length and set it in the data vector, and advance to
2062     the next state. */
2063
2064     previous[1] = length;
2065     if (length < MAXLIT) ptr--;
2066     break;
2067     }
2068   }                   /* end of big loop */
2069
2070 /* Control never reaches here by falling through, only by a goto for all the
2071 error states. Pass back the position in the pattern so that it can be displayed
2072 to the user for diagnosing the error. */
2073
2074 FAILED:
2075 *ptrptr = ptr;
2076 return FALSE;
2077 }
2078
2079
2080
2081
2082 /*************************************************
2083 *     Compile sequence of alternatives           *
2084 *************************************************/
2085
2086 /* On entry, ptr is pointing past the bracket character, but on return
2087 it points to the closing bracket, or vertical bar, or end of string.
2088 The code variable is pointing at the byte into which the BRA operator has been
2089 stored. If the ims options are changed at the start (for a (?ims: group) or
2090 during any branch, we need to insert an OP_OPT item at the start of every
2091 following branch to ensure they get set correctly at run time, and also pass
2092 the new options into every subsequent branch compile.
2093
2094 Argument:
2095   options     the option bits
2096   optchanged  new ims options to set as if (?ims) were at the start, or -1
2097                for no change
2098   brackets    -> int containing the number of extracting brackets used
2099   codeptr     -> the address of the current code pointer
2100   ptrptr      -> the address of the current pattern pointer
2101   errorptr    -> pointer to error message
2102   lookbehind  TRUE if this is a lookbehind assertion
2103   condref     >= 0 for OPT_CREF setting at start of conditional group
2104   reqchar     -> place to put the last required character, or a negative number
2105   countlits   -> place to put the shortest literal count of any branch
2106   cd          points to the data block with tables pointers
2107
2108 Returns:      TRUE on success
2109 */
2110
2111 static BOOL
2112 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2113   const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2114   int *reqchar, int *countlits, compile_data *cd)
2115 {
2116 const uschar *ptr = *ptrptr;
2117 uschar *code = *codeptr;
2118 uschar *last_branch = code;
2119 uschar *start_bracket = code;
2120 uschar *reverse_count = NULL;
2121 int oldoptions = options & PCRE_IMS;
2122 int branchreqchar, branchcountlits;
2123
2124 *reqchar = -1;
2125 *countlits = INT_MAX;
2126 code += 3;
2127
2128 /* At the start of a reference-based conditional group, insert the reference
2129 number as an OP_CREF item. */
2130
2131 if (condref >= 0)
2132   {
2133   *code++ = OP_CREF;
2134   *code++ = condref;
2135   }
2136
2137 /* Loop for each alternative branch */
2138
2139 for (;;)
2140   {
2141   int length;
2142
2143   /* Handle change of options */
2144
2145   if (optchanged >= 0)
2146     {
2147     *code++ = OP_OPT;
2148     *code++ = optchanged;
2149     options = (options & ~PCRE_IMS) | optchanged;
2150     }
2151
2152   /* Set up dummy OP_REVERSE if lookbehind assertion */
2153
2154   if (lookbehind)
2155     {
2156     *code++ = OP_REVERSE;
2157     reverse_count = code;
2158     *code++ = 0;
2159     *code++ = 0;
2160     }
2161
2162   /* Now compile the branch */
2163
2164   if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2165       &branchreqchar, &branchcountlits, cd))
2166     {
2167     *ptrptr = ptr;
2168     return FALSE;
2169     }
2170
2171   /* Fill in the length of the last branch */
2172
2173   length = code - last_branch;
2174   last_branch[1] = length >> 8;
2175   last_branch[2] = length & 255;
2176
2177   /* Save the last required character if all branches have the same; a current
2178   value of -1 means unset, while -2 means "previous branch had no last required
2179   char".  */
2180
2181   if (*reqchar != -2)
2182     {
2183     if (branchreqchar >= 0)
2184       {
2185       if (*reqchar == -1) *reqchar = branchreqchar;
2186       else if (*reqchar != branchreqchar) *reqchar = -2;
2187       }
2188     else *reqchar = -2;
2189     }
2190
2191   /* Keep the shortest literal count */
2192
2193   if (branchcountlits < *countlits) *countlits = branchcountlits;
2194   DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2195
2196   /* If lookbehind, check that this branch matches a fixed-length string,
2197   and put the length into the OP_REVERSE item. Temporarily mark the end of
2198   the branch with OP_END. */
2199
2200   if (lookbehind)
2201     {
2202     *code = OP_END;
2203     length = find_fixedlength(last_branch, options);
2204     DPRINTF(("fixed length = %d\n", length));
2205     if (length < 0)
2206       {
2207       *errorptr = ERR25;
2208       *ptrptr = ptr;
2209       return FALSE;
2210       }
2211     reverse_count[0] = (length >> 8);
2212     reverse_count[1] = length & 255;
2213     }
2214
2215   /* Reached end of expression, either ')' or end of pattern. Insert a
2216   terminating ket and the length of the whole bracketed item, and return,
2217   leaving the pointer at the terminating char. If any of the ims options
2218   were changed inside the group, compile a resetting op-code following. */
2219
2220   if (*ptr != '|')
2221     {
2222     length = code - start_bracket;
2223     *code++ = OP_KET;
2224     *code++ = length >> 8;
2225     *code++ = length & 255;
2226     if (optchanged >= 0)
2227       {
2228       *code++ = OP_OPT;
2229       *code++ = oldoptions;
2230       }
2231     *codeptr = code;
2232     *ptrptr = ptr;
2233     return TRUE;
2234     }
2235
2236   /* Another branch follows; insert an "or" node and advance the pointer. */
2237
2238   *code = OP_ALT;
2239   last_branch = code;
2240   code += 3;
2241   ptr++;
2242   }
2243 /* Control never reaches here */
2244 }
2245
2246
2247
2248
2249 /*************************************************
2250 *      Find first significant op code            *
2251 *************************************************/
2252
2253 /* This is called by several functions that scan a compiled expression looking
2254 for a fixed first character, or an anchoring op code etc. It skips over things
2255 that do not influence this. For one application, a change of caseless option is
2256 important.
2257
2258 Arguments:
2259   code       pointer to the start of the group
2260   options    pointer to external options
2261   optbit     the option bit whose changing is significant, or
2262              zero if none are
2263   optstop    TRUE to return on option change, otherwise change the options
2264                value and continue
2265
2266 Returns:     pointer to the first significant opcode
2267 */
2268
2269 static const uschar*
2270 first_significant_code(const uschar *code, int *options, int optbit,
2271   BOOL optstop)
2272 {
2273 for (;;)
2274   {
2275   switch ((int)*code)
2276     {
2277     case OP_OPT:
2278     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2279       {
2280       if (optstop) return code;
2281       *options = (int)code[1];
2282       }
2283     code += 2;
2284     break;
2285
2286     case OP_CREF:
2287     code += 2;
2288     break;
2289
2290     case OP_WORD_BOUNDARY:
2291     case OP_NOT_WORD_BOUNDARY:
2292     code++;
2293     break;
2294
2295     case OP_ASSERT_NOT:
2296     case OP_ASSERTBACK:
2297     case OP_ASSERTBACK_NOT:
2298     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2299     code += 3;
2300     break;
2301
2302     default:
2303     return code;
2304     }
2305   }
2306 /* Control never reaches here */
2307 }
2308
2309
2310
2311
2312 /*************************************************
2313 *          Check for anchored expression         *
2314 *************************************************/
2315
2316 /* Try to find out if this is an anchored regular expression. Consider each
2317 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2318 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2319 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2320 counts, since OP_CIRC can match in the middle.
2321
2322 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2323 because that will try the rest of the pattern at all possible matching points,
2324 so there is no point trying them again.
2325
2326 Arguments:
2327   code       points to start of expression (the bracket)
2328   options    points to the options setting
2329
2330 Returns:     TRUE or FALSE
2331 */
2332
2333 static BOOL
2334 is_anchored(register const uschar *code, int *options)
2335 {
2336 do {
2337    const uschar *scode = first_significant_code(code + 3, options,
2338      PCRE_MULTILINE, FALSE);
2339    register int op = *scode;
2340    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2341      { if (!is_anchored(scode, options)) return FALSE; }
2342    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2343             (*options & PCRE_DOTALL) != 0)
2344      { if (scode[1] != OP_ANY) return FALSE; }
2345    else if (op != OP_SOD &&
2346            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2347      return FALSE;
2348    code += (code[1] << 8) + code[2];
2349    }
2350 while (*code == OP_ALT);
2351 return TRUE;
2352 }
2353
2354
2355
2356 /*************************************************
2357 *         Check for starting with ^ or .*        *
2358 *************************************************/
2359
2360 /* This is called to find out if every branch starts with ^ or .* so that
2361 "first char" processing can be done to speed things up in multiline
2362 matching and for non-DOTALL patterns that start with .* (which must start at
2363 the beginning or after \n).
2364
2365 Argument:  points to start of expression (the bracket)
2366 Returns:   TRUE or FALSE
2367 */
2368
2369 static BOOL
2370 is_startline(const uschar *code)
2371 {
2372 do {
2373    const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2374    register int op = *scode;
2375    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2376      { if (!is_startline(scode)) return FALSE; }
2377    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2378      { if (scode[1] != OP_ANY) return FALSE; }
2379    else if (op != OP_CIRC) return FALSE;
2380    code += (code[1] << 8) + code[2];
2381    }
2382 while (*code == OP_ALT);
2383 return TRUE;
2384 }
2385
2386
2387
2388 /*************************************************
2389 *          Check for fixed first char            *
2390 *************************************************/
2391
2392 /* Try to find out if there is a fixed first character. This is called for
2393 unanchored expressions, as it speeds up their processing quite considerably.
2394 Consider each alternative branch. If they all start with the same char, or with
2395 a bracket all of whose alternatives start with the same char (recurse ad lib),
2396 then we return that char, otherwise -1.
2397
2398 Arguments:
2399   code       points to start of expression (the bracket)
2400   options    pointer to the options (used to check casing changes)
2401
2402 Returns:     -1 or the fixed first char
2403 */
2404
2405 static int
2406 find_firstchar(const uschar *code, int *options)
2407 {
2408 register int c = -1;
2409 do {
2410    int d;
2411    const uschar *scode = first_significant_code(code + 3, options,
2412      PCRE_CASELESS, TRUE);
2413    register int op = *scode;
2414
2415    if (op >= OP_BRA) op = OP_BRA;
2416
2417    switch(op)
2418      {
2419      default:
2420      return -1;
2421
2422      case OP_BRA:
2423      case OP_ASSERT:
2424      case OP_ONCE:
2425      case OP_COND:
2426      if ((d = find_firstchar(scode, options)) < 0) return -1;
2427      if (c < 0) c = d; else if (c != d) return -1;
2428      break;
2429
2430      case OP_EXACT:       /* Fall through */
2431      scode++;
2432
2433      case OP_CHARS:       /* Fall through */
2434      scode++;
2435
2436      case OP_PLUS:
2437      case OP_MINPLUS:
2438      if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2439      break;
2440      }
2441
2442    code += (code[1] << 8) + code[2];
2443    }
2444 while (*code == OP_ALT);
2445 return c;
2446 }
2447
2448
2449
2450
2451
2452 /*************************************************
2453 *        Compile a Regular Expression            *
2454 *************************************************/
2455
2456 /* This function takes a string and returns a pointer to a block of store
2457 holding a compiled version of the expression.
2458
2459 Arguments:
2460   pattern      the regular expression
2461   options      various option bits
2462   errorptr     pointer to pointer to error text
2463   erroroffset  ptr offset in pattern where error was detected
2464   tables       pointer to character tables or NULL
2465
2466 Returns:       pointer to compiled data block, or NULL on error,
2467                with errorptr and erroroffset set
2468 */
2469
2470 pcre *
2471 pcre_compile(const char *pattern, int options, const char **errorptr,
2472   int *erroroffset, const unsigned char *tables)
2473 {
2474 real_pcre *re;
2475 int length = 3;      /* For initial BRA plus length */
2476 int runlength;
2477 int c, reqchar, countlits;
2478 int bracount = 0;
2479 int top_backref = 0;
2480 int branch_extra = 0;
2481 int branch_newextra;
2482 unsigned int brastackptr = 0;
2483 size_t size;
2484 uschar *code;
2485 const uschar *ptr;
2486 compile_data compile_block;
2487 int brastack[BRASTACK_SIZE];
2488 uschar bralenstack[BRASTACK_SIZE];
2489 const size_t pattern_length = strlen(pattern);
2490
2491 #ifdef DEBUG
2492 uschar *code_base, *code_end;
2493 #endif
2494
2495 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2496
2497 #ifndef SUPPORT_UTF8
2498 if ((options & PCRE_UTF8) != 0)
2499   {
2500   *errorptr = ERR32;
2501   return NULL;
2502   }
2503 #endif
2504
2505 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2506 can do is just return NULL. */
2507
2508 if (errorptr == NULL) return NULL;
2509 *errorptr = NULL;
2510
2511 /* However, we can give a message for this error */
2512
2513 if (erroroffset == NULL)
2514   {
2515   *errorptr = ERR16;
2516   return NULL;
2517   }
2518 *erroroffset = 0;
2519
2520 if ((options & ~PUBLIC_OPTIONS) != 0)
2521   {
2522   *errorptr = ERR17;
2523   return NULL;
2524   }
2525
2526 /* Set up pointers to the individual character tables */
2527
2528 if (tables == NULL) tables = pcre_default_tables;
2529 compile_block.lcc = tables + lcc_offset;
2530 compile_block.fcc = tables + fcc_offset;
2531 compile_block.cbits = tables + cbits_offset;
2532 compile_block.ctypes = tables + ctypes_offset;
2533
2534 /* Reflect pattern for debugging output */
2535
2536 DPRINTF(("------------------------------------------------------------------\n"));
2537 DPRINTF(("%s\n", pattern));
2538
2539 /* The first thing to do is to make a pass over the pattern to compute the
2540 amount of store required to hold the compiled code. This does not have to be
2541 perfect as long as errors are overestimates. At the same time we can detect any
2542 internal flag settings. Make an attempt to correct for any counted white space
2543 if an "extended" flag setting appears late in the pattern. We can't be so
2544 clever for #-comments. */
2545
2546 ptr = (const uschar *)(pattern - 1);
2547 while ((c = *(++ptr)) != 0)
2548   {
2549   int min, max;
2550   int class_charcount;
2551
2552   if ((options & PCRE_EXTENDED) != 0)
2553     {
2554     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2555     if (c == '#')
2556       {
2557       /* The space before the ; is to avoid a warning on a silly compiler
2558       on the Macintosh. */
2559       while ((c = *(++ptr)) != 0 && c != '\n') ;
2560       continue;
2561       }
2562     }
2563
2564   switch(c)
2565     {
2566     /* A backslashed item may be an escaped "normal" character or a
2567     character type. For a "normal" character, put the pointers and
2568     character back so that tests for whitespace etc. in the input
2569     are done correctly. */
2570
2571     case '\\':
2572       {
2573       const uschar *save_ptr = ptr;
2574       c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2575       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2576       if (c >= 0)
2577         {
2578         ptr = save_ptr;
2579         c = '\\';
2580         goto NORMAL_CHAR;
2581         }
2582       }
2583     length++;
2584
2585     /* A back reference needs an additional char, plus either one or 5
2586     bytes for a repeat. We also need to keep the value of the highest
2587     back reference. */
2588
2589     if (c <= -ESC_REF)
2590       {
2591       int refnum = -c - ESC_REF;
2592       if (refnum > top_backref) top_backref = refnum;
2593       length++;   /* For single back reference */
2594       if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2595         {
2596         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2597         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2598         if ((min == 0 && (max == 1 || max == -1)) ||
2599           (min == 1 && max == -1))
2600             length++;
2601         else length += 5;
2602         if (ptr[1] == '?') ptr++;
2603         }
2604       }
2605     continue;
2606
2607     case '^':
2608     case '.':
2609     case '$':
2610     case '*':     /* These repeats won't be after brackets; */
2611     case '+':     /* those are handled separately */
2612     case '?':
2613     length++;
2614     continue;
2615
2616     /* This covers the cases of repeats after a single char, metachar, class,
2617     or back reference. */
2618
2619     case '{':
2620     if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2621     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2622     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2623     if ((min == 0 && (max == 1 || max == -1)) ||
2624       (min == 1 && max == -1))
2625         length++;
2626     else
2627       {
2628       length--;   /* Uncount the original char or metachar */
2629       if (min == 1) length++; else if (min > 0) length += 4;
2630       if (max > 0) length += 4; else length += 2;
2631       }
2632     if (ptr[1] == '?') ptr++;
2633     continue;
2634
2635     /* An alternation contains an offset to the next branch or ket. If any ims
2636     options changed in the previous branch(es), and/or if we are in a
2637     lookbehind assertion, extra space will be needed at the start of the
2638     branch. This is handled by branch_extra. */
2639
2640     case '|':
2641     length += 3 + branch_extra;
2642     continue;
2643
2644     /* A character class uses 33 characters. Don't worry about character types
2645     that aren't allowed in classes - they'll get picked up during the compile.
2646     A character class that contains only one character uses 2 or 3 bytes,
2647     depending on whether it is negated or not. Notice this where we can. */
2648
2649     case '[':
2650     class_charcount = 0;
2651     if (*(++ptr) == '^') ptr++;
2652     do
2653       {
2654       if (*ptr == '\\')
2655         {
2656         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2657           &compile_block);
2658         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2659         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2660         }
2661       else class_charcount++;
2662       ptr++;
2663       }
2664     while (*ptr != 0 && *ptr != ']');
2665
2666     /* Repeats for negated single chars are handled by the general code */
2667
2668     if (class_charcount == 1) length += 3; else
2669       {
2670       length += 33;
2671
2672       /* A repeat needs either 1 or 5 bytes. */
2673
2674       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2675         {
2676         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2677         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2678         if ((min == 0 && (max == 1 || max == -1)) ||
2679           (min == 1 && max == -1))
2680             length++;
2681         else length += 5;
2682         if (ptr[1] == '?') ptr++;
2683         }
2684       }
2685     continue;
2686
2687     /* Brackets may be genuine groups or special things */
2688
2689     case '(':
2690     branch_newextra = 0;
2691
2692     /* Handle special forms of bracket, which all start (? */
2693
2694     if (ptr[1] == '?')
2695       {
2696       int set, unset;
2697       int *optset;
2698
2699       switch (c = ptr[2])
2700         {
2701         /* Skip over comments entirely */
2702         case '#':
2703         ptr += 3;
2704         while (*ptr != 0 && *ptr != ')') ptr++;
2705         if (*ptr == 0)
2706           {
2707           *errorptr = ERR18;
2708           goto PCRE_ERROR_RETURN;
2709           }
2710         continue;
2711
2712         /* Non-referencing groups and lookaheads just move the pointer on, and
2713         then behave like a non-special bracket, except that they don't increment
2714         the count of extracting brackets. Ditto for the "once only" bracket,
2715         which is in Perl from version 5.005. */
2716
2717         case ':':
2718         case '=':
2719         case '!':
2720         case '>':
2721         ptr += 2;
2722         break;
2723
2724         /* A recursive call to the regex is an extension, to provide the
2725         facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2726
2727         case 'R':
2728         if (ptr[3] != ')')
2729           {
2730           *errorptr = ERR29;
2731           goto PCRE_ERROR_RETURN;
2732           }
2733         ptr += 3;
2734         length += 1;
2735         break;
2736
2737         /* Lookbehinds are in Perl from version 5.005 */
2738
2739         case '<':
2740         if (ptr[3] == '=' || ptr[3] == '!')
2741           {
2742           ptr += 3;
2743           branch_newextra = 3;
2744           length += 3;         /* For the first branch */
2745           break;
2746           }
2747         *errorptr = ERR24;
2748         goto PCRE_ERROR_RETURN;
2749
2750         /* Conditionals are in Perl from version 5.005. The bracket must either
2751         be followed by a number (for bracket reference) or by an assertion
2752         group. */
2753
2754         case '(':
2755         if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2756           {
2757           ptr += 4;
2758           length += 2;
2759           while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2760           if (*ptr != ')')
2761             {
2762             *errorptr = ERR26;
2763             goto PCRE_ERROR_RETURN;
2764             }
2765           }
2766         else   /* An assertion must follow */
2767           {
2768           ptr++;   /* Can treat like ':' as far as spacing is concerned */
2769           if (ptr[2] != '?' ||
2770              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2771             {
2772             ptr += 2;    /* To get right offset in message */
2773             *errorptr = ERR28;
2774             goto PCRE_ERROR_RETURN;
2775             }
2776           }
2777         break;
2778
2779         /* Else loop checking valid options until ) is met. Anything else is an
2780         error. If we are without any brackets, i.e. at top level, the settings
2781         act as if specified in the options, so massage the options immediately.
2782         This is for backward compatibility with Perl 5.004. */
2783
2784         default:
2785         set = unset = 0;
2786         optset = &set;
2787         ptr += 2;
2788
2789         for (;; ptr++)
2790           {
2791           c = *ptr;
2792           switch (c)
2793             {
2794             case 'i':
2795             *optset |= PCRE_CASELESS;
2796             continue;
2797
2798             case 'm':
2799             *optset |= PCRE_MULTILINE;
2800             continue;
2801
2802             case 's':
2803             *optset |= PCRE_DOTALL;
2804             continue;
2805
2806             case 'x':
2807             *optset |= PCRE_EXTENDED;
2808             continue;
2809
2810             case 'X':
2811             *optset |= PCRE_EXTRA;
2812             continue;
2813
2814             case 'U':
2815             *optset |= PCRE_UNGREEDY;
2816             continue;
2817
2818             case '-':
2819             optset = &unset;
2820             continue;
2821
2822             /* A termination by ')' indicates an options-setting-only item;
2823             this is global at top level; otherwise nothing is done here and
2824             it is handled during the compiling process on a per-bracket-group
2825             basis. */
2826
2827             case ')':
2828             if (brastackptr == 0)
2829               {
2830               options = (options | set) & (~unset);
2831               set = unset = 0;     /* To save length */
2832               }
2833             /* Fall through */
2834
2835             /* A termination by ':' indicates the start of a nested group with
2836             the given options set. This is again handled at compile time, but
2837             we must allow for compiled space if any of the ims options are
2838             set. We also have to allow for resetting space at the end of
2839             the group, which is why 4 is added to the length and not just 2.
2840             If there are several changes of options within the same group, this
2841             will lead to an over-estimate on the length, but this shouldn't
2842             matter very much. We also have to allow for resetting options at
2843             the start of any alternations, which we do by setting
2844             branch_newextra to 2. Finally, we record whether the case-dependent
2845             flag ever changes within the regex. This is used by the "required
2846             character" code. */
2847
2848             case ':':
2849             if (((set|unset) & PCRE_IMS) != 0)
2850               {
2851               length += 4;
2852               branch_newextra = 2;
2853               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2854               }
2855             goto END_OPTIONS;
2856
2857             /* Unrecognized option character */
2858
2859             default:
2860             *errorptr = ERR12;
2861             goto PCRE_ERROR_RETURN;
2862             }
2863           }
2864
2865         /* If we hit a closing bracket, that's it - this is a freestanding
2866         option-setting. We need to ensure that branch_extra is updated if
2867         necessary. The only values branch_newextra can have here are 0 or 2.
2868         If the value is 2, then branch_extra must either be 2 or 5, depending
2869         on whether this is a lookbehind group or not. */
2870
2871         END_OPTIONS:
2872         if (c == ')')
2873           {
2874           if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2875             branch_extra += branch_newextra;
2876           continue;
2877           }
2878
2879         /* If options were terminated by ':' control comes here. Fall through
2880         to handle the group below. */
2881         }
2882       }
2883
2884     /* Extracting brackets must be counted so we can process escapes in a
2885     Perlish way. */
2886
2887     else bracount++;
2888
2889     /* Non-special forms of bracket. Save length for computing whole length
2890     at end if there's a repeat that requires duplication of the group. Also
2891     save the current value of branch_extra, and start the new group with
2892     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2893     for a lookbehind assertion. */
2894
2895     if (brastackptr >= sizeof(brastack)/sizeof(int))
2896       {
2897       *errorptr = ERR19;
2898       goto PCRE_ERROR_RETURN;
2899       }
2900
2901     bralenstack[brastackptr] = branch_extra;
2902     branch_extra = branch_newextra;
2903
2904     brastack[brastackptr++] = length;
2905     length += 3;
2906     continue;
2907
2908     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2909     have to replicate this bracket up to that many times. If brastackptr is
2910     0 this is an unmatched bracket which will generate an error, but take care
2911     not to try to access brastack[-1] when computing the length and restoring
2912     the branch_extra value. */
2913
2914     case ')':
2915     length += 3;
2916       {
2917       int minval = 1;
2918       int maxval = 1;
2919       int duplength;
2920
2921       if (brastackptr > 0)
2922         {
2923         duplength = length - brastack[--brastackptr];
2924         branch_extra = bralenstack[brastackptr];
2925         }
2926       else duplength = 0;
2927
2928       /* Leave ptr at the final char; for read_repeat_counts this happens
2929       automatically; for the others we need an increment. */
2930
2931       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2932         {
2933         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2934           &compile_block);
2935         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2936         }
2937       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2938       else if (c == '+') { maxval = -1; ptr++; }
2939       else if (c == '?') { minval = 0; ptr++; }
2940
2941       /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2942       group, and if the maximum is greater than zero, we have to replicate
2943       maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2944       bracket set - hence the 7. */
2945
2946       if (minval == 0)
2947         {
2948         length++;
2949         if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2950         }
2951
2952       /* When the minimum is greater than zero, 1 we have to replicate up to
2953       minval-1 times, with no additions required in the copies. Then, if
2954       there is a limited maximum we have to replicate up to maxval-1 times
2955       allowing for a BRAZERO item before each optional copy and nesting
2956       brackets for all but one of the optional copies. */
2957
2958       else
2959         {
2960         length += (minval - 1) * duplength;
2961         if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2962           length += (maxval - minval) * (duplength + 7) - 6;
2963         }
2964       }
2965     continue;
2966
2967     /* Non-special character. For a run of such characters the length required
2968     is the number of characters + 2, except that the maximum run length is 255.
2969     We won't get a skipped space or a non-data escape or the start of a #
2970     comment as the first character, so the length can't be zero. */
2971
2972     NORMAL_CHAR:
2973     default:
2974     length += 2;
2975     runlength = 0;
2976     do
2977       {
2978       if ((options & PCRE_EXTENDED) != 0)
2979         {
2980         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2981         if (c == '#')
2982           {
2983           /* The space before the ; is to avoid a warning on a silly compiler
2984           on the Macintosh. */
2985           while ((c = *(++ptr)) != 0 && c != '\n') ;
2986           continue;
2987           }
2988         }
2989
2990       /* Backslash may introduce a data char or a metacharacter; stop the
2991       string before the latter. */
2992
2993       if (c == '\\')
2994         {
2995         const uschar *saveptr = ptr;
2996         c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2997           &compile_block);
2998         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2999         if (c < 0) { ptr = saveptr; break; }
3000
3001 #ifdef SUPPORT_UTF8
3002         if (c > 127 && (options & PCRE_UTF8) != 0)
3003           {
3004           int i;
3005           for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3006             if (c <= utf8_table1[i]) break;
3007           runlength += i;
3008           }
3009 #endif
3010         }
3011
3012       /* Ordinary character or single-char escape */
3013
3014       runlength++;
3015
3016       if (ptr > pattern + pattern_length)
3017         {
3018         *errorptr = "internal error";
3019         goto PCRE_ERROR_RETURN;
3020         }
3021       if (ptr[1] >= tables_length - ctypes_offset)
3022         {
3023         *errorptr = ERR6;
3024         goto PCRE_ERROR_RETURN;
3025         }
3026       }
3027
3028     /* This "while" is the end of the "do" above. */
3029
3030     while (runlength < MAXLIT &&
3031       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3032
3033     ptr--;
3034     length += runlength;
3035     continue;
3036     }
3037   }
3038
3039 length += 4;    /* For final KET and END */
3040
3041 if (length > 65539)
3042   {
3043   *errorptr = ERR20;
3044   return NULL;
3045   }
3046
3047 /* Compute the size of data block needed and get it, either from malloc or
3048 externally provided function. We specify "code[0]" in the offsetof() expression
3049 rather than just "code", because it has been reported that one broken compiler
3050 fails on "code" because it is also an independent variable. It should make no
3051 difference to the value of the offsetof(). */
3052
3053 size = length + offsetof(real_pcre, code[0]);
3054 re = (real_pcre *)(pcre_malloc)(size);
3055
3056 if (re == NULL)
3057   {
3058   *errorptr = ERR21;
3059   return NULL;
3060   }
3061
3062 /* Put in the magic number, and save the size, options, and table pointer */
3063
3064 re->magic_number = MAGIC_NUMBER;
3065 re->size = size;
3066 re->options = options;
3067 re->tables = tables;
3068
3069 /* Set up a starting, non-extracting bracket, then compile the expression. On
3070 error, *errorptr will be set non-NULL, so we don't need to look at the result
3071 of the function here. */
3072
3073 ptr = (const uschar *)pattern;
3074 code = re->code;
3075 *code = OP_BRA;
3076 bracount = 0;
3077 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3078   &reqchar, &countlits, &compile_block);
3079 re->top_bracket = bracount;
3080 re->top_backref = top_backref;
3081
3082 /* If not reached end of pattern on success, there's an excess bracket. */
3083
3084 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3085
3086 /* Fill in the terminating state and check for disastrous overflow, but
3087 if debugging, leave the test till after things are printed out. */
3088
3089 *code++ = OP_END;
3090
3091 #ifndef DEBUG
3092 if (code - re->code > length) *errorptr = ERR23;
3093 #endif
3094
3095 /* Give an error if there's back reference to a non-existent capturing
3096 subpattern. */
3097
3098 if (top_backref > re->top_bracket) *errorptr = ERR15;
3099
3100 /* Failed to compile */
3101
3102 if (*errorptr != NULL)
3103   {
3104   (pcre_free)(re);
3105   PCRE_ERROR_RETURN:
3106   *erroroffset = ptr - (const uschar *)pattern;
3107   return NULL;
3108   }
3109
3110 /* If the anchored option was not passed, set flag if we can determine that the
3111 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3112 starting with .* when DOTALL is set).
3113
3114 Otherwise, see if we can determine what the first character has to be, because
3115 that speeds up unanchored matches no end. If not, see if we can set the
3116 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3117 start with ^. and also when all branches start with .* for non-DOTALL matches.
3118 */
3119
3120 if ((options & PCRE_ANCHORED) == 0)
3121   {
3122   int temp_options = options;
3123   if (is_anchored(re->code, &temp_options))
3124     re->options |= PCRE_ANCHORED;
3125   else
3126     {
3127     int ch = find_firstchar(re->code, &temp_options);
3128     if (ch >= 0)
3129       {
3130       re->first_char = ch;
3131       re->options |= PCRE_FIRSTSET;
3132       }
3133     else if (is_startline(re->code))
3134       re->options |= PCRE_STARTLINE;
3135     }
3136   }
3137
3138 /* Save the last required character if there are at least two literal
3139 characters on all paths, or if there is no first character setting. */
3140
3141 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3142   {
3143   re->req_char = reqchar;
3144   re->options |= PCRE_REQCHSET;
3145   }
3146
3147 /* Print out the compiled data for debugging */
3148
3149 #ifdef DEBUG
3150
3151 printf("Length = %d top_bracket = %d top_backref = %d\n",
3152   length, re->top_bracket, re->top_backref);
3153
3154 if (re->options != 0)
3155   {
3156   printf("%s%s%s%s%s%s%s%s%s\n",
3157     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3158     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3159     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3160     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3161     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3162     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3163     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3164     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3165     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3166   }
3167
3168 if ((re->options & PCRE_FIRSTSET) != 0)
3169   {
3170   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3171     else printf("First char = \\x%02x\n", re->first_char);
3172   }
3173
3174 if ((re->options & PCRE_REQCHSET) != 0)
3175   {
3176   if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3177     else printf("Req char = \\x%02x\n", re->req_char);
3178   }
3179
3180 code_end = code;
3181 code_base = code = re->code;
3182
3183 while (code < code_end)
3184   {
3185   int charlength;
3186
3187   printf("%3d ", code - code_base);
3188
3189   if (*code >= OP_BRA)
3190     {
3191     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3192     code += 2;
3193     }
3194
3195   else switch(*code)
3196     {
3197     case OP_OPT:
3198     printf(" %.2x %s", code[1], OP_names[*code]);
3199     code++;
3200     break;
3201
3202     case OP_COND:
3203     printf("%3d Cond", (code[1] << 8) + code[2]);
3204     code += 2;
3205     break;
3206
3207     case OP_CREF:
3208     printf(" %.2d %s", code[1], OP_names[*code]);
3209     code++;
3210     break;
3211
3212     case OP_CHARS:
3213     charlength = *(++code);
3214     printf("%3d ", charlength);
3215     while (charlength-- > 0)
3216       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3217     break;
3218
3219     case OP_KETRMAX:
3220     case OP_KETRMIN:
3221     case OP_ALT:
3222     case OP_KET:
3223     case OP_ASSERT:
3224     case OP_ASSERT_NOT:
3225     case OP_ASSERTBACK:
3226     case OP_ASSERTBACK_NOT:
3227     case OP_ONCE:
3228     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3229     code += 2;
3230     break;
3231
3232     case OP_REVERSE:
3233     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3234     code += 2;
3235     break;
3236
3237     case OP_STAR:
3238     case OP_MINSTAR:
3239     case OP_PLUS:
3240     case OP_MINPLUS:
3241     case OP_QUERY:
3242     case OP_MINQUERY:
3243     case OP_TYPESTAR:
3244     case OP_TYPEMINSTAR:
3245     case OP_TYPEPLUS:
3246     case OP_TYPEMINPLUS:
3247     case OP_TYPEQUERY:
3248     case OP_TYPEMINQUERY:
3249     if (*code >= OP_TYPESTAR)
3250       printf("    %s", OP_names[code[1]]);
3251     else if (isprint(c = code[1])) printf("    %c", c);
3252       else printf("    \\x%02x", c);
3253     printf("%s", OP_names[*code++]);
3254     break;
3255
3256     case OP_EXACT:
3257     case OP_UPTO:
3258     case OP_MINUPTO:
3259     if (isprint(c = code[3])) printf("    %c{", c);
3260       else printf("    \\x%02x{", c);
3261     if (*code != OP_EXACT) printf("0,");
3262     printf("%d}", (code[1] << 8) + code[2]);
3263     if (*code == OP_MINUPTO) printf("?");
3264     code += 3;
3265     break;
3266
3267     case OP_TYPEEXACT:
3268     case OP_TYPEUPTO:
3269     case OP_TYPEMINUPTO:
3270     printf("    %s{", OP_names[code[3]]);
3271     if (*code != OP_TYPEEXACT) printf(",");
3272     printf("%d}", (code[1] << 8) + code[2]);
3273     if (*code == OP_TYPEMINUPTO) printf("?");
3274     code += 3;
3275     break;
3276
3277     case OP_NOT:
3278     if (isprint(c = *(++code))) printf("    [^%c]", c);
3279       else printf("    [^\\x%02x]", c);
3280     break;
3281
3282     case OP_NOTSTAR:
3283     case OP_NOTMINSTAR:
3284     case OP_NOTPLUS:
3285     case OP_NOTMINPLUS:
3286     case OP_NOTQUERY:
3287     case OP_NOTMINQUERY:
3288     if (isprint(c = code[1])) printf("    [^%c]", c);
3289       else printf("    [^\\x%02x]", c);
3290     printf("%s", OP_names[*code++]);
3291     break;
3292
3293     case OP_NOTEXACT:
3294     case OP_NOTUPTO:
3295     case OP_NOTMINUPTO:
3296     if (isprint(c = code[3])) printf("    [^%c]{", c);
3297       else printf("    [^\\x%02x]{", c);
3298     if (*code != OP_NOTEXACT) printf(",");
3299     printf("%d}", (code[1] << 8) + code[2]);
3300     if (*code == OP_NOTMINUPTO) printf("?");
3301     code += 3;
3302     break;
3303
3304     case OP_REF:
3305     printf("    \\%d", *(++code));
3306     code ++;
3307     goto CLASS_REF_REPEAT;
3308
3309     case OP_CLASS:
3310       {
3311       int i, min, max;
3312       code++;
3313       printf("    [");
3314
3315       for (i = 0; i < 256; i++)
3316         {
3317         if ((code[i/8] & (1 << (i&7))) != 0)
3318           {
3319           int j;
3320           for (j = i+1; j < 256; j++)
3321             if ((code[j/8] & (1 << (j&7))) == 0) break;
3322           if (i == '-' || i == ']') printf("\\");
3323           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3324           if (--j > i)
3325             {
3326             printf("-");
3327             if (j == '-' || j == ']') printf("\\");
3328             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3329             }
3330           i = j;
3331           }
3332         }
3333       printf("]");
3334       code += 32;
3335
3336       CLASS_REF_REPEAT:
3337
3338       switch(*code)
3339         {
3340         case OP_CRSTAR:
3341         case OP_CRMINSTAR:
3342         case OP_CRPLUS:
3343         case OP_CRMINPLUS:
3344         case OP_CRQUERY:
3345         case OP_CRMINQUERY:
3346         printf("%s", OP_names[*code]);
3347         break;
3348
3349         case OP_CRRANGE:
3350         case OP_CRMINRANGE:
3351         min = (code[1] << 8) + code[2];
3352         max = (code[3] << 8) + code[4];
3353         if (max == 0) printf("{%d,}", min);
3354         else printf("{%d,%d}", min, max);
3355         if (*code == OP_CRMINRANGE) printf("?");
3356         code += 4;
3357         break;
3358
3359         default:
3360         code--;
3361         }
3362       }
3363     break;
3364
3365     /* Anything else is just a one-node item */
3366
3367     default:
3368     printf("    %s", OP_names[*code]);
3369     break;
3370     }
3371
3372   code++;
3373   printf("\n");
3374   }
3375 printf("------------------------------------------------------------------\n");
3376
3377 /* This check is done here in the debugging case so that the code that
3378 was compiled can be seen. */
3379
3380 if (code - re->code > length)
3381   {
3382   *errorptr = ERR23;
3383   (pcre_free)(re);
3384   *erroroffset = ptr - (uschar *)pattern;
3385   return NULL;
3386   }
3387 #endif
3388
3389 return (pcre *)re;
3390 }
3391
3392
3393
3394 /*************************************************
3395 *          Match a back-reference                *
3396 *************************************************/
3397
3398 /* If a back reference hasn't been set, the length that is passed is greater
3399 than the number of characters left in the string, so the match fails.
3400
3401 Arguments:
3402   offset      index into the offset vector
3403   eptr        points into the subject
3404   length      length to be matched
3405   md          points to match data block
3406   ims         the ims flags
3407
3408 Returns:      TRUE if matched
3409 */
3410
3411 static BOOL
3412 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3413   unsigned long int ims)
3414 {
3415 const uschar *p = md->start_subject + md->offset_vector[offset];
3416
3417 #ifdef DEBUG
3418 if (eptr >= md->end_subject)
3419   printf("matching subject <null>");
3420 else
3421   {
3422   printf("matching subject ");
3423   pchars(eptr, length, TRUE, md);
3424   }
3425 printf(" against backref ");
3426 pchars(p, length, FALSE, md);
3427 printf("\n");
3428 #endif
3429
3430 /* Always fail if not enough characters left */
3431
3432 if (length > md->end_subject - eptr) return FALSE;
3433
3434 /* Separate the caselesss case for speed */
3435
3436 if ((ims & PCRE_CASELESS) != 0)
3437   {
3438   while (length-- > 0)
3439     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3440   }
3441 else
3442   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3443
3444 return TRUE;
3445 }
3446
3447
3448
3449 /*************************************************
3450 *         Match from current position            *
3451 *************************************************/
3452
3453 /* On entry ecode points to the first opcode, and eptr to the first character
3454 in the subject string, while eptrb holds the value of eptr at the start of the
3455 last bracketed group - used for breaking infinite loops matching zero-length
3456 strings.
3457
3458 Arguments:
3459    eptr        pointer in subject
3460    ecode       position in code
3461    offset_top  current top pointer
3462    md          pointer to "static" info for the match
3463    ims         current /i, /m, and /s options
3464    eptrb       pointer to chain of blocks containing eptr at start of
3465                  brackets - for testing for empty matches
3466    flags       can contain
3467                  match_condassert - this is an assertion condition
3468                  match_isgroup - this is the start of a bracketed group
3469
3470 Returns:       TRUE if matched
3471 */
3472
3473 static BOOL
3474 match(register const uschar *eptr, register const uschar *ecode,
3475   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3476   int flags)
3477 {
3478 unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3479 eptrblock newptrb;
3480
3481 /* At the start of a bracketed group, add the current subject pointer to the
3482 stack of such pointers, to be re-instated at the end of the group when we hit
3483 the closing ket. When match() is called in other circumstances, we don't add to
3484 the stack. */
3485
3486 if ((flags & match_isgroup) != 0)
3487   {
3488   newptrb.prev = eptrb;
3489   newptrb.saved_eptr = eptr;
3490   eptrb = &newptrb;
3491   }
3492
3493 /* Now start processing the operations. */
3494
3495 for (;;)
3496   {
3497   int op = (int)*ecode;
3498   int min, max, ctype;
3499   register int i;
3500   register int c;
3501   BOOL minimize = FALSE;
3502
3503   /* Opening capturing bracket. If there is space in the offset vector, save
3504   the current subject position in the working slot at the top of the vector. We
3505   mustn't change the current values of the data slot, because they may be set
3506   from a previous iteration of this group, and be referred to by a reference
3507   inside the group.
3508
3509   If the bracket fails to match, we need to restore this value and also the
3510   values of the final offsets, in case they were set by a previous iteration of
3511   the same bracket.
3512
3513   If there isn't enough space in the offset vector, treat this as if it were a
3514   non-capturing bracket. Don't worry about setting the flag for the error case
3515   here; that is handled in the code for KET. */
3516
3517   if (op > OP_BRA)
3518     {
3519     int number = op - OP_BRA;
3520     int offset = number << 1;
3521
3522 #ifdef DEBUG
3523     printf("start bracket %d subject=", number);
3524     pchars(eptr, 16, TRUE, md);
3525     printf("\n");
3526 #endif
3527
3528     if (offset < md->offset_max)
3529       {
3530       int save_offset1 = md->offset_vector[offset];
3531       int save_offset2 = md->offset_vector[offset+1];
3532       int save_offset3 = md->offset_vector[md->offset_end - number];
3533
3534       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3535       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3536
3537       do
3538         {
3539         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3540           return TRUE;
3541         ecode += (ecode[1] << 8) + ecode[2];
3542         }
3543       while (*ecode == OP_ALT);
3544
3545       DPRINTF(("bracket %d failed\n", number));
3546
3547       md->offset_vector[offset] = save_offset1;
3548       md->offset_vector[offset+1] = save_offset2;
3549       md->offset_vector[md->offset_end - number] = save_offset3;
3550       return FALSE;
3551       }
3552
3553     /* Insufficient room for saving captured contents */
3554
3555     else op = OP_BRA;
3556     }
3557
3558   /* Other types of node can be handled by a switch */
3559
3560   switch(op)
3561     {
3562     case OP_BRA:     /* Non-capturing bracket: optimized */
3563     DPRINTF(("start bracket 0\n"));
3564     do
3565       {
3566       if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3567         return TRUE;
3568       ecode += (ecode[1] << 8) + ecode[2];
3569       }
3570     while (*ecode == OP_ALT);
3571     DPRINTF(("bracket 0 failed\n"));
3572     return FALSE;
3573
3574     /* Conditional group: compilation checked that there are no more than
3575     two branches. If the condition is false, skipping the first branch takes us
3576     past the end if there is only one branch, but that's OK because that is
3577     exactly what going to the ket would do. */
3578
3579     case OP_COND:
3580     if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3581       {
3582       int offset = ecode[4] << 1;    /* Doubled reference number */
3583       return match(eptr,
3584         ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585           5 : 3 + (ecode[1] << 8) + ecode[2]),
3586         offset_top, md, ims, eptrb, match_isgroup);
3587       }
3588
3589     /* The condition is an assertion. Call match() to evaluate it - setting
3590     the final argument TRUE causes it to stop at the end of an assertion. */
3591
3592     else
3593       {
3594       if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595           match_condassert | match_isgroup))
3596         {
3597         ecode += 3 + (ecode[4] << 8) + ecode[5];
3598         while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3599         }
3600       else ecode += (ecode[1] << 8) + ecode[2];
3601       return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3602       }
3603     /* Control never reaches here */
3604
3605     /* Skip over conditional reference data if encountered (should not be) */
3606
3607     case OP_CREF:
3608     ecode += 2;
3609     break;
3610
3611     /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3612     an empty string - recursion will then try other alternatives, if any. */
3613
3614     case OP_END:
3615     if (md->notempty && eptr == md->start_match) return FALSE;
3616     md->end_match_ptr = eptr;          /* Record where we ended */
3617     md->end_offset_top = offset_top;   /* and how many extracts were taken */
3618     return TRUE;
3619
3620     /* Change option settings */
3621
3622     case OP_OPT:
3623     ims = ecode[1];
3624     ecode += 2;
3625     DPRINTF(("ims set to %02lx\n", ims));
3626     break;
3627
3628     /* Assertion brackets. Check the alternative branches in turn - the
3629     matching won't pass the KET for an assertion. If any one branch matches,
3630     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3631     start of each branch to move the current point backwards, so the code at
3632     this level is identical to the lookahead case. */
3633
3634     case OP_ASSERT:
3635     case OP_ASSERTBACK:
3636     do
3637       {
3638       if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3639       ecode += (ecode[1] << 8) + ecode[2];
3640       }
3641     while (*ecode == OP_ALT);
3642     if (*ecode == OP_KET) return FALSE;
3643
3644     /* If checking an assertion for a condition, return TRUE. */
3645
3646     if ((flags & match_condassert) != 0) return TRUE;
3647
3648     /* Continue from after the assertion, updating the offsets high water
3649     mark, since extracts may have been taken during the assertion. */
3650
3651     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3652     ecode += 3;
3653     offset_top = md->end_offset_top;
3654     continue;
3655
3656     /* Negative assertion: all branches must fail to match */
3657
3658     case OP_ASSERT_NOT:
3659     case OP_ASSERTBACK_NOT:
3660     do
3661       {
3662       if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3663         return FALSE;
3664       ecode += (ecode[1] << 8) + ecode[2];
3665       }
3666     while (*ecode == OP_ALT);
3667
3668     if ((flags & match_condassert) != 0) return TRUE;
3669
3670     ecode += 3;
3671     continue;
3672
3673     /* Move the subject pointer back. This occurs only at the start of
3674     each branch of a lookbehind assertion. If we are too close to the start to
3675     move back, this match function fails. When working with UTF-8 we move
3676     back a number of characters, not bytes. */
3677
3678     case OP_REVERSE:
3679 #ifdef SUPPORT_UTF8
3680     c = (ecode[1] << 8) + ecode[2];
3681     for (i = 0; i < c; i++)
3682       {
3683       eptr--;
3684       BACKCHAR(eptr)
3685       }
3686 #else
3687     eptr -= (ecode[1] << 8) + ecode[2];
3688 #endif
3689
3690     if (eptr < md->start_subject) return FALSE;
3691     ecode += 3;
3692     break;
3693
3694     /* Recursion matches the current regex, nested. If there are any capturing
3695     brackets started but not finished, we have to save their starting points
3696     and reinstate them after the recursion. However, we don't know how many
3697     such there are (offset_top records the completed total) so we just have
3698     to save all the potential data. There may be up to 99 such values, which
3699     is a bit large to put on the stack, but using malloc for small numbers
3700     seems expensive. As a compromise, the stack is used when there are fewer
3701     than 16 values to store; otherwise malloc is used. A problem is what to do
3702     if the malloc fails ... there is no way of returning to the top level with
3703     an error. Save the top 15 values on the stack, and accept that the rest
3704     may be wrong. */
3705
3706     case OP_RECURSE:
3707       {
3708       BOOL rc;
3709       int *save;
3710       int stacksave[15];
3711
3712       c = md->offset_max;
3713
3714       if (c < 16) save = stacksave; else
3715         {
3716         save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3717         if (save == NULL)
3718           {
3719           save = stacksave;
3720           c = 15;
3721           }
3722         }
3723
3724       for (i = 1; i <= c; i++)
3725         save[i] = md->offset_vector[md->offset_end - i];
3726       rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3727         match_isgroup);
3728       for (i = 1; i <= c; i++)
3729         md->offset_vector[md->offset_end - i] = save[i];
3730       if (save != stacksave) (pcre_free)(save);
3731       if (!rc) return FALSE;
3732
3733       /* In case the recursion has set more capturing values, save the final
3734       number, then move along the subject till after the recursive match,
3735       and advance one byte in the pattern code. */
3736
3737       offset_top = md->end_offset_top;
3738       eptr = md->end_match_ptr;
3739       ecode++;
3740       }
3741     break;
3742
3743     /* "Once" brackets are like assertion brackets except that after a match,
3744     the point in the subject string is not moved back. Thus there can never be
3745     a move back into the brackets. Check the alternative branches in turn - the
3746     matching won't pass the KET for this kind of subpattern. If any one branch
3747     matches, we carry on as at the end of a normal bracket, leaving the subject
3748     pointer. */
3749
3750     case OP_ONCE:
3751       {
3752       const uschar *prev = ecode;
3753       const uschar *saved_eptr = eptr;
3754
3755       do
3756         {
3757         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3758           break;
3759         ecode += (ecode[1] << 8) + ecode[2];
3760         }
3761       while (*ecode == OP_ALT);
3762
3763       /* If hit the end of the group (which could be repeated), fail */
3764
3765       if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3766
3767       /* Continue as from after the assertion, updating the offsets high water
3768       mark, since extracts may have been taken. */
3769
3770       do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3771
3772       offset_top = md->end_offset_top;
3773       eptr = md->end_match_ptr;
3774
3775       /* For a non-repeating ket, just continue at this level. This also
3776       happens for a repeating ket if no characters were matched in the group.
3777       This is the forcible breaking of infinite loops as implemented in Perl
3778       5.005. If there is an options reset, it will get obeyed in the normal
3779       course of events. */
3780
3781       if (*ecode == OP_KET || eptr == saved_eptr)
3782         {
3783         ecode += 3;
3784         break;
3785         }
3786
3787       /* The repeating kets try the rest of the pattern or restart from the
3788       preceding bracket, in the appropriate order. We need to reset any options
3789       that changed within the bracket before re-running it, so check the next
3790       opcode. */
3791
3792       if (ecode[3] == OP_OPT)
3793         {
3794         ims = (ims & ~PCRE_IMS) | ecode[4];
3795         DPRINTF(("ims set to %02lx at group repeat\n", ims));
3796         }
3797
3798       if (*ecode == OP_KETRMIN)
3799         {
3800         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3801             match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3802               return TRUE;
3803         }
3804       else  /* OP_KETRMAX */
3805         {
3806         if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3807             match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3808         }
3809       }
3810     return FALSE;
3811
3812     /* An alternation is the end of a branch; scan along to find the end of the
3813     bracketed group and go to there. */
3814
3815     case OP_ALT:
3816     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3817     break;
3818
3819     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3820     that it may occur zero times. It may repeat infinitely, or not at all -
3821     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3822     repeat limits are compiled as a number of copies, with the optional ones
3823     preceded by BRAZERO or BRAMINZERO. */
3824
3825     case OP_BRAZERO:
3826       {
3827       const uschar *next = ecode+1;
3828       if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3829         return TRUE;
3830       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3831       ecode = next + 3;
3832       }
3833     break;
3834
3835     case OP_BRAMINZERO:
3836       {
3837       const uschar *next = ecode+1;
3838       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3839       if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3840         return TRUE;
3841       ecode++;
3842       }
3843     break;
3844
3845     /* End of a group, repeated or non-repeating. If we are at the end of
3846     an assertion "group", stop matching and return TRUE, but record the
3847     current high water mark for use by positive assertions. Do this also
3848     for the "once" (not-backup up) groups. */
3849
3850     case OP_KET:
3851     case OP_KETRMIN:
3852     case OP_KETRMAX:
3853       {
3854       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3855       const uschar *saved_eptr = eptrb->saved_eptr;
3856
3857       eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3858
3859       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3860           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3861           *prev == OP_ONCE)
3862         {
3863         md->end_match_ptr = eptr;      /* For ONCE */
3864         md->end_offset_top = offset_top;
3865         return TRUE;
3866         }
3867
3868       /* In all other cases except a conditional group we have to check the
3869       group number back at the start and if necessary complete handling an
3870       extraction by setting the offsets and bumping the high water mark. */
3871
3872       if (*prev != OP_COND)
3873         {
3874         int number = *prev - OP_BRA;
3875         int offset = number << 1;
3876
3877 #ifdef DEBUG
3878         printf("end bracket %d", number);
3879         printf("\n");
3880 #endif
3881
3882         if (number > 0)
3883           {
3884           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3885             {
3886             md->offset_vector[offset] =
3887               md->offset_vector[md->offset_end - number];
3888             md->offset_vector[offset+1] = eptr - md->start_subject;
3889             if (offset_top <= offset) offset_top = offset + 2;
3890             }
3891           }
3892         }
3893
3894       /* Reset the value of the ims flags, in case they got changed during
3895       the group. */
3896
3897       ims = original_ims;
3898       DPRINTF(("ims reset to %02lx\n", ims));
3899
3900       /* For a non-repeating ket, just continue at this level. This also
3901       happens for a repeating ket if no characters were matched in the group.
3902       This is the forcible breaking of infinite loops as implemented in Perl
3903       5.005. If there is an options reset, it will get obeyed in the normal
3904       course of events. */
3905
3906       if (*ecode == OP_KET || eptr == saved_eptr)
3907         {
3908         ecode += 3;
3909         break;
3910         }
3911
3912       /* The repeating kets try the rest of the pattern or restart from the
3913       preceding bracket, in the appropriate order. */
3914
3915       if (*ecode == OP_KETRMIN)
3916         {
3917         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3918             match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3919               return TRUE;
3920         }
3921       else  /* OP_KETRMAX */
3922         {
3923         if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3924             match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3925         }
3926       }
3927     return FALSE;
3928
3929     /* Start of subject unless notbol, or after internal newline if multiline */
3930
3931     case OP_CIRC:
3932     if (md->notbol && eptr == md->start_subject) return FALSE;
3933     if ((ims & PCRE_MULTILINE) != 0)
3934       {
3935       if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3936       ecode++;
3937       break;
3938       }
3939     /* ... else fall through */
3940
3941     /* Start of subject assertion */
3942
3943     case OP_SOD:
3944     if (eptr != md->start_subject) return FALSE;
3945     ecode++;
3946     break;
3947
3948     /* Assert before internal newline if multiline, or before a terminating
3949     newline unless endonly is set, else end of subject unless noteol is set. */
3950
3951     case OP_DOLL:
3952     if ((ims & PCRE_MULTILINE) != 0)
3953       {
3954       if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3955         else { if (md->noteol) return FALSE; }
3956       ecode++;
3957       break;
3958       }
3959     else
3960       {
3961       if (md->noteol) return FALSE;
3962       if (!md->endonly)
3963         {
3964         if (eptr < md->end_subject - 1 ||
3965            (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3966
3967         ecode++;
3968         break;
3969         }
3970       }
3971     /* ... else fall through */
3972
3973     /* End of subject assertion (\z) */
3974
3975     case OP_EOD:
3976     if (eptr < md->end_subject) return FALSE;
3977     ecode++;
3978     break;
3979
3980     /* End of subject or ending \n assertion (\Z) */
3981
3982     case OP_EODN:
3983     if (eptr < md->end_subject - 1 ||
3984        (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3985     ecode++;
3986     break;
3987
3988     /* Word boundary assertions */
3989
3990     case OP_NOT_WORD_BOUNDARY:
3991     case OP_WORD_BOUNDARY:
3992       {
3993       BOOL prev_is_word = (eptr != md->start_subject) &&
3994         ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3995       BOOL cur_is_word = (eptr < md->end_subject) &&
3996         ((md->ctypes[*eptr] & ctype_word) != 0);
3997       if ((*ecode++ == OP_WORD_BOUNDARY)?
3998            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3999         return FALSE;
4000       }
4001     break;
4002
4003     /* Match a single character type; inline for speed */
4004
4005     case OP_ANY:
4006     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
4007       return FALSE;
4008     if (eptr++ >= md->end_subject) return FALSE;
4009 #ifdef SUPPORT_UTF8
4010     if (md->utf8)
4011       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4012 #endif
4013     ecode++;
4014     break;
4015
4016     case OP_NOT_DIGIT:
4017     if (eptr >= md->end_subject ||
4018        (md->ctypes[*eptr++] & ctype_digit) != 0)
4019       return FALSE;
4020     ecode++;
4021     break;
4022
4023     case OP_DIGIT:
4024     if (eptr >= md->end_subject ||
4025        (md->ctypes[*eptr++] & ctype_digit) == 0)
4026       return FALSE;
4027     ecode++;
4028     break;
4029
4030     case OP_NOT_WHITESPACE:
4031     if (eptr >= md->end_subject ||
4032        (md->ctypes[*eptr++] & ctype_space) != 0)
4033       return FALSE;
4034     ecode++;
4035     break;
4036
4037     case OP_WHITESPACE:
4038     if (eptr >= md->end_subject ||
4039        (md->ctypes[*eptr++] & ctype_space) == 0)
4040       return FALSE;
4041     ecode++;
4042     break;
4043
4044     case OP_NOT_WORDCHAR:
4045     if (eptr >= md->end_subject ||
4046        (md->ctypes[*eptr++] & ctype_word) != 0)
4047       return FALSE;
4048     ecode++;
4049     break;
4050
4051     case OP_WORDCHAR:
4052     if (eptr >= md->end_subject ||
4053        (md->ctypes[*eptr++] & ctype_word) == 0)
4054       return FALSE;
4055     ecode++;
4056     break;
4057
4058     /* Match a back reference, possibly repeatedly. Look past the end of the
4059     item to see if there is repeat information following. The code is similar
4060     to that for character classes, but repeated for efficiency. Then obey
4061     similar code to character type repeats - written out again for speed.
4062     However, if the referenced string is the empty string, always treat
4063     it as matched, any number of times (otherwise there could be infinite
4064     loops). */
4065
4066     case OP_REF:
4067       {
4068       int length;
4069       int offset = ecode[1] << 1;                /* Doubled reference number */
4070       ecode += 2;                                /* Advance past the item */
4071
4072       /* If the reference is unset, set the length to be longer than the amount
4073       of subject left; this ensures that every attempt at a match fails. We
4074       can't just fail here, because of the possibility of quantifiers with zero
4075       minima. */
4076
4077       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4078         md->end_subject - eptr + 1 :
4079         md->offset_vector[offset+1] - md->offset_vector[offset];
4080
4081       /* Set up for repetition, or handle the non-repeated case */
4082
4083       switch (*ecode)
4084         {
4085         case OP_CRSTAR:
4086         case OP_CRMINSTAR:
4087         case OP_CRPLUS:
4088         case OP_CRMINPLUS:
4089         case OP_CRQUERY:
4090         case OP_CRMINQUERY:
4091         c = *ecode++ - OP_CRSTAR;
4092         minimize = (c & 1) != 0;
4093         min = rep_min[c];                 /* Pick up values from tables; */
4094         max = rep_max[c];                 /* zero for max => infinity */
4095         if (max == 0) max = INT_MAX;
4096         break;
4097
4098         case OP_CRRANGE:
4099         case OP_CRMINRANGE:
4100         minimize = (*ecode == OP_CRMINRANGE);
4101         min = (ecode[1] << 8) + ecode[2];
4102         max = (ecode[3] << 8) + ecode[4];
4103         if (max == 0) max = INT_MAX;
4104         ecode += 5;
4105         break;
4106
4107         default:               /* No repeat follows */
4108         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4109         eptr += length;
4110         continue;              /* With the main loop */
4111         }
4112
4113       /* If the length of the reference is zero, just continue with the
4114       main loop. */
4115
4116       if (length == 0) continue;
4117
4118       /* First, ensure the minimum number of matches are present. We get back
4119       the length of the reference string explicitly rather than passing the
4120       address of eptr, so that eptr can be a register variable. */
4121
4122       for (i = 1; i <= min; i++)
4123         {
4124         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4125         eptr += length;
4126         }
4127
4128       /* If min = max, continue at the same level without recursion.
4129       They are not both allowed to be zero. */
4130
4131       if (min == max) continue;
4132
4133       /* If minimizing, keep trying and advancing the pointer */
4134
4135       if (minimize)
4136         {
4137         for (i = min;; i++)
4138           {
4139           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4140             return TRUE;
4141           if (i >= max || !match_ref(offset, eptr, length, md, ims))
4142             return FALSE;
4143           eptr += length;
4144           }
4145         /* Control never gets here */
4146         }
4147
4148       /* If maximizing, find the longest string and work backwards */
4149
4150       else
4151         {
4152         const uschar *pp = eptr;
4153         for (i = min; i < max; i++)
4154           {
4155           if (!match_ref(offset, eptr, length, md, ims)) break;
4156           eptr += length;
4157           }
4158         while (eptr >= pp)
4159           {
4160           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4161             return TRUE;
4162           eptr -= length;
4163           }
4164         return FALSE;
4165         }
4166       }
4167     /* Control never gets here */
4168
4169
4170
4171     /* Match a character class, possibly repeatedly. Look past the end of the
4172     item to see if there is repeat information following. Then obey similar
4173     code to character type repeats - written out again for speed. */
4174
4175     case OP_CLASS:
4176       {
4177       const uschar *data = ecode + 1;  /* Save for matching */
4178       ecode += 33;                     /* Advance past the item */
4179
4180       switch (*ecode)
4181         {
4182         case OP_CRSTAR:
4183         case OP_CRMINSTAR:
4184         case OP_CRPLUS:
4185         case OP_CRMINPLUS:
4186         case OP_CRQUERY:
4187         case OP_CRMINQUERY:
4188         c = *ecode++ - OP_CRSTAR;
4189         minimize = (c & 1) != 0;
4190         min = rep_min[c];                 /* Pick up values from tables; */
4191         max = rep_max[c];                 /* zero for max => infinity */
4192         if (max == 0) max = INT_MAX;
4193         break;
4194
4195         case OP_CRRANGE:
4196         case OP_CRMINRANGE:
4197         minimize = (*ecode == OP_CRMINRANGE);
4198         min = (ecode[1] << 8) + ecode[2];
4199         max = (ecode[3] << 8) + ecode[4];
4200         if (max == 0) max = INT_MAX;
4201         ecode += 5;
4202         break;
4203
4204         default:               /* No repeat follows */
4205         min = max = 1;
4206         break;
4207         }
4208
4209       /* First, ensure the minimum number of matches are present. */
4210
4211       for (i = 1; i <= min; i++)
4212         {
4213         if (eptr >= md->end_subject) return FALSE;
4214         GETCHARINC(c, eptr)         /* Get character; increment eptr */
4215
4216 #ifdef SUPPORT_UTF8
4217         /* We do not yet support class members > 255 */
4218         if (c > 255) return FALSE;
4219 #endif
4220
4221         if ((data[c/8] & (1 << (c&7))) != 0) continue;
4222         return FALSE;
4223         }
4224
4225       /* If max == min we can continue with the main loop without the
4226       need to recurse. */
4227
4228       if (min == max) continue;
4229
4230       /* If minimizing, keep testing the rest of the expression and advancing
4231       the pointer while it matches the class. */
4232
4233       if (minimize)
4234         {
4235         for (i = min;; i++)
4236           {
4237           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4238             return TRUE;
4239           if (i >= max || eptr >= md->end_subject) return FALSE;
4240           GETCHARINC(c, eptr)       /* Get character; increment eptr */
4241
4242 #ifdef SUPPORT_UTF8
4243           /* We do not yet support class members > 255 */
4244           if (c > 255) return FALSE;
4245 #endif
4246           if ((data[c/8] & (1 << (c&7))) != 0) continue;
4247           return FALSE;
4248           }
4249         /* Control never gets here */
4250         }
4251
4252       /* If maximizing, find the longest possible run, then work backwards. */
4253
4254       else
4255         {
4256         const uschar *pp = eptr;
4257         int len = 1;
4258         for (i = min; i < max; i++)
4259           {
4260           if (eptr >= md->end_subject) break;
4261           GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4262
4263 #ifdef SUPPORT_UTF8
4264           /* We do not yet support class members > 255 */
4265           if (c > 255) break;
4266 #endif
4267           if ((data[c/8] & (1 << (c&7))) == 0) break;
4268           eptr += len;
4269           }
4270
4271         while (eptr >= pp)
4272           {
4273           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4274             return TRUE;
4275
4276 #ifdef SUPPORT_UTF8
4277           BACKCHAR(eptr)
4278 #endif
4279           }
4280         return FALSE;
4281         }
4282       }
4283     /* Control never gets here */
4284
4285     /* Match a run of characters */
4286
4287     case OP_CHARS:
4288       {
4289       register int length = ecode[1];
4290       ecode += 2;
4291
4292 #ifdef DEBUG    /* Sigh. Some compilers never learn. */
4293       if (eptr >= md->end_subject)
4294         printf("matching subject <null> against pattern ");
4295       else
4296         {
4297         printf("matching subject ");
4298         pchars(eptr, length, TRUE, md);
4299         printf(" against pattern ");
4300         }
4301       pchars(ecode, length, FALSE, md);
4302       printf("\n");
4303 #endif
4304
4305       if (length > md->end_subject - eptr) return FALSE;
4306       if ((ims & PCRE_CASELESS) != 0)
4307         {
4308         while (length-- > 0)
4309           if (md->lcc[*ecode++] != md->lcc[*eptr++])
4310             return FALSE;
4311         }
4312       else
4313         {
4314         while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4315         }
4316       }
4317     break;
4318
4319     /* Match a single character repeatedly; different opcodes share code. */
4320
4321     case OP_EXACT:
4322     min = max = (ecode[1] << 8) + ecode[2];
4323     ecode += 3;
4324     goto REPEATCHAR;
4325
4326     case OP_UPTO:
4327     case OP_MINUPTO:
4328     min = 0;
4329     max = (ecode[1] << 8) + ecode[2];
4330     minimize = *ecode == OP_MINUPTO;
4331     ecode += 3;
4332     goto REPEATCHAR;
4333
4334     case OP_STAR:
4335     case OP_MINSTAR:
4336     case OP_PLUS:
4337     case OP_MINPLUS:
4338     case OP_QUERY:
4339     case OP_MINQUERY:
4340     c = *ecode++ - OP_STAR;
4341     minimize = (c & 1) != 0;
4342     min = rep_min[c];                 /* Pick up values from tables; */
4343     max = rep_max[c];                 /* zero for max => infinity */
4344     if (max == 0) max = INT_MAX;
4345
4346     /* Common code for all repeated single-character matches. We can give
4347     up quickly if there are fewer than the minimum number of characters left in
4348     the subject. */
4349
4350     REPEATCHAR:
4351     if (min > md->end_subject - eptr) return FALSE;
4352     c = *ecode++;
4353
4354     /* The code is duplicated for the caseless and caseful cases, for speed,
4355     since matching characters is likely to be quite common. First, ensure the
4356     minimum number of matches are present. If min = max, continue at the same
4357     level without recursing. Otherwise, if minimizing, keep trying the rest of
4358     the expression and advancing one matching character if failing, up to the
4359     maximum. Alternatively, if maximizing, find the maximum number of
4360     characters and work backwards. */
4361
4362     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4363       max, eptr));
4364
4365     if ((ims & PCRE_CASELESS) != 0)
4366       {
4367       c = md->lcc[c];
4368       for (i = 1; i <= min; i++)
4369         if (c != md->lcc[*eptr++]) return FALSE;
4370       if (min == max) continue;
4371       if (minimize)
4372         {
4373         for (i = min;; i++)
4374           {
4375           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4376             return TRUE;
4377           if (i >= max || eptr >= md->end_subject ||
4378               c != md->lcc[*eptr++])
4379             return FALSE;
4380           }
4381         /* Control never gets here */
4382         }
4383       else
4384         {
4385         const uschar *pp = eptr;
4386         for (i = min; i < max; i++)
4387           {
4388           if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4389           eptr++;
4390           }
4391         while (eptr >= pp)
4392           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4393             return TRUE;
4394         return FALSE;
4395         }
4396       /* Control never gets here */
4397       }
4398
4399     /* Caseful comparisons */
4400
4401     else
4402       {
4403       for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4404       if (min == max) continue;
4405       if (minimize)
4406         {
4407         for (i = min;; i++)
4408           {
4409           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4410             return TRUE;
4411           if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4412           }
4413         /* Control never gets here */
4414         }
4415       else
4416         {
4417         const uschar *pp = eptr;
4418         for (i = min; i < max; i++)
4419           {
4420           if (eptr >= md->end_subject || c != *eptr) break;
4421           eptr++;
4422           }
4423         while (eptr >= pp)
4424          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4425            return TRUE;
4426         return FALSE;
4427         }
4428       }
4429     /* Control never gets here */
4430
4431     /* Match a negated single character */
4432
4433     case OP_NOT:
4434     if (eptr >= md->end_subject) return FALSE;
4435     ecode++;
4436     if ((ims & PCRE_CASELESS) != 0)
4437       {
4438       if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4439       }
4440     else
4441       {
4442       if (*ecode++ == *eptr++) return FALSE;
4443       }
4444     break;
4445
4446     /* Match a negated single character repeatedly. This is almost a repeat of
4447     the code for a repeated single character, but I haven't found a nice way of
4448     commoning these up that doesn't require a test of the positive/negative
4449     option for each character match. Maybe that wouldn't add very much to the
4450     time taken, but character matching *is* what this is all about... */
4451
4452     case OP_NOTEXACT:
4453     min = max = (ecode[1] << 8) + ecode[2];
4454     ecode += 3;
4455     goto REPEATNOTCHAR;
4456
4457     case OP_NOTUPTO:
4458     case OP_NOTMINUPTO:
4459     min = 0;
4460     max = (ecode[1] << 8) + ecode[2];
4461     minimize = *ecode == OP_NOTMINUPTO;
4462     ecode += 3;
4463     goto REPEATNOTCHAR;
4464
4465     case OP_NOTSTAR:
4466     case OP_NOTMINSTAR:
4467     case OP_NOTPLUS:
4468     case OP_NOTMINPLUS:
4469     case OP_NOTQUERY:
4470     case OP_NOTMINQUERY:
4471     c = *ecode++ - OP_NOTSTAR;
4472     minimize = (c & 1) != 0;
4473     min = rep_min[c];                 /* Pick up values from tables; */
4474     max = rep_max[c];                 /* zero for max => infinity */
4475     if (max == 0) max = INT_MAX;
4476
4477     /* Common code for all repeated single-character matches. We can give
4478     up quickly if there are fewer than the minimum number of characters left in
4479     the subject. */
4480
4481     REPEATNOTCHAR:
4482     if (min > md->end_subject - eptr) return FALSE;
4483     c = *ecode++;
4484
4485     /* The code is duplicated for the caseless and caseful cases, for speed,
4486     since matching characters is likely to be quite common. First, ensure the
4487     minimum number of matches are present. If min = max, continue at the same
4488     level without recursing. Otherwise, if minimizing, keep trying the rest of
4489     the expression and advancing one matching character if failing, up to the
4490     maximum. Alternatively, if maximizing, find the maximum number of
4491     characters and work backwards. */
4492
4493     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4494       max, eptr));
4495
4496     if ((ims & PCRE_CASELESS) != 0)
4497       {
4498       c = md->lcc[c];
4499       for (i = 1; i <= min; i++)
4500         if (c == md->lcc[*eptr++]) return FALSE;
4501       if (min == max) continue;
4502       if (minimize)
4503         {
4504         for (i = min;; i++)
4505           {
4506           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4507             return TRUE;
4508           if (i >= max || eptr >= md->end_subject ||
4509               c == md->lcc[*eptr++])
4510             return FALSE;
4511           }
4512         /* Control never gets here */
4513         }
4514       else
4515         {
4516         const uschar *pp = eptr;
4517         for (i = min; i < max; i++)
4518           {
4519           if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4520           eptr++;
4521           }
4522         while (eptr >= pp)
4523           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4524             return TRUE;
4525         return FALSE;
4526         }
4527       /* Control never gets here */
4528       }
4529
4530     /* Caseful comparisons */
4531
4532     else
4533       {
4534       for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4535       if (min == max) continue;
4536       if (minimize)
4537         {
4538         for (i = min;; i++)
4539           {
4540           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4541             return TRUE;
4542           if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4543           }
4544         /* Control never gets here */
4545         }
4546       else
4547         {
4548         const uschar *pp = eptr;
4549         for (i = min; i < max; i++)
4550           {
4551           if (eptr >= md->end_subject || c == *eptr) break;
4552           eptr++;
4553           }
4554         while (eptr >= pp)
4555          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4556            return TRUE;
4557         return FALSE;
4558         }
4559       }
4560     /* Control never gets here */
4561
4562     /* Match a single character type repeatedly; several different opcodes
4563     share code. This is very similar to the code for single characters, but we
4564     repeat it in the interests of efficiency. */
4565
4566     case OP_TYPEEXACT:
4567     min = max = (ecode[1] << 8) + ecode[2];
4568     minimize = TRUE;
4569     ecode += 3;
4570     goto REPEATTYPE;
4571
4572     case OP_TYPEUPTO:
4573     case OP_TYPEMINUPTO:
4574     min = 0;
4575     max = (ecode[1] << 8) + ecode[2];
4576     minimize = *ecode == OP_TYPEMINUPTO;
4577     ecode += 3;
4578     goto REPEATTYPE;
4579
4580     case OP_TYPESTAR:
4581     case OP_TYPEMINSTAR:
4582     case OP_TYPEPLUS:
4583     case OP_TYPEMINPLUS:
4584     case OP_TYPEQUERY:
4585     case OP_TYPEMINQUERY:
4586     c = *ecode++ - OP_TYPESTAR;
4587     minimize = (c & 1) != 0;
4588     min = rep_min[c];                 /* Pick up values from tables; */
4589     max = rep_max[c];                 /* zero for max => infinity */
4590     if (max == 0) max = INT_MAX;
4591
4592     /* Common code for all repeated single character type matches */
4593
4594     REPEATTYPE:
4595     ctype = *ecode++;      /* Code for the character type */
4596
4597     /* First, ensure the minimum number of matches are present. Use inline
4598     code for maximizing the speed, and do the type test once at the start
4599     (i.e. keep it out of the loop). Also we can test that there are at least
4600     the minimum number of bytes before we start, except when doing '.' in
4601     UTF8 mode. Leave the test in in all cases; in the special case we have
4602     to test after each character. */
4603
4604     if (min > md->end_subject - eptr) return FALSE;
4605     if (min > 0) switch(ctype)
4606       {
4607       case OP_ANY:
4608 #ifdef SUPPORT_UTF8
4609       if (md->utf8)
4610         {
4611         for (i = 1; i <= min; i++)
4612           {
4613           if (eptr >= md->end_subject ||
4614              (*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
4615             return FALSE;
4616           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4617           }
4618         break;
4619         }
4620 #endif
4621       /* Non-UTF8 can be faster */
4622       if ((ims & PCRE_DOTALL) == 0)
4623         { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4624       else eptr += min;
4625       break;
4626
4627       case OP_NOT_DIGIT:
4628       for (i = 1; i <= min; i++)
4629         if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4630       break;
4631
4632       case OP_DIGIT:
4633       for (i = 1; i <= min; i++)
4634         if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4635       break;
4636
4637       case OP_NOT_WHITESPACE:
4638       for (i = 1; i <= min; i++)
4639         if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4640       break;
4641
4642       case OP_WHITESPACE:
4643       for (i = 1; i <= min; i++)
4644         if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4645       break;
4646
4647       case OP_NOT_WORDCHAR:
4648       for (i = 1; i <= min; i++)
4649         if ((md->ctypes[*eptr++] & ctype_word) != 0)
4650           return FALSE;
4651       break;
4652
4653       case OP_WORDCHAR:
4654       for (i = 1; i <= min; i++)
4655         if ((md->ctypes[*eptr++] & ctype_word) == 0)
4656           return FALSE;
4657       break;
4658       }
4659
4660     /* If min = max, continue at the same level without recursing */
4661
4662     if (min == max) continue;
4663
4664     /* If minimizing, we have to test the rest of the pattern before each
4665     subsequent match. */
4666
4667     if (minimize)
4668       {
4669       for (i = min;; i++)
4670         {
4671         if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4672         if (i >= max || eptr >= md->end_subject) return FALSE;
4673
4674         c = *eptr++;
4675         switch(ctype)
4676           {
4677           case OP_ANY:
4678           if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4679 #ifdef SUPPORT_UTF8
4680           if (md->utf8)
4681             while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4682 #endif
4683           break;
4684
4685           case OP_NOT_DIGIT:
4686           if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4687           break;
4688
4689           case OP_DIGIT:
4690           if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4691           break;
4692
4693           case OP_NOT_WHITESPACE:
4694           if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4695           break;
4696
4697           case OP_WHITESPACE:
4698           if  ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4699           break;
4700
4701           case OP_NOT_WORDCHAR:
4702           if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4703           break;
4704
4705           case OP_WORDCHAR:
4706           if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4707           break;
4708           }
4709         }
4710       /* Control never gets here */
4711       }
4712
4713     /* If maximizing it is worth using inline code for speed, doing the type
4714     test once at the start (i.e. keep it out of the loop). */
4715
4716     else
4717       {
4718       const uschar *pp = eptr;
4719       switch(ctype)
4720         {
4721         case OP_ANY:
4722
4723         /* Special code is required for UTF8, but when the maximum is unlimited
4724         we don't need it. */
4725
4726 #ifdef SUPPORT_UTF8
4727         if (md->utf8 && max < INT_MAX)
4728           {
4729           if ((ims & PCRE_DOTALL) == 0)
4730             {
4731             for (i = min; i < max; i++)
4732               {
4733               if (eptr >= md->end_subject || *eptr++ == '\n') break;
4734               while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4735               }
4736             }
4737           else
4738             {
4739             for (i = min; i < max; i++)
4740               {
4741               eptr++;
4742               while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4743               }
4744             }
4745           break;
4746           }
4747 #endif
4748         /* Non-UTF8 can be faster */
4749         if ((ims & PCRE_DOTALL) == 0)
4750           {
4751           for (i = min; i < max; i++)
4752             {
4753             if (eptr >= md->end_subject || *eptr == '\n') break;
4754             eptr++;
4755             }
4756           }
4757         else
4758           {
4759           c = max - min;
4760           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4761           eptr += c;
4762           }
4763         break;
4764
4765         case OP_NOT_DIGIT:
4766         for (i = min; i < max; i++)
4767           {
4768           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4769             break;
4770           eptr++;
4771           }
4772         break;
4773
4774         case OP_DIGIT:
4775         for (i = min; i < max; i++)
4776           {
4777           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4778             break;
4779           eptr++;
4780           }
4781         break;
4782
4783         case OP_NOT_WHITESPACE:
4784         for (i = min; i < max; i++)
4785           {
4786           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4787             break;
4788           eptr++;
4789           }
4790         break;
4791
4792         case OP_WHITESPACE:
4793         for (i = min; i < max; i++)
4794           {
4795           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4796             break;
4797           eptr++;
4798           }
4799         break;
4800
4801         case OP_NOT_WORDCHAR:
4802         for (i = min; i < max; i++)
4803           {
4804           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4805             break;
4806           eptr++;
4807           }
4808         break;
4809
4810         case OP_WORDCHAR:
4811         for (i = min; i < max; i++)
4812           {
4813           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4814             break;
4815           eptr++;
4816           }
4817         break;
4818         }
4819
4820       while (eptr >= pp)
4821         {
4822         if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4823           return TRUE;
4824 #ifdef SUPPORT_UTF8
4825         if (md->utf8)
4826           while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4827 #endif
4828         }
4829       return FALSE;
4830       }
4831     /* Control never gets here */
4832
4833     /* There's been some horrible disaster. */
4834
4835     default:
4836     DPRINTF(("Unknown opcode %d\n", *ecode));
4837     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4838     return FALSE;
4839     }
4840
4841   /* Do not stick any code in here without much thought; it is assumed
4842   that "continue" in the code above comes out to here to repeat the main
4843   loop. */
4844
4845   }             /* End of main loop */
4846 /* Control never reaches here */
4847 }
4848
4849
4850
4851
4852 /*************************************************
4853 *         Execute a Regular Expression           *
4854 *************************************************/
4855
4856 /* This function applies a compiled re to a subject string and picks out
4857 portions of the string if it matches. Two elements in the vector are set for
4858 each substring: the offsets to the start and end of the substring.
4859
4860 Arguments:
4861   external_re     points to the compiled expression
4862   external_extra  points to "hints" from pcre_study() or is NULL
4863   subject         points to the subject string
4864   length          length of subject string (may contain binary zeros)
4865   start_offset    where to start in the subject string
4866   options         option bits
4867   offsets         points to a vector of ints to be filled in with offsets
4868   offsetcount     the number of elements in the vector
4869
4870 Returns:          > 0 => success; value is the number of elements filled in
4871                   = 0 => success, but offsets is not big enough
4872                    -1 => failed to match
4873                  < -1 => some kind of unexpected problem
4874 */
4875
4876 int
4877 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4878   const char *subject, int length, int start_offset, int options, int *offsets,
4879   int offsetcount)
4880 {
4881 int resetcount, ocount;
4882 int first_char = -1;
4883 int req_char = -1;
4884 int req_char2 = -1;
4885 unsigned long int ims = 0;
4886 match_data match_block;
4887 const uschar *start_bits = NULL;
4888 const uschar *start_match = (const uschar *)subject + start_offset;
4889 const uschar *end_subject;
4890 const uschar *req_char_ptr = start_match - 1;
4891 const real_pcre *re = (const real_pcre *)external_re;
4892 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4893 BOOL using_temporary_offsets = FALSE;
4894 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4895 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4896
4897 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4898
4899 if (re == NULL || subject == NULL ||
4900    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4901 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4902
4903 match_block.start_pattern = re->code;
4904 match_block.start_subject = (const uschar *)subject;
4905 match_block.end_subject = match_block.start_subject + length;
4906 end_subject = match_block.end_subject;
4907
4908 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4909 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4910
4911 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4912 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4913 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4914
4915 match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4916
4917 match_block.lcc = re->tables + lcc_offset;
4918 match_block.ctypes = re->tables + ctypes_offset;
4919
4920 /* The ims options can vary during the matching as a result of the presence
4921 of (?ims) items in the pattern. They are kept in a local variable so that
4922 restoring at the exit of a group is easy. */
4923
4924 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4925
4926 /* If the expression has got more back references than the offsets supplied can
4927 hold, we get a temporary bit of working store to use during the matching.
4928 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4929 of 3. */
4930
4931 ocount = offsetcount - (offsetcount % 3);
4932
4933 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4934   {
4935   ocount = re->top_backref * 3 + 3;
4936   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4937   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4938   using_temporary_offsets = TRUE;
4939   DPRINTF(("Got memory to hold back references\n"));
4940   }
4941 else match_block.offset_vector = offsets;
4942
4943 match_block.offset_end = ocount;
4944 match_block.offset_max = (2*ocount)/3;
4945 match_block.offset_overflow = FALSE;
4946
4947 /* Compute the minimum number of offsets that we need to reset each time. Doing
4948 this makes a huge difference to execution time when there aren't many brackets
4949 in the pattern. */
4950
4951 resetcount = 2 + re->top_bracket * 2;
4952 if (resetcount > offsetcount) resetcount = ocount;
4953
4954 /* Reset the working variable associated with each extraction. These should
4955 never be used unless previously set, but they get saved and restored, and so we
4956 initialize them to avoid reading uninitialized locations. */
4957
4958 if (match_block.offset_vector != NULL)
4959   {
4960   register int *iptr = match_block.offset_vector + ocount;
4961   register int *iend = iptr - resetcount/2 + 1;
4962   while (--iptr >= iend) *iptr = -1;
4963   }
4964
4965 /* Set up the first character to match, if available. The first_char value is
4966 never set for an anchored regular expression, but the anchoring may be forced
4967 at run time, so we have to test for anchoring. The first char may be unset for
4968 an unanchored pattern, of course. If there's no first char and the pattern was
4969 studied, there may be a bitmap of possible first characters. */
4970
4971 if (!anchored)
4972   {
4973   if ((re->options & PCRE_FIRSTSET) != 0)
4974     {
4975     first_char = re->first_char;
4976     if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4977     }
4978   else
4979     if (!startline && extra != NULL &&
4980       (extra->options & PCRE_STUDY_MAPPED) != 0)
4981         start_bits = extra->start_bits;
4982   }
4983
4984 /* For anchored or unanchored matches, there may be a "last known required
4985 character" set. If the PCRE_CASELESS is set, implying that the match starts
4986 caselessly, or if there are any changes of this flag within the regex, set up
4987 both cases of the character. Otherwise set the two values the same, which will
4988 avoid duplicate testing (which takes significant time). This covers the vast
4989 majority of cases. It will be suboptimal when the case flag changes in a regex
4990 and the required character in fact is caseful. */
4991
4992 if ((re->options & PCRE_REQCHSET) != 0)
4993   {
4994   req_char = re->req_char;
4995   req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4996     (re->tables + fcc_offset)[req_char] : req_char;
4997   }
4998
4999 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5000 the loop runs just once. */
5001
5002 do
5003   {
5004   int rc;
5005   register int *iptr = match_block.offset_vector;
5006   register int *iend = iptr + resetcount;
5007
5008   /* Reset the maximum number of extractions we might see. */
5009
5010   while (iptr < iend) *iptr++ = -1;
5011
5012   /* Advance to a unique first char if possible */
5013
5014   if (first_char >= 0)
5015     {
5016     if ((ims & PCRE_CASELESS) != 0)
5017       while (start_match < end_subject &&
5018              match_block.lcc[*start_match] != first_char)
5019         start_match++;
5020     else
5021       while (start_match < end_subject && *start_match != first_char)
5022         start_match++;
5023     }
5024
5025   /* Or to just after \n for a multiline match if possible */
5026
5027   else if (startline)
5028     {
5029     if (start_match > match_block.start_subject + start_offset)
5030       {
5031       while (start_match < end_subject && start_match[-1] != '\n')
5032         start_match++;
5033       }
5034     }
5035
5036   /* Or to a non-unique first char after study */
5037
5038   else if (start_bits != NULL)
5039     {
5040     while (start_match < end_subject)
5041       {
5042       register int c = *start_match;
5043       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5044       }
5045     }
5046
5047 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
5048   printf(">>>> Match against: ");
5049   pchars(start_match, end_subject - start_match, TRUE, &match_block);
5050   printf("\n");
5051 #endif
5052
5053   /* If req_char is set, we know that that character must appear in the subject
5054   for the match to succeed. If the first character is set, req_char must be
5055   later in the subject; otherwise the test starts at the match point. This
5056   optimization can save a huge amount of backtracking in patterns with nested
5057   unlimited repeats that aren't going to match. We don't know what the state of
5058   case matching may be when this character is hit, so test for it in both its
5059   cases if necessary. However, the different cased versions will not be set up
5060   unless PCRE_CASELESS was given or the casing state changes within the regex.
5061   Writing separate code makes it go faster, as does using an autoincrement and
5062   backing off on a match. */
5063
5064   if (req_char >= 0)
5065     {
5066     register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5067
5068     /* We don't need to repeat the search if we haven't yet reached the
5069     place we found it at last time. */
5070
5071     if (p > req_char_ptr)
5072       {
5073       /* Do a single test if no case difference is set up */
5074
5075       if (req_char == req_char2)
5076         {
5077         while (p < end_subject)
5078           {
5079           if (*p++ == req_char) { p--; break; }
5080           }
5081         }
5082
5083       /* Otherwise test for either case */
5084
5085       else
5086         {
5087         while (p < end_subject)
5088           {
5089           register int pp = *p++;
5090           if (pp == req_char || pp == req_char2) { p--; break; }
5091           }
5092         }
5093
5094       /* If we can't find the required character, break the matching loop */
5095
5096       if (p >= end_subject) break;
5097
5098       /* If we have found the required character, save the point where we
5099       found it, so that we don't search again next time round the loop if
5100       the start hasn't passed this character yet. */
5101
5102       req_char_ptr = p;
5103       }
5104     }
5105
5106   /* When a match occurs, substrings will be set for all internal extractions;
5107   we just need to set up the whole thing as substring 0 before returning. If
5108   there were too many extractions, set the return code to zero. In the case
5109   where we had to get some local store to hold offsets for backreferences, copy
5110   those back references that we can. In this case there need not be overflow
5111   if certain parts of the pattern were not used. */
5112
5113   match_block.start_match = start_match;
5114   if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5115     continue;
5116
5117   /* Copy the offset information from temporary store if necessary */
5118
5119   if (using_temporary_offsets)
5120     {
5121     if (offsetcount >= 4)
5122       {
5123       memcpy(offsets + 2, match_block.offset_vector + 2,
5124         (offsetcount - 2) * sizeof(int));
5125       DPRINTF(("Copied offsets from temporary memory\n"));
5126       }
5127     if (match_block.end_offset_top > offsetcount)
5128       match_block.offset_overflow = TRUE;
5129
5130     DPRINTF(("Freeing temporary memory\n"));
5131     (pcre_free)(match_block.offset_vector);
5132     }
5133
5134   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5135
5136   if (match_block.offset_end < 2) rc = 0; else
5137     {
5138     offsets[0] = start_match - match_block.start_subject;
5139     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5140     }
5141
5142   DPRINTF((">>>> returning %d\n", rc));
5143   return rc;
5144   }
5145
5146 /* This "while" is the end of the "do" above */
5147
5148 while (!anchored &&
5149        match_block.errorcode == PCRE_ERROR_NOMATCH &&
5150        start_match++ < end_subject);
5151
5152 if (using_temporary_offsets)
5153   {
5154   DPRINTF(("Freeing temporary memory\n"));
5155   (pcre_free)(match_block.offset_vector);
5156   }
5157
5158 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5159
5160 return match_block.errorcode;
5161 }
5162
5163 /* End of pcre.c */