1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
10 Written by: Philip Hazel <ph10@cam.ac.uk>
12 Copyright (c) 1997-2000 University of Cambridge
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
36 /* Define DEBUG to get debugging output on stdout. */
40 /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41 inline, and there are *still* stupid compilers about that don't like indented
42 pre-processor statements. I suppose it's only been 10 years... */
45 #define DPRINTF(p) printf p
47 #define DPRINTF(p) /*nothing*/
50 /* Include the internals header, which itself includes Standard C headers plus
51 the external pcre header. */
56 /* Allow compilation as C++ source code, should anybody want to do that. */
59 #define class pcre_class
63 /* Number of items on the nested bracket stacks at compile time. This should
64 not be set greater than 200. */
66 #define BRASTACK_SIZE 200
69 /* The number of bytes in a literal character string above which we can't add
70 any more is different when UTF-8 characters may be encountered. */
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
84 /* Text forms of OP_ values and things, for debugging (not all used) */
87 static const char *OP_names[] = {
88 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
89 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
90 "Opt", "^", "$", "Any", "chars", "not",
91 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
92 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
93 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
94 "*", "*?", "+", "+?", "?", "??", "{", "{",
95 "class", "Ref", "Recurse",
96 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
97 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
98 "Brazero", "Braminzero", "Bra"
102 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
103 are simple data values; negative values are for special things like \d and so
104 on. Zero means further processing is needed (for things like \x), or the escape
107 static const short int escapes[] = {
108 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
109 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
110 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
111 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
112 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
113 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
114 '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
115 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
116 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
117 0, 0, -ESC_z /* x - z */
120 /* Tables of names of POSIX character classes and their lengths. The list is
121 terminated by a zero length entry. The first three must be alpha, upper, lower,
122 as this is assumed for handling case independence. */
124 static const char *posix_names[] = {
125 "alpha", "lower", "upper",
126 "alnum", "ascii", "cntrl", "digit", "graph",
127 "print", "punct", "space", "word", "xdigit" };
129 static const uschar posix_name_lengths[] = {
130 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
132 /* Table of class bit maps for each POSIX class; up to three may be combined
133 to form the class. */
135 static const int posix_class_maps[] = {
136 cbit_lower, cbit_upper, -1, /* alpha */
137 cbit_lower, -1, -1, /* lower */
138 cbit_upper, -1, -1, /* upper */
139 cbit_digit, cbit_lower, cbit_upper, /* alnum */
140 cbit_print, cbit_cntrl, -1, /* ascii */
141 cbit_cntrl, -1, -1, /* cntrl */
142 cbit_digit, -1, -1, /* digit */
143 cbit_graph, -1, -1, /* graph */
144 cbit_print, -1, -1, /* print */
145 cbit_punct, -1, -1, /* punct */
146 cbit_space, -1, -1, /* space */
147 cbit_word, -1, -1, /* word */
148 cbit_xdigit,-1, -1 /* xdigit */
152 /* Definition to allow mutual recursion */
155 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
156 BOOL, int, int *, int *, compile_data *);
158 /* Structure for building a chain of data that actually lives on the
159 stack, for holding the values of the subject pointer at the start of each
160 subpattern, so as to detect when an empty string has been matched by a
161 subpattern - to break infinite loops. */
163 typedef struct eptrblock {
164 struct eptrblock *prev;
165 const uschar *saved_eptr;
168 /* Flag bits for the match() function */
170 #define match_condassert 0x01 /* Called to check a condition assertion */
171 #define match_isgroup 0x02 /* Set if start of bracketed group */
175 /*************************************************
177 *************************************************/
179 /* PCRE is thread-clean and doesn't use any global variables in the normal
180 sense. However, it calls memory allocation and free functions via the two
181 indirections below, which are can be changed by the caller, but are shared
182 between all threads. */
184 void *(*pcre_malloc)(size_t) = malloc;
185 void (*pcre_free)(void *) = free;
189 /*************************************************
190 * Macros and tables for character handling *
191 *************************************************/
193 /* When UTF-8 encoding is being used, a character is no longer just a single
194 byte. The macros for character handling generate simple sequences when used in
195 byte-mode, and more complicated ones for UTF-8 characters. */
198 #define GETCHARINC(c, eptr) c = *eptr++;
199 #define GETCHARLEN(c, eptr, len) c = *eptr;
200 #define BACKCHAR(eptr)
202 #else /* SUPPORT_UTF8 */
204 /* Get the next UTF-8 character, advancing the pointer */
206 #define GETCHARINC(c, eptr) \
208 if (md->utf8 && (c & 0xc0) == 0xc0) \
210 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
211 int s = 6 - a; /* Amount to shift next byte */ \
212 c &= utf8_table3[a]; /* Low order bits from first byte */ \
215 c |= (*eptr++ & 0x3f) << s; \
220 /* Get the next UTF-8 character, not advancing the pointer, setting length */
222 #define GETCHARLEN(c, eptr, len) \
225 if (md->utf8 && (c & 0xc0) == 0xc0) \
228 int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
229 int s = 6 - a; /* Amount to shift next byte */ \
230 c &= utf8_table3[a]; /* Low order bits from first byte */ \
231 for (i = 1; i <= a; i++) \
233 c |= (eptr[i] & 0x3f) << s; \
239 /* If the pointer is not at the start of a character, move it back until
242 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
248 /*************************************************
249 * Default character tables *
250 *************************************************/
252 /* A default set of character tables is included in the PCRE binary. Its source
253 is built by the maketables auxiliary program, which uses the default C ctypes
254 functions, and put in the file chartables.c. These tables are used by PCRE
255 whenever the caller of pcre_compile() does not provide an alternate set of
258 #include "chartables.c"
263 /*************************************************
264 * Tables for UTF-8 support *
265 *************************************************/
267 /* These are the breakpoints for different numbers of bytes in a UTF-8
270 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
272 /* These are the indicator bits and the mask for the data bits to set in the
273 first byte of a character, indexed by the number of additional bytes. */
275 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
276 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
278 /* Table of the number of extra characters, indexed by the first character
279 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
282 static uschar utf8_table4[] = {
283 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
284 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
285 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
286 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
289 /*************************************************
290 * Convert character value to UTF-8 *
291 *************************************************/
293 /* This function takes an integer value in the range 0 - 0x7fffffff
294 and encodes it as a UTF-8 character in 0 to 6 bytes.
297 cvalue the character value
298 buffer pointer to buffer for result - at least 6 bytes long
300 Returns: number of characters placed in the buffer
304 ord2utf8(int cvalue, uschar *buffer)
307 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
308 if (cvalue <= utf8_table1[i]) break;
309 *buffer++ = utf8_table2[i] | (cvalue & utf8_table3[i]);
311 for (j = 0; j < i; j++)
313 *buffer++ = 0x80 | (cvalue & 0x3f);
322 /*************************************************
323 * Return version string *
324 *************************************************/
326 #define STRING(a) # a
327 #define XSTRING(s) STRING(s)
332 return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
338 /*************************************************
339 * (Obsolete) Return info about compiled pattern *
340 *************************************************/
342 /* This is the original "info" function. It picks potentially useful data out
343 of the private structure, but its interface was too rigid. It remains for
344 backwards compatibility. The public options are passed back in an int - though
345 the re->options field has been expanded to a long int, all the public options
346 at the low end of it, and so even on 16-bit systems this will still be OK.
347 Therefore, I haven't changed the API for pcre_info().
350 external_re points to compiled code
351 optptr where to pass back the options
352 first_char where to pass back the first character,
353 or -1 if multiline and all branches start ^,
356 Returns: number of capturing subpatterns
357 or negative values on error
361 pcre_info(const pcre *external_re, int *optptr, int *first_char)
363 const real_pcre *re = (const real_pcre *)external_re;
364 if (re == NULL) return PCRE_ERROR_NULL;
365 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
366 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
367 if (first_char != NULL)
368 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
369 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
370 return re->top_bracket;
375 /*************************************************
376 * Return info about compiled pattern *
377 *************************************************/
379 /* This is a newer "info" function which has an extensible interface so
380 that additional items can be added compatibly.
383 external_re points to compiled code
384 external_study points to study data, or NULL
385 what what information is required
386 where where to put the information
388 Returns: 0 if data returned, negative on error
392 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
395 const real_pcre *re = (const real_pcre *)external_re;
396 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
398 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
399 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
403 case PCRE_INFO_OPTIONS:
404 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
408 *((size_t *)where) = re->size;
411 case PCRE_INFO_CAPTURECOUNT:
412 *((int *)where) = re->top_bracket;
415 case PCRE_INFO_BACKREFMAX:
416 *((int *)where) = re->top_backref;
419 case PCRE_INFO_FIRSTCHAR:
421 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
422 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
425 case PCRE_INFO_FIRSTTABLE:
426 *((const uschar **)where) =
427 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
428 study->start_bits : NULL;
431 case PCRE_INFO_LASTLITERAL:
433 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
436 default: return PCRE_ERROR_BADOPTION;
445 /*************************************************
446 * Debugging function to print chars *
447 *************************************************/
449 /* Print a sequence of chars in printable format, stopping at the end of the
450 subject if the requested.
453 p points to characters
454 length number to print
455 is_subject TRUE if printing from within md->start_subject
456 md pointer to matching data block, if is_subject is TRUE
462 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
465 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
467 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
474 /*************************************************
476 *************************************************/
478 /* This function is called when a \ has been encountered. It either returns a
479 positive value for a simple escape such as \n, or a negative value which
480 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
481 a positive value greater than 255 may be returned. On entry, ptr is pointing at
482 the \. On exit, it is on the final character of the escape sequence.
485 ptrptr points to the pattern position pointer
486 errorptr points to the pointer to the error message
487 bracount number of previous extracting brackets
488 options the options bits
489 isclass TRUE if inside a character class
490 cd pointer to char tables block
492 Returns: zero or positive => a data character
493 negative => a special escape sequence
494 on error, errorptr is set
498 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
499 int options, BOOL isclass, compile_data *cd)
501 const uschar *ptr = *ptrptr;
504 /* If backslash is at the end of the pattern, it's an error. */
507 if (c == 0) *errorptr = ERR1;
509 /* Digits or letters may have special meaning; all others are literals. */
511 else if (c < '0' || c > 'z') {}
513 /* Do an initial lookup in a table. A non-zero result is something that can be
514 returned immediately. Otherwise further processing may be required. */
516 else if ((i = escapes[c - '0']) != 0) c = i;
518 /* Escapes that need further processing, or are illegal. */
522 const uschar *oldptr;
525 /* The handling of escape sequences consisting of a string of digits
526 starting with one that is not zero is not straightforward. By experiment,
527 the way Perl works seems to be as follows:
529 Outside a character class, the digits are read as a decimal number. If the
530 number is less than 10, or if there are that many previous extracting
531 left brackets, then it is a back reference. Otherwise, up to three octal
532 digits are read to form an escaped byte. Thus \123 is likely to be octal
533 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
534 value is greater than 377, the least significant 8 bits are taken. Inside a
535 character class, \ followed by a digit is always an octal number. */
537 case '1': case '2': case '3': case '4': case '5':
538 case '6': case '7': case '8': case '9':
544 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
545 c = c * 10 + *(++ptr) - '0';
546 if (c < 10 || c <= bracount)
551 ptr = oldptr; /* Put the pointer back and fall through */
554 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
555 generates a binary zero byte and treats the digit as a following literal.
556 Thus we have to pull back the pointer by one. */
558 if ((c = *ptr) >= '8')
565 /* \0 always starts an octal number, but we may drop through to here with a
566 larger first octal digit. */
570 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
571 ptr[1] != '8' && ptr[1] != '9')
572 c = c * 8 + *(++ptr) - '0';
573 c &= 255; /* Take least significant 8 bits */
576 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
577 which can be greater than 0xff, but only if the ddd are hex digits. */
581 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
583 const uschar *pt = ptr + 2;
584 register int count = 0;
586 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
589 c = c * 16 + cd->lcc[*pt] -
590 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
595 if (c < 0 || count > 8) *errorptr = ERR34;
599 /* If the sequence of hex digits does not end with '}', then we don't
600 recognize this construct; fall through to the normal \x handling. */
604 /* Read just a single hex char */
607 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
610 c = c * 16 + cd->lcc[*ptr] -
611 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
615 /* Other special escapes not starting with a digit are straightforward */
625 /* A letter is upper-cased; then the 0x40 bit is flipped */
627 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
631 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
632 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
633 for Perl compatibility, it is a literal. This code looks a bit odd, but
634 there used to be some cases other than the default, and there may be again
635 in future, so I haven't "optimized" it. */
638 if ((options & PCRE_EXTRA) != 0) switch(c)
654 /*************************************************
655 * Check for counted repeat *
656 *************************************************/
658 /* This function is called when a '{' is encountered in a place where it might
659 start a quantifier. It looks ahead to see if it really is a quantifier or not.
660 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
661 where the ddds are digits.
664 p pointer to the first char after '{'
665 cd pointer to char tables block
667 Returns: TRUE or FALSE
671 is_counted_repeat(const uschar *p, compile_data *cd)
673 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
674 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
675 if (*p == '}') return TRUE;
677 if (*p++ != ',') return FALSE;
678 if (*p == '}') return TRUE;
680 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
681 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
687 /*************************************************
688 * Read repeat counts *
689 *************************************************/
691 /* Read an item of the form {n,m} and return the values. This is called only
692 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
693 so the syntax is guaranteed to be correct, but we need to check the values.
696 p pointer to first char after '{'
697 minp pointer to int for min
698 maxp pointer to int for max
699 returned as -1 if no max
700 errorptr points to pointer to error message
701 cd pointer to character tables clock
703 Returns: pointer to '}' on success;
704 current ptr on error, with errorptr set
707 static const uschar *
708 read_repeat_counts(const uschar *p, int *minp, int *maxp,
709 const char **errorptr, compile_data *cd)
714 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
716 if (*p == '}') max = min; else
721 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
730 /* Do paranoid checks, then fill in the required variables, and pass back the
731 pointer to the terminating '}'. */
733 if (min > 65535 || max > 65535)
745 /*************************************************
746 * Find the fixed length of a pattern *
747 *************************************************/
749 /* Scan a pattern and compute the fixed length of subject that will match it,
750 if the length is fixed. This is needed for dealing with backward assertions.
753 code points to the start of the pattern (the bracket)
754 options the compiling options
756 Returns: the fixed length, or -1 if there is no fixed length
760 find_fixedlength(uschar *code, int options)
764 register int branchlength = 0;
765 register uschar *cc = code + 3;
767 /* Scan along the opcodes for this branch. If we get to the end of the
768 branch, check the length against that of the other branches. */
773 register int op = *cc;
774 if (op >= OP_BRA) op = OP_BRA;
781 d = find_fixedlength(cc, options);
782 if (d < 0) return -1;
784 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
788 /* Reached end of a branch; if it's a ket it is the end of a nested
789 call. If it's ALT it is an alternation in a nested call. If it is
790 END it's the end of the outer call. All can be handled by the same code. */
797 if (length < 0) length = branchlength;
798 else if (length != branchlength) return -1;
799 if (*cc != OP_ALT) return length;
804 /* Skip over assertive subpatterns */
809 case OP_ASSERTBACK_NOT:
810 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
814 /* Skip over things that don't match chars */
830 case OP_NOT_WORD_BOUNDARY:
831 case OP_WORD_BOUNDARY:
835 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
836 This requires a scan of the string, unfortunately. We assume valid UTF-8
837 strings, so all we do is reduce the length by one for byte whose bits are
841 branchlength += *(++cc);
843 for (d = 1; d <= *cc; d++)
844 if ((cc[d] & 0xc0) == 0x80) branchlength--;
849 /* Handle exact repetitions */
853 branchlength += (cc[1] << 8) + cc[2];
857 /* Handle single-char matchers */
861 case OP_NOT_WHITESPACE:
863 case OP_NOT_WORDCHAR:
871 /* Check a class for variable quantification */
874 cc += (*cc == OP_REF)? 2 : 33;
886 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
887 branchlength += (cc[1] << 8) + cc[2];
896 /* Anything else is variable length */
902 /* Control never gets here */
908 /*************************************************
909 * Check for POSIX class syntax *
910 *************************************************/
912 /* This function is called when the sequence "[:" or "[." or "[=" is
913 encountered in a character class. It checks whether this is followed by an
914 optional ^ and then a sequence of letters, terminated by a matching ":]" or
918 ptr pointer to the initial [
919 endptr where to return the end pointer
920 cd pointer to compile data
922 Returns: TRUE or FALSE
926 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
928 int terminator; /* Don't combine these lines; the Solaris cc */
929 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
930 if (*(++ptr) == '^') ptr++;
931 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
932 if (*ptr == terminator && ptr[1] == ']')
943 /*************************************************
944 * Check POSIX class name *
945 *************************************************/
947 /* This function is called to check the name given in a POSIX-style class entry
951 ptr points to the first letter
952 len the length of the name
954 Returns: a value representing the name, or -1 if unknown
958 check_posix_name(const uschar *ptr, int len)
960 register int yield = 0;
961 while (posix_name_lengths[yield] != 0)
963 if (len == posix_name_lengths[yield] &&
964 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
973 /*************************************************
974 * Compile one branch *
975 *************************************************/
977 /* Scan the pattern, compiling it into the code vector.
980 options the option bits
981 brackets points to number of brackets used
982 code points to the pointer to the current code point
983 ptrptr points to the current pattern pointer
984 errorptr points to pointer to error message
985 optchanged set to the value of the last OP_OPT item compiled
986 reqchar set to the last literal character required, else -1
987 countlits set to count of mandatory literal characters
988 cd contains pointers to tables
990 Returns: TRUE on success
991 FALSE, with *errorptr set on error
995 compile_branch(int options, int *brackets, uschar **codeptr,
996 const uschar **ptrptr, const char **errorptr, int *optchanged,
997 int *reqchar, int *countlits, compile_data *cd)
999 int repeat_type, op_type;
1000 int repeat_min, repeat_max;
1001 int bravalue, length;
1002 int greedy_default, greedy_non_default;
1005 int subcountlits = 0;
1007 register uschar *code = *codeptr;
1009 const uschar *ptr = *ptrptr;
1010 const uschar *tempptr;
1011 uschar *previous = NULL;
1014 /* Set up the default and non-default settings for greediness */
1016 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1017 greedy_non_default = greedy_default ^ 1;
1019 /* Initialize no required char, and count of literals */
1021 *reqchar = prevreqchar = -1;
1024 /* Switch on next character until the end of the branch */
1029 int class_charcount;
1036 if ((options & PCRE_EXTENDED) != 0)
1038 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1041 /* The space before the ; is to avoid a warning on a silly compiler
1042 on the Macintosh. */
1043 while ((c = *(++ptr)) != 0 && c != '\n') ;
1050 /* The branch terminates at end of string, |, or ). */
1059 /* Handle single-character metacharacters */
1076 /* Character classes. These always build a 32-byte bitmap of the permitted
1077 characters, except in the special case where there is only one character.
1078 For negated classes, we build the map as usual, then invert it at the end.
1085 /* If the first character is '^', set the negation flag and skip it. */
1087 if ((c = *(++ptr)) == '^')
1089 negate_class = TRUE;
1092 else negate_class = FALSE;
1094 /* Keep a count of chars so that we can optimize the case of just a single
1097 class_charcount = 0;
1098 class_lastchar = -1;
1100 /* Initialize the 32-char bit map to all zeros. We have to build the
1101 map in a temporary bit of store, in case the class contains only 1
1102 character, because in that case the compiled code doesn't use the
1105 memset(class, 0, 32 * sizeof(uschar));
1107 /* Process characters until ] is reached. By writing this as a "do" it
1108 means that an initial ] is taken as a data character. */
1118 /* Handle POSIX class names. Perl allows a negation extension of the
1119 form [:^name]. A square bracket that doesn't match the syntax is
1120 treated as a literal. We also recognize the POSIX constructions
1121 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1125 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1126 check_posix_syntax(ptr, &tempptr, cd))
1128 BOOL local_negate = FALSE;
1130 register const uschar *cbits = cd->cbits;
1141 local_negate = TRUE;
1145 posix_class = check_posix_name(ptr, tempptr - ptr);
1146 if (posix_class < 0)
1152 /* If matching is caseless, upper and lower are converted to
1153 alpha. This relies on the fact that the class table starts with
1154 alpha, lower, upper as the first 3 entries. */
1156 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1159 /* Or into the map we are building up to 3 of the static class
1160 tables, or their negations. */
1163 for (i = 0; i < 3; i++)
1165 int taboffset = posix_class_maps[posix_class + i];
1166 if (taboffset < 0) break;
1168 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1170 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1174 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1178 /* Backslash may introduce a single character, or it may introduce one
1179 of the specials, which just set a flag. Escaped items are checked for
1180 validity in the pre-compiling pass. The sequence \b is a special case.
1181 Inside a class (and only there) it is treated as backspace. Elsewhere
1182 it marks a word boundary. Other escapes have preset maps ready to
1183 or into the one we are building. We assume they have more than one
1184 character in them, so set class_count bigger than one. */
1188 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1189 if (-c == ESC_b) c = '\b';
1192 register const uschar *cbits = cd->cbits;
1193 class_charcount = 10;
1197 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1201 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1205 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1209 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1213 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1217 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1226 /* Fall through if single character, but don't at present allow
1227 chars > 255 in UTF-8 mode. */
1238 /* A single character may be followed by '-' to form a range. However,
1239 Perl does not permit ']' to be the end of the range. A '-' character
1240 here is treated as a literal. */
1242 if (ptr[1] == '-' && ptr[2] != ']')
1254 /* The second part of a range can be a single-character escape, but
1255 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1256 in such circumstances. */
1260 const uschar *oldptr = ptr;
1261 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1270 /* \b is backslash; any other special means the '-' was literal */
1274 if (d == -ESC_b) d = '\b'; else
1277 goto SINGLE_CHARACTER; /* A few lines below */
1290 class[c/8] |= (1 << (c&7));
1291 if ((options & PCRE_CASELESS) != 0)
1293 int uc = cd->fcc[c]; /* flip case */
1294 class[uc/8] |= (1 << (uc&7));
1296 class_charcount++; /* in case a one-char range */
1299 continue; /* Go get the next char in the class */
1302 /* Handle a lone single character - we can get here for a normal
1303 non-escape char, or after \ that introduces a single character. */
1307 class [c/8] |= (1 << (c&7));
1308 if ((options & PCRE_CASELESS) != 0)
1310 c = cd->fcc[c]; /* flip case */
1311 class[c/8] |= (1 << (c&7));
1317 /* Loop until ']' reached; the check for end of string happens inside the
1318 loop. This "while" is the end of the "do" above. */
1320 while ((c = *(++ptr)) != ']');
1322 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1323 precisely one character. This doesn't need the whole 32-byte bit map.
1324 We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
1327 if (class_charcount == 1 && class_lastchar >= 0)
1335 code[-1] = OP_CHARS;
1338 *code++ = class_lastchar;
1341 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1347 for (c = 0; c < 32; c++) code[c] = ~class[c];
1349 memcpy(code, class, 32);
1354 /* Various kinds of repeat */
1357 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1358 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1359 if (*errorptr != NULL) goto FAILED;
1377 if (previous == NULL)
1383 /* If the next character is '?' this is a minimizing repeat, by default,
1384 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1388 { repeat_type = greedy_non_default; ptr++; }
1389 else repeat_type = greedy_default;
1391 /* If previous was a string of characters, chop off the last one and use it
1392 as the subject of the repeat. If there was only one character, we can
1393 abolish the previous item altogether. A repeat with a zero minimum wipes
1394 out any reqchar setting, backing up to the previous value. We must also
1395 adjust the countlits value. */
1397 if (*previous == OP_CHARS)
1399 int len = previous[1];
1401 if (repeat_min == 0) *reqchar = prevreqchar;
1402 *countlits += repeat_min - 1;
1411 c = previous[len+1];
1415 op_type = 0; /* Use single-char op codes */
1416 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1419 /* If previous was a single negated character ([^a] or similar), we use
1420 one of the special opcodes, replacing it. The code is shared with single-
1421 character repeats by adding a suitable offset into repeat_type. */
1423 else if ((int)*previous == OP_NOT)
1425 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1428 goto OUTPUT_SINGLE_REPEAT;
1431 /* If previous was a character type match (\d or similar), abolish it and
1432 create a suitable repeat item. The code is shared with single-character
1433 repeats by adding a suitable offset into repeat_type. */
1435 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1437 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1441 OUTPUT_SINGLE_REPEAT:
1443 /* If the maximum is zero then the minimum must also be zero; Perl allows
1444 this case, so we do too - by simply omitting the item altogether. */
1446 if (repeat_max == 0) goto END_REPEAT;
1448 /* Combine the op_type with the repeat_type */
1450 repeat_type += op_type;
1452 /* A minimum of zero is handled either as the special case * or ?, or as
1453 an UPTO, with the maximum given. */
1455 if (repeat_min == 0)
1457 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1458 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1461 *code++ = OP_UPTO + repeat_type;
1462 *code++ = repeat_max >> 8;
1463 *code++ = (repeat_max & 255);
1467 /* The case {1,} is handled as the special case + */
1469 else if (repeat_min == 1 && repeat_max == -1)
1470 *code++ = OP_PLUS + repeat_type;
1472 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1473 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1477 if (repeat_min != 1)
1479 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1480 *code++ = repeat_min >> 8;
1481 *code++ = (repeat_min & 255);
1484 /* If the mininum is 1 and the previous item was a character string,
1485 we either have to put back the item that got cancelled if the string
1486 length was 1, or add the character back onto the end of a longer
1487 string. For a character type nothing need be done; it will just get
1488 put back naturally. Note that the final character is always going to
1491 else if (*previous == OP_CHARS)
1493 if (code == previous) code += 2; else previous[1]++;
1496 /* For a single negated character we also have to put back the
1497 item that got cancelled. */
1499 else if (*previous == OP_NOT) code++;
1501 /* If the maximum is unlimited, insert an OP_STAR. */
1506 *code++ = OP_STAR + repeat_type;
1509 /* Else insert an UPTO if the max is greater than the min. */
1511 else if (repeat_max != repeat_min)
1514 repeat_max -= repeat_min;
1515 *code++ = OP_UPTO + repeat_type;
1516 *code++ = repeat_max >> 8;
1517 *code++ = (repeat_max & 255);
1521 /* The character or character type itself comes last in all cases. */
1526 /* If previous was a character class or a back reference, we put the repeat
1527 stuff after it, but just skip the item if the repeat was {0,0}. */
1529 else if (*previous == OP_CLASS || *previous == OP_REF)
1531 if (repeat_max == 0)
1536 if (repeat_min == 0 && repeat_max == -1)
1537 *code++ = OP_CRSTAR + repeat_type;
1538 else if (repeat_min == 1 && repeat_max == -1)
1539 *code++ = OP_CRPLUS + repeat_type;
1540 else if (repeat_min == 0 && repeat_max == 1)
1541 *code++ = OP_CRQUERY + repeat_type;
1544 *code++ = OP_CRRANGE + repeat_type;
1545 *code++ = repeat_min >> 8;
1546 *code++ = repeat_min & 255;
1547 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1548 *code++ = repeat_max >> 8;
1549 *code++ = repeat_max & 255;
1553 /* If previous was a bracket group, we may have to replicate it in certain
1556 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1557 (int)*previous == OP_COND)
1561 int len = code - previous;
1562 uschar *bralink = NULL;
1564 /* If the maximum repeat count is unlimited, find the end of the bracket
1565 by scanning through from the start, and compute the offset back to it
1566 from the current code pointer. There may be an OP_OPT setting following
1567 the final KET, so we can't find the end just by going back from the code
1570 if (repeat_max == -1)
1572 register uschar *ket = previous;
1573 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1574 ketoffset = code - ket;
1577 /* The case of a zero minimum is special because of the need to stick
1578 OP_BRAZERO in front of it, and because the group appears once in the
1579 data, whereas in other cases it appears the minimum number of times. For
1580 this reason, it is simplest to treat this case separately, as otherwise
1581 the code gets far too mess. There are several special subcases when the
1584 if (repeat_min == 0)
1586 /* If we set up a required char from the bracket, we must back off
1587 to the previous value and reset the countlits value too. */
1589 if (subcountlits > 0)
1591 *reqchar = prevreqchar;
1592 *countlits -= subcountlits;
1595 /* If the maximum is also zero, we just omit the group from the output
1598 if (repeat_max == 0)
1604 /* If the maximum is 1 or unlimited, we just have to stick in the
1605 BRAZERO and do no more at this point. */
1607 if (repeat_max <= 1)
1609 memmove(previous+1, previous, len);
1611 *previous++ = OP_BRAZERO + repeat_type;
1614 /* If the maximum is greater than 1 and limited, we have to replicate
1615 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1616 The first one has to be handled carefully because it's the original
1617 copy, which has to be moved up. The remainder can be handled by code
1618 that is common with the non-zero minimum case below. We just have to
1619 adjust the value or repeat_max, since one less copy is required. */
1624 memmove(previous+4, previous, len);
1626 *previous++ = OP_BRAZERO + repeat_type;
1627 *previous++ = OP_BRA;
1629 /* We chain together the bracket offset fields that have to be
1630 filled in later when the ends of the brackets are reached. */
1632 offset = (bralink == NULL)? 0 : previous - bralink;
1634 *previous++ = offset >> 8;
1635 *previous++ = offset & 255;
1641 /* If the minimum is greater than zero, replicate the group as many
1642 times as necessary, and adjust the maximum to the number of subsequent
1643 copies that we need. */
1647 for (i = 1; i < repeat_min; i++)
1649 memcpy(code, previous, len);
1652 if (repeat_max > 0) repeat_max -= repeat_min;
1655 /* This code is common to both the zero and non-zero minimum cases. If
1656 the maximum is limited, it replicates the group in a nested fashion,
1657 remembering the bracket starts on a stack. In the case of a zero minimum,
1658 the first one was set up above. In all cases the repeat_max now specifies
1659 the number of additional copies needed. */
1661 if (repeat_max >= 0)
1663 for (i = repeat_max - 1; i >= 0; i--)
1665 *code++ = OP_BRAZERO + repeat_type;
1667 /* All but the final copy start a new nesting, maintaining the
1668 chain of brackets outstanding. */
1674 offset = (bralink == NULL)? 0 : code - bralink;
1676 *code++ = offset >> 8;
1677 *code++ = offset & 255;
1680 memcpy(code, previous, len);
1684 /* Now chain through the pending brackets, and fill in their length
1685 fields (which are holding the chain links pro tem). */
1687 while (bralink != NULL)
1690 int offset = code - bralink + 1;
1691 uschar *bra = code - offset;
1692 oldlinkoffset = (bra[1] << 8) + bra[2];
1693 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1695 *code++ = bra[1] = offset >> 8;
1696 *code++ = bra[2] = (offset & 255);
1700 /* If the maximum is unlimited, set a repeater in the final copy. We
1701 can't just offset backwards from the current code point, because we
1702 don't know if there's been an options resetting after the ket. The
1703 correct offset was computed above. */
1705 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1708 /* Else there's some kind of shambles */
1716 /* In all case we no longer have a previous item. */
1723 /* Start of nested bracket sub-expression, or comment or lookahead or
1724 lookbehind or option setting or condition. First deal with special things
1725 that can come after a bracket; all are introduced by ?, and the appearance
1726 of any of them means that this is not a referencing group. They were
1727 checked for validity in the first pass over the string, so we don't have to
1728 check for syntax errors here. */
1731 newoptions = options;
1734 if (*(++ptr) == '?')
1741 case '#': /* Comment; skip to ket */
1743 while (*ptr != ')') ptr++;
1746 case ':': /* Non-extracting bracket */
1752 bravalue = OP_COND; /* Conditional group */
1753 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1755 condref = *ptr - '0';
1756 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1767 case '=': /* Positive lookahead */
1768 bravalue = OP_ASSERT;
1772 case '!': /* Negative lookahead */
1773 bravalue = OP_ASSERT_NOT;
1777 case '<': /* Lookbehinds */
1780 case '=': /* Positive lookbehind */
1781 bravalue = OP_ASSERTBACK;
1785 case '!': /* Negative lookbehind */
1786 bravalue = OP_ASSERTBACK_NOT;
1790 default: /* Syntax error */
1796 case '>': /* One-time brackets */
1801 case 'R': /* Pattern recursion */
1802 *code++ = OP_RECURSE;
1806 default: /* Option setting */
1810 while (*ptr != ')' && *ptr != ':')
1814 case '-': optset = &unset; break;
1816 case 'i': *optset |= PCRE_CASELESS; break;
1817 case 'm': *optset |= PCRE_MULTILINE; break;
1818 case 's': *optset |= PCRE_DOTALL; break;
1819 case 'x': *optset |= PCRE_EXTENDED; break;
1820 case 'U': *optset |= PCRE_UNGREEDY; break;
1821 case 'X': *optset |= PCRE_EXTRA; break;
1829 /* Set up the changed option bits, but don't change anything yet. */
1831 newoptions = (options | set) & (~unset);
1833 /* If the options ended with ')' this is not the start of a nested
1834 group with option changes, so the options change at this level. At top
1835 level there is nothing else to be done (the options will in fact have
1836 been set from the start of compiling as a result of the first pass) but
1837 at an inner level we must compile code to change the ims options if
1838 necessary, and pass the new setting back so that it can be put at the
1839 start of any following branches, and when this group ends, a resetting
1840 item can be compiled. */
1844 if ((options & PCRE_INGROUP) != 0 &&
1845 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1848 *code++ = *optchanged = newoptions & PCRE_IMS;
1850 options = newoptions; /* Change options at this level */
1851 previous = NULL; /* This item can't be repeated */
1852 continue; /* It is complete */
1855 /* If the options ended with ':' we are heading into a nested group
1856 with possible change of options. Such groups are non-capturing and are
1857 not assertions of any kind. All we need to do is skip over the ':';
1858 the newoptions value is handled below. */
1865 /* Else we have a referencing group; adjust the opcode. */
1869 if (++(*brackets) > EXTRACT_MAX)
1874 bravalue = OP_BRA + *brackets;
1877 /* Process nested bracketed re. Assertions may not be repeated, but other
1878 kinds can be. We copy code into a non-register variable in order to be able
1879 to pass its address because some compilers complain otherwise. Pass in a
1880 new setting for the ims options if they have changed. */
1882 previous = (bravalue >= OP_ONCE)? code : NULL;
1887 options | PCRE_INGROUP, /* Set for all nested groups */
1888 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1889 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1890 brackets, /* Bracket level */
1891 &tempcode, /* Where to put code (updated) */
1892 &ptr, /* Input pointer (updated) */
1893 errorptr, /* Where to put an error message */
1894 (bravalue == OP_ASSERTBACK ||
1895 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1896 condref, /* Condition reference number */
1897 &subreqchar, /* For possible last char */
1898 &subcountlits, /* For literal count */
1899 cd)) /* Tables block */
1902 /* At the end of compiling, code is still pointing to the start of the
1903 group, while tempcode has been updated to point past the end of the group
1904 and any option resetting that may follow it. The pattern pointer (ptr)
1905 is on the bracket. */
1907 /* If this is a conditional bracket, check that there are no more than
1908 two branches in the group. */
1910 if (bravalue == OP_COND)
1917 tc += (tc[1] << 8) | tc[2];
1919 while (*tc != OP_KET);
1928 /* Handle updating of the required character. If the subpattern didn't
1929 set one, leave it as it was. Otherwise, update it for normal brackets of
1930 all kinds, forward assertions, and conditions with two branches. Don't
1931 update the literal count for forward assertions, however. If the bracket
1932 is followed by a quantifier with zero repeat, we have to back off. Hence
1933 the definition of prevreqchar and subcountlits outside the main loop so
1934 that they can be accessed for the back off. */
1936 if (subreqchar > 0 &&
1937 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1938 (bravalue == OP_COND && condcount == 2)))
1940 prevreqchar = *reqchar;
1941 *reqchar = subreqchar;
1942 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1945 /* Now update the main code pointer to the end of the group. */
1949 /* Error if hit end of pattern */
1958 /* Check \ for being a real metacharacter; if not, fall through and handle
1959 it as a data character at the start of a string. Escape items are checked
1960 for validity in the pre-compiling pass. */
1964 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1966 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1967 are arranged to be the negation of the corresponding OP_values. For the
1968 back references, the values are ESC_REF plus the reference number. Only
1969 back references and those types that consume a character may be repeated.
1970 We can test for values between ESC_b and ESC_Z for the latter; this may
1971 have to change if any new ones are ever created. */
1979 *code++ = -c - ESC_REF;
1983 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1989 /* Data character: reset and fall through */
1994 /* Handle a run of data characters until a metacharacter is encountered.
1995 The first character is guaranteed not to be whitespace or # when the
1996 extended flag is set. */
2007 if ((options & PCRE_EXTENDED) != 0)
2009 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2012 /* The space before the ; is to avoid a warning on a silly compiler
2013 on the Macintosh. */
2014 while ((c = *(++ptr)) != 0 && c != '\n') ;
2020 /* Backslash may introduce a data char or a metacharacter. Escaped items
2021 are checked for validity in the pre-compiling pass. Stop the string
2022 before a metaitem. */
2027 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2028 if (c < 0) { ptr = tempptr; break; }
2030 /* If a character is > 127 in UTF-8 mode, we have to turn it into
2031 two or more characters in the UTF-8 encoding. */
2034 if (c > 127 && (options & PCRE_UTF8) != 0)
2037 int len = ord2utf8(c, buffer);
2038 for (c = 0; c < len; c++) *code++ = buffer[c];
2045 /* Ordinary character or single-char escape */
2051 /* This "while" is the end of the "do" above. */
2053 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2055 /* Update the last character and the count of literals */
2057 prevreqchar = (length > 1)? code[-2] : *reqchar;
2058 *reqchar = code[-1];
2059 *countlits += length;
2061 /* Compute the length and set it in the data vector, and advance to
2064 previous[1] = length;
2065 if (length < MAXLIT) ptr--;
2068 } /* end of big loop */
2070 /* Control never reaches here by falling through, only by a goto for all the
2071 error states. Pass back the position in the pattern so that it can be displayed
2072 to the user for diagnosing the error. */
2082 /*************************************************
2083 * Compile sequence of alternatives *
2084 *************************************************/
2086 /* On entry, ptr is pointing past the bracket character, but on return
2087 it points to the closing bracket, or vertical bar, or end of string.
2088 The code variable is pointing at the byte into which the BRA operator has been
2089 stored. If the ims options are changed at the start (for a (?ims: group) or
2090 during any branch, we need to insert an OP_OPT item at the start of every
2091 following branch to ensure they get set correctly at run time, and also pass
2092 the new options into every subsequent branch compile.
2095 options the option bits
2096 optchanged new ims options to set as if (?ims) were at the start, or -1
2098 brackets -> int containing the number of extracting brackets used
2099 codeptr -> the address of the current code pointer
2100 ptrptr -> the address of the current pattern pointer
2101 errorptr -> pointer to error message
2102 lookbehind TRUE if this is a lookbehind assertion
2103 condref >= 0 for OPT_CREF setting at start of conditional group
2104 reqchar -> place to put the last required character, or a negative number
2105 countlits -> place to put the shortest literal count of any branch
2106 cd points to the data block with tables pointers
2108 Returns: TRUE on success
2112 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2113 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
2114 int *reqchar, int *countlits, compile_data *cd)
2116 const uschar *ptr = *ptrptr;
2117 uschar *code = *codeptr;
2118 uschar *last_branch = code;
2119 uschar *start_bracket = code;
2120 uschar *reverse_count = NULL;
2121 int oldoptions = options & PCRE_IMS;
2122 int branchreqchar, branchcountlits;
2125 *countlits = INT_MAX;
2128 /* At the start of a reference-based conditional group, insert the reference
2129 number as an OP_CREF item. */
2137 /* Loop for each alternative branch */
2143 /* Handle change of options */
2145 if (optchanged >= 0)
2148 *code++ = optchanged;
2149 options = (options & ~PCRE_IMS) | optchanged;
2152 /* Set up dummy OP_REVERSE if lookbehind assertion */
2156 *code++ = OP_REVERSE;
2157 reverse_count = code;
2162 /* Now compile the branch */
2164 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2165 &branchreqchar, &branchcountlits, cd))
2171 /* Fill in the length of the last branch */
2173 length = code - last_branch;
2174 last_branch[1] = length >> 8;
2175 last_branch[2] = length & 255;
2177 /* Save the last required character if all branches have the same; a current
2178 value of -1 means unset, while -2 means "previous branch had no last required
2183 if (branchreqchar >= 0)
2185 if (*reqchar == -1) *reqchar = branchreqchar;
2186 else if (*reqchar != branchreqchar) *reqchar = -2;
2191 /* Keep the shortest literal count */
2193 if (branchcountlits < *countlits) *countlits = branchcountlits;
2194 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2196 /* If lookbehind, check that this branch matches a fixed-length string,
2197 and put the length into the OP_REVERSE item. Temporarily mark the end of
2198 the branch with OP_END. */
2203 length = find_fixedlength(last_branch, options);
2204 DPRINTF(("fixed length = %d\n", length));
2211 reverse_count[0] = (length >> 8);
2212 reverse_count[1] = length & 255;
2215 /* Reached end of expression, either ')' or end of pattern. Insert a
2216 terminating ket and the length of the whole bracketed item, and return,
2217 leaving the pointer at the terminating char. If any of the ims options
2218 were changed inside the group, compile a resetting op-code following. */
2222 length = code - start_bracket;
2224 *code++ = length >> 8;
2225 *code++ = length & 255;
2226 if (optchanged >= 0)
2229 *code++ = oldoptions;
2236 /* Another branch follows; insert an "or" node and advance the pointer. */
2243 /* Control never reaches here */
2249 /*************************************************
2250 * Find first significant op code *
2251 *************************************************/
2253 /* This is called by several functions that scan a compiled expression looking
2254 for a fixed first character, or an anchoring op code etc. It skips over things
2255 that do not influence this. For one application, a change of caseless option is
2259 code pointer to the start of the group
2260 options pointer to external options
2261 optbit the option bit whose changing is significant, or
2263 optstop TRUE to return on option change, otherwise change the options
2266 Returns: pointer to the first significant opcode
2269 static const uschar*
2270 first_significant_code(const uschar *code, int *options, int optbit,
2278 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2280 if (optstop) return code;
2281 *options = (int)code[1];
2290 case OP_WORD_BOUNDARY:
2291 case OP_NOT_WORD_BOUNDARY:
2297 case OP_ASSERTBACK_NOT:
2298 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2306 /* Control never reaches here */
2312 /*************************************************
2313 * Check for anchored expression *
2314 *************************************************/
2316 /* Try to find out if this is an anchored regular expression. Consider each
2317 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2318 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2319 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2320 counts, since OP_CIRC can match in the middle.
2322 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2323 because that will try the rest of the pattern at all possible matching points,
2324 so there is no point trying them again.
2327 code points to start of expression (the bracket)
2328 options points to the options setting
2330 Returns: TRUE or FALSE
2334 is_anchored(register const uschar *code, int *options)
2337 const uschar *scode = first_significant_code(code + 3, options,
2338 PCRE_MULTILINE, FALSE);
2339 register int op = *scode;
2340 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2341 { if (!is_anchored(scode, options)) return FALSE; }
2342 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2343 (*options & PCRE_DOTALL) != 0)
2344 { if (scode[1] != OP_ANY) return FALSE; }
2345 else if (op != OP_SOD &&
2346 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2348 code += (code[1] << 8) + code[2];
2350 while (*code == OP_ALT);
2356 /*************************************************
2357 * Check for starting with ^ or .* *
2358 *************************************************/
2360 /* This is called to find out if every branch starts with ^ or .* so that
2361 "first char" processing can be done to speed things up in multiline
2362 matching and for non-DOTALL patterns that start with .* (which must start at
2363 the beginning or after \n).
2365 Argument: points to start of expression (the bracket)
2366 Returns: TRUE or FALSE
2370 is_startline(const uschar *code)
2373 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2374 register int op = *scode;
2375 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2376 { if (!is_startline(scode)) return FALSE; }
2377 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2378 { if (scode[1] != OP_ANY) return FALSE; }
2379 else if (op != OP_CIRC) return FALSE;
2380 code += (code[1] << 8) + code[2];
2382 while (*code == OP_ALT);
2388 /*************************************************
2389 * Check for fixed first char *
2390 *************************************************/
2392 /* Try to find out if there is a fixed first character. This is called for
2393 unanchored expressions, as it speeds up their processing quite considerably.
2394 Consider each alternative branch. If they all start with the same char, or with
2395 a bracket all of whose alternatives start with the same char (recurse ad lib),
2396 then we return that char, otherwise -1.
2399 code points to start of expression (the bracket)
2400 options pointer to the options (used to check casing changes)
2402 Returns: -1 or the fixed first char
2406 find_firstchar(const uschar *code, int *options)
2408 register int c = -1;
2411 const uschar *scode = first_significant_code(code + 3, options,
2412 PCRE_CASELESS, TRUE);
2413 register int op = *scode;
2415 if (op >= OP_BRA) op = OP_BRA;
2426 if ((d = find_firstchar(scode, options)) < 0) return -1;
2427 if (c < 0) c = d; else if (c != d) return -1;
2430 case OP_EXACT: /* Fall through */
2433 case OP_CHARS: /* Fall through */
2438 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2442 code += (code[1] << 8) + code[2];
2444 while (*code == OP_ALT);
2452 /*************************************************
2453 * Compile a Regular Expression *
2454 *************************************************/
2456 /* This function takes a string and returns a pointer to a block of store
2457 holding a compiled version of the expression.
2460 pattern the regular expression
2461 options various option bits
2462 errorptr pointer to pointer to error text
2463 erroroffset ptr offset in pattern where error was detected
2464 tables pointer to character tables or NULL
2466 Returns: pointer to compiled data block, or NULL on error,
2467 with errorptr and erroroffset set
2471 pcre_compile(const char *pattern, int options, const char **errorptr,
2472 int *erroroffset, const unsigned char *tables)
2475 int length = 3; /* For initial BRA plus length */
2477 int c, reqchar, countlits;
2479 int top_backref = 0;
2480 int branch_extra = 0;
2481 int branch_newextra;
2482 unsigned int brastackptr = 0;
2486 compile_data compile_block;
2487 int brastack[BRASTACK_SIZE];
2488 uschar bralenstack[BRASTACK_SIZE];
2489 const size_t pattern_length = strlen(pattern);
2492 uschar *code_base, *code_end;
2495 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2497 #ifndef SUPPORT_UTF8
2498 if ((options & PCRE_UTF8) != 0)
2505 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2506 can do is just return NULL. */
2508 if (errorptr == NULL) return NULL;
2511 /* However, we can give a message for this error */
2513 if (erroroffset == NULL)
2520 if ((options & ~PUBLIC_OPTIONS) != 0)
2526 /* Set up pointers to the individual character tables */
2528 if (tables == NULL) tables = pcre_default_tables;
2529 compile_block.lcc = tables + lcc_offset;
2530 compile_block.fcc = tables + fcc_offset;
2531 compile_block.cbits = tables + cbits_offset;
2532 compile_block.ctypes = tables + ctypes_offset;
2534 /* Reflect pattern for debugging output */
2536 DPRINTF(("------------------------------------------------------------------\n"));
2537 DPRINTF(("%s\n", pattern));
2539 /* The first thing to do is to make a pass over the pattern to compute the
2540 amount of store required to hold the compiled code. This does not have to be
2541 perfect as long as errors are overestimates. At the same time we can detect any
2542 internal flag settings. Make an attempt to correct for any counted white space
2543 if an "extended" flag setting appears late in the pattern. We can't be so
2544 clever for #-comments. */
2546 ptr = (const uschar *)(pattern - 1);
2547 while ((c = *(++ptr)) != 0)
2550 int class_charcount;
2552 if ((options & PCRE_EXTENDED) != 0)
2554 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2557 /* The space before the ; is to avoid a warning on a silly compiler
2558 on the Macintosh. */
2559 while ((c = *(++ptr)) != 0 && c != '\n') ;
2566 /* A backslashed item may be an escaped "normal" character or a
2567 character type. For a "normal" character, put the pointers and
2568 character back so that tests for whitespace etc. in the input
2569 are done correctly. */
2573 const uschar *save_ptr = ptr;
2574 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2575 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2585 /* A back reference needs an additional char, plus either one or 5
2586 bytes for a repeat. We also need to keep the value of the highest
2591 int refnum = -c - ESC_REF;
2592 if (refnum > top_backref) top_backref = refnum;
2593 length++; /* For single back reference */
2594 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2596 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2597 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2598 if ((min == 0 && (max == 1 || max == -1)) ||
2599 (min == 1 && max == -1))
2602 if (ptr[1] == '?') ptr++;
2610 case '*': /* These repeats won't be after brackets; */
2611 case '+': /* those are handled separately */
2616 /* This covers the cases of repeats after a single char, metachar, class,
2617 or back reference. */
2620 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2621 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2622 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2623 if ((min == 0 && (max == 1 || max == -1)) ||
2624 (min == 1 && max == -1))
2628 length--; /* Uncount the original char or metachar */
2629 if (min == 1) length++; else if (min > 0) length += 4;
2630 if (max > 0) length += 4; else length += 2;
2632 if (ptr[1] == '?') ptr++;
2635 /* An alternation contains an offset to the next branch or ket. If any ims
2636 options changed in the previous branch(es), and/or if we are in a
2637 lookbehind assertion, extra space will be needed at the start of the
2638 branch. This is handled by branch_extra. */
2641 length += 3 + branch_extra;
2644 /* A character class uses 33 characters. Don't worry about character types
2645 that aren't allowed in classes - they'll get picked up during the compile.
2646 A character class that contains only one character uses 2 or 3 bytes,
2647 depending on whether it is negated or not. Notice this where we can. */
2650 class_charcount = 0;
2651 if (*(++ptr) == '^') ptr++;
2656 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2658 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2659 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2661 else class_charcount++;
2664 while (*ptr != 0 && *ptr != ']');
2666 /* Repeats for negated single chars are handled by the general code */
2668 if (class_charcount == 1) length += 3; else
2672 /* A repeat needs either 1 or 5 bytes. */
2674 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2676 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2677 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2678 if ((min == 0 && (max == 1 || max == -1)) ||
2679 (min == 1 && max == -1))
2682 if (ptr[1] == '?') ptr++;
2687 /* Brackets may be genuine groups or special things */
2690 branch_newextra = 0;
2692 /* Handle special forms of bracket, which all start (? */
2701 /* Skip over comments entirely */
2704 while (*ptr != 0 && *ptr != ')') ptr++;
2708 goto PCRE_ERROR_RETURN;
2712 /* Non-referencing groups and lookaheads just move the pointer on, and
2713 then behave like a non-special bracket, except that they don't increment
2714 the count of extracting brackets. Ditto for the "once only" bracket,
2715 which is in Perl from version 5.005. */
2724 /* A recursive call to the regex is an extension, to provide the
2725 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2731 goto PCRE_ERROR_RETURN;
2737 /* Lookbehinds are in Perl from version 5.005 */
2740 if (ptr[3] == '=' || ptr[3] == '!')
2743 branch_newextra = 3;
2744 length += 3; /* For the first branch */
2748 goto PCRE_ERROR_RETURN;
2750 /* Conditionals are in Perl from version 5.005. The bracket must either
2751 be followed by a number (for bracket reference) or by an assertion
2755 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2759 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2763 goto PCRE_ERROR_RETURN;
2766 else /* An assertion must follow */
2768 ptr++; /* Can treat like ':' as far as spacing is concerned */
2769 if (ptr[2] != '?' ||
2770 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2772 ptr += 2; /* To get right offset in message */
2774 goto PCRE_ERROR_RETURN;
2779 /* Else loop checking valid options until ) is met. Anything else is an
2780 error. If we are without any brackets, i.e. at top level, the settings
2781 act as if specified in the options, so massage the options immediately.
2782 This is for backward compatibility with Perl 5.004. */
2795 *optset |= PCRE_CASELESS;
2799 *optset |= PCRE_MULTILINE;
2803 *optset |= PCRE_DOTALL;
2807 *optset |= PCRE_EXTENDED;
2811 *optset |= PCRE_EXTRA;
2815 *optset |= PCRE_UNGREEDY;
2822 /* A termination by ')' indicates an options-setting-only item;
2823 this is global at top level; otherwise nothing is done here and
2824 it is handled during the compiling process on a per-bracket-group
2828 if (brastackptr == 0)
2830 options = (options | set) & (~unset);
2831 set = unset = 0; /* To save length */
2835 /* A termination by ':' indicates the start of a nested group with
2836 the given options set. This is again handled at compile time, but
2837 we must allow for compiled space if any of the ims options are
2838 set. We also have to allow for resetting space at the end of
2839 the group, which is why 4 is added to the length and not just 2.
2840 If there are several changes of options within the same group, this
2841 will lead to an over-estimate on the length, but this shouldn't
2842 matter very much. We also have to allow for resetting options at
2843 the start of any alternations, which we do by setting
2844 branch_newextra to 2. Finally, we record whether the case-dependent
2845 flag ever changes within the regex. This is used by the "required
2849 if (((set|unset) & PCRE_IMS) != 0)
2852 branch_newextra = 2;
2853 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2857 /* Unrecognized option character */
2861 goto PCRE_ERROR_RETURN;
2865 /* If we hit a closing bracket, that's it - this is a freestanding
2866 option-setting. We need to ensure that branch_extra is updated if
2867 necessary. The only values branch_newextra can have here are 0 or 2.
2868 If the value is 2, then branch_extra must either be 2 or 5, depending
2869 on whether this is a lookbehind group or not. */
2874 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2875 branch_extra += branch_newextra;
2879 /* If options were terminated by ':' control comes here. Fall through
2880 to handle the group below. */
2884 /* Extracting brackets must be counted so we can process escapes in a
2889 /* Non-special forms of bracket. Save length for computing whole length
2890 at end if there's a repeat that requires duplication of the group. Also
2891 save the current value of branch_extra, and start the new group with
2892 the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2893 for a lookbehind assertion. */
2895 if (brastackptr >= sizeof(brastack)/sizeof(int))
2898 goto PCRE_ERROR_RETURN;
2901 bralenstack[brastackptr] = branch_extra;
2902 branch_extra = branch_newextra;
2904 brastack[brastackptr++] = length;
2908 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2909 have to replicate this bracket up to that many times. If brastackptr is
2910 0 this is an unmatched bracket which will generate an error, but take care
2911 not to try to access brastack[-1] when computing the length and restoring
2912 the branch_extra value. */
2921 if (brastackptr > 0)
2923 duplength = length - brastack[--brastackptr];
2924 branch_extra = bralenstack[brastackptr];
2928 /* Leave ptr at the final char; for read_repeat_counts this happens
2929 automatically; for the others we need an increment. */
2931 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2933 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2935 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2937 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2938 else if (c == '+') { maxval = -1; ptr++; }
2939 else if (c == '?') { minval = 0; ptr++; }
2941 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2942 group, and if the maximum is greater than zero, we have to replicate
2943 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2944 bracket set - hence the 7. */
2949 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2952 /* When the minimum is greater than zero, 1 we have to replicate up to
2953 minval-1 times, with no additions required in the copies. Then, if
2954 there is a limited maximum we have to replicate up to maxval-1 times
2955 allowing for a BRAZERO item before each optional copy and nesting
2956 brackets for all but one of the optional copies. */
2960 length += (minval - 1) * duplength;
2961 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2962 length += (maxval - minval) * (duplength + 7) - 6;
2967 /* Non-special character. For a run of such characters the length required
2968 is the number of characters + 2, except that the maximum run length is 255.
2969 We won't get a skipped space or a non-data escape or the start of a #
2970 comment as the first character, so the length can't be zero. */
2978 if ((options & PCRE_EXTENDED) != 0)
2980 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2983 /* The space before the ; is to avoid a warning on a silly compiler
2984 on the Macintosh. */
2985 while ((c = *(++ptr)) != 0 && c != '\n') ;
2990 /* Backslash may introduce a data char or a metacharacter; stop the
2991 string before the latter. */
2995 const uschar *saveptr = ptr;
2996 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2998 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2999 if (c < 0) { ptr = saveptr; break; }
3002 if (c > 127 && (options & PCRE_UTF8) != 0)
3005 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3006 if (c <= utf8_table1[i]) break;
3012 /* Ordinary character or single-char escape */
3016 if (ptr > pattern + pattern_length)
3018 *errorptr = "internal error";
3019 goto PCRE_ERROR_RETURN;
3021 if (ptr[1] >= tables_length - ctypes_offset)
3024 goto PCRE_ERROR_RETURN;
3028 /* This "while" is the end of the "do" above. */
3030 while (runlength < MAXLIT &&
3031 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3034 length += runlength;
3039 length += 4; /* For final KET and END */
3047 /* Compute the size of data block needed and get it, either from malloc or
3048 externally provided function. We specify "code[0]" in the offsetof() expression
3049 rather than just "code", because it has been reported that one broken compiler
3050 fails on "code" because it is also an independent variable. It should make no
3051 difference to the value of the offsetof(). */
3053 size = length + offsetof(real_pcre, code[0]);
3054 re = (real_pcre *)(pcre_malloc)(size);
3062 /* Put in the magic number, and save the size, options, and table pointer */
3064 re->magic_number = MAGIC_NUMBER;
3066 re->options = options;
3067 re->tables = tables;
3069 /* Set up a starting, non-extracting bracket, then compile the expression. On
3070 error, *errorptr will be set non-NULL, so we don't need to look at the result
3071 of the function here. */
3073 ptr = (const uschar *)pattern;
3077 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
3078 &reqchar, &countlits, &compile_block);
3079 re->top_bracket = bracount;
3080 re->top_backref = top_backref;
3082 /* If not reached end of pattern on success, there's an excess bracket. */
3084 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3086 /* Fill in the terminating state and check for disastrous overflow, but
3087 if debugging, leave the test till after things are printed out. */
3092 if (code - re->code > length) *errorptr = ERR23;
3095 /* Give an error if there's back reference to a non-existent capturing
3098 if (top_backref > re->top_bracket) *errorptr = ERR15;
3100 /* Failed to compile */
3102 if (*errorptr != NULL)
3106 *erroroffset = ptr - (const uschar *)pattern;
3110 /* If the anchored option was not passed, set flag if we can determine that the
3111 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3112 starting with .* when DOTALL is set).
3114 Otherwise, see if we can determine what the first character has to be, because
3115 that speeds up unanchored matches no end. If not, see if we can set the
3116 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3117 start with ^. and also when all branches start with .* for non-DOTALL matches.
3120 if ((options & PCRE_ANCHORED) == 0)
3122 int temp_options = options;
3123 if (is_anchored(re->code, &temp_options))
3124 re->options |= PCRE_ANCHORED;
3127 int ch = find_firstchar(re->code, &temp_options);
3130 re->first_char = ch;
3131 re->options |= PCRE_FIRSTSET;
3133 else if (is_startline(re->code))
3134 re->options |= PCRE_STARTLINE;
3138 /* Save the last required character if there are at least two literal
3139 characters on all paths, or if there is no first character setting. */
3141 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3143 re->req_char = reqchar;
3144 re->options |= PCRE_REQCHSET;
3147 /* Print out the compiled data for debugging */
3151 printf("Length = %d top_bracket = %d top_backref = %d\n",
3152 length, re->top_bracket, re->top_backref);
3154 if (re->options != 0)
3156 printf("%s%s%s%s%s%s%s%s%s\n",
3157 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3158 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3159 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3160 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3161 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3162 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3163 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3164 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3165 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3168 if ((re->options & PCRE_FIRSTSET) != 0)
3170 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3171 else printf("First char = \\x%02x\n", re->first_char);
3174 if ((re->options & PCRE_REQCHSET) != 0)
3176 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3177 else printf("Req char = \\x%02x\n", re->req_char);
3181 code_base = code = re->code;
3183 while (code < code_end)
3187 printf("%3d ", code - code_base);
3189 if (*code >= OP_BRA)
3191 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3198 printf(" %.2x %s", code[1], OP_names[*code]);
3203 printf("%3d Cond", (code[1] << 8) + code[2]);
3208 printf(" %.2d %s", code[1], OP_names[*code]);
3213 charlength = *(++code);
3214 printf("%3d ", charlength);
3215 while (charlength-- > 0)
3216 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3226 case OP_ASSERTBACK_NOT:
3228 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3233 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3244 case OP_TYPEMINSTAR:
3246 case OP_TYPEMINPLUS:
3248 case OP_TYPEMINQUERY:
3249 if (*code >= OP_TYPESTAR)
3250 printf(" %s", OP_names[code[1]]);
3251 else if (isprint(c = code[1])) printf(" %c", c);
3252 else printf(" \\x%02x", c);
3253 printf("%s", OP_names[*code++]);
3259 if (isprint(c = code[3])) printf(" %c{", c);
3260 else printf(" \\x%02x{", c);
3261 if (*code != OP_EXACT) printf("0,");
3262 printf("%d}", (code[1] << 8) + code[2]);
3263 if (*code == OP_MINUPTO) printf("?");
3269 case OP_TYPEMINUPTO:
3270 printf(" %s{", OP_names[code[3]]);
3271 if (*code != OP_TYPEEXACT) printf(",");
3272 printf("%d}", (code[1] << 8) + code[2]);
3273 if (*code == OP_TYPEMINUPTO) printf("?");
3278 if (isprint(c = *(++code))) printf(" [^%c]", c);
3279 else printf(" [^\\x%02x]", c);
3287 case OP_NOTMINQUERY:
3288 if (isprint(c = code[1])) printf(" [^%c]", c);
3289 else printf(" [^\\x%02x]", c);
3290 printf("%s", OP_names[*code++]);
3296 if (isprint(c = code[3])) printf(" [^%c]{", c);
3297 else printf(" [^\\x%02x]{", c);
3298 if (*code != OP_NOTEXACT) printf(",");
3299 printf("%d}", (code[1] << 8) + code[2]);
3300 if (*code == OP_NOTMINUPTO) printf("?");
3305 printf(" \\%d", *(++code));
3307 goto CLASS_REF_REPEAT;
3315 for (i = 0; i < 256; i++)
3317 if ((code[i/8] & (1 << (i&7))) != 0)
3320 for (j = i+1; j < 256; j++)
3321 if ((code[j/8] & (1 << (j&7))) == 0) break;
3322 if (i == '-' || i == ']') printf("\\");
3323 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3327 if (j == '-' || j == ']') printf("\\");
3328 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3346 printf("%s", OP_names[*code]);
3351 min = (code[1] << 8) + code[2];
3352 max = (code[3] << 8) + code[4];
3353 if (max == 0) printf("{%d,}", min);
3354 else printf("{%d,%d}", min, max);
3355 if (*code == OP_CRMINRANGE) printf("?");
3365 /* Anything else is just a one-node item */
3368 printf(" %s", OP_names[*code]);
3375 printf("------------------------------------------------------------------\n");
3377 /* This check is done here in the debugging case so that the code that
3378 was compiled can be seen. */
3380 if (code - re->code > length)
3384 *erroroffset = ptr - (uschar *)pattern;
3394 /*************************************************
3395 * Match a back-reference *
3396 *************************************************/
3398 /* If a back reference hasn't been set, the length that is passed is greater
3399 than the number of characters left in the string, so the match fails.
3402 offset index into the offset vector
3403 eptr points into the subject
3404 length length to be matched
3405 md points to match data block
3408 Returns: TRUE if matched
3412 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3413 unsigned long int ims)
3415 const uschar *p = md->start_subject + md->offset_vector[offset];
3418 if (eptr >= md->end_subject)
3419 printf("matching subject <null>");
3422 printf("matching subject ");
3423 pchars(eptr, length, TRUE, md);
3425 printf(" against backref ");
3426 pchars(p, length, FALSE, md);
3430 /* Always fail if not enough characters left */
3432 if (length > md->end_subject - eptr) return FALSE;
3434 /* Separate the caselesss case for speed */
3436 if ((ims & PCRE_CASELESS) != 0)
3438 while (length-- > 0)
3439 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3442 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3449 /*************************************************
3450 * Match from current position *
3451 *************************************************/
3453 /* On entry ecode points to the first opcode, and eptr to the first character
3454 in the subject string, while eptrb holds the value of eptr at the start of the
3455 last bracketed group - used for breaking infinite loops matching zero-length
3459 eptr pointer in subject
3460 ecode position in code
3461 offset_top current top pointer
3462 md pointer to "static" info for the match
3463 ims current /i, /m, and /s options
3464 eptrb pointer to chain of blocks containing eptr at start of
3465 brackets - for testing for empty matches
3467 match_condassert - this is an assertion condition
3468 match_isgroup - this is the start of a bracketed group
3470 Returns: TRUE if matched
3474 match(register const uschar *eptr, register const uschar *ecode,
3475 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3478 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3481 /* At the start of a bracketed group, add the current subject pointer to the
3482 stack of such pointers, to be re-instated at the end of the group when we hit
3483 the closing ket. When match() is called in other circumstances, we don't add to
3486 if ((flags & match_isgroup) != 0)
3488 newptrb.prev = eptrb;
3489 newptrb.saved_eptr = eptr;
3493 /* Now start processing the operations. */
3497 int op = (int)*ecode;
3498 int min, max, ctype;
3501 BOOL minimize = FALSE;
3503 /* Opening capturing bracket. If there is space in the offset vector, save
3504 the current subject position in the working slot at the top of the vector. We
3505 mustn't change the current values of the data slot, because they may be set
3506 from a previous iteration of this group, and be referred to by a reference
3509 If the bracket fails to match, we need to restore this value and also the
3510 values of the final offsets, in case they were set by a previous iteration of
3513 If there isn't enough space in the offset vector, treat this as if it were a
3514 non-capturing bracket. Don't worry about setting the flag for the error case
3515 here; that is handled in the code for KET. */
3519 int number = op - OP_BRA;
3520 int offset = number << 1;
3523 printf("start bracket %d subject=", number);
3524 pchars(eptr, 16, TRUE, md);
3528 if (offset < md->offset_max)
3530 int save_offset1 = md->offset_vector[offset];
3531 int save_offset2 = md->offset_vector[offset+1];
3532 int save_offset3 = md->offset_vector[md->offset_end - number];
3534 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3535 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3539 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3541 ecode += (ecode[1] << 8) + ecode[2];
3543 while (*ecode == OP_ALT);
3545 DPRINTF(("bracket %d failed\n", number));
3547 md->offset_vector[offset] = save_offset1;
3548 md->offset_vector[offset+1] = save_offset2;
3549 md->offset_vector[md->offset_end - number] = save_offset3;
3553 /* Insufficient room for saving captured contents */
3558 /* Other types of node can be handled by a switch */
3562 case OP_BRA: /* Non-capturing bracket: optimized */
3563 DPRINTF(("start bracket 0\n"));
3566 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3568 ecode += (ecode[1] << 8) + ecode[2];
3570 while (*ecode == OP_ALT);
3571 DPRINTF(("bracket 0 failed\n"));
3574 /* Conditional group: compilation checked that there are no more than
3575 two branches. If the condition is false, skipping the first branch takes us
3576 past the end if there is only one branch, but that's OK because that is
3577 exactly what going to the ket would do. */
3580 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3582 int offset = ecode[4] << 1; /* Doubled reference number */
3584 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3585 5 : 3 + (ecode[1] << 8) + ecode[2]),
3586 offset_top, md, ims, eptrb, match_isgroup);
3589 /* The condition is an assertion. Call match() to evaluate it - setting
3590 the final argument TRUE causes it to stop at the end of an assertion. */
3594 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3595 match_condassert | match_isgroup))
3597 ecode += 3 + (ecode[4] << 8) + ecode[5];
3598 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3600 else ecode += (ecode[1] << 8) + ecode[2];
3601 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3603 /* Control never reaches here */
3605 /* Skip over conditional reference data if encountered (should not be) */
3611 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3612 an empty string - recursion will then try other alternatives, if any. */
3615 if (md->notempty && eptr == md->start_match) return FALSE;
3616 md->end_match_ptr = eptr; /* Record where we ended */
3617 md->end_offset_top = offset_top; /* and how many extracts were taken */
3620 /* Change option settings */
3625 DPRINTF(("ims set to %02lx\n", ims));
3628 /* Assertion brackets. Check the alternative branches in turn - the
3629 matching won't pass the KET for an assertion. If any one branch matches,
3630 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3631 start of each branch to move the current point backwards, so the code at
3632 this level is identical to the lookahead case. */
3638 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3639 ecode += (ecode[1] << 8) + ecode[2];
3641 while (*ecode == OP_ALT);
3642 if (*ecode == OP_KET) return FALSE;
3644 /* If checking an assertion for a condition, return TRUE. */
3646 if ((flags & match_condassert) != 0) return TRUE;
3648 /* Continue from after the assertion, updating the offsets high water
3649 mark, since extracts may have been taken during the assertion. */
3651 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3653 offset_top = md->end_offset_top;
3656 /* Negative assertion: all branches must fail to match */
3659 case OP_ASSERTBACK_NOT:
3662 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3664 ecode += (ecode[1] << 8) + ecode[2];
3666 while (*ecode == OP_ALT);
3668 if ((flags & match_condassert) != 0) return TRUE;
3673 /* Move the subject pointer back. This occurs only at the start of
3674 each branch of a lookbehind assertion. If we are too close to the start to
3675 move back, this match function fails. When working with UTF-8 we move
3676 back a number of characters, not bytes. */
3680 c = (ecode[1] << 8) + ecode[2];
3681 for (i = 0; i < c; i++)
3687 eptr -= (ecode[1] << 8) + ecode[2];
3690 if (eptr < md->start_subject) return FALSE;
3694 /* Recursion matches the current regex, nested. If there are any capturing
3695 brackets started but not finished, we have to save their starting points
3696 and reinstate them after the recursion. However, we don't know how many
3697 such there are (offset_top records the completed total) so we just have
3698 to save all the potential data. There may be up to 99 such values, which
3699 is a bit large to put on the stack, but using malloc for small numbers
3700 seems expensive. As a compromise, the stack is used when there are fewer
3701 than 16 values to store; otherwise malloc is used. A problem is what to do
3702 if the malloc fails ... there is no way of returning to the top level with
3703 an error. Save the top 15 values on the stack, and accept that the rest
3714 if (c < 16) save = stacksave; else
3716 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3724 for (i = 1; i <= c; i++)
3725 save[i] = md->offset_vector[md->offset_end - i];
3726 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3728 for (i = 1; i <= c; i++)
3729 md->offset_vector[md->offset_end - i] = save[i];
3730 if (save != stacksave) (pcre_free)(save);
3731 if (!rc) return FALSE;
3733 /* In case the recursion has set more capturing values, save the final
3734 number, then move along the subject till after the recursive match,
3735 and advance one byte in the pattern code. */
3737 offset_top = md->end_offset_top;
3738 eptr = md->end_match_ptr;
3743 /* "Once" brackets are like assertion brackets except that after a match,
3744 the point in the subject string is not moved back. Thus there can never be
3745 a move back into the brackets. Check the alternative branches in turn - the
3746 matching won't pass the KET for this kind of subpattern. If any one branch
3747 matches, we carry on as at the end of a normal bracket, leaving the subject
3752 const uschar *prev = ecode;
3753 const uschar *saved_eptr = eptr;
3757 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3759 ecode += (ecode[1] << 8) + ecode[2];
3761 while (*ecode == OP_ALT);
3763 /* If hit the end of the group (which could be repeated), fail */
3765 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3767 /* Continue as from after the assertion, updating the offsets high water
3768 mark, since extracts may have been taken. */
3770 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3772 offset_top = md->end_offset_top;
3773 eptr = md->end_match_ptr;
3775 /* For a non-repeating ket, just continue at this level. This also
3776 happens for a repeating ket if no characters were matched in the group.
3777 This is the forcible breaking of infinite loops as implemented in Perl
3778 5.005. If there is an options reset, it will get obeyed in the normal
3779 course of events. */
3781 if (*ecode == OP_KET || eptr == saved_eptr)
3787 /* The repeating kets try the rest of the pattern or restart from the
3788 preceding bracket, in the appropriate order. We need to reset any options
3789 that changed within the bracket before re-running it, so check the next
3792 if (ecode[3] == OP_OPT)
3794 ims = (ims & ~PCRE_IMS) | ecode[4];
3795 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3798 if (*ecode == OP_KETRMIN)
3800 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3801 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3804 else /* OP_KETRMAX */
3806 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3807 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3812 /* An alternation is the end of a branch; scan along to find the end of the
3813 bracketed group and go to there. */
3816 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3819 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3820 that it may occur zero times. It may repeat infinitely, or not at all -
3821 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3822 repeat limits are compiled as a number of copies, with the optional ones
3823 preceded by BRAZERO or BRAMINZERO. */
3827 const uschar *next = ecode+1;
3828 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3830 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3837 const uschar *next = ecode+1;
3838 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3839 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3845 /* End of a group, repeated or non-repeating. If we are at the end of
3846 an assertion "group", stop matching and return TRUE, but record the
3847 current high water mark for use by positive assertions. Do this also
3848 for the "once" (not-backup up) groups. */
3854 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3855 const uschar *saved_eptr = eptrb->saved_eptr;
3857 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3859 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3860 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3863 md->end_match_ptr = eptr; /* For ONCE */
3864 md->end_offset_top = offset_top;
3868 /* In all other cases except a conditional group we have to check the
3869 group number back at the start and if necessary complete handling an
3870 extraction by setting the offsets and bumping the high water mark. */
3872 if (*prev != OP_COND)
3874 int number = *prev - OP_BRA;
3875 int offset = number << 1;
3878 printf("end bracket %d", number);
3884 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3886 md->offset_vector[offset] =
3887 md->offset_vector[md->offset_end - number];
3888 md->offset_vector[offset+1] = eptr - md->start_subject;
3889 if (offset_top <= offset) offset_top = offset + 2;
3894 /* Reset the value of the ims flags, in case they got changed during
3898 DPRINTF(("ims reset to %02lx\n", ims));
3900 /* For a non-repeating ket, just continue at this level. This also
3901 happens for a repeating ket if no characters were matched in the group.
3902 This is the forcible breaking of infinite loops as implemented in Perl
3903 5.005. If there is an options reset, it will get obeyed in the normal
3904 course of events. */
3906 if (*ecode == OP_KET || eptr == saved_eptr)
3912 /* The repeating kets try the rest of the pattern or restart from the
3913 preceding bracket, in the appropriate order. */
3915 if (*ecode == OP_KETRMIN)
3917 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3918 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3921 else /* OP_KETRMAX */
3923 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3924 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3929 /* Start of subject unless notbol, or after internal newline if multiline */
3932 if (md->notbol && eptr == md->start_subject) return FALSE;
3933 if ((ims & PCRE_MULTILINE) != 0)
3935 if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3939 /* ... else fall through */
3941 /* Start of subject assertion */
3944 if (eptr != md->start_subject) return FALSE;
3948 /* Assert before internal newline if multiline, or before a terminating
3949 newline unless endonly is set, else end of subject unless noteol is set. */
3952 if ((ims & PCRE_MULTILINE) != 0)
3954 if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3955 else { if (md->noteol) return FALSE; }
3961 if (md->noteol) return FALSE;
3964 if (eptr < md->end_subject - 1 ||
3965 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3971 /* ... else fall through */
3973 /* End of subject assertion (\z) */
3976 if (eptr < md->end_subject) return FALSE;
3980 /* End of subject or ending \n assertion (\Z) */
3983 if (eptr < md->end_subject - 1 ||
3984 (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3988 /* Word boundary assertions */
3990 case OP_NOT_WORD_BOUNDARY:
3991 case OP_WORD_BOUNDARY:
3993 BOOL prev_is_word = (eptr != md->start_subject) &&
3994 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3995 BOOL cur_is_word = (eptr < md->end_subject) &&
3996 ((md->ctypes[*eptr] & ctype_word) != 0);
3997 if ((*ecode++ == OP_WORD_BOUNDARY)?
3998 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
4003 /* Match a single character type; inline for speed */
4006 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
4008 if (eptr++ >= md->end_subject) return FALSE;
4011 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4017 if (eptr >= md->end_subject ||
4018 (md->ctypes[*eptr++] & ctype_digit) != 0)
4024 if (eptr >= md->end_subject ||
4025 (md->ctypes[*eptr++] & ctype_digit) == 0)
4030 case OP_NOT_WHITESPACE:
4031 if (eptr >= md->end_subject ||
4032 (md->ctypes[*eptr++] & ctype_space) != 0)
4038 if (eptr >= md->end_subject ||
4039 (md->ctypes[*eptr++] & ctype_space) == 0)
4044 case OP_NOT_WORDCHAR:
4045 if (eptr >= md->end_subject ||
4046 (md->ctypes[*eptr++] & ctype_word) != 0)
4052 if (eptr >= md->end_subject ||
4053 (md->ctypes[*eptr++] & ctype_word) == 0)
4058 /* Match a back reference, possibly repeatedly. Look past the end of the
4059 item to see if there is repeat information following. The code is similar
4060 to that for character classes, but repeated for efficiency. Then obey
4061 similar code to character type repeats - written out again for speed.
4062 However, if the referenced string is the empty string, always treat
4063 it as matched, any number of times (otherwise there could be infinite
4069 int offset = ecode[1] << 1; /* Doubled reference number */
4070 ecode += 2; /* Advance past the item */
4072 /* If the reference is unset, set the length to be longer than the amount
4073 of subject left; this ensures that every attempt at a match fails. We
4074 can't just fail here, because of the possibility of quantifiers with zero
4077 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4078 md->end_subject - eptr + 1 :
4079 md->offset_vector[offset+1] - md->offset_vector[offset];
4081 /* Set up for repetition, or handle the non-repeated case */
4091 c = *ecode++ - OP_CRSTAR;
4092 minimize = (c & 1) != 0;
4093 min = rep_min[c]; /* Pick up values from tables; */
4094 max = rep_max[c]; /* zero for max => infinity */
4095 if (max == 0) max = INT_MAX;
4100 minimize = (*ecode == OP_CRMINRANGE);
4101 min = (ecode[1] << 8) + ecode[2];
4102 max = (ecode[3] << 8) + ecode[4];
4103 if (max == 0) max = INT_MAX;
4107 default: /* No repeat follows */
4108 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4110 continue; /* With the main loop */
4113 /* If the length of the reference is zero, just continue with the
4116 if (length == 0) continue;
4118 /* First, ensure the minimum number of matches are present. We get back
4119 the length of the reference string explicitly rather than passing the
4120 address of eptr, so that eptr can be a register variable. */
4122 for (i = 1; i <= min; i++)
4124 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4128 /* If min = max, continue at the same level without recursion.
4129 They are not both allowed to be zero. */
4131 if (min == max) continue;
4133 /* If minimizing, keep trying and advancing the pointer */
4139 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4141 if (i >= max || !match_ref(offset, eptr, length, md, ims))
4145 /* Control never gets here */
4148 /* If maximizing, find the longest string and work backwards */
4152 const uschar *pp = eptr;
4153 for (i = min; i < max; i++)
4155 if (!match_ref(offset, eptr, length, md, ims)) break;
4160 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4167 /* Control never gets here */
4171 /* Match a character class, possibly repeatedly. Look past the end of the
4172 item to see if there is repeat information following. Then obey similar
4173 code to character type repeats - written out again for speed. */
4177 const uschar *data = ecode + 1; /* Save for matching */
4178 ecode += 33; /* Advance past the item */
4188 c = *ecode++ - OP_CRSTAR;
4189 minimize = (c & 1) != 0;
4190 min = rep_min[c]; /* Pick up values from tables; */
4191 max = rep_max[c]; /* zero for max => infinity */
4192 if (max == 0) max = INT_MAX;
4197 minimize = (*ecode == OP_CRMINRANGE);
4198 min = (ecode[1] << 8) + ecode[2];
4199 max = (ecode[3] << 8) + ecode[4];
4200 if (max == 0) max = INT_MAX;
4204 default: /* No repeat follows */
4209 /* First, ensure the minimum number of matches are present. */
4211 for (i = 1; i <= min; i++)
4213 if (eptr >= md->end_subject) return FALSE;
4214 GETCHARINC(c, eptr) /* Get character; increment eptr */
4217 /* We do not yet support class members > 255 */
4218 if (c > 255) return FALSE;
4221 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4225 /* If max == min we can continue with the main loop without the
4228 if (min == max) continue;
4230 /* If minimizing, keep testing the rest of the expression and advancing
4231 the pointer while it matches the class. */
4237 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4239 if (i >= max || eptr >= md->end_subject) return FALSE;
4240 GETCHARINC(c, eptr) /* Get character; increment eptr */
4243 /* We do not yet support class members > 255 */
4244 if (c > 255) return FALSE;
4246 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4249 /* Control never gets here */
4252 /* If maximizing, find the longest possible run, then work backwards. */
4256 const uschar *pp = eptr;
4258 for (i = min; i < max; i++)
4260 if (eptr >= md->end_subject) break;
4261 GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
4264 /* We do not yet support class members > 255 */
4267 if ((data[c/8] & (1 << (c&7))) == 0) break;
4273 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4283 /* Control never gets here */
4285 /* Match a run of characters */
4289 register int length = ecode[1];
4292 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4293 if (eptr >= md->end_subject)
4294 printf("matching subject <null> against pattern ");
4297 printf("matching subject ");
4298 pchars(eptr, length, TRUE, md);
4299 printf(" against pattern ");
4301 pchars(ecode, length, FALSE, md);
4305 if (length > md->end_subject - eptr) return FALSE;
4306 if ((ims & PCRE_CASELESS) != 0)
4308 while (length-- > 0)
4309 if (md->lcc[*ecode++] != md->lcc[*eptr++])
4314 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4319 /* Match a single character repeatedly; different opcodes share code. */
4322 min = max = (ecode[1] << 8) + ecode[2];
4329 max = (ecode[1] << 8) + ecode[2];
4330 minimize = *ecode == OP_MINUPTO;
4340 c = *ecode++ - OP_STAR;
4341 minimize = (c & 1) != 0;
4342 min = rep_min[c]; /* Pick up values from tables; */
4343 max = rep_max[c]; /* zero for max => infinity */
4344 if (max == 0) max = INT_MAX;
4346 /* Common code for all repeated single-character matches. We can give
4347 up quickly if there are fewer than the minimum number of characters left in
4351 if (min > md->end_subject - eptr) return FALSE;
4354 /* The code is duplicated for the caseless and caseful cases, for speed,
4355 since matching characters is likely to be quite common. First, ensure the
4356 minimum number of matches are present. If min = max, continue at the same
4357 level without recursing. Otherwise, if minimizing, keep trying the rest of
4358 the expression and advancing one matching character if failing, up to the
4359 maximum. Alternatively, if maximizing, find the maximum number of
4360 characters and work backwards. */
4362 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4365 if ((ims & PCRE_CASELESS) != 0)
4368 for (i = 1; i <= min; i++)
4369 if (c != md->lcc[*eptr++]) return FALSE;
4370 if (min == max) continue;
4375 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4377 if (i >= max || eptr >= md->end_subject ||
4378 c != md->lcc[*eptr++])
4381 /* Control never gets here */
4385 const uschar *pp = eptr;
4386 for (i = min; i < max; i++)
4388 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4392 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4396 /* Control never gets here */
4399 /* Caseful comparisons */
4403 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4404 if (min == max) continue;
4409 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4411 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4413 /* Control never gets here */
4417 const uschar *pp = eptr;
4418 for (i = min; i < max; i++)
4420 if (eptr >= md->end_subject || c != *eptr) break;
4424 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4429 /* Control never gets here */
4431 /* Match a negated single character */
4434 if (eptr >= md->end_subject) return FALSE;
4436 if ((ims & PCRE_CASELESS) != 0)
4438 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4442 if (*ecode++ == *eptr++) return FALSE;
4446 /* Match a negated single character repeatedly. This is almost a repeat of
4447 the code for a repeated single character, but I haven't found a nice way of
4448 commoning these up that doesn't require a test of the positive/negative
4449 option for each character match. Maybe that wouldn't add very much to the
4450 time taken, but character matching *is* what this is all about... */
4453 min = max = (ecode[1] << 8) + ecode[2];
4460 max = (ecode[1] << 8) + ecode[2];
4461 minimize = *ecode == OP_NOTMINUPTO;
4470 case OP_NOTMINQUERY:
4471 c = *ecode++ - OP_NOTSTAR;
4472 minimize = (c & 1) != 0;
4473 min = rep_min[c]; /* Pick up values from tables; */
4474 max = rep_max[c]; /* zero for max => infinity */
4475 if (max == 0) max = INT_MAX;
4477 /* Common code for all repeated single-character matches. We can give
4478 up quickly if there are fewer than the minimum number of characters left in
4482 if (min > md->end_subject - eptr) return FALSE;
4485 /* The code is duplicated for the caseless and caseful cases, for speed,
4486 since matching characters is likely to be quite common. First, ensure the
4487 minimum number of matches are present. If min = max, continue at the same
4488 level without recursing. Otherwise, if minimizing, keep trying the rest of
4489 the expression and advancing one matching character if failing, up to the
4490 maximum. Alternatively, if maximizing, find the maximum number of
4491 characters and work backwards. */
4493 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4496 if ((ims & PCRE_CASELESS) != 0)
4499 for (i = 1; i <= min; i++)
4500 if (c == md->lcc[*eptr++]) return FALSE;
4501 if (min == max) continue;
4506 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4508 if (i >= max || eptr >= md->end_subject ||
4509 c == md->lcc[*eptr++])
4512 /* Control never gets here */
4516 const uschar *pp = eptr;
4517 for (i = min; i < max; i++)
4519 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4523 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4527 /* Control never gets here */
4530 /* Caseful comparisons */
4534 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4535 if (min == max) continue;
4540 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4542 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4544 /* Control never gets here */
4548 const uschar *pp = eptr;
4549 for (i = min; i < max; i++)
4551 if (eptr >= md->end_subject || c == *eptr) break;
4555 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4560 /* Control never gets here */
4562 /* Match a single character type repeatedly; several different opcodes
4563 share code. This is very similar to the code for single characters, but we
4564 repeat it in the interests of efficiency. */
4567 min = max = (ecode[1] << 8) + ecode[2];
4573 case OP_TYPEMINUPTO:
4575 max = (ecode[1] << 8) + ecode[2];
4576 minimize = *ecode == OP_TYPEMINUPTO;
4581 case OP_TYPEMINSTAR:
4583 case OP_TYPEMINPLUS:
4585 case OP_TYPEMINQUERY:
4586 c = *ecode++ - OP_TYPESTAR;
4587 minimize = (c & 1) != 0;
4588 min = rep_min[c]; /* Pick up values from tables; */
4589 max = rep_max[c]; /* zero for max => infinity */
4590 if (max == 0) max = INT_MAX;
4592 /* Common code for all repeated single character type matches */
4595 ctype = *ecode++; /* Code for the character type */
4597 /* First, ensure the minimum number of matches are present. Use inline
4598 code for maximizing the speed, and do the type test once at the start
4599 (i.e. keep it out of the loop). Also we can test that there are at least
4600 the minimum number of bytes before we start, except when doing '.' in
4601 UTF8 mode. Leave the test in in all cases; in the special case we have
4602 to test after each character. */
4604 if (min > md->end_subject - eptr) return FALSE;
4605 if (min > 0) switch(ctype)
4611 for (i = 1; i <= min; i++)
4613 if (eptr >= md->end_subject ||
4614 (*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
4616 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4621 /* Non-UTF8 can be faster */
4622 if ((ims & PCRE_DOTALL) == 0)
4623 { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
4628 for (i = 1; i <= min; i++)
4629 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4633 for (i = 1; i <= min; i++)
4634 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4637 case OP_NOT_WHITESPACE:
4638 for (i = 1; i <= min; i++)
4639 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4643 for (i = 1; i <= min; i++)
4644 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4647 case OP_NOT_WORDCHAR:
4648 for (i = 1; i <= min; i++)
4649 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4654 for (i = 1; i <= min; i++)
4655 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4660 /* If min = max, continue at the same level without recursing */
4662 if (min == max) continue;
4664 /* If minimizing, we have to test the rest of the pattern before each
4665 subsequent match. */
4671 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4672 if (i >= max || eptr >= md->end_subject) return FALSE;
4678 if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4681 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4686 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4690 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4693 case OP_NOT_WHITESPACE:
4694 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4698 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4701 case OP_NOT_WORDCHAR:
4702 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4706 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4710 /* Control never gets here */
4713 /* If maximizing it is worth using inline code for speed, doing the type
4714 test once at the start (i.e. keep it out of the loop). */
4718 const uschar *pp = eptr;
4723 /* Special code is required for UTF8, but when the maximum is unlimited
4724 we don't need it. */
4727 if (md->utf8 && max < INT_MAX)
4729 if ((ims & PCRE_DOTALL) == 0)
4731 for (i = min; i < max; i++)
4733 if (eptr >= md->end_subject || *eptr++ == '\n') break;
4734 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4739 for (i = min; i < max; i++)
4742 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4748 /* Non-UTF8 can be faster */
4749 if ((ims & PCRE_DOTALL) == 0)
4751 for (i = min; i < max; i++)
4753 if (eptr >= md->end_subject || *eptr == '\n') break;
4760 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4766 for (i = min; i < max; i++)
4768 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4775 for (i = min; i < max; i++)
4777 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4783 case OP_NOT_WHITESPACE:
4784 for (i = min; i < max; i++)
4786 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4793 for (i = min; i < max; i++)
4795 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4801 case OP_NOT_WORDCHAR:
4802 for (i = min; i < max; i++)
4804 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4811 for (i = min; i < max; i++)
4813 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4822 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4826 while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4831 /* Control never gets here */
4833 /* There's been some horrible disaster. */
4836 DPRINTF(("Unknown opcode %d\n", *ecode));
4837 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4841 /* Do not stick any code in here without much thought; it is assumed
4842 that "continue" in the code above comes out to here to repeat the main
4845 } /* End of main loop */
4846 /* Control never reaches here */
4852 /*************************************************
4853 * Execute a Regular Expression *
4854 *************************************************/
4856 /* This function applies a compiled re to a subject string and picks out
4857 portions of the string if it matches. Two elements in the vector are set for
4858 each substring: the offsets to the start and end of the substring.
4861 external_re points to the compiled expression
4862 external_extra points to "hints" from pcre_study() or is NULL
4863 subject points to the subject string
4864 length length of subject string (may contain binary zeros)
4865 start_offset where to start in the subject string
4867 offsets points to a vector of ints to be filled in with offsets
4868 offsetcount the number of elements in the vector
4870 Returns: > 0 => success; value is the number of elements filled in
4871 = 0 => success, but offsets is not big enough
4872 -1 => failed to match
4873 < -1 => some kind of unexpected problem
4877 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4878 const char *subject, int length, int start_offset, int options, int *offsets,
4881 int resetcount, ocount;
4882 int first_char = -1;
4885 unsigned long int ims = 0;
4886 match_data match_block;
4887 const uschar *start_bits = NULL;
4888 const uschar *start_match = (const uschar *)subject + start_offset;
4889 const uschar *end_subject;
4890 const uschar *req_char_ptr = start_match - 1;
4891 const real_pcre *re = (const real_pcre *)external_re;
4892 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4893 BOOL using_temporary_offsets = FALSE;
4894 BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4895 BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4897 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4899 if (re == NULL || subject == NULL ||
4900 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4901 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4903 match_block.start_pattern = re->code;
4904 match_block.start_subject = (const uschar *)subject;
4905 match_block.end_subject = match_block.start_subject + length;
4906 end_subject = match_block.end_subject;
4908 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4909 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4911 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4912 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4913 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4915 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4917 match_block.lcc = re->tables + lcc_offset;
4918 match_block.ctypes = re->tables + ctypes_offset;
4920 /* The ims options can vary during the matching as a result of the presence
4921 of (?ims) items in the pattern. They are kept in a local variable so that
4922 restoring at the exit of a group is easy. */
4924 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4926 /* If the expression has got more back references than the offsets supplied can
4927 hold, we get a temporary bit of working store to use during the matching.
4928 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4931 ocount = offsetcount - (offsetcount % 3);
4933 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4935 ocount = re->top_backref * 3 + 3;
4936 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4937 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4938 using_temporary_offsets = TRUE;
4939 DPRINTF(("Got memory to hold back references\n"));
4941 else match_block.offset_vector = offsets;
4943 match_block.offset_end = ocount;
4944 match_block.offset_max = (2*ocount)/3;
4945 match_block.offset_overflow = FALSE;
4947 /* Compute the minimum number of offsets that we need to reset each time. Doing
4948 this makes a huge difference to execution time when there aren't many brackets
4951 resetcount = 2 + re->top_bracket * 2;
4952 if (resetcount > offsetcount) resetcount = ocount;
4954 /* Reset the working variable associated with each extraction. These should
4955 never be used unless previously set, but they get saved and restored, and so we
4956 initialize them to avoid reading uninitialized locations. */
4958 if (match_block.offset_vector != NULL)
4960 register int *iptr = match_block.offset_vector + ocount;
4961 register int *iend = iptr - resetcount/2 + 1;
4962 while (--iptr >= iend) *iptr = -1;
4965 /* Set up the first character to match, if available. The first_char value is
4966 never set for an anchored regular expression, but the anchoring may be forced
4967 at run time, so we have to test for anchoring. The first char may be unset for
4968 an unanchored pattern, of course. If there's no first char and the pattern was
4969 studied, there may be a bitmap of possible first characters. */
4973 if ((re->options & PCRE_FIRSTSET) != 0)
4975 first_char = re->first_char;
4976 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4979 if (!startline && extra != NULL &&
4980 (extra->options & PCRE_STUDY_MAPPED) != 0)
4981 start_bits = extra->start_bits;
4984 /* For anchored or unanchored matches, there may be a "last known required
4985 character" set. If the PCRE_CASELESS is set, implying that the match starts
4986 caselessly, or if there are any changes of this flag within the regex, set up
4987 both cases of the character. Otherwise set the two values the same, which will
4988 avoid duplicate testing (which takes significant time). This covers the vast
4989 majority of cases. It will be suboptimal when the case flag changes in a regex
4990 and the required character in fact is caseful. */
4992 if ((re->options & PCRE_REQCHSET) != 0)
4994 req_char = re->req_char;
4995 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4996 (re->tables + fcc_offset)[req_char] : req_char;
4999 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5000 the loop runs just once. */
5005 register int *iptr = match_block.offset_vector;
5006 register int *iend = iptr + resetcount;
5008 /* Reset the maximum number of extractions we might see. */
5010 while (iptr < iend) *iptr++ = -1;
5012 /* Advance to a unique first char if possible */
5014 if (first_char >= 0)
5016 if ((ims & PCRE_CASELESS) != 0)
5017 while (start_match < end_subject &&
5018 match_block.lcc[*start_match] != first_char)
5021 while (start_match < end_subject && *start_match != first_char)
5025 /* Or to just after \n for a multiline match if possible */
5029 if (start_match > match_block.start_subject + start_offset)
5031 while (start_match < end_subject && start_match[-1] != '\n')
5036 /* Or to a non-unique first char after study */
5038 else if (start_bits != NULL)
5040 while (start_match < end_subject)
5042 register int c = *start_match;
5043 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5047 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5048 printf(">>>> Match against: ");
5049 pchars(start_match, end_subject - start_match, TRUE, &match_block);
5053 /* If req_char is set, we know that that character must appear in the subject
5054 for the match to succeed. If the first character is set, req_char must be
5055 later in the subject; otherwise the test starts at the match point. This
5056 optimization can save a huge amount of backtracking in patterns with nested
5057 unlimited repeats that aren't going to match. We don't know what the state of
5058 case matching may be when this character is hit, so test for it in both its
5059 cases if necessary. However, the different cased versions will not be set up
5060 unless PCRE_CASELESS was given or the casing state changes within the regex.
5061 Writing separate code makes it go faster, as does using an autoincrement and
5062 backing off on a match. */
5066 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5068 /* We don't need to repeat the search if we haven't yet reached the
5069 place we found it at last time. */
5071 if (p > req_char_ptr)
5073 /* Do a single test if no case difference is set up */
5075 if (req_char == req_char2)
5077 while (p < end_subject)
5079 if (*p++ == req_char) { p--; break; }
5083 /* Otherwise test for either case */
5087 while (p < end_subject)
5089 register int pp = *p++;
5090 if (pp == req_char || pp == req_char2) { p--; break; }
5094 /* If we can't find the required character, break the matching loop */
5096 if (p >= end_subject) break;
5098 /* If we have found the required character, save the point where we
5099 found it, so that we don't search again next time round the loop if
5100 the start hasn't passed this character yet. */
5106 /* When a match occurs, substrings will be set for all internal extractions;
5107 we just need to set up the whole thing as substring 0 before returning. If
5108 there were too many extractions, set the return code to zero. In the case
5109 where we had to get some local store to hold offsets for backreferences, copy
5110 those back references that we can. In this case there need not be overflow
5111 if certain parts of the pattern were not used. */
5113 match_block.start_match = start_match;
5114 if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5117 /* Copy the offset information from temporary store if necessary */
5119 if (using_temporary_offsets)
5121 if (offsetcount >= 4)
5123 memcpy(offsets + 2, match_block.offset_vector + 2,
5124 (offsetcount - 2) * sizeof(int));
5125 DPRINTF(("Copied offsets from temporary memory\n"));
5127 if (match_block.end_offset_top > offsetcount)
5128 match_block.offset_overflow = TRUE;
5130 DPRINTF(("Freeing temporary memory\n"));
5131 (pcre_free)(match_block.offset_vector);
5134 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5136 if (match_block.offset_end < 2) rc = 0; else
5138 offsets[0] = start_match - match_block.start_subject;
5139 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5142 DPRINTF((">>>> returning %d\n", rc));
5146 /* This "while" is the end of the "do" above */
5149 match_block.errorcode == PCRE_ERROR_NOMATCH &&
5150 start_match++ < end_subject);
5152 if (using_temporary_offsets)
5154 DPRINTF(("Freeing temporary memory\n"));
5155 (pcre_free)(match_block.offset_vector);
5158 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5160 return match_block.errorcode;