1 const char pcrs_rcs[] = "$Id: pcrs.c,v 1.7 2001/06/29 13:33:04 oes Exp $"; /* RCS "$Id$" keyword expansion: identifies the revision of this source file */
3 /*********************************************************************
5 * File : $Source: /cvsroot/ijbswa/current/pcrs.c,v $
7 * Purpose : This is the alpha release of libpcrs. It is only published
8 * at this early stage of development, because it is
9 * needed for a new feature in JunkBuster.
11 * While no inconsistencies, memory leaks or functional bugs
12 * are known at this time, there *could* be plenty ;-). Also,
13 * Many pcre-specific options are not yet supported, and
14 * error handling needs improvement.
16 * pcrs is a supplement to the brilliant pcre library by Philip
17 * Hazel (ph10@cam.ac.uk) and adds Perl-style substitution. That
18 * is, it mimics Perl's 's' operator.
20 * Currently, there's no documentation besides comments and the
23 * Short note: In addition to perl's options, 'U' for ungreedy
24 * and 'T' for trivial (i.e.: ignore backrefs in the substitute) are supported.
27 * Copyright : Written and Copyright (C) 2000, 2001 by Andreas S. Oesterhelt
28 * <andreas@oesterhelt.org>
30 * This program is free software; you can redistribute it
31 * and/or modify it under the terms of the GNU General
32 * Public License as published by the Free Software
33 * Foundation; either version 2 of the License, or (at
34 * your option) any later version.
36 * This program is distributed in the hope that it will
37 * be useful, but WITHOUT ANY WARRANTY; without even the
38 * implied warranty of MERCHANTABILITY or FITNESS FOR A
39 * PARTICULAR PURPOSE. See the GNU General Public
40 * License for more details.
42 * The GNU General Public License should be included with
43 * this file. If not, you can view it at
44 * http://www.gnu.org/copyleft/gpl.html
45 * or write to the Free Software Foundation, Inc., 59
46 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
50 * Revision 1.7 2001/06/29 13:33:04 oes
51 * - Cleaned up, renamed and reordered functions,
54 * - Replaced globalflag with a general flags int
55 * that holds PCRS_GLOBAL, PCRS_SUCCESS, and PCRS_TRIVIAL
56 * - Introduced trivial option that will prevent pcrs
57 * from honouring backreferences in the substitute,
58 * which is useful for large substitutes that are
59 * read in from somewhere and saves the pain of escaping
61 * - Introduced convenience function pcrs_free_joblist()
62 * - Split pcrs_make_job() into pcrs_compile(), which still
63 * takes a complete s/// command as argument and parses it,
64 * and a new function pcrs_make_job, which takes the
65 * three separate components. This should make for a
66 * much friendlier frontend.
67 * - Removed create_pcrs_job() which was useless
68 * - Fixed a bug in pcrs_execute
69 * - Success flag is now handled by pcrs instead of user
70 * - Removed logentry from cancelled commit
72 * Revision 1.6 2001/06/03 19:12:45 oes
75 * Revision 1.5 2001/05/29 09:50:24 jongfoster
76 * Unified blocklist/imagelist/permissionslist.
77 * File format is still under discussion, but the internal changes
80 * Also modified interceptor behaviour:
81 * - We now intercept all URLs beginning with one of the following
82 * prefixes (and *only* these prefixes):
84 * * http://ijbswa.sf.net/config/
85 * * http://ijbswa.sourceforge.net/config/
86 * - New interceptors "home page" - go to http://i.j.b/ to see it.
87 * - Internal changes so that intercepted and fast redirect pages
88 * are not replaced with an image.
89 * - Interceptors now have the option to send a binary page direct
90 * to the client. (i.e. ijb-send-banner uses this)
91 * - Implemented show-url-info interceptor. (Which is why I needed
92 * the above interceptors changes - a typical URL is
93 * "http://i.j.b/show-url-info?url=www.somesite.com/banner.gif".
94 * The previous mechanism would not have intercepted that, and
95 * if it had been intercepted then it would have replaced
98 * Revision 1.4 2001/05/25 14:12:40 oes
99 * Fixed bug: Empty substitutes now detected
101 * Revision 1.3 2001/05/25 11:03:55 oes
102 * Added sanity check for NULL jobs to pcrs_exec_substitution
104 * Revision 1.2 2001/05/22 18:46:04 oes
106 * - Enabled filtering banners by size rather than URL
107 * by adding patterns that replace all standard banner
108 * sizes with the "Junkbuster" gif to the re_filterfile
110 * - Enabled filtering WebBugs by providing a pattern
111 * which kills all 1x1 images
113 * - Added support for PCRE_UNGREEDY behaviour to pcrs,
114 * which is selected by the (nonstandard and therefore
115 * capital) letter 'U' in the option string.
116 * It causes the quantifiers to be ungreedy by default.
117 * Appending a ? turns back to greedy (!).
119 * - Added a new interceptor ijb-send-banner, which
120 * sends back the "Junkbuster" gif. Without imagelist or
121 * MSIE detection support, or if tinygif = 1, or the
122 * URL isn't recognized as an imageurl, a lame HTML
123 * explanation is sent instead.
125 * - Added new feature, which permits blocking remote
126 * script redirects and firing back a local redirect
128 * The feature is conditionally compiled, i.e. it
129 * can be disabled with --disable-fast-redirects,
130 * plus it must be activated by a "fast-redirects"
131 * line in the config file, has its own log level
132 * and of course wants to be displayed by show-proxy-args
133 * Note: Boy, all the #ifdefs in 1001 locations and
134 * all the fumbling with configure.in and acconfig.h
135 * were *way* more work than the feature itself :-(
137 * - Because a generic redirect template was needed for
138 * this, tinygif = 3 now uses the same.
140 * - Moved GIFs, and other static HTTP response templates
145 * - Removed some >400 CRs again (Jon, you really worked
148 * Revision 1.1.1.1 2001/05/15 13:59:02 oes
149 * Initial import of version 2.9.3 source tree
152 *********************************************************************/
158 const char pcrs_h_rcs[] = PCRS_H_VERSION; /* PCRS_H_VERSION is defined outside this listing; presumably the revision string of the matching header -- TODO confirm */
161 /*********************************************************************
163 * Function : pcrs_compile_perl_options
165 * Description : This function parses a string containing the options to
166 * Perl's s/// operator. It returns an integer that is the
167 * pcre equivalent of the symbolic optstring.
168 * Since pcre doesn't know about Perl's 'g' (global) or pcrs'
169 * 'T' (trivial) options but pcrs needs them, the corresponding
170 * bits (PCRS_GLOBAL, PCRS_TRIVIAL) are OR-ed into the flags argument if 'g' or 'T' is encountered.
171 * Note: The 'T' and 'U' options do not conform to Perl.
174 * 1 : optstring = string with options in perl syntax; it is handed to strlen(), so it must be a NUL-terminated string
175 * 2 : flags = out-parameter receiving the PCRS_GLOBAL and PCRS_TRIVIAL bits (the visible code only ORs bits in; whether it is cleared first is not shown in this listing)
177 * Returns : option integer suitable for pcre
179 *********************************************************************/
180 int pcrs_compile_perl_options(char *optstring, int *flags)
185 for (i=0; i < strlen(optstring); i++) /* visit every option letter; NOTE(review): strlen() is re-evaluated on each pass */
190 case 'g': *flags |= PCRS_GLOBAL; break; /* handled by pcrs itself, not a pcre option */
191 case 'i': rc |= PCRE_CASELESS; break; /* the remaining letters map directly onto pcre option bits collected in rc */
192 case 'm': rc |= PCRE_MULTILINE; break;
194 case 's': rc |= PCRE_DOTALL; break;
195 case 'x': rc |= PCRE_EXTENDED; break;
196 case 'U': rc |= PCRE_UNGREEDY; break; /* non-Perl extension (see the note in the header) */
197 case 'T': *flags |= PCRS_TRIVIAL; break; /* non-Perl extension, handled by pcrs itself */
206 /*********************************************************************
208 * Function : pcrs_compile_replacement
210 * Description : This function takes a Perl-style replacement (2nd argument
211 * to the s/// operator) and returns a compiled pcrs_substitute,
212 * or NULL if memory allocation for the substitute structure fails. The visible parsing code splits the replacement into literal text blocks separated by backreferences ($<digits> or $&).
216 * 1 : replacement = replacement part of s/// operator
218 * 2 : trivialflag = nonzero selects the 'trivial' mode; the parsing loop below belongs to the !trivialflag branch (see the -END- marker), the trivial branch itself is not part of this listing
219 * 3 : errptr = pointer to an integer in which error conditions can be returned.
221 * Returns : pcrs_substitute data structure, or NULL if an
222 * error is encountered. PCRS_ERR_NOMEM is stored through errptr on the path following strdup(); NOTE(review): the early return after a failed struct malloc() does not visibly set it.
225 *********************************************************************/
226 pcrs_substitute *pcrs_compile_replacement(char *replacement, int trivialflag, int *errptr)
228 int length, i, k = 0, l = 0, quoted = 0, idx; /* i: read index into replacement, k: write index into text, l: index of the current block/backref slot */
229 char *text, *num_ptr, *numbers = "0123456789"; /* numbers: the position of a digit inside this string is its numeric value */
232 r = (pcrs_substitute *)malloc(sizeof(pcrs_substitute));
233 if (r == NULL) return NULL; /* NOTE(review): returns without storing an error code through errptr */
234 memset(r, '\0', sizeof(pcrs_substitute)); /* zero everything so backref[], backref_count[] and block_offset[] start at 0 */
236 text = strdup(replacement); /* must be free()d by caller; this copy is the buffer the parsing loop writes into (never longer than the input) */
239 *errptr = PCRS_ERR_NOMEM; /* presumably the strdup() failure path -- the guarding condition is not part of this listing */
244 length = strlen(replacement);
252 for (i=0; i < length; i++)
254 /* Backslash treatment */
255 if (replacement[i] == '\\')
259 text[k++] = replacement[i]; /* copies the backslash itself to the output; the condition selecting this is not part of this listing */
269 /* Dollar treatment: only an unquoted '$' that is not the last character can start a backreference */
270 if (replacement[i] == '$' && !quoted && i < length - 1)
272 if (strchr("0123456789&", replacement[i + 1]) == NULL) /* next char is neither a digit nor '&' ... */
274 text[k++] = replacement[i]; /* ... so the '$' is kept as literal text */
278 r->block_length[l] = k - r->block_offset[l]; /* close the current text block: everything written since its start offset */
280 if (replacement[i + 1] != '&') /* numbered backreference rather than "$&" */
282 while ((num_ptr = strchr(numbers, replacement[++i])) != NULL && i < length) /* consume the digits; strchr() also matches the terminating NUL, which the i < length test rejects */
284 idx = num_ptr - numbers; /* digit value */
285 r->backref[l] = r->backref[l] * 10 + idx; /* accumulate the decimal backreference number */
291 if (r->backref[l] < PCRS_MAX_SUBMATCHES) /* only in-range backreferences are counted */
292 r->backref_count[r->backref[l]] += 1; /* pcrs_execute() multiplies submatch lengths by this count when sizing the result */
294 r->block_offset[l] = k; /* a text block starts at the current output position (the step advancing l is not part of this listing) */
299 /* Plain char treatment */
300 text[k++] = replacement[i];
303 } /* -END- if (!trivialflag) */
308 r->block_length[l] = k - r->block_offset[l]; /* length of the final text block */
314 /*********************************************************************
316 * Function : pcrs_free_job
318 * Description : Frees the memory used by a pcrs_job struct and its
319 * dependent structures. Returns a pointer to the next
320 * job, if there was any, or NULL otherwise.
323 * 1 : job = pointer to the pcrs_job structure to be freed
325 * Returns : a pointer to the next job, if there was any, or NULL otherwise.
328 *********************************************************************/
329 pcrs_job *pcrs_free_job(pcrs_job *job)
340 if (job->pattern != NULL) free(job->pattern); /* compiled pattern, assigned from pcre_compile() in pcrs_make_job() */
341 if (job->hints != NULL) free(job->hints); /* study data, assigned from pcre_study() in pcrs_make_job() */
342 if (job->substitute != NULL)
344 if (job->substitute->text != NULL) free(job->substitute->text); /* release the substitute's text before the struct that points to it */
345 free(job->substitute);
353 /*********************************************************************
355 * Function : pcrs_free_joblist
357 * Description : Iterates through a chained list of pcrs_job's and
358 * frees them using pcrs_free_job.
361 * 1 : joblist = pointer to the first pcrs_job structure to be freed
366 *********************************************************************/
367 void pcrs_free_joblist(pcrs_job *joblist)
369 while ( NULL != (joblist = pcrs_free_job(joblist)) ) {}; /* pcrs_free_job() hands back the next job; the loop ends when it returns NULL */
376 /*********************************************************************
378 * Function : pcrs_compile
380 * Description : Main entry point. Takes a string with a Perl-style
381 * s/// command and returns a corresponding pcrs_job,
382 * or NULL if compiling the job fails at any stage. The command is split at its delimiter into tokens, which are then handed to pcrs_make_job().
385 * 1 : command = string with perl-style s/// command
386 * 2 : errptr = pointer to an integer in which error
387 * conditions can be returned.
389 * Returns : a corresponding pcrs_job data structure, or NULL
390 * if an error was encountered. This function itself stores PCRS_ERR_CMDSYNTAX through errptr; pcrs_make_job() receives the same pointer and may store other values.
393 *********************************************************************/
394 pcrs_job *pcrs_compile(char *command, int *errptr)
396 int i, k, l, limit, quoted = FALSE; /* i: read index into command, k: write index into the token buffer, l: index of the current token */
404 * Tokenize the perl command
406 limit = strlen(command);
409 *errptr = PCRS_ERR_CMDSYNTAX; /* syntax-error path; its guarding condition is not part of this listing */
414 delimiter = command[1]; /* the second character of the command is taken as the delimiter, e.g. '/' in "s/a/b/" */
417 tokens[l] = (char *) malloc(limit + 1); /* one buffer large enough for the whole command; later tokens point into it (see tokens[++l] below) */
419 for (i=0; i <= limit; i++) /* <= so that the terminating NUL of command is processed as well */
422 if (command[i] == delimiter && !quoted) /* an unquoted delimiter ends the current token */
429 tokens[0][k++] = '\0'; /* terminate the current token inside the shared buffer ... */
430 tokens[++l] = tokens[0] + k; /* ... and let the next token begin right behind it */
434 else if (command[i] == '\\' && !quoted && i+1 < limit && command[i+1] == delimiter) /* unquoted backslash directly in front of a delimiter */
439 tokens[0][k++] = command[i]; /* copy the character into the token buffer */
449 *errptr = PCRS_ERR_CMDSYNTAX; /* second syntax-error path; its guarding condition is not part of this listing */
454 newjob = pcrs_make_job(tokens[1], tokens[2], tokens[3], errptr); /* tokens 1..3 are passed as pattern, substitute and options */
461 /*********************************************************************
463 * Function : pcrs_make_job
465 * Description : Takes the three arguments to a perl s/// command
466 * and compiles a pcrs_job structure from them.
469 * 1 : pattern = string with perl-style pattern (NULL is treated as "")
470 * 2 : substitute = string with perl-style substitute (NULL is treated as "")
471 * 3 : options = string with perl-style options (NULL is treated as "")
472 * 4 : errptr = pointer to an integer in which error
473 * conditions can be returned.
475 * Returns : a corresponding pcrs_job data structure, or NULL
476 * if an error was encountered. Values stored through errptr in the visible code: PCRS_ERR_NOMEM, PCRS_ERR_STUDY, plus whatever pcre_compile() and pcrs_compile_replacement() write there.
479 *********************************************************************/
480 pcrs_job *pcrs_make_job(char *pattern, char *substitute, char *options, int *errptr)
487 * Handle NULL arguments
489 if (pattern == NULL) pattern = ""; /* a NULL argument is replaced by the empty string */
490 if (substitute == NULL) substitute = "";
491 if (options == NULL) options = "";
494 * Get and init memory
496 if (NULL == (newjob = (pcrs_job *)malloc(sizeof(pcrs_job))))
498 *errptr = PCRS_ERR_NOMEM;
501 memset(newjob, '\0', sizeof(pcrs_job)); /* zeroed members let pcrs_free_job(), which NULL-checks pattern/hints/substitute, be used on a partly built job */
505 * Evaluate the options
507 newjob->options = pcrs_compile_perl_options(options, &flags); /* pcre option bits are returned, pcrs-only bits arrive in flags */
508 newjob->flags = flags;
512 * Compile the pattern
514 newjob->pattern = pcre_compile(pattern, newjob->options, &error, errptr, NULL); /* NOTE(review): errptr is passed in pcre_compile()'s error-offset position, so after a failure it would hold a pattern offset rather than a PCRS_ERR_* code -- confirm against the pcre API */
515 if (newjob->pattern == NULL)
517 pcrs_free_job(newjob); /* release the partly built job */
523 * Generate hints. This has little overhead, since the
524 * hints will be NULL for a boring pattern anyway.
526 newjob->hints = pcre_study(newjob->pattern, 0, &error);
529 *errptr = PCRS_ERR_STUDY; /* study failure path; its guarding condition is not part of this listing */
530 pcrs_free_job(newjob);
536 * Compile the substitute
538 if (NULL == (newjob->substitute = pcrs_compile_replacement(substitute, newjob->flags & PCRS_TRIVIAL, errptr))) /* the PCRS_TRIVIAL bit becomes the trivialflag argument */
540 pcrs_free_job(newjob);
549 /*********************************************************************
551 * Function : pcrs_execute
553 * Description : Applies the regular substitution defined by the job to
554 * the subject; the subject itself is only read. Since the result may be longer than
555 * the subject, its space requirements are precalculated in
556 * the matching phase and new memory is allocated accordingly.
557 * It is the caller's responsibility to free the result when
558 * it's no longer needed. As a side effect, PCRS_SUCCESS is set in job->flags once a match is found.
561 * 1 : job = the pcrs_job to be executed
562 * 2 : subject = the subject (== original) string
563 * 3 : subject_length = the subject's length
564 * INCLUDING the terminating zero, if string!
565 * 4 : result = char** for returning the result
566 * 5 : result_length = int* for returning the result's length
568 * Returns : the number of substitutions that were made. May be > 1
569 * if job->flags contained PCRS_GLOBAL. Error returns visible here: PCRS_ERR_BADJOB, PCRS_ERR_NOMEM, or a pcre_exec() result below -1.
571 *********************************************************************/
572 int pcrs_execute(pcrs_job *job, char *subject, int subject_length, char **result, int *result_length)
574 int offsets[3 * PCRS_MAX_SUBMATCHES], /* output vector handed to pcre_exec(); pairs [2k], [2k+1] are read below as start and end of submatch k */
579 pcrs_match matches[PCRS_MAX_MATCHES]; /* one record per match found, capped at PCRS_MAX_MATCHES */
587 if (job == NULL || job->pattern == NULL || job->substitute == NULL) /* refuse jobs that are missing or only partly built */
590 return(PCRS_ERR_BADJOB);
595 * Find the pattern and calculate the space
596 * requirements for the result (newsize)
598 newsize=subject_length; /* start from the subject's size and adjust it per match */
600 while ((submatches = pcre_exec(job->pattern, job->hints, subject, subject_length, offset, 0, offsets, 3 * PCRS_MAX_SUBMATCHES)) > 0) /* keep matching from offset while pcre_exec() reports a match (positive return) */
602 job->flags |= PCRS_SUCCESS; /* remember on the job that it matched */
603 matches[i].submatches = submatches;
604 for (k=0; k < submatches; k++)
606 matches[i].submatch_offset[k] = offsets[2 * k];
608 /* Note: Non-found optional submatches have length -1-(-1)==0 */
609 matches[i].submatch_length[k] = offsets[2 * k + 1] - offsets[2 * k];
611 /* reserve mem for each submatch as often as it is ref'd */
612 newsize += matches[i].submatch_length[k] * job->substitute->backref_count[k];
614 /* plus replacement text size minus match text size (submatch 0 is the whole match) */
615 newsize += strlen(job->substitute->text) - matches[i].submatch_length[0];
617 /* Non-global search or limit reached? */
618 if (++i >= PCRS_MAX_MATCHES || !(job->flags & PCRS_GLOBAL) ) break; /* ++i counts the match just recorded */
620 /* Don't loop on empty matches */
621 if (offsets[1] == offset) /* the match ended where the search started, i.e. it was empty */
622 if (offset < subject_length)
626 /* Go find the next one */
630 /* Pass pcre error through on failure; results of 0 or -1 merely end the matching loop */
631 if (submatches < -1) return submatches;
636 * Get memory for the result
638 if ((*result = (char *)malloc(newsize)) == NULL) /* must be free()d by caller */
640 return PCRS_ERR_NOMEM;
648 result_offset = *result; /* write cursor into the result buffer */
650 for (i=0; i < matches_found; i++)
652 /* copy the chunk preceding the match */
653 memcpy(result_offset, subject + offset, matches[i].submatch_offset[0] - offset);
654 result_offset += matches[i].submatch_offset[0] - offset;
656 /* For every segment of the substitute.. (backrefs + 1 text blocks, hence <=) */
657 for (k=0; k <= job->substitute->backrefs; k++)
659 /* ...copy its text.. */
660 memcpy(result_offset, job->substitute->text + job->substitute->block_offset[k], job->substitute->block_length[k]);
661 result_offset += job->substitute->block_length[k];
663 /* ..plus, if it's not the last chunk (i.e.: There IS a backref).. */
664 if (k != job->substitute->backrefs
665 /* ..and a nonempty match.. */
666 && matches[i].submatch_length[job->substitute->backref[k]] > 0
667 /* ..and in legal range, ... NOTE(review): this test runs after backref[k] was already used as an index above, and it uses <= while pcrs_compile_replacement() counts only backrefs < PCRS_MAX_SUBMATCHES -- confirm against the array sizes in pcrs_match */
668 && job->substitute->backref[k] <= PCRS_MAX_SUBMATCHES)
670 /* copy the submatch that is ref'd. */
673 subject + matches[i].submatch_offset[job->substitute->backref[k]],
674 matches[i].submatch_length[job->substitute->backref[k]]
676 result_offset += matches[i].submatch_length[job->substitute->backref[k]];
679 offset = matches[i].submatch_offset[0] + matches[i].submatch_length[0]; /* continue reading the subject right behind this match */
683 memcpy(result_offset, subject + offset, subject_length - offset); /* copy whatever follows the last match */
685 *result_length = newsize; /* newsize started from subject_length, so it counts the terminating zero whenever subject_length did */
686 return matches_found;