/* Revision-control identification string ($Id$ keyword) for this source file. */
1 const char pcrs_rcs[] = "$Id: pcrs.c,v 1.6 2001/06/03 11:03:48 oes Exp $";
3 /*********************************************************************
5 * File : $Source: /cvsroot/ijbswa/current/pcrs.c,v $
7 * Purpose : This is the alpha release of libpcrs. It is only published
8 * at this early stage of development, because it is
9 * needed for a new feature in JunkBuster.
11 * While no inconsistencies, memory leaks or functional bugs
12 * are known at this time, there *could* be plenty ;-). Also,
13 * many pcre-specific options are not yet supported, and
14 * error handling needs improvement.
16 * pcrs is a supplement to the brilliant pcre library by Philip
17 * Hazel (ph10@cam.ac.uk) and adds Perl-style substitution. That
18 * is, it mimics Perl's 's' operator.
20 * Currently, there's no documentation besides comments and the
23 * Copyright : Written and Copyright (C) 2000 by Andreas Oesterhelt
24 * <andreas@oesterhelt.org>
26 * This program is free software; you can redistribute it
27 * and/or modify it under the terms of the GNU General
28 * Public License as published by the Free Software
29 * Foundation; either version 2 of the License, or (at
30 * your option) any later version.
32 * This program is distributed in the hope that it will
33 * be useful, but WITHOUT ANY WARRANTY; without even the
34 * implied warranty of MERCHANTABILITY or FITNESS FOR A
35 * PARTICULAR PURPOSE. See the GNU General Public
36 * License for more details.
38 * The GNU General Public License should be included with
39 * this file. If not, you can view it at
40 * http://www.gnu.org/copyleft/gpl.html
41 * or write to the Free Software Foundation, Inc., 59
42 * Temple Place - Suite 330, Boston, MA 02111-1307, USA.
46 * Revision 1.6 2001/06/03 11:03:48 oes
53 * adapted to new enlist_unique arg format
57 * introduced confdir option
59 * filters.c filters.h
61 * extracted-CGI relevant stuff
69 * support for new cgi mechanism
73 * functions for new list type: "map"
74 * extended enlist_unique
81 * deleted const struct interceptors
89 * added struct http_response
90 * changes struct interceptors to struct cgi_dispatcher
91 * moved HTML stuff to cgi.h
100 * Revision 1.5 2001/05/29 09:50:24 jongfoster
101 * Unified blocklist/imagelist/permissionslist.
102 * File format is still under discussion, but the internal changes
105 * Also modified interceptor behaviour:
106 * - We now intercept all URLs beginning with one of the following
107 * prefixes (and *only* these prefixes):
109 * * http://ijbswa.sf.net/config/
110 * * http://ijbswa.sourceforge.net/config/
111 * - New interceptors "home page" - go to http://i.j.b/ to see it.
112 * - Internal changes so that intercepted and fast redirect pages
113 * are not replaced with an image.
114 * - Interceptors now have the option to send a binary page direct
115 * to the client. (i.e. ijb-send-banner uses this)
116 * - Implemented show-url-info interceptor. (Which is why I needed
117 * the above interceptors changes - a typical URL is
118 * "http://i.j.b/show-url-info?url=www.somesite.com/banner.gif".
119 * The previous mechanism would not have intercepted that, and
120 * if it had been intercepted then it would have replaced
123 * Revision 1.4 2001/05/25 14:12:40 oes
124 * Fixed bug: Empty substitutes now detected
126 * Revision 1.3 2001/05/25 11:03:55 oes
127 * Added sanity check for NULL jobs to pcrs_exec_substitution
129 * Revision 1.2 2001/05/22 18:46:04 oes
131 * - Enabled filtering banners by size rather than URL
132 * by adding patterns that replace all standard banner
133 * sizes with the "Junkbuster" gif to the re_filterfile
135 * - Enabled filtering WebBugs by providing a pattern
136 * which kills all 1x1 images
138 * - Added support for PCRE_UNGREEDY behaviour to pcrs,
139 * which is selected by the (nonstandard and therefore
140 * capital) letter 'U' in the option string.
141 * It causes the quantifiers to be ungreedy by default.
142 * Appending a ? turns back to greedy (!).
144 * - Added a new interceptor ijb-send-banner, which
145 * sends back the "Junkbuster" gif. Without imagelist or
146 * MSIE detection support, or if tinygif = 1, or the
147 * URL isn't recognized as an imageurl, a lame HTML
148 * explanation is sent instead.
150 * - Added new feature, which permits blocking remote
151 * script redirects and firing back a local redirect
153 * The feature is conditionally compiled, i.e. it
154 * can be disabled with --disable-fast-redirects,
155 * plus it must be activated by a "fast-redirects"
156 * line in the config file, has its own log level
157 * and of course wants to be displayed by show-proxy-args
158 * Note: Boy, all the #ifdefs in 1001 locations and
159 * all the fumbling with configure.in and acconfig.h
160 * were *way* more work than the feature itself :-(
162 * - Because a generic redirect template was needed for
163 * this, tinygif = 3 now uses the same.
165 * - Moved GIFs, and other static HTTP response templates
170 * - Removed some >400 CRs again (Jon, you really worked
173 * Revision 1.1.1.1 2001/05/15 13:59:02 oes
174 * Initial import of version 2.9.3 source tree
177 *********************************************************************/
/*
 * Version string for the matching header. PCRS_H_VERSION is not defined in
 * this listing (presumably it comes from pcrs.h; verify).
 */
183 const char pcrs_h_rcs[] = PCRS_H_VERSION;
186 /*********************************************************************
188 * Function : my_strsep
190 * Description : Convenience function. It acts like strsep, except that
191 * it respects quoting of the delimiter character with the
192 * quote character. (And, of course, quoting the quote char
193 * with itself.) Called from `pcrs_make_job'.
196 * 1 : token = current token
197 * 2 : text = string to tokenize
198 * 3 : delimiter = single character delimiter
199 * 4 : quote_char = character to cause quoting
201 * Returns : -1 => failure, else the length of the token found.
202 * In the latter case, *text is the token's start.
204 *********************************************************************/
/*
 * Tokenizer helper: walks the string at *text and copies characters into
 * token. No buffer size is passed in, so token must be sized by the caller
 * for the longest token that can occur.
 * NOTE(review): this listing skips source lines (see the embedded line
 * numbers), so the branch bodies, any terminator write and the return
 * statements are not visible here.
 */
205 int my_strsep(char *token, char **text, char delimiter, char quote_char)
207 int i, k=0, limit, quoted = FALSE;
/* The scan is bounded by the length of the input remaining in *text. */
209 limit = strlen(*text);
217 for (i=0; i < limit; i++)
/* Unquoted delimiter found (the branch body is not visible in this listing). */
219 if (text[0][i] == delimiter && !quoted)
/*
 * quote_char directly in front of a delimiter. The i+1 < limit test keeps
 * the one-character look-ahead inside the string.
 */
224 else if (text[0][i] == quote_char && !quoted && i+1 < limit && text[0][i+1] == delimiter)
/* Append the current character to the caller-provided token buffer. */
229 token[k++] = text[0][i];
239 /*********************************************************************
241 * Function : pcrs_compile_perl_options
243 * Description : This function parses a string containing the options to
244 * Perl's s/// operator. It returns an integer that is the
245 * pcre equivalent of the symbolic optstring.
246 * Since pcre doesn't know about Perl's 'g' (global) option,
247 * but pcrs needs it, the globalflag integer is set if 'g'
251 * 1 : optstring = string with options in perl syntax
252 * 2 : globalflag = see description
254 * Returns : option integer suitable for pcre
256 *********************************************************************/
/*
 * Translates each option letter in optstring into the matching PCRE_* flag,
 * OR-ing the flags into rc. 'g' is not a pcre flag and is reported through
 * *globalflag instead.
 * NOTE(review): the visible lines only ever set *globalflag to 1; whether it
 * is cleared first is not visible in this listing, so callers should not
 * rely on it being initialised here without checking the full source.
 */
257 int pcrs_compile_perl_options(char *optstring, int *globalflag)
/* NOTE(review): strlen(optstring) is re-evaluated on every iteration. */
262 for (i=0; i < strlen(optstring); i++)
267 case 'g': *globalflag = 1; break;
268 case 'i': rc |= PCRE_CASELESS; break;
269 case 'm': rc |= PCRE_MULTILINE; break;
/* (original line 270 is missing from this listing) */
271 case 's': rc |= PCRE_DOTALL; break;
272 case 'x': rc |= PCRE_EXTENDED; break;
/* 'U' is the non-Perl extension described in the revision 1.2 log above. */
273 case 'U': rc |= PCRE_UNGREEDY; break;
282 /*********************************************************************
284 * Function : pcrs_compile_replacement
286 * Description : This function takes a Perl-style replacement (2nd argument
287 * to the s/// operator) and returns a compiled pcrs_substitute,
288 * or NULL if memory allocation for the substitute structure
292 * 1 : replacement = replacement part of s/// operator
294 * 2 : errptr = pointer to an integer in which error
295 * conditions can be returned.
297 * Returns : pcrs_substitute data structure, or NULL if an
298 * error is encountered. In that case, *errptr has
301 *********************************************************************/
/*
 * Compiles a replacement string into a pcrs_substitute: the literal text is
 * copied into r-side buffer `text`, and each $N / $& reference closes the
 * current literal block (block_offset/block_length) and records the
 * referenced group in backref[] and backref_count[].
 * NOTE(review): lines are missing from this listing; in particular the
 * handling of `quoted`, the increments of `l`, the stores into r->text and
 * r->backrefs, and the return statement are not visible.
 */
302 pcrs_substitute *pcrs_compile_replacement(char *replacement, int *errptr)
304 int length, i, k = 0, l = 0, quoted = 0, idx;
305 char *text, *num_ptr, *numbers = "0123456789";
308 r = (pcrs_substitute *)malloc(sizeof(pcrs_substitute));
/*
 * NOTE(review): this failure path returns without writing *errptr, unlike
 * the strdup failure below which reports PCRS_ERR_NOMEM.
 */
309 if (r == NULL) return NULL;
/* Zeroing makes every offset, length, backref and count start at 0. */
310 memset(r, '\0', sizeof(pcrs_substitute));
/* Working buffer with the same capacity as the input string. */
312 text = strdup(replacement); /* must be free()d by caller */
315 *errptr = PCRS_ERR_NOMEM;
320 length = strlen(replacement);
322 for (i=0; i < length; i++)
324 /* Backslash treatment */
325 if (replacement[i] == '\\')
329 text[k++] = replacement[i];
339 /* Dollar treatment */
/* A '$' that is the very last character can never start a reference. */
340 if (replacement[i] == '$' && !quoted && i < length - 1)
/* '$' followed by anything other than a digit or '&' is a literal '$'. */
342 if (strchr("0123456789&", replacement[i + 1]) == NULL)
344 text[k++] = replacement[i];
/* A reference starts here: close the literal block collected so far. */
348 r->block_length[l] = k - r->block_offset[l];
350 if (replacement[i + 1] != '&')
/*
 * strchr() is evaluated before the i < length test. When ++i reaches
 * length, replacement[i] is the terminating '\0', which strchr() also
 * "finds" (at the terminator of numbers), so it is the i < length test that
 * ends the loop at end of string.
 */
352 while ((num_ptr = strchr(numbers, replacement[++i])) != NULL && i < length)
/* The digit's position in "0123456789" is its numeric value. */
354 idx = num_ptr - numbers;
355 r->backref[l] = r->backref[l] * 10 + idx;
/*
 * Only in-range group numbers are counted.
 * NOTE(review): `l` itself indexes backref[], block_offset[] and
 * block_length[]; no bound on `l` is visible in this listing and the array
 * sizes are declared elsewhere - confirm that many references cannot
 * overrun them.
 */
361 if (r->backref[l] < PCRS_MAX_SUBMATCHES)
362 r->backref_count[r->backref[l]] += 1;
/* The next literal block begins at the current write position. */
364 r->block_offset[l] = k;
369 /* Plain char treatment */
370 text[k++] = replacement[i];
/* Close the trailing literal block (the text after the last reference). */
377 r->block_length[l] = k - r->block_offset[l];
383 /*********************************************************************
385 * Function : pcrs_free_job
387 * Description : Frees the memory used by a pcrs_job struct and its
388 * dependent structures. Returns a pointer to the next
389 * job, if there was any, or NULL otherwise.
392 * 1 : job = pointer to the pcrs_job structure to be freed
394 * Returns : a pointer to the next job, if there was any, or
397 *********************************************************************/
/*
 * Releases the compiled pattern, the study hints and the substitute
 * (including its text buffer) that hang off the job.
 * NOTE(review): original lines 399-408 and everything after 414 are missing
 * from this listing, so any NULL check on `job`, the saving of the next
 * pointer, the free of the job itself and the return are not visible.
 */
398 pcrs_job *pcrs_free_job(pcrs_job *job)
/*
 * NOTE(review): pattern and hints are produced by pcre_compile()/pcre_study()
 * in pcrs_make_job; PCRE pairs its allocations with pcre_free, which matches
 * plain free() only while the pcre_malloc/pcre_free hooks are left at their
 * defaults - confirm. The NULL guards are redundant (free(NULL) is a no-op)
 * but harmless.
 */
409 if (job->pattern != NULL) free(job->pattern);
410 if (job->hints != NULL) free(job->hints);
411 if (job->substitute != NULL)
/* The text buffer must go first; it is only reachable through substitute. */
413 if (job->substitute->text != NULL) free(job->substitute->text);
414 free(job->substitute);
423 /*********************************************************************
425 * Function : pcrs_make_job
427 * Description : Main entry point. Takes a string with a Perl-style
428 * s/// command and returns a corresponding pcrs_job,
429 * or NULL if compiling the job fails at any stage.
430 * Diagnostics could obviously be improved.
433 * 1 : command = string with perl-style s/// command
434 * 2 : errptr = pointer to an integer in which error
435 * conditions can be returned.
437 * Returns : a corresponding pcrs_job data structure, or NULL
438 * if an error was encountered. In that case, *errptr
441 *********************************************************************/
/*
 * Builds a pcrs_job from a Perl-style "s<d>pattern<d>substitute<d>options"
 * command: the command is split with my_strsep(), the substitute and the
 * options are compiled as their tokens arrive, and the pattern is compiled
 * last because it needs the options.
 * NOTE(review): this listing skips lines; the switch on the token index, the
 * increments of `i`, the return statements and any free() of token/dummy are
 * not visible. Verify in the full source that token and dummy are released
 * on every early-return path below.
 */
442 pcrs_job *pcrs_make_job(char *command, int *errptr)
444 char *dummy, *token, delimiter;
446 int i = 0, globalflag;
449 /* Get and init memory */
450 if ((newjob = (pcrs_job *)malloc(sizeof(pcrs_job))) == NULL)
452 *errptr = PCRS_ERR_NOMEM;
/* Zeroed so that pcrs_free_job() sees NULL members on a half-built job. */
455 memset(newjob, '\0', sizeof(pcrs_job));
457 /* Command too short? */
/* "s///" - command letter plus three delimiters - is the 4-char minimum. */
458 if (strlen(command) < 4)
460 *errptr = PCRS_ERR_CMDSYNTAX;
461 pcrs_free_job(newjob);
465 /* Split command into tokens and handle them */
/* The delimiter is whatever character follows the command letter. */
466 delimiter = command[1];
/*
 * NOTE(review): both buffers are strlen(command) bytes, i.e. without an
 * extra byte for a terminator, and neither result is checked for NULL in
 * the visible lines. This is only safe if every token is strictly shorter
 * than the whole command - confirm against my_strsep's full body.
 */
467 token = (char *)malloc(strlen(command)); /* current token */
468 dummy = (char *)malloc(strlen(command)); /* must store pattern, since we can't */
469 /* use it until the options are known */
/* my_strsep() is documented above to return -1 on failure, ending the loop. */
470 while (my_strsep(token, &command, delimiter, '\\') >= 0)
474 /* We don't care about the command and assume 's' */
/* token is reused for the next field, so the pattern is parked in dummy. */
480 strcpy(dummy, token);
485 if ((newjob->substitute = pcrs_compile_replacement(token, errptr)) == NULL)
487 pcrs_free_job(newjob);
497 newjob->options = pcrs_compile_perl_options(token, &globalflag);
498 newjob->globalflag = globalflag;
501 /* There shouldn't be anything else! */
503 *errptr = PCRS_ERR_CMDSYNTAX;
504 pcrs_free_job(newjob);
511 /* We have a valid substitute? */
512 if (newjob->substitute == NULL)
514 *errptr = PCRS_ERR_CMDSYNTAX;
515 pcrs_free_job(newjob);
519 /* Compile the pattern */
/*
 * NOTE(review): errptr is passed as pcre_compile()'s error-offset argument,
 * so on this failure path *errptr holds an offset into the pattern rather
 * than a PCRS_ERR_* code - confirm callers expect that.
 */
520 newjob->pattern = pcre_compile(dummy, newjob->options, &error, errptr, NULL);
521 if (newjob->pattern == NULL)
523 pcrs_free_job(newjob);
529 * Generate hints. This has little overhead, since the
530 * hints will be NULL for a boring pattern anyway.
532 newjob->hints = pcre_study(newjob->pattern, 0, &error);
535 *errptr = PCRS_ERR_STUDY;
536 pcrs_free_job(newjob);
545 /*********************************************************************
547 * Function : create_pcrs_job
549 * Description : Create a job from all its components, if you don't
550 * have a Perl command to start from. Rather boring.
553 * 1 : pattern = pointer to pcre pattern
554 * 2 : hints = pointer to pcre hints
555 * 3 : options = options in pcre format
556 * 4 : globalflag = flag that indicates if global matching is desired
557 * 5 : substitute = pointer to pcrs_substitute data structure
558 * 6 : errptr = pointer to an integer in which error
559 * conditions can be returned.
561 * Returns : pcrs_job structure, or NULL if an error was encountered.
562 * In that case, *errptr has the reason why.
564 *********************************************************************/
/*
 * Allocates a zeroed pcrs_job and stores the supplied components in it.
 * The pattern, hints and substitute pointers are stored as given, not
 * copied; pcrs_free_job() frees those members, so a job built here takes
 * over ownership of the objects passed in once it is handed to
 * pcrs_free_job().
 * NOTE(review): the return statements are not visible in this listing.
 */
565 pcrs_job *create_pcrs_job(pcre *pattern, pcre_extra *hints, int options, int globalflag, pcrs_substitute *substitute, int *errptr)
569 if ((newjob = (pcrs_job *)malloc(sizeof(pcrs_job))) == NULL)
571 *errptr = PCRS_ERR_NOMEM;
/* Members not assigned below stay zero. */
574 memset(newjob, '\0', sizeof(pcrs_job));
576 newjob->pattern = pattern;
577 newjob->hints = hints;
578 newjob->options = options;
579 newjob->globalflag = globalflag;
580 newjob->substitute = substitute;
587 /*********************************************************************
589 * Function : pcrs_exec_substitution
591 * Description : Modify the subject by executing the regular substitution
592 * defined by the job. Since the result may be longer than
593 * the subject, its space requirements are precalculated in
594 * the matching phase and new memory is allocated accordingly.
595 * It is the caller's responsibility to free the result when
596 * it's no longer needed.
598 * FIXME: MUST HANDLE SUBJECTS THAT ARE LONGER THAN subject_length
602 * 1 : job = the pcrs_job to be executed
603 * 2 : subject = the subject (== original) string
604 * 3 : subject_length = the subject's length
605 * 4 : result = char** for returning the result
606 * 5 : result_length = int* for returning the result's length
608 * Returns : the number of substitutions that were made. May be > 1
609 * if job->globalflag was set
611 *********************************************************************/
/*
 * Two-pass substitution. Pass 1 runs pcre_exec() repeatedly, records every
 * match in matches[] and accumulates the size of the result in newsize.
 * Pass 2 allocates the result and assembles it from the unmatched chunks of
 * the subject, the literal blocks of the substitute and the referenced
 * submatches.
 * NOTE(review): this listing skips lines; the advance of `offset` between
 * matches, the assignment of matches_found, the reset of `offset` before
 * pass 2 and the memcpy call that copies a submatch are not visible.
 */
612 int pcrs_exec_substitution(pcrs_job *job, char *subject, int subject_length, char **result, int *result_length)
614 int offsets[3 * PCRS_MAX_SUBMATCHES],
615 offset = 0, i=0, k, matches_found, newsize, submatches;
616 pcrs_match matches[PCRS_MAX_MATCHES];
621 if (job == NULL || job->pattern == NULL || job->substitute == NULL)
624 return(PCRS_ERR_BADJOB);
/* Start from the subject's size; each match below adjusts it. */
627 newsize=subject_length;
/*
 * NOTE(review): the vector is declared with 3 * PCRS_MAX_SUBMATCHES ints but
 * pcre_exec() is told its size is 99. The two agree only if
 * PCRS_MAX_SUBMATCHES is 33 (defined outside this listing); a larger
 * literal than the array would let pcre write past it. Also, a return of 0
 * (vector too small for all groups) ends the loop through the `> 0` test
 * without being reported.
 */
631 while ((submatches = pcre_exec(job->pattern, job->hints, subject, subject_length, offset, 0, offsets, 99)) > 0)
633 matches[i].submatches = submatches;
/* pcre stores group k as the pair offsets[2k] (start), offsets[2k+1] (end). */
634 for (k=0; k < submatches; k++)
636 matches[i].submatch_offset[k] = offsets[2 * k];
637 matches[i].submatch_length[k] = offsets[2 * k + 1] - offsets[2 * k]; /* Non-found optional submatches have length -1-(-1)==0 */
638 newsize += matches[i].submatch_length[k] * job->substitute->backref_count[k]; /* reserve mem for each submatch as often as it is ref'd */
/* strlen() requires substitute->text to be NUL-terminated. */
640 newsize += strlen(job->substitute->text) - matches[i].submatch_length[0]; /* plus replacement text size minus match text size */
642 /* Non-global search or limit reached? */
/*
 * Once matches[] is full the search stops; any further occurrences stay
 * unreplaced in the tail that is copied at the end.
 */
643 if (++i >= PCRS_MAX_MATCHES || !job->globalflag ) break;
645 /* Don't loop on empty matches */
646 if (offsets[1] == offset)
647 if (offset < subject_length)
651 /* Go find the next one */
/* -1 is pcre's ordinary "no match" result; only lower values are errors. */
655 if (submatches < -1) return submatches; /* Pass pcre error through */
/*
 * NOTE(review): exactly newsize bytes are allocated, with no extra byte for
 * a terminator, and no terminator is written in the visible lines, so
 * callers must rely on *result_length rather than treat *result as a C
 * string - confirm.
 */
660 if ((*result = (char *)malloc(newsize)) == NULL) /* must be free()d by caller */
662 return PCRS_ERR_NOMEM;
/* result_offset is the write cursor into the result buffer. */
668 result_offset = *result;
670 for (i=0; i < matches_found; i++)
672 memcpy(result_offset, subject + offset, matches[i].submatch_offset[0] - offset); /* copy the chunk preceding the match */
673 result_offset += matches[i].submatch_offset[0] - offset;
675 /* For every segment of the substitute.. */
/* There is one more literal block than there are backreferences, hence <=. */
676 for (k=0; k <= job->substitute->backrefs; k++)
678 /* ...copy its text.. */
679 memcpy(result_offset, job->substitute->text + job->substitute->block_offset[k], job->substitute->block_length[k]);
680 result_offset += job->substitute->block_length[k];
682 /* ..plus, if it's not the last chunk (i.e.: There IS a backref).. */
/*
 * NOTE(review): && evaluates left to right, so submatch_length[] is indexed
 * with backref[k] before the range test on backref[k] runs. The range test
 * also uses <= whereas pcrs_compile_replacement counts references with
 * < PCRS_MAX_SUBMATCHES. In addition, matches[] is not zeroed in the
 * visible lines, so a reference to a group above this match's submatch
 * count may read an unset length - confirm.
 */
683 if (k != job->substitute->backrefs
684 /* ..and a nonempty match.. */
685 && matches[i].submatch_length[job->substitute->backref[k]] > 0
686 /* ..and in legal range, ... */
687 && job->substitute->backref[k] <= PCRS_MAX_SUBMATCHES)
689 /* copy the submatch that is ref'd. */
692 subject + matches[i].submatch_offset[job->substitute->backref[k]],
693 matches[i].submatch_length[job->substitute->backref[k]]
695 result_offset += matches[i].submatch_length[job->substitute->backref[k]];
/* Continue reading the subject right after the text that was matched. */
698 offset = matches[i].submatch_offset[0] + matches[i].submatch_length[0];
/* Copy whatever follows the last recorded match. */
702 memcpy(result_offset, subject + offset, subject_length - offset);
704 *result_length = newsize;
705 return matches_found;