PostgreSQL Source Code: src/backend/utils/adt/regexp.c Source File

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * regexp.c
00004  *    Postgres' interface to the regular expression package.
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  * Portions Copyright (c) 1994, Regents of the University of California
00008  *
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/utils/adt/regexp.c
00012  *
00013  *      Alistair Crooks added the code for the regex caching
00014  *      agc - cached the regular expressions used - there's a good chance
00015  *      that we'll get a hit, so this saves a compile step for every
00016  *      attempted match. I haven't actually measured the speed improvement,
00017  *      but it `looks' a lot quicker visually when watching regression
00018  *      test output.
00019  *
00020  *      agc - incorporated Keith Bostic's Berkeley regex code into
00021  *      the tree for all ports. To distinguish this regex code from any that
00022  *      is existent on a platform, I've prepended the string "pg_" to
00023  *      the functions regcomp, regerror, regexec and regfree.
00024  *      Fixed a bug that was originally a typo by me, where `i' was used
00025  *      instead of `oldest' when compiling regular expressions - benign
00026  *      results mostly, although occasionally it bit you...
00027  *
00028  *-------------------------------------------------------------------------
00029  */
00030 #include "postgres.h"
00031 
00032 #include "catalog/pg_type.h"
00033 #include "funcapi.h"
00034 #include "regex/regex.h"
00035 #include "utils/array.h"
00036 #include "utils/builtins.h"
00037 
/*
 * Fetch argument number _n with PG_GETARG_TEXT_PP when the call was made
 * with more than _n arguments; otherwise evaluate to NULL.
 */
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
    ((_n) < PG_NARGS() ? PG_GETARG_TEXT_PP(_n) : NULL)
00040 
00041 
/*
 * Parsed form of the option string accepted by the regex functions;
 * filled in by parse_re_flags().
 */
typedef struct pg_re_flags
{
    /* compile flags to hand to Spencer's regex code */
    int         cflags;

    /* true when the 'g' option was given: act on every occurrence */
    bool        glob;
} pg_re_flags;
00048 
00049 /* cross-call state for regexp_matches(), also regexp_split() */
00050 typedef struct regexp_matches_ctx
00051 {
00052     text       *orig_str;       /* data string in original TEXT form */
00053     int         nmatches;       /* number of places where pattern matched */
00054     int         npatterns;      /* number of capturing subpatterns */
00055     /* We store start char index and end+1 char index for each match */
00056     /* so the number of entries in match_locs is nmatches * npatterns * 2 */
00057     int        *match_locs;     /* 0-based character indexes */
00058     int         next_match;     /* 0-based index of next match to process */
00059     /* workspace for build_regexp_matches_result() */
00060     Datum      *elems;          /* has npatterns elements */
00061     bool       *nulls;          /* has npatterns elements */
00062 } regexp_matches_ctx;
00063 
00064 /*
00065  * We cache precompiled regular expressions using a "self organizing list"
00066  * structure, in which recently-used items tend to be near the front.
00067  * Whenever we use an entry, it's moved up to the front of the list.
00068  * Over time, an item's average position corresponds to its frequency of use.
00069  *
00070  * When we first create an entry, it's inserted at the front of
00071  * the array, dropping the entry at the end of the array if necessary to
00072  * make room.  (This might seem to be weighting the new entry too heavily,
00073  * but if we insert new entries further back, we'll be unable to adjust to
00074  * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
00075  * never-before-seen items used circularly.  We ought to be able to handle
00076  * that case, so we have to insert at the front.)
00077  *
00078  * Knuth mentions a variant strategy in which a used item is moved up just
00079  * one place in the list.  Although he says this uses fewer comparisons on
00080  * average, it seems not to adapt very well to the situation where you have
00081  * both some reusable patterns and a steady stream of non-reusable patterns.
00082  * A reusable pattern that isn't used at least as often as non-reusable
00083  * patterns are seen will "fail to keep up" and will drop off the end of the
00084  * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
00085  * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
00086  */
00087 
/*
 * Upper bound on the number of compiled regular expressions kept in the
 * cache.  A definition supplied before this point takes precedence.
 */
#if !defined(MAX_CACHED_RES)
#define MAX_CACHED_RES  32
#endif
00092 
00093 /* this structure describes one cached regular expression */
00094 typedef struct cached_re_str
00095 {
00096     char       *cre_pat;        /* original RE (not null terminated!) */
00097     int         cre_pat_len;    /* length of original RE, in bytes */
00098     int         cre_flags;      /* compile flags: extended,icase etc */
00099     Oid         cre_collation;  /* collation to use */
00100     regex_t     cre_re;         /* the compiled regular expression */
00101 } cached_re_str;
00102 
00103 static int  num_res = 0;        /* # of cached re's */
00104 static cached_re_str re_array[MAX_CACHED_RES];  /* cached re's */
00105 
00106 
00107 /* Local functions */
00108 static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
00109                      text *flags,
00110                      Oid collation,
00111                      bool force_glob,
00112                      bool use_subpatterns,
00113                      bool ignore_degenerate);
00114 static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
00115 static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
00116 static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
00117 
00118 
00119 /*
00120  * RE_compile_and_cache - compile a RE, caching if possible
00121  *
00122  * Returns regex_t *
00123  *
00124  *  text_re --- the pattern, expressed as a TEXT object
00125  *  cflags --- compile options for the pattern
00126  *  collation --- collation to use for LC_CTYPE-dependent behavior
00127  *
00128  * Pattern is given in the database encoding.  We internally convert to
00129  * an array of pg_wchar, which is what Spencer's regex package wants.
00130  */
00131 static regex_t *
00132 RE_compile_and_cache(text *text_re, int cflags, Oid collation)
00133 {
00134     int         text_re_len = VARSIZE_ANY_EXHDR(text_re);
00135     char       *text_re_val = VARDATA_ANY(text_re);
00136     pg_wchar   *pattern;
00137     int         pattern_len;
00138     int         i;
00139     int         regcomp_result;
00140     cached_re_str re_temp;
00141     char        errMsg[100];
00142 
00143     /*
00144      * Look for a match among previously compiled REs.  Since the data
00145      * structure is self-organizing with most-used entries at the front, our
00146      * search strategy can just be to scan from the front.
00147      */
00148     for (i = 0; i < num_res; i++)
00149     {
00150         if (re_array[i].cre_pat_len == text_re_len &&
00151             re_array[i].cre_flags == cflags &&
00152             re_array[i].cre_collation == collation &&
00153             memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
00154         {
00155             /*
00156              * Found a match; move it to front if not there already.
00157              */
00158             if (i > 0)
00159             {
00160                 re_temp = re_array[i];
00161                 memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
00162                 re_array[0] = re_temp;
00163             }
00164 
00165             return &re_array[0].cre_re;
00166         }
00167     }
00168 
00169     /*
00170      * Couldn't find it, so try to compile the new RE.  To avoid leaking
00171      * resources on failure, we build into the re_temp local.
00172      */
00173 
00174     /* Convert pattern string to wide characters */
00175     pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
00176     pattern_len = pg_mb2wchar_with_len(text_re_val,
00177                                        pattern,
00178                                        text_re_len);
00179 
00180     regcomp_result = pg_regcomp(&re_temp.cre_re,
00181                                 pattern,
00182                                 pattern_len,
00183                                 cflags,
00184                                 collation);
00185 
00186     pfree(pattern);
00187 
00188     if (regcomp_result != REG_OKAY)
00189     {
00190         /* re didn't compile (no need for pg_regfree, if so) */
00191         pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
00192         ereport(ERROR,
00193                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
00194                  errmsg("invalid regular expression: %s", errMsg)));
00195     }
00196 
00197     /*
00198      * We use malloc/free for the cre_pat field because the storage has to
00199      * persist across transactions, and because we want to get control back on
00200      * out-of-memory.  The Max() is because some malloc implementations return
00201      * NULL for malloc(0).
00202      */
00203     re_temp.cre_pat = malloc(Max(text_re_len, 1));
00204     if (re_temp.cre_pat == NULL)
00205     {
00206         pg_regfree(&re_temp.cre_re);
00207         ereport(ERROR,
00208                 (errcode(ERRCODE_OUT_OF_MEMORY),
00209                  errmsg("out of memory")));
00210     }
00211     memcpy(re_temp.cre_pat, text_re_val, text_re_len);
00212     re_temp.cre_pat_len = text_re_len;
00213     re_temp.cre_flags = cflags;
00214     re_temp.cre_collation = collation;
00215 
00216     /*
00217      * Okay, we have a valid new item in re_temp; insert it into the storage
00218      * array.  Discard last entry if needed.
00219      */
00220     if (num_res >= MAX_CACHED_RES)
00221     {
00222         --num_res;
00223         Assert(num_res < MAX_CACHED_RES);
00224         pg_regfree(&re_array[num_res].cre_re);
00225         free(re_array[num_res].cre_pat);
00226     }
00227 
00228     if (num_res > 0)
00229         memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
00230 
00231     re_array[0] = re_temp;
00232     num_res++;
00233 
00234     return &re_array[0].cre_re;
00235 }
00236 
00237 /*
00238  * RE_wchar_execute - execute a RE on pg_wchar data
00239  *
00240  * Returns TRUE on match, FALSE on no match
00241  *
00242  *  re --- the compiled pattern as returned by RE_compile_and_cache
00243  *  data --- the data to match against (need not be null-terminated)
00244  *  data_len --- the length of the data string
00245  *  start_search -- the offset in the data to start searching
00246  *  nmatch, pmatch  --- optional return area for match details
00247  *
00248  * Data is given as array of pg_wchar which is what Spencer's regex package
00249  * wants.
00250  */
00251 static bool
00252 RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
00253                  int start_search, int nmatch, regmatch_t *pmatch)
00254 {
00255     int         regexec_result;
00256     char        errMsg[100];
00257 
00258     /* Perform RE match and return result */
00259     regexec_result = pg_regexec(re,
00260                                 data,
00261                                 data_len,
00262                                 start_search,
00263                                 NULL,   /* no details */
00264                                 nmatch,
00265                                 pmatch,
00266                                 0);
00267 
00268     if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
00269     {
00270         /* re failed??? */
00271         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
00272         ereport(ERROR,
00273                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
00274                  errmsg("regular expression failed: %s", errMsg)));
00275     }
00276 
00277     return (regexec_result == REG_OKAY);
00278 }
00279 
00280 /*
00281  * RE_execute - execute a RE
00282  *
00283  * Returns TRUE on match, FALSE on no match
00284  *
00285  *  re --- the compiled pattern as returned by RE_compile_and_cache
00286  *  dat --- the data to match against (need not be null-terminated)
00287  *  dat_len --- the length of the data string
00288  *  nmatch, pmatch  --- optional return area for match details
00289  *
00290  * Data is given in the database encoding.  We internally
00291  * convert to array of pg_wchar which is what Spencer's regex package wants.
00292  */
00293 static bool
00294 RE_execute(regex_t *re, char *dat, int dat_len,
00295            int nmatch, regmatch_t *pmatch)
00296 {
00297     pg_wchar   *data;
00298     int         data_len;
00299     bool        match;
00300 
00301     /* Convert data string to wide characters */
00302     data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
00303     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
00304 
00305     /* Perform RE match and return result */
00306     match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
00307 
00308     pfree(data);
00309     return match;
00310 }
00311 
00312 /*
00313  * RE_compile_and_execute - compile and execute a RE
00314  *
00315  * Returns TRUE on match, FALSE on no match
00316  *
00317  *  text_re --- the pattern, expressed as a TEXT object
00318  *  dat --- the data to match against (need not be null-terminated)
00319  *  dat_len --- the length of the data string
00320  *  cflags --- compile options for the pattern
00321  *  collation --- collation to use for LC_CTYPE-dependent behavior
00322  *  nmatch, pmatch  --- optional return area for match details
00323  *
00324  * Both pattern and data are given in the database encoding.  We internally
00325  * convert to array of pg_wchar which is what Spencer's regex package wants.
00326  */
00327 static bool
00328 RE_compile_and_execute(text *text_re, char *dat, int dat_len,
00329                        int cflags, Oid collation,
00330                        int nmatch, regmatch_t *pmatch)
00331 {
00332     regex_t    *re;
00333 
00334     /* Compile RE */
00335     re = RE_compile_and_cache(text_re, cflags, collation);
00336 
00337     return RE_execute(re, dat, dat_len, nmatch, pmatch);
00338 }
00339 
00340 
00341 /*
00342  * parse_re_flags - parse the options argument of regexp_matches and friends
00343  *
00344  *  flags --- output argument, filled with desired options
00345  *  opts --- TEXT object, or NULL for defaults
00346  *
00347  * This accepts all the options allowed by any of the callers; callers that
00348  * don't want some have to reject them after the fact.
00349  */
00350 static void
00351 parse_re_flags(pg_re_flags *flags, text *opts)
00352 {
00353     /* regex flavor is always folded into the compile flags */
00354     flags->cflags = REG_ADVANCED;
00355     flags->glob = false;
00356 
00357     if (opts)
00358     {
00359         char       *opt_p = VARDATA_ANY(opts);
00360         int         opt_len = VARSIZE_ANY_EXHDR(opts);
00361         int         i;
00362 
00363         for (i = 0; i < opt_len; i++)
00364         {
00365             switch (opt_p[i])
00366             {
00367                 case 'g':
00368                     flags->glob = true;
00369                     break;
00370                 case 'b':       /* BREs (but why???) */
00371                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
00372                     break;
00373                 case 'c':       /* case sensitive */
00374                     flags->cflags &= ~REG_ICASE;
00375                     break;
00376                 case 'e':       /* plain EREs */
00377                     flags->cflags |= REG_EXTENDED;
00378                     flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
00379                     break;
00380                 case 'i':       /* case insensitive */
00381                     flags->cflags |= REG_ICASE;
00382                     break;
00383                 case 'm':       /* Perloid synonym for n */
00384                 case 'n':       /* \n affects ^ $ . [^ */
00385                     flags->cflags |= REG_NEWLINE;
00386                     break;
00387                 case 'p':       /* ~Perl, \n affects . [^ */
00388                     flags->cflags |= REG_NLSTOP;
00389                     flags->cflags &= ~REG_NLANCH;
00390                     break;
00391                 case 'q':       /* literal string */
00392                     flags->cflags |= REG_QUOTE;
00393                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
00394                     break;
00395                 case 's':       /* single line, \n ordinary */
00396                     flags->cflags &= ~REG_NEWLINE;
00397                     break;
00398                 case 't':       /* tight syntax */
00399                     flags->cflags &= ~REG_EXPANDED;
00400                     break;
00401                 case 'w':       /* weird, \n affects ^ $ only */
00402                     flags->cflags &= ~REG_NLSTOP;
00403                     flags->cflags |= REG_NLANCH;
00404                     break;
00405                 case 'x':       /* expanded syntax */
00406                     flags->cflags |= REG_EXPANDED;
00407                     break;
00408                 default:
00409                     ereport(ERROR,
00410                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00411                              errmsg("invalid regexp option: \"%c\"",
00412                                     opt_p[i])));
00413                     break;
00414             }
00415         }
00416     }
00417 }
00418 
00419 
00420 /*
00421  *  interface routines called by the function manager
00422  */
00423 
00424 Datum
00425 nameregexeq(PG_FUNCTION_ARGS)
00426 {
00427     Name        n = PG_GETARG_NAME(0);
00428     text       *p = PG_GETARG_TEXT_PP(1);
00429 
00430     PG_RETURN_BOOL(RE_compile_and_execute(p,
00431                                           NameStr(*n),
00432                                           strlen(NameStr(*n)),
00433                                           REG_ADVANCED,
00434                                           PG_GET_COLLATION(),
00435                                           0, NULL));
00436 }
00437 
00438 Datum
00439 nameregexne(PG_FUNCTION_ARGS)
00440 {
00441     Name        n = PG_GETARG_NAME(0);
00442     text       *p = PG_GETARG_TEXT_PP(1);
00443 
00444     PG_RETURN_BOOL(!RE_compile_and_execute(p,
00445                                            NameStr(*n),
00446                                            strlen(NameStr(*n)),
00447                                            REG_ADVANCED,
00448                                            PG_GET_COLLATION(),
00449                                            0, NULL));
00450 }
00451 
00452 Datum
00453 textregexeq(PG_FUNCTION_ARGS)
00454 {
00455     text       *s = PG_GETARG_TEXT_PP(0);
00456     text       *p = PG_GETARG_TEXT_PP(1);
00457 
00458     PG_RETURN_BOOL(RE_compile_and_execute(p,
00459                                           VARDATA_ANY(s),
00460                                           VARSIZE_ANY_EXHDR(s),
00461                                           REG_ADVANCED,
00462                                           PG_GET_COLLATION(),
00463                                           0, NULL));
00464 }
00465 
00466 Datum
00467 textregexne(PG_FUNCTION_ARGS)
00468 {
00469     text       *s = PG_GETARG_TEXT_PP(0);
00470     text       *p = PG_GETARG_TEXT_PP(1);
00471 
00472     PG_RETURN_BOOL(!RE_compile_and_execute(p,
00473                                            VARDATA_ANY(s),
00474                                            VARSIZE_ANY_EXHDR(s),
00475                                            REG_ADVANCED,
00476                                            PG_GET_COLLATION(),
00477                                            0, NULL));
00478 }
00479 
00480 
00481 /*
00482  *  routines that use the regexp stuff, but ignore the case.
00483  *  for this, we use the REG_ICASE flag to pg_regcomp
00484  */
00485 
00486 
00487 Datum
00488 nameicregexeq(PG_FUNCTION_ARGS)
00489 {
00490     Name        n = PG_GETARG_NAME(0);
00491     text       *p = PG_GETARG_TEXT_PP(1);
00492 
00493     PG_RETURN_BOOL(RE_compile_and_execute(p,
00494                                           NameStr(*n),
00495                                           strlen(NameStr(*n)),
00496                                           REG_ADVANCED | REG_ICASE,
00497                                           PG_GET_COLLATION(),
00498                                           0, NULL));
00499 }
00500 
00501 Datum
00502 nameicregexne(PG_FUNCTION_ARGS)
00503 {
00504     Name        n = PG_GETARG_NAME(0);
00505     text       *p = PG_GETARG_TEXT_PP(1);
00506 
00507     PG_RETURN_BOOL(!RE_compile_and_execute(p,
00508                                            NameStr(*n),
00509                                            strlen(NameStr(*n)),
00510                                            REG_ADVANCED | REG_ICASE,
00511                                            PG_GET_COLLATION(),
00512                                            0, NULL));
00513 }
00514 
00515 Datum
00516 texticregexeq(PG_FUNCTION_ARGS)
00517 {
00518     text       *s = PG_GETARG_TEXT_PP(0);
00519     text       *p = PG_GETARG_TEXT_PP(1);
00520 
00521     PG_RETURN_BOOL(RE_compile_and_execute(p,
00522                                           VARDATA_ANY(s),
00523                                           VARSIZE_ANY_EXHDR(s),
00524                                           REG_ADVANCED | REG_ICASE,
00525                                           PG_GET_COLLATION(),
00526                                           0, NULL));
00527 }
00528 
00529 Datum
00530 texticregexne(PG_FUNCTION_ARGS)
00531 {
00532     text       *s = PG_GETARG_TEXT_PP(0);
00533     text       *p = PG_GETARG_TEXT_PP(1);
00534 
00535     PG_RETURN_BOOL(!RE_compile_and_execute(p,
00536                                            VARDATA_ANY(s),
00537                                            VARSIZE_ANY_EXHDR(s),
00538                                            REG_ADVANCED | REG_ICASE,
00539                                            PG_GET_COLLATION(),
00540                                            0, NULL));
00541 }
00542 
00543 
00544 /*
00545  * textregexsubstr()
00546  *      Return a substring matched by a regular expression.
00547  */
00548 Datum
00549 textregexsubstr(PG_FUNCTION_ARGS)
00550 {
00551     text       *s = PG_GETARG_TEXT_PP(0);
00552     text       *p = PG_GETARG_TEXT_PP(1);
00553     regex_t    *re;
00554     regmatch_t  pmatch[2];
00555     int         so,
00556                 eo;
00557 
00558     /* Compile RE */
00559     re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
00560 
00561     /*
00562      * We pass two regmatch_t structs to get info about the overall match and
00563      * the match for the first parenthesized subexpression (if any). If there
00564      * is a parenthesized subexpression, we return what it matched; else
00565      * return what the whole regexp matched.
00566      */
00567     if (!RE_execute(re,
00568                     VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
00569                     2, pmatch))
00570         PG_RETURN_NULL();       /* definitely no match */
00571 
00572     if (re->re_nsub > 0)
00573     {
00574         /* has parenthesized subexpressions, use the first one */
00575         so = pmatch[1].rm_so;
00576         eo = pmatch[1].rm_eo;
00577     }
00578     else
00579     {
00580         /* no parenthesized subexpression, use whole match */
00581         so = pmatch[0].rm_so;
00582         eo = pmatch[0].rm_eo;
00583     }
00584 
00585     /*
00586      * It is possible to have a match to the whole pattern but no match for a
00587      * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
00588      * there is no subexpression match.  So this extra test for match failure
00589      * is not redundant.
00590      */
00591     if (so < 0 || eo < 0)
00592         PG_RETURN_NULL();
00593 
00594     return DirectFunctionCall3(text_substr,
00595                                PointerGetDatum(s),
00596                                Int32GetDatum(so + 1),
00597                                Int32GetDatum(eo - so));
00598 }
00599 
00600 /*
00601  * textregexreplace_noopt()
00602  *      Return a string matched by a regular expression, with replacement.
00603  *
00604  * This version doesn't have an option argument: we default to case
00605  * sensitive match, replace the first instance only.
00606  */
00607 Datum
00608 textregexreplace_noopt(PG_FUNCTION_ARGS)
00609 {
00610     text       *s = PG_GETARG_TEXT_PP(0);
00611     text       *p = PG_GETARG_TEXT_PP(1);
00612     text       *r = PG_GETARG_TEXT_PP(2);
00613     regex_t    *re;
00614 
00615     re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
00616 
00617     PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
00618 }
00619 
00620 /*
00621  * textregexreplace()
00622  *      Return a string matched by a regular expression, with replacement.
00623  */
00624 Datum
00625 textregexreplace(PG_FUNCTION_ARGS)
00626 {
00627     text       *s = PG_GETARG_TEXT_PP(0);
00628     text       *p = PG_GETARG_TEXT_PP(1);
00629     text       *r = PG_GETARG_TEXT_PP(2);
00630     text       *opt = PG_GETARG_TEXT_PP(3);
00631     regex_t    *re;
00632     pg_re_flags flags;
00633 
00634     parse_re_flags(&flags, opt);
00635 
00636     re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
00637 
00638     PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
00639 }
00640 
00641 /*
00642  * similar_escape()
00643  * Convert a SQL:2008 regexp pattern to POSIX style, so it can be used by
00644  * our regexp engine.
00645  */
00646 Datum
00647 similar_escape(PG_FUNCTION_ARGS)
00648 {
00649     text       *pat_text;
00650     text       *esc_text;
00651     text       *result;
00652     char       *p,
00653                *e,
00654                *r;
00655     int         plen,
00656                 elen;
00657     bool        afterescape = false;
00658     bool        incharclass = false;
00659     int         nquotes = 0;
00660 
00661     /* This function is not strict, so must test explicitly */
00662     if (PG_ARGISNULL(0))
00663         PG_RETURN_NULL();
00664     pat_text = PG_GETARG_TEXT_PP(0);
00665     p = VARDATA_ANY(pat_text);
00666     plen = VARSIZE_ANY_EXHDR(pat_text);
00667     if (PG_ARGISNULL(1))
00668     {
00669         /* No ESCAPE clause provided; default to backslash as escape */
00670         e = "\\";
00671         elen = 1;
00672     }
00673     else
00674     {
00675         esc_text = PG_GETARG_TEXT_PP(1);
00676         e = VARDATA_ANY(esc_text);
00677         elen = VARSIZE_ANY_EXHDR(esc_text);
00678         if (elen == 0)
00679             e = NULL;           /* no escape character */
00680         else if (elen != 1)
00681             ereport(ERROR,
00682                     (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
00683                      errmsg("invalid escape string"),
00684                   errhint("Escape string must be empty or one character.")));
00685     }
00686 
00687     /*----------
00688      * We surround the transformed input string with
00689      *          ^(?: ... )$
00690      * which requires some explanation.  We need "^" and "$" to force
00691      * the pattern to match the entire input string as per SQL99 spec.
00692      * The "(?:" and ")" are a non-capturing set of parens; we have to have
00693      * parens in case the string contains "|", else the "^" and "$" will
00694      * be bound into the first and last alternatives which is not what we
00695      * want, and the parens must be non capturing because we don't want them
00696      * to count when selecting output for SUBSTRING.
00697      *----------
00698      */
00699 
00700     /*
00701      * We need room for the prefix/postfix plus as many as 3 output bytes per
00702      * input byte; since the input is at most 1GB this can't overflow
00703      */
00704     result = (text *) palloc(VARHDRSZ + 6 + 3 * plen);
00705     r = VARDATA(result);
00706 
00707     *r++ = '^';
00708     *r++ = '(';
00709     *r++ = '?';
00710     *r++ = ':';
00711 
00712     while (plen > 0)
00713     {
00714         char        pchar = *p;
00715 
00716         if (afterescape)
00717         {
00718             if (pchar == '"' && !incharclass)   /* for SUBSTRING patterns */
00719                 *r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
00720             else
00721             {
00722                 *r++ = '\\';
00723                 *r++ = pchar;
00724             }
00725             afterescape = false;
00726         }
00727         else if (e && pchar == *e)
00728         {
00729             /* SQL99 escape character; do not send to output */
00730             afterescape = true;
00731         }
00732         else if (incharclass)
00733         {
00734             if (pchar == '\\')
00735                 *r++ = '\\';
00736             *r++ = pchar;
00737             if (pchar == ']')
00738                 incharclass = false;
00739         }
00740         else if (pchar == '[')
00741         {
00742             *r++ = pchar;
00743             incharclass = true;
00744         }
00745         else if (pchar == '%')
00746         {
00747             *r++ = '.';
00748             *r++ = '*';
00749         }
00750         else if (pchar == '_')
00751             *r++ = '.';
00752         else if (pchar == '(')
00753         {
00754             /* convert to non-capturing parenthesis */
00755             *r++ = '(';
00756             *r++ = '?';
00757             *r++ = ':';
00758         }
00759         else if (pchar == '\\' || pchar == '.' ||
00760                  pchar == '^' || pchar == '$')
00761         {
00762             *r++ = '\\';
00763             *r++ = pchar;
00764         }
00765         else
00766             *r++ = pchar;
00767         p++, plen--;
00768     }
00769 
00770     *r++ = ')';
00771     *r++ = '$';
00772 
00773     SET_VARSIZE(result, r - ((char *) result));
00774 
00775     PG_RETURN_TEXT_P(result);
00776 }
00777 
00778 /*
00779  * regexp_matches()
00780  *      Return a table of matches of a pattern within a string.
00781  */
00782 Datum
00783 regexp_matches(PG_FUNCTION_ARGS)
00784 {
00785     FuncCallContext *funcctx;
00786     regexp_matches_ctx *matchctx;
00787 
00788     if (SRF_IS_FIRSTCALL())
00789     {
00790         text       *pattern = PG_GETARG_TEXT_PP(1);
00791         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
00792         MemoryContext oldcontext;
00793 
00794         funcctx = SRF_FIRSTCALL_INIT();
00795         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
00796 
00797         /* be sure to copy the input string into the multi-call ctx */
00798         matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
00799                                         flags,
00800                                         PG_GET_COLLATION(),
00801                                         false, true, false);
00802 
00803         /* Pre-create workspace that build_regexp_matches_result needs */
00804         matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
00805         matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
00806 
00807         MemoryContextSwitchTo(oldcontext);
00808         funcctx->user_fctx = (void *) matchctx;
00809     }
00810 
00811     funcctx = SRF_PERCALL_SETUP();
00812     matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
00813 
00814     if (matchctx->next_match < matchctx->nmatches)
00815     {
00816         ArrayType  *result_ary;
00817 
00818         result_ary = build_regexp_matches_result(matchctx);
00819         matchctx->next_match++;
00820         SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
00821     }
00822 
00823     /* release space in multi-call ctx to avoid intraquery memory leak */
00824     cleanup_regexp_matches(matchctx);
00825 
00826     SRF_RETURN_DONE(funcctx);
00827 }
00828 
00829 /* This is separate to keep the opr_sanity regression test from complaining */
00830 Datum
00831 regexp_matches_no_flags(PG_FUNCTION_ARGS)
00832 {
00833     return regexp_matches(fcinfo);
00834 }
00835 
00836 /*
00837  * setup_regexp_matches --- do the initial matching for regexp_matches()
00838  *      or regexp_split()
00839  *
00840  * To avoid having to re-find the compiled pattern on each call, we do
00841  * all the matching in one swoop.  The returned regexp_matches_ctx contains
00842  * the locations of all the substrings matching the pattern.
00843  *
00844  * The three bool parameters have only two patterns (one for each caller)
00845  * but it seems clearer to distinguish the functionality this way than to
00846  * key it all off one "is_split" flag.
00847  */
00848 static regexp_matches_ctx *
00849 setup_regexp_matches(text *orig_str, text *pattern, text *flags,
00850                      Oid collation,
00851                      bool force_glob, bool use_subpatterns,
00852                      bool ignore_degenerate)
00853 {
00854     regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
00855     int         orig_len;
00856     pg_wchar   *wide_str;
00857     int         wide_len;
00858     pg_re_flags re_flags;
00859     regex_t    *cpattern;
00860     regmatch_t *pmatch;
00861     int         pmatch_len;
00862     int         array_len;
00863     int         array_idx;
00864     int         prev_match_end;
00865     int         start_search;
00866 
00867     /* save original string --- we'll extract result substrings from it */
00868     matchctx->orig_str = orig_str;
00869 
00870     /* convert string to pg_wchar form for matching */
00871     orig_len = VARSIZE_ANY_EXHDR(orig_str);
00872     wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
00873     wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
00874 
00875     /* determine options */
00876     parse_re_flags(&re_flags, flags);
00877     if (force_glob)
00878     {
00879         /* user mustn't specify 'g' for regexp_split */
00880         if (re_flags.glob)
00881             ereport(ERROR,
00882                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00883                  errmsg("regexp_split does not support the global option")));
00884         /* but we find all the matches anyway */
00885         re_flags.glob = true;
00886     }
00887 
00888     /* set up the compiled pattern */
00889     cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation);
00890 
00891     /* do we want to remember subpatterns? */
00892     if (use_subpatterns && cpattern->re_nsub > 0)
00893     {
00894         matchctx->npatterns = cpattern->re_nsub;
00895         pmatch_len = cpattern->re_nsub + 1;
00896     }
00897     else
00898     {
00899         use_subpatterns = false;
00900         matchctx->npatterns = 1;
00901         pmatch_len = 1;
00902     }
00903 
00904     /* temporary output space for RE package */
00905     pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
00906 
00907     /* the real output space (grown dynamically if needed) */
00908     array_len = re_flags.glob ? 256 : 32;
00909     matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
00910     array_idx = 0;
00911 
00912     /* search for the pattern, perhaps repeatedly */
00913     prev_match_end = 0;
00914     start_search = 0;
00915     while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
00916                             pmatch_len, pmatch))
00917     {
00918         /*
00919          * If requested, ignore degenerate matches, which are zero-length
00920          * matches occurring at the start or end of a string or just after a
00921          * previous match.
00922          */
00923         if (!ignore_degenerate ||
00924             (pmatch[0].rm_so < wide_len &&
00925              pmatch[0].rm_eo > prev_match_end))
00926         {
00927             /* enlarge output space if needed */
00928             while (array_idx + matchctx->npatterns * 2 > array_len)
00929             {
00930                 array_len *= 2;
00931                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
00932                                                     sizeof(int) * array_len);
00933             }
00934 
00935             /* save this match's locations */
00936             if (use_subpatterns)
00937             {
00938                 int         i;
00939 
00940                 for (i = 1; i <= matchctx->npatterns; i++)
00941                 {
00942                     matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
00943                     matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
00944                 }
00945             }
00946             else
00947             {
00948                 matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
00949                 matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
00950             }
00951             matchctx->nmatches++;
00952         }
00953         prev_match_end = pmatch[0].rm_eo;
00954 
00955         /* if not glob, stop after one match */
00956         if (!re_flags.glob)
00957             break;
00958 
00959         /*
00960          * Advance search position.  Normally we start just after the end of
00961          * the previous match, but always advance at least one character (the
00962          * special case can occur if the pattern matches zero characters just
00963          * after the prior match or at the end of the string).
00964          */
00965         if (start_search < pmatch[0].rm_eo)
00966             start_search = pmatch[0].rm_eo;
00967         else
00968             start_search++;
00969         if (start_search > wide_len)
00970             break;
00971     }
00972 
00973     /* Clean up temp storage */
00974     pfree(wide_str);
00975     pfree(pmatch);
00976 
00977     return matchctx;
00978 }
00979 
00980 /*
00981  * cleanup_regexp_matches - release memory of a regexp_matches_ctx
00982  */
00983 static void
00984 cleanup_regexp_matches(regexp_matches_ctx *matchctx)
00985 {
00986     pfree(matchctx->orig_str);
00987     pfree(matchctx->match_locs);
00988     if (matchctx->elems)
00989         pfree(matchctx->elems);
00990     if (matchctx->nulls)
00991         pfree(matchctx->nulls);
00992     pfree(matchctx);
00993 }
00994 
00995 /*
00996  * build_regexp_matches_result - build output array for current match
00997  */
00998 static ArrayType *
00999 build_regexp_matches_result(regexp_matches_ctx *matchctx)
01000 {
01001     Datum      *elems = matchctx->elems;
01002     bool       *nulls = matchctx->nulls;
01003     int         dims[1];
01004     int         lbs[1];
01005     int         loc;
01006     int         i;
01007 
01008     /* Extract matching substrings from the original string */
01009     loc = matchctx->next_match * matchctx->npatterns * 2;
01010     for (i = 0; i < matchctx->npatterns; i++)
01011     {
01012         int         so = matchctx->match_locs[loc++];
01013         int         eo = matchctx->match_locs[loc++];
01014 
01015         if (so < 0 || eo < 0)
01016         {
01017             elems[i] = (Datum) 0;
01018             nulls[i] = true;
01019         }
01020         else
01021         {
01022             elems[i] = DirectFunctionCall3(text_substr,
01023                                          PointerGetDatum(matchctx->orig_str),
01024                                            Int32GetDatum(so + 1),
01025                                            Int32GetDatum(eo - so));
01026             nulls[i] = false;
01027         }
01028     }
01029 
01030     /* And form an array */
01031     dims[0] = matchctx->npatterns;
01032     lbs[0] = 1;
01033     /* XXX: this hardcodes assumptions about the text type */
01034     return construct_md_array(elems, nulls, 1, dims, lbs,
01035                               TEXTOID, -1, false, 'i');
01036 }
01037 
01038 /*
01039  * regexp_split_to_table()
01040  *      Split the string at matches of the pattern, returning the
01041  *      split-out substrings as a table.
01042  */
01043 Datum
01044 regexp_split_to_table(PG_FUNCTION_ARGS)
01045 {
01046     FuncCallContext *funcctx;
01047     regexp_matches_ctx *splitctx;
01048 
01049     if (SRF_IS_FIRSTCALL())
01050     {
01051         text       *pattern = PG_GETARG_TEXT_PP(1);
01052         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
01053         MemoryContext oldcontext;
01054 
01055         funcctx = SRF_FIRSTCALL_INIT();
01056         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
01057 
01058         /* be sure to copy the input string into the multi-call ctx */
01059         splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
01060                                         flags,
01061                                         PG_GET_COLLATION(),
01062                                         true, false, true);
01063 
01064         MemoryContextSwitchTo(oldcontext);
01065         funcctx->user_fctx = (void *) splitctx;
01066     }
01067 
01068     funcctx = SRF_PERCALL_SETUP();
01069     splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
01070 
01071     if (splitctx->next_match <= splitctx->nmatches)
01072     {
01073         Datum       result = build_regexp_split_result(splitctx);
01074 
01075         splitctx->next_match++;
01076         SRF_RETURN_NEXT(funcctx, result);
01077     }
01078 
01079     /* release space in multi-call ctx to avoid intraquery memory leak */
01080     cleanup_regexp_matches(splitctx);
01081 
01082     SRF_RETURN_DONE(funcctx);
01083 }
01084 
01085 /* This is separate to keep the opr_sanity regression test from complaining */
01086 Datum
01087 regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
01088 {
01089     return regexp_split_to_table(fcinfo);
01090 }
01091 
01092 /*
01093  * regexp_split_to_array()
01094  *      Split the string at matches of the pattern, returning the
01095  *      split-out substrings as an array.
01096  */
01097 Datum
01098 regexp_split_to_array(PG_FUNCTION_ARGS)
01099 {
01100     ArrayBuildState *astate = NULL;
01101     regexp_matches_ctx *splitctx;
01102 
01103     splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
01104                                     PG_GETARG_TEXT_PP(1),
01105                                     PG_GETARG_TEXT_PP_IF_EXISTS(2),
01106                                     PG_GET_COLLATION(),
01107                                     true, false, true);
01108 
01109     while (splitctx->next_match <= splitctx->nmatches)
01110     {
01111         astate = accumArrayResult(astate,
01112                                   build_regexp_split_result(splitctx),
01113                                   false,
01114                                   TEXTOID,
01115                                   CurrentMemoryContext);
01116         splitctx->next_match++;
01117     }
01118 
01119     /*
01120      * We don't call cleanup_regexp_matches here; it would try to pfree the
01121      * input string, which we didn't copy.  The space is not in a long-lived
01122      * memory context anyway.
01123      */
01124 
01125     PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
01126 }
01127 
01128 /* This is separate to keep the opr_sanity regression test from complaining */
01129 Datum
01130 regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
01131 {
01132     return regexp_split_to_array(fcinfo);
01133 }
01134 
01135 /*
01136  * build_regexp_split_result - build output string for current match
01137  *
01138  * We return the string between the current match and the previous one,
01139  * or the string after the last match when next_match == nmatches.
01140  */
01141 static Datum
01142 build_regexp_split_result(regexp_matches_ctx *splitctx)
01143 {
01144     int         startpos;
01145     int         endpos;
01146 
01147     if (splitctx->next_match > 0)
01148         startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
01149     else
01150         startpos = 0;
01151     if (startpos < 0)
01152         elog(ERROR, "invalid match ending position");
01153 
01154     if (splitctx->next_match < splitctx->nmatches)
01155     {
01156         endpos = splitctx->match_locs[splitctx->next_match * 2];
01157         if (endpos < startpos)
01158             elog(ERROR, "invalid match starting position");
01159         return DirectFunctionCall3(text_substr,
01160                                    PointerGetDatum(splitctx->orig_str),
01161                                    Int32GetDatum(startpos + 1),
01162                                    Int32GetDatum(endpos - startpos));
01163     }
01164     else
01165     {
01166         /* no more matches, return rest of string */
01167         return DirectFunctionCall2(text_substr_no_len,
01168                                    PointerGetDatum(splitctx->orig_str),
01169                                    Int32GetDatum(startpos + 1));
01170     }
01171 }
01172 
01173 /*
01174  * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
01175  *
01176  * The result is NULL if there is no fixed prefix, else a palloc'd string.
01177  * If it is an exact match, not just a prefix, *exact is returned as TRUE.
01178  */
01179 char *
01180 regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
01181                     bool *exact)
01182 {
01183     char       *result;
01184     regex_t    *re;
01185     int         cflags;
01186     int         re_result;
01187     pg_wchar   *str;
01188     size_t      slen;
01189     size_t      maxlen;
01190     char        errMsg[100];
01191 
01192     *exact = false;             /* default result */
01193 
01194     /* Compile RE */
01195     cflags = REG_ADVANCED;
01196     if (case_insensitive)
01197         cflags |= REG_ICASE;
01198 
01199     re = RE_compile_and_cache(text_re, cflags, collation);
01200 
01201     /* Examine it to see if there's a fixed prefix */
01202     re_result = pg_regprefix(re, &str, &slen);
01203 
01204     switch (re_result)
01205     {
01206         case REG_NOMATCH:
01207             return NULL;
01208 
01209         case REG_PREFIX:
01210             /* continue with wchar conversion */
01211             break;
01212 
01213         case REG_EXACT:
01214             *exact = true;
01215             /* continue with wchar conversion */
01216             break;
01217 
01218         default:
01219             /* re failed??? */
01220             pg_regerror(re_result, re, errMsg, sizeof(errMsg));
01221             ereport(ERROR,
01222                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
01223                      errmsg("regular expression failed: %s", errMsg)));
01224             break;
01225     }
01226 
01227     /* Convert pg_wchar result back to database encoding */
01228     maxlen = pg_database_encoding_max_length() * slen + 1;
01229     result = (char *) palloc(maxlen);
01230     slen = pg_wchar2mb_with_len(str, result, slen);
01231     Assert(slen < maxlen);
01232 
01233     free(str);
01234 
01235     return result;
01236 }
Header And Logo

regexp.c