Header And Logo

PostgreSQL
| The world's most advanced open source database.

regprefix.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * regprefix.c
00004  *    Extract a common prefix, if any, from a compiled regex.
00005  *
00006  *
00007  * Portions Copyright (c) 2012-2013, PostgreSQL Global Development Group
00008  * Portions Copyright (c) 1998, 1999 Henry Spencer
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/regex/regprefix.c
00012  *
00013  *-------------------------------------------------------------------------
00014  */
00015 
00016 #include "regex/regguts.h"
00017 
00018 
00019 /*
00020  * forward declarations
00021  */
00022 static int findprefix(struct cnfa * cnfa, struct colormap * cm,
00023                       chr *string, size_t *slength);
00024 
00025 
00026 /*
00027  * pg_regprefix - get common prefix for regular expression
00028  *
00029  * Returns one of:
00030  *  REG_NOMATCH: there is no common prefix of strings matching the regex
00031  *  REG_PREFIX: there is a common prefix of strings matching the regex
00032  *  REG_EXACT: all strings satisfying the regex must match the same string
00033  *  or a REG_XXX error code
00034  *
00035  * In the non-failure cases, *string is set to a malloc'd string containing
00036  * the common prefix or exact value, of length *slength (measured in chrs
00037  * not bytes!).
00038  *
00039  * This function does not analyze all complex cases (such as lookahead
00040  * constraints) exactly.  Therefore it is possible that some strings matching
00041  * the reported prefix or exact-match string do not satisfy the regex.  But
00042  * it should never be the case that a string satisfying the regex does not
00043  * match the reported prefix or exact-match string.
00044  */
00045 int
00046 pg_regprefix(regex_t *re,
00047              chr **string,
00048              size_t *slength)
00049 {
00050     struct guts *g;
00051     struct cnfa *cnfa;
00052     int         st;
00053 
00054     /* sanity checks */
00055     if (string == NULL || slength == NULL)
00056         return REG_INVARG;
00057     *string = NULL;             /* initialize for failure cases */
00058     *slength = 0;
00059     if (re == NULL || re->re_magic != REMAGIC)
00060         return REG_INVARG;
00061     if (re->re_csize != sizeof(chr))
00062         return REG_MIXED;
00063 
00064     /* Initialize locale-dependent support */
00065     pg_set_regex_collation(re->re_collation);
00066 
00067     /* setup */
00068     g = (struct guts *) re->re_guts;
00069     if (g->info & REG_UIMPOSSIBLE)
00070         return REG_NOMATCH;
00071 
00072     /*
00073      * This implementation considers only the search NFA for the topmost regex
00074      * tree node.  Therefore, constraints such as backrefs are not fully
00075      * applied, which is allowed per the function's API spec.
00076      */
00077     assert(g->tree != NULL);
00078     cnfa = &g->tree->cnfa;
00079 
00080     /*
00081      * Since a correct NFA should never contain any exit-free loops, it should
00082      * not be possible for our traversal to return to a previously visited
00083      * NFA state.  Hence we need at most nstates chrs in the output string.
00084      */
00085     *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
00086     if (*string == NULL)
00087         return REG_ESPACE;
00088 
00089     /* do it */
00090     st = findprefix(cnfa, &g->cmap, *string, slength);
00091 
00092     assert(*slength <= cnfa->nstates);
00093 
00094     /* clean up */
00095     if (st != REG_PREFIX && st != REG_EXACT)
00096     {
00097         FREE(*string);
00098         *string = NULL;
00099         *slength = 0;
00100     }
00101 
00102     return st;
00103 }
00104 
00105 /*
00106  * findprefix - extract common prefix from cNFA
00107  *
00108  * Results are returned into the preallocated chr array string[], with
00109  * *slength (which must be preset to zero) incremented for each chr.
00110  */
00111 static int                      /* regprefix return code */
00112 findprefix(struct cnfa * cnfa,
00113            struct colormap * cm,
00114            chr *string,
00115            size_t *slength)
00116 {
00117     int         st;
00118     int         nextst;
00119     color       thiscolor;
00120     chr         c;
00121     struct carc *ca;
00122 
00123     /*
00124      * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
00125      * anchored left.  If we have both BOS and BOL, they must go to the
00126      * same next state.
00127      */
00128     st = cnfa->pre;
00129     nextst = -1;
00130     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
00131     {
00132         if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
00133         {
00134             if (nextst == -1)
00135                 nextst = ca->to;
00136             else if (nextst != ca->to)
00137                 return REG_NOMATCH;
00138         }
00139         else
00140             return REG_NOMATCH;
00141     }
00142     if (nextst == -1)
00143         return REG_NOMATCH;
00144 
00145     /*
00146      * Scan through successive states, stopping as soon as we find one with
00147      * more than one acceptable transition character (either multiple colors
00148      * on out-arcs, or a color with more than one member chr).
00149      *
00150      * We could find a state with multiple out-arcs that are all labeled with
00151      * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
00152      * In that case we add the chr "c" to the output string but then exit the
00153      * loop with nextst == -1.  This leaves a little bit on the table: if the
00154      * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
00155      * to the prefix.  But chasing multiple parallel state chains doesn't seem
00156      * worth the trouble.
00157      */
00158     do
00159     {
00160         st = nextst;
00161         nextst = -1;
00162         thiscolor = COLORLESS;
00163         for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
00164         {
00165             /* We ignore lookahead constraints */
00166             if (ca->co >= cnfa->ncolors)
00167                 continue;
00168             /* We can also ignore BOS/BOL arcs */
00169             if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
00170                 continue;
00171             /* ... but EOS/EOL arcs terminate the search */
00172             if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
00173             {
00174                 thiscolor = COLORLESS;
00175                 break;
00176             }
00177             if (thiscolor == COLORLESS)
00178             {
00179                 /* First plain outarc */
00180                 thiscolor = ca->co;
00181                 nextst = ca->to;
00182             }
00183             else if (thiscolor == ca->co)
00184             {
00185                 /* Another plain outarc for same color */
00186                 nextst = -1;
00187             }
00188             else
00189             {
00190                 /* More than one plain outarc color terminates the search */
00191                 thiscolor = COLORLESS;
00192                 break;
00193             }
00194         }
00195         /* Done if we didn't find exactly one color on plain outarcs */
00196         if (thiscolor == COLORLESS)
00197             break;
00198         /* The color must be a singleton */
00199         if (cm->cd[thiscolor].nchrs != 1)
00200             break;
00201 
00202         /*
00203          * Identify the color's sole member chr and add it to the prefix
00204          * string.  In general the colormap data structure doesn't provide a
00205          * way to find color member chrs, except by trying GETCOLOR() on each
00206          * possible chr value, which won't do at all.  However, for the cases
00207          * we care about it should be sufficient to test the "firstchr" value,
00208          * that is the first chr ever added to the color.  There are cases
00209          * where this might no longer be a member of the color (so we do need
00210          * to test), but none of them are likely to arise for a character that
00211          * is a member of a common prefix.  If we do hit such a corner case,
00212          * we just fall out without adding anything to the prefix string.
00213          */
00214         c = cm->cd[thiscolor].firstchr;
00215         if (GETCOLOR(cm, c) != thiscolor)
00216             break;
00217 
00218         string[(*slength)++] = c;
00219 
00220         /* Advance to next state, but only if we have a unique next state */
00221     } while (nextst != -1);
00222 
00223     /*
00224      * If we ended at a state that only has EOS/EOL outarcs leading to the
00225      * "post" state, then we have an exact-match string.  Note this is true
00226      * even if the string is of zero length.
00227      */
00228     nextst = -1;
00229     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
00230     {
00231         if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
00232         {
00233             if (nextst == -1)
00234                 nextst = ca->to;
00235             else if (nextst != ca->to)
00236             {
00237                 nextst = -1;
00238                 break;
00239             }
00240         }
00241         else
00242         {
00243             nextst = -1;
00244             break;
00245         }
00246     }
00247     if (nextst == cnfa->post)
00248         return REG_EXACT;
00249 
00250     /*
00251      * Otherwise, if we were unable to identify any prefix characters, say
00252      * NOMATCH --- the pattern is anchored left, but doesn't specify any
00253      * particular first character.
00254      */
00255     if (*slength > 0)
00256         return REG_PREFIX;
00257 
00258     return REG_NOMATCH;
00259 }