00001 /*------------------------------------------------------------------------- 00002 * 00003 * regprefix.c 00004 * Extract a common prefix, if any, from a compiled regex. 00005 * 00006 * 00007 * Portions Copyright (c) 2012-2013, PostgreSQL Global Development Group 00008 * Portions Copyright (c) 1998, 1999 Henry Spencer 00009 * 00010 * IDENTIFICATION 00011 * src/backend/regex/regprefix.c 00012 * 00013 *------------------------------------------------------------------------- 00014 */ 00015 00016 #include "regex/regguts.h" 00017 00018 00019 /* 00020 * forward declarations 00021 */ 00022 static int findprefix(struct cnfa * cnfa, struct colormap * cm, 00023 chr *string, size_t *slength); 00024 00025 00026 /* 00027 * pg_regprefix - get common prefix for regular expression 00028 * 00029 * Returns one of: 00030 * REG_NOMATCH: there is no common prefix of strings matching the regex 00031 * REG_PREFIX: there is a common prefix of strings matching the regex 00032 * REG_EXACT: all strings satisfying the regex must match the same string 00033 * or a REG_XXX error code 00034 * 00035 * In the non-failure cases, *string is set to a malloc'd string containing 00036 * the common prefix or exact value, of length *slength (measured in chrs 00037 * not bytes!). 00038 * 00039 * This function does not analyze all complex cases (such as lookahead 00040 * constraints) exactly. Therefore it is possible that some strings matching 00041 * the reported prefix or exact-match string do not satisfy the regex. But 00042 * it should never be the case that a string satisfying the regex does not 00043 * match the reported prefix or exact-match string. 00044 */ 00045 int 00046 pg_regprefix(regex_t *re, 00047 chr **string, 00048 size_t *slength) 00049 { 00050 struct guts *g; 00051 struct cnfa *cnfa; 00052 int st; 00053 00054 /* sanity checks */ 00055 if (string == NULL || slength == NULL) 00056 return REG_INVARG; 00057 *string = NULL; /* initialize for failure cases */ 00058 *slength = 0; 00059 if (re == NULL || re->re_magic != REMAGIC) 00060 return REG_INVARG; 00061 if (re->re_csize != sizeof(chr)) 00062 return REG_MIXED; 00063 00064 /* Initialize locale-dependent support */ 00065 pg_set_regex_collation(re->re_collation); 00066 00067 /* setup */ 00068 g = (struct guts *) re->re_guts; 00069 if (g->info & REG_UIMPOSSIBLE) 00070 return REG_NOMATCH; 00071 00072 /* 00073 * This implementation considers only the search NFA for the topmost regex 00074 * tree node. Therefore, constraints such as backrefs are not fully 00075 * applied, which is allowed per the function's API spec. 00076 */ 00077 assert(g->tree != NULL); 00078 cnfa = &g->tree->cnfa; 00079 00080 /* 00081 * Since a correct NFA should never contain any exit-free loops, it should 00082 * not be possible for our traversal to return to a previously visited 00083 * NFA state. Hence we need at most nstates chrs in the output string. 00084 */ 00085 *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); 00086 if (*string == NULL) 00087 return REG_ESPACE; 00088 00089 /* do it */ 00090 st = findprefix(cnfa, &g->cmap, *string, slength); 00091 00092 assert(*slength <= cnfa->nstates); 00093 00094 /* clean up */ 00095 if (st != REG_PREFIX && st != REG_EXACT) 00096 { 00097 FREE(*string); 00098 *string = NULL; 00099 *slength = 0; 00100 } 00101 00102 return st; 00103 } 00104 00105 /* 00106 * findprefix - extract common prefix from cNFA 00107 * 00108 * Results are returned into the preallocated chr array string[], with 00109 * *slength (which must be preset to zero) incremented for each chr. 00110 */ 00111 static int /* regprefix return code */ 00112 findprefix(struct cnfa * cnfa, 00113 struct colormap * cm, 00114 chr *string, 00115 size_t *slength) 00116 { 00117 int st; 00118 int nextst; 00119 color thiscolor; 00120 chr c; 00121 struct carc *ca; 00122 00123 /* 00124 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't 00125 * anchored left. If we have both BOS and BOL, they must go to the 00126 * same next state. 00127 */ 00128 st = cnfa->pre; 00129 nextst = -1; 00130 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) 00131 { 00132 if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) 00133 { 00134 if (nextst == -1) 00135 nextst = ca->to; 00136 else if (nextst != ca->to) 00137 return REG_NOMATCH; 00138 } 00139 else 00140 return REG_NOMATCH; 00141 } 00142 if (nextst == -1) 00143 return REG_NOMATCH; 00144 00145 /* 00146 * Scan through successive states, stopping as soon as we find one with 00147 * more than one acceptable transition character (either multiple colors 00148 * on out-arcs, or a color with more than one member chr). 00149 * 00150 * We could find a state with multiple out-arcs that are all labeled with 00151 * the same singleton color; this comes from patterns like "^ab(cde|cxy)". 00152 * In that case we add the chr "c" to the output string but then exit the 00153 * loop with nextst == -1. This leaves a little bit on the table: if the 00154 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added 00155 * to the prefix. But chasing multiple parallel state chains doesn't seem 00156 * worth the trouble. 00157 */ 00158 do 00159 { 00160 st = nextst; 00161 nextst = -1; 00162 thiscolor = COLORLESS; 00163 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) 00164 { 00165 /* We ignore lookahead constraints */ 00166 if (ca->co >= cnfa->ncolors) 00167 continue; 00168 /* We can also ignore BOS/BOL arcs */ 00169 if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) 00170 continue; 00171 /* ... but EOS/EOL arcs terminate the search */ 00172 if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) 00173 { 00174 thiscolor = COLORLESS; 00175 break; 00176 } 00177 if (thiscolor == COLORLESS) 00178 { 00179 /* First plain outarc */ 00180 thiscolor = ca->co; 00181 nextst = ca->to; 00182 } 00183 else if (thiscolor == ca->co) 00184 { 00185 /* Another plain outarc for same color */ 00186 nextst = -1; 00187 } 00188 else 00189 { 00190 /* More than one plain outarc color terminates the search */ 00191 thiscolor = COLORLESS; 00192 break; 00193 } 00194 } 00195 /* Done if we didn't find exactly one color on plain outarcs */ 00196 if (thiscolor == COLORLESS) 00197 break; 00198 /* The color must be a singleton */ 00199 if (cm->cd[thiscolor].nchrs != 1) 00200 break; 00201 00202 /* 00203 * Identify the color's sole member chr and add it to the prefix 00204 * string. In general the colormap data structure doesn't provide a 00205 * way to find color member chrs, except by trying GETCOLOR() on each 00206 * possible chr value, which won't do at all. However, for the cases 00207 * we care about it should be sufficient to test the "firstchr" value, 00208 * that is the first chr ever added to the color. There are cases 00209 * where this might no longer be a member of the color (so we do need 00210 * to test), but none of them are likely to arise for a character that 00211 * is a member of a common prefix. If we do hit such a corner case, 00212 * we just fall out without adding anything to the prefix string. 00213 */ 00214 c = cm->cd[thiscolor].firstchr; 00215 if (GETCOLOR(cm, c) != thiscolor) 00216 break; 00217 00218 string[(*slength)++] = c; 00219 00220 /* Advance to next state, but only if we have a unique next state */ 00221 } while (nextst != -1); 00222 00223 /* 00224 * If we ended at a state that only has EOS/EOL outarcs leading to the 00225 * "post" state, then we have an exact-match string. Note this is true 00226 * even if the string is of zero length. 00227 */ 00228 nextst = -1; 00229 for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) 00230 { 00231 if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) 00232 { 00233 if (nextst == -1) 00234 nextst = ca->to; 00235 else if (nextst != ca->to) 00236 { 00237 nextst = -1; 00238 break; 00239 } 00240 } 00241 else 00242 { 00243 nextst = -1; 00244 break; 00245 } 00246 } 00247 if (nextst == cnfa->post) 00248 return REG_EXACT; 00249 00250 /* 00251 * Otherwise, if we were unable to identify any prefix characters, say 00252 * NOMATCH --- the pattern is anchored left, but doesn't specify any 00253 * particular first character. 00254 */ 00255 if (*slength > 0) 00256 return REG_PREFIX; 00257 00258 return REG_NOMATCH; 00259 }