#include "regex/regguts.h"
Go to the source code of this file.
Functions | |
static int | findprefix (struct cnfa *cnfa, struct colormap *cm, chr *string, size_t *slength) |
int | pg_regprefix (regex_t *re, chr **string, size_t *slength) |
static int findprefix | ( | struct cnfa * | cnfa, | |
struct colormap * | cm, | |||
chr * | string, | |||
size_t * | slength | |||
) | [static] |
Definition at line 112 of file regprefix.c.
References cnfa::bos, colormap::cd, carc::co, COLORLESS, cnfa::eos, colordesc::firstchr, colordesc::nchrs, cnfa::ncolors, cnfa::post, cnfa::pre, cnfa::states, and carc::to.
Referenced by pg_regprefix().
{ int st; int nextst; color thiscolor; chr c; struct carc *ca; /* * The "pre" state must have only BOS/BOL outarcs, else pattern isn't * anchored left. If we have both BOS and BOL, they must go to the * same next state. */ st = cnfa->pre; nextst = -1; for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) { if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) { if (nextst == -1) nextst = ca->to; else if (nextst != ca->to) return REG_NOMATCH; } else return REG_NOMATCH; } if (nextst == -1) return REG_NOMATCH; /* * Scan through successive states, stopping as soon as we find one with * more than one acceptable transition character (either multiple colors * on out-arcs, or a color with more than one member chr). * * We could find a state with multiple out-arcs that are all labeled with * the same singleton color; this comes from patterns like "^ab(cde|cxy)". * In that case we add the chr "c" to the output string but then exit the * loop with nextst == -1. This leaves a little bit on the table: if the * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added * to the prefix. But chasing multiple parallel state chains doesn't seem * worth the trouble. */ do { st = nextst; nextst = -1; thiscolor = COLORLESS; for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) { /* We ignore lookahead constraints */ if (ca->co >= cnfa->ncolors) continue; /* We can also ignore BOS/BOL arcs */ if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) continue; /* ... but EOS/EOL arcs terminate the search */ if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) { thiscolor = COLORLESS; break; } if (thiscolor == COLORLESS) { /* First plain outarc */ thiscolor = ca->co; nextst = ca->to; } else if (thiscolor == ca->co) { /* Another plain outarc for same color */ nextst = -1; } else { /* More than one plain outarc color terminates the search */ thiscolor = COLORLESS; break; } } /* Done if we didn't find exactly one color on plain outarcs */ if (thiscolor == COLORLESS) break; /* The color must be a singleton */ if (cm->cd[thiscolor].nchrs != 1) break; /* * Identify the color's sole member chr and add it to the prefix * string. In general the colormap data structure doesn't provide a * way to find color member chrs, except by trying GETCOLOR() on each * possible chr value, which won't do at all. However, for the cases * we care about it should be sufficient to test the "firstchr" value, * that is the first chr ever added to the color. There are cases * where this might no longer be a member of the color (so we do need * to test), but none of them are likely to arise for a character that * is a member of a common prefix. If we do hit such a corner case, * we just fall out without adding anything to the prefix string. */ c = cm->cd[thiscolor].firstchr; if (GETCOLOR(cm, c) != thiscolor) break; string[(*slength)++] = c; /* Advance to next state, but only if we have a unique next state */ } while (nextst != -1); /* * If we ended at a state that only has EOS/EOL outarcs leading to the * "post" state, then we have an exact-match string. Note this is true * even if the string is of zero length. */ nextst = -1; for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) { if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) { if (nextst == -1) nextst = ca->to; else if (nextst != ca->to) { nextst = -1; break; } } else { nextst = -1; break; } } if (nextst == cnfa->post) return REG_EXACT; /* * Otherwise, if we were unable to identify any prefix characters, say * NOMATCH --- the pattern is anchored left, but doesn't specify any * particular first character. */ if (*slength > 0) return REG_PREFIX; return REG_NOMATCH; }
Definition at line 46 of file regprefix.c.
References assert, guts::cmap, subre::cnfa, findprefix(), FREE, guts::info, MALLOC, cnfa::nstates, NULL, pg_set_regex_collation(), regex_t::re_collation, regex_t::re_csize, regex_t::re_guts, regex_t::re_magic, REG_EXACT, REG_PREFIX, REG_UIMPOSSIBLE, REMAGIC, and guts::tree.
Referenced by regexp_fixed_prefix().
{ struct guts *g; struct cnfa *cnfa; int st; /* sanity checks */ if (string == NULL || slength == NULL) return REG_INVARG; *string = NULL; /* initialize for failure cases */ *slength = 0; if (re == NULL || re->re_magic != REMAGIC) return REG_INVARG; if (re->re_csize != sizeof(chr)) return REG_MIXED; /* Initialize locale-dependent support */ pg_set_regex_collation(re->re_collation); /* setup */ g = (struct guts *) re->re_guts; if (g->info & REG_UIMPOSSIBLE) return REG_NOMATCH; /* * This implementation considers only the search NFA for the topmost regex * tree node. Therefore, constraints such as backrefs are not fully * applied, which is allowed per the function's API spec. */ assert(g->tree != NULL); cnfa = &g->tree->cnfa; /* * Since a correct NFA should never contain any exit-free loops, it should * not be possible for our traversal to return to a previously visited * NFA state. Hence we need at most nstates chrs in the output string. */ *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); if (*string == NULL) return REG_ESPACE; /* do it */ st = findprefix(cnfa, &g->cmap, *string, slength); assert(*slength <= cnfa->nstates); /* clean up */ if (st != REG_PREFIX && st != REG_EXACT) { FREE(*string); *string = NULL; *slength = 0; } return st; }