#include "regex/regguts.h"
Go to the source code of this file.
Functions | |
| static int | findprefix (struct cnfa *cnfa, struct colormap *cm, chr *string, size_t *slength) |
| int | pg_regprefix (regex_t *re, chr **string, size_t *slength) |
| static int findprefix | ( | struct cnfa * | cnfa, | |
| struct colormap * | cm, | |||
| chr * | string, | |||
| size_t * | slength | |||
| ) | [static] |
Definition at line 112 of file regprefix.c.
References cnfa::bos, colormap::cd, carc::co, COLORLESS, cnfa::eos, colordesc::firstchr, colordesc::nchrs, cnfa::ncolors, cnfa::post, cnfa::pre, cnfa::states, and carc::to.
Referenced by pg_regprefix().
{
int st;
int nextst;
color thiscolor;
chr c;
struct carc *ca;
/*
* The "pre" state must have only BOS/BOL outarcs, else pattern isn't
* anchored left. If we have both BOS and BOL, they must go to the
* same next state.
*/
st = cnfa->pre;
nextst = -1;
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
{
if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
{
if (nextst == -1)
nextst = ca->to;
else if (nextst != ca->to)
return REG_NOMATCH;
}
else
return REG_NOMATCH;
}
if (nextst == -1)
return REG_NOMATCH;
/*
* Scan through successive states, stopping as soon as we find one with
* more than one acceptable transition character (either multiple colors
* on out-arcs, or a color with more than one member chr).
*
* We could find a state with multiple out-arcs that are all labeled with
* the same singleton color; this comes from patterns like "^ab(cde|cxy)".
* In that case we add the chr "c" to the output string but then exit the
* loop with nextst == -1. This leaves a little bit on the table: if the
* pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
* to the prefix. But chasing multiple parallel state chains doesn't seem
* worth the trouble.
*/
do
{
st = nextst;
nextst = -1;
thiscolor = COLORLESS;
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
{
/* We ignore lookahead constraints */
if (ca->co >= cnfa->ncolors)
continue;
/* We can also ignore BOS/BOL arcs */
if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
continue;
/* ... but EOS/EOL arcs terminate the search */
if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
{
thiscolor = COLORLESS;
break;
}
if (thiscolor == COLORLESS)
{
/* First plain outarc */
thiscolor = ca->co;
nextst = ca->to;
}
else if (thiscolor == ca->co)
{
/* Another plain outarc for same color */
nextst = -1;
}
else
{
/* More than one plain outarc color terminates the search */
thiscolor = COLORLESS;
break;
}
}
/* Done if we didn't find exactly one color on plain outarcs */
if (thiscolor == COLORLESS)
break;
/* The color must be a singleton */
if (cm->cd[thiscolor].nchrs != 1)
break;
/*
* Identify the color's sole member chr and add it to the prefix
* string. In general the colormap data structure doesn't provide a
* way to find color member chrs, except by trying GETCOLOR() on each
* possible chr value, which won't do at all. However, for the cases
* we care about it should be sufficient to test the "firstchr" value,
* that is the first chr ever added to the color. There are cases
* where this might no longer be a member of the color (so we do need
* to test), but none of them are likely to arise for a character that
* is a member of a common prefix. If we do hit such a corner case,
* we just fall out without adding anything to the prefix string.
*/
c = cm->cd[thiscolor].firstchr;
if (GETCOLOR(cm, c) != thiscolor)
break;
string[(*slength)++] = c;
/* Advance to next state, but only if we have a unique next state */
} while (nextst != -1);
/*
* If we ended at a state that only has EOS/EOL outarcs leading to the
* "post" state, then we have an exact-match string. Note this is true
* even if the string is of zero length.
*/
nextst = -1;
for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
{
if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
{
if (nextst == -1)
nextst = ca->to;
else if (nextst != ca->to)
{
nextst = -1;
break;
}
}
else
{
nextst = -1;
break;
}
}
if (nextst == cnfa->post)
return REG_EXACT;
/*
* Otherwise, if we were unable to identify any prefix characters, say
* NOMATCH --- the pattern is anchored left, but doesn't specify any
* particular first character.
*/
if (*slength > 0)
return REG_PREFIX;
return REG_NOMATCH;
}
Definition at line 46 of file regprefix.c.
References assert, guts::cmap, subre::cnfa, findprefix(), FREE, guts::info, MALLOC, cnfa::nstates, NULL, pg_set_regex_collation(), regex_t::re_collation, regex_t::re_csize, regex_t::re_guts, regex_t::re_magic, REG_EXACT, REG_PREFIX, REG_UIMPOSSIBLE, REMAGIC, and guts::tree.
Referenced by regexp_fixed_prefix().
{
struct guts *g;
struct cnfa *cnfa;
int st;
/* sanity checks */
if (string == NULL || slength == NULL)
return REG_INVARG;
*string = NULL; /* initialize for failure cases */
*slength = 0;
if (re == NULL || re->re_magic != REMAGIC)
return REG_INVARG;
if (re->re_csize != sizeof(chr))
return REG_MIXED;
/* Initialize locale-dependent support */
pg_set_regex_collation(re->re_collation);
/* setup */
g = (struct guts *) re->re_guts;
if (g->info & REG_UIMPOSSIBLE)
return REG_NOMATCH;
/*
* This implementation considers only the search NFA for the topmost regex
* tree node. Therefore, constraints such as backrefs are not fully
* applied, which is allowed per the function's API spec.
*/
assert(g->tree != NULL);
cnfa = &g->tree->cnfa;
/*
* Since a correct NFA should never contain any exit-free loops, it should
* not be possible for our traversal to return to a previously visited
* NFA state. Hence we need at most nstates chrs in the output string.
*/
*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
if (*string == NULL)
return REG_ESPACE;
/* do it */
st = findprefix(cnfa, &g->cmap, *string, slength);
assert(*slength <= cnfa->nstates);
/* clean up */
if (st != REG_PREFIX && st != REG_EXACT)
{
FREE(*string);
*string = NULL;
*slength = 0;
}
return st;
}
1.7.1