Header And Logo

PostgreSQL
| The world's most advanced open source database.

regexec.c

Go to the documentation of this file.
00001 /*
00002  * re_*exec and friends - match REs
00003  *
00004  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
00005  *
00006  * Development of this software was funded, in part, by Cray Research Inc.,
00007  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
00008  * Corporation, none of whom are responsible for the results.  The author
00009  * thanks all of them.
00010  *
00011  * Redistribution and use in source and binary forms -- with or without
00012  * modification -- are permitted for any purpose, provided that
00013  * redistributions in source form retain this entire copyright notice and
00014  * indicate the origin and nature of any modifications.
00015  *
00016  * I'd appreciate being given credit for this package in the documentation
00017  * of software which uses it, but that is not a requirement.
00018  *
00019  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
00020  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
00021  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
00022  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00023  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00024  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00025  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00026  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00027  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00028  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  *
00030  * src/backend/regex/regexec.c
00031  *
00032  */
00033 
00034 #include "regex/regguts.h"
00035 
00036 
00037 
00038 /* lazy-DFA representation */
00039 struct arcp
00040 {                               /* "pointer" to an outarc */
00041     struct sset *ss;
00042     color       co;
00043 };
00044 
00045 struct sset
00046 {                               /* state set */
00047     unsigned   *states;         /* pointer to bitvector */
00048     unsigned    hash;           /* hash of bitvector */
00049 #define  HASH(bv, nw)    (((nw) == 1) ? *(bv) : hash(bv, nw))
00050 #define  HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
00051         memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
00052     int         flags;
00053 #define  STARTER     01         /* the initial state set */
00054 #define  POSTSTATE   02         /* includes the goal state */
00055 #define  LOCKED      04         /* locked in cache */
00056 #define  NOPROGRESS  010        /* zero-progress state set */
00057     struct arcp ins;            /* chain of inarcs pointing here */
00058     chr        *lastseen;       /* last entered on arrival here */
00059     struct sset **outs;         /* outarc vector indexed by color */
00060     struct arcp *inchain;       /* chain-pointer vector for outarcs */
00061 };
00062 
00063 struct dfa
00064 {
00065     int         nssets;         /* size of cache */
00066     int         nssused;        /* how many entries occupied yet */
00067     int         nstates;        /* number of states */
00068     int         ncolors;        /* length of outarc and inchain vectors */
00069     int         wordsper;       /* length of state-set bitvectors */
00070     struct sset *ssets;         /* state-set cache */
00071     unsigned   *statesarea;     /* bitvector storage */
00072     unsigned   *work;           /* pointer to work area within statesarea */
00073     struct sset **outsarea;     /* outarc-vector storage */
00074     struct arcp *incarea;       /* inchain storage */
00075     struct cnfa *cnfa;
00076     struct colormap *cm;
00077     chr        *lastpost;       /* location of last cache-flushed success */
00078     chr        *lastnopr;       /* location of last cache-flushed NOPROGRESS */
00079     struct sset *search;        /* replacement-search-pointer memory */
00080     int         cptsmalloced;   /* were the areas individually malloced? */
00081     char       *mallocarea;     /* self, or master malloced area, or NULL */
00082 };
00083 
00084 #define WORK    1               /* number of work bitvectors needed */
00085 
00086 /* setup for non-malloc allocation for small cases */
00087 #define FEWSTATES   20          /* must be less than UBITS */
00088 #define FEWCOLORS   15
00089 struct smalldfa
00090 {
00091     struct dfa  dfa;
00092     struct sset ssets[FEWSTATES * 2];
00093     unsigned    statesarea[FEWSTATES * 2 + WORK];
00094     struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
00095     struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
00096 };
00097 
00098 #define DOMALLOC    ((struct smalldfa *)NULL)   /* force malloc */
00099 
00100 
00101 
00102 /* internal variables, bundled for easy passing around */
00103 struct vars
00104 {
00105     regex_t    *re;
00106     struct guts *g;
00107     int         eflags;         /* copies of arguments */
00108     size_t      nmatch;
00109     regmatch_t *pmatch;
00110     rm_detail_t *details;
00111     chr        *start;          /* start of string */
00112     chr        *search_start;   /* search start of string */
00113     chr        *stop;           /* just past end of string */
00114     int         err;            /* error code if any (0 none) */
00115     struct dfa **subdfas;       /* per-subre DFAs */
00116     struct smalldfa dfa1;
00117     struct smalldfa dfa2;
00118 };
00119 
00120 #define VISERR(vv)  ((vv)->err != 0)    /* have we seen an error yet? */
00121 #define ISERR() VISERR(v)
00122 #define VERR(vv,e)  ((vv)->err = ((vv)->err ? (vv)->err : (e)))
00123 #define ERR(e)  VERR(v, e)      /* record an error */
00124 #define NOERR() {if (ISERR()) return v->err;}   /* if error seen, return it */
00125 #define OFF(p)  ((p) - v->start)
00126 #define LOFF(p) ((long)OFF(p))
00127 
00128 
00129 
00130 /*
00131  * forward declarations
00132  */
00133 /* === regexec.c === */
00134 static struct dfa *getsubdfa(struct vars *, struct subre *);
00135 static int  find(struct vars *, struct cnfa *, struct colormap *);
00136 static int  cfind(struct vars *, struct cnfa *, struct colormap *);
00137 static int  cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
00138 static void zapallsubs(regmatch_t *, size_t);
00139 static void zaptreesubs(struct vars *, struct subre *);
00140 static void subset(struct vars *, struct subre *, chr *, chr *);
00141 static int  cdissect(struct vars *, struct subre *, chr *, chr *);
00142 static int  ccondissect(struct vars *, struct subre *, chr *, chr *);
00143 static int  crevcondissect(struct vars *, struct subre *, chr *, chr *);
00144 static int  cbrdissect(struct vars *, struct subre *, chr *, chr *);
00145 static int  caltdissect(struct vars *, struct subre *, chr *, chr *);
00146 static int  citerdissect(struct vars *, struct subre *, chr *, chr *);
00147 static int  creviterdissect(struct vars *, struct subre *, chr *, chr *);
00148 
00149 /* === rege_dfa.c === */
00150 static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
00151 static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
00152 static chr *lastcold(struct vars *, struct dfa *);
00153 static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
00154 static void freedfa(struct dfa *);
00155 static unsigned hash(unsigned *, int);
00156 static struct sset *initialize(struct vars *, struct dfa *, chr *);
00157 static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *);
00158 static int  lacon(struct vars *, struct cnfa *, chr *, pcolor);
00159 static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
00160 static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
00161 
00162 
00163 /*
00164  * pg_regexec - match regular expression
00165  */
00166 int
00167 pg_regexec(regex_t *re,
00168            const chr *string,
00169            size_t len,
00170            size_t search_start,
00171            rm_detail_t *details,
00172            size_t nmatch,
00173            regmatch_t pmatch[],
00174            int flags)
00175 {
00176     struct vars var;
00177     register struct vars *v = &var;
00178     int         st;
00179     size_t      n;
00180     size_t      i;
00181     int         backref;
00182 
00183 #define  LOCALMAT    20
00184     regmatch_t  mat[LOCALMAT];
00185 
00186 #define  LOCALDFAS   40
00187     struct dfa *subdfas[LOCALDFAS];
00188 
00189     /* sanity checks */
00190     if (re == NULL || string == NULL || re->re_magic != REMAGIC)
00191         return REG_INVARG;
00192     if (re->re_csize != sizeof(chr))
00193         return REG_MIXED;
00194 
00195     /* Initialize locale-dependent support */
00196     pg_set_regex_collation(re->re_collation);
00197 
00198     /* setup */
00199     v->re = re;
00200     v->g = (struct guts *) re->re_guts;
00201     if ((v->g->cflags & REG_EXPECT) && details == NULL)
00202         return REG_INVARG;
00203     if (v->g->info & REG_UIMPOSSIBLE)
00204         return REG_NOMATCH;
00205     backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
00206     v->eflags = flags;
00207     if (v->g->cflags & REG_NOSUB)
00208         nmatch = 0;             /* override client */
00209     v->nmatch = nmatch;
00210     if (backref)
00211     {
00212         /* need work area */
00213         if (v->g->nsub + 1 <= LOCALMAT)
00214             v->pmatch = mat;
00215         else
00216             v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) *
00217                                               sizeof(regmatch_t));
00218         if (v->pmatch == NULL)
00219             return REG_ESPACE;
00220         v->nmatch = v->g->nsub + 1;
00221     }
00222     else
00223         v->pmatch = pmatch;
00224     v->details = details;
00225     v->start = (chr *) string;
00226     v->search_start = (chr *) string + search_start;
00227     v->stop = (chr *) string + len;
00228     v->err = 0;
00229     assert(v->g->ntree >= 0);
00230     n = (size_t) v->g->ntree;
00231     if (n <= LOCALDFAS)
00232         v->subdfas = subdfas;
00233     else
00234         v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
00235     if (v->subdfas == NULL)
00236     {
00237         if (v->pmatch != pmatch && v->pmatch != mat)
00238             FREE(v->pmatch);
00239         return REG_ESPACE;
00240     }
00241     for (i = 0; i < n; i++)
00242         v->subdfas[i] = NULL;
00243 
00244     /* do it */
00245     assert(v->g->tree != NULL);
00246     if (backref)
00247         st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
00248     else
00249         st = find(v, &v->g->tree->cnfa, &v->g->cmap);
00250 
00251     /* copy (portion of) match vector over if necessary */
00252     if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
00253     {
00254         zapallsubs(pmatch, nmatch);
00255         n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
00256         memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
00257     }
00258 
00259     /* clean up */
00260     if (v->pmatch != pmatch && v->pmatch != mat)
00261         FREE(v->pmatch);
00262     for (i = 0; i < n; i++)
00263     {
00264         if (v->subdfas[i] != NULL)
00265             freedfa(v->subdfas[i]);
00266     }
00267     if (v->subdfas != subdfas)
00268         FREE(v->subdfas);
00269 
00270     return st;
00271 }
00272 
00273 /*
00274  * getsubdfa - create or re-fetch the DFA for a subre node
00275  *
00276  * We only need to create the DFA once per overall regex execution.
00277  * The DFA will be freed by the cleanup step in pg_regexec().
00278  */
00279 static struct dfa *
00280 getsubdfa(struct vars * v,
00281           struct subre * t)
00282 {
00283     if (v->subdfas[t->id] == NULL)
00284     {
00285         v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
00286         if (ISERR())
00287             return NULL;
00288     }
00289     return v->subdfas[t->id];
00290 }
00291 
00292 /*
00293  * find - find a match for the main NFA (no-complications case)
00294  */
00295 static int
00296 find(struct vars * v,
00297      struct cnfa * cnfa,
00298      struct colormap * cm)
00299 {
00300     struct dfa *s;
00301     struct dfa *d;
00302     chr        *begin;
00303     chr        *end = NULL;
00304     chr        *cold;
00305     chr        *open;           /* open and close of range of possible starts */
00306     chr        *close;
00307     int         hitend;
00308     int         shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
00309 
00310     /* first, a shot with the search RE */
00311     s = newdfa(v, &v->g->search, cm, &v->dfa1);
00312     assert(!(ISERR() && s != NULL));
00313     NOERR();
00314     MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
00315     cold = NULL;
00316     close = shortest(v, s, v->search_start, v->search_start, v->stop,
00317                      &cold, (int *) NULL);
00318     freedfa(s);
00319     NOERR();
00320     if (v->g->cflags & REG_EXPECT)
00321     {
00322         assert(v->details != NULL);
00323         if (cold != NULL)
00324             v->details->rm_extend.rm_so = OFF(cold);
00325         else
00326             v->details->rm_extend.rm_so = OFF(v->stop);
00327         v->details->rm_extend.rm_eo = OFF(v->stop);     /* unknown */
00328     }
00329     if (close == NULL)          /* not found */
00330         return REG_NOMATCH;
00331     if (v->nmatch == 0)         /* found, don't need exact location */
00332         return REG_OKAY;
00333 
00334     /* find starting point and match */
00335     assert(cold != NULL);
00336     open = cold;
00337     cold = NULL;
00338     MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
00339     d = newdfa(v, cnfa, cm, &v->dfa1);
00340     assert(!(ISERR() && d != NULL));
00341     NOERR();
00342     for (begin = open; begin <= close; begin++)
00343     {
00344         MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
00345         if (shorter)
00346             end = shortest(v, d, begin, begin, v->stop,
00347                            (chr **) NULL, &hitend);
00348         else
00349             end = longest(v, d, begin, v->stop, &hitend);
00350         NOERR();
00351         if (hitend && cold == NULL)
00352             cold = begin;
00353         if (end != NULL)
00354             break;              /* NOTE BREAK OUT */
00355     }
00356     assert(end != NULL);        /* search RE succeeded so loop should */
00357     freedfa(d);
00358 
00359     /* and pin down details */
00360     assert(v->nmatch > 0);
00361     v->pmatch[0].rm_so = OFF(begin);
00362     v->pmatch[0].rm_eo = OFF(end);
00363     if (v->g->cflags & REG_EXPECT)
00364     {
00365         if (cold != NULL)
00366             v->details->rm_extend.rm_so = OFF(cold);
00367         else
00368             v->details->rm_extend.rm_so = OFF(v->stop);
00369         v->details->rm_extend.rm_eo = OFF(v->stop);     /* unknown */
00370     }
00371     if (v->nmatch == 1)         /* no need for submatches */
00372         return REG_OKAY;
00373 
00374     /* find submatches */
00375     zapallsubs(v->pmatch, v->nmatch);
00376     return cdissect(v, v->g->tree, begin, end);
00377 }
00378 
00379 /*
00380  * cfind - find a match for the main NFA (with complications)
00381  */
00382 static int
00383 cfind(struct vars * v,
00384       struct cnfa * cnfa,
00385       struct colormap * cm)
00386 {
00387     struct dfa *s;
00388     struct dfa *d;
00389     chr        *cold;
00390     int         ret;
00391 
00392     s = newdfa(v, &v->g->search, cm, &v->dfa1);
00393     NOERR();
00394     d = newdfa(v, cnfa, cm, &v->dfa2);
00395     if (ISERR())
00396     {
00397         assert(d == NULL);
00398         freedfa(s);
00399         return v->err;
00400     }
00401 
00402     ret = cfindloop(v, cnfa, cm, d, s, &cold);
00403 
00404     freedfa(d);
00405     freedfa(s);
00406     NOERR();
00407     if (v->g->cflags & REG_EXPECT)
00408     {
00409         assert(v->details != NULL);
00410         if (cold != NULL)
00411             v->details->rm_extend.rm_so = OFF(cold);
00412         else
00413             v->details->rm_extend.rm_so = OFF(v->stop);
00414         v->details->rm_extend.rm_eo = OFF(v->stop);     /* unknown */
00415     }
00416     return ret;
00417 }
00418 
00419 /*
00420  * cfindloop - the heart of cfind
00421  */
00422 static int
00423 cfindloop(struct vars * v,
00424           struct cnfa * cnfa,
00425           struct colormap * cm,
00426           struct dfa * d,
00427           struct dfa * s,
00428           chr **coldp)          /* where to put coldstart pointer */
00429 {
00430     chr        *begin;
00431     chr        *end;
00432     chr        *cold;
00433     chr        *open;           /* open and close of range of possible starts */
00434     chr        *close;
00435     chr        *estart;
00436     chr        *estop;
00437     int         er;
00438     int         shorter = v->g->tree->flags & SHORTER;
00439     int         hitend;
00440 
00441     assert(d != NULL && s != NULL);
00442     cold = NULL;
00443     close = v->search_start;
00444     do
00445     {
00446         MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
00447         close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
00448         if (close == NULL)
00449             break;              /* NOTE BREAK */
00450         assert(cold != NULL);
00451         open = cold;
00452         cold = NULL;
00453         MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
00454         for (begin = open; begin <= close; begin++)
00455         {
00456             MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
00457             estart = begin;
00458             estop = v->stop;
00459             for (;;)
00460             {
00461                 if (shorter)
00462                     end = shortest(v, d, begin, estart,
00463                                    estop, (chr **) NULL, &hitend);
00464                 else
00465                     end = longest(v, d, begin, estop,
00466                                   &hitend);
00467                 if (hitend && cold == NULL)
00468                     cold = begin;
00469                 if (end == NULL)
00470                     break;      /* NOTE BREAK OUT */
00471                 MDEBUG(("tentative end %ld\n", LOFF(end)));
00472                 zapallsubs(v->pmatch, v->nmatch);
00473                 er = cdissect(v, v->g->tree, begin, end);
00474                 if (er == REG_OKAY)
00475                 {
00476                     if (v->nmatch > 0)
00477                     {
00478                         v->pmatch[0].rm_so = OFF(begin);
00479                         v->pmatch[0].rm_eo = OFF(end);
00480                     }
00481                     *coldp = cold;
00482                     return REG_OKAY;
00483                 }
00484                 if (er != REG_NOMATCH)
00485                 {
00486                     ERR(er);
00487                     *coldp = cold;
00488                     return er;
00489                 }
00490                 if ((shorter) ? end == estop : end == begin)
00491                 {
00492                     /* no point in trying again */
00493                     *coldp = cold;
00494                     return REG_NOMATCH;
00495                 }
00496                 /* go around and try again */
00497                 if (shorter)
00498                     estart = end + 1;
00499                 else
00500                     estop = end - 1;
00501             }
00502         }
00503     } while (close < v->stop);
00504 
00505     *coldp = cold;
00506     return REG_NOMATCH;
00507 }
00508 
00509 /*
00510  * zapallsubs - initialize all subexpression matches to "no match"
00511  */
00512 static void
00513 zapallsubs(regmatch_t *p,
00514            size_t n)
00515 {
00516     size_t      i;
00517 
00518     for (i = n - 1; i > 0; i--)
00519     {
00520         p[i].rm_so = -1;
00521         p[i].rm_eo = -1;
00522     }
00523 }
00524 
00525 /*
00526  * zaptreesubs - initialize subexpressions within subtree to "no match"
00527  */
00528 static void
00529 zaptreesubs(struct vars * v,
00530             struct subre * t)
00531 {
00532     if (t->op == '(')
00533     {
00534         int         n = t->subno;
00535 
00536         assert(n > 0);
00537         if ((size_t) n < v->nmatch)
00538         {
00539             v->pmatch[n].rm_so = -1;
00540             v->pmatch[n].rm_eo = -1;
00541         }
00542     }
00543 
00544     if (t->left != NULL)
00545         zaptreesubs(v, t->left);
00546     if (t->right != NULL)
00547         zaptreesubs(v, t->right);
00548 }
00549 
00550 /*
00551  * subset - set subexpression match data for a successful subre
00552  */
00553 static void
00554 subset(struct vars * v,
00555        struct subre * sub,
00556        chr *begin,
00557        chr *end)
00558 {
00559     int         n = sub->subno;
00560 
00561     assert(n > 0);
00562     if ((size_t) n >= v->nmatch)
00563         return;
00564 
00565     MDEBUG(("setting %d\n", n));
00566     v->pmatch[n].rm_so = OFF(begin);
00567     v->pmatch[n].rm_eo = OFF(end);
00568 }
00569 
00570 /*
00571  * cdissect - check backrefs and determine subexpression matches
00572  *
00573  * cdissect recursively processes a subre tree to check matching of backrefs
00574  * and/or identify submatch boundaries for capture nodes.  The proposed match
00575  * runs from "begin" to "end" (not including "end"), and we are basically
00576  * "dissecting" it to see where the submatches are.
00577  *
00578  * Before calling any level of cdissect, the caller must have run the node's
00579  * DFA and found that the proposed substring satisfies the DFA.  (We make
00580  * the caller do that because in concatenation and iteration nodes, it's
00581  * much faster to check all the substrings against the child DFAs before we
00582  * recurse.)  Also, caller must have cleared subexpression match data via
00583  * zaptreesubs (or zapallsubs at the top level).
00584  */
00585 static int                      /* regexec return code */
00586 cdissect(struct vars * v,
00587          struct subre * t,
00588          chr *begin,            /* beginning of relevant substring */
00589          chr *end)              /* end of same */
00590 {
00591     int         er;
00592 
00593     assert(t != NULL);
00594     MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
00595 
00596     switch (t->op)
00597     {
00598         case '=':               /* terminal node */
00599             assert(t->left == NULL && t->right == NULL);
00600             er = REG_OKAY;      /* no action, parent did the work */
00601             break;
00602         case 'b':               /* back reference */
00603             assert(t->left == NULL && t->right == NULL);
00604             er = cbrdissect(v, t, begin, end);
00605             break;
00606         case '.':               /* concatenation */
00607             assert(t->left != NULL && t->right != NULL);
00608             if (t->left->flags & SHORTER)       /* reverse scan */
00609                 er = crevcondissect(v, t, begin, end);
00610             else
00611                 er = ccondissect(v, t, begin, end);
00612             break;
00613         case '|':               /* alternation */
00614             assert(t->left != NULL);
00615             er = caltdissect(v, t, begin, end);
00616             break;
00617         case '*':               /* iteration */
00618             assert(t->left != NULL);
00619             if (t->left->flags & SHORTER)       /* reverse scan */
00620                 er = creviterdissect(v, t, begin, end);
00621             else
00622                 er = citerdissect(v, t, begin, end);
00623             break;
00624         case '(':               /* capturing */
00625             assert(t->left != NULL && t->right == NULL);
00626             assert(t->subno > 0);
00627             er = cdissect(v, t->left, begin, end);
00628             if (er == REG_OKAY)
00629                 subset(v, t, begin, end);
00630             break;
00631         default:
00632             er = REG_ASSERT;
00633             break;
00634     }
00635 
00636     /*
00637      * We should never have a match failure unless backrefs lurk below;
00638      * otherwise, either caller failed to check the DFA, or there's some
00639      * inconsistency between the DFA and the node's innards.
00640      */
00641     assert(er != REG_NOMATCH || (t->flags & BACKR));
00642 
00643     return er;
00644 }
00645 
00646 /*
00647  * ccondissect - dissect match for concatenation node
00648  */
00649 static int                      /* regexec return code */
00650 ccondissect(struct vars * v,
00651             struct subre * t,
00652             chr *begin,         /* beginning of relevant substring */
00653             chr *end)           /* end of same */
00654 {
00655     struct dfa *d;
00656     struct dfa *d2;
00657     chr        *mid;
00658     int         er;
00659 
00660     assert(t->op == '.');
00661     assert(t->left != NULL && t->left->cnfa.nstates > 0);
00662     assert(t->right != NULL && t->right->cnfa.nstates > 0);
00663     assert(!(t->left->flags & SHORTER));
00664 
00665     d = getsubdfa(v, t->left);
00666     NOERR();
00667     d2 = getsubdfa(v, t->right);
00668     NOERR();
00669     MDEBUG(("cconcat %d\n", t->id));
00670 
00671     /* pick a tentative midpoint */
00672     mid = longest(v, d, begin, end, (int *) NULL);
00673     if (mid == NULL)
00674         return REG_NOMATCH;
00675     MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
00676 
00677     /* iterate until satisfaction or failure */
00678     for (;;)
00679     {
00680         /* try this midpoint on for size */
00681         if (longest(v, d2, mid, end, (int *) NULL) == end)
00682         {
00683             er = cdissect(v, t->left, begin, mid);
00684             if (er == REG_OKAY)
00685             {
00686                 er = cdissect(v, t->right, mid, end);
00687                 if (er == REG_OKAY)
00688                 {
00689                     /* satisfaction */
00690                     MDEBUG(("successful\n"));
00691                     return REG_OKAY;
00692                 }
00693             }
00694             if (er != REG_NOMATCH)
00695                 return er;
00696         }
00697 
00698         /* that midpoint didn't work, find a new one */
00699         if (mid == begin)
00700         {
00701             /* all possibilities exhausted */
00702             MDEBUG(("%d no midpoint\n", t->id));
00703             return REG_NOMATCH;
00704         }
00705         mid = longest(v, d, begin, mid - 1, (int *) NULL);
00706         if (mid == NULL)
00707         {
00708             /* failed to find a new one */
00709             MDEBUG(("%d failed midpoint\n", t->id));
00710             return REG_NOMATCH;
00711         }
00712         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
00713         zaptreesubs(v, t->left);
00714         zaptreesubs(v, t->right);
00715     }
00716 
00717     /* can't get here */
00718     return REG_ASSERT;
00719 }
00720 
00721 /*
00722  * crevcondissect - dissect match for concatenation node, shortest-first
00723  */
00724 static int                      /* regexec return code */
00725 crevcondissect(struct vars * v,
00726                struct subre * t,
00727                chr *begin,      /* beginning of relevant substring */
00728                chr *end)        /* end of same */
00729 {
00730     struct dfa *d;
00731     struct dfa *d2;
00732     chr        *mid;
00733     int         er;
00734 
00735     assert(t->op == '.');
00736     assert(t->left != NULL && t->left->cnfa.nstates > 0);
00737     assert(t->right != NULL && t->right->cnfa.nstates > 0);
00738     assert(t->left->flags & SHORTER);
00739 
00740     d = getsubdfa(v, t->left);
00741     NOERR();
00742     d2 = getsubdfa(v, t->right);
00743     NOERR();
00744     MDEBUG(("crevcon %d\n", t->id));
00745 
00746     /* pick a tentative midpoint */
00747     mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
00748     if (mid == NULL)
00749         return REG_NOMATCH;
00750     MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
00751 
00752     /* iterate until satisfaction or failure */
00753     for (;;)
00754     {
00755         /* try this midpoint on for size */
00756         if (longest(v, d2, mid, end, (int *) NULL) == end)
00757         {
00758             er = cdissect(v, t->left, begin, mid);
00759             if (er == REG_OKAY)
00760             {
00761                 er = cdissect(v, t->right, mid, end);
00762                 if (er == REG_OKAY)
00763                 {
00764                     /* satisfaction */
00765                     MDEBUG(("successful\n"));
00766                     return REG_OKAY;
00767                 }
00768             }
00769             if (er != REG_NOMATCH)
00770                 return er;
00771         }
00772 
00773         /* that midpoint didn't work, find a new one */
00774         if (mid == end)
00775         {
00776             /* all possibilities exhausted */
00777             MDEBUG(("%d no midpoint\n", t->id));
00778             return REG_NOMATCH;
00779         }
00780         mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
00781         if (mid == NULL)
00782         {
00783             /* failed to find a new one */
00784             MDEBUG(("%d failed midpoint\n", t->id));
00785             return REG_NOMATCH;
00786         }
00787         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
00788         zaptreesubs(v, t->left);
00789         zaptreesubs(v, t->right);
00790     }
00791 
00792     /* can't get here */
00793     return REG_ASSERT;
00794 }
00795 
00796 /*
00797  * cbrdissect - dissect match for backref node
00798  */
00799 static int                      /* regexec return code */
00800 cbrdissect(struct vars * v,
00801            struct subre * t,
00802            chr *begin,          /* beginning of relevant substring */
00803            chr *end)            /* end of same */
00804 {
00805     int         n = t->subno;
00806     size_t      numreps;
00807     size_t      tlen;
00808     size_t      brlen;
00809     chr        *brstring;
00810     chr        *p;
00811     int         min = t->min;
00812     int         max = t->max;
00813 
00814     assert(t != NULL);
00815     assert(t->op == 'b');
00816     assert(n >= 0);
00817     assert((size_t) n < v->nmatch);
00818 
00819     MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
00820 
00821     /* get the backreferenced string */
00822     if (v->pmatch[n].rm_so == -1)
00823         return REG_NOMATCH;
00824     brstring = v->start + v->pmatch[n].rm_so;
00825     brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
00826 
00827     /* special cases for zero-length strings */
00828     if (brlen == 0)
00829     {
00830         /*
00831          * matches only if target is zero length, but any number of
00832          * repetitions can be considered to be present
00833          */
00834         if (begin == end && min <= max)
00835         {
00836             MDEBUG(("cbackref matched trivially\n"));
00837             return REG_OKAY;
00838         }
00839         return REG_NOMATCH;
00840     }
00841     if (begin == end)
00842     {
00843         /* matches only if zero repetitions are okay */
00844         if (min == 0)
00845         {
00846             MDEBUG(("cbackref matched trivially\n"));
00847             return REG_OKAY;
00848         }
00849         return REG_NOMATCH;
00850     }
00851 
00852     /*
00853      * check target length to see if it could possibly be an allowed number of
00854      * repetitions of brstring
00855      */
00856     assert(end > begin);
00857     tlen = end - begin;
00858     if (tlen % brlen != 0)
00859         return REG_NOMATCH;
00860     numreps = tlen / brlen;
00861     if (numreps < min || (numreps > max && max != INFINITY))
00862         return REG_NOMATCH;
00863 
00864     /* okay, compare the actual string contents */
00865     p = begin;
00866     while (numreps-- > 0)
00867     {
00868         if ((*v->g->compare) (brstring, p, brlen) != 0)
00869             return REG_NOMATCH;
00870         p += brlen;
00871     }
00872 
00873     MDEBUG(("cbackref matched\n"));
00874     return REG_OKAY;
00875 }
00876 
00877 /*
00878  * caltdissect - dissect match for alternation node
00879  */
00880 static int                      /* regexec return code */
00881 caltdissect(struct vars * v,
00882             struct subre * t,
00883             chr *begin,         /* beginning of relevant substring */
00884             chr *end)           /* end of same */
00885 {
00886     struct dfa *d;
00887     int         er;
00888 
00889     /* We loop, rather than tail-recurse, to handle a chain of alternatives */
00890     while (t != NULL)
00891     {
00892         assert(t->op == '|');
00893         assert(t->left != NULL && t->left->cnfa.nstates > 0);
00894 
00895         MDEBUG(("calt n%d\n", t->id));
00896 
00897         d = getsubdfa(v, t->left);
00898         NOERR();
00899         if (longest(v, d, begin, end, (int *) NULL) == end)
00900         {
00901             MDEBUG(("calt matched\n"));
00902             er = cdissect(v, t->left, begin, end);
00903             if (er != REG_NOMATCH)
00904                 return er;
00905         }
00906 
00907         t = t->right;
00908     }
00909 
00910     return REG_NOMATCH;
00911 }
00912 
00913 /*
00914  * citerdissect - dissect match for iteration node
00915  */
00916 static int                      /* regexec return code */
00917 citerdissect(struct vars * v,
00918              struct subre * t,
00919              chr *begin,        /* beginning of relevant substring */
00920              chr *end)          /* end of same */
00921 {
00922     struct dfa *d;
00923     chr       **endpts;
00924     chr        *limit;
00925     int         min_matches;
00926     size_t      max_matches;
00927     int         nverified;
00928     int         k;
00929     int         i;
00930     int         er;
00931 
00932     assert(t->op == '*');
00933     assert(t->left != NULL && t->left->cnfa.nstates > 0);
00934     assert(!(t->left->flags & SHORTER));
00935     assert(begin <= end);
00936 
00937     /*
00938      * If zero matches are allowed, and target string is empty, just declare
00939      * victory.  OTOH, if target string isn't empty, zero matches can't work
00940      * so we pretend the min is 1.
00941      */
00942     min_matches = t->min;
00943     if (min_matches <= 0)
00944     {
00945         if (begin == end)
00946             return REG_OKAY;
00947         min_matches = 1;
00948     }
00949 
00950     /*
00951      * We need workspace to track the endpoints of each sub-match.  Normally
00952      * we consider only nonzero-length sub-matches, so there can be at most
00953      * end-begin of them.  However, if min is larger than that, we will also
00954      * consider zero-length sub-matches in order to find enough matches.
00955      *
00956      * For convenience, endpts[0] contains the "begin" pointer and we store
00957      * sub-match endpoints in endpts[1..max_matches].
00958      */
00959     max_matches = end - begin;
00960     if (max_matches > t->max && t->max != INFINITY)
00961         max_matches = t->max;
00962     if (max_matches < min_matches)
00963         max_matches = min_matches;
00964     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
00965     if (endpts == NULL)
00966         return REG_ESPACE;
00967     endpts[0] = begin;
00968 
00969     d = getsubdfa(v, t->left);
00970     if (ISERR())
00971     {
00972         FREE(endpts);
00973         return v->err;
00974     }
00975     MDEBUG(("citer %d\n", t->id));
00976 
00977     /*
00978      * Our strategy is to first find a set of sub-match endpoints that are
00979      * valid according to the child node's DFA, and then recursively dissect
00980      * each sub-match to confirm validity.  If any validity check fails,
00981      * backtrack the last sub-match and try again.  And, when we next try for
00982      * a validity check, we need not recheck any successfully verified
00983      * sub-matches that we didn't move the endpoints of.  nverified remembers
00984      * how many sub-matches are currently known okay.
00985      */
00986 
00987     /* initialize to consider first sub-match */
00988     nverified = 0;
00989     k = 1;
00990     limit = end;
00991 
00992     /* iterate until satisfaction or failure */
00993     while (k > 0)
00994     {
00995         /* try to find an endpoint for the k'th sub-match */
00996         endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
00997         if (endpts[k] == NULL)
00998         {
00999             /* no match possible, so see if we can shorten previous one */
01000             k--;
01001             goto backtrack;
01002         }
01003         MDEBUG(("%d: working endpoint %d: %ld\n",
01004                 t->id, k, LOFF(endpts[k])));
01005 
01006         /* k'th sub-match can no longer be considered verified */
01007         if (nverified >= k)
01008             nverified = k - 1;
01009 
01010         if (endpts[k] != end)
01011         {
01012             /* haven't reached end yet, try another iteration if allowed */
01013             if (k >= max_matches)
01014             {
01015                 /* must try to shorten some previous match */
01016                 k--;
01017                 goto backtrack;
01018             }
01019 
01020             /* reject zero-length match unless necessary to achieve min */
01021             if (endpts[k] == endpts[k - 1] &&
01022                 (k >= min_matches || min_matches - k < end - endpts[k]))
01023                 goto backtrack;
01024 
01025             k++;
01026             limit = end;
01027             continue;
01028         }
01029 
01030         /*
01031          * We've identified a way to divide the string into k sub-matches that
01032          * works so far as the child DFA can tell.  If k is an allowed number
01033          * of matches, start the slow part: recurse to verify each sub-match.
01034          * We always have k <= max_matches, needn't check that.
01035          */
01036         if (k < min_matches)
01037             goto backtrack;
01038 
01039         MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
01040 
01041         for (i = nverified + 1; i <= k; i++)
01042         {
01043             zaptreesubs(v, t->left);
01044             er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
01045             if (er == REG_OKAY)
01046             {
01047                 nverified = i;
01048                 continue;
01049             }
01050             if (er == REG_NOMATCH)
01051                 break;
01052             /* oops, something failed */
01053             FREE(endpts);
01054             return er;
01055         }
01056 
01057         if (i > k)
01058         {
01059             /* satisfaction */
01060             MDEBUG(("%d successful\n", t->id));
01061             FREE(endpts);
01062             return REG_OKAY;
01063         }
01064 
01065         /* match failed to verify, so backtrack */
01066 
01067 backtrack:
01068 
01069         /*
01070          * Must consider shorter versions of the current sub-match.  However,
01071          * we'll only ask for a zero-length match if necessary.
01072          */
01073         while (k > 0)
01074         {
01075             chr        *prev_end = endpts[k - 1];
01076 
01077             if (endpts[k] > prev_end)
01078             {
01079                 limit = endpts[k] - 1;
01080                 if (limit > prev_end ||
01081                     (k < min_matches && min_matches - k >= end - prev_end))
01082                 {
01083                     /* break out of backtrack loop, continue the outer one */
01084                     break;
01085                 }
01086             }
01087             /* can't shorten k'th sub-match any more, consider previous one */
01088             k--;
01089         }
01090     }
01091 
01092     /* all possibilities exhausted */
01093     MDEBUG(("%d failed\n", t->id));
01094     FREE(endpts);
01095     return REG_NOMATCH;
01096 }
01097 
01098 /*
01099  * creviterdissect - dissect match for iteration node, shortest-first
01100  */
01101 static int                      /* regexec return code */
01102 creviterdissect(struct vars * v,
01103                 struct subre * t,
01104                 chr *begin,     /* beginning of relevant substring */
01105                 chr *end)       /* end of same */
01106 {
01107     struct dfa *d;
01108     chr       **endpts;
01109     chr        *limit;
01110     int         min_matches;
01111     size_t      max_matches;
01112     int         nverified;
01113     int         k;
01114     int         i;
01115     int         er;
01116 
01117     assert(t->op == '*');
01118     assert(t->left != NULL && t->left->cnfa.nstates > 0);
01119     assert(t->left->flags & SHORTER);
01120     assert(begin <= end);
01121 
01122     /*
01123      * If zero matches are allowed, and target string is empty, just declare
01124      * victory.  OTOH, if target string isn't empty, zero matches can't work
01125      * so we pretend the min is 1.
01126      */
01127     min_matches = t->min;
01128     if (min_matches <= 0)
01129     {
01130         if (begin == end)
01131             return REG_OKAY;
01132         min_matches = 1;
01133     }
01134 
01135     /*
01136      * We need workspace to track the endpoints of each sub-match.  Normally
01137      * we consider only nonzero-length sub-matches, so there can be at most
01138      * end-begin of them.  However, if min is larger than that, we will also
01139      * consider zero-length sub-matches in order to find enough matches.
01140      *
01141      * For convenience, endpts[0] contains the "begin" pointer and we store
01142      * sub-match endpoints in endpts[1..max_matches].
01143      */
01144     max_matches = end - begin;
01145     if (max_matches > t->max && t->max != INFINITY)
01146         max_matches = t->max;
01147     if (max_matches < min_matches)
01148         max_matches = min_matches;
01149     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
01150     if (endpts == NULL)
01151         return REG_ESPACE;
01152     endpts[0] = begin;
01153 
01154     d = getsubdfa(v, t->left);
01155     if (ISERR())
01156     {
01157         FREE(endpts);
01158         return v->err;
01159     }
01160     MDEBUG(("creviter %d\n", t->id));
01161 
01162     /*
01163      * Our strategy is to first find a set of sub-match endpoints that are
01164      * valid according to the child node's DFA, and then recursively dissect
01165      * each sub-match to confirm validity.  If any validity check fails,
01166      * backtrack the last sub-match and try again.  And, when we next try for
01167      * a validity check, we need not recheck any successfully verified
01168      * sub-matches that we didn't move the endpoints of.  nverified remembers
01169      * how many sub-matches are currently known okay.
01170      */
01171 
01172     /* initialize to consider first sub-match */
01173     nverified = 0;
01174     k = 1;
01175     limit = begin;
01176 
01177     /* iterate until satisfaction or failure */
01178     while (k > 0)
01179     {
01180         /* disallow zero-length match unless necessary to achieve min */
01181         if (limit == endpts[k - 1] &&
01182             limit != end &&
01183             (k >= min_matches || min_matches - k < end - limit))
01184             limit++;
01185 
01186         /* try to find an endpoint for the k'th sub-match */
01187         endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
01188                              (chr **) NULL, (int *) NULL);
01189         if (endpts[k] == NULL)
01190         {
01191             /* no match possible, so see if we can lengthen previous one */
01192             k--;
01193             goto backtrack;
01194         }
01195         MDEBUG(("%d: working endpoint %d: %ld\n",
01196                 t->id, k, LOFF(endpts[k])));
01197 
01198         /* k'th sub-match can no longer be considered verified */
01199         if (nverified >= k)
01200             nverified = k - 1;
01201 
01202         if (endpts[k] != end)
01203         {
01204             /* haven't reached end yet, try another iteration if allowed */
01205             if (k >= max_matches)
01206             {
01207                 /* must try to lengthen some previous match */
01208                 k--;
01209                 goto backtrack;
01210             }
01211 
01212             k++;
01213             limit = endpts[k - 1];
01214             continue;
01215         }
01216 
01217         /*
01218          * We've identified a way to divide the string into k sub-matches that
01219          * works so far as the child DFA can tell.  If k is an allowed number
01220          * of matches, start the slow part: recurse to verify each sub-match.
01221          * We always have k <= max_matches, needn't check that.
01222          */
01223         if (k < min_matches)
01224             goto backtrack;
01225 
01226         MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
01227 
01228         for (i = nverified + 1; i <= k; i++)
01229         {
01230             zaptreesubs(v, t->left);
01231             er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
01232             if (er == REG_OKAY)
01233             {
01234                 nverified = i;
01235                 continue;
01236             }
01237             if (er == REG_NOMATCH)
01238                 break;
01239             /* oops, something failed */
01240             FREE(endpts);
01241             return er;
01242         }
01243 
01244         if (i > k)
01245         {
01246             /* satisfaction */
01247             MDEBUG(("%d successful\n", t->id));
01248             FREE(endpts);
01249             return REG_OKAY;
01250         }
01251 
01252         /* match failed to verify, so backtrack */
01253 
01254 backtrack:
01255 
01256         /*
01257          * Must consider longer versions of the current sub-match.
01258          */
01259         while (k > 0)
01260         {
01261             if (endpts[k] < end)
01262             {
01263                 limit = endpts[k] + 1;
01264                 /* break out of backtrack loop, continue the outer one */
01265                 break;
01266             }
01267             /* can't lengthen k'th sub-match any more, consider previous one */
01268             k--;
01269         }
01270     }
01271 
01272     /* all possibilities exhausted */
01273     MDEBUG(("%d failed\n", t->id));
01274     FREE(endpts);
01275     return REG_NOMATCH;
01276 }
01277 
01278 
01279 
01280 #include "rege_dfa.c"