LLVM API Documentation

regengine.inc
Go to the documentation of this file.
00001 /*-
00002  * This code is derived from OpenBSD's libc/regex, original license follows:
00003  *
00004  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
00005  * Copyright (c) 1992, 1993, 1994
00006  *  The Regents of the University of California.  All rights reserved.
00007  *
00008  * This code is derived from software contributed to Berkeley by
00009  * Henry Spencer.
00010  *
00011  * Redistribution and use in source and binary forms, with or without
00012  * modification, are permitted provided that the following conditions
00013  * are met:
00014  * 1. Redistributions of source code must retain the above copyright
00015  *    notice, this list of conditions and the following disclaimer.
00016  * 2. Redistributions in binary form must reproduce the above copyright
00017  *    notice, this list of conditions and the following disclaimer in the
00018  *    documentation and/or other materials provided with the distribution.
00019  * 3. Neither the name of the University nor the names of its contributors
00020  *    may be used to endorse or promote products derived from this software
00021  *    without specific prior written permission.
00022  *
00023  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00024  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00025  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00026  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00027  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00028  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00029  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00030  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00031  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00032  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00033  * SUCH DAMAGE.
00034  *
00035  *  @(#)engine.c  8.5 (Berkeley) 3/20/94
00036  */
00037 
00038 /*
00039  * The matching engine and friends.  This file is #included by regexec.c
00040  * after suitable #defines of a variety of macros used herein, so that
00041  * different state representations can be used without duplicating masses
00042  * of code.
00043  */
00044 
00045 #ifdef SNAMES
00046 #define matcher smatcher
00047 #define fast  sfast
00048 #define slow  sslow
00049 #define dissect sdissect
00050 #define backref sbackref
00051 #define step  sstep
00052 #define print sprint
00053 #define at  sat
00054 #define match smat
00055 #define nope  snope
00056 #endif
00057 #ifdef LNAMES
00058 #define matcher lmatcher
00059 #define fast  lfast
00060 #define slow  lslow
00061 #define dissect ldissect
00062 #define backref lbackref
00063 #define step  lstep
00064 #define print lprint
00065 #define at  lat
00066 #define match lmat
00067 #define nope  lnope
00068 #endif
00069 
00070 /* another structure passed up and down to avoid zillions of parameters */
00071 struct match {
00072   struct re_guts *g;
00073   int eflags;
00074   llvm_regmatch_t *pmatch;  /* [nsub+1] (0 element unused) */
00075   const char *offp;   /* offsets work from here */
00076   const char *beginp;   /* start of string -- virtual NUL precedes */
00077   const char *endp;   /* end of string -- virtual NUL here */
00078   const char *coldp;    /* can be no match starting before here */
00079   const char **lastpos;   /* [nplus+1] */
00080   STATEVARS;
00081   states st;    /* current states */
00082   states fresh;   /* states for a fresh start */
00083   states tmp;   /* temporary */
00084   states empty;   /* empty set of states */
00085 };
00086 
00087 static int matcher(struct re_guts *, const char *, size_t,
00088                    llvm_regmatch_t[], int);
00089 static const char *dissect(struct match *, const char *, const char *, sopno,
00090                            sopno);
00091 static const char *backref(struct match *, const char *, const char *, sopno,
00092                            sopno, sopno, int);
00093 static const char *fast(struct match *, const char *, const char *, sopno, sopno);
00094 static const char *slow(struct match *, const char *, const char *, sopno, sopno);
00095 static states step(struct re_guts *, sopno, sopno, states, int, states);
00096 #define MAX_RECURSION 100
00097 #define BOL (OUT+1)
00098 #define EOL (BOL+1)
00099 #define BOLEOL  (BOL+2)
00100 #define NOTHING (BOL+3)
00101 #define BOW (BOL+4)
00102 #define EOW (BOL+5)
00103 #define CODEMAX (BOL+5)   /* highest code used */
00104 #define NONCHAR(c)  ((c) > CHAR_MAX)
00105 #define NNONCHAR  (CODEMAX-CHAR_MAX)
00106 #ifdef REDEBUG
00107 static void print(struct match *, char *, states, int, FILE *);
00108 #endif
00109 #ifdef REDEBUG
00110 static void at(struct match *, char *, char *, char *, sopno, sopno);
00111 #endif
00112 #ifdef REDEBUG
00113 static char *pchar(int);
00114 #endif
00115 
00116 #ifdef REDEBUG
00117 #define SP(t, s, c) print(m, t, s, c, stdout)
00118 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
00119 #define NOTE(str) { if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
00120 static int nope = 0;
00121 #else
00122 #define SP(t, s, c) /* nothing */
00123 #define AT(t, p1, p2, s1, s2) /* nothing */
00124 #define NOTE(s) /* nothing */
00125 #endif
00126 
00127 /*
00128  - matcher - the actual matching engine
00129  */
00130 static int      /* 0 success, REG_NOMATCH failure */
00131 matcher(struct re_guts *g, const char *string, size_t nmatch,
00132         llvm_regmatch_t pmatch[],
00133     int eflags)
00134 {
00135   const char *endp;
00136   size_t i;
00137   struct match mv;
00138   struct match *m = &mv;
00139   const char *dp;
00140   const sopno gf = g->firststate+1; /* +1 for OEND */
00141   const sopno gl = g->laststate;
00142   const char *start;
00143   const char *stop;
00144 
00145   /* simplify the situation where possible */
00146   if (g->cflags&REG_NOSUB)
00147     nmatch = 0;
00148   if (eflags&REG_STARTEND) {
00149     start = string + pmatch[0].rm_so;
00150     stop = string + pmatch[0].rm_eo;
00151   } else {
00152     start = string;
00153     stop = start + strlen(start);
00154   }
00155   if (stop < start)
00156     return(REG_INVARG);
00157 
00158   /* prescreening; this does wonders for this rather slow code */
00159   if (g->must != NULL) {
00160     for (dp = start; dp < stop; dp++)
00161       if (*dp == g->must[0] && stop - dp >= g->mlen &&
00162         memcmp(dp, g->must, (size_t)g->mlen) == 0)
00163         break;
00164     if (dp == stop)   /* we didn't find g->must */
00165       return(REG_NOMATCH);
00166   }
00167 
00168   /* match struct setup */
00169   m->g = g;
00170   m->eflags = eflags;
00171   m->pmatch = NULL;
00172   m->lastpos = NULL;
00173   m->offp = string;
00174   m->beginp = start;
00175   m->endp = stop;
00176   STATESETUP(m, 4);
00177   SETUP(m->st);
00178   SETUP(m->fresh);
00179   SETUP(m->tmp);
00180   SETUP(m->empty);
00181   CLEAR(m->empty);
00182 
00183   /* this loop does only one repetition except for backrefs */
00184   for (;;) {
00185     endp = fast(m, start, stop, gf, gl);
00186     if (endp == NULL) {   /* a miss */
00187       free(m->pmatch);
00188       free((void*)m->lastpos);
00189       STATETEARDOWN(m);
00190       return(REG_NOMATCH);
00191     }
00192     if (nmatch == 0 && !g->backrefs)
00193       break;    /* no further info needed */
00194 
00195     /* where? */
00196     assert(m->coldp != NULL);
00197     for (;;) {
00198       NOTE("finding start");
00199       endp = slow(m, m->coldp, stop, gf, gl);
00200       if (endp != NULL)
00201         break;
00202       assert(m->coldp < m->endp);
00203       m->coldp++;
00204     }
00205     if (nmatch == 1 && !g->backrefs)
00206       break;    /* no further info needed */
00207 
00208     /* oh my, they want the subexpressions... */
00209     if (m->pmatch == NULL)
00210       m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) *
00211               sizeof(llvm_regmatch_t));
00212     if (m->pmatch == NULL) {
00213       STATETEARDOWN(m);
00214       return(REG_ESPACE);
00215     }
00216     for (i = 1; i <= m->g->nsub; i++)
00217       m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
00218     if (!g->backrefs && !(m->eflags&REG_BACKR)) {
00219       NOTE("dissecting");
00220       dp = dissect(m, m->coldp, endp, gf, gl);
00221     } else {
00222       if (g->nplus > 0 && m->lastpos == NULL)
00223         m->lastpos = (const char **)malloc((g->nplus+1) *
00224               sizeof(char *));
00225       if (g->nplus > 0 && m->lastpos == NULL) {
00226         free(m->pmatch);
00227         STATETEARDOWN(m);
00228         return(REG_ESPACE);
00229       }
00230       NOTE("backref dissect");
00231       dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
00232     }
00233     if (dp != NULL)
00234       break;
00235 
00236     /* uh-oh... we couldn't find a subexpression-level match */
00237     assert(g->backrefs);  /* must be back references doing it */
00238     assert(g->nplus == 0 || m->lastpos != NULL);
00239     for (;;) {
00240       if (dp != NULL || endp <= m->coldp)
00241         break;    /* defeat */
00242       NOTE("backoff");
00243       endp = slow(m, m->coldp, endp-1, gf, gl);
00244       if (endp == NULL)
00245         break;    /* defeat */
00246       /* try it on a shorter possibility */
00247 #ifndef NDEBUG
00248       for (i = 1; i <= m->g->nsub; i++) {
00249         assert(m->pmatch[i].rm_so == -1);
00250         assert(m->pmatch[i].rm_eo == -1);
00251       }
00252 #endif
00253       NOTE("backoff dissect");
00254       dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
00255     }
00256     assert(dp == NULL || dp == endp);
00257     if (dp != NULL)   /* found a shorter one */
00258       break;
00259 
00260     /* despite initial appearances, there is no match here */
00261     NOTE("false alarm");
00262     if (m->coldp == stop)
00263       break;
00264     start = m->coldp + 1; /* recycle starting later */
00265   }
00266 
00267   /* fill in the details if requested */
00268   if (nmatch > 0) {
00269     pmatch[0].rm_so = m->coldp - m->offp;
00270     pmatch[0].rm_eo = endp - m->offp;
00271   }
00272   if (nmatch > 1) {
00273     assert(m->pmatch != NULL);
00274     for (i = 1; i < nmatch; i++)
00275       if (i <= m->g->nsub)
00276         pmatch[i] = m->pmatch[i];
00277       else {
00278         pmatch[i].rm_so = -1;
00279         pmatch[i].rm_eo = -1;
00280       }
00281   }
00282 
00283   if (m->pmatch != NULL)
00284     free((char *)m->pmatch);
00285   if (m->lastpos != NULL)
00286     free((char *)m->lastpos);
00287   STATETEARDOWN(m);
00288   return(0);
00289 }
00290 
00291 /*
00292  - dissect - figure out what matched what, no back references
00293  */
00294 static const char *     /* == stop (success) always */
00295 dissect(struct match *m, const char *start, const char *stop, sopno startst,
00296         sopno stopst)
00297 {
00298   int i;
00299   sopno ss; /* start sop of current subRE */
00300   sopno es; /* end sop of current subRE */
00301   const char *sp; /* start of string matched by it */
00302   const char *stp;  /* string matched by it cannot pass here */
00303   const char *rest; /* start of rest of string */
00304   const char *tail; /* string unmatched by rest of RE */
00305   sopno ssub; /* start sop of subsubRE */
00306   sopno esub; /* end sop of subsubRE */
00307   const char *ssp;  /* start of string matched by subsubRE */
00308   const char *sep;  /* end of string matched by subsubRE */
00309   const char *oldssp; /* previous ssp */
00310 
00311   AT("diss", start, stop, startst, stopst);
00312   sp = start;
00313   for (ss = startst; ss < stopst; ss = es) {
00314     /* identify end of subRE */
00315     es = ss;
00316     switch (OP(m->g->strip[es])) {
00317     case OPLUS_:
00318     case OQUEST_:
00319       es += OPND(m->g->strip[es]);
00320       break;
00321     case OCH_:
00322       while (OP(m->g->strip[es]) != O_CH)
00323         es += OPND(m->g->strip[es]);
00324       break;
00325     }
00326     es++;
00327 
00328     /* figure out what it matched */
00329     switch (OP(m->g->strip[ss])) {
00330     case OEND:
00331       assert(nope);
00332       break;
00333     case OCHAR:
00334       sp++;
00335       break;
00336     case OBOL:
00337     case OEOL:
00338     case OBOW:
00339     case OEOW:
00340       break;
00341     case OANY:
00342     case OANYOF:
00343       sp++;
00344       break;
00345     case OBACK_:
00346     case O_BACK:
00347       assert(nope);
00348       break;
00349     /* cases where length of match is hard to find */
00350     case OQUEST_:
00351       stp = stop;
00352       for (;;) {
00353         /* how long could this one be? */
00354         rest = slow(m, sp, stp, ss, es);
00355         assert(rest != NULL); /* it did match */
00356         /* could the rest match the rest? */
00357         tail = slow(m, rest, stop, es, stopst);
00358         if (tail == stop)
00359           break;    /* yes! */
00360         /* no -- try a shorter match for this one */
00361         stp = rest - 1;
00362         assert(stp >= sp);  /* it did work */
00363       }
00364       ssub = ss + 1;
00365       esub = es - 1;
00366       /* did innards match? */
00367       if (slow(m, sp, rest, ssub, esub) != NULL) {
00368         const char *dp = dissect(m, sp, rest, ssub, esub);
00369         (void)dp; /* avoid warning if assertions off */
00370         assert(dp == rest);
00371       } else    /* no */
00372         assert(sp == rest);
00373       sp = rest;
00374       break;
00375     case OPLUS_:
00376       stp = stop;
00377       for (;;) {
00378         /* how long could this one be? */
00379         rest = slow(m, sp, stp, ss, es);
00380         assert(rest != NULL); /* it did match */
00381         /* could the rest match the rest? */
00382         tail = slow(m, rest, stop, es, stopst);
00383         if (tail == stop)
00384           break;    /* yes! */
00385         /* no -- try a shorter match for this one */
00386         stp = rest - 1;
00387         assert(stp >= sp);  /* it did work */
00388       }
00389       ssub = ss + 1;
00390       esub = es - 1;
00391       ssp = sp;
00392       oldssp = ssp;
00393       for (;;) {  /* find last match of innards */
00394         sep = slow(m, ssp, rest, ssub, esub);
00395         if (sep == NULL || sep == ssp)
00396           break;  /* failed or matched null */
00397         oldssp = ssp; /* on to next try */
00398         ssp = sep;
00399       }
00400       if (sep == NULL) {
00401         /* last successful match */
00402         sep = ssp;
00403         ssp = oldssp;
00404       }
00405       assert(sep == rest);  /* must exhaust substring */
00406       assert(slow(m, ssp, sep, ssub, esub) == rest);
00407       {
00408         const char *dp = dissect(m, ssp, sep, ssub, esub);
00409         (void)dp; /* avoid warning if assertions off */
00410         assert(dp == sep);
00411       }
00412       sp = rest;
00413       break;
00414     case OCH_:
00415       stp = stop;
00416       for (;;) {
00417         /* how long could this one be? */
00418         rest = slow(m, sp, stp, ss, es);
00419         assert(rest != NULL); /* it did match */
00420         /* could the rest match the rest? */
00421         tail = slow(m, rest, stop, es, stopst);
00422         if (tail == stop)
00423           break;    /* yes! */
00424         /* no -- try a shorter match for this one */
00425         stp = rest - 1;
00426         assert(stp >= sp);  /* it did work */
00427       }
00428       ssub = ss + 1;
00429       esub = ss + OPND(m->g->strip[ss]) - 1;
00430       assert(OP(m->g->strip[esub]) == OOR1);
00431       for (;;) {  /* find first matching branch */
00432         if (slow(m, sp, rest, ssub, esub) == rest)
00433           break;  /* it matched all of it */
00434         /* that one missed, try next one */
00435         assert(OP(m->g->strip[esub]) == OOR1);
00436         esub++;
00437         assert(OP(m->g->strip[esub]) == OOR2);
00438         ssub = esub + 1;
00439         esub += OPND(m->g->strip[esub]);
00440         if (OP(m->g->strip[esub]) == OOR2)
00441           esub--;
00442         else
00443           assert(OP(m->g->strip[esub]) == O_CH);
00444       }
00445       {
00446         const char *dp = dissect(m, sp, rest, ssub, esub);
00447         (void)dp; /* avoid warning if assertions off */
00448         assert(dp == rest);
00449       }
00450       sp = rest;
00451       break;
00452     case O_PLUS:
00453     case O_QUEST:
00454     case OOR1:
00455     case OOR2:
00456     case O_CH:
00457       assert(nope);
00458       break;
00459     case OLPAREN:
00460       i = OPND(m->g->strip[ss]);
00461       assert(0 < i && i <= m->g->nsub);
00462       m->pmatch[i].rm_so = sp - m->offp;
00463       break;
00464     case ORPAREN:
00465       i = OPND(m->g->strip[ss]);
00466       assert(0 < i && i <= m->g->nsub);
00467       m->pmatch[i].rm_eo = sp - m->offp;
00468       break;
00469     default:    /* uh oh */
00470       assert(nope);
00471       break;
00472     }
00473   }
00474 
00475   assert(sp == stop);
00476   return(sp);
00477 }
00478 
00479 /*
00480  - backref - figure out what matched what, figuring in back references
00481  */
00482 static const char *     /* == stop (success) or NULL (failure) */
00483 backref(struct match *m, const char *start, const char *stop, sopno startst,
00484         sopno stopst, sopno lev, int rec)     /* PLUS nesting level */
00485 {
00486   int i;
00487   sopno ss; /* start sop of current subRE */
00488   const char *sp; /* start of string matched by it */
00489   sopno ssub; /* start sop of subsubRE */
00490   sopno esub; /* end sop of subsubRE */
00491   const char *ssp;  /* start of string matched by subsubRE */
00492   const char *dp;
00493   size_t len;
00494   int hard;
00495   sop s;
00496   llvm_regoff_t offsave;
00497   cset *cs;
00498 
00499   AT("back", start, stop, startst, stopst);
00500   sp = start;
00501 
00502   /* get as far as we can with easy stuff */
00503   hard = 0;
00504   for (ss = startst; !hard && ss < stopst; ss++)
00505     switch (OP(s = m->g->strip[ss])) {
00506     case OCHAR:
00507       if (sp == stop || *sp++ != (char)OPND(s))
00508         return(NULL);
00509       break;
00510     case OANY:
00511       if (sp == stop)
00512         return(NULL);
00513       sp++;
00514       break;
00515     case OANYOF:
00516       cs = &m->g->sets[OPND(s)];
00517       if (sp == stop || !CHIN(cs, *sp++))
00518         return(NULL);
00519       break;
00520     case OBOL:
00521       if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
00522           (sp < m->endp && *(sp-1) == '\n' &&
00523             (m->g->cflags&REG_NEWLINE)) )
00524         { /* yes */ }
00525       else
00526         return(NULL);
00527       break;
00528     case OEOL:
00529       if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
00530           (sp < m->endp && *sp == '\n' &&
00531             (m->g->cflags&REG_NEWLINE)) )
00532         { /* yes */ }
00533       else
00534         return(NULL);
00535       break;
00536     case OBOW:
00537       if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
00538           (sp < m->endp && *(sp-1) == '\n' &&
00539             (m->g->cflags&REG_NEWLINE)) ||
00540           (sp > m->beginp &&
00541               !ISWORD(*(sp-1))) ) &&
00542           (sp < m->endp && ISWORD(*sp)) )
00543         { /* yes */ }
00544       else
00545         return(NULL);
00546       break;
00547     case OEOW:
00548       if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
00549           (sp < m->endp && *sp == '\n' &&
00550             (m->g->cflags&REG_NEWLINE)) ||
00551           (sp < m->endp && !ISWORD(*sp)) ) &&
00552           (sp > m->beginp && ISWORD(*(sp-1))) )
00553         { /* yes */ }
00554       else
00555         return(NULL);
00556       break;
00557     case O_QUEST:
00558       break;
00559     case OOR1:  /* matches null but needs to skip */
00560       ss++;
00561       s = m->g->strip[ss];
00562       do {
00563         assert(OP(s) == OOR2);
00564         ss += OPND(s);
00565       } while (OP(s = m->g->strip[ss]) != O_CH);
00566       /* note that the ss++ gets us past the O_CH */
00567       break;
00568     default:  /* have to make a choice */
00569       hard = 1;
00570       break;
00571     }
00572   if (!hard) {    /* that was it! */
00573     if (sp != stop)
00574       return(NULL);
00575     return(sp);
00576   }
00577   ss--;     /* adjust for the for's final increment */
00578 
00579   /* the hard stuff */
00580   AT("hard", sp, stop, ss, stopst);
00581   s = m->g->strip[ss];
00582   switch (OP(s)) {
00583   case OBACK_:    /* the vilest depths */
00584     i = OPND(s);
00585     assert(0 < i && i <= m->g->nsub);
00586     if (m->pmatch[i].rm_eo == -1)
00587       return(NULL);
00588     assert(m->pmatch[i].rm_so != -1);
00589     len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
00590     if (len == 0 && rec++ > MAX_RECURSION)
00591       return(NULL);
00592     assert(stop - m->beginp >= len);
00593     if (sp > stop - len)
00594       return(NULL); /* not enough left to match */
00595     ssp = m->offp + m->pmatch[i].rm_so;
00596     if (memcmp(sp, ssp, len) != 0)
00597       return(NULL);
00598     while (m->g->strip[ss] != SOP(O_BACK, i))
00599       ss++;
00600     return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
00601     break;
00602   case OQUEST_:   /* to null or not */
00603     dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
00604     if (dp != NULL)
00605       return(dp); /* not */
00606     return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
00607     break;
00608   case OPLUS_:
00609     assert(m->lastpos != NULL);
00610     assert(lev+1 <= m->g->nplus);
00611     m->lastpos[lev+1] = sp;
00612     return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
00613     break;
00614   case O_PLUS:
00615     if (sp == m->lastpos[lev])  /* last pass matched null */
00616       return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
00617     /* try another pass */
00618     m->lastpos[lev] = sp;
00619     dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
00620     if (dp == NULL)
00621       return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
00622     else
00623       return(dp);
00624     break;
00625   case OCH_:    /* find the right one, if any */
00626     ssub = ss + 1;
00627     esub = ss + OPND(s) - 1;
00628     assert(OP(m->g->strip[esub]) == OOR1);
00629     for (;;) {  /* find first matching branch */
00630       dp = backref(m, sp, stop, ssub, esub, lev, rec);
00631       if (dp != NULL)
00632         return(dp);
00633       /* that one missed, try next one */
00634       if (OP(m->g->strip[esub]) == O_CH)
00635         return(NULL); /* there is none */
00636       esub++;
00637       assert(OP(m->g->strip[esub]) == OOR2);
00638       ssub = esub + 1;
00639       esub += OPND(m->g->strip[esub]);
00640       if (OP(m->g->strip[esub]) == OOR2)
00641         esub--;
00642       else
00643         assert(OP(m->g->strip[esub]) == O_CH);
00644     }
00645     break;
00646   case OLPAREN:   /* must undo assignment if rest fails */
00647     i = OPND(s);
00648     assert(0 < i && i <= m->g->nsub);
00649     offsave = m->pmatch[i].rm_so;
00650     m->pmatch[i].rm_so = sp - m->offp;
00651     dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
00652     if (dp != NULL)
00653       return(dp);
00654     m->pmatch[i].rm_so = offsave;
00655     return(NULL);
00656     break;
00657   case ORPAREN:   /* must undo assignment if rest fails */
00658     i = OPND(s);
00659     assert(0 < i && i <= m->g->nsub);
00660     offsave = m->pmatch[i].rm_eo;
00661     m->pmatch[i].rm_eo = sp - m->offp;
00662     dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
00663     if (dp != NULL)
00664       return(dp);
00665     m->pmatch[i].rm_eo = offsave;
00666     return(NULL);
00667     break;
00668   default:    /* uh oh */
00669     assert(nope);
00670     break;
00671   }
00672 
00673   /* "can't happen" */
00674   assert(nope);
00675   /* NOTREACHED */
00676         return NULL;
00677 }
00678 
00679 /*
00680  - fast - step through the string at top speed
00681  */
00682 static const char *     /* where tentative match ended, or NULL */
00683 fast(struct match *m, const char *start, const char *stop, sopno startst,
00684      sopno stopst)
00685 {
00686   states st = m->st;
00687   states fresh = m->fresh;
00688   states tmp = m->tmp;
00689   const char *p = start;
00690   int c = (start == m->beginp) ? OUT : *(start-1);
00691   int lastc;  /* previous c */
00692   int flagch;
00693   int i;
00694   const char *coldp;  /* last p after which no match was underway */
00695 
00696   CLEAR(st);
00697   SET1(st, startst);
00698   st = step(m->g, startst, stopst, st, NOTHING, st);
00699   ASSIGN(fresh, st);
00700   SP("start", st, *p);
00701   coldp = NULL;
00702   for (;;) {
00703     /* next character */
00704     lastc = c;
00705     c = (p == m->endp) ? OUT : *p;
00706     if (EQ(st, fresh))
00707       coldp = p;
00708 
00709     /* is there an EOL and/or BOL between lastc and c? */
00710     flagch = '\0';
00711     i = 0;
00712     if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
00713         (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
00714       flagch = BOL;
00715       i = m->g->nbol;
00716     }
00717     if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
00718         (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
00719       flagch = (flagch == BOL) ? BOLEOL : EOL;
00720       i += m->g->neol;
00721     }
00722     if (i != 0) {
00723       for (; i > 0; i--)
00724         st = step(m->g, startst, stopst, st, flagch, st);
00725       SP("boleol", st, c);
00726     }
00727 
00728     /* how about a word boundary? */
00729     if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
00730           (c != OUT && ISWORD(c)) ) {
00731       flagch = BOW;
00732     }
00733     if ( (lastc != OUT && ISWORD(lastc)) &&
00734         (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
00735       flagch = EOW;
00736     }
00737     if (flagch == BOW || flagch == EOW) {
00738       st = step(m->g, startst, stopst, st, flagch, st);
00739       SP("boweow", st, c);
00740     }
00741 
00742     /* are we done? */
00743     if (ISSET(st, stopst) || p == stop)
00744       break;    /* NOTE BREAK OUT */
00745 
00746     /* no, we must deal with this character */
00747     ASSIGN(tmp, st);
00748     ASSIGN(st, fresh);
00749     assert(c != OUT);
00750     st = step(m->g, startst, stopst, tmp, c, st);
00751     SP("aft", st, c);
00752     assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
00753     p++;
00754   }
00755 
00756   assert(coldp != NULL);
00757   m->coldp = coldp;
00758   if (ISSET(st, stopst))
00759     return(p+1);
00760   else
00761     return(NULL);
00762 }
00763 
00764 /*
00765  - slow - step through the string more deliberately
00766  */
00767 static const char *     /* where it ended */
00768 slow(struct match *m, const char *start, const char *stop, sopno startst,
00769      sopno stopst)
00770 {
00771   states st = m->st;
00772   states empty = m->empty;
00773   states tmp = m->tmp;
00774   const char *p = start;
00775   int c = (start == m->beginp) ? OUT : *(start-1);
00776   int lastc;  /* previous c */
00777   int flagch;
00778   int i;
00779   const char *matchp; /* last p at which a match ended */
00780 
00781   AT("slow", start, stop, startst, stopst);
00782   CLEAR(st);
00783   SET1(st, startst);
00784   SP("sstart", st, *p);
00785   st = step(m->g, startst, stopst, st, NOTHING, st);
00786   matchp = NULL;
00787   for (;;) {
00788     /* next character */
00789     lastc = c;
00790     c = (p == m->endp) ? OUT : *p;
00791 
00792     /* is there an EOL and/or BOL between lastc and c? */
00793     flagch = '\0';
00794     i = 0;
00795     if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
00796         (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
00797       flagch = BOL;
00798       i = m->g->nbol;
00799     }
00800     if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
00801         (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
00802       flagch = (flagch == BOL) ? BOLEOL : EOL;
00803       i += m->g->neol;
00804     }
00805     if (i != 0) {
00806       for (; i > 0; i--)
00807         st = step(m->g, startst, stopst, st, flagch, st);
00808       SP("sboleol", st, c);
00809     }
00810 
00811     /* how about a word boundary? */
00812     if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
00813           (c != OUT && ISWORD(c)) ) {
00814       flagch = BOW;
00815     }
00816     if ( (lastc != OUT && ISWORD(lastc)) &&
00817         (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
00818       flagch = EOW;
00819     }
00820     if (flagch == BOW || flagch == EOW) {
00821       st = step(m->g, startst, stopst, st, flagch, st);
00822       SP("sboweow", st, c);
00823     }
00824 
00825     /* are we done? */
00826     if (ISSET(st, stopst))
00827       matchp = p;
00828     if (EQ(st, empty) || p == stop)
00829       break;    /* NOTE BREAK OUT */
00830 
00831     /* no, we must deal with this character */
00832     ASSIGN(tmp, st);
00833     ASSIGN(st, empty);
00834     assert(c != OUT);
00835     st = step(m->g, startst, stopst, tmp, c, st);
00836     SP("saft", st, c);
00837     assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
00838     p++;
00839   }
00840 
00841   return(matchp);
00842 }
00843 
00844 
00845 /*
00846  - step - map set of states reachable before char to set reachable after
00847  */
00848 static states
00849 step(struct re_guts *g,
00850     sopno start,    /* start state within strip */
00851     sopno stop,     /* state after stop state within strip */
00852     states bef,     /* states reachable before */
00853     int ch,     /* character or NONCHAR code */
00854     states aft)     /* states already known reachable after */
00855 {
00856   cset *cs;
00857   sop s;
00858   sopno pc;
00859   onestate here;    /* note, macros know this name */
00860   sopno look;
00861   int i;
00862 
00863   for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
00864     s = g->strip[pc];
00865     switch (OP(s)) {
00866     case OEND:
00867       assert(pc == stop-1);
00868       break;
00869     case OCHAR:
00870       /* only characters can match */
00871       assert(!NONCHAR(ch) || ch != (char)OPND(s));
00872       if (ch == (char)OPND(s))
00873         FWD(aft, bef, 1);
00874       break;
00875     case OBOL:
00876       if (ch == BOL || ch == BOLEOL)
00877         FWD(aft, bef, 1);
00878       break;
00879     case OEOL:
00880       if (ch == EOL || ch == BOLEOL)
00881         FWD(aft, bef, 1);
00882       break;
00883     case OBOW:
00884       if (ch == BOW)
00885         FWD(aft, bef, 1);
00886       break;
00887     case OEOW:
00888       if (ch == EOW)
00889         FWD(aft, bef, 1);
00890       break;
00891     case OANY:
00892       if (!NONCHAR(ch))
00893         FWD(aft, bef, 1);
00894       break;
00895     case OANYOF:
00896       cs = &g->sets[OPND(s)];
00897       if (!NONCHAR(ch) && CHIN(cs, ch))
00898         FWD(aft, bef, 1);
00899       break;
00900     case OBACK_:    /* ignored here */
00901     case O_BACK:
00902       FWD(aft, aft, 1);
00903       break;
00904     case OPLUS_:    /* forward, this is just an empty */
00905       FWD(aft, aft, 1);
00906       break;
00907     case O_PLUS:    /* both forward and back */
00908       FWD(aft, aft, 1);
00909       i = ISSETBACK(aft, OPND(s));
00910       BACK(aft, aft, OPND(s));
00911       if (!i && ISSETBACK(aft, OPND(s))) {
00912         /* oho, must reconsider loop body */
00913         pc -= OPND(s) + 1;
00914         INIT(here, pc);
00915       }
00916       break;
00917     case OQUEST_:   /* two branches, both forward */
00918       FWD(aft, aft, 1);
00919       FWD(aft, aft, OPND(s));
00920       break;
00921     case O_QUEST:   /* just an empty */
00922       FWD(aft, aft, 1);
00923       break;
00924     case OLPAREN:   /* not significant here */
00925     case ORPAREN:
00926       FWD(aft, aft, 1);
00927       break;
00928     case OCH_:    /* mark the first two branches */
00929       FWD(aft, aft, 1);
00930       assert(OP(g->strip[pc+OPND(s)]) == OOR2);
00931       FWD(aft, aft, OPND(s));
00932       break;
00933     case OOR1:    /* done a branch, find the O_CH */
00934       if (ISSTATEIN(aft, here)) {
00935         for (look = 1;
00936             OP(s = g->strip[pc+look]) != O_CH;
00937             look += OPND(s))
00938           assert(OP(s) == OOR2);
00939         FWD(aft, aft, look);
00940       }
00941       break;
00942     case OOR2:    /* propagate OCH_'s marking */
00943       FWD(aft, aft, 1);
00944       if (OP(g->strip[pc+OPND(s)]) != O_CH) {
00945         assert(OP(g->strip[pc+OPND(s)]) == OOR2);
00946         FWD(aft, aft, OPND(s));
00947       }
00948       break;
00949     case O_CH:    /* just empty */
00950       FWD(aft, aft, 1);
00951       break;
00952     default:    /* ooooops... */
00953       assert(nope);
00954       break;
00955     }
00956   }
00957 
00958   return(aft);
00959 }
00960 
00961 #ifdef REDEBUG
00962 /*
00963  - print - print a set of states
00964  */
00965 static void
00966 print(struct match *m, char *caption, states st, int ch, FILE *d)
00967 {
00968   struct re_guts *g = m->g;
00969   int i;
00970   int first = 1;
00971 
00972   if (!(m->eflags&REG_TRACE))
00973     return;
00974 
00975   (void)fprintf(d, "%s", caption);
00976   if (ch != '\0')
00977     (void)fprintf(d, " %s", pchar(ch));
00978   for (i = 0; i < g->nstates; i++)
00979     if (ISSET(st, i)) {
00980       (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
00981       first = 0;
00982     }
00983   (void)fprintf(d, "\n");
00984 }
00985 
00986 /* 
00987  - at - print current situation
00988  */
00989 static void
00990 at(struct match *m, char *title, char *start, char *stop, sopno startst,
00991     sopno stopst)
00992 {
00993   if (!(m->eflags&REG_TRACE))
00994     return;
00995 
00996   (void)printf("%s %s-", title, pchar(*start));
00997   (void)printf("%s ", pchar(*stop));
00998   (void)printf("%ld-%ld\n", (long)startst, (long)stopst);
00999 }
01000 
01001 #ifndef PCHARDONE
01002 #define PCHARDONE /* never again */
01003 /*
01004  - pchar - make a character printable
01005  *
01006  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
01007  * duplicate here avoids having a debugging-capable regexec.o tied to
01008  * a matching debug.o, and this is convenient.  It all disappears in
01009  * the non-debug compilation anyway, so it doesn't matter much.
01010  */
01011 static char *     /* -> representation */
01012 pchar(int ch)
01013 {
01014   static char pbuf[10];
01015 
01016   if (isprint(ch) || ch == ' ')
01017     (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
01018   else
01019     (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
01020   return(pbuf);
01021 }
01022 #endif
01023 #endif
01024 
01025 #undef  matcher
01026 #undef  fast
01027 #undef  slow
01028 #undef  dissect
01029 #undef  backref
01030 #undef  step
01031 #undef  print
01032 #undef  at
01033 #undef  match
01034 #undef  nope