LLVM API Documentation
00001 /*- 00002 * This code is derived from OpenBSD's libc/regex, original license follows: 00003 * 00004 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 00005 * Copyright (c) 1992, 1993, 1994 00006 * The Regents of the University of California. All rights reserved. 00007 * 00008 * This code is derived from software contributed to Berkeley by 00009 * Henry Spencer. 00010 * 00011 * Redistribution and use in source and binary forms, with or without 00012 * modification, are permitted provided that the following conditions 00013 * are met: 00014 * 1. Redistributions of source code must retain the above copyright 00015 * notice, this list of conditions and the following disclaimer. 00016 * 2. Redistributions in binary form must reproduce the above copyright 00017 * notice, this list of conditions and the following disclaimer in the 00018 * documentation and/or other materials provided with the distribution. 00019 * 3. Neither the name of the University nor the names of its contributors 00020 * may be used to endorse or promote products derived from this software 00021 * without specific prior written permission. 00022 * 00023 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 00024 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00025 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00026 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 00027 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00028 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00029 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00030 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00031 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00032 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00033 * SUCH DAMAGE. 00034 * 00035 * @(#)engine.c 8.5 (Berkeley) 3/20/94 00036 */ 00037 00038 /* 00039 * The matching engine and friends. This file is #included by regexec.c 00040 * after suitable #defines of a variety of macros used herein, so that 00041 * different state representations can be used without duplicating masses 00042 * of code. 00043 */ 00044 00045 #ifdef SNAMES 00046 #define matcher smatcher 00047 #define fast sfast 00048 #define slow sslow 00049 #define dissect sdissect 00050 #define backref sbackref 00051 #define step sstep 00052 #define print sprint 00053 #define at sat 00054 #define match smat 00055 #define nope snope 00056 #endif 00057 #ifdef LNAMES 00058 #define matcher lmatcher 00059 #define fast lfast 00060 #define slow lslow 00061 #define dissect ldissect 00062 #define backref lbackref 00063 #define step lstep 00064 #define print lprint 00065 #define at lat 00066 #define match lmat 00067 #define nope lnope 00068 #endif 00069 00070 /* another structure passed up and down to avoid zillions of parameters */ 00071 struct match { 00072 struct re_guts *g; 00073 int eflags; 00074 llvm_regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 00075 const char *offp; /* offsets work from here */ 00076 const char *beginp; /* start of string -- virtual NUL precedes */ 00077 const char *endp; /* end of string -- virtual NUL here */ 00078 const char *coldp; /* can be no match starting before here */ 00079 const char **lastpos; /* [nplus+1] */ 00080 STATEVARS; 00081 states st; /* current states */ 00082 states fresh; /* states for a fresh start */ 00083 states tmp; /* temporary */ 00084 states empty; /* empty set of states */ 00085 }; 00086 00087 static int matcher(struct re_guts *, const char *, size_t, 00088 llvm_regmatch_t[], int); 00089 static const char *dissect(struct match *, const char *, const char *, sopno, 00090 sopno); 00091 static const char *backref(struct match *, const char *, const char *, sopno, 00092 sopno, sopno, int); 00093 static const char *fast(struct match *, const char *, const char *, sopno, sopno); 00094 static const char *slow(struct match *, const char *, const char *, sopno, sopno); 00095 static states step(struct re_guts *, sopno, sopno, states, int, states); 00096 #define MAX_RECURSION 100 00097 #define BOL (OUT+1) 00098 #define EOL (BOL+1) 00099 #define BOLEOL (BOL+2) 00100 #define NOTHING (BOL+3) 00101 #define BOW (BOL+4) 00102 #define EOW (BOL+5) 00103 #define CODEMAX (BOL+5) /* highest code used */ 00104 #define NONCHAR(c) ((c) > CHAR_MAX) 00105 #define NNONCHAR (CODEMAX-CHAR_MAX) 00106 #ifdef REDEBUG 00107 static void print(struct match *, char *, states, int, FILE *); 00108 #endif 00109 #ifdef REDEBUG 00110 static void at(struct match *, char *, char *, char *, sopno, sopno); 00111 #endif 00112 #ifdef REDEBUG 00113 static char *pchar(int); 00114 #endif 00115 00116 #ifdef REDEBUG 00117 #define SP(t, s, c) print(m, t, s, c, stdout) 00118 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 00119 #define NOTE(str) { if (m->eflags®_TRACE) (void)printf("=%s\n", (str)); } 00120 static int nope = 0; 00121 #else 00122 #define SP(t, s, c) /* nothing */ 00123 #define AT(t, p1, p2, s1, s2) /* nothing */ 00124 #define NOTE(s) /* nothing */ 00125 #endif 00126 00127 /* 00128 - matcher - the actual matching engine 00129 */ 00130 static int /* 0 success, REG_NOMATCH failure */ 00131 matcher(struct re_guts *g, const char *string, size_t nmatch, 00132 llvm_regmatch_t pmatch[], 00133 int eflags) 00134 { 00135 const char *endp; 00136 size_t i; 00137 struct match mv; 00138 struct match *m = &mv; 00139 const char *dp; 00140 const sopno gf = g->firststate+1; /* +1 for OEND */ 00141 const sopno gl = g->laststate; 00142 const char *start; 00143 const char *stop; 00144 00145 /* simplify the situation where possible */ 00146 if (g->cflags®_NOSUB) 00147 nmatch = 0; 00148 if (eflags®_STARTEND) { 00149 start = string + pmatch[0].rm_so; 00150 stop = string + pmatch[0].rm_eo; 00151 } else { 00152 start = string; 00153 stop = start + strlen(start); 00154 } 00155 if (stop < start) 00156 return(REG_INVARG); 00157 00158 /* prescreening; this does wonders for this rather slow code */ 00159 if (g->must != NULL) { 00160 for (dp = start; dp < stop; dp++) 00161 if (*dp == g->must[0] && stop - dp >= g->mlen && 00162 memcmp(dp, g->must, (size_t)g->mlen) == 0) 00163 break; 00164 if (dp == stop) /* we didn't find g->must */ 00165 return(REG_NOMATCH); 00166 } 00167 00168 /* match struct setup */ 00169 m->g = g; 00170 m->eflags = eflags; 00171 m->pmatch = NULL; 00172 m->lastpos = NULL; 00173 m->offp = string; 00174 m->beginp = start; 00175 m->endp = stop; 00176 STATESETUP(m, 4); 00177 SETUP(m->st); 00178 SETUP(m->fresh); 00179 SETUP(m->tmp); 00180 SETUP(m->empty); 00181 CLEAR(m->empty); 00182 00183 /* this loop does only one repetition except for backrefs */ 00184 for (;;) { 00185 endp = fast(m, start, stop, gf, gl); 00186 if (endp == NULL) { /* a miss */ 00187 free(m->pmatch); 00188 free((void*)m->lastpos); 00189 STATETEARDOWN(m); 00190 return(REG_NOMATCH); 00191 } 00192 if (nmatch == 0 && !g->backrefs) 00193 break; /* no further info needed */ 00194 00195 /* where? */ 00196 assert(m->coldp != NULL); 00197 for (;;) { 00198 NOTE("finding start"); 00199 endp = slow(m, m->coldp, stop, gf, gl); 00200 if (endp != NULL) 00201 break; 00202 assert(m->coldp < m->endp); 00203 m->coldp++; 00204 } 00205 if (nmatch == 1 && !g->backrefs) 00206 break; /* no further info needed */ 00207 00208 /* oh my, they want the subexpressions... */ 00209 if (m->pmatch == NULL) 00210 m->pmatch = (llvm_regmatch_t *)malloc((m->g->nsub + 1) * 00211 sizeof(llvm_regmatch_t)); 00212 if (m->pmatch == NULL) { 00213 STATETEARDOWN(m); 00214 return(REG_ESPACE); 00215 } 00216 for (i = 1; i <= m->g->nsub; i++) 00217 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; 00218 if (!g->backrefs && !(m->eflags®_BACKR)) { 00219 NOTE("dissecting"); 00220 dp = dissect(m, m->coldp, endp, gf, gl); 00221 } else { 00222 if (g->nplus > 0 && m->lastpos == NULL) 00223 m->lastpos = (const char **)malloc((g->nplus+1) * 00224 sizeof(char *)); 00225 if (g->nplus > 0 && m->lastpos == NULL) { 00226 free(m->pmatch); 00227 STATETEARDOWN(m); 00228 return(REG_ESPACE); 00229 } 00230 NOTE("backref dissect"); 00231 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 00232 } 00233 if (dp != NULL) 00234 break; 00235 00236 /* uh-oh... we couldn't find a subexpression-level match */ 00237 assert(g->backrefs); /* must be back references doing it */ 00238 assert(g->nplus == 0 || m->lastpos != NULL); 00239 for (;;) { 00240 if (dp != NULL || endp <= m->coldp) 00241 break; /* defeat */ 00242 NOTE("backoff"); 00243 endp = slow(m, m->coldp, endp-1, gf, gl); 00244 if (endp == NULL) 00245 break; /* defeat */ 00246 /* try it on a shorter possibility */ 00247 #ifndef NDEBUG 00248 for (i = 1; i <= m->g->nsub; i++) { 00249 assert(m->pmatch[i].rm_so == -1); 00250 assert(m->pmatch[i].rm_eo == -1); 00251 } 00252 #endif 00253 NOTE("backoff dissect"); 00254 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); 00255 } 00256 assert(dp == NULL || dp == endp); 00257 if (dp != NULL) /* found a shorter one */ 00258 break; 00259 00260 /* despite initial appearances, there is no match here */ 00261 NOTE("false alarm"); 00262 if (m->coldp == stop) 00263 break; 00264 start = m->coldp + 1; /* recycle starting later */ 00265 } 00266 00267 /* fill in the details if requested */ 00268 if (nmatch > 0) { 00269 pmatch[0].rm_so = m->coldp - m->offp; 00270 pmatch[0].rm_eo = endp - m->offp; 00271 } 00272 if (nmatch > 1) { 00273 assert(m->pmatch != NULL); 00274 for (i = 1; i < nmatch; i++) 00275 if (i <= m->g->nsub) 00276 pmatch[i] = m->pmatch[i]; 00277 else { 00278 pmatch[i].rm_so = -1; 00279 pmatch[i].rm_eo = -1; 00280 } 00281 } 00282 00283 if (m->pmatch != NULL) 00284 free((char *)m->pmatch); 00285 if (m->lastpos != NULL) 00286 free((char *)m->lastpos); 00287 STATETEARDOWN(m); 00288 return(0); 00289 } 00290 00291 /* 00292 - dissect - figure out what matched what, no back references 00293 */ 00294 static const char * /* == stop (success) always */ 00295 dissect(struct match *m, const char *start, const char *stop, sopno startst, 00296 sopno stopst) 00297 { 00298 int i; 00299 sopno ss; /* start sop of current subRE */ 00300 sopno es; /* end sop of current subRE */ 00301 const char *sp; /* start of string matched by it */ 00302 const char *stp; /* string matched by it cannot pass here */ 00303 const char *rest; /* start of rest of string */ 00304 const char *tail; /* string unmatched by rest of RE */ 00305 sopno ssub; /* start sop of subsubRE */ 00306 sopno esub; /* end sop of subsubRE */ 00307 const char *ssp; /* start of string matched by subsubRE */ 00308 const char *sep; /* end of string matched by subsubRE */ 00309 const char *oldssp; /* previous ssp */ 00310 00311 AT("diss", start, stop, startst, stopst); 00312 sp = start; 00313 for (ss = startst; ss < stopst; ss = es) { 00314 /* identify end of subRE */ 00315 es = ss; 00316 switch (OP(m->g->strip[es])) { 00317 case OPLUS_: 00318 case OQUEST_: 00319 es += OPND(m->g->strip[es]); 00320 break; 00321 case OCH_: 00322 while (OP(m->g->strip[es]) != O_CH) 00323 es += OPND(m->g->strip[es]); 00324 break; 00325 } 00326 es++; 00327 00328 /* figure out what it matched */ 00329 switch (OP(m->g->strip[ss])) { 00330 case OEND: 00331 assert(nope); 00332 break; 00333 case OCHAR: 00334 sp++; 00335 break; 00336 case OBOL: 00337 case OEOL: 00338 case OBOW: 00339 case OEOW: 00340 break; 00341 case OANY: 00342 case OANYOF: 00343 sp++; 00344 break; 00345 case OBACK_: 00346 case O_BACK: 00347 assert(nope); 00348 break; 00349 /* cases where length of match is hard to find */ 00350 case OQUEST_: 00351 stp = stop; 00352 for (;;) { 00353 /* how long could this one be? */ 00354 rest = slow(m, sp, stp, ss, es); 00355 assert(rest != NULL); /* it did match */ 00356 /* could the rest match the rest? */ 00357 tail = slow(m, rest, stop, es, stopst); 00358 if (tail == stop) 00359 break; /* yes! */ 00360 /* no -- try a shorter match for this one */ 00361 stp = rest - 1; 00362 assert(stp >= sp); /* it did work */ 00363 } 00364 ssub = ss + 1; 00365 esub = es - 1; 00366 /* did innards match? */ 00367 if (slow(m, sp, rest, ssub, esub) != NULL) { 00368 const char *dp = dissect(m, sp, rest, ssub, esub); 00369 (void)dp; /* avoid warning if assertions off */ 00370 assert(dp == rest); 00371 } else /* no */ 00372 assert(sp == rest); 00373 sp = rest; 00374 break; 00375 case OPLUS_: 00376 stp = stop; 00377 for (;;) { 00378 /* how long could this one be? */ 00379 rest = slow(m, sp, stp, ss, es); 00380 assert(rest != NULL); /* it did match */ 00381 /* could the rest match the rest? */ 00382 tail = slow(m, rest, stop, es, stopst); 00383 if (tail == stop) 00384 break; /* yes! */ 00385 /* no -- try a shorter match for this one */ 00386 stp = rest - 1; 00387 assert(stp >= sp); /* it did work */ 00388 } 00389 ssub = ss + 1; 00390 esub = es - 1; 00391 ssp = sp; 00392 oldssp = ssp; 00393 for (;;) { /* find last match of innards */ 00394 sep = slow(m, ssp, rest, ssub, esub); 00395 if (sep == NULL || sep == ssp) 00396 break; /* failed or matched null */ 00397 oldssp = ssp; /* on to next try */ 00398 ssp = sep; 00399 } 00400 if (sep == NULL) { 00401 /* last successful match */ 00402 sep = ssp; 00403 ssp = oldssp; 00404 } 00405 assert(sep == rest); /* must exhaust substring */ 00406 assert(slow(m, ssp, sep, ssub, esub) == rest); 00407 { 00408 const char *dp = dissect(m, ssp, sep, ssub, esub); 00409 (void)dp; /* avoid warning if assertions off */ 00410 assert(dp == sep); 00411 } 00412 sp = rest; 00413 break; 00414 case OCH_: 00415 stp = stop; 00416 for (;;) { 00417 /* how long could this one be? */ 00418 rest = slow(m, sp, stp, ss, es); 00419 assert(rest != NULL); /* it did match */ 00420 /* could the rest match the rest? */ 00421 tail = slow(m, rest, stop, es, stopst); 00422 if (tail == stop) 00423 break; /* yes! */ 00424 /* no -- try a shorter match for this one */ 00425 stp = rest - 1; 00426 assert(stp >= sp); /* it did work */ 00427 } 00428 ssub = ss + 1; 00429 esub = ss + OPND(m->g->strip[ss]) - 1; 00430 assert(OP(m->g->strip[esub]) == OOR1); 00431 for (;;) { /* find first matching branch */ 00432 if (slow(m, sp, rest, ssub, esub) == rest) 00433 break; /* it matched all of it */ 00434 /* that one missed, try next one */ 00435 assert(OP(m->g->strip[esub]) == OOR1); 00436 esub++; 00437 assert(OP(m->g->strip[esub]) == OOR2); 00438 ssub = esub + 1; 00439 esub += OPND(m->g->strip[esub]); 00440 if (OP(m->g->strip[esub]) == OOR2) 00441 esub--; 00442 else 00443 assert(OP(m->g->strip[esub]) == O_CH); 00444 } 00445 { 00446 const char *dp = dissect(m, sp, rest, ssub, esub); 00447 (void)dp; /* avoid warning if assertions off */ 00448 assert(dp == rest); 00449 } 00450 sp = rest; 00451 break; 00452 case O_PLUS: 00453 case O_QUEST: 00454 case OOR1: 00455 case OOR2: 00456 case O_CH: 00457 assert(nope); 00458 break; 00459 case OLPAREN: 00460 i = OPND(m->g->strip[ss]); 00461 assert(0 < i && i <= m->g->nsub); 00462 m->pmatch[i].rm_so = sp - m->offp; 00463 break; 00464 case ORPAREN: 00465 i = OPND(m->g->strip[ss]); 00466 assert(0 < i && i <= m->g->nsub); 00467 m->pmatch[i].rm_eo = sp - m->offp; 00468 break; 00469 default: /* uh oh */ 00470 assert(nope); 00471 break; 00472 } 00473 } 00474 00475 assert(sp == stop); 00476 return(sp); 00477 } 00478 00479 /* 00480 - backref - figure out what matched what, figuring in back references 00481 */ 00482 static const char * /* == stop (success) or NULL (failure) */ 00483 backref(struct match *m, const char *start, const char *stop, sopno startst, 00484 sopno stopst, sopno lev, int rec) /* PLUS nesting level */ 00485 { 00486 int i; 00487 sopno ss; /* start sop of current subRE */ 00488 const char *sp; /* start of string matched by it */ 00489 sopno ssub; /* start sop of subsubRE */ 00490 sopno esub; /* end sop of subsubRE */ 00491 const char *ssp; /* start of string matched by subsubRE */ 00492 const char *dp; 00493 size_t len; 00494 int hard; 00495 sop s; 00496 llvm_regoff_t offsave; 00497 cset *cs; 00498 00499 AT("back", start, stop, startst, stopst); 00500 sp = start; 00501 00502 /* get as far as we can with easy stuff */ 00503 hard = 0; 00504 for (ss = startst; !hard && ss < stopst; ss++) 00505 switch (OP(s = m->g->strip[ss])) { 00506 case OCHAR: 00507 if (sp == stop || *sp++ != (char)OPND(s)) 00508 return(NULL); 00509 break; 00510 case OANY: 00511 if (sp == stop) 00512 return(NULL); 00513 sp++; 00514 break; 00515 case OANYOF: 00516 cs = &m->g->sets[OPND(s)]; 00517 if (sp == stop || !CHIN(cs, *sp++)) 00518 return(NULL); 00519 break; 00520 case OBOL: 00521 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 00522 (sp < m->endp && *(sp-1) == '\n' && 00523 (m->g->cflags®_NEWLINE)) ) 00524 { /* yes */ } 00525 else 00526 return(NULL); 00527 break; 00528 case OEOL: 00529 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 00530 (sp < m->endp && *sp == '\n' && 00531 (m->g->cflags®_NEWLINE)) ) 00532 { /* yes */ } 00533 else 00534 return(NULL); 00535 break; 00536 case OBOW: 00537 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 00538 (sp < m->endp && *(sp-1) == '\n' && 00539 (m->g->cflags®_NEWLINE)) || 00540 (sp > m->beginp && 00541 !ISWORD(*(sp-1))) ) && 00542 (sp < m->endp && ISWORD(*sp)) ) 00543 { /* yes */ } 00544 else 00545 return(NULL); 00546 break; 00547 case OEOW: 00548 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 00549 (sp < m->endp && *sp == '\n' && 00550 (m->g->cflags®_NEWLINE)) || 00551 (sp < m->endp && !ISWORD(*sp)) ) && 00552 (sp > m->beginp && ISWORD(*(sp-1))) ) 00553 { /* yes */ } 00554 else 00555 return(NULL); 00556 break; 00557 case O_QUEST: 00558 break; 00559 case OOR1: /* matches null but needs to skip */ 00560 ss++; 00561 s = m->g->strip[ss]; 00562 do { 00563 assert(OP(s) == OOR2); 00564 ss += OPND(s); 00565 } while (OP(s = m->g->strip[ss]) != O_CH); 00566 /* note that the ss++ gets us past the O_CH */ 00567 break; 00568 default: /* have to make a choice */ 00569 hard = 1; 00570 break; 00571 } 00572 if (!hard) { /* that was it! */ 00573 if (sp != stop) 00574 return(NULL); 00575 return(sp); 00576 } 00577 ss--; /* adjust for the for's final increment */ 00578 00579 /* the hard stuff */ 00580 AT("hard", sp, stop, ss, stopst); 00581 s = m->g->strip[ss]; 00582 switch (OP(s)) { 00583 case OBACK_: /* the vilest depths */ 00584 i = OPND(s); 00585 assert(0 < i && i <= m->g->nsub); 00586 if (m->pmatch[i].rm_eo == -1) 00587 return(NULL); 00588 assert(m->pmatch[i].rm_so != -1); 00589 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; 00590 if (len == 0 && rec++ > MAX_RECURSION) 00591 return(NULL); 00592 assert(stop - m->beginp >= len); 00593 if (sp > stop - len) 00594 return(NULL); /* not enough left to match */ 00595 ssp = m->offp + m->pmatch[i].rm_so; 00596 if (memcmp(sp, ssp, len) != 0) 00597 return(NULL); 00598 while (m->g->strip[ss] != SOP(O_BACK, i)) 00599 ss++; 00600 return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); 00601 break; 00602 case OQUEST_: /* to null or not */ 00603 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 00604 if (dp != NULL) 00605 return(dp); /* not */ 00606 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); 00607 break; 00608 case OPLUS_: 00609 assert(m->lastpos != NULL); 00610 assert(lev+1 <= m->g->nplus); 00611 m->lastpos[lev+1] = sp; 00612 return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); 00613 break; 00614 case O_PLUS: 00615 if (sp == m->lastpos[lev]) /* last pass matched null */ 00616 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 00617 /* try another pass */ 00618 m->lastpos[lev] = sp; 00619 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); 00620 if (dp == NULL) 00621 return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); 00622 else 00623 return(dp); 00624 break; 00625 case OCH_: /* find the right one, if any */ 00626 ssub = ss + 1; 00627 esub = ss + OPND(s) - 1; 00628 assert(OP(m->g->strip[esub]) == OOR1); 00629 for (;;) { /* find first matching branch */ 00630 dp = backref(m, sp, stop, ssub, esub, lev, rec); 00631 if (dp != NULL) 00632 return(dp); 00633 /* that one missed, try next one */ 00634 if (OP(m->g->strip[esub]) == O_CH) 00635 return(NULL); /* there is none */ 00636 esub++; 00637 assert(OP(m->g->strip[esub]) == OOR2); 00638 ssub = esub + 1; 00639 esub += OPND(m->g->strip[esub]); 00640 if (OP(m->g->strip[esub]) == OOR2) 00641 esub--; 00642 else 00643 assert(OP(m->g->strip[esub]) == O_CH); 00644 } 00645 break; 00646 case OLPAREN: /* must undo assignment if rest fails */ 00647 i = OPND(s); 00648 assert(0 < i && i <= m->g->nsub); 00649 offsave = m->pmatch[i].rm_so; 00650 m->pmatch[i].rm_so = sp - m->offp; 00651 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 00652 if (dp != NULL) 00653 return(dp); 00654 m->pmatch[i].rm_so = offsave; 00655 return(NULL); 00656 break; 00657 case ORPAREN: /* must undo assignment if rest fails */ 00658 i = OPND(s); 00659 assert(0 < i && i <= m->g->nsub); 00660 offsave = m->pmatch[i].rm_eo; 00661 m->pmatch[i].rm_eo = sp - m->offp; 00662 dp = backref(m, sp, stop, ss+1, stopst, lev, rec); 00663 if (dp != NULL) 00664 return(dp); 00665 m->pmatch[i].rm_eo = offsave; 00666 return(NULL); 00667 break; 00668 default: /* uh oh */ 00669 assert(nope); 00670 break; 00671 } 00672 00673 /* "can't happen" */ 00674 assert(nope); 00675 /* NOTREACHED */ 00676 return NULL; 00677 } 00678 00679 /* 00680 - fast - step through the string at top speed 00681 */ 00682 static const char * /* where tentative match ended, or NULL */ 00683 fast(struct match *m, const char *start, const char *stop, sopno startst, 00684 sopno stopst) 00685 { 00686 states st = m->st; 00687 states fresh = m->fresh; 00688 states tmp = m->tmp; 00689 const char *p = start; 00690 int c = (start == m->beginp) ? OUT : *(start-1); 00691 int lastc; /* previous c */ 00692 int flagch; 00693 int i; 00694 const char *coldp; /* last p after which no match was underway */ 00695 00696 CLEAR(st); 00697 SET1(st, startst); 00698 st = step(m->g, startst, stopst, st, NOTHING, st); 00699 ASSIGN(fresh, st); 00700 SP("start", st, *p); 00701 coldp = NULL; 00702 for (;;) { 00703 /* next character */ 00704 lastc = c; 00705 c = (p == m->endp) ? OUT : *p; 00706 if (EQ(st, fresh)) 00707 coldp = p; 00708 00709 /* is there an EOL and/or BOL between lastc and c? */ 00710 flagch = '\0'; 00711 i = 0; 00712 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 00713 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 00714 flagch = BOL; 00715 i = m->g->nbol; 00716 } 00717 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 00718 (c == OUT && !(m->eflags®_NOTEOL)) ) { 00719 flagch = (flagch == BOL) ? BOLEOL : EOL; 00720 i += m->g->neol; 00721 } 00722 if (i != 0) { 00723 for (; i > 0; i--) 00724 st = step(m->g, startst, stopst, st, flagch, st); 00725 SP("boleol", st, c); 00726 } 00727 00728 /* how about a word boundary? */ 00729 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 00730 (c != OUT && ISWORD(c)) ) { 00731 flagch = BOW; 00732 } 00733 if ( (lastc != OUT && ISWORD(lastc)) && 00734 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 00735 flagch = EOW; 00736 } 00737 if (flagch == BOW || flagch == EOW) { 00738 st = step(m->g, startst, stopst, st, flagch, st); 00739 SP("boweow", st, c); 00740 } 00741 00742 /* are we done? */ 00743 if (ISSET(st, stopst) || p == stop) 00744 break; /* NOTE BREAK OUT */ 00745 00746 /* no, we must deal with this character */ 00747 ASSIGN(tmp, st); 00748 ASSIGN(st, fresh); 00749 assert(c != OUT); 00750 st = step(m->g, startst, stopst, tmp, c, st); 00751 SP("aft", st, c); 00752 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 00753 p++; 00754 } 00755 00756 assert(coldp != NULL); 00757 m->coldp = coldp; 00758 if (ISSET(st, stopst)) 00759 return(p+1); 00760 else 00761 return(NULL); 00762 } 00763 00764 /* 00765 - slow - step through the string more deliberately 00766 */ 00767 static const char * /* where it ended */ 00768 slow(struct match *m, const char *start, const char *stop, sopno startst, 00769 sopno stopst) 00770 { 00771 states st = m->st; 00772 states empty = m->empty; 00773 states tmp = m->tmp; 00774 const char *p = start; 00775 int c = (start == m->beginp) ? OUT : *(start-1); 00776 int lastc; /* previous c */ 00777 int flagch; 00778 int i; 00779 const char *matchp; /* last p at which a match ended */ 00780 00781 AT("slow", start, stop, startst, stopst); 00782 CLEAR(st); 00783 SET1(st, startst); 00784 SP("sstart", st, *p); 00785 st = step(m->g, startst, stopst, st, NOTHING, st); 00786 matchp = NULL; 00787 for (;;) { 00788 /* next character */ 00789 lastc = c; 00790 c = (p == m->endp) ? OUT : *p; 00791 00792 /* is there an EOL and/or BOL between lastc and c? */ 00793 flagch = '\0'; 00794 i = 0; 00795 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 00796 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 00797 flagch = BOL; 00798 i = m->g->nbol; 00799 } 00800 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 00801 (c == OUT && !(m->eflags®_NOTEOL)) ) { 00802 flagch = (flagch == BOL) ? BOLEOL : EOL; 00803 i += m->g->neol; 00804 } 00805 if (i != 0) { 00806 for (; i > 0; i--) 00807 st = step(m->g, startst, stopst, st, flagch, st); 00808 SP("sboleol", st, c); 00809 } 00810 00811 /* how about a word boundary? */ 00812 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 00813 (c != OUT && ISWORD(c)) ) { 00814 flagch = BOW; 00815 } 00816 if ( (lastc != OUT && ISWORD(lastc)) && 00817 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 00818 flagch = EOW; 00819 } 00820 if (flagch == BOW || flagch == EOW) { 00821 st = step(m->g, startst, stopst, st, flagch, st); 00822 SP("sboweow", st, c); 00823 } 00824 00825 /* are we done? */ 00826 if (ISSET(st, stopst)) 00827 matchp = p; 00828 if (EQ(st, empty) || p == stop) 00829 break; /* NOTE BREAK OUT */ 00830 00831 /* no, we must deal with this character */ 00832 ASSIGN(tmp, st); 00833 ASSIGN(st, empty); 00834 assert(c != OUT); 00835 st = step(m->g, startst, stopst, tmp, c, st); 00836 SP("saft", st, c); 00837 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 00838 p++; 00839 } 00840 00841 return(matchp); 00842 } 00843 00844 00845 /* 00846 - step - map set of states reachable before char to set reachable after 00847 */ 00848 static states 00849 step(struct re_guts *g, 00850 sopno start, /* start state within strip */ 00851 sopno stop, /* state after stop state within strip */ 00852 states bef, /* states reachable before */ 00853 int ch, /* character or NONCHAR code */ 00854 states aft) /* states already known reachable after */ 00855 { 00856 cset *cs; 00857 sop s; 00858 sopno pc; 00859 onestate here; /* note, macros know this name */ 00860 sopno look; 00861 int i; 00862 00863 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 00864 s = g->strip[pc]; 00865 switch (OP(s)) { 00866 case OEND: 00867 assert(pc == stop-1); 00868 break; 00869 case OCHAR: 00870 /* only characters can match */ 00871 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 00872 if (ch == (char)OPND(s)) 00873 FWD(aft, bef, 1); 00874 break; 00875 case OBOL: 00876 if (ch == BOL || ch == BOLEOL) 00877 FWD(aft, bef, 1); 00878 break; 00879 case OEOL: 00880 if (ch == EOL || ch == BOLEOL) 00881 FWD(aft, bef, 1); 00882 break; 00883 case OBOW: 00884 if (ch == BOW) 00885 FWD(aft, bef, 1); 00886 break; 00887 case OEOW: 00888 if (ch == EOW) 00889 FWD(aft, bef, 1); 00890 break; 00891 case OANY: 00892 if (!NONCHAR(ch)) 00893 FWD(aft, bef, 1); 00894 break; 00895 case OANYOF: 00896 cs = &g->sets[OPND(s)]; 00897 if (!NONCHAR(ch) && CHIN(cs, ch)) 00898 FWD(aft, bef, 1); 00899 break; 00900 case OBACK_: /* ignored here */ 00901 case O_BACK: 00902 FWD(aft, aft, 1); 00903 break; 00904 case OPLUS_: /* forward, this is just an empty */ 00905 FWD(aft, aft, 1); 00906 break; 00907 case O_PLUS: /* both forward and back */ 00908 FWD(aft, aft, 1); 00909 i = ISSETBACK(aft, OPND(s)); 00910 BACK(aft, aft, OPND(s)); 00911 if (!i && ISSETBACK(aft, OPND(s))) { 00912 /* oho, must reconsider loop body */ 00913 pc -= OPND(s) + 1; 00914 INIT(here, pc); 00915 } 00916 break; 00917 case OQUEST_: /* two branches, both forward */ 00918 FWD(aft, aft, 1); 00919 FWD(aft, aft, OPND(s)); 00920 break; 00921 case O_QUEST: /* just an empty */ 00922 FWD(aft, aft, 1); 00923 break; 00924 case OLPAREN: /* not significant here */ 00925 case ORPAREN: 00926 FWD(aft, aft, 1); 00927 break; 00928 case OCH_: /* mark the first two branches */ 00929 FWD(aft, aft, 1); 00930 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 00931 FWD(aft, aft, OPND(s)); 00932 break; 00933 case OOR1: /* done a branch, find the O_CH */ 00934 if (ISSTATEIN(aft, here)) { 00935 for (look = 1; 00936 OP(s = g->strip[pc+look]) != O_CH; 00937 look += OPND(s)) 00938 assert(OP(s) == OOR2); 00939 FWD(aft, aft, look); 00940 } 00941 break; 00942 case OOR2: /* propagate OCH_'s marking */ 00943 FWD(aft, aft, 1); 00944 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 00945 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 00946 FWD(aft, aft, OPND(s)); 00947 } 00948 break; 00949 case O_CH: /* just empty */ 00950 FWD(aft, aft, 1); 00951 break; 00952 default: /* ooooops... */ 00953 assert(nope); 00954 break; 00955 } 00956 } 00957 00958 return(aft); 00959 } 00960 00961 #ifdef REDEBUG 00962 /* 00963 - print - print a set of states 00964 */ 00965 static void 00966 print(struct match *m, char *caption, states st, int ch, FILE *d) 00967 { 00968 struct re_guts *g = m->g; 00969 int i; 00970 int first = 1; 00971 00972 if (!(m->eflags®_TRACE)) 00973 return; 00974 00975 (void)fprintf(d, "%s", caption); 00976 if (ch != '\0') 00977 (void)fprintf(d, " %s", pchar(ch)); 00978 for (i = 0; i < g->nstates; i++) 00979 if (ISSET(st, i)) { 00980 (void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 00981 first = 0; 00982 } 00983 (void)fprintf(d, "\n"); 00984 } 00985 00986 /* 00987 - at - print current situation 00988 */ 00989 static void 00990 at(struct match *m, char *title, char *start, char *stop, sopno startst, 00991 sopno stopst) 00992 { 00993 if (!(m->eflags®_TRACE)) 00994 return; 00995 00996 (void)printf("%s %s-", title, pchar(*start)); 00997 (void)printf("%s ", pchar(*stop)); 00998 (void)printf("%ld-%ld\n", (long)startst, (long)stopst); 00999 } 01000 01001 #ifndef PCHARDONE 01002 #define PCHARDONE /* never again */ 01003 /* 01004 - pchar - make a character printable 01005 * 01006 * Is this identical to regchar() over in debug.c? Well, yes. But a 01007 * duplicate here avoids having a debugging-capable regexec.o tied to 01008 * a matching debug.o, and this is convenient. It all disappears in 01009 * the non-debug compilation anyway, so it doesn't matter much. 01010 */ 01011 static char * /* -> representation */ 01012 pchar(int ch) 01013 { 01014 static char pbuf[10]; 01015 01016 if (isprint(ch) || ch == ' ') 01017 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 01018 else 01019 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 01020 return(pbuf); 01021 } 01022 #endif 01023 #endif 01024 01025 #undef matcher 01026 #undef fast 01027 #undef slow 01028 #undef dissect 01029 #undef backref 01030 #undef step 01031 #undef print 01032 #undef at 01033 #undef match 01034 #undef nope