LLVM API Documentation
00001 /*- 00002 * This code is derived from OpenBSD's libc/regex, original license follows: 00003 * 00004 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 00005 * Copyright (c) 1992, 1993, 1994 00006 * The Regents of the University of California. All rights reserved. 00007 * 00008 * This code is derived from software contributed to Berkeley by 00009 * Henry Spencer. 00010 * 00011 * Redistribution and use in source and binary forms, with or without 00012 * modification, are permitted provided that the following conditions 00013 * are met: 00014 * 1. Redistributions of source code must retain the above copyright 00015 * notice, this list of conditions and the following disclaimer. 00016 * 2. Redistributions in binary form must reproduce the above copyright 00017 * notice, this list of conditions and the following disclaimer in the 00018 * documentation and/or other materials provided with the distribution. 00019 * 3. Neither the name of the University nor the names of its contributors 00020 * may be used to endorse or promote products derived from this software 00021 * without specific prior written permission. 00022 * 00023 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 00024 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00025 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00026 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 00027 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00028 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00029 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00030 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00031 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00032 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00033 * SUCH DAMAGE. 00034 * 00035 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 00036 */ 00037 00038 #include <sys/types.h> 00039 #include <stdio.h> 00040 #include <string.h> 00041 #include <ctype.h> 00042 #include <limits.h> 00043 #include <stdlib.h> 00044 #include "regex_impl.h" 00045 00046 #include "regutils.h" 00047 #include "regex2.h" 00048 00049 #include "regcclass.h" 00050 #include "regcname.h" 00051 00052 /* 00053 * parse structure, passed up and down to avoid global variables and 00054 * other clumsinesses 00055 */ 00056 struct parse { 00057 char *next; /* next character in RE */ 00058 char *end; /* end of string (-> NUL normally) */ 00059 int error; /* has an error been seen? */ 00060 sop *strip; /* malloced strip */ 00061 sopno ssize; /* malloced strip size (allocated) */ 00062 sopno slen; /* malloced strip length (used) */ 00063 int ncsalloc; /* number of csets allocated */ 00064 struct re_guts *g; 00065 # define NPAREN 10 /* we need to remember () 1-9 for back refs */ 00066 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 00067 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 00068 }; 00069 00070 static void p_ere(struct parse *, int); 00071 static void p_ere_exp(struct parse *); 00072 static void p_str(struct parse *); 00073 static void p_bre(struct parse *, int, int); 00074 static int p_simp_re(struct parse *, int); 00075 static int p_count(struct parse *); 00076 static void p_bracket(struct parse *); 00077 static void p_b_term(struct parse *, cset *); 00078 static void p_b_cclass(struct parse *, cset *); 00079 static void p_b_eclass(struct parse *, cset *); 00080 static char p_b_symbol(struct parse *); 00081 static char p_b_coll_elem(struct parse *, int); 00082 static char othercase(int); 00083 static void bothcases(struct parse *, int); 00084 static void ordinary(struct parse *, int); 00085 static void nonnewline(struct parse *); 00086 static void repeat(struct parse *, sopno, int, int); 00087 static int seterr(struct parse *, int); 00088 static cset *allocset(struct parse *); 00089 static void freeset(struct parse *, cset *); 00090 static int freezeset(struct parse *, cset *); 00091 static int firstch(struct parse *, cset *); 00092 static int nch(struct parse *, cset *); 00093 static void mcadd(struct parse *, cset *, const char *); 00094 static void mcinvert(struct parse *, cset *); 00095 static void mccase(struct parse *, cset *); 00096 static int isinsets(struct re_guts *, int); 00097 static int samesets(struct re_guts *, int, int); 00098 static void categorize(struct parse *, struct re_guts *); 00099 static sopno dupl(struct parse *, sopno, sopno); 00100 static void doemit(struct parse *, sop, size_t); 00101 static void doinsert(struct parse *, sop, size_t, sopno); 00102 static void dofwd(struct parse *, sopno, sop); 00103 static void enlarge(struct parse *, sopno); 00104 static void stripsnug(struct parse *, struct re_guts *); 00105 static void findmust(struct parse *, struct re_guts *); 00106 static sopno pluscount(struct parse *, struct re_guts *); 00107 00108 static char nuls[10]; /* place to point scanner in event of error */ 00109 00110 /* 00111 * macros for use with parse structure 00112 * BEWARE: these know that the parse structure is named `p' !!! 00113 */ 00114 #define PEEK() (*p->next) 00115 #define PEEK2() (*(p->next+1)) 00116 #define MORE() (p->next < p->end) 00117 #define MORE2() (p->next+1 < p->end) 00118 #define SEE(c) (MORE() && PEEK() == (c)) 00119 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 00120 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 00121 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 00122 #define NEXT() (p->next++) 00123 #define NEXT2() (p->next += 2) 00124 #define NEXTn(n) (p->next += (n)) 00125 #define GETNEXT() (*p->next++) 00126 #define SETERROR(e) seterr(p, (e)) 00127 #define REQUIRE(co, e) (void)((co) || SETERROR(e)) 00128 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 00129 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) 00130 #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 00131 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) 00132 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 00133 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 00134 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 00135 #define HERE() (p->slen) 00136 #define THERE() (p->slen - 1) 00137 #define THERETHERE() (p->slen - 2) 00138 #define DROP(n) (p->slen -= (n)) 00139 00140 #ifdef _POSIX2_RE_DUP_MAX 00141 #define DUPMAX _POSIX2_RE_DUP_MAX 00142 #else 00143 #define DUPMAX 255 00144 #endif 00145 #define INFINITY (DUPMAX + 1) 00146 00147 #ifndef NDEBUG 00148 static int never = 0; /* for use in asserts; shuts lint up */ 00149 #else 00150 #define never 0 /* some <assert.h>s have bugs too */ 00151 #endif 00152 00153 /* 00154 - llvm_regcomp - interface for parser and compilation 00155 */ 00156 int /* 0 success, otherwise REG_something */ 00157 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) 00158 { 00159 struct parse pa; 00160 struct re_guts *g; 00161 struct parse *p = &pa; 00162 int i; 00163 size_t len; 00164 #ifdef REDEBUG 00165 # define GOODFLAGS(f) (f) 00166 #else 00167 # define GOODFLAGS(f) ((f)&~REG_DUMP) 00168 #endif 00169 00170 cflags = GOODFLAGS(cflags); 00171 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 00172 return(REG_INVARG); 00173 00174 if (cflags®_PEND) { 00175 if (preg->re_endp < pattern) 00176 return(REG_INVARG); 00177 len = preg->re_endp - pattern; 00178 } else 00179 len = strlen((const char *)pattern); 00180 00181 /* do the mallocs early so failure handling is easy */ 00182 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 00183 (NC-1)*sizeof(cat_t)); 00184 if (g == NULL) 00185 return(REG_ESPACE); 00186 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 00187 p->strip = (sop *)calloc(p->ssize, sizeof(sop)); 00188 p->slen = 0; 00189 if (p->strip == NULL) { 00190 free((char *)g); 00191 return(REG_ESPACE); 00192 } 00193 00194 /* set things up */ 00195 p->g = g; 00196 p->next = (char *)pattern; /* convenience; we do not modify it */ 00197 p->end = p->next + len; 00198 p->error = 0; 00199 p->ncsalloc = 0; 00200 for (i = 0; i < NPAREN; i++) { 00201 p->pbegin[i] = 0; 00202 p->pend[i] = 0; 00203 } 00204 g->csetsize = NC; 00205 g->sets = NULL; 00206 g->setbits = NULL; 00207 g->ncsets = 0; 00208 g->cflags = cflags; 00209 g->iflags = 0; 00210 g->nbol = 0; 00211 g->neol = 0; 00212 g->must = NULL; 00213 g->mlen = 0; 00214 g->nsub = 0; 00215 g->ncategories = 1; /* category 0 is "everything else" */ 00216 g->categories = &g->catspace[-(CHAR_MIN)]; 00217 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 00218 g->backrefs = 0; 00219 00220 /* do it */ 00221 EMIT(OEND, 0); 00222 g->firststate = THERE(); 00223 if (cflags®_EXTENDED) 00224 p_ere(p, OUT); 00225 else if (cflags®_NOSPEC) 00226 p_str(p); 00227 else 00228 p_bre(p, OUT, OUT); 00229 EMIT(OEND, 0); 00230 g->laststate = THERE(); 00231 00232 /* tidy up loose ends and fill things in */ 00233 categorize(p, g); 00234 stripsnug(p, g); 00235 findmust(p, g); 00236 g->nplus = pluscount(p, g); 00237 g->magic = MAGIC2; 00238 preg->re_nsub = g->nsub; 00239 preg->re_g = g; 00240 preg->re_magic = MAGIC1; 00241 #ifndef REDEBUG 00242 /* not debugging, so can't rely on the assert() in llvm_regexec() */ 00243 if (g->iflags®EX_BAD) 00244 SETERROR(REG_ASSERT); 00245 #endif 00246 00247 /* win or lose, we're done */ 00248 if (p->error != 0) /* lose */ 00249 llvm_regfree(preg); 00250 return(p->error); 00251 } 00252 00253 /* 00254 - p_ere - ERE parser top level, concatenation and alternation 00255 */ 00256 static void 00257 p_ere(struct parse *p, int stop) /* character this ERE should end at */ 00258 { 00259 char c; 00260 sopno prevback = 0; 00261 sopno prevfwd = 0; 00262 sopno conc; 00263 int first = 1; /* is this the first alternative? */ 00264 00265 for (;;) { 00266 /* do a bunch of concatenated expressions */ 00267 conc = HERE(); 00268 while (MORE() && (c = PEEK()) != '|' && c != stop) 00269 p_ere_exp(p); 00270 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 00271 00272 if (!EAT('|')) 00273 break; /* NOTE BREAK OUT */ 00274 00275 if (first) { 00276 INSERT(OCH_, conc); /* offset is wrong */ 00277 prevfwd = conc; 00278 prevback = conc; 00279 first = 0; 00280 } 00281 ASTERN(OOR1, prevback); 00282 prevback = THERE(); 00283 AHEAD(prevfwd); /* fix previous offset */ 00284 prevfwd = HERE(); 00285 EMIT(OOR2, 0); /* offset is very wrong */ 00286 } 00287 00288 if (!first) { /* tail-end fixups */ 00289 AHEAD(prevfwd); 00290 ASTERN(O_CH, prevback); 00291 } 00292 00293 assert(!MORE() || SEE(stop)); 00294 } 00295 00296 /* 00297 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 00298 */ 00299 static void 00300 p_ere_exp(struct parse *p) 00301 { 00302 char c; 00303 sopno pos; 00304 int count; 00305 int count2; 00306 int backrefnum; 00307 sopno subno; 00308 int wascaret = 0; 00309 00310 assert(MORE()); /* caller should have ensured this */ 00311 c = GETNEXT(); 00312 00313 pos = HERE(); 00314 switch (c) { 00315 case '(': 00316 REQUIRE(MORE(), REG_EPAREN); 00317 p->g->nsub++; 00318 subno = p->g->nsub; 00319 if (subno < NPAREN) 00320 p->pbegin[subno] = HERE(); 00321 EMIT(OLPAREN, subno); 00322 if (!SEE(')')) 00323 p_ere(p, ')'); 00324 if (subno < NPAREN) { 00325 p->pend[subno] = HERE(); 00326 assert(p->pend[subno] != 0); 00327 } 00328 EMIT(ORPAREN, subno); 00329 MUSTEAT(')', REG_EPAREN); 00330 break; 00331 #ifndef POSIX_MISTAKE 00332 case ')': /* happens only if no current unmatched ( */ 00333 /* 00334 * You may ask, why the ifndef? Because I didn't notice 00335 * this until slightly too late for 1003.2, and none of the 00336 * other 1003.2 regular-expression reviewers noticed it at 00337 * all. So an unmatched ) is legal POSIX, at least until 00338 * we can get it fixed. 00339 */ 00340 SETERROR(REG_EPAREN); 00341 break; 00342 #endif 00343 case '^': 00344 EMIT(OBOL, 0); 00345 p->g->iflags |= USEBOL; 00346 p->g->nbol++; 00347 wascaret = 1; 00348 break; 00349 case '$': 00350 EMIT(OEOL, 0); 00351 p->g->iflags |= USEEOL; 00352 p->g->neol++; 00353 break; 00354 case '|': 00355 SETERROR(REG_EMPTY); 00356 break; 00357 case '*': 00358 case '+': 00359 case '?': 00360 SETERROR(REG_BADRPT); 00361 break; 00362 case '.': 00363 if (p->g->cflags®_NEWLINE) 00364 nonnewline(p); 00365 else 00366 EMIT(OANY, 0); 00367 break; 00368 case '[': 00369 p_bracket(p); 00370 break; 00371 case '\\': 00372 REQUIRE(MORE(), REG_EESCAPE); 00373 c = GETNEXT(); 00374 if (c >= '1' && c <= '9') { 00375 /* \[0-9] is taken to be a back-reference to a previously specified 00376 * matching group. backrefnum will hold the number. The matching 00377 * group must exist (i.e. if \4 is found there must have been at 00378 * least 4 matching groups specified in the pattern previously). 00379 */ 00380 backrefnum = c - '0'; 00381 if (p->pend[backrefnum] == 0) { 00382 SETERROR(REG_ESUBREG); 00383 break; 00384 } 00385 00386 /* Make sure everything checks out and emit the sequence 00387 * that marks a back-reference to the parse structure. 00388 */ 00389 assert(backrefnum <= p->g->nsub); 00390 EMIT(OBACK_, backrefnum); 00391 assert(p->pbegin[backrefnum] != 0); 00392 assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN); 00393 assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN); 00394 (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]); 00395 EMIT(O_BACK, backrefnum); 00396 p->g->backrefs = 1; 00397 } else { 00398 /* Other chars are simply themselves when escaped with a backslash. 00399 */ 00400 ordinary(p, c); 00401 } 00402 break; 00403 case '{': /* okay as ordinary except if digit follows */ 00404 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); 00405 /* FALLTHROUGH */ 00406 default: 00407 ordinary(p, c); 00408 break; 00409 } 00410 00411 if (!MORE()) 00412 return; 00413 c = PEEK(); 00414 /* we call { a repetition if followed by a digit */ 00415 if (!( c == '*' || c == '+' || c == '?' || 00416 (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) 00417 return; /* no repetition, we're done */ 00418 NEXT(); 00419 00420 REQUIRE(!wascaret, REG_BADRPT); 00421 switch (c) { 00422 case '*': /* implemented as +? */ 00423 /* this case does not require the (y|) trick, noKLUDGE */ 00424 INSERT(OPLUS_, pos); 00425 ASTERN(O_PLUS, pos); 00426 INSERT(OQUEST_, pos); 00427 ASTERN(O_QUEST, pos); 00428 break; 00429 case '+': 00430 INSERT(OPLUS_, pos); 00431 ASTERN(O_PLUS, pos); 00432 break; 00433 case '?': 00434 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 00435 INSERT(OCH_, pos); /* offset slightly wrong */ 00436 ASTERN(OOR1, pos); /* this one's right */ 00437 AHEAD(pos); /* fix the OCH_ */ 00438 EMIT(OOR2, 0); /* offset very wrong... */ 00439 AHEAD(THERE()); /* ...so fix it */ 00440 ASTERN(O_CH, THERETHERE()); 00441 break; 00442 case '{': 00443 count = p_count(p); 00444 if (EAT(',')) { 00445 if (isdigit((uch)PEEK())) { 00446 count2 = p_count(p); 00447 REQUIRE(count <= count2, REG_BADBR); 00448 } else /* single number with comma */ 00449 count2 = INFINITY; 00450 } else /* just a single number */ 00451 count2 = count; 00452 repeat(p, pos, count, count2); 00453 if (!EAT('}')) { /* error heuristics */ 00454 while (MORE() && PEEK() != '}') 00455 NEXT(); 00456 REQUIRE(MORE(), REG_EBRACE); 00457 SETERROR(REG_BADBR); 00458 } 00459 break; 00460 } 00461 00462 if (!MORE()) 00463 return; 00464 c = PEEK(); 00465 if (!( c == '*' || c == '+' || c == '?' || 00466 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) 00467 return; 00468 SETERROR(REG_BADRPT); 00469 } 00470 00471 /* 00472 - p_str - string (no metacharacters) "parser" 00473 */ 00474 static void 00475 p_str(struct parse *p) 00476 { 00477 REQUIRE(MORE(), REG_EMPTY); 00478 while (MORE()) 00479 ordinary(p, GETNEXT()); 00480 } 00481 00482 /* 00483 - p_bre - BRE parser top level, anchoring and concatenation 00484 * Giving end1 as OUT essentially eliminates the end1/end2 check. 00485 * 00486 * This implementation is a bit of a kludge, in that a trailing $ is first 00487 * taken as an ordinary character and then revised to be an anchor. The 00488 * only undesirable side effect is that '$' gets included as a character 00489 * category in such cases. This is fairly harmless; not worth fixing. 00490 * The amount of lookahead needed to avoid this kludge is excessive. 00491 */ 00492 static void 00493 p_bre(struct parse *p, 00494 int end1, /* first terminating character */ 00495 int end2) /* second terminating character */ 00496 { 00497 sopno start = HERE(); 00498 int first = 1; /* first subexpression? */ 00499 int wasdollar = 0; 00500 00501 if (EAT('^')) { 00502 EMIT(OBOL, 0); 00503 p->g->iflags |= USEBOL; 00504 p->g->nbol++; 00505 } 00506 while (MORE() && !SEETWO(end1, end2)) { 00507 wasdollar = p_simp_re(p, first); 00508 first = 0; 00509 } 00510 if (wasdollar) { /* oops, that was a trailing anchor */ 00511 DROP(1); 00512 EMIT(OEOL, 0); 00513 p->g->iflags |= USEEOL; 00514 p->g->neol++; 00515 } 00516 00517 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 00518 } 00519 00520 /* 00521 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 00522 */ 00523 static int /* was the simple RE an unbackslashed $? */ 00524 p_simp_re(struct parse *p, 00525 int starordinary) /* is a leading * an ordinary character? */ 00526 { 00527 int c; 00528 int count; 00529 int count2; 00530 sopno pos; 00531 int i; 00532 sopno subno; 00533 # define BACKSL (1<<CHAR_BIT) 00534 00535 pos = HERE(); /* repetition op, if any, covers from here */ 00536 00537 assert(MORE()); /* caller should have ensured this */ 00538 c = GETNEXT(); 00539 if (c == '\\') { 00540 REQUIRE(MORE(), REG_EESCAPE); 00541 c = BACKSL | GETNEXT(); 00542 } 00543 switch (c) { 00544 case '.': 00545 if (p->g->cflags®_NEWLINE) 00546 nonnewline(p); 00547 else 00548 EMIT(OANY, 0); 00549 break; 00550 case '[': 00551 p_bracket(p); 00552 break; 00553 case BACKSL|'{': 00554 SETERROR(REG_BADRPT); 00555 break; 00556 case BACKSL|'(': 00557 p->g->nsub++; 00558 subno = p->g->nsub; 00559 if (subno < NPAREN) 00560 p->pbegin[subno] = HERE(); 00561 EMIT(OLPAREN, subno); 00562 /* the MORE here is an error heuristic */ 00563 if (MORE() && !SEETWO('\\', ')')) 00564 p_bre(p, '\\', ')'); 00565 if (subno < NPAREN) { 00566 p->pend[subno] = HERE(); 00567 assert(p->pend[subno] != 0); 00568 } 00569 EMIT(ORPAREN, subno); 00570 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 00571 break; 00572 case BACKSL|')': /* should not get here -- must be user */ 00573 case BACKSL|'}': 00574 SETERROR(REG_EPAREN); 00575 break; 00576 case BACKSL|'1': 00577 case BACKSL|'2': 00578 case BACKSL|'3': 00579 case BACKSL|'4': 00580 case BACKSL|'5': 00581 case BACKSL|'6': 00582 case BACKSL|'7': 00583 case BACKSL|'8': 00584 case BACKSL|'9': 00585 i = (c&~BACKSL) - '0'; 00586 assert(i < NPAREN); 00587 if (p->pend[i] != 0) { 00588 assert(i <= p->g->nsub); 00589 EMIT(OBACK_, i); 00590 assert(p->pbegin[i] != 0); 00591 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 00592 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 00593 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 00594 EMIT(O_BACK, i); 00595 } else 00596 SETERROR(REG_ESUBREG); 00597 p->g->backrefs = 1; 00598 break; 00599 case '*': 00600 REQUIRE(starordinary, REG_BADRPT); 00601 /* FALLTHROUGH */ 00602 default: 00603 ordinary(p, (char)c); 00604 break; 00605 } 00606 00607 if (EAT('*')) { /* implemented as +? */ 00608 /* this case does not require the (y|) trick, noKLUDGE */ 00609 INSERT(OPLUS_, pos); 00610 ASTERN(O_PLUS, pos); 00611 INSERT(OQUEST_, pos); 00612 ASTERN(O_QUEST, pos); 00613 } else if (EATTWO('\\', '{')) { 00614 count = p_count(p); 00615 if (EAT(',')) { 00616 if (MORE() && isdigit((uch)PEEK())) { 00617 count2 = p_count(p); 00618 REQUIRE(count <= count2, REG_BADBR); 00619 } else /* single number with comma */ 00620 count2 = INFINITY; 00621 } else /* just a single number */ 00622 count2 = count; 00623 repeat(p, pos, count, count2); 00624 if (!EATTWO('\\', '}')) { /* error heuristics */ 00625 while (MORE() && !SEETWO('\\', '}')) 00626 NEXT(); 00627 REQUIRE(MORE(), REG_EBRACE); 00628 SETERROR(REG_BADBR); 00629 } 00630 } else if (c == '$') /* $ (but not \$) ends it */ 00631 return(1); 00632 00633 return(0); 00634 } 00635 00636 /* 00637 - p_count - parse a repetition count 00638 */ 00639 static int /* the value */ 00640 p_count(struct parse *p) 00641 { 00642 int count = 0; 00643 int ndigits = 0; 00644 00645 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { 00646 count = count*10 + (GETNEXT() - '0'); 00647 ndigits++; 00648 } 00649 00650 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 00651 return(count); 00652 } 00653 00654 /* 00655 - p_bracket - parse a bracketed character list 00656 * 00657 * Note a significant property of this code: if the allocset() did SETERROR, 00658 * no set operations are done. 00659 */ 00660 static void 00661 p_bracket(struct parse *p) 00662 { 00663 cset *cs; 00664 int invert = 0; 00665 00666 /* Dept of Truly Sickening Special-Case Kludges */ 00667 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) { 00668 EMIT(OBOW, 0); 00669 NEXTn(6); 00670 return; 00671 } 00672 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) { 00673 EMIT(OEOW, 0); 00674 NEXTn(6); 00675 return; 00676 } 00677 00678 if ((cs = allocset(p)) == NULL) { 00679 /* allocset did set error status in p */ 00680 return; 00681 } 00682 00683 if (EAT('^')) 00684 invert++; /* make note to invert set at end */ 00685 if (EAT(']')) 00686 CHadd(cs, ']'); 00687 else if (EAT('-')) 00688 CHadd(cs, '-'); 00689 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 00690 p_b_term(p, cs); 00691 if (EAT('-')) 00692 CHadd(cs, '-'); 00693 MUSTEAT(']', REG_EBRACK); 00694 00695 if (p->error != 0) { /* don't mess things up further */ 00696 freeset(p, cs); 00697 return; 00698 } 00699 00700 if (p->g->cflags®_ICASE) { 00701 int i; 00702 int ci; 00703 00704 for (i = p->g->csetsize - 1; i >= 0; i--) 00705 if (CHIN(cs, i) && isalpha(i)) { 00706 ci = othercase(i); 00707 if (ci != i) 00708 CHadd(cs, ci); 00709 } 00710 if (cs->multis != NULL) 00711 mccase(p, cs); 00712 } 00713 if (invert) { 00714 int i; 00715 00716 for (i = p->g->csetsize - 1; i >= 0; i--) 00717 if (CHIN(cs, i)) 00718 CHsub(cs, i); 00719 else 00720 CHadd(cs, i); 00721 if (p->g->cflags®_NEWLINE) 00722 CHsub(cs, '\n'); 00723 if (cs->multis != NULL) 00724 mcinvert(p, cs); 00725 } 00726 00727 assert(cs->multis == NULL); /* xxx */ 00728 00729 if (nch(p, cs) == 1) { /* optimize singleton sets */ 00730 ordinary(p, firstch(p, cs)); 00731 freeset(p, cs); 00732 } else 00733 EMIT(OANYOF, freezeset(p, cs)); 00734 } 00735 00736 /* 00737 - p_b_term - parse one term of a bracketed character list 00738 */ 00739 static void 00740 p_b_term(struct parse *p, cset *cs) 00741 { 00742 char c; 00743 char start, finish; 00744 int i; 00745 00746 /* classify what we've got */ 00747 switch ((MORE()) ? PEEK() : '\0') { 00748 case '[': 00749 c = (MORE2()) ? PEEK2() : '\0'; 00750 break; 00751 case '-': 00752 SETERROR(REG_ERANGE); 00753 return; /* NOTE RETURN */ 00754 break; 00755 default: 00756 c = '\0'; 00757 break; 00758 } 00759 00760 switch (c) { 00761 case ':': /* character class */ 00762 NEXT2(); 00763 REQUIRE(MORE(), REG_EBRACK); 00764 c = PEEK(); 00765 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 00766 p_b_cclass(p, cs); 00767 REQUIRE(MORE(), REG_EBRACK); 00768 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 00769 break; 00770 case '=': /* equivalence class */ 00771 NEXT2(); 00772 REQUIRE(MORE(), REG_EBRACK); 00773 c = PEEK(); 00774 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 00775 p_b_eclass(p, cs); 00776 REQUIRE(MORE(), REG_EBRACK); 00777 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 00778 break; 00779 default: /* symbol, ordinary character, or range */ 00780 /* xxx revision needed for multichar stuff */ 00781 start = p_b_symbol(p); 00782 if (SEE('-') && MORE2() && PEEK2() != ']') { 00783 /* range */ 00784 NEXT(); 00785 if (EAT('-')) 00786 finish = '-'; 00787 else 00788 finish = p_b_symbol(p); 00789 } else 00790 finish = start; 00791 /* xxx what about signed chars here... */ 00792 REQUIRE(start <= finish, REG_ERANGE); 00793 for (i = start; i <= finish; i++) 00794 CHadd(cs, i); 00795 break; 00796 } 00797 } 00798 00799 /* 00800 - p_b_cclass - parse a character-class name and deal with it 00801 */ 00802 static void 00803 p_b_cclass(struct parse *p, cset *cs) 00804 { 00805 char *sp = p->next; 00806 struct cclass *cp; 00807 size_t len; 00808 const char *u; 00809 char c; 00810 00811 while (MORE() && isalpha((uch)PEEK())) 00812 NEXT(); 00813 len = p->next - sp; 00814 for (cp = cclasses; cp->name != NULL; cp++) 00815 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 00816 break; 00817 if (cp->name == NULL) { 00818 /* oops, didn't find it */ 00819 SETERROR(REG_ECTYPE); 00820 return; 00821 } 00822 00823 u = cp->chars; 00824 while ((c = *u++) != '\0') 00825 CHadd(cs, c); 00826 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 00827 MCadd(p, cs, u); 00828 } 00829 00830 /* 00831 - p_b_eclass - parse an equivalence-class name and deal with it 00832 * 00833 * This implementation is incomplete. xxx 00834 */ 00835 static void 00836 p_b_eclass(struct parse *p, cset *cs) 00837 { 00838 char c; 00839 00840 c = p_b_coll_elem(p, '='); 00841 CHadd(cs, c); 00842 } 00843 00844 /* 00845 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 00846 */ 00847 static char /* value of symbol */ 00848 p_b_symbol(struct parse *p) 00849 { 00850 char value; 00851 00852 REQUIRE(MORE(), REG_EBRACK); 00853 if (!EATTWO('[', '.')) 00854 return(GETNEXT()); 00855 00856 /* collating symbol */ 00857 value = p_b_coll_elem(p, '.'); 00858 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 00859 return(value); 00860 } 00861 00862 /* 00863 - p_b_coll_elem - parse a collating-element name and look it up 00864 */ 00865 static char /* value of collating element */ 00866 p_b_coll_elem(struct parse *p, 00867 int endc) /* name ended by endc,']' */ 00868 { 00869 char *sp = p->next; 00870 struct cname *cp; 00871 int len; 00872 00873 while (MORE() && !SEETWO(endc, ']')) 00874 NEXT(); 00875 if (!MORE()) { 00876 SETERROR(REG_EBRACK); 00877 return(0); 00878 } 00879 len = p->next - sp; 00880 for (cp = cnames; cp->name != NULL; cp++) 00881 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 00882 return(cp->code); /* known name */ 00883 if (len == 1) 00884 return(*sp); /* single character */ 00885 SETERROR(REG_ECOLLATE); /* neither */ 00886 return(0); 00887 } 00888 00889 /* 00890 - othercase - return the case counterpart of an alphabetic 00891 */ 00892 static char /* if no counterpart, return ch */ 00893 othercase(int ch) 00894 { 00895 ch = (uch)ch; 00896 assert(isalpha(ch)); 00897 if (isupper(ch)) 00898 return ((uch)tolower(ch)); 00899 else if (islower(ch)) 00900 return ((uch)toupper(ch)); 00901 else /* peculiar, but could happen */ 00902 return(ch); 00903 } 00904 00905 /* 00906 - bothcases - emit a dualcase version of a two-case character 00907 * 00908 * Boy, is this implementation ever a kludge... 00909 */ 00910 static void 00911 bothcases(struct parse *p, int ch) 00912 { 00913 char *oldnext = p->next; 00914 char *oldend = p->end; 00915 char bracket[3]; 00916 00917 ch = (uch)ch; 00918 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 00919 p->next = bracket; 00920 p->end = bracket+2; 00921 bracket[0] = ch; 00922 bracket[1] = ']'; 00923 bracket[2] = '\0'; 00924 p_bracket(p); 00925 assert(p->next == bracket+2); 00926 p->next = oldnext; 00927 p->end = oldend; 00928 } 00929 00930 /* 00931 - ordinary - emit an ordinary character 00932 */ 00933 static void 00934 ordinary(struct parse *p, int ch) 00935 { 00936 cat_t *cap = p->g->categories; 00937 00938 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) 00939 bothcases(p, ch); 00940 else { 00941 EMIT(OCHAR, (uch)ch); 00942 if (cap[ch] == 0) 00943 cap[ch] = p->g->ncategories++; 00944 } 00945 } 00946 00947 /* 00948 - nonnewline - emit REG_NEWLINE version of OANY 00949 * 00950 * Boy, is this implementation ever a kludge... 00951 */ 00952 static void 00953 nonnewline(struct parse *p) 00954 { 00955 char *oldnext = p->next; 00956 char *oldend = p->end; 00957 char bracket[4]; 00958 00959 p->next = bracket; 00960 p->end = bracket+3; 00961 bracket[0] = '^'; 00962 bracket[1] = '\n'; 00963 bracket[2] = ']'; 00964 bracket[3] = '\0'; 00965 p_bracket(p); 00966 assert(p->next == bracket+3); 00967 p->next = oldnext; 00968 p->end = oldend; 00969 } 00970 00971 /* 00972 - repeat - generate code for a bounded repetition, recursively if needed 00973 */ 00974 static void 00975 repeat(struct parse *p, 00976 sopno start, /* operand from here to end of strip */ 00977 int from, /* repeated from this number */ 00978 int to) /* to this number of times (maybe INFINITY) */ 00979 { 00980 sopno finish = HERE(); 00981 # define N 2 00982 # define INF 3 00983 # define REP(f, t) ((f)*8 + (t)) 00984 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 00985 sopno copy; 00986 00987 if (p->error != 0) /* head off possible runaway recursion */ 00988 return; 00989 00990 assert(from <= to); 00991 00992 switch (REP(MAP(from), MAP(to))) { 00993 case REP(0, 0): /* must be user doing this */ 00994 DROP(finish-start); /* drop the operand */ 00995 break; 00996 case REP(0, 1): /* as x{1,1}? */ 00997 case REP(0, N): /* as x{1,n}? */ 00998 case REP(0, INF): /* as x{1,}? */ 00999 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 01000 INSERT(OCH_, start); /* offset is wrong... */ 01001 repeat(p, start+1, 1, to); 01002 ASTERN(OOR1, start); 01003 AHEAD(start); /* ... fix it */ 01004 EMIT(OOR2, 0); 01005 AHEAD(THERE()); 01006 ASTERN(O_CH, THERETHERE()); 01007 break; 01008 case REP(1, 1): /* trivial case */ 01009 /* done */ 01010 break; 01011 case REP(1, N): /* as x?x{1,n-1} */ 01012 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 01013 INSERT(OCH_, start); 01014 ASTERN(OOR1, start); 01015 AHEAD(start); 01016 EMIT(OOR2, 0); /* offset very wrong... */ 01017 AHEAD(THERE()); /* ...so fix it */ 01018 ASTERN(O_CH, THERETHERE()); 01019 copy = dupl(p, start+1, finish+1); 01020 assert(copy == finish+4); 01021 repeat(p, copy, 1, to-1); 01022 break; 01023 case REP(1, INF): /* as x+ */ 01024 INSERT(OPLUS_, start); 01025 ASTERN(O_PLUS, start); 01026 break; 01027 case REP(N, N): /* as xx{m-1,n-1} */ 01028 copy = dupl(p, start, finish); 01029 repeat(p, copy, from-1, to-1); 01030 break; 01031 case REP(N, INF): /* as xx{n-1,INF} */ 01032 copy = dupl(p, start, finish); 01033 repeat(p, copy, from-1, to); 01034 break; 01035 default: /* "can't happen" */ 01036 SETERROR(REG_ASSERT); /* just in case */ 01037 break; 01038 } 01039 } 01040 01041 /* 01042 - seterr - set an error condition 01043 */ 01044 static int /* useless but makes type checking happy */ 01045 seterr(struct parse *p, int e) 01046 { 01047 if (p->error == 0) /* keep earliest error condition */ 01048 p->error = e; 01049 p->next = nuls; /* try to bring things to a halt */ 01050 p->end = nuls; 01051 return(0); /* make the return value well-defined */ 01052 } 01053 01054 /* 01055 - allocset - allocate a set of characters for [] 01056 */ 01057 static cset * 01058 allocset(struct parse *p) 01059 { 01060 int no = p->g->ncsets++; 01061 size_t nc; 01062 size_t nbytes; 01063 cset *cs; 01064 size_t css = (size_t)p->g->csetsize; 01065 int i; 01066 01067 if (no >= p->ncsalloc) { /* need another column of space */ 01068 void *ptr; 01069 01070 p->ncsalloc += CHAR_BIT; 01071 nc = p->ncsalloc; 01072 assert(nc % CHAR_BIT == 0); 01073 nbytes = nc / CHAR_BIT * css; 01074 01075 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); 01076 if (ptr == NULL) 01077 goto nomem; 01078 p->g->sets = ptr; 01079 01080 ptr = (uch *)realloc((char *)p->g->setbits, nbytes); 01081 if (ptr == NULL) 01082 goto nomem; 01083 p->g->setbits = ptr; 01084 01085 for (i = 0; i < no; i++) 01086 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 01087 01088 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); 01089 } 01090 /* XXX should not happen */ 01091 if (p->g->sets == NULL || p->g->setbits == NULL) 01092 goto nomem; 01093 01094 cs = &p->g->sets[no]; 01095 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 01096 cs->mask = 1 << ((no) % CHAR_BIT); 01097 cs->hash = 0; 01098 cs->smultis = 0; 01099 cs->multis = NULL; 01100 01101 return(cs); 01102 nomem: 01103 free(p->g->sets); 01104 p->g->sets = NULL; 01105 free(p->g->setbits); 01106 p->g->setbits = NULL; 01107 01108 SETERROR(REG_ESPACE); 01109 /* caller's responsibility not to do set ops */ 01110 return(NULL); 01111 } 01112 01113 /* 01114 - freeset - free a now-unused set 01115 */ 01116 static void 01117 freeset(struct parse *p, cset *cs) 01118 { 01119 size_t i; 01120 cset *top = &p->g->sets[p->g->ncsets]; 01121 size_t css = (size_t)p->g->csetsize; 01122 01123 for (i = 0; i < css; i++) 01124 CHsub(cs, i); 01125 if (cs == top-1) /* recover only the easy case */ 01126 p->g->ncsets--; 01127 } 01128 01129 /* 01130 - freezeset - final processing on a set of characters 01131 * 01132 * The main task here is merging identical sets. This is usually a waste 01133 * of time (although the hash code minimizes the overhead), but can win 01134 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 01135 * is done using addition rather than xor -- all ASCII [aA] sets xor to 01136 * the same value! 01137 */ 01138 static int /* set number */ 01139 freezeset(struct parse *p, cset *cs) 01140 { 01141 uch h = cs->hash; 01142 size_t i; 01143 cset *top = &p->g->sets[p->g->ncsets]; 01144 cset *cs2; 01145 size_t css = (size_t)p->g->csetsize; 01146 01147 /* look for an earlier one which is the same */ 01148 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 01149 if (cs2->hash == h && cs2 != cs) { 01150 /* maybe */ 01151 for (i = 0; i < css; i++) 01152 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 01153 break; /* no */ 01154 if (i == css) 01155 break; /* yes */ 01156 } 01157 01158 if (cs2 < top) { /* found one */ 01159 freeset(p, cs); 01160 cs = cs2; 01161 } 01162 01163 return((int)(cs - p->g->sets)); 01164 } 01165 01166 /* 01167 - firstch - return first character in a set (which must have at least one) 01168 */ 01169 static int /* character; there is no "none" value */ 01170 firstch(struct parse *p, cset *cs) 01171 { 01172 size_t i; 01173 size_t css = (size_t)p->g->csetsize; 01174 01175 for (i = 0; i < css; i++) 01176 if (CHIN(cs, i)) 01177 return((char)i); 01178 assert(never); 01179 return(0); /* arbitrary */ 01180 } 01181 01182 /* 01183 - nch - number of characters in a set 01184 */ 01185 static int 01186 nch(struct parse *p, cset *cs) 01187 { 01188 size_t i; 01189 size_t css = (size_t)p->g->csetsize; 01190 int n = 0; 01191 01192 for (i = 0; i < css; i++) 01193 if (CHIN(cs, i)) 01194 n++; 01195 return(n); 01196 } 01197 01198 /* 01199 - mcadd - add a collating element to a cset 01200 */ 01201 static void 01202 mcadd( struct parse *p, cset *cs, const char *cp) 01203 { 01204 size_t oldend = cs->smultis; 01205 void *np; 01206 01207 cs->smultis += strlen(cp) + 1; 01208 np = realloc(cs->multis, cs->smultis); 01209 if (np == NULL) { 01210 if (cs->multis) 01211 free(cs->multis); 01212 cs->multis = NULL; 01213 SETERROR(REG_ESPACE); 01214 return; 01215 } 01216 cs->multis = np; 01217 01218 llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); 01219 } 01220 01221 /* 01222 - mcinvert - invert the list of collating elements in a cset 01223 * 01224 * This would have to know the set of possibilities. Implementation 01225 * is deferred. 01226 */ 01227 /* ARGSUSED */ 01228 static void 01229 mcinvert(struct parse *p, cset *cs) 01230 { 01231 assert(cs->multis == NULL); /* xxx */ 01232 } 01233 01234 /* 01235 - mccase - add case counterparts of the list of collating elements in a cset 01236 * 01237 * This would have to know the set of possibilities. Implementation 01238 * is deferred. 01239 */ 01240 /* ARGSUSED */ 01241 static void 01242 mccase(struct parse *p, cset *cs) 01243 { 01244 assert(cs->multis == NULL); /* xxx */ 01245 } 01246 01247 /* 01248 - isinsets - is this character in any sets? 01249 */ 01250 static int /* predicate */ 01251 isinsets(struct re_guts *g, int c) 01252 { 01253 uch *col; 01254 int i; 01255 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 01256 unsigned uc = (uch)c; 01257 01258 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 01259 if (col[uc] != 0) 01260 return(1); 01261 return(0); 01262 } 01263 01264 /* 01265 - samesets - are these two characters in exactly the same sets? 01266 */ 01267 static int /* predicate */ 01268 samesets(struct re_guts *g, int c1, int c2) 01269 { 01270 uch *col; 01271 int i; 01272 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 01273 unsigned uc1 = (uch)c1; 01274 unsigned uc2 = (uch)c2; 01275 01276 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 01277 if (col[uc1] != col[uc2]) 01278 return(0); 01279 return(1); 01280 } 01281 01282 /* 01283 - categorize - sort out character categories 01284 */ 01285 static void 01286 categorize(struct parse *p, struct re_guts *g) 01287 { 01288 cat_t *cats = g->categories; 01289 int c; 01290 int c2; 01291 cat_t cat; 01292 01293 /* avoid making error situations worse */ 01294 if (p->error != 0) 01295 return; 01296 01297 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 01298 if (cats[c] == 0 && isinsets(g, c)) { 01299 cat = g->ncategories++; 01300 cats[c] = cat; 01301 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 01302 if (cats[c2] == 0 && samesets(g, c, c2)) 01303 cats[c2] = cat; 01304 } 01305 } 01306 01307 /* 01308 - dupl - emit a duplicate of a bunch of sops 01309 */ 01310 static sopno /* start of duplicate */ 01311 dupl(struct parse *p, 01312 sopno start, /* from here */ 01313 sopno finish) /* to this less one */ 01314 { 01315 sopno ret = HERE(); 01316 sopno len = finish - start; 01317 01318 assert(finish >= start); 01319 if (len == 0) 01320 return(ret); 01321 enlarge(p, p->ssize + len); /* this many unexpected additions */ 01322 assert(p->ssize >= p->slen + len); 01323 (void) memmove((char *)(p->strip + p->slen), 01324 (char *)(p->strip + start), (size_t)len*sizeof(sop)); 01325 p->slen += len; 01326 return(ret); 01327 } 01328 01329 /* 01330 - doemit - emit a strip operator 01331 * 01332 * It might seem better to implement this as a macro with a function as 01333 * hard-case backup, but it's just too big and messy unless there are 01334 * some changes to the data structures. Maybe later. 01335 */ 01336 static void 01337 doemit(struct parse *p, sop op, size_t opnd) 01338 { 01339 /* avoid making error situations worse */ 01340 if (p->error != 0) 01341 return; 01342 01343 /* deal with oversize operands ("can't happen", more or less) */ 01344 assert(opnd < 1<<OPSHIFT); 01345 01346 /* deal with undersized strip */ 01347 if (p->slen >= p->ssize) 01348 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ 01349 assert(p->slen < p->ssize); 01350 01351 /* finally, it's all reduced to the easy case */ 01352 p->strip[p->slen++] = SOP(op, opnd); 01353 } 01354 01355 /* 01356 - doinsert - insert a sop into the strip 01357 */ 01358 static void 01359 doinsert(struct parse *p, sop op, size_t opnd, sopno pos) 01360 { 01361 sopno sn; 01362 sop s; 01363 int i; 01364 01365 /* avoid making error situations worse */ 01366 if (p->error != 0) 01367 return; 01368 01369 sn = HERE(); 01370 EMIT(op, opnd); /* do checks, ensure space */ 01371 assert(HERE() == sn+1); 01372 s = p->strip[sn]; 01373 01374 /* adjust paren pointers */ 01375 assert(pos > 0); 01376 for (i = 1; i < NPAREN; i++) { 01377 if (p->pbegin[i] >= pos) { 01378 p->pbegin[i]++; 01379 } 01380 if (p->pend[i] >= pos) { 01381 p->pend[i]++; 01382 } 01383 } 01384 01385 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], 01386 (HERE()-pos-1)*sizeof(sop)); 01387 p->strip[pos] = s; 01388 } 01389 01390 /* 01391 - dofwd - complete a forward reference 01392 */ 01393 static void 01394 dofwd(struct parse *p, sopno pos, sop value) 01395 { 01396 /* avoid making error situations worse */ 01397 if (p->error != 0) 01398 return; 01399 01400 assert(value < 1<<OPSHIFT); 01401 p->strip[pos] = OP(p->strip[pos]) | value; 01402 } 01403 01404 /* 01405 - enlarge - enlarge the strip 01406 */ 01407 static void 01408 enlarge(struct parse *p, sopno size) 01409 { 01410 sop *sp; 01411 01412 if (p->ssize >= size) 01413 return; 01414 01415 sp = (sop *)realloc(p->strip, size*sizeof(sop)); 01416 if (sp == NULL) { 01417 SETERROR(REG_ESPACE); 01418 return; 01419 } 01420 p->strip = sp; 01421 p->ssize = size; 01422 } 01423 01424 /* 01425 - stripsnug - compact the strip 01426 */ 01427 static void 01428 stripsnug(struct parse *p, struct re_guts *g) 01429 { 01430 g->nstates = p->slen; 01431 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); 01432 if (g->strip == NULL) { 01433 SETERROR(REG_ESPACE); 01434 g->strip = p->strip; 01435 } 01436 } 01437 01438 /* 01439 - findmust - fill in must and mlen with longest mandatory literal string 01440 * 01441 * This algorithm could do fancy things like analyzing the operands of | 01442 * for common subsequences. Someday. This code is simple and finds most 01443 * of the interesting cases. 01444 * 01445 * Note that must and mlen got initialized during setup. 01446 */ 01447 static void 01448 findmust(struct parse *p, struct re_guts *g) 01449 { 01450 sop *scan; 01451 sop *start = 0; /* start initialized in the default case, after that */ 01452 sop *newstart = 0; /* newstart was initialized in the OCHAR case */ 01453 sopno newlen; 01454 sop s; 01455 char *cp; 01456 sopno i; 01457 01458 /* avoid making error situations worse */ 01459 if (p->error != 0) 01460 return; 01461 01462 /* find the longest OCHAR sequence in strip */ 01463 newlen = 0; 01464 scan = g->strip + 1; 01465 do { 01466 s = *scan++; 01467 switch (OP(s)) { 01468 case OCHAR: /* sequence member */ 01469 if (newlen == 0) /* new sequence */ 01470 newstart = scan - 1; 01471 newlen++; 01472 break; 01473 case OPLUS_: /* things that don't break one */ 01474 case OLPAREN: 01475 case ORPAREN: 01476 break; 01477 case OQUEST_: /* things that must be skipped */ 01478 case OCH_: 01479 scan--; 01480 do { 01481 scan += OPND(s); 01482 s = *scan; 01483 /* assert() interferes w debug printouts */ 01484 if (OP(s) != O_QUEST && OP(s) != O_CH && 01485 OP(s) != OOR2) { 01486 g->iflags |= REGEX_BAD; 01487 return; 01488 } 01489 } while (OP(s) != O_QUEST && OP(s) != O_CH); 01490 /* fallthrough */ 01491 default: /* things that break a sequence */ 01492 if (newlen > g->mlen) { /* ends one */ 01493 start = newstart; 01494 g->mlen = newlen; 01495 } 01496 newlen = 0; 01497 break; 01498 } 01499 } while (OP(s) != OEND); 01500 01501 if (g->mlen == 0) /* there isn't one */ 01502 return; 01503 01504 /* turn it into a character string */ 01505 g->must = malloc((size_t)g->mlen + 1); 01506 if (g->must == NULL) { /* argh; just forget it */ 01507 g->mlen = 0; 01508 return; 01509 } 01510 cp = g->must; 01511 scan = start; 01512 for (i = g->mlen; i > 0; i--) { 01513 while (OP(s = *scan++) != OCHAR) 01514 continue; 01515 assert(cp < g->must + g->mlen); 01516 *cp++ = (char)OPND(s); 01517 } 01518 assert(cp == g->must + g->mlen); 01519 *cp++ = '\0'; /* just on general principles */ 01520 } 01521 01522 /* 01523 - pluscount - count + nesting 01524 */ 01525 static sopno /* nesting depth */ 01526 pluscount(struct parse *p, struct re_guts *g) 01527 { 01528 sop *scan; 01529 sop s; 01530 sopno plusnest = 0; 01531 sopno maxnest = 0; 01532 01533 if (p->error != 0) 01534 return(0); /* there may not be an OEND */ 01535 01536 scan = g->strip + 1; 01537 do { 01538 s = *scan++; 01539 switch (OP(s)) { 01540 case OPLUS_: 01541 plusnest++; 01542 break; 01543 case O_PLUS: 01544 if (plusnest > maxnest) 01545 maxnest = plusnest; 01546 plusnest--; 01547 break; 01548 } 01549 } while (OP(s) != OEND); 01550 if (plusnest != 0) 01551 g->iflags |= REGEX_BAD; 01552 return(maxnest); 01553 }