LLVM: regcomp.c Source File

Go to the documentation of this file.
00001 /*-
00002  * This code is derived from OpenBSD's libc/regex, original license follows:
00003  *
00004  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
00005  * Copyright (c) 1992, 1993, 1994
00006  *  The Regents of the University of California.  All rights reserved.
00007  *
00008  * This code is derived from software contributed to Berkeley by
00009  * Henry Spencer.
00010  *
00011  * Redistribution and use in source and binary forms, with or without
00012  * modification, are permitted provided that the following conditions
00013  * are met:
00014  * 1. Redistributions of source code must retain the above copyright
00015  *    notice, this list of conditions and the following disclaimer.
00016  * 2. Redistributions in binary form must reproduce the above copyright
00017  *    notice, this list of conditions and the following disclaimer in the
00018  *    documentation and/or other materials provided with the distribution.
00019  * 3. Neither the name of the University nor the names of its contributors
00020  *    may be used to endorse or promote products derived from this software
00021  *    without specific prior written permission.
00022  *
00023  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00024  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00025  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00026  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00027  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00028  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00029  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00030  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00031  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00032  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00033  * SUCH DAMAGE.
00034  *
00035  *  @(#)regcomp.c 8.5 (Berkeley) 3/20/94
00036  */
00037 
00038 #include <sys/types.h>
00039 #include <stdio.h>
00040 #include <string.h>
00041 #include <ctype.h>
00042 #include <limits.h>
00043 #include <stdlib.h>
00044 #include "regex_impl.h"
00045 
00046 #include "regutils.h"
00047 #include "regex2.h"
00048 
00049 #include "regcclass.h"
00050 #include "regcname.h"
00051 
00052 /*
00053  * parse structure, passed up and down to avoid global variables and
00054  * other clumsinesses
00055  */
00056 struct parse {
00057   char *next;   /* next character in RE */
00058   char *end;    /* end of string (-> NUL normally) */
00059   int error;    /* has an error been seen? */
00060   sop *strip;   /* malloced strip */
00061   sopno ssize;    /* malloced strip size (allocated) */
00062   sopno slen;   /* malloced strip length (used) */
00063   int ncsalloc;   /* number of csets allocated */
00064   struct re_guts *g;
00065 # define  NPAREN  10  /* we need to remember () 1-9 for back refs */
00066   sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
00067   sopno pend[NPAREN]; /* -> ) ([0] unused) */
00068 };
00069 
00070 static void p_ere(struct parse *, int);
00071 static void p_ere_exp(struct parse *);
00072 static void p_str(struct parse *);
00073 static void p_bre(struct parse *, int, int);
00074 static int p_simp_re(struct parse *, int);
00075 static int p_count(struct parse *);
00076 static void p_bracket(struct parse *);
00077 static void p_b_term(struct parse *, cset *);
00078 static void p_b_cclass(struct parse *, cset *);
00079 static void p_b_eclass(struct parse *, cset *);
00080 static char p_b_symbol(struct parse *);
00081 static char p_b_coll_elem(struct parse *, int);
00082 static char othercase(int);
00083 static void bothcases(struct parse *, int);
00084 static void ordinary(struct parse *, int);
00085 static void nonnewline(struct parse *);
00086 static void repeat(struct parse *, sopno, int, int);
00087 static int seterr(struct parse *, int);
00088 static cset *allocset(struct parse *);
00089 static void freeset(struct parse *, cset *);
00090 static int freezeset(struct parse *, cset *);
00091 static int firstch(struct parse *, cset *);
00092 static int nch(struct parse *, cset *);
00093 static void mcadd(struct parse *, cset *, const char *);
00094 static void mcinvert(struct parse *, cset *);
00095 static void mccase(struct parse *, cset *);
00096 static int isinsets(struct re_guts *, int);
00097 static int samesets(struct re_guts *, int, int);
00098 static void categorize(struct parse *, struct re_guts *);
00099 static sopno dupl(struct parse *, sopno, sopno);
00100 static void doemit(struct parse *, sop, size_t);
00101 static void doinsert(struct parse *, sop, size_t, sopno);
00102 static void dofwd(struct parse *, sopno, sop);
00103 static void enlarge(struct parse *, sopno);
00104 static void stripsnug(struct parse *, struct re_guts *);
00105 static void findmust(struct parse *, struct re_guts *);
00106 static sopno pluscount(struct parse *, struct re_guts *);
00107 
00108 static char nuls[10];   /* place to point scanner in event of error */
00109 
00110 /*
00111  * macros for use with parse structure
00112  * BEWARE:  these know that the parse structure is named `p' !!!
00113  */
00114 #define PEEK()  (*p->next)
00115 #define PEEK2() (*(p->next+1))
00116 #define MORE()  (p->next < p->end)
00117 #define MORE2() (p->next+1 < p->end)
00118 #define SEE(c)  (MORE() && PEEK() == (c))
00119 #define SEETWO(a, b)  (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
00120 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
00121 #define EATTWO(a, b)  ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
00122 #define NEXT()  (p->next++)
00123 #define NEXT2() (p->next += 2)
00124 #define NEXTn(n)  (p->next += (n))
00125 #define GETNEXT() (*p->next++)
00126 #define SETERROR(e) seterr(p, (e))
00127 #define REQUIRE(co, e)  (void)((co) || SETERROR(e))
00128 #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
00129 #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e))
00130 #define MUSTNOTSEE(c, e)  (REQUIRE(!MORE() || PEEK() != (c), e))
00131 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
00132 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
00133 #define AHEAD(pos)    dofwd(p, pos, HERE()-(pos))
00134 #define ASTERN(sop, pos)  EMIT(sop, HERE()-pos)
00135 #define HERE()    (p->slen)
00136 #define THERE()   (p->slen - 1)
00137 #define THERETHERE()  (p->slen - 2)
00138 #define DROP(n) (p->slen -= (n))
00139 
00140 #ifdef  _POSIX2_RE_DUP_MAX
00141 #define DUPMAX  _POSIX2_RE_DUP_MAX
00142 #else
00143 #define DUPMAX  255
00144 #endif
00145 #define INFINITY  (DUPMAX + 1)
00146 
00147 #ifndef NDEBUG
00148 static int never = 0;   /* for use in asserts; shuts lint up */
00149 #else
00150 #define never 0   /* some <assert.h>s have bugs too */
00151 #endif
00152 
00153 /*
00154  - llvm_regcomp - interface for parser and compilation
00155  */
00156 int       /* 0 success, otherwise REG_something */
00157 llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags)
00158 {
00159   struct parse pa;
00160   struct re_guts *g;
00161   struct parse *p = &pa;
00162   int i;
00163   size_t len;
00164 #ifdef REDEBUG
00165 # define  GOODFLAGS(f)  (f)
00166 #else
00167 # define  GOODFLAGS(f)  ((f)&~REG_DUMP)
00168 #endif
00169 
00170   cflags = GOODFLAGS(cflags);
00171   if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
00172     return(REG_INVARG);
00173 
00174   if (cflags&REG_PEND) {
00175     if (preg->re_endp < pattern)
00176       return(REG_INVARG);
00177     len = preg->re_endp - pattern;
00178   } else
00179     len = strlen((const char *)pattern);
00180 
00181   /* do the mallocs early so failure handling is easy */
00182   g = (struct re_guts *)malloc(sizeof(struct re_guts) +
00183               (NC-1)*sizeof(cat_t));
00184   if (g == NULL)
00185     return(REG_ESPACE);
00186   p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
00187   p->strip = (sop *)calloc(p->ssize, sizeof(sop));
00188   p->slen = 0;
00189   if (p->strip == NULL) {
00190     free((char *)g);
00191     return(REG_ESPACE);
00192   }
00193 
00194   /* set things up */
00195   p->g = g;
00196   p->next = (char *)pattern;  /* convenience; we do not modify it */
00197   p->end = p->next + len;
00198   p->error = 0;
00199   p->ncsalloc = 0;
00200   for (i = 0; i < NPAREN; i++) {
00201     p->pbegin[i] = 0;
00202     p->pend[i] = 0;
00203   }
00204   g->csetsize = NC;
00205   g->sets = NULL;
00206   g->setbits = NULL;
00207   g->ncsets = 0;
00208   g->cflags = cflags;
00209   g->iflags = 0;
00210   g->nbol = 0;
00211   g->neol = 0;
00212   g->must = NULL;
00213   g->mlen = 0;
00214   g->nsub = 0;
00215   g->ncategories = 1; /* category 0 is "everything else" */
00216   g->categories = &g->catspace[-(CHAR_MIN)];
00217   (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
00218   g->backrefs = 0;
00219 
00220   /* do it */
00221   EMIT(OEND, 0);
00222   g->firststate = THERE();
00223   if (cflags&REG_EXTENDED)
00224     p_ere(p, OUT);
00225   else if (cflags&REG_NOSPEC)
00226     p_str(p);
00227   else
00228     p_bre(p, OUT, OUT);
00229   EMIT(OEND, 0);
00230   g->laststate = THERE();
00231 
00232   /* tidy up loose ends and fill things in */
00233   categorize(p, g);
00234   stripsnug(p, g);
00235   findmust(p, g);
00236   g->nplus = pluscount(p, g);
00237   g->magic = MAGIC2;
00238   preg->re_nsub = g->nsub;
00239   preg->re_g = g;
00240   preg->re_magic = MAGIC1;
00241 #ifndef REDEBUG
00242   /* not debugging, so can't rely on the assert() in llvm_regexec() */
00243   if (g->iflags&REGEX_BAD)
00244     SETERROR(REG_ASSERT);
00245 #endif
00246 
00247   /* win or lose, we're done */
00248   if (p->error != 0)  /* lose */
00249     llvm_regfree(preg);
00250   return(p->error);
00251 }
00252 
00253 /*
00254  - p_ere - ERE parser top level, concatenation and alternation
00255  */
00256 static void
00257 p_ere(struct parse *p, int stop)  /* character this ERE should end at */
00258 {
00259   char c;
00260   sopno prevback = 0;
00261   sopno prevfwd = 0;
00262   sopno conc;
00263   int first = 1;    /* is this the first alternative? */
00264 
00265   for (;;) {
00266     /* do a bunch of concatenated expressions */
00267     conc = HERE();
00268     while (MORE() && (c = PEEK()) != '|' && c != stop)
00269       p_ere_exp(p);
00270     REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
00271 
00272     if (!EAT('|'))
00273       break;    /* NOTE BREAK OUT */
00274 
00275     if (first) {
00276       INSERT(OCH_, conc); /* offset is wrong */
00277       prevfwd = conc;
00278       prevback = conc;
00279       first = 0;
00280     }
00281     ASTERN(OOR1, prevback);
00282     prevback = THERE();
00283     AHEAD(prevfwd);     /* fix previous offset */
00284     prevfwd = HERE();
00285     EMIT(OOR2, 0);      /* offset is very wrong */
00286   }
00287 
00288   if (!first) {   /* tail-end fixups */
00289     AHEAD(prevfwd);
00290     ASTERN(O_CH, prevback);
00291   }
00292 
00293   assert(!MORE() || SEE(stop));
00294 }
00295 
00296 /*
00297  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
00298  */
00299 static void
00300 p_ere_exp(struct parse *p)
00301 {
00302   char c;
00303   sopno pos;
00304   int count;
00305   int count2;
00306   int backrefnum;
00307   sopno subno;
00308   int wascaret = 0;
00309 
00310   assert(MORE());   /* caller should have ensured this */
00311   c = GETNEXT();
00312 
00313   pos = HERE();
00314   switch (c) {
00315   case '(':
00316     REQUIRE(MORE(), REG_EPAREN);
00317     p->g->nsub++;
00318     subno = p->g->nsub;
00319     if (subno < NPAREN)
00320       p->pbegin[subno] = HERE();
00321     EMIT(OLPAREN, subno);
00322     if (!SEE(')'))
00323       p_ere(p, ')');
00324     if (subno < NPAREN) {
00325       p->pend[subno] = HERE();
00326       assert(p->pend[subno] != 0);
00327     }
00328     EMIT(ORPAREN, subno);
00329     MUSTEAT(')', REG_EPAREN);
00330     break;
00331 #ifndef POSIX_MISTAKE
00332   case ')':   /* happens only if no current unmatched ( */
00333     /*
00334      * You may ask, why the ifndef?  Because I didn't notice
00335      * this until slightly too late for 1003.2, and none of the
00336      * other 1003.2 regular-expression reviewers noticed it at
00337      * all.  So an unmatched ) is legal POSIX, at least until
00338      * we can get it fixed.
00339      */
00340     SETERROR(REG_EPAREN);
00341     break;
00342 #endif
00343   case '^':
00344     EMIT(OBOL, 0);
00345     p->g->iflags |= USEBOL;
00346     p->g->nbol++;
00347     wascaret = 1;
00348     break;
00349   case '$':
00350     EMIT(OEOL, 0);
00351     p->g->iflags |= USEEOL;
00352     p->g->neol++;
00353     break;
00354   case '|':
00355     SETERROR(REG_EMPTY);
00356     break;
00357   case '*':
00358   case '+':
00359   case '?':
00360     SETERROR(REG_BADRPT);
00361     break;
00362   case '.':
00363     if (p->g->cflags&REG_NEWLINE)
00364       nonnewline(p);
00365     else
00366       EMIT(OANY, 0);
00367     break;
00368   case '[':
00369     p_bracket(p);
00370     break;
00371   case '\\':
00372     REQUIRE(MORE(), REG_EESCAPE);
00373     c = GETNEXT();
00374     if (c >= '1' && c <= '9') {
00375       /* \[0-9] is taken to be a back-reference to a previously specified
00376        * matching group. backrefnum will hold the number. The matching
00377        * group must exist (i.e. if \4 is found there must have been at
00378        * least 4 matching groups specified in the pattern previously).
00379        */
00380       backrefnum = c - '0';
00381       if (p->pend[backrefnum] == 0) {
00382         SETERROR(REG_ESUBREG);
00383         break;
00384       }
00385 
00386       /* Make sure everything checks out and emit the sequence
00387        * that marks a back-reference to the parse structure.
00388        */
00389       assert(backrefnum <= p->g->nsub);
00390       EMIT(OBACK_, backrefnum);
00391       assert(p->pbegin[backrefnum] != 0);
00392       assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
00393       assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
00394       (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
00395       EMIT(O_BACK, backrefnum);
00396       p->g->backrefs = 1;
00397     } else {
00398       /* Other chars are simply themselves when escaped with a backslash.
00399        */
00400       ordinary(p, c);
00401     }
00402     break;
00403   case '{':   /* okay as ordinary except if digit follows */
00404     REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
00405     /* FALLTHROUGH */
00406   default:
00407     ordinary(p, c);
00408     break;
00409   }
00410 
00411   if (!MORE())
00412     return;
00413   c = PEEK();
00414   /* we call { a repetition if followed by a digit */
00415   if (!( c == '*' || c == '+' || c == '?' ||
00416         (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
00417     return;   /* no repetition, we're done */
00418   NEXT();
00419 
00420   REQUIRE(!wascaret, REG_BADRPT);
00421   switch (c) {
00422   case '*': /* implemented as +? */
00423     /* this case does not require the (y|) trick, noKLUDGE */
00424     INSERT(OPLUS_, pos);
00425     ASTERN(O_PLUS, pos);
00426     INSERT(OQUEST_, pos);
00427     ASTERN(O_QUEST, pos);
00428     break;
00429   case '+':
00430     INSERT(OPLUS_, pos);
00431     ASTERN(O_PLUS, pos);
00432     break;
00433   case '?':
00434     /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
00435     INSERT(OCH_, pos);    /* offset slightly wrong */
00436     ASTERN(OOR1, pos);    /* this one's right */
00437     AHEAD(pos);     /* fix the OCH_ */
00438     EMIT(OOR2, 0);      /* offset very wrong... */
00439     AHEAD(THERE());     /* ...so fix it */
00440     ASTERN(O_CH, THERETHERE());
00441     break;
00442   case '{':
00443     count = p_count(p);
00444     if (EAT(',')) {
00445       if (isdigit((uch)PEEK())) {
00446         count2 = p_count(p);
00447         REQUIRE(count <= count2, REG_BADBR);
00448       } else    /* single number with comma */
00449         count2 = INFINITY;
00450     } else    /* just a single number */
00451       count2 = count;
00452     repeat(p, pos, count, count2);
00453     if (!EAT('}')) {  /* error heuristics */
00454       while (MORE() && PEEK() != '}')
00455         NEXT();
00456       REQUIRE(MORE(), REG_EBRACE);
00457       SETERROR(REG_BADBR);
00458     }
00459     break;
00460   }
00461 
00462   if (!MORE())
00463     return;
00464   c = PEEK();
00465   if (!( c == '*' || c == '+' || c == '?' ||
00466         (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
00467     return;
00468   SETERROR(REG_BADRPT);
00469 }
00470 
00471 /*
00472  - p_str - string (no metacharacters) "parser"
00473  */
00474 static void
00475 p_str(struct parse *p)
00476 {
00477   REQUIRE(MORE(), REG_EMPTY);
00478   while (MORE())
00479     ordinary(p, GETNEXT());
00480 }
00481 
00482 /*
00483  - p_bre - BRE parser top level, anchoring and concatenation
00484  * Giving end1 as OUT essentially eliminates the end1/end2 check.
00485  *
00486  * This implementation is a bit of a kludge, in that a trailing $ is first
00487  * taken as an ordinary character and then revised to be an anchor.  The
00488  * only undesirable side effect is that '$' gets included as a character
00489  * category in such cases.  This is fairly harmless; not worth fixing.
00490  * The amount of lookahead needed to avoid this kludge is excessive.
00491  */
00492 static void
00493 p_bre(struct parse *p,
00494     int end1,   /* first terminating character */
00495     int end2)   /* second terminating character */
00496 {
00497   sopno start = HERE();
00498   int first = 1;      /* first subexpression? */
00499   int wasdollar = 0;
00500 
00501   if (EAT('^')) {
00502     EMIT(OBOL, 0);
00503     p->g->iflags |= USEBOL;
00504     p->g->nbol++;
00505   }
00506   while (MORE() && !SEETWO(end1, end2)) {
00507     wasdollar = p_simp_re(p, first);
00508     first = 0;
00509   }
00510   if (wasdollar) {  /* oops, that was a trailing anchor */
00511     DROP(1);
00512     EMIT(OEOL, 0);
00513     p->g->iflags |= USEEOL;
00514     p->g->neol++;
00515   }
00516 
00517   REQUIRE(HERE() != start, REG_EMPTY);  /* require nonempty */
00518 }
00519 
00520 /*
00521  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
00522  */
00523 static int      /* was the simple RE an unbackslashed $? */
00524 p_simp_re(struct parse *p,
00525     int starordinary)   /* is a leading * an ordinary character? */
00526 {
00527   int c;
00528   int count;
00529   int count2;
00530   sopno pos;
00531   int i;
00532   sopno subno;
00533 # define  BACKSL  (1<<CHAR_BIT)
00534 
00535         pos = HERE(); /* repetition op, if any, covers from here */
00536 
00537         assert(MORE()); /* caller should have ensured this */
00538         c = GETNEXT();
00539   if (c == '\\') {
00540     REQUIRE(MORE(), REG_EESCAPE);
00541     c = BACKSL | GETNEXT();
00542   }
00543   switch (c) {
00544   case '.':
00545     if (p->g->cflags&REG_NEWLINE)
00546       nonnewline(p);
00547     else
00548       EMIT(OANY, 0);
00549     break;
00550   case '[':
00551     p_bracket(p);
00552     break;
00553   case BACKSL|'{':
00554     SETERROR(REG_BADRPT);
00555     break;
00556   case BACKSL|'(':
00557     p->g->nsub++;
00558     subno = p->g->nsub;
00559     if (subno < NPAREN)
00560       p->pbegin[subno] = HERE();
00561     EMIT(OLPAREN, subno);
00562     /* the MORE here is an error heuristic */
00563     if (MORE() && !SEETWO('\\', ')'))
00564       p_bre(p, '\\', ')');
00565     if (subno < NPAREN) {
00566       p->pend[subno] = HERE();
00567       assert(p->pend[subno] != 0);
00568     }
00569     EMIT(ORPAREN, subno);
00570     REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
00571     break;
00572   case BACKSL|')':  /* should not get here -- must be user */
00573   case BACKSL|'}':
00574     SETERROR(REG_EPAREN);
00575     break;
00576   case BACKSL|'1':
00577   case BACKSL|'2':
00578   case BACKSL|'3':
00579   case BACKSL|'4':
00580   case BACKSL|'5':
00581   case BACKSL|'6':
00582   case BACKSL|'7':
00583   case BACKSL|'8':
00584   case BACKSL|'9':
00585     i = (c&~BACKSL) - '0';
00586     assert(i < NPAREN);
00587     if (p->pend[i] != 0) {
00588       assert(i <= p->g->nsub);
00589       EMIT(OBACK_, i);
00590       assert(p->pbegin[i] != 0);
00591       assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
00592       assert(OP(p->strip[p->pend[i]]) == ORPAREN);
00593       (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
00594       EMIT(O_BACK, i);
00595     } else
00596       SETERROR(REG_ESUBREG);
00597     p->g->backrefs = 1;
00598     break;
00599   case '*':
00600     REQUIRE(starordinary, REG_BADRPT);
00601     /* FALLTHROUGH */
00602   default:
00603     ordinary(p, (char)c);
00604     break;
00605   }
00606 
00607   if (EAT('*')) {   /* implemented as +? */
00608     /* this case does not require the (y|) trick, noKLUDGE */
00609     INSERT(OPLUS_, pos);
00610     ASTERN(O_PLUS, pos);
00611     INSERT(OQUEST_, pos);
00612     ASTERN(O_QUEST, pos);
00613   } else if (EATTWO('\\', '{')) {
00614     count = p_count(p);
00615     if (EAT(',')) {
00616       if (MORE() && isdigit((uch)PEEK())) {
00617         count2 = p_count(p);
00618         REQUIRE(count <= count2, REG_BADBR);
00619       } else    /* single number with comma */
00620         count2 = INFINITY;
00621     } else    /* just a single number */
00622       count2 = count;
00623     repeat(p, pos, count, count2);
00624     if (!EATTWO('\\', '}')) { /* error heuristics */
00625       while (MORE() && !SEETWO('\\', '}'))
00626         NEXT();
00627       REQUIRE(MORE(), REG_EBRACE);
00628       SETERROR(REG_BADBR);
00629     }
00630   } else if (c == '$')  /* $ (but not \$) ends it */
00631     return(1);
00632 
00633   return(0);
00634 }
00635 
00636 /*
00637  - p_count - parse a repetition count
00638  */
00639 static int      /* the value */
00640 p_count(struct parse *p)
00641 {
00642   int count = 0;
00643   int ndigits = 0;
00644 
00645   while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
00646     count = count*10 + (GETNEXT() - '0');
00647     ndigits++;
00648   }
00649 
00650   REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
00651   return(count);
00652 }
00653 
00654 /*
00655  - p_bracket - parse a bracketed character list
00656  *
00657  * Note a significant property of this code:  if the allocset() did SETERROR,
00658  * no set operations are done.
00659  */
00660 static void
00661 p_bracket(struct parse *p)
00662 {
00663   cset *cs;
00664   int invert = 0;
00665 
00666   /* Dept of Truly Sickening Special-Case Kludges */
00667   if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
00668     EMIT(OBOW, 0);
00669     NEXTn(6);
00670     return;
00671   }
00672   if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
00673     EMIT(OEOW, 0);
00674     NEXTn(6);
00675     return;
00676   }
00677 
00678   if ((cs = allocset(p)) == NULL) {
00679     /* allocset did set error status in p */
00680     return;
00681   }
00682 
00683   if (EAT('^'))
00684     invert++; /* make note to invert set at end */
00685   if (EAT(']'))
00686     CHadd(cs, ']');
00687   else if (EAT('-'))
00688     CHadd(cs, '-');
00689   while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
00690     p_b_term(p, cs);
00691   if (EAT('-'))
00692     CHadd(cs, '-');
00693   MUSTEAT(']', REG_EBRACK);
00694 
00695   if (p->error != 0) {  /* don't mess things up further */
00696     freeset(p, cs);
00697     return;
00698   }
00699 
00700   if (p->g->cflags&REG_ICASE) {
00701     int i;
00702     int ci;
00703 
00704     for (i = p->g->csetsize - 1; i >= 0; i--)
00705       if (CHIN(cs, i) && isalpha(i)) {
00706         ci = othercase(i);
00707         if (ci != i)
00708           CHadd(cs, ci);
00709       }
00710     if (cs->multis != NULL)
00711       mccase(p, cs);
00712   }
00713   if (invert) {
00714     int i;
00715 
00716     for (i = p->g->csetsize - 1; i >= 0; i--)
00717       if (CHIN(cs, i))
00718         CHsub(cs, i);
00719       else
00720         CHadd(cs, i);
00721     if (p->g->cflags&REG_NEWLINE)
00722       CHsub(cs, '\n');
00723     if (cs->multis != NULL)
00724       mcinvert(p, cs);
00725   }
00726 
00727   assert(cs->multis == NULL);   /* xxx */
00728 
00729   if (nch(p, cs) == 1) {    /* optimize singleton sets */
00730     ordinary(p, firstch(p, cs));
00731     freeset(p, cs);
00732   } else
00733     EMIT(OANYOF, freezeset(p, cs));
00734 }
00735 
00736 /*
00737  - p_b_term - parse one term of a bracketed character list
00738  */
00739 static void
00740 p_b_term(struct parse *p, cset *cs)
00741 {
00742   char c;
00743   char start, finish;
00744   int i;
00745 
00746   /* classify what we've got */
00747   switch ((MORE()) ? PEEK() : '\0') {
00748   case '[':
00749     c = (MORE2()) ? PEEK2() : '\0';
00750     break;
00751   case '-':
00752     SETERROR(REG_ERANGE);
00753     return;     /* NOTE RETURN */
00754     break;
00755   default:
00756     c = '\0';
00757     break;
00758   }
00759 
00760   switch (c) {
00761   case ':':   /* character class */
00762     NEXT2();
00763     REQUIRE(MORE(), REG_EBRACK);
00764     c = PEEK();
00765     REQUIRE(c != '-' && c != ']', REG_ECTYPE);
00766     p_b_cclass(p, cs);
00767     REQUIRE(MORE(), REG_EBRACK);
00768     REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
00769     break;
00770   case '=':   /* equivalence class */
00771     NEXT2();
00772     REQUIRE(MORE(), REG_EBRACK);
00773     c = PEEK();
00774     REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
00775     p_b_eclass(p, cs);
00776     REQUIRE(MORE(), REG_EBRACK);
00777     REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
00778     break;
00779   default:    /* symbol, ordinary character, or range */
00780 /* xxx revision needed for multichar stuff */
00781     start = p_b_symbol(p);
00782     if (SEE('-') && MORE2() && PEEK2() != ']') {
00783       /* range */
00784       NEXT();
00785       if (EAT('-'))
00786         finish = '-';
00787       else
00788         finish = p_b_symbol(p);
00789     } else
00790       finish = start;
00791 /* xxx what about signed chars here... */
00792     REQUIRE(start <= finish, REG_ERANGE);
00793     for (i = start; i <= finish; i++)
00794       CHadd(cs, i);
00795     break;
00796   }
00797 }
00798 
00799 /*
00800  - p_b_cclass - parse a character-class name and deal with it
00801  */
00802 static void
00803 p_b_cclass(struct parse *p, cset *cs)
00804 {
00805   char *sp = p->next;
00806   struct cclass *cp;
00807   size_t len;
00808   const char *u;
00809   char c;
00810 
00811   while (MORE() && isalpha((uch)PEEK()))
00812     NEXT();
00813   len = p->next - sp;
00814   for (cp = cclasses; cp->name != NULL; cp++)
00815     if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
00816       break;
00817   if (cp->name == NULL) {
00818     /* oops, didn't find it */
00819     SETERROR(REG_ECTYPE);
00820     return;
00821   }
00822 
00823   u = cp->chars;
00824   while ((c = *u++) != '\0')
00825     CHadd(cs, c);
00826   for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
00827     MCadd(p, cs, u);
00828 }
00829 
00830 /*
00831  - p_b_eclass - parse an equivalence-class name and deal with it
00832  *
00833  * This implementation is incomplete. xxx
00834  */
00835 static void
00836 p_b_eclass(struct parse *p, cset *cs)
00837 {
00838   char c;
00839 
00840   c = p_b_coll_elem(p, '=');
00841   CHadd(cs, c);
00842 }
00843 
00844 /*
00845  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
00846  */
00847 static char     /* value of symbol */
00848 p_b_symbol(struct parse *p)
00849 {
00850   char value;
00851 
00852   REQUIRE(MORE(), REG_EBRACK);
00853   if (!EATTWO('[', '.'))
00854     return(GETNEXT());
00855 
00856   /* collating symbol */
00857   value = p_b_coll_elem(p, '.');
00858   REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
00859   return(value);
00860 }
00861 
00862 /*
00863  - p_b_coll_elem - parse a collating-element name and look it up
00864  */
00865 static char     /* value of collating element */
00866 p_b_coll_elem(struct parse *p,
00867     int endc)     /* name ended by endc,']' */
00868 {
00869   char *sp = p->next;
00870   struct cname *cp;
00871   int len;
00872 
00873   while (MORE() && !SEETWO(endc, ']'))
00874     NEXT();
00875   if (!MORE()) {
00876     SETERROR(REG_EBRACK);
00877     return(0);
00878   }
00879   len = p->next - sp;
00880   for (cp = cnames; cp->name != NULL; cp++)
00881     if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
00882       return(cp->code); /* known name */
00883   if (len == 1)
00884     return(*sp);  /* single character */
00885   SETERROR(REG_ECOLLATE);     /* neither */
00886   return(0);
00887 }
00888 
00889 /*
00890  - othercase - return the case counterpart of an alphabetic
00891  */
00892 static char     /* if no counterpart, return ch */
00893 othercase(int ch)
00894 {
00895   ch = (uch)ch;
00896   assert(isalpha(ch));
00897   if (isupper(ch))
00898     return ((uch)tolower(ch));
00899   else if (islower(ch))
00900     return ((uch)toupper(ch));
00901   else      /* peculiar, but could happen */
00902     return(ch);
00903 }
00904 
00905 /*
00906  - bothcases - emit a dualcase version of a two-case character
00907  *
00908  * Boy, is this implementation ever a kludge...
00909  */
00910 static void
00911 bothcases(struct parse *p, int ch)
00912 {
00913   char *oldnext = p->next;
00914   char *oldend = p->end;
00915   char bracket[3];
00916 
00917   ch = (uch)ch;
00918   assert(othercase(ch) != ch);  /* p_bracket() would recurse */
00919   p->next = bracket;
00920   p->end = bracket+2;
00921   bracket[0] = ch;
00922   bracket[1] = ']';
00923   bracket[2] = '\0';
00924   p_bracket(p);
00925   assert(p->next == bracket+2);
00926   p->next = oldnext;
00927   p->end = oldend;
00928 }
00929 
00930 /*
00931  - ordinary - emit an ordinary character
00932  */
00933 static void
00934 ordinary(struct parse *p, int ch)
00935 {
00936   cat_t *cap = p->g->categories;
00937 
00938   if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
00939     bothcases(p, ch);
00940   else {
00941     EMIT(OCHAR, (uch)ch);
00942     if (cap[ch] == 0)
00943       cap[ch] = p->g->ncategories++;
00944   }
00945 }
00946 
00947 /*
00948  - nonnewline - emit REG_NEWLINE version of OANY
00949  *
00950  * Boy, is this implementation ever a kludge...
00951  */
00952 static void
00953 nonnewline(struct parse *p)
00954 {
00955   char *oldnext = p->next;
00956   char *oldend = p->end;
00957   char bracket[4];
00958 
00959   p->next = bracket;
00960   p->end = bracket+3;
00961   bracket[0] = '^';
00962   bracket[1] = '\n';
00963   bracket[2] = ']';
00964   bracket[3] = '\0';
00965   p_bracket(p);
00966   assert(p->next == bracket+3);
00967   p->next = oldnext;
00968   p->end = oldend;
00969 }
00970 
00971 /*
00972  - repeat - generate code for a bounded repetition, recursively if needed
00973  */
00974 static void
00975 repeat(struct parse *p,
00976     sopno start,    /* operand from here to end of strip */
00977     int from,     /* repeated from this number */
00978     int to)     /* to this number of times (maybe INFINITY) */
00979 {
00980   sopno finish = HERE();
00981 # define  N 2
00982 # define  INF 3
00983 # define  REP(f, t) ((f)*8 + (t))
00984 # define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
00985   sopno copy;
00986 
00987   if (p->error != 0)  /* head off possible runaway recursion */
00988     return;
00989 
00990   assert(from <= to);
00991 
00992   switch (REP(MAP(from), MAP(to))) {
00993   case REP(0, 0):     /* must be user doing this */
00994     DROP(finish-start); /* drop the operand */
00995     break;
00996   case REP(0, 1):     /* as x{1,1}? */
00997   case REP(0, N):     /* as x{1,n}? */
00998   case REP(0, INF):   /* as x{1,}? */
00999     /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
01000     INSERT(OCH_, start);    /* offset is wrong... */
01001     repeat(p, start+1, 1, to);
01002     ASTERN(OOR1, start);
01003     AHEAD(start);     /* ... fix it */
01004     EMIT(OOR2, 0);
01005     AHEAD(THERE());
01006     ASTERN(O_CH, THERETHERE());
01007     break;
01008   case REP(1, 1):     /* trivial case */
01009     /* done */
01010     break;
01011   case REP(1, N):     /* as x?x{1,n-1} */
01012     /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
01013     INSERT(OCH_, start);
01014     ASTERN(OOR1, start);
01015     AHEAD(start);
01016     EMIT(OOR2, 0);      /* offset very wrong... */
01017     AHEAD(THERE());     /* ...so fix it */
01018     ASTERN(O_CH, THERETHERE());
01019     copy = dupl(p, start+1, finish+1);
01020     assert(copy == finish+4);
01021     repeat(p, copy, 1, to-1);
01022     break;
01023   case REP(1, INF):   /* as x+ */
01024     INSERT(OPLUS_, start);
01025     ASTERN(O_PLUS, start);
01026     break;
01027   case REP(N, N):     /* as xx{m-1,n-1} */
01028     copy = dupl(p, start, finish);
01029     repeat(p, copy, from-1, to-1);
01030     break;
01031   case REP(N, INF):   /* as xx{n-1,INF} */
01032     copy = dupl(p, start, finish);
01033     repeat(p, copy, from-1, to);
01034     break;
01035   default:      /* "can't happen" */
01036     SETERROR(REG_ASSERT); /* just in case */
01037     break;
01038   }
01039 }
01040 
01041 /*
01042  - seterr - set an error condition
01043  */
01044 static int      /* useless but makes type checking happy */
01045 seterr(struct parse *p, int e)
01046 {
01047   if (p->error == 0)  /* keep earliest error condition */
01048     p->error = e;
01049   p->next = nuls;   /* try to bring things to a halt */
01050   p->end = nuls;
01051   return(0);    /* make the return value well-defined */
01052 }
01053 
01054 /*
01055  - allocset - allocate a set of characters for []
01056  */
01057 static cset *
01058 allocset(struct parse *p)
01059 {
01060   int no = p->g->ncsets++;
01061   size_t nc;
01062   size_t nbytes;
01063   cset *cs;
01064   size_t css = (size_t)p->g->csetsize;
01065   int i;
01066 
01067   if (no >= p->ncsalloc) {  /* need another column of space */
01068     void *ptr;
01069 
01070     p->ncsalloc += CHAR_BIT;
01071     nc = p->ncsalloc;
01072     assert(nc % CHAR_BIT == 0);
01073     nbytes = nc / CHAR_BIT * css;
01074 
01075     ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
01076     if (ptr == NULL)
01077       goto nomem;
01078     p->g->sets = ptr;
01079 
01080     ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
01081     if (ptr == NULL)
01082       goto nomem;
01083     p->g->setbits = ptr;
01084 
01085     for (i = 0; i < no; i++)
01086       p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
01087 
01088     (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
01089   }
01090   /* XXX should not happen */
01091   if (p->g->sets == NULL || p->g->setbits == NULL)
01092     goto nomem;
01093 
01094   cs = &p->g->sets[no];
01095   cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
01096   cs->mask = 1 << ((no) % CHAR_BIT);
01097   cs->hash = 0;
01098   cs->smultis = 0;
01099   cs->multis = NULL;
01100 
01101   return(cs);
01102 nomem:
01103   free(p->g->sets);
01104   p->g->sets = NULL;
01105   free(p->g->setbits);
01106   p->g->setbits = NULL;
01107 
01108   SETERROR(REG_ESPACE);
01109   /* caller's responsibility not to do set ops */
01110   return(NULL);
01111 }
01112 
01113 /*
01114  - freeset - free a now-unused set
01115  */
01116 static void
01117 freeset(struct parse *p, cset *cs)
01118 {
01119   size_t i;
01120   cset *top = &p->g->sets[p->g->ncsets];
01121   size_t css = (size_t)p->g->csetsize;
01122 
01123   for (i = 0; i < css; i++)
01124     CHsub(cs, i);
01125   if (cs == top-1)  /* recover only the easy case */
01126     p->g->ncsets--;
01127 }
01128 
01129 /*
01130  - freezeset - final processing on a set of characters
01131  *
01132  * The main task here is merging identical sets.  This is usually a waste
01133  * of time (although the hash code minimizes the overhead), but can win
01134  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
01135  * is done using addition rather than xor -- all ASCII [aA] sets xor to
01136  * the same value!
01137  */
01138 static int      /* set number */
01139 freezeset(struct parse *p, cset *cs)
01140 {
01141   uch h = cs->hash;
01142   size_t i;
01143   cset *top = &p->g->sets[p->g->ncsets];
01144   cset *cs2;
01145   size_t css = (size_t)p->g->csetsize;
01146 
01147   /* look for an earlier one which is the same */
01148   for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
01149     if (cs2->hash == h && cs2 != cs) {
01150       /* maybe */
01151       for (i = 0; i < css; i++)
01152         if (!!CHIN(cs2, i) != !!CHIN(cs, i))
01153           break;    /* no */
01154       if (i == css)
01155         break;      /* yes */
01156     }
01157 
01158   if (cs2 < top) {  /* found one */
01159     freeset(p, cs);
01160     cs = cs2;
01161   }
01162 
01163   return((int)(cs - p->g->sets));
01164 }
01165 
01166 /*
01167  - firstch - return first character in a set (which must have at least one)
01168  */
01169 static int      /* character; there is no "none" value */
01170 firstch(struct parse *p, cset *cs)
01171 {
01172   size_t i;
01173   size_t css = (size_t)p->g->csetsize;
01174 
01175   for (i = 0; i < css; i++)
01176     if (CHIN(cs, i))
01177       return((char)i);
01178   assert(never);
01179   return(0);    /* arbitrary */
01180 }
01181 
01182 /*
01183  - nch - number of characters in a set
01184  */
01185 static int
01186 nch(struct parse *p, cset *cs)
01187 {
01188   size_t i;
01189   size_t css = (size_t)p->g->csetsize;
01190   int n = 0;
01191 
01192   for (i = 0; i < css; i++)
01193     if (CHIN(cs, i))
01194       n++;
01195   return(n);
01196 }
01197 
01198 /*
01199  - mcadd - add a collating element to a cset
01200  */
01201 static void
01202 mcadd( struct parse *p, cset *cs, const char *cp)
01203 {
01204   size_t oldend = cs->smultis;
01205   void *np;
01206 
01207   cs->smultis += strlen(cp) + 1;
01208   np = realloc(cs->multis, cs->smultis);
01209   if (np == NULL) {
01210     if (cs->multis)
01211       free(cs->multis);
01212     cs->multis = NULL;
01213     SETERROR(REG_ESPACE);
01214     return;
01215   }
01216   cs->multis = np;
01217 
01218   llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
01219 }
01220 
01221 /*
01222  - mcinvert - invert the list of collating elements in a cset
01223  *
01224  * This would have to know the set of possibilities.  Implementation
01225  * is deferred.
01226  */
01227 /* ARGSUSED */
01228 static void
01229 mcinvert(struct parse *p, cset *cs)
01230 {
01231   assert(cs->multis == NULL); /* xxx */
01232 }
01233 
01234 /*
01235  - mccase - add case counterparts of the list of collating elements in a cset
01236  *
01237  * This would have to know the set of possibilities.  Implementation
01238  * is deferred.
01239  */
01240 /* ARGSUSED */
01241 static void
01242 mccase(struct parse *p, cset *cs)
01243 {
01244   assert(cs->multis == NULL); /* xxx */
01245 }
01246 
01247 /*
01248  - isinsets - is this character in any sets?
01249  */
01250 static int      /* predicate */
01251 isinsets(struct re_guts *g, int c)
01252 {
01253   uch *col;
01254   int i;
01255   int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
01256   unsigned uc = (uch)c;
01257 
01258   for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
01259     if (col[uc] != 0)
01260       return(1);
01261   return(0);
01262 }
01263 
01264 /*
01265  - samesets - are these two characters in exactly the same sets?
01266  */
01267 static int      /* predicate */
01268 samesets(struct re_guts *g, int c1, int c2)
01269 {
01270   uch *col;
01271   int i;
01272   int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
01273   unsigned uc1 = (uch)c1;
01274   unsigned uc2 = (uch)c2;
01275 
01276   for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
01277     if (col[uc1] != col[uc2])
01278       return(0);
01279   return(1);
01280 }
01281 
01282 /*
01283  - categorize - sort out character categories
01284  */
01285 static void
01286 categorize(struct parse *p, struct re_guts *g)
01287 {
01288   cat_t *cats = g->categories;
01289   int c;
01290   int c2;
01291   cat_t cat;
01292 
01293   /* avoid making error situations worse */
01294   if (p->error != 0)
01295     return;
01296 
01297   for (c = CHAR_MIN; c <= CHAR_MAX; c++)
01298     if (cats[c] == 0 && isinsets(g, c)) {
01299       cat = g->ncategories++;
01300       cats[c] = cat;
01301       for (c2 = c+1; c2 <= CHAR_MAX; c2++)
01302         if (cats[c2] == 0 && samesets(g, c, c2))
01303           cats[c2] = cat;
01304     }
01305 }
01306 
01307 /*
01308  - dupl - emit a duplicate of a bunch of sops
01309  */
01310 static sopno      /* start of duplicate */
01311 dupl(struct parse *p,
01312     sopno start,    /* from here */
01313     sopno finish)   /* to this less one */
01314 {
01315   sopno ret = HERE();
01316   sopno len = finish - start;
01317 
01318   assert(finish >= start);
01319   if (len == 0)
01320     return(ret);
01321   enlarge(p, p->ssize + len); /* this many unexpected additions */
01322   assert(p->ssize >= p->slen + len);
01323   (void) memmove((char *)(p->strip + p->slen),
01324     (char *)(p->strip + start), (size_t)len*sizeof(sop));
01325   p->slen += len;
01326   return(ret);
01327 }
01328 
01329 /*
01330  - doemit - emit a strip operator
01331  *
01332  * It might seem better to implement this as a macro with a function as
01333  * hard-case backup, but it's just too big and messy unless there are
01334  * some changes to the data structures.  Maybe later.
01335  */
01336 static void
01337 doemit(struct parse *p, sop op, size_t opnd)
01338 {
01339   /* avoid making error situations worse */
01340   if (p->error != 0)
01341     return;
01342 
01343   /* deal with oversize operands ("can't happen", more or less) */
01344   assert(opnd < 1<<OPSHIFT);
01345 
01346   /* deal with undersized strip */
01347   if (p->slen >= p->ssize)
01348     enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
01349   assert(p->slen < p->ssize);
01350 
01351   /* finally, it's all reduced to the easy case */
01352   p->strip[p->slen++] = SOP(op, opnd);
01353 }
01354 
01355 /*
01356  - doinsert - insert a sop into the strip
01357  */
01358 static void
01359 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
01360 {
01361   sopno sn;
01362   sop s;
01363   int i;
01364 
01365   /* avoid making error situations worse */
01366   if (p->error != 0)
01367     return;
01368 
01369   sn = HERE();
01370   EMIT(op, opnd);   /* do checks, ensure space */
01371   assert(HERE() == sn+1);
01372   s = p->strip[sn];
01373 
01374   /* adjust paren pointers */
01375   assert(pos > 0);
01376   for (i = 1; i < NPAREN; i++) {
01377     if (p->pbegin[i] >= pos) {
01378       p->pbegin[i]++;
01379     }
01380     if (p->pend[i] >= pos) {
01381       p->pend[i]++;
01382     }
01383   }
01384 
01385   memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
01386             (HERE()-pos-1)*sizeof(sop));
01387   p->strip[pos] = s;
01388 }
01389 
01390 /*
01391  - dofwd - complete a forward reference
01392  */
01393 static void
01394 dofwd(struct parse *p, sopno pos, sop value)
01395 {
01396   /* avoid making error situations worse */
01397   if (p->error != 0)
01398     return;
01399 
01400   assert(value < 1<<OPSHIFT);
01401   p->strip[pos] = OP(p->strip[pos]) | value;
01402 }
01403 
01404 /*
01405  - enlarge - enlarge the strip
01406  */
01407 static void
01408 enlarge(struct parse *p, sopno size)
01409 {
01410   sop *sp;
01411 
01412   if (p->ssize >= size)
01413     return;
01414 
01415   sp = (sop *)realloc(p->strip, size*sizeof(sop));
01416   if (sp == NULL) {
01417     SETERROR(REG_ESPACE);
01418     return;
01419   }
01420   p->strip = sp;
01421   p->ssize = size;
01422 }
01423 
01424 /*
01425  - stripsnug - compact the strip
01426  */
01427 static void
01428 stripsnug(struct parse *p, struct re_guts *g)
01429 {
01430   g->nstates = p->slen;
01431   g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
01432   if (g->strip == NULL) {
01433     SETERROR(REG_ESPACE);
01434     g->strip = p->strip;
01435   }
01436 }
01437 
01438 /*
01439  - findmust - fill in must and mlen with longest mandatory literal string
01440  *
01441  * This algorithm could do fancy things like analyzing the operands of |
01442  * for common subsequences.  Someday.  This code is simple and finds most
01443  * of the interesting cases.
01444  *
01445  * Note that must and mlen got initialized during setup.
01446  */
01447 static void
01448 findmust(struct parse *p, struct re_guts *g)
01449 {
01450   sop *scan;
01451   sop *start = 0; /* start initialized in the default case, after that */
01452   sop *newstart = 0; /* newstart was initialized in the OCHAR case */
01453   sopno newlen;
01454   sop s;
01455   char *cp;
01456   sopno i;
01457 
01458   /* avoid making error situations worse */
01459   if (p->error != 0)
01460     return;
01461 
01462   /* find the longest OCHAR sequence in strip */
01463   newlen = 0;
01464   scan = g->strip + 1;
01465   do {
01466     s = *scan++;
01467     switch (OP(s)) {
01468     case OCHAR:   /* sequence member */
01469       if (newlen == 0)    /* new sequence */
01470         newstart = scan - 1;
01471       newlen++;
01472       break;
01473     case OPLUS_:    /* things that don't break one */
01474     case OLPAREN:
01475     case ORPAREN:
01476       break;
01477     case OQUEST_:   /* things that must be skipped */
01478     case OCH_:
01479       scan--;
01480       do {
01481         scan += OPND(s);
01482         s = *scan;
01483         /* assert() interferes w debug printouts */
01484         if (OP(s) != O_QUEST && OP(s) != O_CH &&
01485               OP(s) != OOR2) {
01486           g->iflags |= REGEX_BAD;
01487           return;
01488         }
01489       } while (OP(s) != O_QUEST && OP(s) != O_CH);
01490       /* fallthrough */
01491     default:    /* things that break a sequence */
01492       if (newlen > g->mlen) {   /* ends one */
01493         start = newstart;
01494         g->mlen = newlen;
01495       }
01496       newlen = 0;
01497       break;
01498     }
01499   } while (OP(s) != OEND);
01500 
01501   if (g->mlen == 0)   /* there isn't one */
01502     return;
01503 
01504   /* turn it into a character string */
01505   g->must = malloc((size_t)g->mlen + 1);
01506   if (g->must == NULL) {    /* argh; just forget it */
01507     g->mlen = 0;
01508     return;
01509   }
01510   cp = g->must;
01511   scan = start;
01512   for (i = g->mlen; i > 0; i--) {
01513     while (OP(s = *scan++) != OCHAR)
01514       continue;
01515     assert(cp < g->must + g->mlen);
01516     *cp++ = (char)OPND(s);
01517   }
01518   assert(cp == g->must + g->mlen);
01519   *cp++ = '\0';   /* just on general principles */
01520 }
01521 
01522 /*
01523  - pluscount - count + nesting
01524  */
01525 static sopno      /* nesting depth */
01526 pluscount(struct parse *p, struct re_guts *g)
01527 {
01528   sop *scan;
01529   sop s;
01530   sopno plusnest = 0;
01531   sopno maxnest = 0;
01532 
01533   if (p->error != 0)
01534     return(0);  /* there may not be an OEND */
01535 
01536   scan = g->strip + 1;
01537   do {
01538     s = *scan++;
01539     switch (OP(s)) {
01540     case OPLUS_:
01541       plusnest++;
01542       break;
01543     case O_PLUS:
01544       if (plusnest > maxnest)
01545         maxnest = plusnest;
01546       plusnest--;
01547       break;
01548     }
01549   } while (OP(s) != OEND);
01550   if (plusnest != 0)
01551     g->iflags |= REGEX_BAD;
01552   return(maxnest);
01553 }
Generated on Thu Sep 18 2014 03:21:24 for LLVM by Doxygen