PostgreSQL Source Code: src/backend/regex/regcomp.c Source File

Go to the documentation of this file.
00001 /*
00002  * re_*comp and friends - compile REs
00003  * This file #includes several others (see the bottom).
00004  *
00005  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
00006  *
00007  * Development of this software was funded, in part, by Cray Research Inc.,
00008  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
00009  * Corporation, none of whom are responsible for the results.  The author
00010  * thanks all of them.
00011  *
00012  * Redistribution and use in source and binary forms -- with or without
00013  * modification -- are permitted for any purpose, provided that
00014  * redistributions in source form retain this entire copyright notice and
00015  * indicate the origin and nature of any modifications.
00016  *
00017  * I'd appreciate being given credit for this package in the documentation
00018  * of software which uses it, but that is not a requirement.
00019  *
00020  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
00021  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
00022  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
00023  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00024  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00025  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00026  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00027  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00028  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00029  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030  *
00031  * src/backend/regex/regcomp.c
00032  *
00033  */
00034 
00035 #include "regex/regguts.h"
00036 
00037 /*
00038  * forward declarations, up here so forward datatypes etc. are defined early
00039  */
00040 /* === regcomp.c === */
00041 static void moresubs(struct vars *, int);
00042 static int  freev(struct vars *, int);
00043 static void makesearch(struct vars *, struct nfa *);
00044 static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
00045 static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
00046 static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
00047 static void nonword(struct vars *, int, struct state *, struct state *);
00048 static void word(struct vars *, int, struct state *, struct state *);
00049 static int  scannum(struct vars *);
00050 static void repeat(struct vars *, struct state *, struct state *, int, int);
00051 static void bracket(struct vars *, struct state *, struct state *);
00052 static void cbracket(struct vars *, struct state *, struct state *);
00053 static void brackpart(struct vars *, struct state *, struct state *);
00054 static const chr *scanplain(struct vars *);
00055 static void onechr(struct vars *, chr, struct state *, struct state *);
00056 static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
00057 static void wordchrs(struct vars *);
00058 static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
00059 static void freesubre(struct vars *, struct subre *);
00060 static void freesrnode(struct vars *, struct subre *);
00061 static void optst(struct vars *, struct subre *);
00062 static int  numst(struct subre *, int);
00063 static void markst(struct subre *);
00064 static void cleanst(struct vars *);
00065 static long nfatree(struct vars *, struct subre *, FILE *);
00066 static long nfanode(struct vars *, struct subre *, FILE *);
00067 static int  newlacon(struct vars *, struct state *, struct state *, int);
00068 static void freelacons(struct subre *, int);
00069 static void rfree(regex_t *);
00070 
00071 #ifdef REG_DEBUG
00072 static void dump(regex_t *, FILE *);
00073 static void dumpst(struct subre *, FILE *, int);
00074 static void stdump(struct subre *, FILE *, int);
00075 static const char *stid(struct subre *, char *, size_t);
00076 #endif
00077 /* === regc_lex.c === */
00078 static void lexstart(struct vars *);
00079 static void prefixes(struct vars *);
00080 static void lexnest(struct vars *, const chr *, const chr *);
00081 static void lexword(struct vars *);
00082 static int  next(struct vars *);
00083 static int  lexescape(struct vars *);
00084 static chr  lexdigits(struct vars *, int, int, int);
00085 static int  brenext(struct vars *, chr);
00086 static void skip(struct vars *);
00087 static chr  newline(void);
00088 static chr  chrnamed(struct vars *, const chr *, const chr *, chr);
00089 
00090 /* === regc_color.c === */
00091 static void initcm(struct vars *, struct colormap *);
00092 static void freecm(struct colormap *);
00093 static void cmtreefree(struct colormap *, union tree *, int);
00094 static color setcolor(struct colormap *, chr, pcolor);
00095 static color maxcolor(struct colormap *);
00096 static color newcolor(struct colormap *);
00097 static void freecolor(struct colormap *, pcolor);
00098 static color pseudocolor(struct colormap *);
00099 static color subcolor(struct colormap *, chr c);
00100 static color newsub(struct colormap *, pcolor);
00101 static void subrange(struct vars *, chr, chr, struct state *, struct state *);
00102 static void subblock(struct vars *, chr, struct state *, struct state *);
00103 static void okcolors(struct nfa *, struct colormap *);
00104 static void colorchain(struct colormap *, struct arc *);
00105 static void uncolorchain(struct colormap *, struct arc *);
00106 static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *);
00107 static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
00108 
00109 #ifdef REG_DEBUG
00110 static void dumpcolors(struct colormap *, FILE *);
00111 static void fillcheck(struct colormap *, union tree *, int, FILE *);
00112 static void dumpchr(chr, FILE *);
00113 #endif
00114 /* === regc_nfa.c === */
00115 static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
00116 static void freenfa(struct nfa *);
00117 static struct state *newstate(struct nfa *);
00118 static struct state *newfstate(struct nfa *, int flag);
00119 static void dropstate(struct nfa *, struct state *);
00120 static void freestate(struct nfa *, struct state *);
00121 static void destroystate(struct nfa *, struct state *);
00122 static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
00123 static struct arc *allocarc(struct nfa *, struct state *);
00124 static void freearc(struct nfa *, struct arc *);
00125 static int  hasnonemptyout(struct state *);
00126 static int  nonemptyouts(struct state *);
00127 static int  nonemptyins(struct state *);
00128 static struct arc *findarc(struct state *, int, pcolor);
00129 static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
00130 static void moveins(struct nfa *, struct state *, struct state *);
00131 static void copyins(struct nfa *, struct state *, struct state *, int);
00132 static void moveouts(struct nfa *, struct state *, struct state *);
00133 static void copyouts(struct nfa *, struct state *, struct state *, int);
00134 static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
00135 static void delsub(struct nfa *, struct state *, struct state *);
00136 static void deltraverse(struct nfa *, struct state *, struct state *);
00137 static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
00138 static void duptraverse(struct nfa *, struct state *, struct state *);
00139 static void cleartraverse(struct nfa *, struct state *);
00140 static void specialcolors(struct nfa *);
00141 static long optimize(struct nfa *, FILE *);
00142 static void pullback(struct nfa *, FILE *);
00143 static int  pull(struct nfa *, struct arc *);
00144 static void pushfwd(struct nfa *, FILE *);
00145 static int  push(struct nfa *, struct arc *);
00146 
00147 #define INCOMPATIBLE    1       /* destroys arc */
00148 #define SATISFIED   2           /* constraint satisfied */
00149 #define COMPATIBLE  3           /* compatible but not satisfied yet */
00150 static int  combine(struct arc *, struct arc *);
00151 static void fixempties(struct nfa *, FILE *);
00152 static struct state *emptyreachable(struct state *, struct state *);
00153 static void replaceempty(struct nfa *, struct state *, struct state *);
00154 static void cleanup(struct nfa *);
00155 static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
00156 static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
00157 static long analyze(struct nfa *);
00158 static void compact(struct nfa *, struct cnfa *);
00159 static void carcsort(struct carc *, struct carc *);
00160 static void freecnfa(struct cnfa *);
00161 static void dumpnfa(struct nfa *, FILE *);
00162 
00163 #ifdef REG_DEBUG
00164 static void dumpstate(struct state *, FILE *);
00165 static void dumparcs(struct state *, FILE *);
00166 static int  dumprarcs(struct arc *, struct state *, FILE *, int);
00167 static void dumparc(struct arc *, struct state *, FILE *);
00168 static void dumpcnfa(struct cnfa *, FILE *);
00169 static void dumpcstate(int, struct cnfa *, FILE *);
00170 #endif
00171 /* === regc_cvec.c === */
00172 static struct cvec *newcvec(int, int);
00173 static struct cvec *clearcvec(struct cvec *);
00174 static void addchr(struct cvec *, chr);
00175 static void addrange(struct cvec *, chr, chr);
00176 static struct cvec *getcvec(struct vars *, int, int);
00177 static void freecvec(struct cvec *);
00178 
00179 /* === regc_pg_locale.c === */
00180 static int  pg_wc_isdigit(pg_wchar c);
00181 static int  pg_wc_isalpha(pg_wchar c);
00182 static int  pg_wc_isalnum(pg_wchar c);
00183 static int  pg_wc_isupper(pg_wchar c);
00184 static int  pg_wc_islower(pg_wchar c);
00185 static int  pg_wc_isgraph(pg_wchar c);
00186 static int  pg_wc_isprint(pg_wchar c);
00187 static int  pg_wc_ispunct(pg_wchar c);
00188 static int  pg_wc_isspace(pg_wchar c);
00189 static pg_wchar pg_wc_toupper(pg_wchar c);
00190 static pg_wchar pg_wc_tolower(pg_wchar c);
00191 
00192 /* === regc_locale.c === */
00193 static celt element(struct vars *, const chr *, const chr *);
00194 static struct cvec *range(struct vars *, celt, celt, int);
00195 static int  before(celt, celt);
00196 static struct cvec *eclass(struct vars *, celt, int);
00197 static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
00198 static struct cvec *allcases(struct vars *, chr);
00199 static int  cmp(const chr *, const chr *, size_t);
00200 static int  casecmp(const chr *, const chr *, size_t);
00201 
00202 
00203 /* internal variables, bundled for easy passing around */
00204 struct vars
00205 {
00206     regex_t    *re;
00207     const chr  *now;            /* scan pointer into string */
00208     const chr  *stop;           /* end of string */
00209     const chr  *savenow;        /* saved now and stop for "subroutine call" */
00210     const chr  *savestop;
00211     int         err;            /* error code (0 if none) */
00212     int         cflags;         /* copy of compile flags */
00213     int         lasttype;       /* type of previous token */
00214     int         nexttype;       /* type of next token */
00215     chr         nextvalue;      /* value (if any) of next token */
00216     int         lexcon;         /* lexical context type (see lex.c) */
00217     int         nsubexp;        /* subexpression count */
00218     struct subre **subs;        /* subRE pointer vector */
00219     size_t      nsubs;          /* length of vector */
00220     struct subre *sub10[10];    /* initial vector, enough for most */
00221     struct nfa *nfa;            /* the NFA */
00222     struct colormap *cm;        /* character color map */
00223     color       nlcolor;        /* color of newline */
00224     struct state *wordchrs;     /* state in nfa holding word-char outarcs */
00225     struct subre *tree;         /* subexpression tree */
00226     struct subre *treechain;    /* all tree nodes allocated */
00227     struct subre *treefree;     /* any free tree nodes */
00228     int         ntree;          /* number of tree nodes */
00229     struct cvec *cv;            /* interface cvec */
00230     struct cvec *cv2;           /* utility cvec */
00231     struct subre *lacons;       /* lookahead-constraint vector */
00232     int         nlacons;        /* size of lacons */
00233 };
00234 
00235 /* parsing macros; most know that `v' is the struct vars pointer */
00236 #define NEXT()  (next(v))       /* advance by one token */
00237 #define SEE(t)  (v->nexttype == (t))    /* is next token this? */
00238 #define EAT(t)  (SEE(t) && next(v))     /* if next is this, swallow it */
00239 #define VISERR(vv)  ((vv)->err != 0)    /* have we seen an error yet? */
00240 #define ISERR() VISERR(v)
00241 #define VERR(vv,e)  ((vv)->nexttype = EOS, \
00242                      (vv)->err = ((vv)->err ? (vv)->err : (e)))
00243 #define ERR(e)  VERR(v, e)      /* record an error */
00244 #define NOERR() {if (ISERR()) return;}  /* if error seen, return */
00245 #define NOERRN()    {if (ISERR()) return NULL;} /* NOERR with retval */
00246 #define NOERRZ()    {if (ISERR()) return 0;}    /* NOERR with retval */
00247 #define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */
00248 #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
00249 #define EMPTYARC(x, y)  newarc(v->nfa, EMPTY, 0, x, y)
00250 
00251 /* token type codes, some also used as NFA arc types */
00252 #define EMPTY   'n'             /* no token present */
00253 #define EOS 'e'                 /* end of string */
00254 #define PLAIN   'p'             /* ordinary character */
00255 #define DIGIT   'd'             /* digit (in bound) */
00256 #define BACKREF 'b'             /* back reference */
00257 #define COLLEL  'I'             /* start of [. */
00258 #define ECLASS  'E'             /* start of [= */
00259 #define CCLASS  'C'             /* start of [: */
00260 #define END 'X'                 /* end of [. [= [: */
00261 #define RANGE   'R'             /* - within [] which might be range delim. */
00262 #define LACON   'L'             /* lookahead constraint subRE */
00263 #define AHEAD   'a'             /* color-lookahead arc */
00264 #define BEHIND  'r'             /* color-lookbehind arc */
00265 #define WBDRY   'w'             /* word boundary constraint */
00266 #define NWBDRY  'W'             /* non-word-boundary constraint */
00267 #define SBEGIN  'A'             /* beginning of string (even if not BOL) */
00268 #define SEND    'Z'             /* end of string (even if not EOL) */
00269 #define PREFER  'P'             /* length preference */
00270 
00271 /* is an arc colored, and hence on a color chain? */
00272 #define COLORED(a) \
00273     ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
00274 
00275 
00276 /* static function list */
00277 static struct fns functions = {
00278     rfree,                      /* regfree insides */
00279 };
00280 
00281 
00282 
00283 /*
00284  * pg_regcomp - compile regular expression
00285  *
00286  * Note: on failure, no resources remain allocated, so pg_regfree()
00287  * need not be applied to re.
00288  */
00289 int
00290 pg_regcomp(regex_t *re,
00291            const chr *string,
00292            size_t len,
00293            int flags,
00294            Oid collation)
00295 {
00296     struct vars var;
00297     struct vars *v = &var;
00298     struct guts *g;
00299     int         i;
00300     size_t      j;
00301 
00302 #ifdef REG_DEBUG
00303     FILE       *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;
00304 #else
00305     FILE       *debug = (FILE *) NULL;
00306 #endif
00307 
00308 #define  CNOERR()    { if (ISERR()) return freev(v, v->err); }
00309 
00310     /* sanity checks */
00311 
00312     if (re == NULL || string == NULL)
00313         return REG_INVARG;
00314     if ((flags & REG_QUOTE) &&
00315         (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)))
00316         return REG_INVARG;
00317     if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
00318         return REG_INVARG;
00319 
00320     /* Initialize locale-dependent support */
00321     pg_set_regex_collation(collation);
00322 
00323     /* initial setup (after which freev() is callable) */
00324     v->re = re;
00325     v->now = string;
00326     v->stop = v->now + len;
00327     v->savenow = v->savestop = NULL;
00328     v->err = 0;
00329     v->cflags = flags;
00330     v->nsubexp = 0;
00331     v->subs = v->sub10;
00332     v->nsubs = 10;
00333     for (j = 0; j < v->nsubs; j++)
00334         v->subs[j] = NULL;
00335     v->nfa = NULL;
00336     v->cm = NULL;
00337     v->nlcolor = COLORLESS;
00338     v->wordchrs = NULL;
00339     v->tree = NULL;
00340     v->treechain = NULL;
00341     v->treefree = NULL;
00342     v->cv = NULL;
00343     v->cv2 = NULL;
00344     v->lacons = NULL;
00345     v->nlacons = 0;
00346     re->re_magic = REMAGIC;
00347     re->re_info = 0;            /* bits get set during parse */
00348     re->re_csize = sizeof(chr);
00349     re->re_collation = collation;
00350     re->re_guts = NULL;
00351     re->re_fns = VS(&functions);
00352 
00353     /* more complex setup, malloced things */
00354     re->re_guts = VS(MALLOC(sizeof(struct guts)));
00355     if (re->re_guts == NULL)
00356         return freev(v, REG_ESPACE);
00357     g = (struct guts *) re->re_guts;
00358     g->tree = NULL;
00359     initcm(v, &g->cmap);
00360     v->cm = &g->cmap;
00361     g->lacons = NULL;
00362     g->nlacons = 0;
00363     ZAPCNFA(g->search);
00364     v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
00365     CNOERR();
00366     /* set up a reasonably-sized transient cvec for getcvec usage */
00367     v->cv = newcvec(100, 20);
00368     if (v->cv == NULL)
00369         return freev(v, REG_ESPACE);
00370 
00371     /* parsing */
00372     lexstart(v);                /* also handles prefixes */
00373     if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH))
00374     {
00375         /* assign newline a unique color */
00376         v->nlcolor = subcolor(v->cm, newline());
00377         okcolors(v->nfa, v->cm);
00378     }
00379     CNOERR();
00380     v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
00381     assert(SEE(EOS));           /* even if error; ISERR() => SEE(EOS) */
00382     CNOERR();
00383     assert(v->tree != NULL);
00384 
00385     /* finish setup of nfa and its subre tree */
00386     specialcolors(v->nfa);
00387     CNOERR();
00388 #ifdef REG_DEBUG
00389     if (debug != NULL)
00390     {
00391         fprintf(debug, "\n\n\n========= RAW ==========\n");
00392         dumpnfa(v->nfa, debug);
00393         dumpst(v->tree, debug, 1);
00394     }
00395 #endif
00396     optst(v, v->tree);
00397     v->ntree = numst(v->tree, 1);
00398     markst(v->tree);
00399     cleanst(v);
00400 #ifdef REG_DEBUG
00401     if (debug != NULL)
00402     {
00403         fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
00404         dumpst(v->tree, debug, 1);
00405     }
00406 #endif
00407 
00408     /* build compacted NFAs for tree and lacons */
00409     re->re_info |= nfatree(v, v->tree, debug);
00410     CNOERR();
00411     assert(v->nlacons == 0 || v->lacons != NULL);
00412     for (i = 1; i < v->nlacons; i++)
00413     {
00414 #ifdef REG_DEBUG
00415         if (debug != NULL)
00416             fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
00417 #endif
00418         nfanode(v, &v->lacons[i], debug);
00419     }
00420     CNOERR();
00421     if (v->tree->flags & SHORTER)
00422         NOTE(REG_USHORTEST);
00423 
00424     /* build compacted NFAs for tree, lacons, fast search */
00425 #ifdef REG_DEBUG
00426     if (debug != NULL)
00427         fprintf(debug, "\n\n\n========= SEARCH ==========\n");
00428 #endif
00429     /* can sacrifice main NFA now, so use it as work area */
00430     (DISCARD) optimize(v->nfa, debug);
00431     CNOERR();
00432     makesearch(v, v->nfa);
00433     CNOERR();
00434     compact(v->nfa, &g->search);
00435     CNOERR();
00436 
00437     /* looks okay, package it up */
00438     re->re_nsub = v->nsubexp;
00439     v->re = NULL;               /* freev no longer frees re */
00440     g->magic = GUTSMAGIC;
00441     g->cflags = v->cflags;
00442     g->info = re->re_info;
00443     g->nsub = re->re_nsub;
00444     g->tree = v->tree;
00445     v->tree = NULL;
00446     g->ntree = v->ntree;
00447     g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
00448     g->lacons = v->lacons;
00449     v->lacons = NULL;
00450     g->nlacons = v->nlacons;
00451 
00452 #ifdef REG_DEBUG
00453     if (flags & REG_DUMP)
00454         dump(re, stdout);
00455 #endif
00456 
00457     assert(v->err == 0);
00458     return freev(v, 0);
00459 }
00460 
00461 /*
00462  * moresubs - enlarge subRE vector
00463  */
00464 static void
00465 moresubs(struct vars * v,
00466          int wanted)            /* want enough room for this one */
00467 {
00468     struct subre **p;
00469     size_t      n;
00470 
00471     assert(wanted > 0 && (size_t) wanted >= v->nsubs);
00472     n = (size_t) wanted *3 / 2 + 1;
00473 
00474     if (v->subs == v->sub10)
00475     {
00476         p = (struct subre **) MALLOC(n * sizeof(struct subre *));
00477         if (p != NULL)
00478             memcpy(VS(p), VS(v->subs),
00479                    v->nsubs * sizeof(struct subre *));
00480     }
00481     else
00482         p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
00483     if (p == NULL)
00484     {
00485         ERR(REG_ESPACE);
00486         return;
00487     }
00488     v->subs = p;
00489     for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
00490         *p = NULL;
00491     assert(v->nsubs == n);
00492     assert((size_t) wanted < v->nsubs);
00493 }
00494 
00495 /*
00496  * freev - free vars struct's substructures where necessary
00497  *
00498  * Optionally does error-number setting, and always returns error code
00499  * (if any), to make error-handling code terser.
00500  */
00501 static int
00502 freev(struct vars * v,
00503       int err)
00504 {
00505     if (v->re != NULL)
00506         rfree(v->re);
00507     if (v->subs != v->sub10)
00508         FREE(v->subs);
00509     if (v->nfa != NULL)
00510         freenfa(v->nfa);
00511     if (v->tree != NULL)
00512         freesubre(v, v->tree);
00513     if (v->treechain != NULL)
00514         cleanst(v);
00515     if (v->cv != NULL)
00516         freecvec(v->cv);
00517     if (v->cv2 != NULL)
00518         freecvec(v->cv2);
00519     if (v->lacons != NULL)
00520         freelacons(v->lacons, v->nlacons);
00521     ERR(err);                   /* nop if err==0 */
00522 
00523     return v->err;
00524 }
00525 
00526 /*
00527  * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
00528  * NFA must have been optimize()d already.
00529  */
00530 static void
00531 makesearch(struct vars * v,
00532            struct nfa * nfa)
00533 {
00534     struct arc *a;
00535     struct arc *b;
00536     struct state *pre = nfa->pre;
00537     struct state *s;
00538     struct state *s2;
00539     struct state *slist;
00540 
00541     /* no loops are needed if it's anchored */
00542     for (a = pre->outs; a != NULL; a = a->outchain)
00543     {
00544         assert(a->type == PLAIN);
00545         if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
00546             break;
00547     }
00548     if (a != NULL)
00549     {
00550         /* add implicit .* in front */
00551         rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
00552 
00553         /* and ^* and \A* too -- not always necessary, but harmless */
00554         newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
00555         newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
00556     }
00557 
00558     /*
00559      * Now here's the subtle part.  Because many REs have no lookback
00560      * constraints, often knowing when you were in the pre state tells you
00561      * little; it's the next state(s) that are informative.  But some of them
00562      * may have other inarcs, i.e. it may be possible to make actual progress
00563      * and then return to one of them.  We must de-optimize such cases,
00564      * splitting each such state into progress and no-progress states.
00565      */
00566 
00567     /* first, make a list of the states */
00568     slist = NULL;
00569     for (a = pre->outs; a != NULL; a = a->outchain)
00570     {
00571         s = a->to;
00572         for (b = s->ins; b != NULL; b = b->inchain)
00573             if (b->from != pre)
00574                 break;
00575         if (b != NULL && s->tmp == NULL)
00576         {
00577             /*
00578              * Must be split if not already in the list (fixes bugs 505048,
00579              * 230589, 840258, 504785).
00580              */
00581             s->tmp = slist;
00582             slist = s;
00583         }
00584     }
00585 
00586     /* do the splits */
00587     for (s = slist; s != NULL; s = s2)
00588     {
00589         s2 = newstate(nfa);
00590         copyouts(nfa, s, s2, 1);
00591         for (a = s->ins; a != NULL; a = b)
00592         {
00593             b = a->inchain;
00594             if (a->from != pre)
00595             {
00596                 cparc(nfa, a, a->from, s2);
00597                 freearc(nfa, a);
00598             }
00599         }
00600         s2 = s->tmp;
00601         s->tmp = NULL;          /* clean up while we're at it */
00602     }
00603 }
00604 
00605 /*
00606  * parse - parse an RE
00607  *
00608  * This is actually just the top level, which parses a bunch of branches
00609  * tied together with '|'.  They appear in the tree as the left children
00610  * of a chain of '|' subres.
00611  */
00612 static struct subre *
00613 parse(struct vars * v,
00614       int stopper,              /* EOS or ')' */
00615       int type,                 /* LACON (lookahead subRE) or PLAIN */
00616       struct state * init,      /* initial state */
00617       struct state * final)     /* final state */
00618 {
00619     struct state *left;         /* scaffolding for branch */
00620     struct state *right;
00621     struct subre *branches;     /* top level */
00622     struct subre *branch;       /* current branch */
00623     struct subre *t;            /* temporary */
00624     int         firstbranch;    /* is this the first branch? */
00625 
00626     assert(stopper == ')' || stopper == EOS);
00627 
00628     branches = subre(v, '|', LONGER, init, final);
00629     NOERRN();
00630     branch = branches;
00631     firstbranch = 1;
00632     do
00633     {                           /* a branch */
00634         if (!firstbranch)
00635         {
00636             /* need a place to hang it */
00637             branch->right = subre(v, '|', LONGER, init, final);
00638             NOERRN();
00639             branch = branch->right;
00640         }
00641         firstbranch = 0;
00642         left = newstate(v->nfa);
00643         right = newstate(v->nfa);
00644         NOERRN();
00645         EMPTYARC(init, left);
00646         EMPTYARC(right, final);
00647         NOERRN();
00648         branch->left = parsebranch(v, stopper, type, left, right, 0);
00649         NOERRN();
00650         branch->flags |= UP(branch->flags | branch->left->flags);
00651         if ((branch->flags & ~branches->flags) != 0)    /* new flags */
00652             for (t = branches; t != branch; t = t->right)
00653                 t->flags |= branch->flags;
00654     } while (EAT('|'));
00655     assert(SEE(stopper) || SEE(EOS));
00656 
00657     if (!SEE(stopper))
00658     {
00659         assert(stopper == ')' && SEE(EOS));
00660         ERR(REG_EPAREN);
00661     }
00662 
00663     /* optimize out simple cases */
00664     if (branch == branches)
00665     {                           /* only one branch */
00666         assert(branch->right == NULL);
00667         t = branch->left;
00668         branch->left = NULL;
00669         freesubre(v, branches);
00670         branches = t;
00671     }
00672     else if (!MESSY(branches->flags))
00673     {                           /* no interesting innards */
00674         freesubre(v, branches->left);
00675         branches->left = NULL;
00676         freesubre(v, branches->right);
00677         branches->right = NULL;
00678         branches->op = '=';
00679     }
00680 
00681     return branches;
00682 }
00683 
00684 /*
00685  * parsebranch - parse one branch of an RE
00686  *
00687  * This mostly manages concatenation, working closely with parseqatom().
00688  * Concatenated things are bundled up as much as possible, with separate
00689  * ',' nodes introduced only when necessary due to substructure.
00690  */
00691 static struct subre *
00692 parsebranch(struct vars * v,
00693             int stopper,        /* EOS or ')' */
00694             int type,           /* LACON (lookahead subRE) or PLAIN */
00695             struct state * left,    /* leftmost state */
00696             struct state * right,       /* rightmost state */
00697             int partial)        /* is this only part of a branch? */
00698 {
00699     struct state *lp;           /* left end of current construct */
00700     int         seencontent;    /* is there anything in this branch yet? */
00701     struct subre *t;
00702 
00703     lp = left;
00704     seencontent = 0;
00705     t = subre(v, '=', 0, left, right);  /* op '=' is tentative */
00706     NOERRN();
00707     while (!SEE('|') && !SEE(stopper) && !SEE(EOS))
00708     {
00709         if (seencontent)
00710         {                       /* implicit concat operator */
00711             lp = newstate(v->nfa);
00712             NOERRN();
00713             moveins(v->nfa, right, lp);
00714         }
00715         seencontent = 1;
00716 
00717         /* NB, recursion in parseqatom() may swallow rest of branch */
00718         parseqatom(v, stopper, type, lp, right, t);
00719         NOERRN();
00720     }
00721 
00722     if (!seencontent)
00723     {                           /* empty branch */
00724         if (!partial)
00725             NOTE(REG_UUNSPEC);
00726         assert(lp == left);
00727         EMPTYARC(left, right);
00728     }
00729 
00730     return t;
00731 }
00732 
00733 /*
00734  * parseqatom - parse one quantified atom or constraint of an RE
00735  *
00736  * The bookkeeping near the end cooperates very closely with parsebranch();
00737  * in particular, it contains a recursion that can involve parsing the rest
00738  * of the branch, making this function's name somewhat inaccurate.
00739  */
00740 static void
00741 parseqatom(struct vars * v,
00742            int stopper,         /* EOS or ')' */
00743            int type,            /* LACON (lookahead subRE) or PLAIN */
00744            struct state * lp,   /* left state to hang it on */
00745            struct state * rp,   /* right state to hang it on */
00746            struct subre * top)  /* subtree top */
00747 {
00748     struct state *s;            /* temporaries for new states */
00749     struct state *s2;
00750 
00751 #define  ARCV(t, val)    newarc(v->nfa, t, val, lp, rp)
00752     int         m,
00753                 n;
00754     struct subre *atom;         /* atom's subtree */
00755     struct subre *t;
00756     int         cap;            /* capturing parens? */
00757     int         pos;            /* positive lookahead? */
00758     int         subno;          /* capturing-parens or backref number */
00759     int         atomtype;
00760     int         qprefer;        /* quantifier short/long preference */
00761     int         f;
00762     struct subre **atomp;       /* where the pointer to atom is */
00763 
00764     /* initial bookkeeping */
00765     atom = NULL;
00766     assert(lp->nouts == 0);     /* must string new code */
00767     assert(rp->nins == 0);      /* between lp and rp */
00768     subno = 0;                  /* just to shut lint up */
00769 
00770     /* an atom or constraint... */
00771     atomtype = v->nexttype;
00772     switch (atomtype)
00773     {
00774             /* first, constraints, which end by returning */
00775         case '^':
00776             ARCV('^', 1);
00777             if (v->cflags & REG_NLANCH)
00778                 ARCV(BEHIND, v->nlcolor);
00779             NEXT();
00780             return;
00781             break;
00782         case '$':
00783             ARCV('$', 1);
00784             if (v->cflags & REG_NLANCH)
00785                 ARCV(AHEAD, v->nlcolor);
00786             NEXT();
00787             return;
00788             break;
00789         case SBEGIN:
00790             ARCV('^', 1);       /* BOL */
00791             ARCV('^', 0);       /* or BOS */
00792             NEXT();
00793             return;
00794             break;
00795         case SEND:
00796             ARCV('$', 1);       /* EOL */
00797             ARCV('$', 0);       /* or EOS */
00798             NEXT();
00799             return;
00800             break;
00801         case '<':
00802             wordchrs(v);        /* does NEXT() */
00803             s = newstate(v->nfa);
00804             NOERR();
00805             nonword(v, BEHIND, lp, s);
00806             word(v, AHEAD, s, rp);
00807             return;
00808             break;
00809         case '>':
00810             wordchrs(v);        /* does NEXT() */
00811             s = newstate(v->nfa);
00812             NOERR();
00813             word(v, BEHIND, lp, s);
00814             nonword(v, AHEAD, s, rp);
00815             return;
00816             break;
00817         case WBDRY:
00818             wordchrs(v);        /* does NEXT() */
00819             s = newstate(v->nfa);
00820             NOERR();
00821             nonword(v, BEHIND, lp, s);
00822             word(v, AHEAD, s, rp);
00823             s = newstate(v->nfa);
00824             NOERR();
00825             word(v, BEHIND, lp, s);
00826             nonword(v, AHEAD, s, rp);
00827             return;
00828             break;
00829         case NWBDRY:
00830             wordchrs(v);        /* does NEXT() */
00831             s = newstate(v->nfa);
00832             NOERR();
00833             word(v, BEHIND, lp, s);
00834             word(v, AHEAD, s, rp);
00835             s = newstate(v->nfa);
00836             NOERR();
00837             nonword(v, BEHIND, lp, s);
00838             nonword(v, AHEAD, s, rp);
00839             return;
00840             break;
00841         case LACON:             /* lookahead constraint */
00842             pos = v->nextvalue;
00843             NEXT();
00844             s = newstate(v->nfa);
00845             s2 = newstate(v->nfa);
00846             NOERR();
00847             t = parse(v, ')', LACON, s, s2);
00848             freesubre(v, t);    /* internal structure irrelevant */
00849             assert(SEE(')') || ISERR());
00850             NEXT();
00851             n = newlacon(v, s, s2, pos);
00852             NOERR();
00853             ARCV(LACON, n);
00854             return;
00855             break;
00856             /* then errors, to get them out of the way */
00857         case '*':
00858         case '+':
00859         case '?':
00860         case '{':
00861             ERR(REG_BADRPT);
00862             return;
00863             break;
00864         default:
00865             ERR(REG_ASSERT);
00866             return;
00867             break;
00868             /* then plain characters, and minor variants on that theme */
00869         case ')':               /* unbalanced paren */
00870             if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
00871             {
00872                 ERR(REG_EPAREN);
00873                 return;
00874             }
00875             /* legal in EREs due to specification botch */
00876             NOTE(REG_UPBOTCH);
00877             /* fallthrough into case PLAIN */
00878         case PLAIN:
00879             onechr(v, v->nextvalue, lp, rp);
00880             okcolors(v->nfa, v->cm);
00881             NOERR();
00882             NEXT();
00883             break;
00884         case '[':
00885             if (v->nextvalue == 1)
00886                 bracket(v, lp, rp);
00887             else
00888                 cbracket(v, lp, rp);
00889             assert(SEE(']') || ISERR());
00890             NEXT();
00891             break;
00892         case '.':
00893             rainbow(v->nfa, v->cm, PLAIN,
00894                     (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
00895                     lp, rp);
00896             NEXT();
00897             break;
00898             /* and finally the ugly stuff */
00899         case '(':               /* value flags as capturing or non */
00900             cap = (type == LACON) ? 0 : v->nextvalue;
00901             if (cap)
00902             {
00903                 v->nsubexp++;
00904                 subno = v->nsubexp;
00905                 if ((size_t) subno >= v->nsubs)
00906                     moresubs(v, subno);
00907                 assert((size_t) subno < v->nsubs);
00908             }
00909             else
00910                 atomtype = PLAIN;       /* something that's not '(' */
00911             NEXT();
00912             /* need new endpoints because tree will contain pointers */
00913             s = newstate(v->nfa);
00914             s2 = newstate(v->nfa);
00915             NOERR();
00916             EMPTYARC(lp, s);
00917             EMPTYARC(s2, rp);
00918             NOERR();
00919             atom = parse(v, ')', PLAIN, s, s2);
00920             assert(SEE(')') || ISERR());
00921             NEXT();
00922             NOERR();
00923             if (cap)
00924             {
00925                 v->subs[subno] = atom;
00926                 t = subre(v, '(', atom->flags | CAP, lp, rp);
00927                 NOERR();
00928                 t->subno = subno;
00929                 t->left = atom;
00930                 atom = t;
00931             }
00932             /* postpone everything else pending possible {0} */
00933             break;
00934         case BACKREF:           /* the Feature From The Black Lagoon */
00935             INSIST(type != LACON, REG_ESUBREG);
00936             INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
00937             INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
00938             NOERR();
00939             assert(v->nextvalue > 0);
00940             atom = subre(v, 'b', BACKR, lp, rp);
00941             subno = v->nextvalue;
00942             atom->subno = subno;
00943             EMPTYARC(lp, rp);   /* temporarily, so there's something */
00944             NEXT();
00945             break;
00946     }
00947 
00948     /* ...and an atom may be followed by a quantifier */
00949     switch (v->nexttype)
00950     {
00951         case '*':
00952             m = 0;
00953             n = INFINITY;
00954             qprefer = (v->nextvalue) ? LONGER : SHORTER;
00955             NEXT();
00956             break;
00957         case '+':
00958             m = 1;
00959             n = INFINITY;
00960             qprefer = (v->nextvalue) ? LONGER : SHORTER;
00961             NEXT();
00962             break;
00963         case '?':
00964             m = 0;
00965             n = 1;
00966             qprefer = (v->nextvalue) ? LONGER : SHORTER;
00967             NEXT();
00968             break;
00969         case '{':
00970             NEXT();
00971             m = scannum(v);
00972             if (EAT(','))
00973             {
00974                 if (SEE(DIGIT))
00975                     n = scannum(v);
00976                 else
00977                     n = INFINITY;
00978                 if (m > n)
00979                 {
00980                     ERR(REG_BADBR);
00981                     return;
00982                 }
00983                 /* {m,n} exercises preference, even if it's {m,m} */
00984                 qprefer = (v->nextvalue) ? LONGER : SHORTER;
00985             }
00986             else
00987             {
00988                 n = m;
00989                 /* {m} passes operand's preference through */
00990                 qprefer = 0;
00991             }
00992             if (!SEE('}'))
00993             {                   /* catches errors too */
00994                 ERR(REG_BADBR);
00995                 return;
00996             }
00997             NEXT();
00998             break;
00999         default:                /* no quantifier */
01000             m = n = 1;
01001             qprefer = 0;
01002             break;
01003     }
01004 
01005     /* annoying special case:  {0} or {0,0} cancels everything */
01006     if (m == 0 && n == 0)
01007     {
01008         if (atom != NULL)
01009             freesubre(v, atom);
01010         if (atomtype == '(')
01011             v->subs[subno] = NULL;
01012         delsub(v->nfa, lp, rp);
01013         EMPTYARC(lp, rp);
01014         return;
01015     }
01016 
01017     /* if not a messy case, avoid hard part */
01018     assert(!MESSY(top->flags));
01019     f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
01020     if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))
01021     {
01022         if (!(m == 1 && n == 1))
01023             repeat(v, lp, rp, m, n);
01024         if (atom != NULL)
01025             freesubre(v, atom);
01026         top->flags = f;
01027         return;
01028     }
01029 
01030     /*
01031      * hard part:  something messy
01032      *
01033      * That is, capturing parens, back reference, short/long clash, or an atom
01034      * with substructure containing one of those.
01035      */
01036 
01037     /* now we'll need a subre for the contents even if they're boring */
01038     if (atom == NULL)
01039     {
01040         atom = subre(v, '=', 0, lp, rp);
01041         NOERR();
01042     }
01043 
01044     /*----------
01045      * Prepare a general-purpose state skeleton.
01046      *
01047      * In the no-backrefs case, we want this:
01048      *
01049      * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
01050      *
01051      * where prefix is some repetitions of atom.  In the general case we need
01052      *
01053      * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
01054      *
01055      * where the iterator wraps around [begin] ---atom---> [end]
01056      *
01057      * We make the s state here for both cases; s2 is made below if needed
01058      *----------
01059      */
01060     s = newstate(v->nfa);       /* first, new endpoints for the atom */
01061     s2 = newstate(v->nfa);
01062     NOERR();
01063     moveouts(v->nfa, lp, s);
01064     moveins(v->nfa, rp, s2);
01065     NOERR();
01066     atom->begin = s;
01067     atom->end = s2;
01068     s = newstate(v->nfa);       /* set up starting state */
01069     NOERR();
01070     EMPTYARC(lp, s);
01071     NOERR();
01072 
01073     /* break remaining subRE into x{...} and what follows */
01074     t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
01075     t->left = atom;
01076     atomp = &t->left;
01077 
01078     /* here we should recurse... but we must postpone that to the end */
01079 
01080     /* split top into prefix and remaining */
01081     assert(top->op == '=' && top->left == NULL && top->right == NULL);
01082     top->left = subre(v, '=', top->flags, top->begin, lp);
01083     top->op = '.';
01084     top->right = t;
01085 
01086     /* if it's a backref, now is the time to replicate the subNFA */
01087     if (atomtype == BACKREF)
01088     {
01089         assert(atom->begin->nouts == 1);        /* just the EMPTY */
01090         delsub(v->nfa, atom->begin, atom->end);
01091         assert(v->subs[subno] != NULL);
01092 
01093         /*
01094          * And here's why the recursion got postponed: it must wait until the
01095          * skeleton is filled in, because it may hit a backref that wants to
01096          * copy the filled-in skeleton.
01097          */
01098         dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
01099                atom->begin, atom->end);
01100         NOERR();
01101     }
01102 
01103     /*
01104      * It's quantifier time.  If the atom is just a backref, we'll let it deal
01105      * with quantifiers internally.
01106      */
01107     if (atomtype == BACKREF)
01108     {
01109         /* special case:  backrefs have internal quantifiers */
01110         EMPTYARC(s, atom->begin);       /* empty prefix */
01111         /* just stuff everything into atom */
01112         repeat(v, atom->begin, atom->end, m, n);
01113         atom->min = (short) m;
01114         atom->max = (short) n;
01115         atom->flags |= COMBINE(qprefer, atom->flags);
01116         /* rest of branch can be strung starting from atom->end */
01117         s2 = atom->end;
01118     }
01119     else if (m == 1 && n == 1)
01120     {
01121         /* no/vacuous quantifier:  done */
01122         EMPTYARC(s, atom->begin);       /* empty prefix */
01123         /* rest of branch can be strung starting from atom->end */
01124         s2 = atom->end;
01125     }
01126     else if (m > 0 && !(atom->flags & BACKR))
01127     {
01128         /*
01129          * If there's no backrefs involved, we can turn x{m,n} into
01130          * x{m-1,n-1}x, with capturing parens in only the second x.  This is
01131          * valid because we only care about capturing matches from the final
01132          * iteration of the quantifier.  It's a win because we can implement
01133          * the backref-free left side as a plain DFA node, since we don't
01134          * really care where its submatches are.
01135          */
01136         dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
01137         assert(m >= 1 && m != INFINITY && n >= 1);
01138         repeat(v, s, atom->begin, m - 1, (n == INFINITY) ? n : n - 1);
01139         f = COMBINE(qprefer, atom->flags);
01140         t = subre(v, '.', f, s, atom->end);     /* prefix and atom */
01141         NOERR();
01142         t->left = subre(v, '=', PREF(f), s, atom->begin);
01143         NOERR();
01144         t->right = atom;
01145         *atomp = t;
01146         /* rest of branch can be strung starting from atom->end */
01147         s2 = atom->end;
01148     }
01149     else
01150     {
01151         /* general case: need an iteration node */
01152         s2 = newstate(v->nfa);
01153         NOERR();
01154         moveouts(v->nfa, atom->end, s2);
01155         NOERR();
01156         dupnfa(v->nfa, atom->begin, atom->end, s, s2);
01157         repeat(v, s, s2, m, n);
01158         f = COMBINE(qprefer, atom->flags);
01159         t = subre(v, '*', f, s, s2);
01160         NOERR();
01161         t->min = (short) m;
01162         t->max = (short) n;
01163         t->left = atom;
01164         *atomp = t;
01165         /* rest of branch is to be strung from iteration's end state */
01166     }
01167 
01168     /* and finally, look after that postponed recursion */
01169     t = top->right;
01170     if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
01171         t->right = parsebranch(v, stopper, type, s2, rp, 1);
01172     else
01173     {
01174         EMPTYARC(s2, rp);
01175         t->right = subre(v, '=', 0, s2, rp);
01176     }
01177     NOERR();
01178     assert(SEE('|') || SEE(stopper) || SEE(EOS));
01179     t->flags |= COMBINE(t->flags, t->right->flags);
01180     top->flags |= COMBINE(top->flags, t->flags);
01181 }
01182 
01183 /*
01184  * nonword - generate arcs for non-word-character ahead or behind
01185  */
01186 static void
01187 nonword(struct vars * v,
01188         int dir,                /* AHEAD or BEHIND */
01189         struct state * lp,
01190         struct state * rp)
01191 {
01192     int         anchor = (dir == AHEAD) ? '$' : '^';
01193 
01194     assert(dir == AHEAD || dir == BEHIND);
01195     newarc(v->nfa, anchor, 1, lp, rp);
01196     newarc(v->nfa, anchor, 0, lp, rp);
01197     colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
01198     /* (no need for special attention to \n) */
01199 }
01200 
01201 /*
01202  * word - generate arcs for word character ahead or behind
01203  */
01204 static void
01205 word(struct vars * v,
01206      int dir,                   /* AHEAD or BEHIND */
01207      struct state * lp,
01208      struct state * rp)
01209 {
01210     assert(dir == AHEAD || dir == BEHIND);
01211     cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
01212     /* (no need for special attention to \n) */
01213 }
01214 
01215 /*
01216  * scannum - scan a number
01217  */
01218 static int                      /* value, <= DUPMAX */
01219 scannum(struct vars * v)
01220 {
01221     int         n = 0;
01222 
01223     while (SEE(DIGIT) && n < DUPMAX)
01224     {
01225         n = n * 10 + v->nextvalue;
01226         NEXT();
01227     }
01228     if (SEE(DIGIT) || n > DUPMAX)
01229     {
01230         ERR(REG_BADBR);
01231         return 0;
01232     }
01233     return n;
01234 }
01235 
01236 /*
01237  * repeat - replicate subNFA for quantifiers
01238  *
01239  * The sub-NFA strung from lp to rp is modified to represent m to n
01240  * repetitions of its initial contents.
01241  *
01242  * The duplication sequences used here are chosen carefully so that any
01243  * pointers starting out pointing into the subexpression end up pointing into
01244  * the last occurrence.  (Note that it may not be strung between the same
01245  * left and right end states, however!)  This used to be important for the
01246  * subRE tree, although the important bits are now handled by the in-line
01247  * code in parse(), and when this is called, it doesn't matter any more.
01248  */
01249 static void
01250 repeat(struct vars * v,
01251        struct state * lp,
01252        struct state * rp,
01253        int m,
01254        int n)
01255 {
01256 #define  SOME    2
01257 #define  INF     3
01258 #define  PAIR(x, y)  ((x)*4 + (y))
01259 #define  REDUCE(x)   ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
01260     const int   rm = REDUCE(m);
01261     const int   rn = REDUCE(n);
01262     struct state *s;
01263     struct state *s2;
01264 
01265     switch (PAIR(rm, rn))
01266     {
01267         case PAIR(0, 0):        /* empty string */
01268             delsub(v->nfa, lp, rp);
01269             EMPTYARC(lp, rp);
01270             break;
01271         case PAIR(0, 1):        /* do as x| */
01272             EMPTYARC(lp, rp);
01273             break;
01274         case PAIR(0, SOME):     /* do as x{1,n}| */
01275             repeat(v, lp, rp, 1, n);
01276             NOERR();
01277             EMPTYARC(lp, rp);
01278             break;
01279         case PAIR(0, INF):      /* loop x around */
01280             s = newstate(v->nfa);
01281             NOERR();
01282             moveouts(v->nfa, lp, s);
01283             moveins(v->nfa, rp, s);
01284             EMPTYARC(lp, s);
01285             EMPTYARC(s, rp);
01286             break;
01287         case PAIR(1, 1):        /* no action required */
01288             break;
01289         case PAIR(1, SOME):     /* do as x{0,n-1}x = (x{1,n-1}|)x */
01290             s = newstate(v->nfa);
01291             NOERR();
01292             moveouts(v->nfa, lp, s);
01293             dupnfa(v->nfa, s, rp, lp, s);
01294             NOERR();
01295             repeat(v, lp, s, 1, n - 1);
01296             NOERR();
01297             EMPTYARC(lp, s);
01298             break;
01299         case PAIR(1, INF):      /* add loopback arc */
01300             s = newstate(v->nfa);
01301             s2 = newstate(v->nfa);
01302             NOERR();
01303             moveouts(v->nfa, lp, s);
01304             moveins(v->nfa, rp, s2);
01305             EMPTYARC(lp, s);
01306             EMPTYARC(s2, rp);
01307             EMPTYARC(s2, s);
01308             break;
01309         case PAIR(SOME, SOME):  /* do as x{m-1,n-1}x */
01310             s = newstate(v->nfa);
01311             NOERR();
01312             moveouts(v->nfa, lp, s);
01313             dupnfa(v->nfa, s, rp, lp, s);
01314             NOERR();
01315             repeat(v, lp, s, m - 1, n - 1);
01316             break;
01317         case PAIR(SOME, INF):   /* do as x{m-1,}x */
01318             s = newstate(v->nfa);
01319             NOERR();
01320             moveouts(v->nfa, lp, s);
01321             dupnfa(v->nfa, s, rp, lp, s);
01322             NOERR();
01323             repeat(v, lp, s, m - 1, n);
01324             break;
01325         default:
01326             ERR(REG_ASSERT);
01327             break;
01328     }
01329 }
01330 
01331 /*
01332  * bracket - handle non-complemented bracket expression
01333  * Also called from cbracket for complemented bracket expressions.
01334  */
01335 static void
01336 bracket(struct vars * v,
01337         struct state * lp,
01338         struct state * rp)
01339 {
01340     assert(SEE('['));
01341     NEXT();
01342     while (!SEE(']') && !SEE(EOS))
01343         brackpart(v, lp, rp);
01344     assert(SEE(']') || ISERR());
01345     okcolors(v->nfa, v->cm);
01346 }
01347 
01348 /*
01349  * cbracket - handle complemented bracket expression
01350  * We do it by calling bracket() with dummy endpoints, and then complementing
01351  * the result.  The alternative would be to invoke rainbow(), and then delete
01352  * arcs as the b.e. is seen... but that gets messy.
01353  */
01354 static void
01355 cbracket(struct vars * v,
01356          struct state * lp,
01357          struct state * rp)
01358 {
01359     struct state *left = newstate(v->nfa);
01360     struct state *right = newstate(v->nfa);
01361 
01362     NOERR();
01363     bracket(v, left, right);
01364     if (v->cflags & REG_NLSTOP)
01365         newarc(v->nfa, PLAIN, v->nlcolor, left, right);
01366     NOERR();
01367 
01368     assert(lp->nouts == 0);     /* all outarcs will be ours */
01369 
01370     /*
01371      * Easy part of complementing, and all there is to do since the MCCE code
01372      * was removed.
01373      */
01374     colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
01375     NOERR();
01376     dropstate(v->nfa, left);
01377     assert(right->nins == 0);
01378     freestate(v->nfa, right);
01379 }
01380 
01381 /*
01382  * brackpart - handle one item (or range) within a bracket expression
01383  */
01384 static void
01385 brackpart(struct vars * v,
01386           struct state * lp,
01387           struct state * rp)
01388 {
01389     celt        startc;
01390     celt        endc;
01391     struct cvec *cv;
01392     const chr  *startp;
01393     const chr  *endp;
01394     chr         c[1];
01395 
01396     /* parse something, get rid of special cases, take shortcuts */
01397     switch (v->nexttype)
01398     {
01399         case RANGE:             /* a-b-c or other botch */
01400             ERR(REG_ERANGE);
01401             return;
01402             break;
01403         case PLAIN:
01404             c[0] = v->nextvalue;
01405             NEXT();
01406             /* shortcut for ordinary chr (not range) */
01407             if (!SEE(RANGE))
01408             {
01409                 onechr(v, c[0], lp, rp);
01410                 return;
01411             }
01412             startc = element(v, c, c + 1);
01413             NOERR();
01414             break;
01415         case COLLEL:
01416             startp = v->now;
01417             endp = scanplain(v);
01418             INSIST(startp < endp, REG_ECOLLATE);
01419             NOERR();
01420             startc = element(v, startp, endp);
01421             NOERR();
01422             break;
01423         case ECLASS:
01424             startp = v->now;
01425             endp = scanplain(v);
01426             INSIST(startp < endp, REG_ECOLLATE);
01427             NOERR();
01428             startc = element(v, startp, endp);
01429             NOERR();
01430             cv = eclass(v, startc, (v->cflags & REG_ICASE));
01431             NOERR();
01432             dovec(v, cv, lp, rp);
01433             return;
01434             break;
01435         case CCLASS:
01436             startp = v->now;
01437             endp = scanplain(v);
01438             INSIST(startp < endp, REG_ECTYPE);
01439             NOERR();
01440             cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
01441             NOERR();
01442             dovec(v, cv, lp, rp);
01443             return;
01444             break;
01445         default:
01446             ERR(REG_ASSERT);
01447             return;
01448             break;
01449     }
01450 
01451     if (SEE(RANGE))
01452     {
01453         NEXT();
01454         switch (v->nexttype)
01455         {
01456             case PLAIN:
01457             case RANGE:
01458                 c[0] = v->nextvalue;
01459                 NEXT();
01460                 endc = element(v, c, c + 1);
01461                 NOERR();
01462                 break;
01463             case COLLEL:
01464                 startp = v->now;
01465                 endp = scanplain(v);
01466                 INSIST(startp < endp, REG_ECOLLATE);
01467                 NOERR();
01468                 endc = element(v, startp, endp);
01469                 NOERR();
01470                 break;
01471             default:
01472                 ERR(REG_ERANGE);
01473                 return;
01474                 break;
01475         }
01476     }
01477     else
01478         endc = startc;
01479 
01480     /*
01481      * Ranges are unportable.  Actually, standard C does guarantee that digits
01482      * are contiguous, but making that an exception is just too complicated.
01483      */
01484     if (startc != endc)
01485         NOTE(REG_UUNPORT);
01486     cv = range(v, startc, endc, (v->cflags & REG_ICASE));
01487     NOERR();
01488     dovec(v, cv, lp, rp);
01489 }
01490 
01491 /*
01492  * scanplain - scan PLAIN contents of [. etc.
01493  *
01494  * Certain bits of trickery in lex.c know that this code does not try
01495  * to look past the final bracket of the [. etc.
01496  */
01497 static const chr *              /* just after end of sequence */
01498 scanplain(struct vars * v)
01499 {
01500     const chr  *endp;
01501 
01502     assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
01503     NEXT();
01504 
01505     endp = v->now;
01506     while (SEE(PLAIN))
01507     {
01508         endp = v->now;
01509         NEXT();
01510     }
01511 
01512     assert(SEE(END) || ISERR());
01513     NEXT();
01514 
01515     return endp;
01516 }
01517 
01518 /*
01519  * onechr - fill in arcs for a plain character, and possible case complements
01520  * This is mostly a shortcut for efficient handling of the common case.
01521  */
01522 static void
01523 onechr(struct vars * v,
01524        chr c,
01525        struct state * lp,
01526        struct state * rp)
01527 {
01528     if (!(v->cflags & REG_ICASE))
01529     {
01530         newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
01531         return;
01532     }
01533 
01534     /* rats, need general case anyway... */
01535     dovec(v, allcases(v, c), lp, rp);
01536 }
01537 
01538 /*
01539  * dovec - fill in arcs for each element of a cvec
01540  */
01541 static void
01542 dovec(struct vars * v,
01543       struct cvec * cv,
01544       struct state * lp,
01545       struct state * rp)
01546 {
01547     chr         ch,
01548                 from,
01549                 to;
01550     const chr  *p;
01551     int         i;
01552 
01553     /* ordinary characters */
01554     for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--)
01555     {
01556         ch = *p;
01557         newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
01558     }
01559 
01560     /* and the ranges */
01561     for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--)
01562     {
01563         from = *p;
01564         to = *(p + 1);
01565         if (from <= to)
01566             subrange(v, from, to, lp, rp);
01567     }
01568 }
01569 
01570 /*
01571  * wordchrs - set up word-chr list for word-boundary stuff, if needed
01572  *
01573  * The list is kept as a bunch of arcs between two dummy states; it's
01574  * disposed of by the unreachable-states sweep in NFA optimization.
01575  * Does NEXT().  Must not be called from any unusual lexical context.
01576  * This should be reconciled with the \w etc. handling in lex.c, and
01577  * should be cleaned up to reduce dependencies on input scanning.
01578  */
01579 static void
01580 wordchrs(struct vars * v)
01581 {
01582     struct state *left;
01583     struct state *right;
01584 
01585     if (v->wordchrs != NULL)
01586     {
01587         NEXT();                 /* for consistency */
01588         return;
01589     }
01590 
01591     left = newstate(v->nfa);
01592     right = newstate(v->nfa);
01593     NOERR();
01594     /* fine point:  implemented with [::], and lexer will set REG_ULOCALE */
01595     lexword(v);
01596     NEXT();
01597     assert(v->savenow != NULL && SEE('['));
01598     bracket(v, left, right);
01599     assert((v->savenow != NULL && SEE(']')) || ISERR());
01600     NEXT();
01601     NOERR();
01602     v->wordchrs = left;
01603 }
01604 
01605 /*
01606  * subre - allocate a subre
01607  */
01608 static struct subre *
01609 subre(struct vars * v,
01610       int op,
01611       int flags,
01612       struct state * begin,
01613       struct state * end)
01614 {
01615     struct subre *ret = v->treefree;
01616 
01617     if (ret != NULL)
01618         v->treefree = ret->left;
01619     else
01620     {
01621         ret = (struct subre *) MALLOC(sizeof(struct subre));
01622         if (ret == NULL)
01623         {
01624             ERR(REG_ESPACE);
01625             return NULL;
01626         }
01627         ret->chain = v->treechain;
01628         v->treechain = ret;
01629     }
01630 
01631     assert(strchr("=b|.*(", op) != NULL);
01632 
01633     ret->op = op;
01634     ret->flags = flags;
01635     ret->id = 0;                /* will be assigned later */
01636     ret->subno = 0;
01637     ret->min = ret->max = 1;
01638     ret->left = NULL;
01639     ret->right = NULL;
01640     ret->begin = begin;
01641     ret->end = end;
01642     ZAPCNFA(ret->cnfa);
01643 
01644     return ret;
01645 }
01646 
01647 /*
01648  * freesubre - free a subRE subtree
01649  */
01650 static void
01651 freesubre(struct vars * v,      /* might be NULL */
01652           struct subre * sr)
01653 {
01654     if (sr == NULL)
01655         return;
01656 
01657     if (sr->left != NULL)
01658         freesubre(v, sr->left);
01659     if (sr->right != NULL)
01660         freesubre(v, sr->right);
01661 
01662     freesrnode(v, sr);
01663 }
01664 
01665 /*
01666  * freesrnode - free one node in a subRE subtree
01667  */
01668 static void
01669 freesrnode(struct vars * v,     /* might be NULL */
01670            struct subre * sr)
01671 {
01672     if (sr == NULL)
01673         return;
01674 
01675     if (!NULLCNFA(sr->cnfa))
01676         freecnfa(&sr->cnfa);
01677     sr->flags = 0;
01678 
01679     if (v != NULL)
01680     {
01681         sr->left = v->treefree;
01682         v->treefree = sr;
01683     }
01684     else
01685         FREE(sr);
01686 }
01687 
01688 /*
01689  * optst - optimize a subRE subtree
01690  */
01691 static void
01692 optst(struct vars * v,
01693       struct subre * t)
01694 {
01695     /*
01696      * DGP (2007-11-13): I assume it was the programmer's intent to eventually
01697      * come back and add code to optimize subRE trees, but the routine coded
01698      * just spends effort traversing the tree and doing nothing. We can do
01699      * nothing with less effort.
01700      */
01701     return;
01702 }
01703 
01704 /*
01705  * numst - number tree nodes (assigning "id" indexes)
01706  */
01707 static int                      /* next number */
01708 numst(struct subre * t,
01709       int start)                /* starting point for subtree numbers */
01710 {
01711     int         i;
01712 
01713     assert(t != NULL);
01714 
01715     i = start;
01716     t->id = (short) i++;
01717     if (t->left != NULL)
01718         i = numst(t->left, i);
01719     if (t->right != NULL)
01720         i = numst(t->right, i);
01721     return i;
01722 }
01723 
01724 /*
01725  * markst - mark tree nodes as INUSE
01726  */
01727 static void
01728 markst(struct subre * t)
01729 {
01730     assert(t != NULL);
01731 
01732     t->flags |= INUSE;
01733     if (t->left != NULL)
01734         markst(t->left);
01735     if (t->right != NULL)
01736         markst(t->right);
01737 }
01738 
01739 /*
01740  * cleanst - free any tree nodes not marked INUSE
01741  */
01742 static void
01743 cleanst(struct vars * v)
01744 {
01745     struct subre *t;
01746     struct subre *next;
01747 
01748     for (t = v->treechain; t != NULL; t = next)
01749     {
01750         next = t->chain;
01751         if (!(t->flags & INUSE))
01752             FREE(t);
01753     }
01754     v->treechain = NULL;
01755     v->treefree = NULL;         /* just on general principles */
01756 }
01757 
01758 /*
01759  * nfatree - turn a subRE subtree into a tree of compacted NFAs
01760  */
01761 static long                     /* optimize results from top node */
01762 nfatree(struct vars * v,
01763         struct subre * t,
01764         FILE *f)                /* for debug output */
01765 {
01766     assert(t != NULL && t->begin != NULL);
01767 
01768     if (t->left != NULL)
01769         (DISCARD) nfatree(v, t->left, f);
01770     if (t->right != NULL)
01771         (DISCARD) nfatree(v, t->right, f);
01772 
01773     return nfanode(v, t, f);
01774 }
01775 
01776 /*
01777  * nfanode - do one NFA for nfatree
01778  */
01779 static long                     /* optimize results */
01780 nfanode(struct vars * v,
01781         struct subre * t,
01782         FILE *f)                /* for debug output */
01783 {
01784     struct nfa *nfa;
01785     long        ret = 0;
01786 
01787     assert(t->begin != NULL);
01788 
01789 #ifdef REG_DEBUG
01790     if (f != NULL)
01791     {
01792         char        idbuf[50];
01793 
01794         fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
01795                 stid(t, idbuf, sizeof(idbuf)));
01796     }
01797 #endif
01798     nfa = newnfa(v, v->cm, v->nfa);
01799     NOERRZ();
01800     dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
01801     if (!ISERR())
01802     {
01803         specialcolors(nfa);
01804         ret = optimize(nfa, f);
01805     }
01806     if (!ISERR())
01807         compact(nfa, &t->cnfa);
01808 
01809     freenfa(nfa);
01810     return ret;
01811 }
01812 
01813 /*
01814  * newlacon - allocate a lookahead-constraint subRE
01815  */
01816 static int                      /* lacon number */
01817 newlacon(struct vars * v,
01818          struct state * begin,
01819          struct state * end,
01820          int pos)
01821 {
01822     int         n;
01823     struct subre *sub;
01824 
01825     if (v->nlacons == 0)
01826     {
01827         v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
01828         n = 1;                  /* skip 0th */
01829         v->nlacons = 2;
01830     }
01831     else
01832     {
01833         v->lacons = (struct subre *) REALLOC(v->lacons,
01834                                     (v->nlacons + 1) * sizeof(struct subre));
01835         n = v->nlacons++;
01836     }
01837     if (v->lacons == NULL)
01838     {
01839         ERR(REG_ESPACE);
01840         return 0;
01841     }
01842     sub = &v->lacons[n];
01843     sub->begin = begin;
01844     sub->end = end;
01845     sub->subno = pos;
01846     ZAPCNFA(sub->cnfa);
01847     return n;
01848 }
01849 
01850 /*
01851  * freelacons - free lookahead-constraint subRE vector
01852  */
01853 static void
01854 freelacons(struct subre * subs,
01855            int n)
01856 {
01857     struct subre *sub;
01858     int         i;
01859 
01860     assert(n > 0);
01861     for (sub = subs + 1, i = n - 1; i > 0; sub++, i--)  /* no 0th */
01862         if (!NULLCNFA(sub->cnfa))
01863             freecnfa(&sub->cnfa);
01864     FREE(subs);
01865 }
01866 
01867 /*
01868  * rfree - free a whole RE (insides of regfree)
01869  */
01870 static void
01871 rfree(regex_t *re)
01872 {
01873     struct guts *g;
01874 
01875     if (re == NULL || re->re_magic != REMAGIC)
01876         return;
01877 
01878     re->re_magic = 0;           /* invalidate RE */
01879     g = (struct guts *) re->re_guts;
01880     re->re_guts = NULL;
01881     re->re_fns = NULL;
01882     if (g != NULL)
01883     {
01884         g->magic = 0;
01885         freecm(&g->cmap);
01886         if (g->tree != NULL)
01887             freesubre((struct vars *) NULL, g->tree);
01888         if (g->lacons != NULL)
01889             freelacons(g->lacons, g->nlacons);
01890         if (!NULLCNFA(g->search))
01891             freecnfa(&g->search);
01892         FREE(g);
01893     }
01894 }
01895 
01896 #ifdef REG_DEBUG
01897 
01898 /*
01899  * dump - dump an RE in human-readable form
01900  */
01901 static void
01902 dump(regex_t *re,
01903      FILE *f)
01904 {
01905     struct guts *g;
01906     int         i;
01907 
01908     if (re->re_magic != REMAGIC)
01909         fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
01910                 REMAGIC);
01911     if (re->re_guts == NULL)
01912     {
01913         fprintf(f, "NULL guts!!!\n");
01914         return;
01915     }
01916     g = (struct guts *) re->re_guts;
01917     if (g->magic != GUTSMAGIC)
01918         fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
01919                 GUTSMAGIC);
01920 
01921     fprintf(f, "\n\n\n========= DUMP ==========\n");
01922     fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
01923             (int) re->re_nsub, re->re_info, re->re_csize, g->ntree);
01924 
01925     dumpcolors(&g->cmap, f);
01926     if (!NULLCNFA(g->search))
01927     {
01928         printf("\nsearch:\n");
01929         dumpcnfa(&g->search, f);
01930     }
01931     for (i = 1; i < g->nlacons; i++)
01932     {
01933         fprintf(f, "\nla%d (%s):\n", i,
01934                 (g->lacons[i].subno) ? "positive" : "negative");
01935         dumpcnfa(&g->lacons[i].cnfa, f);
01936     }
01937     fprintf(f, "\n");
01938     dumpst(g->tree, f, 0);
01939 }
01940 
01941 /*
01942  * dumpst - dump a subRE tree
01943  */
01944 static void
01945 dumpst(struct subre * t,
01946        FILE *f,
01947        int nfapresent)          /* is the original NFA still around? */
01948 {
01949     if (t == NULL)
01950         fprintf(f, "null tree\n");
01951     else
01952         stdump(t, f, nfapresent);
01953     fflush(f);
01954 }
01955 
01956 /*
01957  * stdump - recursive guts of dumpst
01958  */
01959 static void
01960 stdump(struct subre * t,
01961        FILE *f,
01962        int nfapresent)          /* is the original NFA still around? */
01963 {
01964     char        idbuf[50];
01965 
01966     fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
01967     if (t->flags & LONGER)
01968         fprintf(f, " longest");
01969     if (t->flags & SHORTER)
01970         fprintf(f, " shortest");
01971     if (t->flags & MIXED)
01972         fprintf(f, " hasmixed");
01973     if (t->flags & CAP)
01974         fprintf(f, " hascapture");
01975     if (t->flags & BACKR)
01976         fprintf(f, " hasbackref");
01977     if (!(t->flags & INUSE))
01978         fprintf(f, " UNUSED");
01979     if (t->subno != 0)
01980         fprintf(f, " (#%d)", t->subno);
01981     if (t->min != 1 || t->max != 1)
01982     {
01983         fprintf(f, " {%d,", t->min);
01984         if (t->max != INFINITY)
01985             fprintf(f, "%d", t->max);
01986         fprintf(f, "}");
01987     }
01988     if (nfapresent)
01989         fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
01990     if (t->left != NULL)
01991         fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
01992     if (t->right != NULL)
01993         fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
01994     if (!NULLCNFA(t->cnfa))
01995     {
01996         fprintf(f, "\n");
01997         dumpcnfa(&t->cnfa, f);
01998     }
01999     fprintf(f, "\n");
02000     if (t->left != NULL)
02001         stdump(t->left, f, nfapresent);
02002     if (t->right != NULL)
02003         stdump(t->right, f, nfapresent);
02004 }
02005 
02006 /*
02007  * stid - identify a subtree node for dumping
02008  */
02009 static const char *             /* points to buf or constant string */
02010 stid(struct subre * t,
02011      char *buf,
02012      size_t bufsize)
02013 {
02014     /* big enough for hex int or decimal t->id? */
02015     if (bufsize < sizeof(void *) * 2 + 3 || bufsize < sizeof(t->id) * 3 + 1)
02016         return "unable";
02017     if (t->id != 0)
02018         sprintf(buf, "%d", t->id);
02019     else
02020         sprintf(buf, "%p", t);
02021     return buf;
02022 }
02023 #endif   /* REG_DEBUG */
02024 
02025 
02026 #include "regc_lex.c"
02027 #include "regc_color.c"
02028 #include "regc_nfa.c"
02029 #include "regc_cvec.c"
02030 #include "regc_pg_locale.c"
02031 #include "regc_locale.c"
Header And Logo

regcomp.c