Header And Logo

PostgreSQL
| The world's most advanced open source database.

regguts.h

Go to the documentation of this file.
00001 /*
00002  * Internal interface definitions, etc., for the reg package
00003  *
00004  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
00005  *
00006  * Development of this software was funded, in part, by Cray Research Inc.,
00007  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
00008  * Corporation, none of whom are responsible for the results.  The author
00009  * thanks all of them.
00010  *
00011  * Redistribution and use in source and binary forms -- with or without
00012  * modification -- are permitted for any purpose, provided that
00013  * redistributions in source form retain this entire copyright notice and
00014  * indicate the origin and nature of any modifications.
00015  *
00016  * I'd appreciate being given credit for this package in the documentation
00017  * of software which uses it, but that is not a requirement.
00018  *
00019  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
00020  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
00021  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
00022  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00023  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00024  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00025  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00026  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00027  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00028  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  *
00030  * src/include/regex/regguts.h
00031  */
00032 
00033 
00034 
00035 /*
00036  * Environmental customization.  It should not (I hope) be necessary to
00037  * alter the file you are now reading -- regcustom.h should handle it all,
00038  * given care here and elsewhere.
00039  */
00040 #include "regcustom.h"
00041 
00042 
00043 
00044 /*
00045  * Things that regcustom.h might override.
00046  */
00047 
00048 /* assertions */
00049 #ifndef assert
00050 #ifndef REG_DEBUG
00051 #define  NDEBUG                 /* no assertions */
00052 #endif
00053 #include <assert.h>
00054 #endif
00055 
00056 /* voids */
00057 #ifndef DISCARD
00058 #define DISCARD void            /* for throwing values away */
00059 #endif
00060 #ifndef VS
00061 #define VS(x)   ((void *)(x))   /* cast something to generic ptr */
00062 #endif
00063 
00064 /* function-pointer declarator */
00065 #ifndef FUNCPTR
00066 #define FUNCPTR(name, args) (*(name)) args
00067 #endif
00068 
00069 /* memory allocation */
00070 #ifndef MALLOC
00071 #define MALLOC(n)   malloc(n)
00072 #endif
00073 #ifndef REALLOC
00074 #define REALLOC(p, n)   realloc(VS(p), n)
00075 #endif
00076 #ifndef FREE
00077 #define FREE(p)     free(VS(p))
00078 #endif
00079 
00080 /* want size of a char in bits, and max value in bounded quantifiers */
00081 #ifndef CHAR_BIT
00082 #include <limits.h>
00083 #endif
00084 #ifndef _POSIX2_RE_DUP_MAX
00085 #define _POSIX2_RE_DUP_MAX  255 /* normally from <limits.h> */
00086 #endif
00087 
00088 
00089 
00090 /*
00091  * misc
00092  */
00093 
00094 #define NOTREACHED  0
00095 #define xxx     1
00096 
00097 #define DUPMAX  _POSIX2_RE_DUP_MAX
00098 #define INFINITY    (DUPMAX+1)
00099 
00100 #define REMAGIC 0xfed7          /* magic number for main struct */
00101 
00102 
00103 
00104 /*
00105  * debugging facilities
00106  */
00107 #ifdef REG_DEBUG
00108 /* FDEBUG does finite-state tracing */
00109 #define FDEBUG(arglist) { if (v->eflags&REG_FTRACE) printf arglist; }
00110 /* MDEBUG does higher-level tracing */
00111 #define MDEBUG(arglist) { if (v->eflags&REG_MTRACE) printf arglist; }
00112 #else
00113 #define FDEBUG(arglist) {}
00114 #define MDEBUG(arglist) {}
00115 #endif
00116 
00117 
00118 
00119 /*
00120  * bitmap manipulation
00121  */
00122 #define UBITS   (CHAR_BIT * sizeof(unsigned))
00123 #define BSET(uv, sn)    ((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS))
00124 #define ISBSET(uv, sn)  ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
00125 
00126 
00127 
00128 /*
00129  * We dissect a chr into byts for colormap table indexing.  Here we define
00130  * a byt, which will be the same as a byte on most machines...  The exact
00131  * size of a byt is not critical, but about 8 bits is good, and extraction
00132  * of 8-bit chunks is sometimes especially fast.
00133  */
00134 #ifndef BYTBITS
00135 #define BYTBITS 8               /* bits in a byt */
00136 #endif
00137 #define BYTTAB  (1<<BYTBITS)    /* size of table with one entry per byt value */
00138 #define BYTMASK (BYTTAB-1)      /* bit mask for byt */
00139 #define NBYTS   ((CHRBITS+BYTBITS-1)/BYTBITS)
00140 /* the definition of GETCOLOR(), below, assumes NBYTS <= 4 */
00141 
00142 
00143 
00144 /*
00145  * As soon as possible, we map chrs into equivalence classes -- "colors" --
00146  * which are of much more manageable number.
00147  */
00148 typedef short color;            /* colors of characters */
00149 typedef int pcolor;             /* what color promotes to */
00150 
00151 #define MAX_COLOR   32767       /* max color (must fit in 'color' datatype) */
00152 #define COLORLESS   (-1)        /* impossible color */
00153 #define WHITE       0           /* default color, parent of all others */
00154 
00155 
00156 
00157 /*
00158  * A colormap is a tree -- more precisely, a DAG -- indexed at each level
00159  * by a byt of the chr, to map the chr to a color efficiently.  Because
00160  * lower sections of the tree can be shared, it can exploit the usual
00161  * sparseness of such a mapping table.  The tree is always NBYTS levels
00162  * deep (in the past it was shallower during construction but was "filled"
00163  * to full depth at the end of that); areas that are unaltered as yet point
00164  * to "fill blocks" which are entirely WHITE in color.
00165  */
00166 
00167 /* the tree itself */
00168 struct colors
00169 {
00170     color       ccolor[BYTTAB];
00171 };
00172 struct ptrs
00173 {
00174     union tree *pptr[BYTTAB];
00175 };
00176 union tree
00177 {
00178     struct colors colors;
00179     struct ptrs ptrs;
00180 };
00181 
00182 #define tcolor  colors.ccolor
00183 #define tptr    ptrs.pptr
00184 
00185 /*
00186  * Per-color data structure for the compile-time color machinery
00187  *
00188  * If "sub" is not NOSUB then it is the number of the color's current
00189  * subcolor, i.e. we are in process of dividing this color (character
00190  * equivalence class) into two colors.  See src/backend/regex/README for
00191  * discussion of subcolors.
00192  *
00193  * Currently-unused colors have the FREECOL bit set and are linked into a
00194  * freelist using their "sub" fields, but only if their color numbers are
00195  * less than colormap.max.  Any array entries beyond "max" are just garbage.
00196  */
00197 struct colordesc
00198 {
00199     uchr        nchrs;          /* number of chars of this color */
00200     color       sub;            /* open subcolor, if any; or free-chain ptr */
00201 #define  NOSUB   COLORLESS      /* value of "sub" when no open subcolor */
00202     struct arc *arcs;           /* chain of all arcs of this color */
00203     chr         firstchr;       /* char first assigned to this color */
00204     int         flags;          /* bit values defined next */
00205 #define  FREECOL 01             /* currently free */
00206 #define  PSEUDO  02             /* pseudocolor, no real chars */
00207 #define  UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
00208     union tree *block;          /* block of solid color, if any */
00209 };
00210 
00211 /*
00212  * The color map itself
00213  *
00214  * Much of the data in the colormap struct is only used at compile time.
00215  * However, the bulk of the space usage is in the "tree" structure, so it's
00216  * not clear that there's much point in converting the rest to a more compact
00217  * form when compilation is finished.
00218  */
00219 struct colormap
00220 {
00221     int         magic;
00222 #define  CMMAGIC 0x876
00223     struct vars *v;             /* for compile error reporting */
00224     size_t      ncds;           /* allocated length of colordescs array */
00225     size_t      max;            /* highest color number currently in use */
00226     color       free;           /* beginning of free chain (if non-0) */
00227     struct colordesc *cd;       /* pointer to array of colordescs */
00228 #define  CDEND(cm)   (&(cm)->cd[(cm)->max + 1])
00229     /* If we need up to NINLINECDS, we store them here to save a malloc */
00230 #define  NINLINECDS  ((size_t)10)
00231     struct colordesc cdspace[NINLINECDS];
00232     union tree  tree[NBYTS];    /* tree top, plus lower-level fill blocks */
00233 };
00234 
00235 /* optimization magic to do fast chr->color mapping */
00236 #define B0(c)   ((c) & BYTMASK)
00237 #define B1(c)   (((c)>>BYTBITS) & BYTMASK)
00238 #define B2(c)   (((c)>>(2*BYTBITS)) & BYTMASK)
00239 #define B3(c)   (((c)>>(3*BYTBITS)) & BYTMASK)
00240 #if NBYTS == 1
00241 #define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)])
00242 #endif
00243 /* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */
00244 #if NBYTS == 2
00245 #define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)])
00246 #endif
00247 #if NBYTS == 4
00248 #define GETCOLOR(cm, c) ((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)])
00249 #endif
00250 
00251 
00252 /*
00253  * Interface definitions for locale-interface functions in regc_locale.c.
00254  */
00255 
00256 /*
00257  * Representation of a set of characters.  chrs[] represents individual
00258  * code points, ranges[] represents ranges in the form min..max inclusive.
00259  *
00260  * Note that in cvecs gotten from newcvec() and intended to be freed by
00261  * freecvec(), both arrays of chrs are after the end of the struct, not
00262  * separately malloc'd; so chrspace and rangespace are effectively immutable.
00263  */
00264 struct cvec
00265 {
00266     int         nchrs;          /* number of chrs */
00267     int         chrspace;       /* number of chrs allocated in chrs[] */
00268     chr        *chrs;           /* pointer to vector of chrs */
00269     int         nranges;        /* number of ranges (chr pairs) */
00270     int         rangespace;     /* number of ranges allocated in ranges[] */
00271     chr        *ranges;         /* pointer to vector of chr pairs */
00272 };
00273 
00274 
00275 /*
00276  * definitions for NFA internal representation
00277  *
00278  * Having a "from" pointer within each arc may seem redundant, but it
00279  * saves a lot of hassle.
00280  */
00281 struct state;
00282 
00283 struct arc
00284 {
00285     int         type;           /* 0 if free, else an NFA arc type code */
00286     color       co;
00287     struct state *from;         /* where it's from (and contained within) */
00288     struct state *to;           /* where it's to */
00289     struct arc *outchain;       /* link in *from's outs chain or free chain */
00290 #define  freechain   outchain
00291     struct arc *inchain;        /* link in *to's ins chain */
00292     struct arc *colorchain;     /* link in color's arc chain */
00293     struct arc *colorchainRev;  /* back-link in color's arc chain */
00294 };
00295 
00296 struct arcbatch
00297 {                               /* for bulk allocation of arcs */
00298     struct arcbatch *next;
00299 #define  ABSIZE  10
00300     struct arc  a[ABSIZE];
00301 };
00302 
00303 struct state
00304 {
00305     int         no;
00306 #define  FREESTATE   (-1)
00307     char        flag;           /* marks special states */
00308     int         nins;           /* number of inarcs */
00309     struct arc *ins;            /* chain of inarcs */
00310     int         nouts;          /* number of outarcs */
00311     struct arc *outs;           /* chain of outarcs */
00312     struct arc *free;           /* chain of free arcs */
00313     struct state *tmp;          /* temporary for traversal algorithms */
00314     struct state *next;         /* chain for traversing all */
00315     struct state *prev;         /* back chain */
00316     struct arcbatch oas;        /* first arcbatch, avoid malloc in easy case */
00317     int         noas;           /* number of arcs used in first arcbatch */
00318 };
00319 
00320 struct nfa
00321 {
00322     struct state *pre;          /* pre-initial state */
00323     struct state *init;         /* initial state */
00324     struct state *final;        /* final state */
00325     struct state *post;         /* post-final state */
00326     int         nstates;        /* for numbering states */
00327     struct state *states;       /* state-chain header */
00328     struct state *slast;        /* tail of the chain */
00329     struct state *free;         /* free list */
00330     struct colormap *cm;        /* the color map */
00331     color       bos[2];         /* colors, if any, assigned to BOS and BOL */
00332     color       eos[2];         /* colors, if any, assigned to EOS and EOL */
00333     size_t      size;           /* Current NFA size; differs from nstates as
00334                                  * it also counts the number of states created
00335                                  * by children of this state. */
00336     struct vars *v;             /* simplifies compile error reporting */
00337     struct nfa *parent;         /* parent NFA, if any */
00338 };
00339 
00340 
00341 
00342 /*
00343  * definitions for compacted NFA
00344  *
00345  * The main space savings in a compacted NFA is from making the arcs as small
00346  * as possible.  We store only the transition color and next-state number for
00347  * each arc.  The list of out arcs for each state is an array beginning at
00348  * cnfa.states[statenumber], and terminated by a dummy carc struct with
00349  * co == COLORLESS.
00350  *
00351  * The non-dummy carc structs are of two types: plain arcs and LACON arcs.
00352  * Plain arcs just store the transition color number as "co".  LACON arcs
00353  * store the lookahead constraint number plus cnfa.ncolors as "co".  LACON
00354  * arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
00355  */
00356 struct carc
00357 {
00358     color       co;             /* COLORLESS is list terminator */
00359     int         to;             /* next-state number */
00360 };
00361 
00362 struct cnfa
00363 {
00364     int         nstates;        /* number of states */
00365     int         ncolors;        /* number of colors (max color in use + 1) */
00366     int         flags;
00367 #define  HASLACONS  01          /* uses lookahead constraints */
00368     int         pre;            /* setup state number */
00369     int         post;           /* teardown state number */
00370     color       bos[2];         /* colors, if any, assigned to BOS and BOL */
00371     color       eos[2];         /* colors, if any, assigned to EOS and EOL */
00372     char       *stflags;        /* vector of per-state flags bytes */
00373 #define  CNFA_NOPROGRESS    01  /* flag bit for a no-progress state */
00374     struct carc **states;       /* vector of pointers to outarc lists */
00375     /* states[n] are pointers into a single malloc'd array of arcs */
00376     struct carc *arcs;          /* the area for the lists */
00377 };
00378 
00379 #define ZAPCNFA(cnfa)   ((cnfa).nstates = 0)
00380 #define NULLCNFA(cnfa)  ((cnfa).nstates == 0)
00381 
00382 /*
00383  * Used to limit the maximum NFA size to something sane. [Tcl Bug 1810264]
00384  */
00385 #ifndef REG_MAX_STATES
00386 #define REG_MAX_STATES  100000
00387 #endif
00388 
00389 /*
00390  * subexpression tree
00391  *
00392  * "op" is one of:
00393  *      '='  plain regex without interesting substructure (implemented as DFA)
00394  *      'b'  back-reference (has no substructure either)
00395  *      '('  capture node: captures the match of its single child
00396  *      '.'  concatenation: matches a match for left, then a match for right
00397  *      '|'  alternation: matches a match for left or a match for right
00398  *      '*'  iteration: matches some number of matches of its single child
00399  *
00400  * Note: the right child of an alternation must be another alternation or
00401  * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you
00402  * might expect.  This could stand to be changed.  Actually I'd rather see
00403  * a single alternation node with N children, but that will take revising
00404  * the representation of struct subre.
00405  *
00406  * Note: when a backref is directly quantified, we stick the min/max counts
00407  * into the backref rather than plastering an iteration node on top.  This is
00408  * for efficiency: there is no need to search for possible division points.
00409  */
00410 struct subre
00411 {
00412     char        op;             /* see type codes above */
00413     char        flags;
00414 #define  LONGER  01             /* prefers longer match */
00415 #define  SHORTER 02             /* prefers shorter match */
00416 #define  MIXED   04             /* mixed preference below */
00417 #define  CAP 010                /* capturing parens below */
00418 #define  BACKR   020            /* back reference below */
00419 #define  INUSE   0100           /* in use in final tree */
00420 #define  LOCAL   03             /* bits which may not propagate up */
00421 #define  LMIX(f) ((f)<<2)       /* LONGER -> MIXED */
00422 #define  SMIX(f) ((f)<<1)       /* SHORTER -> MIXED */
00423 #define  UP(f)   (((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED))
00424 #define  MESSY(f)    ((f)&(MIXED|CAP|BACKR))
00425 #define  PREF(f) ((f)&LOCAL)
00426 #define  PREF2(f1, f2)   ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
00427 #define  COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
00428     short       id;             /* ID of subre (1..ntree-1) */
00429     int         subno;          /* subexpression number (for 'b' and '(') */
00430     short       min;            /* min repetitions for iteration or backref */
00431     short       max;            /* max repetitions for iteration or backref */
00432     struct subre *left;         /* left child, if any (also freelist chain) */
00433     struct subre *right;        /* right child, if any */
00434     struct state *begin;        /* outarcs from here... */
00435     struct state *end;          /* ...ending in inarcs here */
00436     struct cnfa cnfa;           /* compacted NFA, if any */
00437     struct subre *chain;        /* for bookkeeping and error cleanup */
00438 };
00439 
00440 
00441 
00442 /*
00443  * table of function pointers for generic manipulation functions
00444  * A regex_t's re_fns points to one of these.
00445  */
00446 struct fns
00447 {
00448     void        FUNCPTR(free, (regex_t *));
00449 };
00450 
00451 
00452 
00453 /*
00454  * the insides of a regex_t, hidden behind a void *
00455  */
00456 struct guts
00457 {
00458     int         magic;
00459 #define  GUTSMAGIC   0xfed9
00460     int         cflags;         /* copy of compile flags */
00461     long        info;           /* copy of re_info */
00462     size_t      nsub;           /* copy of re_nsub */
00463     struct subre *tree;
00464     struct cnfa search;         /* for fast preliminary search */
00465     int         ntree;          /* number of subre's, less one */
00466     struct colormap cmap;
00467     int         FUNCPTR(compare, (const chr *, const chr *, size_t));
00468     struct subre *lacons;       /* lookahead-constraint vector */
00469     int         nlacons;        /* size of lacons */
00470 };