00001 /* 00002 * Internal interface definitions, etc., for the reg package 00003 * 00004 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 00005 * 00006 * Development of this software was funded, in part, by Cray Research Inc., 00007 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 00008 * Corporation, none of whom are responsible for the results. The author 00009 * thanks all of them. 00010 * 00011 * Redistribution and use in source and binary forms -- with or without 00012 * modification -- are permitted for any purpose, provided that 00013 * redistributions in source form retain this entire copyright notice and 00014 * indicate the origin and nature of any modifications. 00015 * 00016 * I'd appreciate being given credit for this package in the documentation 00017 * of software which uses it, but that is not a requirement. 00018 * 00019 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 00020 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 00021 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 00022 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 00025 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 00026 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 00027 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 00028 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00029 * 00030 * src/include/regex/regguts.h 00031 */ 00032 00033 00034 00035 /* 00036 * Environmental customization. It should not (I hope) be necessary to 00037 * alter the file you are now reading -- regcustom.h should handle it all, 00038 * given care here and elsewhere. 00039 */ 00040 #include "regcustom.h" 00041 00042 00043 00044 /* 00045 * Things that regcustom.h might override. 00046 */ 00047 00048 /* assertions */ 00049 #ifndef assert 00050 #ifndef REG_DEBUG 00051 #define NDEBUG /* no assertions */ 00052 #endif 00053 #include <assert.h> 00054 #endif 00055 00056 /* voids */ 00057 #ifndef DISCARD 00058 #define DISCARD void /* for throwing values away */ 00059 #endif 00060 #ifndef VS 00061 #define VS(x) ((void *)(x)) /* cast something to generic ptr */ 00062 #endif 00063 00064 /* function-pointer declarator */ 00065 #ifndef FUNCPTR 00066 #define FUNCPTR(name, args) (*(name)) args 00067 #endif 00068 00069 /* memory allocation */ 00070 #ifndef MALLOC 00071 #define MALLOC(n) malloc(n) 00072 #endif 00073 #ifndef REALLOC 00074 #define REALLOC(p, n) realloc(VS(p), n) 00075 #endif 00076 #ifndef FREE 00077 #define FREE(p) free(VS(p)) 00078 #endif 00079 00080 /* want size of a char in bits, and max value in bounded quantifiers */ 00081 #ifndef CHAR_BIT 00082 #include <limits.h> 00083 #endif 00084 #ifndef _POSIX2_RE_DUP_MAX 00085 #define _POSIX2_RE_DUP_MAX 255 /* normally from <limits.h> */ 00086 #endif 00087 00088 00089 00090 /* 00091 * misc 00092 */ 00093 00094 #define NOTREACHED 0 00095 #define xxx 1 00096 00097 #define DUPMAX _POSIX2_RE_DUP_MAX 00098 #define INFINITY (DUPMAX+1) 00099 00100 #define REMAGIC 0xfed7 /* magic number for main struct */ 00101 00102 00103 00104 /* 00105 * debugging facilities 00106 */ 00107 #ifdef REG_DEBUG 00108 /* FDEBUG does finite-state tracing */ 00109 #define FDEBUG(arglist) { if (v->eflags®_FTRACE) printf arglist; } 00110 /* MDEBUG does higher-level tracing */ 00111 #define MDEBUG(arglist) { if (v->eflags®_MTRACE) printf arglist; } 00112 #else 00113 #define FDEBUG(arglist) {} 00114 #define MDEBUG(arglist) {} 00115 #endif 00116 00117 00118 00119 /* 00120 * bitmap manipulation 00121 */ 00122 #define UBITS (CHAR_BIT * sizeof(unsigned)) 00123 #define BSET(uv, sn) ((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS)) 00124 #define ISBSET(uv, sn) ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS))) 00125 00126 00127 00128 /* 00129 * We dissect a chr into byts for colormap table indexing. Here we define 00130 * a byt, which will be the same as a byte on most machines... The exact 00131 * size of a byt is not critical, but about 8 bits is good, and extraction 00132 * of 8-bit chunks is sometimes especially fast. 00133 */ 00134 #ifndef BYTBITS 00135 #define BYTBITS 8 /* bits in a byt */ 00136 #endif 00137 #define BYTTAB (1<<BYTBITS) /* size of table with one entry per byt value */ 00138 #define BYTMASK (BYTTAB-1) /* bit mask for byt */ 00139 #define NBYTS ((CHRBITS+BYTBITS-1)/BYTBITS) 00140 /* the definition of GETCOLOR(), below, assumes NBYTS <= 4 */ 00141 00142 00143 00144 /* 00145 * As soon as possible, we map chrs into equivalence classes -- "colors" -- 00146 * which are of much more manageable number. 00147 */ 00148 typedef short color; /* colors of characters */ 00149 typedef int pcolor; /* what color promotes to */ 00150 00151 #define MAX_COLOR 32767 /* max color (must fit in 'color' datatype) */ 00152 #define COLORLESS (-1) /* impossible color */ 00153 #define WHITE 0 /* default color, parent of all others */ 00154 00155 00156 00157 /* 00158 * A colormap is a tree -- more precisely, a DAG -- indexed at each level 00159 * by a byt of the chr, to map the chr to a color efficiently. Because 00160 * lower sections of the tree can be shared, it can exploit the usual 00161 * sparseness of such a mapping table. The tree is always NBYTS levels 00162 * deep (in the past it was shallower during construction but was "filled" 00163 * to full depth at the end of that); areas that are unaltered as yet point 00164 * to "fill blocks" which are entirely WHITE in color. 00165 */ 00166 00167 /* the tree itself */ 00168 struct colors 00169 { 00170 color ccolor[BYTTAB]; 00171 }; 00172 struct ptrs 00173 { 00174 union tree *pptr[BYTTAB]; 00175 }; 00176 union tree 00177 { 00178 struct colors colors; 00179 struct ptrs ptrs; 00180 }; 00181 00182 #define tcolor colors.ccolor 00183 #define tptr ptrs.pptr 00184 00185 /* 00186 * Per-color data structure for the compile-time color machinery 00187 * 00188 * If "sub" is not NOSUB then it is the number of the color's current 00189 * subcolor, i.e. we are in process of dividing this color (character 00190 * equivalence class) into two colors. See src/backend/regex/README for 00191 * discussion of subcolors. 00192 * 00193 * Currently-unused colors have the FREECOL bit set and are linked into a 00194 * freelist using their "sub" fields, but only if their color numbers are 00195 * less than colormap.max. Any array entries beyond "max" are just garbage. 00196 */ 00197 struct colordesc 00198 { 00199 uchr nchrs; /* number of chars of this color */ 00200 color sub; /* open subcolor, if any; or free-chain ptr */ 00201 #define NOSUB COLORLESS /* value of "sub" when no open subcolor */ 00202 struct arc *arcs; /* chain of all arcs of this color */ 00203 chr firstchr; /* char first assigned to this color */ 00204 int flags; /* bit values defined next */ 00205 #define FREECOL 01 /* currently free */ 00206 #define PSEUDO 02 /* pseudocolor, no real chars */ 00207 #define UNUSEDCOLOR(cd) ((cd)->flags & FREECOL) 00208 union tree *block; /* block of solid color, if any */ 00209 }; 00210 00211 /* 00212 * The color map itself 00213 * 00214 * Much of the data in the colormap struct is only used at compile time. 00215 * However, the bulk of the space usage is in the "tree" structure, so it's 00216 * not clear that there's much point in converting the rest to a more compact 00217 * form when compilation is finished. 00218 */ 00219 struct colormap 00220 { 00221 int magic; 00222 #define CMMAGIC 0x876 00223 struct vars *v; /* for compile error reporting */ 00224 size_t ncds; /* allocated length of colordescs array */ 00225 size_t max; /* highest color number currently in use */ 00226 color free; /* beginning of free chain (if non-0) */ 00227 struct colordesc *cd; /* pointer to array of colordescs */ 00228 #define CDEND(cm) (&(cm)->cd[(cm)->max + 1]) 00229 /* If we need up to NINLINECDS, we store them here to save a malloc */ 00230 #define NINLINECDS ((size_t)10) 00231 struct colordesc cdspace[NINLINECDS]; 00232 union tree tree[NBYTS]; /* tree top, plus lower-level fill blocks */ 00233 }; 00234 00235 /* optimization magic to do fast chr->color mapping */ 00236 #define B0(c) ((c) & BYTMASK) 00237 #define B1(c) (((c)>>BYTBITS) & BYTMASK) 00238 #define B2(c) (((c)>>(2*BYTBITS)) & BYTMASK) 00239 #define B3(c) (((c)>>(3*BYTBITS)) & BYTMASK) 00240 #if NBYTS == 1 00241 #define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)]) 00242 #endif 00243 /* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */ 00244 #if NBYTS == 2 00245 #define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)]) 00246 #endif 00247 #if NBYTS == 4 00248 #define GETCOLOR(cm, c) ((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)]) 00249 #endif 00250 00251 00252 /* 00253 * Interface definitions for locale-interface functions in regc_locale.c. 00254 */ 00255 00256 /* 00257 * Representation of a set of characters. chrs[] represents individual 00258 * code points, ranges[] represents ranges in the form min..max inclusive. 00259 * 00260 * Note that in cvecs gotten from newcvec() and intended to be freed by 00261 * freecvec(), both arrays of chrs are after the end of the struct, not 00262 * separately malloc'd; so chrspace and rangespace are effectively immutable. 00263 */ 00264 struct cvec 00265 { 00266 int nchrs; /* number of chrs */ 00267 int chrspace; /* number of chrs allocated in chrs[] */ 00268 chr *chrs; /* pointer to vector of chrs */ 00269 int nranges; /* number of ranges (chr pairs) */ 00270 int rangespace; /* number of ranges allocated in ranges[] */ 00271 chr *ranges; /* pointer to vector of chr pairs */ 00272 }; 00273 00274 00275 /* 00276 * definitions for NFA internal representation 00277 * 00278 * Having a "from" pointer within each arc may seem redundant, but it 00279 * saves a lot of hassle. 00280 */ 00281 struct state; 00282 00283 struct arc 00284 { 00285 int type; /* 0 if free, else an NFA arc type code */ 00286 color co; 00287 struct state *from; /* where it's from (and contained within) */ 00288 struct state *to; /* where it's to */ 00289 struct arc *outchain; /* link in *from's outs chain or free chain */ 00290 #define freechain outchain 00291 struct arc *inchain; /* link in *to's ins chain */ 00292 struct arc *colorchain; /* link in color's arc chain */ 00293 struct arc *colorchainRev; /* back-link in color's arc chain */ 00294 }; 00295 00296 struct arcbatch 00297 { /* for bulk allocation of arcs */ 00298 struct arcbatch *next; 00299 #define ABSIZE 10 00300 struct arc a[ABSIZE]; 00301 }; 00302 00303 struct state 00304 { 00305 int no; 00306 #define FREESTATE (-1) 00307 char flag; /* marks special states */ 00308 int nins; /* number of inarcs */ 00309 struct arc *ins; /* chain of inarcs */ 00310 int nouts; /* number of outarcs */ 00311 struct arc *outs; /* chain of outarcs */ 00312 struct arc *free; /* chain of free arcs */ 00313 struct state *tmp; /* temporary for traversal algorithms */ 00314 struct state *next; /* chain for traversing all */ 00315 struct state *prev; /* back chain */ 00316 struct arcbatch oas; /* first arcbatch, avoid malloc in easy case */ 00317 int noas; /* number of arcs used in first arcbatch */ 00318 }; 00319 00320 struct nfa 00321 { 00322 struct state *pre; /* pre-initial state */ 00323 struct state *init; /* initial state */ 00324 struct state *final; /* final state */ 00325 struct state *post; /* post-final state */ 00326 int nstates; /* for numbering states */ 00327 struct state *states; /* state-chain header */ 00328 struct state *slast; /* tail of the chain */ 00329 struct state *free; /* free list */ 00330 struct colormap *cm; /* the color map */ 00331 color bos[2]; /* colors, if any, assigned to BOS and BOL */ 00332 color eos[2]; /* colors, if any, assigned to EOS and EOL */ 00333 size_t size; /* Current NFA size; differs from nstates as 00334 * it also counts the number of states created 00335 * by children of this state. */ 00336 struct vars *v; /* simplifies compile error reporting */ 00337 struct nfa *parent; /* parent NFA, if any */ 00338 }; 00339 00340 00341 00342 /* 00343 * definitions for compacted NFA 00344 * 00345 * The main space savings in a compacted NFA is from making the arcs as small 00346 * as possible. We store only the transition color and next-state number for 00347 * each arc. The list of out arcs for each state is an array beginning at 00348 * cnfa.states[statenumber], and terminated by a dummy carc struct with 00349 * co == COLORLESS. 00350 * 00351 * The non-dummy carc structs are of two types: plain arcs and LACON arcs. 00352 * Plain arcs just store the transition color number as "co". LACON arcs 00353 * store the lookahead constraint number plus cnfa.ncolors as "co". LACON 00354 * arcs can be distinguished from plain by testing for co >= cnfa.ncolors. 00355 */ 00356 struct carc 00357 { 00358 color co; /* COLORLESS is list terminator */ 00359 int to; /* next-state number */ 00360 }; 00361 00362 struct cnfa 00363 { 00364 int nstates; /* number of states */ 00365 int ncolors; /* number of colors (max color in use + 1) */ 00366 int flags; 00367 #define HASLACONS 01 /* uses lookahead constraints */ 00368 int pre; /* setup state number */ 00369 int post; /* teardown state number */ 00370 color bos[2]; /* colors, if any, assigned to BOS and BOL */ 00371 color eos[2]; /* colors, if any, assigned to EOS and EOL */ 00372 char *stflags; /* vector of per-state flags bytes */ 00373 #define CNFA_NOPROGRESS 01 /* flag bit for a no-progress state */ 00374 struct carc **states; /* vector of pointers to outarc lists */ 00375 /* states[n] are pointers into a single malloc'd array of arcs */ 00376 struct carc *arcs; /* the area for the lists */ 00377 }; 00378 00379 #define ZAPCNFA(cnfa) ((cnfa).nstates = 0) 00380 #define NULLCNFA(cnfa) ((cnfa).nstates == 0) 00381 00382 /* 00383 * Used to limit the maximum NFA size to something sane. [Tcl Bug 1810264] 00384 */ 00385 #ifndef REG_MAX_STATES 00386 #define REG_MAX_STATES 100000 00387 #endif 00388 00389 /* 00390 * subexpression tree 00391 * 00392 * "op" is one of: 00393 * '=' plain regex without interesting substructure (implemented as DFA) 00394 * 'b' back-reference (has no substructure either) 00395 * '(' capture node: captures the match of its single child 00396 * '.' concatenation: matches a match for left, then a match for right 00397 * '|' alternation: matches a match for left or a match for right 00398 * '*' iteration: matches some number of matches of its single child 00399 * 00400 * Note: the right child of an alternation must be another alternation or 00401 * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you 00402 * might expect. This could stand to be changed. Actually I'd rather see 00403 * a single alternation node with N children, but that will take revising 00404 * the representation of struct subre. 00405 * 00406 * Note: when a backref is directly quantified, we stick the min/max counts 00407 * into the backref rather than plastering an iteration node on top. This is 00408 * for efficiency: there is no need to search for possible division points. 00409 */ 00410 struct subre 00411 { 00412 char op; /* see type codes above */ 00413 char flags; 00414 #define LONGER 01 /* prefers longer match */ 00415 #define SHORTER 02 /* prefers shorter match */ 00416 #define MIXED 04 /* mixed preference below */ 00417 #define CAP 010 /* capturing parens below */ 00418 #define BACKR 020 /* back reference below */ 00419 #define INUSE 0100 /* in use in final tree */ 00420 #define LOCAL 03 /* bits which may not propagate up */ 00421 #define LMIX(f) ((f)<<2) /* LONGER -> MIXED */ 00422 #define SMIX(f) ((f)<<1) /* SHORTER -> MIXED */ 00423 #define UP(f) (((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED)) 00424 #define MESSY(f) ((f)&(MIXED|CAP|BACKR)) 00425 #define PREF(f) ((f)&LOCAL) 00426 #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) 00427 #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) 00428 short id; /* ID of subre (1..ntree-1) */ 00429 int subno; /* subexpression number (for 'b' and '(') */ 00430 short min; /* min repetitions for iteration or backref */ 00431 short max; /* max repetitions for iteration or backref */ 00432 struct subre *left; /* left child, if any (also freelist chain) */ 00433 struct subre *right; /* right child, if any */ 00434 struct state *begin; /* outarcs from here... */ 00435 struct state *end; /* ...ending in inarcs here */ 00436 struct cnfa cnfa; /* compacted NFA, if any */ 00437 struct subre *chain; /* for bookkeeping and error cleanup */ 00438 }; 00439 00440 00441 00442 /* 00443 * table of function pointers for generic manipulation functions 00444 * A regex_t's re_fns points to one of these. 00445 */ 00446 struct fns 00447 { 00448 void FUNCPTR(free, (regex_t *)); 00449 }; 00450 00451 00452 00453 /* 00454 * the insides of a regex_t, hidden behind a void * 00455 */ 00456 struct guts 00457 { 00458 int magic; 00459 #define GUTSMAGIC 0xfed9 00460 int cflags; /* copy of compile flags */ 00461 long info; /* copy of re_info */ 00462 size_t nsub; /* copy of re_nsub */ 00463 struct subre *tree; 00464 struct cnfa search; /* for fast preliminary search */ 00465 int ntree; /* number of subre's, less one */ 00466 struct colormap cmap; 00467 int FUNCPTR(compare, (const chr *, const chr *, size_t)); 00468 struct subre *lacons; /* lookahead-constraint vector */ 00469 int nlacons; /* size of lacons */ 00470 };