LLVM API Documentation

regex2.h
Go to the documentation of this file.
00001 /*-
00002  * This code is derived from OpenBSD's libc/regex, original license follows:
00003  *
00004  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
00005  * Copyright (c) 1992, 1993, 1994
00006  *  The Regents of the University of California.  All rights reserved.
00007  *
00008  * This code is derived from software contributed to Berkeley by
00009  * Henry Spencer.
00010  *
00011  * Redistribution and use in source and binary forms, with or without
00012  * modification, are permitted provided that the following conditions
00013  * are met:
00014  * 1. Redistributions of source code must retain the above copyright
00015  *    notice, this list of conditions and the following disclaimer.
00016  * 2. Redistributions in binary form must reproduce the above copyright
00017  *    notice, this list of conditions and the following disclaimer in the
00018  *    documentation and/or other materials provided with the distribution.
00019  * 3. Neither the name of the University nor the names of its contributors
00020  *    may be used to endorse or promote products derived from this software
00021  *    without specific prior written permission.
00022  *
00023  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00024  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00025  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00026  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00027  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00028  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00029  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00030  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00031  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00032  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00033  * SUCH DAMAGE.
00034  *
00035  *  @(#)regex2.h  8.4 (Berkeley) 3/20/94
00036  */
00037 
00038 #ifndef LLVM_SUPPORT_REGEX2_H
00039 #define LLVM_SUPPORT_REGEX2_H
00040 
00041 /*
00042  * internals of regex_t
00043  */
00044 #define MAGIC1  ((('r'^0200)<<8) | 'e')
00045 
00046 /*
00047  * The internal representation is a *strip*, a sequence of
00048  * operators ending with an endmarker.  (Some terminology etc. is a
00049  * historical relic of earlier versions which used multiple strips.)
00050  * Certain oddities in the representation are there to permit running
00051  * the machinery backwards; in particular, any deviation from sequential
00052  * flow must be marked at both its source and its destination.  Some
00053  * fine points:
00054  *
00055  * - OPLUS_ and O_PLUS are *inside* the loop they create.
00056  * - OQUEST_ and O_QUEST are *outside* the bypass they create.
00057  * - OCH_ and O_CH are *outside* the multi-way branch they create, while
00058  *   OOR1 and OOR2 are respectively the end and the beginning of one of
00059  *   the branches.  Note that there is an implicit OOR2 following OCH_
00060  *   and an implicit OOR1 preceding O_CH.
00061  *
00062  * In state representations, an operator's bit is on to signify a state
00063  * immediately *preceding* "execution" of that operator.
00064  */
00065 typedef unsigned long sop;  /* strip operator */
00066 typedef long sopno;
00067 #define OPRMASK 0xf8000000LU
00068 #define OPDMASK 0x07ffffffLU
00069 #define OPSHIFT ((unsigned)27)
00070 #define OP(n) ((n)&OPRMASK)
00071 #define OPND(n) ((n)&OPDMASK)
00072 #define SOP(op, opnd) ((op)|(opnd))
00073 /* operators         meaning  operand     */
00074 /*            (back, fwd are offsets) */
00075 #define OEND  (1LU<<OPSHIFT)  /* endmarker  -     */
00076 #define OCHAR (2LU<<OPSHIFT)  /* character  unsigned char   */
00077 #define OBOL  (3LU<<OPSHIFT)  /* left anchor  -     */
00078 #define OEOL  (4LU<<OPSHIFT)  /* right anchor -     */
00079 #define OANY  (5LU<<OPSHIFT)  /* .    -     */
00080 #define OANYOF  (6LU<<OPSHIFT)  /* [...]  set number    */
00081 #define OBACK_  (7LU<<OPSHIFT)  /* begin \d paren number    */
00082 #define O_BACK  (8LU<<OPSHIFT)  /* end \d paren number    */
00083 #define OPLUS_  (9LU<<OPSHIFT)  /* + prefix fwd to suffix   */
00084 #define O_PLUS  (10LU<<OPSHIFT) /* + suffix back to prefix    */
00085 #define OQUEST_ (11LU<<OPSHIFT) /* ? prefix fwd to suffix   */
00086 #define O_QUEST (12LU<<OPSHIFT) /* ? suffix back to prefix    */
00087 #define OLPAREN (13LU<<OPSHIFT) /* (    fwd to )    */
00088 #define ORPAREN (14LU<<OPSHIFT) /* )    back to (   */
00089 #define OCH_  (15LU<<OPSHIFT) /* begin choice fwd to OOR2   */
00090 #define OOR1  (16LU<<OPSHIFT) /* | pt. 1  back to OOR1 or OCH_  */
00091 #define OOR2  (17LU<<OPSHIFT) /* | pt. 2  fwd to OOR2 or O_CH */
00092 #define O_CH  (18LU<<OPSHIFT) /* end choice back to OOR1    */
00093 #define OBOW  (19LU<<OPSHIFT) /* begin word -     */
00094 #define OEOW  (20LU<<OPSHIFT) /* end word -     */
00095 
00096 /*
00097  * Structure for [] character-set representation.  Character sets are
00098  * done as bit vectors, grouped 8 to a byte vector for compactness.
00099  * The individual set therefore has both a pointer to the byte vector
00100  * and a mask to pick out the relevant bit of each byte.  A hash code
00101  * simplifies testing whether two sets could be identical.
00102  *
00103  * This will get trickier for multicharacter collating elements.  As
00104  * preliminary hooks for dealing with such things, we also carry along
00105  * a string of multi-character elements, and decide the size of the
00106  * vectors at run time.
00107  */
00108 typedef struct {
00109   uch *ptr;   /* -> uch [csetsize] */
00110   uch mask;   /* bit within array */
00111   uch hash;   /* hash code */
00112   size_t smultis;
00113   char *multis;   /* -> char[smulti]  ab\0cd\0ef\0\0 */
00114 } cset;
00115 /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
00116 #define CHadd(cs, c)  ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
00117 #define CHsub(cs, c)  ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
00118 #define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask)
00119 #define MCadd(p, cs, cp)  mcadd(p, cs, cp)  /* llvm_regcomp() internal fns */
00120 #define MCsub(p, cs, cp)  mcsub(p, cs, cp)
00121 #define MCin(p, cs, cp) mcin(p, cs, cp)
00122 
00123 /* stuff for character categories */
00124 typedef unsigned char cat_t;
00125 
00126 /*
00127  * main compiled-expression structure
00128  */
00129 struct re_guts {
00130   int magic;
00131 #   define  MAGIC2  ((('R'^0200)<<8)|'E')
00132   sop *strip;   /* malloced area for strip */
00133   int csetsize;   /* number of bits in a cset vector */
00134   int ncsets;   /* number of csets in use */
00135   cset *sets;   /* -> cset [ncsets] */
00136   uch *setbits;   /* -> uch[csetsize][ncsets/CHAR_BIT] */
00137   int cflags;   /* copy of llvm_regcomp() cflags argument */
00138   sopno nstates;    /* = number of sops */
00139   sopno firststate; /* the initial OEND (normally 0) */
00140   sopno laststate;  /* the final OEND */
00141   int iflags;   /* internal flags */
00142 #   define  USEBOL  01  /* used ^ */
00143 #   define  USEEOL  02  /* used $ */
00144 #   define  REGEX_BAD 04  /* something wrong */
00145   int nbol;   /* number of ^ used */
00146   int neol;   /* number of $ used */
00147   int ncategories;  /* how many character categories */
00148   cat_t *categories;  /* ->catspace[-CHAR_MIN] */
00149   char *must;   /* match must contain this string */
00150   int mlen;   /* length of must */
00151   size_t nsub;    /* copy of re_nsub */
00152   int backrefs;   /* does it use back references? */
00153   sopno nplus;    /* how deep does it nest +s? */
00154   /* catspace must be last */
00155   cat_t catspace[1];  /* actually [NC] */
00156 };
00157 
00158 /* misc utilities */
00159 #define OUT (CHAR_MAX+1)  /* a non-character value */
00160 #define ISWORD(c) (isalnum(c&0xff) || (c) == '_')
00161 
00162 #endif