PostgreSQL Source Code: src/backend/regex/regc_lex.c

Go to the documentation of this file.
00001 /*
00002  * lexical analyzer
00003  * This file is #included by regcomp.c.
00004  *
00005  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
00006  *
00007  * Development of this software was funded, in part, by Cray Research Inc.,
00008  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
00009  * Corporation, none of whom are responsible for the results.  The author
00010  * thanks all of them.
00011  *
00012  * Redistribution and use in source and binary forms -- with or without
00013  * modification -- are permitted for any purpose, provided that
00014  * redistributions in source form retain this entire copyright notice and
00015  * indicate the origin and nature of any modifications.
00016  *
00017  * I'd appreciate being given credit for this package in the documentation
00018  * of software which uses it, but that is not a requirement.
00019  *
00020  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
00021  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
00022  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
00023  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00024  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00025  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00026  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00027  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00028  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00029  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030  *
00031  * src/backend/regex/regc_lex.c
00032  *
00033  */
00034 
/*
 * Scanning macros.
 *
 * All of these expand to code that refers to a variable named "v", so they
 * are usable only where such a variable is in scope.
 */

/* tests on the remaining input, none of which move v->now */
#define ATEOS()         (v->now >= v->stop)
#define HAVE(n)         (v->stop - v->now >= (n))
#define NEXT1(c)        (!ATEOS() && v->now[0] == CHR(c))
#define NEXT2(a,b)      (HAVE(2) && v->now[0] == CHR(a) && v->now[1] == CHR(b))
#define NEXT3(a,b,c)    (HAVE(3) && v->now[0] == CHR(a) && \
                         v->now[1] == CHR(b) && \
                         v->now[2] == CHR(c))

/* record the token just scanned: type only, or type plus value */
#define SET(c)          (v->nexttype = (c))
#define SETV(c, n)      (v->nexttype = (c), v->nextvalue = (n))

/* record a token (or an error) and return from the calling function */
#define RET(c)          return (SET(c), 1)
#define RETV(c, n)      return (SETV(c, n), 1)
#define FAILW(e)        return (ERR(e), 0)  /* ERR does SET(EOS) */

/* was the previous token of type t? */
#define LASTTYPE(t)     (v->lasttype == (t))
00049 
/*
 * Lexical contexts.  The current one is kept in v->lexcon; use INTOCON()
 * to switch to a context and INCON() to test for one.
 */
#define L_ERE       1           /* mainline ERE/ARE */
#define L_BRE       2           /* mainline BRE */
#define L_Q         3           /* REG_QUOTE */
#define L_EBND      4           /* ERE/ARE bound */
#define L_BBND      5           /* BRE bound */
#define L_BRACK     6           /* brackets */
#define L_CEL       7           /* collating element */
#define L_ECL       8           /* equivalence class */
#define L_CCL       9           /* character class */

#define INTOCON(c)      (v->lexcon = (c))
#define INCON(con)      (v->lexcon == (con))
00062 
/*
 * ENDOF - construct pointer just past the last element of an array
 *
 * The element size is taken from the array itself, so the result is right
 * for any element type (for chr arrays it is the same as dividing by
 * sizeof(chr)).  The argument must be a real array, not a pointer: sizeof
 * applied to a pointer would not give the array's size.
 */
#define ENDOF(array)    ((array) + sizeof(array) / sizeof((array)[0]))
00065 
00066 /*
00067  * lexstart - set up lexical stuff, scan leading options
00068  */
00069 static void
00070 lexstart(struct vars * v)
00071 {
00072     prefixes(v);                /* may turn on new type bits etc. */
00073     NOERR();
00074 
00075     if (v->cflags & REG_QUOTE)
00076     {
00077         assert(!(v->cflags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)));
00078         INTOCON(L_Q);
00079     }
00080     else if (v->cflags & REG_EXTENDED)
00081     {
00082         assert(!(v->cflags & REG_QUOTE));
00083         INTOCON(L_ERE);
00084     }
00085     else
00086     {
00087         assert(!(v->cflags & (REG_QUOTE | REG_ADVF)));
00088         INTOCON(L_BRE);
00089     }
00090 
00091     v->nexttype = EMPTY;        /* remember we were at the start */
00092     next(v);                    /* set up the first token */
00093 }
00094 
00095 /*
00096  * prefixes - implement various special prefixes
00097  */
00098 static void
00099 prefixes(struct vars * v)
00100 {
00101     /* literal string doesn't get any of this stuff */
00102     if (v->cflags & REG_QUOTE)
00103         return;
00104 
00105     /* initial "***" gets special things */
00106     if (HAVE(4) && NEXT3('*', '*', '*'))
00107         switch (*(v->now + 3))
00108         {
00109             case CHR('?'):      /* "***?" error, msg shows version */
00110                 ERR(REG_BADPAT);
00111                 return;         /* proceed no further */
00112                 break;
00113             case CHR('='):      /* "***=" shifts to literal string */
00114                 NOTE(REG_UNONPOSIX);
00115                 v->cflags |= REG_QUOTE;
00116                 v->cflags &= ~(REG_ADVANCED | REG_EXPANDED | REG_NEWLINE);
00117                 v->now += 4;
00118                 return;         /* and there can be no more prefixes */
00119                 break;
00120             case CHR(':'):      /* "***:" shifts to AREs */
00121                 NOTE(REG_UNONPOSIX);
00122                 v->cflags |= REG_ADVANCED;
00123                 v->now += 4;
00124                 break;
00125             default:            /* otherwise *** is just an error */
00126                 ERR(REG_BADRPT);
00127                 return;
00128                 break;
00129         }
00130 
00131     /* BREs and EREs don't get embedded options */
00132     if ((v->cflags & REG_ADVANCED) != REG_ADVANCED)
00133         return;
00134 
00135     /* embedded options (AREs only) */
00136     if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2)))
00137     {
00138         NOTE(REG_UNONPOSIX);
00139         v->now += 2;
00140         for (; !ATEOS() && iscalpha(*v->now); v->now++)
00141             switch (*v->now)
00142             {
00143                 case CHR('b'):  /* BREs (but why???) */
00144                     v->cflags &= ~(REG_ADVANCED | REG_QUOTE);
00145                     break;
00146                 case CHR('c'):  /* case sensitive */
00147                     v->cflags &= ~REG_ICASE;
00148                     break;
00149                 case CHR('e'):  /* plain EREs */
00150                     v->cflags |= REG_EXTENDED;
00151                     v->cflags &= ~(REG_ADVF | REG_QUOTE);
00152                     break;
00153                 case CHR('i'):  /* case insensitive */
00154                     v->cflags |= REG_ICASE;
00155                     break;
00156                 case CHR('m'):  /* Perloid synonym for n */
00157                 case CHR('n'):  /* \n affects ^ $ . [^ */
00158                     v->cflags |= REG_NEWLINE;
00159                     break;
00160                 case CHR('p'):  /* ~Perl, \n affects . [^ */
00161                     v->cflags |= REG_NLSTOP;
00162                     v->cflags &= ~REG_NLANCH;
00163                     break;
00164                 case CHR('q'):  /* literal string */
00165                     v->cflags |= REG_QUOTE;
00166                     v->cflags &= ~REG_ADVANCED;
00167                     break;
00168                 case CHR('s'):  /* single line, \n ordinary */
00169                     v->cflags &= ~REG_NEWLINE;
00170                     break;
00171                 case CHR('t'):  /* tight syntax */
00172                     v->cflags &= ~REG_EXPANDED;
00173                     break;
00174                 case CHR('w'):  /* weird, \n affects ^ $ only */
00175                     v->cflags &= ~REG_NLSTOP;
00176                     v->cflags |= REG_NLANCH;
00177                     break;
00178                 case CHR('x'):  /* expanded syntax */
00179                     v->cflags |= REG_EXPANDED;
00180                     break;
00181                 default:
00182                     ERR(REG_BADOPT);
00183                     return;
00184             }
00185         if (!NEXT1(')'))
00186         {
00187             ERR(REG_BADOPT);
00188             return;
00189         }
00190         v->now++;
00191         if (v->cflags & REG_QUOTE)
00192             v->cflags &= ~(REG_EXPANDED | REG_NEWLINE);
00193     }
00194 }
00195 
00196 /*
00197  * lexnest - "call a subroutine", interpolating string at the lexical level
00198  *
00199  * Note, this is not a very general facility.  There are a number of
00200  * implicit assumptions about what sorts of strings can be subroutines.
00201  */
00202 static void
00203 lexnest(struct vars * v,
00204         const chr *beginp,      /* start of interpolation */
00205         const chr *endp)        /* one past end of interpolation */
00206 {
00207     assert(v->savenow == NULL); /* only one level of nesting */
00208     v->savenow = v->now;
00209     v->savestop = v->stop;
00210     v->now = beginp;
00211     v->stop = endp;
00212 }
00213 
00214 /*
00215  * string constants to interpolate as expansions of things like \d
00216  */
00217 static const chr backd[] = {    /* \d */
00218     CHR('['), CHR('['), CHR(':'),
00219     CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
00220     CHR(':'), CHR(']'), CHR(']')
00221 };
00222 static const chr backD[] = {    /* \D */
00223     CHR('['), CHR('^'), CHR('['), CHR(':'),
00224     CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
00225     CHR(':'), CHR(']'), CHR(']')
00226 };
00227 static const chr brbackd[] = {  /* \d within brackets */
00228     CHR('['), CHR(':'),
00229     CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
00230     CHR(':'), CHR(']')
00231 };
00232 static const chr backs[] = {    /* \s */
00233     CHR('['), CHR('['), CHR(':'),
00234     CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
00235     CHR(':'), CHR(']'), CHR(']')
00236 };
00237 static const chr backS[] = {    /* \S */
00238     CHR('['), CHR('^'), CHR('['), CHR(':'),
00239     CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
00240     CHR(':'), CHR(']'), CHR(']')
00241 };
00242 static const chr brbacks[] = {  /* \s within brackets */
00243     CHR('['), CHR(':'),
00244     CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
00245     CHR(':'), CHR(']')
00246 };
00247 static const chr backw[] = {    /* \w */
00248     CHR('['), CHR('['), CHR(':'),
00249     CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
00250     CHR(':'), CHR(']'), CHR('_'), CHR(']')
00251 };
00252 static const chr backW[] = {    /* \W */
00253     CHR('['), CHR('^'), CHR('['), CHR(':'),
00254     CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
00255     CHR(':'), CHR(']'), CHR('_'), CHR(']')
00256 };
00257 static const chr brbackw[] = {  /* \w within brackets */
00258     CHR('['), CHR(':'),
00259     CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
00260     CHR(':'), CHR(']'), CHR('_')
00261 };
00262 
00263 /*
00264  * lexword - interpolate a bracket expression for word characters
00265  * Possibly ought to inquire whether there is a "word" character class.
00266  */
00267 static void
00268 lexword(struct vars * v)
00269 {
00270     lexnest(v, backw, ENDOF(backw));
00271 }
00272 
00273 /*
00274  * next - get next token
00275  */
00276 static int                      /* 1 normal, 0 failure */
00277 next(struct vars * v)
00278 {
00279     chr         c;
00280 
00281     /* errors yield an infinite sequence of failures */
00282     if (ISERR())
00283         return 0;               /* the error has set nexttype to EOS */
00284 
00285     /* remember flavor of last token */
00286     v->lasttype = v->nexttype;
00287 
00288     /* REG_BOSONLY */
00289     if (v->nexttype == EMPTY && (v->cflags & REG_BOSONLY))
00290     {
00291         /* at start of a REG_BOSONLY RE */
00292         RETV(SBEGIN, 0);        /* same as \A */
00293     }
00294 
00295     /* if we're nested and we've hit end, return to outer level */
00296     if (v->savenow != NULL && ATEOS())
00297     {
00298         v->now = v->savenow;
00299         v->stop = v->savestop;
00300         v->savenow = v->savestop = NULL;
00301     }
00302 
00303     /* skip white space etc. if appropriate (not in literal or []) */
00304     if (v->cflags & REG_EXPANDED)
00305         switch (v->lexcon)
00306         {
00307             case L_ERE:
00308             case L_BRE:
00309             case L_EBND:
00310             case L_BBND:
00311                 skip(v);
00312                 break;
00313         }
00314 
00315     /* handle EOS, depending on context */
00316     if (ATEOS())
00317     {
00318         switch (v->lexcon)
00319         {
00320             case L_ERE:
00321             case L_BRE:
00322             case L_Q:
00323                 RET(EOS);
00324                 break;
00325             case L_EBND:
00326             case L_BBND:
00327                 FAILW(REG_EBRACE);
00328                 break;
00329             case L_BRACK:
00330             case L_CEL:
00331             case L_ECL:
00332             case L_CCL:
00333                 FAILW(REG_EBRACK);
00334                 break;
00335         }
00336         assert(NOTREACHED);
00337     }
00338 
00339     /* okay, time to actually get a character */
00340     c = *v->now++;
00341 
00342     /* deal with the easy contexts, punt EREs to code below */
00343     switch (v->lexcon)
00344     {
00345         case L_BRE:             /* punt BREs to separate function */
00346             return brenext(v, c);
00347             break;
00348         case L_ERE:             /* see below */
00349             break;
00350         case L_Q:               /* literal strings are easy */
00351             RETV(PLAIN, c);
00352             break;
00353         case L_BBND:            /* bounds are fairly simple */
00354         case L_EBND:
00355             switch (c)
00356             {
00357                 case CHR('0'):
00358                 case CHR('1'):
00359                 case CHR('2'):
00360                 case CHR('3'):
00361                 case CHR('4'):
00362                 case CHR('5'):
00363                 case CHR('6'):
00364                 case CHR('7'):
00365                 case CHR('8'):
00366                 case CHR('9'):
00367                     RETV(DIGIT, (chr) DIGITVAL(c));
00368                     break;
00369                 case CHR(','):
00370                     RET(',');
00371                     break;
00372                 case CHR('}'):  /* ERE bound ends with } */
00373                     if (INCON(L_EBND))
00374                     {
00375                         INTOCON(L_ERE);
00376                         if ((v->cflags & REG_ADVF) && NEXT1('?'))
00377                         {
00378                             v->now++;
00379                             NOTE(REG_UNONPOSIX);
00380                             RETV('}', 0);
00381                         }
00382                         RETV('}', 1);
00383                     }
00384                     else
00385                         FAILW(REG_BADBR);
00386                     break;
00387                 case CHR('\\'): /* BRE bound ends with \} */
00388                     if (INCON(L_BBND) && NEXT1('}'))
00389                     {
00390                         v->now++;
00391                         INTOCON(L_BRE);
00392                         RET('}');
00393                     }
00394                     else
00395                         FAILW(REG_BADBR);
00396                     break;
00397                 default:
00398                     FAILW(REG_BADBR);
00399                     break;
00400             }
00401             assert(NOTREACHED);
00402             break;
00403         case L_BRACK:           /* brackets are not too hard */
00404             switch (c)
00405             {
00406                 case CHR(']'):
00407                     if (LASTTYPE('['))
00408                         RETV(PLAIN, c);
00409                     else
00410                     {
00411                         INTOCON((v->cflags & REG_EXTENDED) ?
00412                                 L_ERE : L_BRE);
00413                         RET(']');
00414                     }
00415                     break;
00416                 case CHR('\\'):
00417                     NOTE(REG_UBBS);
00418                     if (!(v->cflags & REG_ADVF))
00419                         RETV(PLAIN, c);
00420                     NOTE(REG_UNONPOSIX);
00421                     if (ATEOS())
00422                         FAILW(REG_EESCAPE);
00423                     (DISCARD) lexescape(v);
00424                     switch (v->nexttype)
00425                     {           /* not all escapes okay here */
00426                         case PLAIN:
00427                             return 1;
00428                             break;
00429                         case CCLASS:
00430                             switch (v->nextvalue)
00431                             {
00432                                 case 'd':
00433                                     lexnest(v, brbackd, ENDOF(brbackd));
00434                                     break;
00435                                 case 's':
00436                                     lexnest(v, brbacks, ENDOF(brbacks));
00437                                     break;
00438                                 case 'w':
00439                                     lexnest(v, brbackw, ENDOF(brbackw));
00440                                     break;
00441                                 default:
00442                                     FAILW(REG_EESCAPE);
00443                                     break;
00444                             }
00445                             /* lexnest done, back up and try again */
00446                             v->nexttype = v->lasttype;
00447                             return next(v);
00448                             break;
00449                     }
00450                     /* not one of the acceptable escapes */
00451                     FAILW(REG_EESCAPE);
00452                     break;
00453                 case CHR('-'):
00454                     if (LASTTYPE('[') || NEXT1(']'))
00455                         RETV(PLAIN, c);
00456                     else
00457                         RETV(RANGE, c);
00458                     break;
00459                 case CHR('['):
00460                     if (ATEOS())
00461                         FAILW(REG_EBRACK);
00462                     switch (*v->now++)
00463                     {
00464                         case CHR('.'):
00465                             INTOCON(L_CEL);
00466                             /* might or might not be locale-specific */
00467                             RET(COLLEL);
00468                             break;
00469                         case CHR('='):
00470                             INTOCON(L_ECL);
00471                             NOTE(REG_ULOCALE);
00472                             RET(ECLASS);
00473                             break;
00474                         case CHR(':'):
00475                             INTOCON(L_CCL);
00476                             NOTE(REG_ULOCALE);
00477                             RET(CCLASS);
00478                             break;
00479                         default:        /* oops */
00480                             v->now--;
00481                             RETV(PLAIN, c);
00482                             break;
00483                     }
00484                     assert(NOTREACHED);
00485                     break;
00486                 default:
00487                     RETV(PLAIN, c);
00488                     break;
00489             }
00490             assert(NOTREACHED);
00491             break;
00492         case L_CEL:             /* collating elements are easy */
00493             if (c == CHR('.') && NEXT1(']'))
00494             {
00495                 v->now++;
00496                 INTOCON(L_BRACK);
00497                 RETV(END, '.');
00498             }
00499             else
00500                 RETV(PLAIN, c);
00501             break;
00502         case L_ECL:             /* ditto equivalence classes */
00503             if (c == CHR('=') && NEXT1(']'))
00504             {
00505                 v->now++;
00506                 INTOCON(L_BRACK);
00507                 RETV(END, '=');
00508             }
00509             else
00510                 RETV(PLAIN, c);
00511             break;
00512         case L_CCL:             /* ditto character classes */
00513             if (c == CHR(':') && NEXT1(']'))
00514             {
00515                 v->now++;
00516                 INTOCON(L_BRACK);
00517                 RETV(END, ':');
00518             }
00519             else
00520                 RETV(PLAIN, c);
00521             break;
00522         default:
00523             assert(NOTREACHED);
00524             break;
00525     }
00526 
00527     /* that got rid of everything except EREs and AREs */
00528     assert(INCON(L_ERE));
00529 
00530     /* deal with EREs and AREs, except for backslashes */
00531     switch (c)
00532     {
00533         case CHR('|'):
00534             RET('|');
00535             break;
00536         case CHR('*'):
00537             if ((v->cflags & REG_ADVF) && NEXT1('?'))
00538             {
00539                 v->now++;
00540                 NOTE(REG_UNONPOSIX);
00541                 RETV('*', 0);
00542             }
00543             RETV('*', 1);
00544             break;
00545         case CHR('+'):
00546             if ((v->cflags & REG_ADVF) && NEXT1('?'))
00547             {
00548                 v->now++;
00549                 NOTE(REG_UNONPOSIX);
00550                 RETV('+', 0);
00551             }
00552             RETV('+', 1);
00553             break;
00554         case CHR('?'):
00555             if ((v->cflags & REG_ADVF) && NEXT1('?'))
00556             {
00557                 v->now++;
00558                 NOTE(REG_UNONPOSIX);
00559                 RETV('?', 0);
00560             }
00561             RETV('?', 1);
00562             break;
00563         case CHR('{'):          /* bounds start or plain character */
00564             if (v->cflags & REG_EXPANDED)
00565                 skip(v);
00566             if (ATEOS() || !iscdigit(*v->now))
00567             {
00568                 NOTE(REG_UBRACES);
00569                 NOTE(REG_UUNSPEC);
00570                 RETV(PLAIN, c);
00571             }
00572             else
00573             {
00574                 NOTE(REG_UBOUNDS);
00575                 INTOCON(L_EBND);
00576                 RET('{');
00577             }
00578             assert(NOTREACHED);
00579             break;
00580         case CHR('('):          /* parenthesis, or advanced extension */
00581             if ((v->cflags & REG_ADVF) && NEXT1('?'))
00582             {
00583                 NOTE(REG_UNONPOSIX);
00584                 v->now++;
00585                 switch (*v->now++)
00586                 {
00587                     case CHR(':'):      /* non-capturing paren */
00588                         RETV('(', 0);
00589                         break;
00590                     case CHR('#'):      /* comment */
00591                         while (!ATEOS() && *v->now != CHR(')'))
00592                             v->now++;
00593                         if (!ATEOS())
00594                             v->now++;
00595                         assert(v->nexttype == v->lasttype);
00596                         return next(v);
00597                         break;
00598                     case CHR('='):      /* positive lookahead */
00599                         NOTE(REG_ULOOKAHEAD);
00600                         RETV(LACON, 1);
00601                         break;
00602                     case CHR('!'):      /* negative lookahead */
00603                         NOTE(REG_ULOOKAHEAD);
00604                         RETV(LACON, 0);
00605                         break;
00606                     default:
00607                         FAILW(REG_BADRPT);
00608                         break;
00609                 }
00610                 assert(NOTREACHED);
00611             }
00612             if (v->cflags & REG_NOSUB)
00613                 RETV('(', 0);   /* all parens non-capturing */
00614             else
00615                 RETV('(', 1);
00616             break;
00617         case CHR(')'):
00618             if (LASTTYPE('('))
00619                 NOTE(REG_UUNSPEC);
00620             RETV(')', c);
00621             break;
00622         case CHR('['):          /* easy except for [[:<:]] and [[:>:]] */
00623             if (HAVE(6) && *(v->now + 0) == CHR('[') &&
00624                 *(v->now + 1) == CHR(':') &&
00625                 (*(v->now + 2) == CHR('<') ||
00626                  *(v->now + 2) == CHR('>')) &&
00627                 *(v->now + 3) == CHR(':') &&
00628                 *(v->now + 4) == CHR(']') &&
00629                 *(v->now + 5) == CHR(']'))
00630             {
00631                 c = *(v->now + 2);
00632                 v->now += 6;
00633                 NOTE(REG_UNONPOSIX);
00634                 RET((c == CHR('<')) ? '<' : '>');
00635             }
00636             INTOCON(L_BRACK);
00637             if (NEXT1('^'))
00638             {
00639                 v->now++;
00640                 RETV('[', 0);
00641             }
00642             RETV('[', 1);
00643             break;
00644         case CHR('.'):
00645             RET('.');
00646             break;
00647         case CHR('^'):
00648             RET('^');
00649             break;
00650         case CHR('$'):
00651             RET('$');
00652             break;
00653         case CHR('\\'): /* mostly punt backslashes to code below */
00654             if (ATEOS())
00655                 FAILW(REG_EESCAPE);
00656             break;
00657         default:                /* ordinary character */
00658             RETV(PLAIN, c);
00659             break;
00660     }
00661 
00662     /* ERE/ARE backslash handling; backslash already eaten */
00663     assert(!ATEOS());
00664     if (!(v->cflags & REG_ADVF))
00665     {                           /* only AREs have non-trivial escapes */
00666         if (iscalnum(*v->now))
00667         {
00668             NOTE(REG_UBSALNUM);
00669             NOTE(REG_UUNSPEC);
00670         }
00671         RETV(PLAIN, *v->now++);
00672     }
00673     (DISCARD) lexescape(v);
00674     if (ISERR())
00675         FAILW(REG_EESCAPE);
00676     if (v->nexttype == CCLASS)
00677     {                           /* fudge at lexical level */
00678         switch (v->nextvalue)
00679         {
00680             case 'd':
00681                 lexnest(v, backd, ENDOF(backd));
00682                 break;
00683             case 'D':
00684                 lexnest(v, backD, ENDOF(backD));
00685                 break;
00686             case 's':
00687                 lexnest(v, backs, ENDOF(backs));
00688                 break;
00689             case 'S':
00690                 lexnest(v, backS, ENDOF(backS));
00691                 break;
00692             case 'w':
00693                 lexnest(v, backw, ENDOF(backw));
00694                 break;
00695             case 'W':
00696                 lexnest(v, backW, ENDOF(backW));
00697                 break;
00698             default:
00699                 assert(NOTREACHED);
00700                 FAILW(REG_ASSERT);
00701                 break;
00702         }
00703         /* lexnest done, back up and try again */
00704         v->nexttype = v->lasttype;
00705         return next(v);
00706     }
00707     /* otherwise, lexescape has already done the work */
00708     return !ISERR();
00709 }
00710 
00711 /*
00712  * lexescape - parse an ARE backslash escape (backslash already eaten)
00713  * Note slightly nonstandard use of the CCLASS type code.
00714  */
00715 static int                      /* not actually used, but convenient for RETV */
00716 lexescape(struct vars * v)
00717 {
00718     chr         c;
00719     static chr  alert[] = {
00720         CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
00721     };
00722     static chr  esc[] = {
00723         CHR('E'), CHR('S'), CHR('C')
00724     };
00725     const chr  *save;
00726 
00727     assert(v->cflags & REG_ADVF);
00728 
00729     assert(!ATEOS());
00730     c = *v->now++;
00731     if (!iscalnum(c))
00732         RETV(PLAIN, c);
00733 
00734     NOTE(REG_UNONPOSIX);
00735     switch (c)
00736     {
00737         case CHR('a'):
00738             RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
00739             break;
00740         case CHR('A'):
00741             RETV(SBEGIN, 0);
00742             break;
00743         case CHR('b'):
00744             RETV(PLAIN, CHR('\b'));
00745             break;
00746         case CHR('B'):
00747             RETV(PLAIN, CHR('\\'));
00748             break;
00749         case CHR('c'):
00750             NOTE(REG_UUNPORT);
00751             if (ATEOS())
00752                 FAILW(REG_EESCAPE);
00753             RETV(PLAIN, (chr) (*v->now++ & 037));
00754             break;
00755         case CHR('d'):
00756             NOTE(REG_ULOCALE);
00757             RETV(CCLASS, 'd');
00758             break;
00759         case CHR('D'):
00760             NOTE(REG_ULOCALE);
00761             RETV(CCLASS, 'D');
00762             break;
00763         case CHR('e'):
00764             NOTE(REG_UUNPORT);
00765             RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
00766             break;
00767         case CHR('f'):
00768             RETV(PLAIN, CHR('\f'));
00769             break;
00770         case CHR('m'):
00771             RET('<');
00772             break;
00773         case CHR('M'):
00774             RET('>');
00775             break;
00776         case CHR('n'):
00777             RETV(PLAIN, CHR('\n'));
00778             break;
00779         case CHR('r'):
00780             RETV(PLAIN, CHR('\r'));
00781             break;
00782         case CHR('s'):
00783             NOTE(REG_ULOCALE);
00784             RETV(CCLASS, 's');
00785             break;
00786         case CHR('S'):
00787             NOTE(REG_ULOCALE);
00788             RETV(CCLASS, 'S');
00789             break;
00790         case CHR('t'):
00791             RETV(PLAIN, CHR('\t'));
00792             break;
00793         case CHR('u'):
00794             c = lexdigits(v, 16, 4, 4);
00795             if (ISERR())
00796                 FAILW(REG_EESCAPE);
00797             RETV(PLAIN, c);
00798             break;
00799         case CHR('U'):
00800             c = lexdigits(v, 16, 8, 8);
00801             if (ISERR())
00802                 FAILW(REG_EESCAPE);
00803             RETV(PLAIN, c);
00804             break;
00805         case CHR('v'):
00806             RETV(PLAIN, CHR('\v'));
00807             break;
00808         case CHR('w'):
00809             NOTE(REG_ULOCALE);
00810             RETV(CCLASS, 'w');
00811             break;
00812         case CHR('W'):
00813             NOTE(REG_ULOCALE);
00814             RETV(CCLASS, 'W');
00815             break;
00816         case CHR('x'):
00817             NOTE(REG_UUNPORT);
00818             c = lexdigits(v, 16, 1, 255);       /* REs >255 long outside spec */
00819             if (ISERR())
00820                 FAILW(REG_EESCAPE);
00821             RETV(PLAIN, c);
00822             break;
00823         case CHR('y'):
00824             NOTE(REG_ULOCALE);
00825             RETV(WBDRY, 0);
00826             break;
00827         case CHR('Y'):
00828             NOTE(REG_ULOCALE);
00829             RETV(NWBDRY, 0);
00830             break;
00831         case CHR('Z'):
00832             RETV(SEND, 0);
00833             break;
00834         case CHR('1'):
00835         case CHR('2'):
00836         case CHR('3'):
00837         case CHR('4'):
00838         case CHR('5'):
00839         case CHR('6'):
00840         case CHR('7'):
00841         case CHR('8'):
00842         case CHR('9'):
00843             save = v->now;
00844             v->now--;           /* put first digit back */
00845             c = lexdigits(v, 10, 1, 255);       /* REs >255 long outside spec */
00846             if (ISERR())
00847                 FAILW(REG_EESCAPE);
00848             /* ugly heuristic (first test is "exactly 1 digit?") */
00849             if (v->now == save || ((int) c > 0 && (int) c <= v->nsubexp))
00850             {
00851                 NOTE(REG_UBACKREF);
00852                 RETV(BACKREF, (chr) c);
00853             }
00854             /* oops, doesn't look like it's a backref after all... */
00855             v->now = save;
00856             /* and fall through into octal number */
00857         case CHR('0'):
00858             NOTE(REG_UUNPORT);
00859             v->now--;           /* put first digit back */
00860             c = lexdigits(v, 8, 1, 3);
00861             if (ISERR())
00862                 FAILW(REG_EESCAPE);
00863             RETV(PLAIN, c);
00864             break;
00865         default:
00866             assert(iscalpha(c));
00867             FAILW(REG_EESCAPE); /* unknown alphabetic escape */
00868             break;
00869     }
00870     assert(NOTREACHED);
00871 }
00872 
00873 /*
00874  * lexdigits - slurp up digits and return chr value
00875  */
00876 static chr                      /* chr value; errors signalled via ERR */
00877 lexdigits(struct vars * v,
00878           int base,
00879           int minlen,
00880           int maxlen)
00881 {
00882     uchr        n;              /* unsigned to avoid overflow misbehavior */
00883     int         len;
00884     chr         c;
00885     int         d;
00886     const uchr  ub = (uchr) base;
00887 
00888     n = 0;
00889     for (len = 0; len < maxlen && !ATEOS(); len++)
00890     {
00891         c = *v->now++;
00892         switch (c)
00893         {
00894             case CHR('0'):
00895             case CHR('1'):
00896             case CHR('2'):
00897             case CHR('3'):
00898             case CHR('4'):
00899             case CHR('5'):
00900             case CHR('6'):
00901             case CHR('7'):
00902             case CHR('8'):
00903             case CHR('9'):
00904                 d = DIGITVAL(c);
00905                 break;
00906             case CHR('a'):
00907             case CHR('A'):
00908                 d = 10;
00909                 break;
00910             case CHR('b'):
00911             case CHR('B'):
00912                 d = 11;
00913                 break;
00914             case CHR('c'):
00915             case CHR('C'):
00916                 d = 12;
00917                 break;
00918             case CHR('d'):
00919             case CHR('D'):
00920                 d = 13;
00921                 break;
00922             case CHR('e'):
00923             case CHR('E'):
00924                 d = 14;
00925                 break;
00926             case CHR('f'):
00927             case CHR('F'):
00928                 d = 15;
00929                 break;
00930             default:
00931                 v->now--;       /* oops, not a digit at all */
00932                 d = -1;
00933                 break;
00934         }
00935 
00936         if (d >= base)
00937         {                       /* not a plausible digit */
00938             v->now--;
00939             d = -1;
00940         }
00941         if (d < 0)
00942             break;              /* NOTE BREAK OUT */
00943         n = n * ub + (uchr) d;
00944     }
00945     if (len < minlen)
00946         ERR(REG_EESCAPE);
00947 
00948     return (chr) n;
00949 }
00950 
00951 /*
00952  * brenext - get next BRE token
00953  *
00954  * This is much like EREs except for all the stupid backslashes and the
00955  * context-dependency of some things.
00956  */
00957 static int                      /* 1 normal, 0 failure */
00958 brenext(struct vars * v,
00959         chr pc)
00960 {
00961     chr         c = (chr) pc;
00962 
00963     switch (c)
00964     {
00965         case CHR('*'):
00966             if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
00967                 RETV(PLAIN, c);
00968             RET('*');
00969             break;
00970         case CHR('['):
00971             if (HAVE(6) && *(v->now + 0) == CHR('[') &&
00972                 *(v->now + 1) == CHR(':') &&
00973                 (*(v->now + 2) == CHR('<') ||
00974                  *(v->now + 2) == CHR('>')) &&
00975                 *(v->now + 3) == CHR(':') &&
00976                 *(v->now + 4) == CHR(']') &&
00977                 *(v->now + 5) == CHR(']'))
00978             {
00979                 c = *(v->now + 2);
00980                 v->now += 6;
00981                 NOTE(REG_UNONPOSIX);
00982                 RET((c == CHR('<')) ? '<' : '>');
00983             }
00984             INTOCON(L_BRACK);
00985             if (NEXT1('^'))
00986             {
00987                 v->now++;
00988                 RETV('[', 0);
00989             }
00990             RETV('[', 1);
00991             break;
00992         case CHR('.'):
00993             RET('.');
00994             break;
00995         case CHR('^'):
00996             if (LASTTYPE(EMPTY))
00997                 RET('^');
00998             if (LASTTYPE('('))
00999             {
01000                 NOTE(REG_UUNSPEC);
01001                 RET('^');
01002             }
01003             RETV(PLAIN, c);
01004             break;
01005         case CHR('$'):
01006             if (v->cflags & REG_EXPANDED)
01007                 skip(v);
01008             if (ATEOS())
01009                 RET('$');
01010             if (NEXT2('\\', ')'))
01011             {
01012                 NOTE(REG_UUNSPEC);
01013                 RET('$');
01014             }
01015             RETV(PLAIN, c);
01016             break;
01017         case CHR('\\'):
01018             break;              /* see below */
01019         default:
01020             RETV(PLAIN, c);
01021             break;
01022     }
01023 
01024     assert(c == CHR('\\'));
01025 
01026     if (ATEOS())
01027         FAILW(REG_EESCAPE);
01028 
01029     c = *v->now++;
01030     switch (c)
01031     {
01032         case CHR('{'):
01033             INTOCON(L_BBND);
01034             NOTE(REG_UBOUNDS);
01035             RET('{');
01036             break;
01037         case CHR('('):
01038             RETV('(', 1);
01039             break;
01040         case CHR(')'):
01041             RETV(')', c);
01042             break;
01043         case CHR('<'):
01044             NOTE(REG_UNONPOSIX);
01045             RET('<');
01046             break;
01047         case CHR('>'):
01048             NOTE(REG_UNONPOSIX);
01049             RET('>');
01050             break;
01051         case CHR('1'):
01052         case CHR('2'):
01053         case CHR('3'):
01054         case CHR('4'):
01055         case CHR('5'):
01056         case CHR('6'):
01057         case CHR('7'):
01058         case CHR('8'):
01059         case CHR('9'):
01060             NOTE(REG_UBACKREF);
01061             RETV(BACKREF, (chr) DIGITVAL(c));
01062             break;
01063         default:
01064             if (iscalnum(c))
01065             {
01066                 NOTE(REG_UBSALNUM);
01067                 NOTE(REG_UUNSPEC);
01068             }
01069             RETV(PLAIN, c);
01070             break;
01071     }
01072 
01073     assert(NOTREACHED);
01074     return 0;
01075 }
01076 
01077 /*
01078  * skip - skip white space and comments in expanded form
01079  */
01080 static void
01081 skip(struct vars * v)
01082 {
01083     const chr  *start = v->now;
01084 
01085     assert(v->cflags & REG_EXPANDED);
01086 
01087     for (;;)
01088     {
01089         while (!ATEOS() && iscspace(*v->now))
01090             v->now++;
01091         if (ATEOS() || *v->now != CHR('#'))
01092             break;              /* NOTE BREAK OUT */
01093         assert(NEXT1('#'));
01094         while (!ATEOS() && *v->now != CHR('\n'))
01095             v->now++;
01096         /* leave the newline to be picked up by the iscspace loop */
01097     }
01098 
01099     if (v->now != start)
01100         NOTE(REG_UNONPOSIX);
01101 }
01102 
01103 /*
01104  * newline - return the chr for a newline
01105  *
01106  * This helps confine use of CHR to this source file.
01107  */
01108 static chr
01109 newline(void)
01110 {
01111     return CHR('\n');
01112 }
01113 
01114 /*
01115  * chrnamed - return the chr known by a given (chr string) name
01116  *
01117  * The code is a bit clumsy, but this routine gets only such specialized
01118  * use that it hardly matters.
01119  */
01120 static chr
01121 chrnamed(struct vars * v,
01122          const chr *startp,     /* start of name */
01123          const chr *endp,       /* just past end of name */
01124          chr lastresort)        /* what to return if name lookup fails */
01125 {
01126     celt        c;
01127     int         errsave;
01128     int         e;
01129     struct cvec *cv;
01130 
01131     errsave = v->err;
01132     v->err = 0;
01133     c = element(v, startp, endp);
01134     e = v->err;
01135     v->err = errsave;
01136 
01137     if (e != 0)
01138         return (chr) lastresort;
01139 
01140     cv = range(v, c, c, 0);
01141     if (cv->nchrs == 0)
01142         return (chr) lastresort;
01143     return cv->chrs[0];
01144 }
Header And Logo

regc_lex.c