Header And Logo

PostgreSQL
| The world's most advanced open source database.

regc_locale.c

Go to the documentation of this file.
00001 /*
00002  * regc_locale.c --
00003  *
00004  *  This file contains locale-specific regexp routines.
00005  *  This file is #included by regcomp.c.
00006  *
00007  * Copyright (c) 1998 by Scriptics Corporation.
00008  *
00009  * This software is copyrighted by the Regents of the University of
00010  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
00011  * Corporation and other parties.  The following terms apply to all files
00012  * associated with the software unless explicitly disclaimed in
00013  * individual files.
00014  *
00015  * The authors hereby grant permission to use, copy, modify, distribute,
00016  * and license this software and its documentation for any purpose, provided
00017  * that existing copyright notices are retained in all copies and that this
00018  * notice is included verbatim in any distributions. No written agreement,
00019  * license, or royalty fee is required for any of the authorized uses.
00020  * Modifications to this software may be copyrighted by their authors
00021  * and need not follow the licensing terms described here, provided that
00022  * the new terms are clearly indicated on the first page of each file where
00023  * they apply.
00024  *
00025  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
00026  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
00027  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
00028  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
00029  * POSSIBILITY OF SUCH DAMAGE.
00030  *
00031  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
00032  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
00033  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
00034  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
00035  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
00036  * MODIFICATIONS.
00037  *
00038  * GOVERNMENT USE: If you are acquiring this software on behalf of the
00039  * U.S. government, the Government shall have only "Restricted Rights"
00040  * in the software and related documentation as defined in the Federal
00041  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
00042  * are acquiring the software on behalf of the Department of Defense, the
00043  * software shall be classified as "Commercial Computer Software" and the
00044  * Government shall have only "Restricted Rights" as defined in Clause
00045  * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
00046  * authors grant the U.S. Government and others acting in its behalf
00047  * permission to use and distribute the software in accordance with the
00048  * terms specified in this license.
00049  *
00050  * src/backend/regex/regc_locale.c
00051  */
00052 
/*
 * ASCII character-name table
 *
 * Maps symbolic names to single-byte character codes.  Several codes have
 * more than one name.  The table is terminated by an entry whose name is
 * NULL.
 */
static const struct cname
{
    const char *name;
    const char  code;
}   cnames[] =
{
    {"NUL", '\0'},
    {"SOH", '\001'},
    {"STX", '\002'},
    {"ETX", '\003'},
    {"EOT", '\004'},
    {"ENQ", '\005'},
    {"ACK", '\006'},
    {"BEL", '\007'},
    {"alert", '\007'},
    {"BS", '\010'},
    {"backspace", '\b'},
    {"HT", '\011'},
    {"tab", '\t'},
    {"LF", '\012'},
    {"newline", '\n'},
    {"VT", '\013'},
    {"vertical-tab", '\v'},
    {"FF", '\014'},
    {"form-feed", '\f'},
    {"CR", '\015'},
    {"carriage-return", '\r'},
    {"SO", '\016'},
    {"SI", '\017'},
    {"DLE", '\020'},
    {"DC1", '\021'},
    {"DC2", '\022'},
    {"DC3", '\023'},
    {"DC4", '\024'},
    {"NAK", '\025'},
    {"SYN", '\026'},
    {"ETB", '\027'},
    {"CAN", '\030'},
    {"EM", '\031'},
    {"SUB", '\032'},
    {"ESC", '\033'},
    {"IS4", '\034'},
    {"FS", '\034'},
    {"IS3", '\035'},
    {"GS", '\035'},
    {"IS2", '\036'},
    {"RS", '\036'},
    {"IS1", '\037'},
    {"US", '\037'},
    {"space", ' '},
    {"exclamation-mark", '!'},
    {"quotation-mark", '"'},
    {"number-sign", '#'},
    {"dollar-sign", '$'},
    {"percent-sign", '%'},
    {"ampersand", '&'},
    {"apostrophe", '\''},
    {"left-parenthesis", '('},
    {"right-parenthesis", ')'},
    {"asterisk", '*'},
    {"plus-sign", '+'},
    {"comma", ','},
    {"hyphen", '-'},
    {"hyphen-minus", '-'},
    {"period", '.'},
    {"full-stop", '.'},
    {"slash", '/'},
    {"solidus", '/'},
    {"zero", '0'},
    {"one", '1'},
    {"two", '2'},
    {"three", '3'},
    {"four", '4'},
    {"five", '5'},
    {"six", '6'},
    {"seven", '7'},
    {"eight", '8'},
    {"nine", '9'},
    {"colon", ':'},
    {"semicolon", ';'},
    {"less-than-sign", '<'},
    {"equals-sign", '='},
    {"greater-than-sign", '>'},
    {"question-mark", '?'},
    {"commercial-at", '@'},
    {"left-square-bracket", '['},
    {"backslash", '\\'},
    {"reverse-solidus", '\\'},
    {"right-square-bracket", ']'},
    {"circumflex", '^'},
    {"circumflex-accent", '^'},
    {"underscore", '_'},
    {"low-line", '_'},
    {"grave-accent", '`'},
    {"left-brace", '{'},
    {"left-curly-bracket", '{'},
    {"vertical-line", '|'},
    {"right-brace", '}'},
    {"right-curly-bracket", '}'},
    {"tilde", '~'},
    {"DEL", '\177'},
    {NULL, 0}                   /* end-of-table sentinel */
};
00351 
00352 
00353 /*
00354  * We do not use the hard-wired Unicode classification tables that Tcl does.
00355  * This is because (a) we need to deal with other encodings besides Unicode,
00356  * and (b) we want to track the behavior of the libc locale routines as
00357  * closely as possible.  For example, it wouldn't be unreasonable for a
00358  * locale to not consider every Unicode letter as a letter.  So we build
00359  * character classification cvecs by asking libc, even for Unicode.
00360  */
00361 
00362 
00363 /*
00364  * element - map collating-element name to celt
00365  */
00366 static celt
00367 element(struct vars * v,        /* context */
00368         const chr *startp,      /* points to start of name */
00369         const chr *endp)        /* points just past end of name */
00370 {
00371     const struct cname *cn;
00372     size_t      len;
00373 
00374     /* generic:  one-chr names stand for themselves */
00375     assert(startp < endp);
00376     len = endp - startp;
00377     if (len == 1)
00378         return *startp;
00379 
00380     NOTE(REG_ULOCALE);
00381 
00382     /* search table */
00383     for (cn = cnames; cn->name != NULL; cn++)
00384     {
00385         if (strlen(cn->name) == len &&
00386             pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
00387         {
00388             break;              /* NOTE BREAK OUT */
00389         }
00390     }
00391     if (cn->name != NULL)
00392         return CHR(cn->code);
00393 
00394     /* couldn't find it */
00395     ERR(REG_ECOLLATE);
00396     return 0;
00397 }
00398 
00399 /*
00400  * range - supply cvec for a range, including legality check
00401  */
00402 static struct cvec *
00403 range(struct vars * v,          /* context */
00404       celt a,                   /* range start */
00405       celt b,                   /* range end, might equal a */
00406       int cases)                /* case-independent? */
00407 {
00408     int         nchrs;
00409     struct cvec *cv;
00410     celt        c,
00411                 lc,
00412                 uc;
00413 
00414     if (a != b && !before(a, b))
00415     {
00416         ERR(REG_ERANGE);
00417         return NULL;
00418     }
00419 
00420     if (!cases)
00421     {                           /* easy version */
00422         cv = getcvec(v, 0, 1);
00423         NOERRN();
00424         addrange(cv, a, b);
00425         return cv;
00426     }
00427 
00428     /*
00429      * When case-independent, it's hard to decide when cvec ranges are usable,
00430      * so for now at least, we won't try.  We allocate enough space for two
00431      * case variants plus a little extra for the two title case variants.
00432      */
00433 
00434     nchrs = (b - a + 1) * 2 + 4;
00435 
00436     cv = getcvec(v, nchrs, 0);
00437     NOERRN();
00438 
00439     for (c = a; c <= b; c++)
00440     {
00441         addchr(cv, c);
00442         lc = pg_wc_tolower((chr) c);
00443         if (c != lc)
00444             addchr(cv, lc);
00445         uc = pg_wc_toupper((chr) c);
00446         if (c != uc)
00447             addchr(cv, uc);
00448     }
00449 
00450     return cv;
00451 }
00452 
00453 /*
00454  * before - is celt x before celt y, for purposes of range legality?
00455  */
00456 static int                      /* predicate */
00457 before(celt x, celt y)
00458 {
00459     if (x < y)
00460         return 1;
00461     return 0;
00462 }
00463 
00464 /*
00465  * eclass - supply cvec for an equivalence class
00466  * Must include case counterparts on request.
00467  */
00468 static struct cvec *
00469 eclass(struct vars * v,         /* context */
00470        celt c,                  /* Collating element representing the
00471                                  * equivalence class. */
00472        int cases)               /* all cases? */
00473 {
00474     struct cvec *cv;
00475 
00476     /* crude fake equivalence class for testing */
00477     if ((v->cflags & REG_FAKE) && c == 'x')
00478     {
00479         cv = getcvec(v, 4, 0);
00480         addchr(cv, (chr) 'x');
00481         addchr(cv, (chr) 'y');
00482         if (cases)
00483         {
00484             addchr(cv, (chr) 'X');
00485             addchr(cv, (chr) 'Y');
00486         }
00487         return cv;
00488     }
00489 
00490     /* otherwise, none */
00491     if (cases)
00492         return allcases(v, c);
00493     cv = getcvec(v, 1, 0);
00494     assert(cv != NULL);
00495     addchr(cv, (chr) c);
00496     return cv;
00497 }
00498 
00499 /*
00500  * cclass - supply cvec for a character class
00501  *
00502  * Must include case counterparts if "cases" is true.
00503  *
00504  * The returned cvec might be either a transient cvec gotten from getcvec(),
00505  * or a permanently cached one from pg_ctype_get_cache().  This is okay
00506  * because callers are not supposed to explicitly free the result either way.
00507  */
00508 static struct cvec *
00509 cclass(struct vars * v,         /* context */
00510        const chr *startp,       /* where the name starts */
00511        const chr *endp,         /* just past the end of the name */
00512        int cases)               /* case-independent? */
00513 {
00514     size_t      len;
00515     struct cvec *cv = NULL;
00516     const char *const * namePtr;
00517     int         i,
00518                 index;
00519 
00520     /*
00521      * The following arrays define the valid character class names.
00522      */
00523 
00524     static const char *const classNames[] = {
00525         "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
00526         "lower", "print", "punct", "space", "upper", "xdigit", NULL
00527     };
00528 
00529     enum classes
00530     {
00531         CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
00532         CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
00533     };
00534 
00535     /*
00536      * Map the name to the corresponding enumerated value.
00537      */
00538     len = endp - startp;
00539     index = -1;
00540     for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
00541     {
00542         if (strlen(*namePtr) == len &&
00543             pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
00544         {
00545             index = i;
00546             break;
00547         }
00548     }
00549     if (index == -1)
00550     {
00551         ERR(REG_ECTYPE);
00552         return NULL;
00553     }
00554 
00555     /*
00556      * Remap lower and upper to alpha if the match is case insensitive.
00557      */
00558 
00559     if (cases &&
00560         ((enum classes) index == CC_LOWER ||
00561          (enum classes) index == CC_UPPER))
00562         index = (int) CC_ALPHA;
00563 
00564     /*
00565      * Now compute the character class contents.  For classes that are based
00566      * on the behavior of a <wctype.h> or <ctype.h> function, we use
00567      * pg_ctype_get_cache so that we can cache the results.  Other classes
00568      * have definitions that are hard-wired here, and for those we just
00569      * construct a transient cvec on the fly.
00570      */
00571 
00572     switch ((enum classes) index)
00573     {
00574         case CC_PRINT:
00575             cv = pg_ctype_get_cache(pg_wc_isprint);
00576             break;
00577         case CC_ALNUM:
00578             cv = pg_ctype_get_cache(pg_wc_isalnum);
00579             break;
00580         case CC_ALPHA:
00581             cv = pg_ctype_get_cache(pg_wc_isalpha);
00582             break;
00583         case CC_ASCII:
00584             /* hard-wired meaning */
00585             cv = getcvec(v, 0, 1);
00586             if (cv)
00587                 addrange(cv, 0, 0x7f);
00588             break;
00589         case CC_BLANK:
00590             /* hard-wired meaning */
00591             cv = getcvec(v, 2, 0);
00592             addchr(cv, '\t');
00593             addchr(cv, ' ');
00594             break;
00595         case CC_CNTRL:
00596             /* hard-wired meaning */
00597             cv = getcvec(v, 0, 2);
00598             addrange(cv, 0x0, 0x1f);
00599             addrange(cv, 0x7f, 0x9f);
00600             break;
00601         case CC_DIGIT:
00602             cv = pg_ctype_get_cache(pg_wc_isdigit);
00603             break;
00604         case CC_PUNCT:
00605             cv = pg_ctype_get_cache(pg_wc_ispunct);
00606             break;
00607         case CC_XDIGIT:
00608 
00609             /*
00610              * It's not clear how to define this in non-western locales, and
00611              * even less clear that there's any particular use in trying. So
00612              * just hard-wire the meaning.
00613              */
00614             cv = getcvec(v, 0, 3);
00615             if (cv)
00616             {
00617                 addrange(cv, '0', '9');
00618                 addrange(cv, 'a', 'f');
00619                 addrange(cv, 'A', 'F');
00620             }
00621             break;
00622         case CC_SPACE:
00623             cv = pg_ctype_get_cache(pg_wc_isspace);
00624             break;
00625         case CC_LOWER:
00626             cv = pg_ctype_get_cache(pg_wc_islower);
00627             break;
00628         case CC_UPPER:
00629             cv = pg_ctype_get_cache(pg_wc_isupper);
00630             break;
00631         case CC_GRAPH:
00632             cv = pg_ctype_get_cache(pg_wc_isgraph);
00633             break;
00634     }
00635 
00636     /* If cv is NULL now, the reason must be "out of memory" */
00637     if (cv == NULL)
00638         ERR(REG_ESPACE);
00639     return cv;
00640 }
00641 
00642 /*
00643  * allcases - supply cvec for all case counterparts of a chr (including itself)
00644  *
00645  * This is a shortcut, preferably an efficient one, for simple characters;
00646  * messy cases are done via range().
00647  */
00648 static struct cvec *
00649 allcases(struct vars * v,       /* context */
00650          chr pc)                /* character to get case equivs of */
00651 {
00652     struct cvec *cv;
00653     chr         c = (chr) pc;
00654     chr         lc,
00655                 uc;
00656 
00657     lc = pg_wc_tolower((chr) c);
00658     uc = pg_wc_toupper((chr) c);
00659 
00660     cv = getcvec(v, 2, 0);
00661     addchr(cv, lc);
00662     if (lc != uc)
00663         addchr(cv, uc);
00664     return cv;
00665 }
00666 
00667 /*
00668  * cmp - chr-substring compare
00669  *
00670  * Backrefs need this.  It should preferably be efficient.
00671  * Note that it does not need to report anything except equal/unequal.
00672  * Note also that the length is exact, and the comparison should not
00673  * stop at embedded NULs!
00674  */
00675 static int                      /* 0 for equal, nonzero for unequal */
00676 cmp(const chr *x, const chr *y, /* strings to compare */
00677     size_t len)                 /* exact length of comparison */
00678 {
00679     return memcmp(VS(x), VS(y), len * sizeof(chr));
00680 }
00681 
00682 /*
00683  * casecmp - case-independent chr-substring compare
00684  *
00685  * REG_ICASE backrefs need this.  It should preferably be efficient.
00686  * Note that it does not need to report anything except equal/unequal.
00687  * Note also that the length is exact, and the comparison should not
00688  * stop at embedded NULs!
00689  */
00690 static int                      /* 0 for equal, nonzero for unequal */
00691 casecmp(const chr *x, const chr *y,     /* strings to compare */
00692         size_t len)             /* exact length of comparison */
00693 {
00694     for (; len > 0; len--, x++, y++)
00695     {
00696         if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
00697             return 1;
00698     }
00699     return 0;
00700 }