Header And Logo

PostgreSQL
| The world's most advanced open source database.

regc_pg_locale.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * regc_pg_locale.c
00004  *    ctype functions adapted to work on pg_wchar (a/k/a chr),
00005  *    and functions to cache the results of wholesale ctype probing.
00006  *
00007  * This file is #included by regcomp.c; it's not meant to compile standalone.
00008  *
00009  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00010  * Portions Copyright (c) 1994, Regents of the University of California
00011  *
00012  * IDENTIFICATION
00013  *    src/backend/regex/regc_pg_locale.c
00014  *
00015  *-------------------------------------------------------------------------
00016  */
00017 
00018 #include "catalog/pg_collation.h"
00019 #include "utils/pg_locale.h"
00020 
00021 /*
00022  * To provide as much functionality as possible on a variety of platforms,
00023  * without going so far as to implement everything from scratch, we use
00024  * several implementation strategies depending on the situation:
00025  *
00026  * 1. In C/POSIX collations, we use hard-wired code.  We can't depend on
00027  * the <ctype.h> functions since those will obey LC_CTYPE.  Note that these
00028  * collations don't give a fig about multibyte characters.
00029  *
00030  * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
00031  *
00032  * 2a. When working in UTF8 encoding, we use the <wctype.h> functions if
00033  * available.  This assumes that every platform uses Unicode codepoints
00034  * directly as the wchar_t representation of Unicode.  On some platforms
00035  * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
00036  *
00037  * 2b. In all other encodings, or on machines that lack <wctype.h>, we use
00038  * the <ctype.h> functions for pg_wchar values up to 255, and punt for values
00039  * above that.  This is only 100% correct in single-byte encodings such as
00040  * LATINn.  However, non-Unicode multibyte encodings are mostly Far Eastern
00041  * character sets for which the properties being tested here aren't very
00042  * relevant for higher code values anyway.  The difficulty with using the
00043  * <wctype.h> functions with non-Unicode multibyte encodings is that we can
00044  * have no certainty that the platform's wchar_t representation matches
00045  * what we do in pg_wchar conversions.
00046  *
00047  * 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
00048  * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
00049  * functions, under exactly the same cases as #2.
00050  *
00051  * There is one notable difference between cases 2 and 3: in the "default"
00052  * collation we force ASCII letters to follow ASCII upcase/downcase rules,
00053  * while in a non-default collation we just let the library functions do what
00054  * they will.  The case where this matters is treatment of I/i in Turkish,
00055  * and the behavior is meant to match the upper()/lower() SQL functions.
00056  *
00057  * We store the active collation setting in static variables.  In principle
00058  * it could be passed down to here via the regex library's "struct vars" data
00059  * structure; but that would require somewhat invasive changes in the regex
00060  * library, and right now there's no real benefit to be gained from that.
00061  *
00062  * NB: the coding here assumes pg_wchar is an unsigned type.
00063  */
00064 
/*
 * Which implementation the pg_wc_* functions should use for the collation
 * most recently passed to pg_set_regex_collation().  The numeric values are
 * spelled out only for readability; they are the same ones the compiler
 * would assign by default.
 */
typedef enum
{
    /* C/POSIX: hard-wired ASCII rules, independent of the encoding */
    PG_REGEX_LOCALE_C = 0,
    /* global-locale <wctype.h> functions */
    PG_REGEX_LOCALE_WIDE = 1,
    /* global-locale <ctype.h> functions */
    PG_REGEX_LOCALE_1BYTE = 2,
    /* locale_t-taking <wctype.h> functions */
    PG_REGEX_LOCALE_WIDE_L = 3,
    /* locale_t-taking <ctype.h> functions */
    PG_REGEX_LOCALE_1BYTE_L = 4
} PG_Locale_Strategy;
00073 
/*
 * Active collation state.  All three variables are written by
 * pg_set_regex_collation() and read by the pg_wc_* functions and the ctype
 * cache lookup below.
 */
/* which family of ctype functions the pg_wc_* functions dispatch to */
00074 static PG_Locale_Strategy pg_regex_strategy;
/* locale handed to the *_l functions; 0 for C and default collations */
00075 static pg_locale_t pg_regex_locale;
/* collation OID in effect (C_COLLATION_OID for any C-like collation) */
00076 static Oid  pg_regex_collation;
00077 
/*
 * Hard-wired character properties for C locale
 *
 * Each PG_ISxxx flag is one bit of an unsigned char; pg_char_properties[]
 * holds the OR of the flags that apply to each 7-bit ASCII code.
 */
#define PG_ISDIGIT  (1 << 0)
#define PG_ISALPHA  (1 << 1)
#define PG_ISALNUM  (PG_ISDIGIT | PG_ISALPHA)
#define PG_ISUPPER  (1 << 2)
#define PG_ISLOWER  (1 << 3)
#define PG_ISGRAPH  (1 << 4)
#define PG_ISPRINT  (1 << 5)
#define PG_ISPUNCT  (1 << 6)
#define PG_ISSPACE  (1 << 7)

/* Shorthands used only to build the table; removed again right after it. */
#define PG_CP_PUNCT (PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT)
#define PG_CP_DIGIT (PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_UPPER (PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_LOWER (PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT)

static const unsigned char pg_char_properties[128] = {
    /* 0x00-0x08: NUL ^A ^B ^C ^D ^E ^F ^G ^H */
    0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x09-0x0D: ^I ^J ^K ^L ^M (whitespace control characters) */
    PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE, PG_ISSPACE,

    /* 0x0E-0x1F: ^N through ^_ */
    0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x20: space */
    PG_ISPRINT | PG_ISSPACE,

    /* 0x21-0x2F: ! " # $ % & ' ( ) * + , - . / */
    PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
    PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
    PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,

    /* 0x30-0x39: 0 through 9 */
    PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
    PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,

    /* 0x3A-0x40: : ; < = > ? @ */
    PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
    PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,

    /* 0x41-0x5A: A through Z */
    PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
    PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
    PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
    PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
    PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
    PG_CP_UPPER,

    /* 0x5B-0x60: [ \ ] ^ _ ` */
    PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
    PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,

    /* 0x61-0x7A: a through z */
    PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
    PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
    PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
    PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
    PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
    PG_CP_LOWER,

    /* 0x7B-0x7E: { | } ~ */
    PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,

    /* 0x7F: DEL */
    0
};

#undef PG_CP_PUNCT
#undef PG_CP_DIGIT
#undef PG_CP_UPPER
#undef PG_CP_LOWER
00221 
00222 
00223 /*
00224  * pg_set_regex_collation: set collation for these functions to obey
00225  *
00226  * This is called when beginning compilation or execution of a regexp.
00227  * Since there's no need for re-entrancy of regexp operations, it's okay
00228  * to store the results in static variables.
00229  */
00230 void
00231 pg_set_regex_collation(Oid collation)
00232 {
00233     if (lc_ctype_is_c(collation))
00234     {
00235         /* C/POSIX collations use this path regardless of database encoding */
00236         pg_regex_strategy = PG_REGEX_LOCALE_C;
00237         pg_regex_locale = 0;
00238         pg_regex_collation = C_COLLATION_OID;
00239     }
00240     else
00241     {
00242         if (collation == DEFAULT_COLLATION_OID)
00243             pg_regex_locale = 0;
00244         else if (OidIsValid(collation))
00245         {
00246             /*
00247              * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T;
00248              * the case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not
00249              * have to be considered below.
00250              */
00251             pg_regex_locale = pg_newlocale_from_collation(collation);
00252         }
00253         else
00254         {
00255             /*
00256              * This typically means that the parser could not resolve a
00257              * conflict of implicit collations, so report it that way.
00258              */
00259             ereport(ERROR,
00260                     (errcode(ERRCODE_INDETERMINATE_COLLATION),
00261                      errmsg("could not determine which collation to use for regular expression"),
00262                      errhint("Use the COLLATE clause to set the collation explicitly.")));
00263         }
00264 
00265 #ifdef USE_WIDE_UPPER_LOWER
00266         if (GetDatabaseEncoding() == PG_UTF8)
00267         {
00268             if (pg_regex_locale)
00269                 pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
00270             else
00271                 pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
00272         }
00273         else
00274 #endif   /* USE_WIDE_UPPER_LOWER */
00275         {
00276             if (pg_regex_locale)
00277                 pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L;
00278             else
00279                 pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
00280         }
00281 
00282         pg_regex_collation = collation;
00283     }
00284 }
00285 
00286 static int
00287 pg_wc_isdigit(pg_wchar c)
00288 {
00289     switch (pg_regex_strategy)
00290     {
00291         case PG_REGEX_LOCALE_C:
00292             return (c <= (pg_wchar) 127 &&
00293                     (pg_char_properties[c] & PG_ISDIGIT));
00294         case PG_REGEX_LOCALE_WIDE:
00295 #ifdef USE_WIDE_UPPER_LOWER
00296             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00297                 return iswdigit((wint_t) c);
00298 #endif
00299             /* FALL THRU */
00300         case PG_REGEX_LOCALE_1BYTE:
00301             return (c <= (pg_wchar) UCHAR_MAX &&
00302                     isdigit((unsigned char) c));
00303         case PG_REGEX_LOCALE_WIDE_L:
00304 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00305             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00306                 return iswdigit_l((wint_t) c, pg_regex_locale);
00307 #endif
00308             /* FALL THRU */
00309         case PG_REGEX_LOCALE_1BYTE_L:
00310 #ifdef HAVE_LOCALE_T
00311             return (c <= (pg_wchar) UCHAR_MAX &&
00312                     isdigit_l((unsigned char) c, pg_regex_locale));
00313 #endif
00314             break;
00315     }
00316     return 0;                   /* can't get here, but keep compiler quiet */
00317 }
00318 
00319 static int
00320 pg_wc_isalpha(pg_wchar c)
00321 {
00322     switch (pg_regex_strategy)
00323     {
00324         case PG_REGEX_LOCALE_C:
00325             return (c <= (pg_wchar) 127 &&
00326                     (pg_char_properties[c] & PG_ISALPHA));
00327         case PG_REGEX_LOCALE_WIDE:
00328 #ifdef USE_WIDE_UPPER_LOWER
00329             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00330                 return iswalpha((wint_t) c);
00331 #endif
00332             /* FALL THRU */
00333         case PG_REGEX_LOCALE_1BYTE:
00334             return (c <= (pg_wchar) UCHAR_MAX &&
00335                     isalpha((unsigned char) c));
00336         case PG_REGEX_LOCALE_WIDE_L:
00337 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00338             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00339                 return iswalpha_l((wint_t) c, pg_regex_locale);
00340 #endif
00341             /* FALL THRU */
00342         case PG_REGEX_LOCALE_1BYTE_L:
00343 #ifdef HAVE_LOCALE_T
00344             return (c <= (pg_wchar) UCHAR_MAX &&
00345                     isalpha_l((unsigned char) c, pg_regex_locale));
00346 #endif
00347             break;
00348     }
00349     return 0;                   /* can't get here, but keep compiler quiet */
00350 }
00351 
00352 static int
00353 pg_wc_isalnum(pg_wchar c)
00354 {
00355     switch (pg_regex_strategy)
00356     {
00357         case PG_REGEX_LOCALE_C:
00358             return (c <= (pg_wchar) 127 &&
00359                     (pg_char_properties[c] & PG_ISALNUM));
00360         case PG_REGEX_LOCALE_WIDE:
00361 #ifdef USE_WIDE_UPPER_LOWER
00362             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00363                 return iswalnum((wint_t) c);
00364 #endif
00365             /* FALL THRU */
00366         case PG_REGEX_LOCALE_1BYTE:
00367             return (c <= (pg_wchar) UCHAR_MAX &&
00368                     isalnum((unsigned char) c));
00369         case PG_REGEX_LOCALE_WIDE_L:
00370 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00371             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00372                 return iswalnum_l((wint_t) c, pg_regex_locale);
00373 #endif
00374             /* FALL THRU */
00375         case PG_REGEX_LOCALE_1BYTE_L:
00376 #ifdef HAVE_LOCALE_T
00377             return (c <= (pg_wchar) UCHAR_MAX &&
00378                     isalnum_l((unsigned char) c, pg_regex_locale));
00379 #endif
00380             break;
00381     }
00382     return 0;                   /* can't get here, but keep compiler quiet */
00383 }
00384 
00385 static int
00386 pg_wc_isupper(pg_wchar c)
00387 {
00388     switch (pg_regex_strategy)
00389     {
00390         case PG_REGEX_LOCALE_C:
00391             return (c <= (pg_wchar) 127 &&
00392                     (pg_char_properties[c] & PG_ISUPPER));
00393         case PG_REGEX_LOCALE_WIDE:
00394 #ifdef USE_WIDE_UPPER_LOWER
00395             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00396                 return iswupper((wint_t) c);
00397 #endif
00398             /* FALL THRU */
00399         case PG_REGEX_LOCALE_1BYTE:
00400             return (c <= (pg_wchar) UCHAR_MAX &&
00401                     isupper((unsigned char) c));
00402         case PG_REGEX_LOCALE_WIDE_L:
00403 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00404             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00405                 return iswupper_l((wint_t) c, pg_regex_locale);
00406 #endif
00407             /* FALL THRU */
00408         case PG_REGEX_LOCALE_1BYTE_L:
00409 #ifdef HAVE_LOCALE_T
00410             return (c <= (pg_wchar) UCHAR_MAX &&
00411                     isupper_l((unsigned char) c, pg_regex_locale));
00412 #endif
00413             break;
00414     }
00415     return 0;                   /* can't get here, but keep compiler quiet */
00416 }
00417 
00418 static int
00419 pg_wc_islower(pg_wchar c)
00420 {
00421     switch (pg_regex_strategy)
00422     {
00423         case PG_REGEX_LOCALE_C:
00424             return (c <= (pg_wchar) 127 &&
00425                     (pg_char_properties[c] & PG_ISLOWER));
00426         case PG_REGEX_LOCALE_WIDE:
00427 #ifdef USE_WIDE_UPPER_LOWER
00428             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00429                 return iswlower((wint_t) c);
00430 #endif
00431             /* FALL THRU */
00432         case PG_REGEX_LOCALE_1BYTE:
00433             return (c <= (pg_wchar) UCHAR_MAX &&
00434                     islower((unsigned char) c));
00435         case PG_REGEX_LOCALE_WIDE_L:
00436 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00437             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00438                 return iswlower_l((wint_t) c, pg_regex_locale);
00439 #endif
00440             /* FALL THRU */
00441         case PG_REGEX_LOCALE_1BYTE_L:
00442 #ifdef HAVE_LOCALE_T
00443             return (c <= (pg_wchar) UCHAR_MAX &&
00444                     islower_l((unsigned char) c, pg_regex_locale));
00445 #endif
00446             break;
00447     }
00448     return 0;                   /* can't get here, but keep compiler quiet */
00449 }
00450 
00451 static int
00452 pg_wc_isgraph(pg_wchar c)
00453 {
00454     switch (pg_regex_strategy)
00455     {
00456         case PG_REGEX_LOCALE_C:
00457             return (c <= (pg_wchar) 127 &&
00458                     (pg_char_properties[c] & PG_ISGRAPH));
00459         case PG_REGEX_LOCALE_WIDE:
00460 #ifdef USE_WIDE_UPPER_LOWER
00461             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00462                 return iswgraph((wint_t) c);
00463 #endif
00464             /* FALL THRU */
00465         case PG_REGEX_LOCALE_1BYTE:
00466             return (c <= (pg_wchar) UCHAR_MAX &&
00467                     isgraph((unsigned char) c));
00468         case PG_REGEX_LOCALE_WIDE_L:
00469 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00470             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00471                 return iswgraph_l((wint_t) c, pg_regex_locale);
00472 #endif
00473             /* FALL THRU */
00474         case PG_REGEX_LOCALE_1BYTE_L:
00475 #ifdef HAVE_LOCALE_T
00476             return (c <= (pg_wchar) UCHAR_MAX &&
00477                     isgraph_l((unsigned char) c, pg_regex_locale));
00478 #endif
00479             break;
00480     }
00481     return 0;                   /* can't get here, but keep compiler quiet */
00482 }
00483 
00484 static int
00485 pg_wc_isprint(pg_wchar c)
00486 {
00487     switch (pg_regex_strategy)
00488     {
00489         case PG_REGEX_LOCALE_C:
00490             return (c <= (pg_wchar) 127 &&
00491                     (pg_char_properties[c] & PG_ISPRINT));
00492         case PG_REGEX_LOCALE_WIDE:
00493 #ifdef USE_WIDE_UPPER_LOWER
00494             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00495                 return iswprint((wint_t) c);
00496 #endif
00497             /* FALL THRU */
00498         case PG_REGEX_LOCALE_1BYTE:
00499             return (c <= (pg_wchar) UCHAR_MAX &&
00500                     isprint((unsigned char) c));
00501         case PG_REGEX_LOCALE_WIDE_L:
00502 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00503             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00504                 return iswprint_l((wint_t) c, pg_regex_locale);
00505 #endif
00506             /* FALL THRU */
00507         case PG_REGEX_LOCALE_1BYTE_L:
00508 #ifdef HAVE_LOCALE_T
00509             return (c <= (pg_wchar) UCHAR_MAX &&
00510                     isprint_l((unsigned char) c, pg_regex_locale));
00511 #endif
00512             break;
00513     }
00514     return 0;                   /* can't get here, but keep compiler quiet */
00515 }
00516 
00517 static int
00518 pg_wc_ispunct(pg_wchar c)
00519 {
00520     switch (pg_regex_strategy)
00521     {
00522         case PG_REGEX_LOCALE_C:
00523             return (c <= (pg_wchar) 127 &&
00524                     (pg_char_properties[c] & PG_ISPUNCT));
00525         case PG_REGEX_LOCALE_WIDE:
00526 #ifdef USE_WIDE_UPPER_LOWER
00527             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00528                 return iswpunct((wint_t) c);
00529 #endif
00530             /* FALL THRU */
00531         case PG_REGEX_LOCALE_1BYTE:
00532             return (c <= (pg_wchar) UCHAR_MAX &&
00533                     ispunct((unsigned char) c));
00534         case PG_REGEX_LOCALE_WIDE_L:
00535 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00536             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00537                 return iswpunct_l((wint_t) c, pg_regex_locale);
00538 #endif
00539             /* FALL THRU */
00540         case PG_REGEX_LOCALE_1BYTE_L:
00541 #ifdef HAVE_LOCALE_T
00542             return (c <= (pg_wchar) UCHAR_MAX &&
00543                     ispunct_l((unsigned char) c, pg_regex_locale));
00544 #endif
00545             break;
00546     }
00547     return 0;                   /* can't get here, but keep compiler quiet */
00548 }
00549 
00550 static int
00551 pg_wc_isspace(pg_wchar c)
00552 {
00553     switch (pg_regex_strategy)
00554     {
00555         case PG_REGEX_LOCALE_C:
00556             return (c <= (pg_wchar) 127 &&
00557                     (pg_char_properties[c] & PG_ISSPACE));
00558         case PG_REGEX_LOCALE_WIDE:
00559 #ifdef USE_WIDE_UPPER_LOWER
00560             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00561                 return iswspace((wint_t) c);
00562 #endif
00563             /* FALL THRU */
00564         case PG_REGEX_LOCALE_1BYTE:
00565             return (c <= (pg_wchar) UCHAR_MAX &&
00566                     isspace((unsigned char) c));
00567         case PG_REGEX_LOCALE_WIDE_L:
00568 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00569             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00570                 return iswspace_l((wint_t) c, pg_regex_locale);
00571 #endif
00572             /* FALL THRU */
00573         case PG_REGEX_LOCALE_1BYTE_L:
00574 #ifdef HAVE_LOCALE_T
00575             return (c <= (pg_wchar) UCHAR_MAX &&
00576                     isspace_l((unsigned char) c, pg_regex_locale));
00577 #endif
00578             break;
00579     }
00580     return 0;                   /* can't get here, but keep compiler quiet */
00581 }
00582 
00583 static pg_wchar
00584 pg_wc_toupper(pg_wchar c)
00585 {
00586     switch (pg_regex_strategy)
00587     {
00588         case PG_REGEX_LOCALE_C:
00589             if (c <= (pg_wchar) 127)
00590                 return pg_ascii_toupper((unsigned char) c);
00591             return c;
00592         case PG_REGEX_LOCALE_WIDE:
00593             /* force C behavior for ASCII characters, per comments above */
00594             if (c <= (pg_wchar) 127)
00595                 return pg_ascii_toupper((unsigned char) c);
00596 #ifdef USE_WIDE_UPPER_LOWER
00597             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00598                 return towupper((wint_t) c);
00599 #endif
00600             /* FALL THRU */
00601         case PG_REGEX_LOCALE_1BYTE:
00602             /* force C behavior for ASCII characters, per comments above */
00603             if (c <= (pg_wchar) 127)
00604                 return pg_ascii_toupper((unsigned char) c);
00605             if (c <= (pg_wchar) UCHAR_MAX)
00606                 return toupper((unsigned char) c);
00607             return c;
00608         case PG_REGEX_LOCALE_WIDE_L:
00609 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00610             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00611                 return towupper_l((wint_t) c, pg_regex_locale);
00612 #endif
00613             /* FALL THRU */
00614         case PG_REGEX_LOCALE_1BYTE_L:
00615 #ifdef HAVE_LOCALE_T
00616             if (c <= (pg_wchar) UCHAR_MAX)
00617                 return toupper_l((unsigned char) c, pg_regex_locale);
00618 #endif
00619             return c;
00620     }
00621     return 0;                   /* can't get here, but keep compiler quiet */
00622 }
00623 
00624 static pg_wchar
00625 pg_wc_tolower(pg_wchar c)
00626 {
00627     switch (pg_regex_strategy)
00628     {
00629         case PG_REGEX_LOCALE_C:
00630             if (c <= (pg_wchar) 127)
00631                 return pg_ascii_tolower((unsigned char) c);
00632             return c;
00633         case PG_REGEX_LOCALE_WIDE:
00634             /* force C behavior for ASCII characters, per comments above */
00635             if (c <= (pg_wchar) 127)
00636                 return pg_ascii_tolower((unsigned char) c);
00637 #ifdef USE_WIDE_UPPER_LOWER
00638             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00639                 return towlower((wint_t) c);
00640 #endif
00641             /* FALL THRU */
00642         case PG_REGEX_LOCALE_1BYTE:
00643             /* force C behavior for ASCII characters, per comments above */
00644             if (c <= (pg_wchar) 127)
00645                 return pg_ascii_tolower((unsigned char) c);
00646             if (c <= (pg_wchar) UCHAR_MAX)
00647                 return tolower((unsigned char) c);
00648             return c;
00649         case PG_REGEX_LOCALE_WIDE_L:
00650 #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
00651             if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
00652                 return towlower_l((wint_t) c, pg_regex_locale);
00653 #endif
00654             /* FALL THRU */
00655         case PG_REGEX_LOCALE_1BYTE_L:
00656 #ifdef HAVE_LOCALE_T
00657             if (c <= (pg_wchar) UCHAR_MAX)
00658                 return tolower_l((unsigned char) c, pg_regex_locale);
00659 #endif
00660             return c;
00661     }
00662     return 0;                   /* can't get here, but keep compiler quiet */
00663 }
00664 
00665 
00666 /*
00667  * These functions cache the results of probing libc's ctype behavior for
00668  * all character codes of interest in a given encoding/collation.  The
00669  * result is provided as a "struct cvec", but notice that the representation
00670  * is a touch different from a cvec created by regc_cvec.c: we allocate the
00671  * chrs[] and ranges[] arrays separately from the struct so that we can
00672  * realloc them larger at need.  This is okay since the cvecs made here
00673  * should never be freed by freecvec().
00674  *
00675  * We use malloc not palloc since we mustn't lose control on out-of-memory;
00676  * the main regex code expects us to return a failure indication instead.
00677  */
00678 
00679 typedef int (*pg_wc_probefunc) (pg_wchar c);
00680 
00681 typedef struct pg_ctype_cache
00682 {
00683     pg_wc_probefunc probefunc;  /* pg_wc_isalpha or a sibling */
00684     Oid         collation;      /* collation this entry is for */
00685     struct cvec cv;             /* cache entry contents */
00686     struct pg_ctype_cache *next;    /* chain link */
00687 } pg_ctype_cache;
00688 
00689 static pg_ctype_cache *pg_ctype_cache_list = NULL;
00690 
00691 /*
00692  * Add a chr or range to pcc->cv; return false if run out of memory
00693  */
00694 static bool
00695 store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
00696 {
00697     chr        *newchrs;
00698 
00699     if (nchrs > 1)
00700     {
00701         if (pcc->cv.nranges >= pcc->cv.rangespace)
00702         {
00703             pcc->cv.rangespace *= 2;
00704             newchrs = (chr *) realloc(pcc->cv.ranges,
00705                                       pcc->cv.rangespace * sizeof(chr) * 2);
00706             if (newchrs == NULL)
00707                 return false;
00708             pcc->cv.ranges = newchrs;
00709         }
00710         pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
00711         pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
00712         pcc->cv.nranges++;
00713     }
00714     else
00715     {
00716         assert(nchrs == 1);
00717         if (pcc->cv.nchrs >= pcc->cv.chrspace)
00718         {
00719             pcc->cv.chrspace *= 2;
00720             newchrs = (chr *) realloc(pcc->cv.chrs,
00721                                       pcc->cv.chrspace * sizeof(chr));
00722             if (newchrs == NULL)
00723                 return false;
00724             pcc->cv.chrs = newchrs;
00725         }
00726         pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
00727     }
00728     return true;
00729 }
00730 
00731 /*
00732  * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
00733  * chrs satisfying the probe function.  The active collation is the one
00734  * previously set by pg_set_regex_collation.  Return NULL if out of memory.
00735  *
00736  * Note that the result must not be freed or modified by caller.
00737  */
00738 static struct cvec *
00739 pg_ctype_get_cache(pg_wc_probefunc probefunc)
00740 {
00741     pg_ctype_cache *pcc;
00742     pg_wchar    max_chr;
00743     pg_wchar    cur_chr;
00744     int         nmatches;
00745     chr        *newchrs;
00746 
00747     /*
00748      * Do we already have the answer cached?
00749      */
00750     for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
00751     {
00752         if (pcc->probefunc == probefunc &&
00753             pcc->collation == pg_regex_collation)
00754             return &pcc->cv;
00755     }
00756 
00757     /*
00758      * Nope, so initialize some workspace ...
00759      */
00760     pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
00761     if (pcc == NULL)
00762         return NULL;
00763     pcc->probefunc = probefunc;
00764     pcc->collation = pg_regex_collation;
00765     pcc->cv.nchrs = 0;
00766     pcc->cv.chrspace = 128;
00767     pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
00768     pcc->cv.nranges = 0;
00769     pcc->cv.rangespace = 64;
00770     pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
00771     if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
00772         goto out_of_memory;
00773 
00774     /*
00775      * Decide how many character codes we ought to look through.  For C locale
00776      * there's no need to go further than 127.  Otherwise, if the encoding is
00777      * UTF8 go up to 0x7FF, which is a pretty arbitrary cutoff but we cannot
00778      * extend it as far as we'd like (say, 0xFFFF, the end of the Basic
00779      * Multilingual Plane) without creating significant performance issues due
00780      * to too many characters being fed through the colormap code.  This will
00781      * need redesign to fix reasonably, but at least for the moment we have
00782      * all common European languages covered.  Otherwise (not C, not UTF8) go
00783      * up to 255.  These limits are interrelated with restrictions discussed
00784      * at the head of this file.
00785      */
00786     switch (pg_regex_strategy)
00787     {
00788         case PG_REGEX_LOCALE_C:
00789             max_chr = (pg_wchar) 127;
00790             break;
00791         case PG_REGEX_LOCALE_WIDE:
00792         case PG_REGEX_LOCALE_WIDE_L:
00793             max_chr = (pg_wchar) 0x7FF;
00794             break;
00795         case PG_REGEX_LOCALE_1BYTE:
00796         case PG_REGEX_LOCALE_1BYTE_L:
00797             max_chr = (pg_wchar) UCHAR_MAX;
00798             break;
00799         default:
00800             max_chr = 0;        /* can't get here, but keep compiler quiet */
00801             break;
00802     }
00803 
00804     /*
00805      * And scan 'em ...
00806      */
00807     nmatches = 0;               /* number of consecutive matches */
00808 
00809     for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
00810     {
00811         if ((*probefunc) (cur_chr))
00812             nmatches++;
00813         else if (nmatches > 0)
00814         {
00815             if (!store_match(pcc, cur_chr - nmatches, nmatches))
00816                 goto out_of_memory;
00817             nmatches = 0;
00818         }
00819     }
00820 
00821     if (nmatches > 0)
00822         if (!store_match(pcc, cur_chr - nmatches, nmatches))
00823             goto out_of_memory;
00824 
00825     /*
00826      * We might have allocated more memory than needed, if so free it
00827      */
00828     if (pcc->cv.nchrs == 0)
00829     {
00830         free(pcc->cv.chrs);
00831         pcc->cv.chrs = NULL;
00832         pcc->cv.chrspace = 0;
00833     }
00834     else if (pcc->cv.nchrs < pcc->cv.chrspace)
00835     {
00836         newchrs = (chr *) realloc(pcc->cv.chrs,
00837                                   pcc->cv.nchrs * sizeof(chr));
00838         if (newchrs == NULL)
00839             goto out_of_memory;
00840         pcc->cv.chrs = newchrs;
00841         pcc->cv.chrspace = pcc->cv.nchrs;
00842     }
00843     if (pcc->cv.nranges == 0)
00844     {
00845         free(pcc->cv.ranges);
00846         pcc->cv.ranges = NULL;
00847         pcc->cv.rangespace = 0;
00848     }
00849     else if (pcc->cv.nranges < pcc->cv.rangespace)
00850     {
00851         newchrs = (chr *) realloc(pcc->cv.ranges,
00852                                   pcc->cv.nranges * sizeof(chr) * 2);
00853         if (newchrs == NULL)
00854             goto out_of_memory;
00855         pcc->cv.ranges = newchrs;
00856         pcc->cv.rangespace = pcc->cv.nranges;
00857     }
00858 
00859     /*
00860      * Success, link it into cache chain
00861      */
00862     pcc->next = pg_ctype_cache_list;
00863     pg_ctype_cache_list = pcc;
00864 
00865     return &pcc->cv;
00866 
00867     /*
00868      * Failure, clean up
00869      */
00870 out_of_memory:
00871     if (pcc->cv.chrs)
00872         free(pcc->cv.chrs);
00873     if (pcc->cv.ranges)
00874         free(pcc->cv.ranges);
00875     free(pcc);
00876 
00877     return NULL;
00878 }