Header And Logo

PostgreSQL
| The world's most advanced open source database.

chklocale.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * chklocale.c
00004  *      Functions for handling locale-related info
00005  *
00006  *
00007  * Copyright (c) 1996-2013, PostgreSQL Global Development Group
00008  *
00009  *
00010  * IDENTIFICATION
00011  *    src/port/chklocale.c
00012  *
00013  *-------------------------------------------------------------------------
00014  */
00015 
00016 #ifndef FRONTEND
00017 #include "postgres.h"
00018 #else
00019 #include "postgres_fe.h"
00020 #endif
00021 
00022 #include <locale.h>
00023 #ifdef HAVE_LANGINFO_H
00024 #include <langinfo.h>
00025 #endif
00026 
00027 #include "mb/pg_wchar.h"
00028 
00029 
00030 /*
00031  * This table needs to recognize all the CODESET spellings for supported
00032  * backend encodings, as well as frontend-only encodings where possible
00033  * (the latter case is currently only needed for initdb to recognize
00034  * error situations).  On Windows, we rely on entries for codepage
00035  * numbers (CPnnn).
00036  *
00037  * Note that we search the table with pg_strcasecmp(), so variant
00038  * capitalizations don't need their own entries.
00039  */
00040 struct encoding_match
00041 {
00042     enum pg_enc pg_enc_code;
00043     const char *system_enc_name;
00044 };
00045 
00046 static const struct encoding_match encoding_match_list[] = {
00047     {PG_EUC_JP, "EUC-JP"},
00048     {PG_EUC_JP, "eucJP"},
00049     {PG_EUC_JP, "IBM-eucJP"},
00050     {PG_EUC_JP, "sdeckanji"},
00051     {PG_EUC_JP, "CP20932"},
00052 
00053     {PG_EUC_CN, "EUC-CN"},
00054     {PG_EUC_CN, "eucCN"},
00055     {PG_EUC_CN, "IBM-eucCN"},
00056     {PG_EUC_CN, "GB2312"},
00057     {PG_EUC_CN, "dechanzi"},
00058     {PG_EUC_CN, "CP20936"},
00059 
00060     {PG_EUC_KR, "EUC-KR"},
00061     {PG_EUC_KR, "eucKR"},
00062     {PG_EUC_KR, "IBM-eucKR"},
00063     {PG_EUC_KR, "deckorean"},
00064     {PG_EUC_KR, "5601"},
00065     {PG_EUC_KR, "CP51949"},
00066 
00067     {PG_EUC_TW, "EUC-TW"},
00068     {PG_EUC_TW, "eucTW"},
00069     {PG_EUC_TW, "IBM-eucTW"},
00070     {PG_EUC_TW, "cns11643"},
00071     /* No codepage for EUC-TW ? */
00072 
00073     {PG_UTF8, "UTF-8"},
00074     {PG_UTF8, "utf8"},
00075     {PG_UTF8, "CP65001"},
00076 
00077     {PG_LATIN1, "ISO-8859-1"},
00078     {PG_LATIN1, "ISO8859-1"},
00079     {PG_LATIN1, "iso88591"},
00080     {PG_LATIN1, "CP28591"},
00081 
00082     {PG_LATIN2, "ISO-8859-2"},
00083     {PG_LATIN2, "ISO8859-2"},
00084     {PG_LATIN2, "iso88592"},
00085     {PG_LATIN2, "CP28592"},
00086 
00087     {PG_LATIN3, "ISO-8859-3"},
00088     {PG_LATIN3, "ISO8859-3"},
00089     {PG_LATIN3, "iso88593"},
00090     {PG_LATIN3, "CP28593"},
00091 
00092     {PG_LATIN4, "ISO-8859-4"},
00093     {PG_LATIN4, "ISO8859-4"},
00094     {PG_LATIN4, "iso88594"},
00095     {PG_LATIN4, "CP28594"},
00096 
00097     {PG_LATIN5, "ISO-8859-9"},
00098     {PG_LATIN5, "ISO8859-9"},
00099     {PG_LATIN5, "iso88599"},
00100     {PG_LATIN5, "CP28599"},
00101 
00102     {PG_LATIN6, "ISO-8859-10"},
00103     {PG_LATIN6, "ISO8859-10"},
00104     {PG_LATIN6, "iso885910"},
00105 
00106     {PG_LATIN7, "ISO-8859-13"},
00107     {PG_LATIN7, "ISO8859-13"},
00108     {PG_LATIN7, "iso885913"},
00109 
00110     {PG_LATIN8, "ISO-8859-14"},
00111     {PG_LATIN8, "ISO8859-14"},
00112     {PG_LATIN8, "iso885914"},
00113 
00114     {PG_LATIN9, "ISO-8859-15"},
00115     {PG_LATIN9, "ISO8859-15"},
00116     {PG_LATIN9, "iso885915"},
00117     {PG_LATIN9, "CP28605"},
00118 
00119     {PG_LATIN10, "ISO-8859-16"},
00120     {PG_LATIN10, "ISO8859-16"},
00121     {PG_LATIN10, "iso885916"},
00122 
00123     {PG_KOI8R, "KOI8-R"},
00124     {PG_KOI8R, "CP20866"},
00125 
00126     {PG_KOI8U, "KOI8-U"},
00127     {PG_KOI8U, "CP21866"},
00128 
00129     {PG_WIN866, "CP866"},
00130     {PG_WIN874, "CP874"},
00131     {PG_WIN1250, "CP1250"},
00132     {PG_WIN1251, "CP1251"},
00133     {PG_WIN1251, "ansi-1251"},
00134     {PG_WIN1252, "CP1252"},
00135     {PG_WIN1253, "CP1253"},
00136     {PG_WIN1254, "CP1254"},
00137     {PG_WIN1255, "CP1255"},
00138     {PG_WIN1256, "CP1256"},
00139     {PG_WIN1257, "CP1257"},
00140     {PG_WIN1258, "CP1258"},
00141 
00142     {PG_ISO_8859_5, "ISO-8859-5"},
00143     {PG_ISO_8859_5, "ISO8859-5"},
00144     {PG_ISO_8859_5, "iso88595"},
00145     {PG_ISO_8859_5, "CP28595"},
00146 
00147     {PG_ISO_8859_6, "ISO-8859-6"},
00148     {PG_ISO_8859_6, "ISO8859-6"},
00149     {PG_ISO_8859_6, "iso88596"},
00150     {PG_ISO_8859_6, "CP28596"},
00151 
00152     {PG_ISO_8859_7, "ISO-8859-7"},
00153     {PG_ISO_8859_7, "ISO8859-7"},
00154     {PG_ISO_8859_7, "iso88597"},
00155     {PG_ISO_8859_7, "CP28597"},
00156 
00157     {PG_ISO_8859_8, "ISO-8859-8"},
00158     {PG_ISO_8859_8, "ISO8859-8"},
00159     {PG_ISO_8859_8, "iso88598"},
00160     {PG_ISO_8859_8, "CP28598"},
00161 
00162     {PG_SJIS, "SJIS"},
00163     {PG_SJIS, "PCK"},
00164     {PG_SJIS, "CP932"},
00165 
00166     {PG_BIG5, "BIG5"},
00167     {PG_BIG5, "BIG5HKSCS"},
00168     {PG_BIG5, "Big5-HKSCS"},
00169     {PG_BIG5, "CP950"},
00170 
00171     {PG_GBK, "GBK"},
00172     {PG_GBK, "CP936"},
00173 
00174     {PG_UHC, "UHC"},
00175     {PG_UHC, "CP949"},
00176 
00177     {PG_JOHAB, "JOHAB"},
00178     {PG_JOHAB, "CP1361"},
00179 
00180     {PG_GB18030, "GB18030"},
00181     {PG_GB18030, "CP54936"},
00182 
00183     {PG_SHIFT_JIS_2004, "SJIS_2004"},
00184 
00185     {PG_SQL_ASCII, "US-ASCII"},
00186 
00187     {PG_SQL_ASCII, NULL}        /* end marker */
00188 };
00189 
00190 #ifdef WIN32
/*
 * win32_langinfo
 *      Windows replacement for nl_langinfo(CODESET): build the string
 *      "CP<code page number>" for the given LC_CTYPE locale name.
 *
 * With Visual Studio 2012 and later, the set of valid LC_CTYPE values was
 * expanded, so we let the compiler's own locale machinery tell us the code
 * page.  See comments at IsoLocaleName().  For any other compiler we rely
 * on the locale name having the predictable shape
 * <Language>_<Country>.<CodePage>, e.g. "English_United States.1252", and
 * take whatever follows the last '.'.
 *
 * The result is malloc()'d and must be freed by the caller.  NULL is
 * returned if the locale cannot be created (newer MSVC), if the name
 * contains no '.' (other compilers), or if malloc() fails.
 */
static char *
win32_langinfo(const char *ctype)
{
    char       *result = NULL;

#if (_MSC_VER >= 1700)
    _locale_t   locale_obj = _create_locale(LC_CTYPE, ctype);

    if (locale_obj == NULL)
        return NULL;

    result = malloc(16);        /* excess */
    if (result != NULL)
        sprintf(result, "CP%u", locale_obj->locinfo->lc_codepage);
    _free_locale(locale_obj);
#else
    const char *last_dot = strrchr(ctype, '.');
    const char *suffix;
    int         suffix_len;

    /* No '.' means there is no code page part to extract */
    if (last_dot == NULL)
        return NULL;

    suffix = last_dot + 1;
    suffix_len = strlen(suffix);

    /* room for "CP", the suffix itself, and the terminating NUL */
    result = malloc(suffix_len + 3);
    if (result != NULL)
    {
        memcpy(result, "CP", 2);
        memcpy(result + 2, suffix, suffix_len + 1);     /* copies the NUL too */
    }
#endif

    return result;
}
00238 #endif   /* WIN32 */
00239 
00240 #if (defined(HAVE_LANGINFO_H) && defined(CODESET)) || defined(WIN32)
00241 
00242 /*
00243  * Given a setting for LC_CTYPE, return the Postgres ID of the associated
00244  * encoding, if we can determine it.  Return -1 if we can't determine it.
00245  *
00246  * Pass in NULL to get the encoding for the current locale setting.
00247  * Pass "" to get the encoding selected by the server's environment.
00248  *
00249  * If the result is PG_SQL_ASCII, callers should treat it as being compatible
00250  * with any desired encoding.
00251  */
00252 int
00253 pg_get_encoding_from_locale(const char *ctype, bool write_message)
00254 {
00255     char       *sys;
00256     int         i;
00257 
00258     /* Get the CODESET property, and also LC_CTYPE if not passed in */
00259     if (ctype)
00260     {
00261         char       *save;
00262         char       *name;
00263 
00264         /* If locale is C or POSIX, we can allow all encodings */
00265         if (pg_strcasecmp(ctype, "C") == 0 ||
00266             pg_strcasecmp(ctype, "POSIX") == 0)
00267             return PG_SQL_ASCII;
00268 
00269         save = setlocale(LC_CTYPE, NULL);
00270         if (!save)
00271             return -1;          /* setlocale() broken? */
00272         /* must copy result, or it might change after setlocale */
00273         save = strdup(save);
00274         if (!save)
00275             return -1;          /* out of memory; unlikely */
00276 
00277         name = setlocale(LC_CTYPE, ctype);
00278         if (!name)
00279         {
00280             free(save);
00281             return -1;          /* bogus ctype passed in? */
00282         }
00283 
00284 #ifndef WIN32
00285         sys = nl_langinfo(CODESET);
00286         if (sys)
00287             sys = strdup(sys);
00288 #else
00289         sys = win32_langinfo(name);
00290 #endif
00291 
00292         setlocale(LC_CTYPE, save);
00293         free(save);
00294     }
00295     else
00296     {
00297         /* much easier... */
00298         ctype = setlocale(LC_CTYPE, NULL);
00299         if (!ctype)
00300             return -1;          /* setlocale() broken? */
00301 
00302         /* If locale is C or POSIX, we can allow all encodings */
00303         if (pg_strcasecmp(ctype, "C") == 0 ||
00304             pg_strcasecmp(ctype, "POSIX") == 0)
00305             return PG_SQL_ASCII;
00306 
00307 #ifndef WIN32
00308         sys = nl_langinfo(CODESET);
00309         if (sys)
00310             sys = strdup(sys);
00311 #else
00312         sys = win32_langinfo(ctype);
00313 #endif
00314     }
00315 
00316     if (!sys)
00317         return -1;              /* out of memory; unlikely */
00318 
00319     /* Check the table */
00320     for (i = 0; encoding_match_list[i].system_enc_name; i++)
00321     {
00322         if (pg_strcasecmp(sys, encoding_match_list[i].system_enc_name) == 0)
00323         {
00324             free(sys);
00325             return encoding_match_list[i].pg_enc_code;
00326         }
00327     }
00328 
00329     /* Special-case kluges for particular platforms go here */
00330 
00331 #ifdef __darwin__
00332 
00333     /*
00334      * Current OS X has many locales that report an empty string for CODESET,
00335      * but they all seem to actually use UTF-8.
00336      */
00337     if (strlen(sys) == 0)
00338     {
00339         free(sys);
00340         return PG_UTF8;
00341     }
00342 #endif
00343 
00344     /*
00345      * We print a warning if we got a CODESET string but couldn't recognize
00346      * it.  This means we need another entry in the table.
00347      */
00348     if (write_message)
00349     {
00350 #ifdef FRONTEND
00351         fprintf(stderr, _("could not determine encoding for locale \"%s\": codeset is \"%s\""),
00352                 ctype, sys);
00353         /* keep newline separate so there's only one translatable string */
00354         fputc('\n', stderr);
00355 #else
00356         ereport(WARNING,
00357                 (errmsg("could not determine encoding for locale \"%s\": codeset is \"%s\"",
00358                         ctype, sys),
00359            errdetail("Please report this to <[email protected]>.")));
00360 #endif
00361     }
00362 
00363     free(sys);
00364     return -1;
00365 }
00366 #else                           /* (HAVE_LANGINFO_H && CODESET) || WIN32 */
00367 
00368 /*
00369  * stub if no multi-language platform support
00370  *
00371  * Note: we could return -1 here, but that would have the effect of
00372  * forcing users to specify an encoding to initdb on such platforms.
00373  * It seems better to silently default to SQL_ASCII.
00374  */
00375 int
00376 pg_get_encoding_from_locale(const char *ctype, bool write_message)
00377 {
00378     return PG_SQL_ASCII;
00379 }
00380 
00381 #endif   /* (HAVE_LANGINFO_H && CODESET) || WIN32 */