Header And Logo

PostgreSQL
| The world's most advanced open source database.

encnames.c

Go to the documentation of this file.
00001 /*
00002  * Encoding names and routines for working with them. Everything
00003  * in this file is shared between the frontend (FE) and backend (BE).
00004  *
00005  * src/backend/utils/mb/encnames.c
00006  */
00007 #ifdef FRONTEND
00008 #include "postgres_fe.h"
00009 #else
00010 #include "postgres.h"
00011 #include "utils/builtins.h"
00012 #endif
00013 
00014 #include <ctype.h>
00015 #include <unistd.h>
00016 
00017 #include "mb/pg_wchar.h"
00018 
00019 
00020 /* ----------
00021  * All encoding names, sorted:       *** A L P H A B E T I C ***
00022  *
00023  * All names must be without irrelevant chars, search routines use
00024  * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
00025  * are always converted to 'iso88591'. All must be lower case.
00026  *
00027  * The table doesn't contain 'cs' aliases (like csISOLatin1). It's needed?
00028  *
00029  * Karel Zak, Aug 2001
00030  * ----------
00031  */
00032 pg_encname  pg_encname_tbl[] =
00033 {
00034     {
00035         "abc", PG_WIN1258
00036     },                          /* alias for WIN1258 */
00037     {
00038         "alt", PG_WIN866
00039     },                          /* IBM866 */
00040     {
00041         "big5", PG_BIG5
00042     },                          /* Big5; Chinese for Taiwan multibyte set */
00043     {
00044         "euccn", PG_EUC_CN
00045     },                          /* EUC-CN; Extended Unix Code for simplified
00046                                  * Chinese */
00047     {
00048         "eucjis2004", PG_EUC_JIS_2004
00049     },                          /* EUC-JIS-2004; Extended UNIX Code fixed
00050                                  * Width for Japanese, standard JIS X 0213 */
00051     {
00052         "eucjp", PG_EUC_JP
00053     },                          /* EUC-JP; Extended UNIX Code fixed Width for
00054                                  * Japanese, standard OSF */
00055     {
00056         "euckr", PG_EUC_KR
00057     },                          /* EUC-KR; Extended Unix Code for Korean , KS
00058                                  * X 1001 standard */
00059     {
00060         "euctw", PG_EUC_TW
00061     },                          /* EUC-TW; Extended Unix Code for
00062                                  *
00063                                  * traditional Chinese */
00064     {
00065         "gb18030", PG_GB18030
00066     },                          /* GB18030;GB18030 */
00067     {
00068         "gbk", PG_GBK
00069     },                          /* GBK; Chinese Windows CodePage 936
00070                                  * simplified Chinese */
00071     {
00072         "iso88591", PG_LATIN1
00073     },                          /* ISO-8859-1; RFC1345,KXS2 */
00074     {
00075         "iso885910", PG_LATIN6
00076     },                          /* ISO-8859-10; RFC1345,KXS2 */
00077     {
00078         "iso885913", PG_LATIN7
00079     },                          /* ISO-8859-13; RFC1345,KXS2 */
00080     {
00081         "iso885914", PG_LATIN8
00082     },                          /* ISO-8859-14; RFC1345,KXS2 */
00083     {
00084         "iso885915", PG_LATIN9
00085     },                          /* ISO-8859-15; RFC1345,KXS2 */
00086     {
00087         "iso885916", PG_LATIN10
00088     },                          /* ISO-8859-16; RFC1345,KXS2 */
00089     {
00090         "iso88592", PG_LATIN2
00091     },                          /* ISO-8859-2; RFC1345,KXS2 */
00092     {
00093         "iso88593", PG_LATIN3
00094     },                          /* ISO-8859-3; RFC1345,KXS2 */
00095     {
00096         "iso88594", PG_LATIN4
00097     },                          /* ISO-8859-4; RFC1345,KXS2 */
00098     {
00099         "iso88595", PG_ISO_8859_5
00100     },                          /* ISO-8859-5; RFC1345,KXS2 */
00101     {
00102         "iso88596", PG_ISO_8859_6
00103     },                          /* ISO-8859-6; RFC1345,KXS2 */
00104     {
00105         "iso88597", PG_ISO_8859_7
00106     },                          /* ISO-8859-7; RFC1345,KXS2 */
00107     {
00108         "iso88598", PG_ISO_8859_8
00109     },                          /* ISO-8859-8; RFC1345,KXS2 */
00110     {
00111         "iso88599", PG_LATIN5
00112     },                          /* ISO-8859-9; RFC1345,KXS2 */
00113     {
00114         "johab", PG_JOHAB
00115     },                          /* JOHAB; Extended Unix Code for simplified
00116                                  * Chinese */
00117     {
00118         "koi8", PG_KOI8R
00119     },                          /* _dirty_ alias for KOI8-R (backward
00120                                  * compatibility) */
00121     {
00122         "koi8r", PG_KOI8R
00123     },                          /* KOI8-R; RFC1489 */
00124     {
00125         "koi8u", PG_KOI8U
00126     },                          /* KOI8-U; RFC2319 */
00127     {
00128         "latin1", PG_LATIN1
00129     },                          /* alias for ISO-8859-1 */
00130     {
00131         "latin10", PG_LATIN10
00132     },                          /* alias for ISO-8859-16 */
00133     {
00134         "latin2", PG_LATIN2
00135     },                          /* alias for ISO-8859-2 */
00136     {
00137         "latin3", PG_LATIN3
00138     },                          /* alias for ISO-8859-3 */
00139     {
00140         "latin4", PG_LATIN4
00141     },                          /* alias for ISO-8859-4 */
00142     {
00143         "latin5", PG_LATIN5
00144     },                          /* alias for ISO-8859-9 */
00145     {
00146         "latin6", PG_LATIN6
00147     },                          /* alias for ISO-8859-10 */
00148     {
00149         "latin7", PG_LATIN7
00150     },                          /* alias for ISO-8859-13 */
00151     {
00152         "latin8", PG_LATIN8
00153     },                          /* alias for ISO-8859-14 */
00154     {
00155         "latin9", PG_LATIN9
00156     },                          /* alias for ISO-8859-15 */
00157     {
00158         "mskanji", PG_SJIS
00159     },                          /* alias for Shift_JIS */
00160     {
00161         "muleinternal", PG_MULE_INTERNAL
00162     },
00163     {
00164         "shiftjis", PG_SJIS
00165     },                          /* Shift_JIS; JIS X 0202-1991 */
00166 
00167     {
00168         "shiftjis2004", PG_SHIFT_JIS_2004
00169     },                          /* SHIFT-JIS-2004; Shift JIS for Japanese,
00170                                  * standard JIS X 0213 */
00171     {
00172         "sjis", PG_SJIS
00173     },                          /* alias for Shift_JIS */
00174     {
00175         "sqlascii", PG_SQL_ASCII
00176     },
00177     {
00178         "tcvn", PG_WIN1258
00179     },                          /* alias for WIN1258 */
00180     {
00181         "tcvn5712", PG_WIN1258
00182     },                          /* alias for WIN1258 */
00183     {
00184         "uhc", PG_UHC
00185     },                          /* UHC; Korean Windows CodePage 949 */
00186     {
00187         "unicode", PG_UTF8
00188     },                          /* alias for UTF8 */
00189     {
00190         "utf8", PG_UTF8
00191     },                          /* alias for UTF8 */
00192     {
00193         "vscii", PG_WIN1258
00194     },                          /* alias for WIN1258 */
00195     {
00196         "win", PG_WIN1251
00197     },                          /* _dirty_ alias for windows-1251 (backward
00198                                  * compatibility) */
00199     {
00200         "win1250", PG_WIN1250
00201     },                          /* alias for Windows-1250 */
00202     {
00203         "win1251", PG_WIN1251
00204     },                          /* alias for Windows-1251 */
00205     {
00206         "win1252", PG_WIN1252
00207     },                          /* alias for Windows-1252 */
00208     {
00209         "win1253", PG_WIN1253
00210     },                          /* alias for Windows-1253 */
00211     {
00212         "win1254", PG_WIN1254
00213     },                          /* alias for Windows-1254 */
00214     {
00215         "win1255", PG_WIN1255
00216     },                          /* alias for Windows-1255 */
00217     {
00218         "win1256", PG_WIN1256
00219     },                          /* alias for Windows-1256 */
00220     {
00221         "win1257", PG_WIN1257
00222     },                          /* alias for Windows-1257 */
00223     {
00224         "win1258", PG_WIN1258
00225     },                          /* alias for Windows-1258 */
00226     {
00227         "win866", PG_WIN866
00228     },                          /* IBM866 */
00229     {
00230         "win874", PG_WIN874
00231     },                          /* alias for Windows-874 */
00232     {
00233         "win932", PG_SJIS
00234     },                          /* alias for Shift_JIS */
00235     {
00236         "win936", PG_GBK
00237     },                          /* alias for GBK */
00238     {
00239         "win949", PG_UHC
00240     },                          /* alias for UHC */
00241     {
00242         "win950", PG_BIG5
00243     },                          /* alias for BIG5 */
00244     {
00245         "windows1250", PG_WIN1250
00246     },                          /* Windows-1251; Microsoft */
00247     {
00248         "windows1251", PG_WIN1251
00249     },                          /* Windows-1251; Microsoft */
00250     {
00251         "windows1252", PG_WIN1252
00252     },                          /* Windows-1252; Microsoft */
00253     {
00254         "windows1253", PG_WIN1253
00255     },                          /* Windows-1253; Microsoft */
00256     {
00257         "windows1254", PG_WIN1254
00258     },                          /* Windows-1254; Microsoft */
00259     {
00260         "windows1255", PG_WIN1255
00261     },                          /* Windows-1255; Microsoft */
00262     {
00263         "windows1256", PG_WIN1256
00264     },                          /* Windows-1256; Microsoft */
00265     {
00266         "windows1257", PG_WIN1257
00267     },                          /* Windows-1257; Microsoft */
00268     {
00269         "windows1258", PG_WIN1258
00270     },                          /* Windows-1258; Microsoft */
00271     {
00272         "windows866", PG_WIN866
00273     },                          /* IBM866 */
00274     {
00275         "windows874", PG_WIN874
00276     },                          /* Windows-874; Microsoft */
00277     {
00278         "windows932", PG_SJIS
00279     },                          /* alias for Shift_JIS */
00280     {
00281         "windows936", PG_GBK
00282     },                          /* alias for GBK */
00283     {
00284         "windows949", PG_UHC
00285     },                          /* alias for UHC */
00286     {
00287         "windows950", PG_BIG5
00288     },                          /* alias for BIG5 */
00289     {
00290         NULL, 0
00291     }                           /* last */
00292 };
00293 
00294 unsigned int pg_encname_tbl_sz = \
00295 sizeof(pg_encname_tbl) / sizeof(pg_encname_tbl[0]) - 1;
00296 
00297 /* ----------
00298  * These are "official" encoding names.
00299  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
00300  * ----------
00301  */
00302 #ifndef WIN32
00303 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
00304 #else
00305 #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
00306 #endif
00307 pg_enc2name pg_enc2name_tbl[] =
00308 {
00309     DEF_ENC2NAME(SQL_ASCII, 0),
00310     DEF_ENC2NAME(EUC_JP, 20932),
00311     DEF_ENC2NAME(EUC_CN, 20936),
00312     DEF_ENC2NAME(EUC_KR, 51949),
00313     DEF_ENC2NAME(EUC_TW, 0),
00314     DEF_ENC2NAME(EUC_JIS_2004, 20932),
00315     DEF_ENC2NAME(UTF8, 65001),
00316     DEF_ENC2NAME(MULE_INTERNAL, 0),
00317     DEF_ENC2NAME(LATIN1, 28591),
00318     DEF_ENC2NAME(LATIN2, 28592),
00319     DEF_ENC2NAME(LATIN3, 28593),
00320     DEF_ENC2NAME(LATIN4, 28594),
00321     DEF_ENC2NAME(LATIN5, 28599),
00322     DEF_ENC2NAME(LATIN6, 0),
00323     DEF_ENC2NAME(LATIN7, 0),
00324     DEF_ENC2NAME(LATIN8, 0),
00325     DEF_ENC2NAME(LATIN9, 28605),
00326     DEF_ENC2NAME(LATIN10, 0),
00327     DEF_ENC2NAME(WIN1256, 1256),
00328     DEF_ENC2NAME(WIN1258, 1258),
00329     DEF_ENC2NAME(WIN866, 866),
00330     DEF_ENC2NAME(WIN874, 874),
00331     DEF_ENC2NAME(KOI8R, 20866),
00332     DEF_ENC2NAME(WIN1251, 1251),
00333     DEF_ENC2NAME(WIN1252, 1252),
00334     DEF_ENC2NAME(ISO_8859_5, 28595),
00335     DEF_ENC2NAME(ISO_8859_6, 28596),
00336     DEF_ENC2NAME(ISO_8859_7, 28597),
00337     DEF_ENC2NAME(ISO_8859_8, 28598),
00338     DEF_ENC2NAME(WIN1250, 1250),
00339     DEF_ENC2NAME(WIN1253, 1253),
00340     DEF_ENC2NAME(WIN1254, 1254),
00341     DEF_ENC2NAME(WIN1255, 1255),
00342     DEF_ENC2NAME(WIN1257, 1257),
00343     DEF_ENC2NAME(KOI8U, 21866),
00344     DEF_ENC2NAME(SJIS, 932),
00345     DEF_ENC2NAME(BIG5, 950),
00346     DEF_ENC2NAME(GBK, 936),
00347     DEF_ENC2NAME(UHC, 0),
00348     DEF_ENC2NAME(GB18030, 54936),
00349     DEF_ENC2NAME(JOHAB, 0),
00350     DEF_ENC2NAME(SHIFT_JIS_2004, 932)
00351 };
00352 
00353 /* ----------
00354  * These are encoding names for gettext.
00355  * ----------
00356  */
00357 pg_enc2gettext pg_enc2gettext_tbl[] =
00358 {
00359     {PG_UTF8, "UTF-8"},
00360     {PG_LATIN1, "LATIN1"},
00361     {PG_LATIN2, "LATIN2"},
00362     {PG_LATIN3, "LATIN3"},
00363     {PG_LATIN4, "LATIN4"},
00364     {PG_ISO_8859_5, "ISO-8859-5"},
00365     {PG_ISO_8859_6, "ISO_8859-6"},
00366     {PG_ISO_8859_7, "ISO-8859-7"},
00367     {PG_ISO_8859_8, "ISO-8859-8"},
00368     {PG_LATIN5, "LATIN5"},
00369     {PG_LATIN6, "LATIN6"},
00370     {PG_LATIN7, "LATIN7"},
00371     {PG_LATIN8, "LATIN8"},
00372     {PG_LATIN9, "LATIN-9"},
00373     {PG_LATIN10, "LATIN10"},
00374     {PG_KOI8R, "KOI8-R"},
00375     {PG_KOI8U, "KOI8-U"},
00376     {PG_WIN1250, "CP1250"},
00377     {PG_WIN1251, "CP1251"},
00378     {PG_WIN1252, "CP1252"},
00379     {PG_WIN1253, "CP1253"},
00380     {PG_WIN1254, "CP1254"},
00381     {PG_WIN1255, "CP1255"},
00382     {PG_WIN1256, "CP1256"},
00383     {PG_WIN1257, "CP1257"},
00384     {PG_WIN1258, "CP1258"},
00385     {PG_WIN866, "CP866"},
00386     {PG_WIN874, "CP874"},
00387     {PG_EUC_CN, "EUC-CN"},
00388     {PG_EUC_JP, "EUC-JP"},
00389     {PG_EUC_KR, "EUC-KR"},
00390     {PG_EUC_TW, "EUC-TW"},
00391     {PG_EUC_JIS_2004, "EUC-JP"},
00392     {0, NULL}
00393 };
00394 
00395 
/* ----------
 * Check that "name" denotes an encoding usable on the client (frontend)
 * side.
 *
 * Returns the encoding id on success.  Returns -1 if the name is not
 * recognized by pg_char_to_encoding() or if the resulting id fails
 * PG_VALID_FE_ENCODING().
 * ----------
 */
int
pg_valid_client_encoding(const char *name)
{
    int         encoding_id = pg_char_to_encoding(name);

    /* the validity macro is only consulted for ids that were found */
    if (encoding_id < 0 || !PG_VALID_FE_ENCODING(encoding_id))
        return -1;

    return encoding_id;
}
00413 
/*
 * Check that "name" denotes an encoding usable on the server (backend)
 * side.
 *
 * Returns the encoding id on success.  Returns -1 if the name is not
 * recognized by pg_char_to_encoding() or if the resulting id fails
 * PG_VALID_BE_ENCODING().
 */
int
pg_valid_server_encoding(const char *name)
{
    int         encoding_id = pg_char_to_encoding(name);

    /* the validity macro is only consulted for ids that were found */
    if (encoding_id < 0 || !PG_VALID_BE_ENCODING(encoding_id))
        return -1;

    return encoding_id;
}
00427 
/*
 * Test a numeric encoding id with PG_VALID_BE_ENCODING().
 *
 * Unlike the name-based checks, this returns the macro's result as-is
 * (presumably a truth value -- see mb/pg_wchar.h), not an encoding id
 * or -1.
 */
int
pg_valid_server_encoding_id(int encoding)
{
    int         is_valid = PG_VALID_BE_ENCODING(encoding);

    return is_valid;
}
00433 
/* ----------
 * Normalize an encoding name for table lookup.
 *
 * Copies "key" into "newkey", keeping only characters for which
 * isalnum() is true and folding ASCII 'A'..'Z' to lower case.  All other
 * characters (dashes, underscores, blanks, ...) are dropped.
 *
 * "newkey" must have room for at least strlen(key) + 1 bytes; no bounds
 * checking is done here.  Returns newkey.
 * ----------
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
    char       *out = newkey;
    const char *in;

    for (in = key; *in != '\0'; in++)
    {
        char        ch = *in;

        /* cast keeps negative char values away from <ctype.h> */
        if (!isalnum((unsigned char) ch))
            continue;

        /* explicit ASCII range: case folding must not depend on locale */
        if (ch >= 'A' && ch <= 'Z')
            ch += 'a' - 'A';

        *out++ = ch;
    }
    *out = '\0';

    return newkey;
}
00457 
00458 /* ----------
00459  * Search encoding by encoding name
00460  * ----------
00461  */
00462 pg_encname *
00463 pg_char_to_encname_struct(const char *name)
00464 {
00465     unsigned int nel = pg_encname_tbl_sz;
00466     pg_encname *base = pg_encname_tbl,
00467                *last = base + nel - 1,
00468                *position;
00469     int         result;
00470     char        buff[NAMEDATALEN],
00471                *key;
00472 
00473     if (name == NULL || *name == '\0')
00474         return NULL;
00475 
00476     if (strlen(name) >= NAMEDATALEN)
00477     {
00478 #ifdef FRONTEND
00479         fprintf(stderr, "encoding name too long\n");
00480         return NULL;
00481 #else
00482         ereport(ERROR,
00483                 (errcode(ERRCODE_NAME_TOO_LONG),
00484                  errmsg("encoding name too long")));
00485 #endif
00486     }
00487     key = clean_encoding_name(name, buff);
00488 
00489     while (last >= base)
00490     {
00491         position = base + ((last - base) >> 1);
00492         result = key[0] - position->name[0];
00493 
00494         if (result == 0)
00495         {
00496             result = strcmp(key, position->name);
00497             if (result == 0)
00498                 return position;
00499         }
00500         if (result < 0)
00501             last = position - 1;
00502         else
00503             base = position + 1;
00504     }
00505     return NULL;
00506 }
00507 
00508 /*
00509  * Returns encoding or -1 for error
00510  */
00511 int
00512 pg_char_to_encoding(const char *name)
00513 {
00514     pg_encname *p;
00515 
00516     if (!name)
00517         return -1;
00518 
00519     p = pg_char_to_encname_struct(name);
00520     return p ? p->encoding : -1;
00521 }
00522 
00523 #ifndef FRONTEND
00524 Datum
00525 PG_char_to_encoding(PG_FUNCTION_ARGS)
00526 {
00527     Name        s = PG_GETARG_NAME(0);
00528 
00529     PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
00530 }
00531 #endif
00532 
00533 const char *
00534 pg_encoding_to_char(int encoding)
00535 {
00536     if (PG_VALID_ENCODING(encoding))
00537     {
00538         pg_enc2name *p = &pg_enc2name_tbl[encoding];
00539 
00540         Assert(encoding == p->encoding);
00541         return p->name;
00542     }
00543     return "";
00544 }
00545 
00546 #ifndef FRONTEND
00547 Datum
00548 PG_encoding_to_char(PG_FUNCTION_ARGS)
00549 {
00550     int32       encoding = PG_GETARG_INT32(0);
00551     const char *encoding_name = pg_encoding_to_char(encoding);
00552 
00553     return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
00554 }
00555 
00556 #endif