Header And Logo

PostgreSQL
| The world's most advanced open source database.

mbutils.c

Go to the documentation of this file.
00001 /*
00002  * This file contains public functions for conversion between
00003  * client encoding and server (database) encoding.
00004  *
00005  * Tatsuo Ishii
00006  *
00007  * src/backend/utils/mb/mbutils.c
00008  */
00009 #include "postgres.h"
00010 
00011 #include "access/xact.h"
00012 #include "catalog/namespace.h"
00013 #include "mb/pg_wchar.h"
00014 #include "utils/builtins.h"
00015 #include "utils/memutils.h"
00016 #include "utils/syscache.h"
00017 
00018 /*
00019  * When converting strings between different encodings, we assume that space
00020  * for converted result is 4-to-1 growth in the worst case. The growth rate
00021  * for currently supported encoding pairs is within 3 (SJIS JIS X0201 half-width
00022  * kana -> UTF8 is the worst case).  So "4" should be enough for the moment.
00023  *
00024  * Note that this is not the same as the maximum character width in any
00025  * particular encoding.
00026  */
00027 #define MAX_CONVERSION_GROWTH  4
00028 
00029 /*
00030  * We maintain a simple linked list caching the fmgr lookup info for the
00031  * currently selected conversion functions, as well as any that have been
00032  * selected previously in the current session.  (We remember previous
00033  * settings because we must be able to restore a previous setting during
00034  * transaction rollback, without doing any fresh catalog accesses.)
00035  *
00036  * Since we'll never release this data, we just keep it in TopMemoryContext.
00037  */
00038 typedef struct ConvProcInfo
00039 {
00040     int         s_encoding;     /* server and client encoding IDs */
00041     int         c_encoding;
00042     FmgrInfo    to_server_info; /* lookup info for conversion procs */
00043     FmgrInfo    to_client_info;
00044 } ConvProcInfo;
00045 
00046 static List *ConvProcList = NIL;    /* List of ConvProcInfo */
00047 
00048 /*
00049  * These variables point to the currently active conversion functions,
00050  * or are NULL when no conversion is needed.
00051  */
00052 static FmgrInfo *ToServerConvProc = NULL;
00053 static FmgrInfo *ToClientConvProc = NULL;
00054 
00055 /*
00056  * These variables track the currently selected FE and BE encodings.
00057  */
00058 static pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
00059 static pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
00060 static pg_enc2name *PlatformEncoding = NULL;
00061 
00062 /*
00063  * During backend startup we can't set client encoding because we (a)
00064  * can't look up the conversion functions, and (b) may not know the database
00065  * encoding yet either.  So SetClientEncoding() just accepts anything and
00066  * remembers it for InitializeClientEncoding() to apply later.
00067  */
00068 static bool backend_startup_complete = false;
00069 static int  pending_client_encoding = PG_SQL_ASCII;
00070 
00071 
00072 /* Internal functions */
00073 static char *perform_default_encoding_conversion(const char *src,
00074                                     int len, bool is_client_to_server);
00075 static int  cliplen(const char *str, int len, int limit);
00076 
00077 
00078 /*
00079  * Prepare for a future call to SetClientEncoding.  Success should mean
00080  * that SetClientEncoding is guaranteed to succeed for this encoding request.
00081  *
00082  * (But note that success before backend_startup_complete does not guarantee
00083  * success after ...)
00084  *
00085  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
00086  */
00087 int
00088 PrepareClientEncoding(int encoding)
00089 {
00090     int         current_server_encoding;
00091     ListCell   *lc;
00092 
00093     if (!PG_VALID_FE_ENCODING(encoding))
00094         return -1;
00095 
00096     /* Can't do anything during startup, per notes above */
00097     if (!backend_startup_complete)
00098         return 0;
00099 
00100     current_server_encoding = GetDatabaseEncoding();
00101 
00102     /*
00103      * Check for cases that require no conversion function.
00104      */
00105     if (current_server_encoding == encoding ||
00106         current_server_encoding == PG_SQL_ASCII ||
00107         encoding == PG_SQL_ASCII)
00108         return 0;
00109 
00110     if (IsTransactionState())
00111     {
00112         /*
00113          * If we're in a live transaction, it's safe to access the catalogs,
00114          * so look up the functions.  We repeat the lookup even if the info is
00115          * already cached, so that we can react to changes in the contents of
00116          * pg_conversion.
00117          */
00118         Oid         to_server_proc,
00119                     to_client_proc;
00120         ConvProcInfo *convinfo;
00121         MemoryContext oldcontext;
00122 
00123         to_server_proc = FindDefaultConversionProc(encoding,
00124                                                    current_server_encoding);
00125         if (!OidIsValid(to_server_proc))
00126             return -1;
00127         to_client_proc = FindDefaultConversionProc(current_server_encoding,
00128                                                    encoding);
00129         if (!OidIsValid(to_client_proc))
00130             return -1;
00131 
00132         /*
00133          * Load the fmgr info into TopMemoryContext (could still fail here)
00134          */
00135         convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
00136                                                        sizeof(ConvProcInfo));
00137         convinfo->s_encoding = current_server_encoding;
00138         convinfo->c_encoding = encoding;
00139         fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
00140                       TopMemoryContext);
00141         fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
00142                       TopMemoryContext);
00143 
00144         /* Attach new info to head of list */
00145         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
00146         ConvProcList = lcons(convinfo, ConvProcList);
00147         MemoryContextSwitchTo(oldcontext);
00148 
00149         /*
00150          * We cannot yet remove any older entry for the same encoding pair,
00151          * since it could still be in use.  SetClientEncoding will clean up.
00152          */
00153 
00154         return 0;               /* success */
00155     }
00156     else
00157     {
00158         /*
00159          * If we're not in a live transaction, the only thing we can do is
00160          * restore a previous setting using the cache.  This covers all
00161          * transaction-rollback cases.  The only case it might not work for is
00162          * trying to change client_encoding on the fly by editing
00163          * postgresql.conf and SIGHUP'ing.  Which would probably be a stupid
00164          * thing to do anyway.
00165          */
00166         foreach(lc, ConvProcList)
00167         {
00168             ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
00169 
00170             if (oldinfo->s_encoding == current_server_encoding &&
00171                 oldinfo->c_encoding == encoding)
00172                 return 0;
00173         }
00174 
00175         return -1;              /* it's not cached, so fail */
00176     }
00177 }
00178 
00179 /*
00180  * Set the active client encoding and set up the conversion-function pointers.
00181  * PrepareClientEncoding should have been called previously for this encoding.
00182  *
00183  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
00184  */
00185 int
00186 SetClientEncoding(int encoding)
00187 {
00188     int         current_server_encoding;
00189     bool        found;
00190     ListCell   *lc;
00191     ListCell   *prev;
00192     ListCell   *next;
00193 
00194     if (!PG_VALID_FE_ENCODING(encoding))
00195         return -1;
00196 
00197     /* Can't do anything during startup, per notes above */
00198     if (!backend_startup_complete)
00199     {
00200         pending_client_encoding = encoding;
00201         return 0;
00202     }
00203 
00204     current_server_encoding = GetDatabaseEncoding();
00205 
00206     /*
00207      * Check for cases that require no conversion function.
00208      */
00209     if (current_server_encoding == encoding ||
00210         current_server_encoding == PG_SQL_ASCII ||
00211         encoding == PG_SQL_ASCII)
00212     {
00213         ClientEncoding = &pg_enc2name_tbl[encoding];
00214         ToServerConvProc = NULL;
00215         ToClientConvProc = NULL;
00216         return 0;
00217     }
00218 
00219     /*
00220      * Search the cache for the entry previously prepared by
00221      * PrepareClientEncoding; if there isn't one, we lose.  While at it,
00222      * release any duplicate entries so that repeated Prepare/Set cycles don't
00223      * leak memory.
00224      */
00225     found = false;
00226     prev = NULL;
00227     for (lc = list_head(ConvProcList); lc; lc = next)
00228     {
00229         ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
00230 
00231         next = lnext(lc);
00232 
00233         if (convinfo->s_encoding == current_server_encoding &&
00234             convinfo->c_encoding == encoding)
00235         {
00236             if (!found)
00237             {
00238                 /* Found newest entry, so set up */
00239                 ClientEncoding = &pg_enc2name_tbl[encoding];
00240                 ToServerConvProc = &convinfo->to_server_info;
00241                 ToClientConvProc = &convinfo->to_client_info;
00242                 found = true;
00243             }
00244             else
00245             {
00246                 /* Duplicate entry, release it */
00247                 ConvProcList = list_delete_cell(ConvProcList, lc, prev);
00248                 pfree(convinfo);
00249                 continue;       /* prev mustn't advance */
00250             }
00251         }
00252 
00253         prev = lc;
00254     }
00255 
00256     if (found)
00257         return 0;               /* success */
00258     else
00259         return -1;              /* it's not cached, so fail */
00260 }
00261 
00262 /*
00263  * Initialize client encoding conversions.
00264  *      Called from InitPostgres() once during backend startup.
00265  */
00266 void
00267 InitializeClientEncoding(void)
00268 {
00269     Assert(!backend_startup_complete);
00270     backend_startup_complete = true;
00271 
00272     if (PrepareClientEncoding(pending_client_encoding) < 0 ||
00273         SetClientEncoding(pending_client_encoding) < 0)
00274     {
00275         /*
00276          * Oops, the requested conversion is not available. We couldn't fail
00277          * before, but we can now.
00278          */
00279         ereport(FATAL,
00280                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
00281                  errmsg("conversion between %s and %s is not supported",
00282                         pg_enc2name_tbl[pending_client_encoding].name,
00283                         GetDatabaseEncodingName())));
00284     }
00285 }
00286 
00287 /*
00288  * returns the current client encoding
00289  */
00290 int
00291 pg_get_client_encoding(void)
00292 {
00293     Assert(ClientEncoding);
00294     return ClientEncoding->encoding;
00295 }
00296 
00297 /*
00298  * returns the current client encoding name
00299  */
00300 const char *
00301 pg_get_client_encoding_name(void)
00302 {
00303     Assert(ClientEncoding);
00304     return ClientEncoding->name;
00305 }
00306 
00307 /*
00308  * Apply encoding conversion on src and return it. The encoding
00309  * conversion function is chosen from the pg_conversion system catalog
00310  * marked as "default". If it is not found in the schema search path,
00311  * it's taken from pg_catalog schema. If it even is not in the schema,
00312  * warn and return src.
00313  *
00314  * If conversion occurs, a palloc'd null-terminated string is returned.
00315  * In the case of no conversion, src is returned.
00316  *
00317  * CAUTION: although the presence of a length argument means that callers
00318  * can pass non-null-terminated strings, care is required because the same
00319  * string will be passed back if no conversion occurs.  Such callers *must*
00320  * check whether result == src and handle that case differently.
00321  *
00322  * Note: we try to avoid raising error, since that could get us into
00323  * infinite recursion when this function is invoked during error message
00324  * sending.  It should be OK to raise error for overlength strings though,
00325  * since the recursion will come with a shorter message.
00326  */
00327 unsigned char *
00328 pg_do_encoding_conversion(unsigned char *src, int len,
00329                           int src_encoding, int dest_encoding)
00330 {
00331     unsigned char *result;
00332     Oid         proc;
00333 
00334     if (!IsTransactionState())
00335         return src;
00336 
00337     if (src_encoding == dest_encoding)
00338         return src;
00339 
00340     if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII)
00341         return src;
00342 
00343     if (len <= 0)
00344         return src;
00345 
00346     proc = FindDefaultConversionProc(src_encoding, dest_encoding);
00347     if (!OidIsValid(proc))
00348     {
00349         ereport(LOG,
00350                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
00351                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
00352                         pg_encoding_to_char(src_encoding),
00353                         pg_encoding_to_char(dest_encoding))));
00354         return src;
00355     }
00356 
00357     /*
00358      * XXX we should avoid throwing errors in OidFunctionCall. Otherwise we
00359      * are going into infinite loop!  So we have to make sure that the
00360      * function exists before calling OidFunctionCall.
00361      */
00362     if (!SearchSysCacheExists1(PROCOID, ObjectIdGetDatum(proc)))
00363     {
00364         elog(LOG, "cache lookup failed for function %u", proc);
00365         return src;
00366     }
00367 
00368     /*
00369      * Allocate space for conversion result, being wary of integer overflow
00370      */
00371     if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
00372         ereport(ERROR,
00373                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00374                  errmsg("out of memory"),
00375          errdetail("String of %d bytes is too long for encoding conversion.",
00376                    len)));
00377 
00378     result = palloc(len * MAX_CONVERSION_GROWTH + 1);
00379 
00380     OidFunctionCall5(proc,
00381                      Int32GetDatum(src_encoding),
00382                      Int32GetDatum(dest_encoding),
00383                      CStringGetDatum(src),
00384                      CStringGetDatum(result),
00385                      Int32GetDatum(len));
00386     return result;
00387 }
00388 
00389 /*
00390  * Convert string using encoding_name. The source
00391  * encoding is the DB encoding.
00392  *
00393  * BYTEA convert_to(TEXT string, NAME encoding_name) */
00394 Datum
00395 pg_convert_to(PG_FUNCTION_ARGS)
00396 {
00397     Datum       string = PG_GETARG_DATUM(0);
00398     Datum       dest_encoding_name = PG_GETARG_DATUM(1);
00399     Datum       src_encoding_name = DirectFunctionCall1(namein,
00400                                     CStringGetDatum(DatabaseEncoding->name));
00401     Datum       result;
00402 
00403     /*
00404      * pg_convert expects a bytea as its first argument. We're passing it a
00405      * text argument here, relying on the fact that they are both in fact
00406      * varlena types, and thus structurally identical.
00407      */
00408     result = DirectFunctionCall3(pg_convert, string,
00409                                  src_encoding_name, dest_encoding_name);
00410 
00411     PG_RETURN_DATUM(result);
00412 }
00413 
00414 /*
00415  * Convert string using encoding_name. The destination
00416  * encoding is the DB encoding.
00417  *
00418  * TEXT convert_from(BYTEA string, NAME encoding_name) */
00419 Datum
00420 pg_convert_from(PG_FUNCTION_ARGS)
00421 {
00422     Datum       string = PG_GETARG_DATUM(0);
00423     Datum       src_encoding_name = PG_GETARG_DATUM(1);
00424     Datum       dest_encoding_name = DirectFunctionCall1(namein,
00425                                     CStringGetDatum(DatabaseEncoding->name));
00426     Datum       result;
00427 
00428     result = DirectFunctionCall3(pg_convert, string,
00429                                  src_encoding_name, dest_encoding_name);
00430 
00431     /*
00432      * pg_convert returns a bytea, which we in turn return as text, relying on
00433      * the fact that they are both in fact varlena types, and thus
00434      * structurally identical. Although not all bytea values are valid text,
00435      * in this case it will be because we've told pg_convert to return one
00436      * that is valid as text in the current database encoding.
00437      */
00438     PG_RETURN_DATUM(result);
00439 }
00440 
00441 /*
00442  * Convert string using encoding_names.
00443  *
00444  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
00445  */
00446 Datum
00447 pg_convert(PG_FUNCTION_ARGS)
00448 {
00449     bytea      *string = PG_GETARG_BYTEA_PP(0);
00450     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
00451     int         src_encoding = pg_char_to_encoding(src_encoding_name);
00452     char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
00453     int         dest_encoding = pg_char_to_encoding(dest_encoding_name);
00454     const char *src_str;
00455     char       *dest_str;
00456     bytea      *retval;
00457     int         len;
00458 
00459     if (src_encoding < 0)
00460         ereport(ERROR,
00461                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00462                  errmsg("invalid source encoding name \"%s\"",
00463                         src_encoding_name)));
00464     if (dest_encoding < 0)
00465         ereport(ERROR,
00466                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00467                  errmsg("invalid destination encoding name \"%s\"",
00468                         dest_encoding_name)));
00469 
00470     /* make sure that source string is valid */
00471     len = VARSIZE_ANY_EXHDR(string);
00472     src_str = VARDATA_ANY(string);
00473     pg_verify_mbstr_len(src_encoding, src_str, len, false);
00474 
00475     dest_str = (char *) pg_do_encoding_conversion(
00476                 (unsigned char *) src_str, len, src_encoding, dest_encoding);
00477     if (dest_str != src_str)
00478         len = strlen(dest_str);
00479 
00480     /*
00481      * build bytea data type structure.
00482      */
00483     retval = (bytea *) palloc(len + VARHDRSZ);
00484     SET_VARSIZE(retval, len + VARHDRSZ);
00485     memcpy(VARDATA(retval), dest_str, len);
00486 
00487     if (dest_str != src_str)
00488         pfree(dest_str);
00489 
00490     /* free memory if allocated by the toaster */
00491     PG_FREE_IF_COPY(string, 0);
00492 
00493     PG_RETURN_BYTEA_P(retval);
00494 }
00495 
00496 /*
00497  * get the length of the string considered as text in the specified
00498  * encoding. Raises an error if the data is not valid in that
00499  * encoding.
00500  *
00501  * INT4 length (BYTEA string, NAME src_encoding_name)
00502  */
00503 Datum
00504 length_in_encoding(PG_FUNCTION_ARGS)
00505 {
00506     bytea      *string = PG_GETARG_BYTEA_P(0);
00507     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
00508     int         src_encoding = pg_char_to_encoding(src_encoding_name);
00509     int         len = VARSIZE(string) - VARHDRSZ;
00510     int         retval;
00511 
00512     if (src_encoding < 0)
00513         ereport(ERROR,
00514                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00515                  errmsg("invalid encoding name \"%s\"",
00516                         src_encoding_name)));
00517 
00518     retval = pg_verify_mbstr_len(src_encoding, VARDATA(string), len, false);
00519     PG_RETURN_INT32(retval);
00520 
00521 }
00522 
00523 Datum
00524 pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
00525 {
00526     int         encoding = PG_GETARG_INT32(0);
00527 
00528     if (PG_VALID_ENCODING(encoding))
00529         PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
00530     else
00531         PG_RETURN_NULL();
00532 }
00533 
00534 /*
00535  * convert client encoding to server encoding.
00536  */
00537 char *
00538 pg_client_to_server(const char *s, int len)
00539 {
00540     Assert(ClientEncoding);
00541 
00542     return pg_any_to_server(s, len, ClientEncoding->encoding);
00543 }
00544 
00545 /*
00546  * convert any encoding to server encoding.
00547  */
00548 char *
00549 pg_any_to_server(const char *s, int len, int encoding)
00550 {
00551     Assert(DatabaseEncoding);
00552     Assert(ClientEncoding);
00553 
00554     if (len <= 0)
00555         return (char *) s;
00556 
00557     if (encoding == DatabaseEncoding->encoding ||
00558         encoding == PG_SQL_ASCII)
00559     {
00560         /*
00561          * No conversion is needed, but we must still validate the data.
00562          */
00563         (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
00564         return (char *) s;
00565     }
00566 
00567     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
00568     {
00569         /*
00570          * No conversion is possible, but we must still validate the data,
00571          * because the client-side code might have done string escaping using
00572          * the selected client_encoding.  If the client encoding is ASCII-safe
00573          * then we just do a straight validation under that encoding.  For an
00574          * ASCII-unsafe encoding we have a problem: we dare not pass such data
00575          * to the parser but we have no way to convert it.  We compromise by
00576          * rejecting the data if it contains any non-ASCII characters.
00577          */
00578         if (PG_VALID_BE_ENCODING(encoding))
00579             (void) pg_verify_mbstr(encoding, s, len, false);
00580         else
00581         {
00582             int         i;
00583 
00584             for (i = 0; i < len; i++)
00585             {
00586                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
00587                     ereport(ERROR,
00588                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
00589                      errmsg("invalid byte value for encoding \"%s\": 0x%02x",
00590                             pg_enc2name_tbl[PG_SQL_ASCII].name,
00591                             (unsigned char) s[i])));
00592             }
00593         }
00594         return (char *) s;
00595     }
00596 
00597     if (ClientEncoding->encoding == encoding)
00598         return perform_default_encoding_conversion(s, len, true);
00599     else
00600         return (char *) pg_do_encoding_conversion(
00601              (unsigned char *) s, len, encoding, DatabaseEncoding->encoding);
00602 }
00603 
00604 /*
00605  * convert server encoding to client encoding.
00606  */
00607 char *
00608 pg_server_to_client(const char *s, int len)
00609 {
00610     Assert(ClientEncoding);
00611 
00612     return pg_server_to_any(s, len, ClientEncoding->encoding);
00613 }
00614 
00615 /*
00616  * convert server encoding to any encoding.
00617  */
00618 char *
00619 pg_server_to_any(const char *s, int len, int encoding)
00620 {
00621     Assert(DatabaseEncoding);
00622     Assert(ClientEncoding);
00623 
00624     if (len <= 0)
00625         return (char *) s;
00626 
00627     if (encoding == DatabaseEncoding->encoding ||
00628         encoding == PG_SQL_ASCII ||
00629         DatabaseEncoding->encoding == PG_SQL_ASCII)
00630         return (char *) s;      /* assume data is valid */
00631 
00632     if (ClientEncoding->encoding == encoding)
00633         return perform_default_encoding_conversion(s, len, false);
00634     else
00635         return (char *) pg_do_encoding_conversion(
00636              (unsigned char *) s, len, DatabaseEncoding->encoding, encoding);
00637 }
00638 
00639 /*
00640  *  Perform default encoding conversion using cached FmgrInfo. Since
00641  *  this function does not access database at all, it is safe to call
00642  *  outside transactions.  If the conversion has not been set up by
00643  *  SetClientEncoding(), no conversion is performed.
00644  */
00645 static char *
00646 perform_default_encoding_conversion(const char *src, int len, bool is_client_to_server)
00647 {
00648     char       *result;
00649     int         src_encoding,
00650                 dest_encoding;
00651     FmgrInfo   *flinfo;
00652 
00653     if (is_client_to_server)
00654     {
00655         src_encoding = ClientEncoding->encoding;
00656         dest_encoding = DatabaseEncoding->encoding;
00657         flinfo = ToServerConvProc;
00658     }
00659     else
00660     {
00661         src_encoding = DatabaseEncoding->encoding;
00662         dest_encoding = ClientEncoding->encoding;
00663         flinfo = ToClientConvProc;
00664     }
00665 
00666     if (flinfo == NULL)
00667         return (char *) src;
00668 
00669     /*
00670      * Allocate space for conversion result, being wary of integer overflow
00671      */
00672     if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
00673         ereport(ERROR,
00674                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00675                  errmsg("out of memory"),
00676          errdetail("String of %d bytes is too long for encoding conversion.",
00677                    len)));
00678 
00679     result = palloc(len * MAX_CONVERSION_GROWTH + 1);
00680 
00681     FunctionCall5(flinfo,
00682                   Int32GetDatum(src_encoding),
00683                   Int32GetDatum(dest_encoding),
00684                   CStringGetDatum(src),
00685                   CStringGetDatum(result),
00686                   Int32GetDatum(len));
00687     return result;
00688 }
00689 
00690 
00691 /* convert a multibyte string to a wchar */
00692 int
00693 pg_mb2wchar(const char *from, pg_wchar *to)
00694 {
00695     return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
00696 }
00697 
00698 /* convert a multibyte string to a wchar with a limited length */
00699 int
00700 pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
00701 {
00702     return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
00703 }
00704 
00705 /* same, with any encoding */
00706 int
00707 pg_encoding_mb2wchar_with_len(int encoding,
00708                               const char *from, pg_wchar *to, int len)
00709 {
00710     return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
00711 }
00712 
00713 /* convert a wchar string to a multibyte */
00714 int
00715 pg_wchar2mb(const pg_wchar *from, char *to)
00716 {
00717     return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *)to, pg_wchar_strlen(from));
00718 }
00719 
00720 /* convert a wchar string to a multibyte with a limited length */
00721 int
00722 pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
00723 {
00724     return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *)to, len);
00725 }
00726 
00727 /* same, with any encoding */
00728 int
00729 pg_encoding_wchar2mb_with_len(int encoding,
00730                               const pg_wchar *from, char *to, int len)
00731 {
00732     return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *)to, len);
00733 }
00734 
00735 /* returns the byte length of a multibyte character */
00736 int
00737 pg_mblen(const char *mbstr)
00738 {
00739     return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
00740 }
00741 
00742 /* returns the display length of a multibyte character */
00743 int
00744 pg_dsplen(const char *mbstr)
00745 {
00746     return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
00747 }
00748 
/*
 * Return the number of characters in a null-terminated multibyte string
 * in the database encoding.
 */
int
pg_mbstrlen(const char *mbstr)
{
    int         nchars;

    /* In a single-byte encoding, bytes and characters coincide */
    if (pg_database_encoding_max_length() == 1)
        return strlen(mbstr);

    /* Otherwise hop from character to character until the terminator */
    for (nchars = 0; *mbstr != '\0'; nchars++)
        mbstr += pg_mblen(mbstr);

    return nchars;
}
00766 
/*
 * Return the number of characters in the first "limit" bytes of a
 * multibyte string in the database encoding.  The string need not be
 * null-terminated.
 *
 * For single-byte encodings "limit" itself is returned, without looking
 * at the string.  Otherwise counting also stops at a zero byte.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
    int         nchars = 0;

    /* In a single-byte encoding, bytes and characters coincide */
    if (pg_database_encoding_max_length() == 1)
        return limit;

    for (; limit > 0 && *mbstr != '\0'; nchars++)
    {
        int         width = pg_mblen(mbstr);

        mbstr += width;
        limit -= width;
    }

    return nchars;
}
00789 
00790 /*
00791  * returns the byte length of a multibyte string
00792  * (not necessarily NULL terminated)
00793  * that is no longer than limit.
00794  * this function does not break multibyte character boundary.
00795  */
00796 int
00797 pg_mbcliplen(const char *mbstr, int len, int limit)
00798 {
00799     return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
00800                                  len, limit);
00801 }
00802 
00803 /*
00804  * pg_mbcliplen with specified encoding
00805  */
00806 int
00807 pg_encoding_mbcliplen(int encoding, const char *mbstr,
00808                       int len, int limit)
00809 {
00810     mblen_converter mblen_fn;
00811     int         clen = 0;
00812     int         l;
00813 
00814     /* optimization for single byte encoding */
00815     if (pg_encoding_max_length(encoding) == 1)
00816         return cliplen(mbstr, len, limit);
00817 
00818     mblen_fn = pg_wchar_table[encoding].mblen;
00819 
00820     while (len > 0 && *mbstr)
00821     {
00822         l = (*mblen_fn) ((const unsigned char *) mbstr);
00823         if ((clen + l) > limit)
00824             break;
00825         clen += l;
00826         if (clen == limit)
00827             break;
00828         len -= l;
00829         mbstr += l;
00830     }
00831     return clen;
00832 }
00833 
/*
 * Like pg_mbcliplen, except that "limit" is a number of characters rather
 * than a number of bytes.  Returns the byte length of the first "limit"
 * characters of the string, or fewer if the string ends first.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
    int         nbytes = 0;
    int         taken = 0;

    /* In a single-byte encoding, bytes and characters coincide */
    if (pg_database_encoding_max_length() == 1)
        return cliplen(mbstr, len, limit);

    /* Stop at the byte limit, a zero byte, or once "limit" chars are taken */
    while (len > 0 && *mbstr != '\0' && taken < limit)
    {
        int         width = pg_mblen(mbstr);

        nbytes += width;
        mbstr += width;
        len -= width;
        taken++;
    }

    return nbytes;
}
00861 
/*
 * mbcliplen for any single-byte encoding: return how many bytes of str
 * precede the first zero byte, capped at the smaller of len and limit.
 */
static int
cliplen(const char *str, int len, int limit)
{
    int         bound = (len < limit) ? len : limit;
    int         n;

    for (n = 0; n < bound && str[n] != '\0'; n++)
        ;

    return n;
}
00873 
00874 void
00875 SetDatabaseEncoding(int encoding)
00876 {
00877     if (!PG_VALID_BE_ENCODING(encoding))
00878         elog(ERROR, "invalid database encoding: %d", encoding);
00879 
00880     DatabaseEncoding = &pg_enc2name_tbl[encoding];
00881     Assert(DatabaseEncoding->encoding == encoding);
00882 }
00883 
/*
 * Bind gettext to the codeset equivalent with the database encoding.
 *
 * This is a no-op unless ENABLE_NLS is defined.
 */
void
pg_bind_textdomain_codeset(const char *domainname)
{
#if defined(ENABLE_NLS)
    int         dbenc = GetDatabaseEncoding();
    int         idx = 0;

    /*
     * By default gettext() takes its codeset from LC_CTYPE, so nothing needs
     * to be done when that already agrees with the database encoding.
     * CREATE DATABASE enforces or trusts that the locale's codeset matches
     * the database encoding, with the C locale as the exception; for the C
     * locale we therefore bind gettext() to the right codeset explicitly.
     *
     * Windows is different: gettext() tends to get confused there, so the
     * binding is always done.
     */
#ifndef WIN32
    const char *lc_ctype = setlocale(LC_CTYPE, NULL);

    if (!(pg_strcasecmp(lc_ctype, "C") == 0 ||
          pg_strcasecmp(lc_ctype, "POSIX") == 0))
        return;
#endif

    /* find the table entry for the database encoding, if there is one */
    while (pg_enc2gettext_tbl[idx].name != NULL &&
           pg_enc2gettext_tbl[idx].encoding != dbenc)
        idx++;

    /* a NULL name marks the end of the table: no match, nothing to bind */
    if (pg_enc2gettext_tbl[idx].name != NULL)
    {
        if (bind_textdomain_codeset(domainname,
                                    pg_enc2gettext_tbl[idx].name) == NULL)
            elog(LOG, "bind_textdomain_codeset failed");
    }
#endif
}
00923 
00924 int
00925 GetDatabaseEncoding(void)
00926 {
00927     Assert(DatabaseEncoding);
00928     return DatabaseEncoding->encoding;
00929 }
00930 
00931 const char *
00932 GetDatabaseEncodingName(void)
00933 {
00934     Assert(DatabaseEncoding);
00935     return DatabaseEncoding->name;
00936 }
00937 
00938 Datum
00939 getdatabaseencoding(PG_FUNCTION_ARGS)
00940 {
00941     Assert(DatabaseEncoding);
00942     return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
00943 }
00944 
00945 Datum
00946 pg_client_encoding(PG_FUNCTION_ARGS)
00947 {
00948     Assert(ClientEncoding);
00949     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
00950 }
00951 
00952 int
00953 GetPlatformEncoding(void)
00954 {
00955     if (PlatformEncoding == NULL)
00956     {
00957         /* try to determine encoding of server's environment locale */
00958         int         encoding = pg_get_encoding_from_locale("", true);
00959 
00960         if (encoding < 0)
00961             encoding = PG_SQL_ASCII;
00962         PlatformEncoding = &pg_enc2name_tbl[encoding];
00963     }
00964     return PlatformEncoding->encoding;
00965 }
00966 
00967 #ifdef WIN32
00968 
00969 /*
00970  * Result is palloc'ed null-terminated utf16 string. The character length
00971  * is also passed to utf16len if not null. Returns NULL iff failed.
00972  */
00973 WCHAR *
00974 pgwin32_toUTF16(const char *str, int len, int *utf16len)
00975 {
00976     WCHAR      *utf16;
00977     int         dstlen;
00978     UINT        codepage;
00979 
00980     codepage = pg_enc2name_tbl[GetDatabaseEncoding()].codepage;
00981 
00982     /*
00983      * Use MultiByteToWideChar directly if there is a corresponding codepage,
00984      * or double conversion through UTF8 if not.
00985      */
00986     if (codepage != 0)
00987     {
00988         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
00989         dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
00990         utf16[dstlen] = (WCHAR) 0;
00991     }
00992     else
00993     {
00994         char       *utf8;
00995 
00996         utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
00997                                         len, GetDatabaseEncoding(), PG_UTF8);
00998         if (utf8 != str)
00999             len = strlen(utf8);
01000 
01001         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
01002         dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
01003         utf16[dstlen] = (WCHAR) 0;
01004 
01005         if (utf8 != str)
01006             pfree(utf8);
01007     }
01008 
01009     if (dstlen == 0 && len > 0)
01010     {
01011         pfree(utf16);
01012         return NULL;            /* error */
01013     }
01014 
01015     if (utf16len)
01016         *utf16len = dstlen;
01017     return utf16;
01018 }
01019 
01020 #endif