Header And Logo

PostgreSQL
| The world's most advanced open source database.

wchar.c

Go to the documentation of this file.
00001 /*
00002  * conversion functions between pg_wchar and multibyte streams.
00003  * Tatsuo Ishii
00004  * src/backend/utils/mb/wchar.c
00005  *
00006  */
00007 /* can be used in either frontend or backend */
00008 #ifdef FRONTEND
00009 #include "postgres_fe.h"
00010 #else
00011 #include "postgres.h"
00012 #endif
00013 
00014 #include "mb/pg_wchar.h"
00015 
00016 
00017 /*
 * Conversion to pg_wchar is table driven.
 * To add support for an encoding, define mb2wchar_with_len(), mblen() and
 * dsplen() for that encoding.  Note that if the encoding is only
 * supported in the client, you don't need to define the
 * mb2wchar_with_len() function (SJIS is such a case).
00023  *
00024  * These functions generally assume that their input is validly formed.
00025  * The "verifier" functions, further down in the file, have to be more
00026  * paranoid.  We expect that mblen() does not need to examine more than
00027  * the first byte of the character to discover the correct length.
00028  *
00029  * Note: for the display output of psql to work properly, the return values
00030  * of the dsplen functions must conform to the Unicode standard. In particular
00031  * the NUL character is zero width and control characters are generally
00032  * width -1. It is recommended that non-ASCII encodings refer their ASCII
00033  * subset to the ASCII routines to ensure consistency.
00034  */
00035 
00036 /*
00037  * SQL/ASCII
00038  */
00039 static int
00040 pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
00041 {
00042     int         cnt = 0;
00043 
00044     while (len > 0 && *from)
00045     {
00046         *to++ = *from++;
00047         len--;
00048         cnt++;
00049     }
00050     *to = 0;
00051     return cnt;
00052 }
00053 
/* Byte length of one character: always 1, whatever the byte is. */
static int
pg_ascii_mblen(const unsigned char *s)
{
    (void) s;                   /* the answer does not depend on the input */
    return 1;
}
00059 
/*
 * Display width of the byte at *s: 0 for NUL, -1 for bytes below 0x20
 * and for 0x7f (DEL), 1 for everything else.
 */
static int
pg_ascii_dsplen(const unsigned char *s)
{
    const unsigned char ch = *s;

    if (ch == '\0')
        return 0;

    return (ch < 0x20 || ch == 0x7f) ? -1 : 1;
}
00070 
00071 /*
00072  * EUC
00073  */
00074 static int
00075 pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
00076 {
00077     int         cnt = 0;
00078 
00079     while (len > 0 && *from)
00080     {
00081         if (*from == SS2 && len >= 2)   /* JIS X 0201 (so called "1 byte
00082                                          * KANA") */
00083         {
00084             from++;
00085             *to = (SS2 << 8) | *from++;
00086             len -= 2;
00087         }
00088         else if (*from == SS3 && len >= 3)      /* JIS X 0212 KANJI */
00089         {
00090             from++;
00091             *to = (SS3 << 16) | (*from++ << 8);
00092             *to |= *from++;
00093             len -= 3;
00094         }
00095         else if (IS_HIGHBIT_SET(*from) && len >= 2)     /* JIS X 0208 KANJI */
00096         {
00097             *to = *from++ << 8;
00098             *to |= *from++;
00099             len -= 2;
00100         }
00101         else                            /* must be ASCII */
00102         {
00103             *to = *from++;
00104             len--;
00105         }
00106         to++;
00107         cnt++;
00108     }
00109     *to = 0;
00110     return cnt;
00111 }
00112 
00113 static inline int
00114 pg_euc_mblen(const unsigned char *s)
00115 {
00116     int         len;
00117 
00118     if (*s == SS2)
00119         len = 2;
00120     else if (*s == SS3)
00121         len = 3;
00122     else if (IS_HIGHBIT_SET(*s))
00123         len = 2;
00124     else
00125         len = 1;
00126     return len;
00127 }
00128 
00129 static inline int
00130 pg_euc_dsplen(const unsigned char *s)
00131 {
00132     int         len;
00133 
00134     if (*s == SS2)
00135         len = 2;
00136     else if (*s == SS3)
00137         len = 2;
00138     else if (IS_HIGHBIT_SET(*s))
00139         len = 2;
00140     else
00141         len = pg_ascii_dsplen(s);
00142     return len;
00143 }
00144 
00145 /*
00146  * EUC_JP
00147  */
00148 static int
00149 pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
00150 {
00151     return pg_euc2wchar_with_len(from, to, len);
00152 }
00153 
/* EUC_JP byte length: same rules as generic EUC. */
static int
pg_eucjp_mblen(const unsigned char *s)
{
    int         nbytes = pg_euc_mblen(s);

    return nbytes;
}
00159 
00160 static int
00161 pg_eucjp_dsplen(const unsigned char *s)
00162 {
00163     int         len;
00164 
00165     if (*s == SS2)
00166         len = 1;
00167     else if (*s == SS3)
00168         len = 2;
00169     else if (IS_HIGHBIT_SET(*s))
00170         len = 2;
00171     else
00172         len = pg_ascii_dsplen(s);
00173     return len;
00174 }
00175 
00176 /*
00177  * EUC_KR
00178  */
00179 static int
00180 pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
00181 {
00182     return pg_euc2wchar_with_len(from, to, len);
00183 }
00184 
/* EUC_KR byte length: same rules as generic EUC. */
static int
pg_euckr_mblen(const unsigned char *s)
{
    int         nbytes = pg_euc_mblen(s);

    return nbytes;
}
00190 
/* EUC_KR display width: same rules as generic EUC. */
static int
pg_euckr_dsplen(const unsigned char *s)
{
    int         width = pg_euc_dsplen(s);

    return width;
}
00196 
00197 /*
00198  * EUC_CN
00199  *
00200  */
00201 static int
00202 pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
00203 {
00204     int         cnt = 0;
00205 
00206     while (len > 0 && *from)
00207     {
00208         if (*from == SS2 && len >= 3)   /* code set 2 (unused?) */
00209         {
00210             from++;
00211             *to = (SS2 << 16) | (*from++ << 8);
00212             *to |= *from++;
00213             len -= 3;
00214         }
00215         else if (*from == SS3 && len >= 3)      /* code set 3 (unused ?) */
00216         {
00217             from++;
00218             *to = (SS3 << 16) | (*from++ << 8);
00219             *to |= *from++;
00220             len -= 3;
00221         }
00222         else if (IS_HIGHBIT_SET(*from) && len >= 2)     /* code set 1 */
00223         {
00224             *to = *from++ << 8;
00225             *to |= *from++;
00226             len -= 2;
00227         }
00228         else
00229         {
00230             *to = *from++;
00231             len--;
00232         }
00233         to++;
00234         cnt++;
00235     }
00236     *to = 0;
00237     return cnt;
00238 }
00239 
/* EUC_CN byte length: 2 when the lead byte has its high bit set, else 1. */
static int
pg_euccn_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
00251 
/* EUC_CN display width: 2 for high-bit lead bytes, ASCII rules otherwise. */
static int
pg_euccn_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);
}
00263 
00264 /*
00265  * EUC_TW
00266  *
00267  */
00268 static int
00269 pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
00270 {
00271     int         cnt = 0;
00272 
00273     while (len > 0 && *from)
00274     {
00275         if (*from == SS2 && len >= 4)   /* code set 2 */
00276         {
00277             from++;
00278             *to = (((uint32) SS2) << 24) | (*from++ << 16);
00279             *to |= *from++ << 8;
00280             *to |= *from++;
00281             len -= 4;
00282         }
00283         else if (*from == SS3 && len >= 3)      /* code set 3 (unused?) */
00284         {
00285             from++;
00286             *to = (SS3 << 16) | (*from++ << 8);
00287             *to |= *from++;
00288             len -= 3;
00289         }
00290         else if (IS_HIGHBIT_SET(*from) && len >= 2)     /* code set 2 */
00291         {
00292             *to = *from++ << 8;
00293             *to |= *from++;
00294             len -= 2;
00295         }
00296         else
00297         {
00298             *to = *from++;
00299             len--;
00300         }
00301         to++;
00302         cnt++;
00303     }
00304     *to = 0;
00305     return cnt;
00306 }
00307 
00308 static int
00309 pg_euctw_mblen(const unsigned char *s)
00310 {
00311     int         len;
00312 
00313     if (*s == SS2)
00314         len = 4;
00315     else if (*s == SS3)
00316         len = 3;
00317     else if (IS_HIGHBIT_SET(*s))
00318         len = 2;
00319     else
00320         len = 1;
00321     return len;
00322 }
00323 
00324 static int
00325 pg_euctw_dsplen(const unsigned char *s)
00326 {
00327     int         len;
00328 
00329     if (*s == SS2)
00330         len = 2;
00331     else if (*s == SS3)
00332         len = 2;
00333     else if (IS_HIGHBIT_SET(*s))
00334         len = 2;
00335     else
00336         len = pg_ascii_dsplen(s);
00337     return len;
00338 }
00339 
00340 /*
00341  * Convert pg_wchar to EUC_* encoding.
00342  * caller must allocate enough space for "to", including a trailing zero!
00343  * len: length of from.
00344  * "from" not necessarily null terminated.
00345  */
00346 static int
00347 pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
00348 {
00349     int         cnt = 0;
00350 
00351     while (len > 0 && *from)
00352     {
00353         unsigned char c;
00354 
00355         if ((c = (*from >> 24)))
00356         {
00357             *to++ = c;
00358             *to++ = (*from >> 16) & 0xff;
00359             *to++ = (*from >> 8) & 0xff;
00360             *to++ = *from & 0xff;
00361             cnt += 4;
00362         }
00363         else if ((c = (*from >> 16)))
00364         {
00365             *to++ = c;
00366             *to++ = (*from >> 8) & 0xff;
00367             *to++ = *from & 0xff;
00368             cnt += 3;
00369         }
00370         else if ((c = (*from >> 8)))
00371         {
00372             *to++ = c;
00373             *to++ = *from & 0xff;
00374             cnt += 2;
00375         }
00376         else
00377         {
00378             *to++ = *from;
00379             cnt++;
00380         }
00381         from++;
00382         len--;
00383     }
00384     *to = 0;
00385     return cnt;
00386 }
00387 
00388 
00389 /*
00390  * JOHAB
00391  */
/* JOHAB byte length: reuses the generic EUC rules. */
static int
pg_johab_mblen(const unsigned char *s)
{
    int         nbytes = pg_euc_mblen(s);

    return nbytes;
}
00397 
/* JOHAB display width: reuses the generic EUC rules. */
static int
pg_johab_dsplen(const unsigned char *s)
{
    int         width = pg_euc_dsplen(s);

    return width;
}
00403 
00404 /*
00405  * convert UTF8 string to pg_wchar (UCS-4)
00406  * caller must allocate enough space for "to", including a trailing zero!
00407  * len: length of from.
00408  * "from" not necessarily null terminated.
00409  */
00410 static int
00411 pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
00412 {
00413     int         cnt = 0;
00414     uint32      c1,
00415                 c2,
00416                 c3,
00417                 c4;
00418 
00419     while (len > 0 && *from)
00420     {
00421         if ((*from & 0x80) == 0)
00422         {
00423             *to = *from++;
00424             len--;
00425         }
00426         else if ((*from & 0xe0) == 0xc0)
00427         {
00428             if (len < 2)
00429                 break;          /* drop trailing incomplete char */
00430             c1 = *from++ & 0x1f;
00431             c2 = *from++ & 0x3f;
00432             *to = (c1 << 6) | c2;
00433             len -= 2;
00434         }
00435         else if ((*from & 0xf0) == 0xe0)
00436         {
00437             if (len < 3)
00438                 break;          /* drop trailing incomplete char */
00439             c1 = *from++ & 0x0f;
00440             c2 = *from++ & 0x3f;
00441             c3 = *from++ & 0x3f;
00442             *to = (c1 << 12) | (c2 << 6) | c3;
00443             len -= 3;
00444         }
00445         else if ((*from & 0xf8) == 0xf0)
00446         {
00447             if (len < 4)
00448                 break;          /* drop trailing incomplete char */
00449             c1 = *from++ & 0x07;
00450             c2 = *from++ & 0x3f;
00451             c3 = *from++ & 0x3f;
00452             c4 = *from++ & 0x3f;
00453             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
00454             len -= 4;
00455         }
00456         else
00457         {
00458             /* treat a bogus char as length 1; not ours to raise error */
00459             *to = *from++;
00460             len--;
00461         }
00462         to++;
00463         cnt++;
00464     }
00465     *to = 0;
00466     return cnt;
00467 }
00468 
00469 
00470 /*
00471  * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
00472  * space allocated.
00473  */
00474 unsigned char *
00475 unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
00476 {
00477     if (c <= 0x7F)
00478     {
00479         utf8string[0] = c;
00480     }
00481     else if (c <= 0x7FF)
00482     {
00483         utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
00484         utf8string[1] = 0x80 | (c & 0x3F);
00485     }
00486     else if (c <= 0xFFFF)
00487     {
00488         utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
00489         utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
00490         utf8string[2] = 0x80 | (c & 0x3F);
00491     }
00492     else
00493     {
00494         utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
00495         utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
00496         utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
00497         utf8string[3] = 0x80 | (c & 0x3F);
00498     }
00499 
00500     return utf8string;
00501 }
00502 
00503 /*
00504  * Trivial conversion from pg_wchar to UTF-8.
00505  * caller should allocate enough space for "to"
00506  * len: length of from.
00507  * "from" not necessarily null terminated.
00508  */
00509 static int
00510 pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
00511 {
00512     int         cnt = 0;
00513 
00514     while (len > 0 && *from)
00515     {
00516         int char_len;
00517 
00518         unicode_to_utf8(*from, to);
00519         char_len = pg_utf_mblen(to);
00520         cnt += char_len;
00521         to += char_len;
00522         from++;
00523         len--;
00524     }
00525     *to = 0;
00526     return cnt;
00527 }
00528 
/*
 * Return the byte length of the UTF8 character pointed to by s, judged
 * from its first byte only.
 *
 * Note: the current implementation does not support UTF8 sequences of
 * more than 4 bytes, so no value larger than 4 is returned.  A leading
 * byte that is flat-out illegal, or that indicates a length larger than
 * we support, yields 1.
 *
 * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
 * other places would need to be fixed to change this.
 */
int
pg_utf_mblen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if ((lead & 0x80) == 0)
        return 1;
    if ((lead & 0xe0) == 0xc0)
        return 2;
    if ((lead & 0xf0) == 0xe0)
        return 3;
    if ((lead & 0xf8) == 0xf0)
        return 4;
#ifdef NOT_USED
    if ((lead & 0xfc) == 0xf8)
        return 5;
    if ((lead & 0xfe) == 0xfc)
        return 6;
#endif
    return 1;
}
00563 
00564 /*
00565  * This is an implementation of wcwidth() and wcswidth() as defined in
00566  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
00567  * <http://www.UNIX-systems.org/online.html>
00568  *
00569  * Markus Kuhn -- 2001-09-08 -- public domain
00570  *
00571  * customised for PostgreSQL
00572  *
00573  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
00574  */
00575 
/* One closed interval [first, last] of 16-bit code points. */
struct mbinterval
{
    unsigned short first;       /* lowest code point in the interval */
    unsigned short last;        /* highest code point in the interval */
};
00581 
00582 /* auxiliary function for binary search in interval table */
00583 static int
00584 mbbisearch(pg_wchar ucs, const struct mbinterval * table, int max)
00585 {
00586     int         min = 0;
00587     int         mid;
00588 
00589     if (ucs < table[0].first || ucs > table[max].last)
00590         return 0;
00591     while (max >= min)
00592     {
00593         mid = (min + max) / 2;
00594         if (ucs > table[mid].last)
00595             min = mid + 1;
00596         else if (ucs < table[mid].first)
00597             max = mid - 1;
00598         else
00599             return 1;
00600     }
00601 
00602     return 0;
00603 }
00604 
00605 
00606 /* The following functions define the column width of an ISO 10646
00607  * character as follows:
00608  *
00609  *    - The null character (U+0000) has a column width of 0.
00610  *
00611  *    - Other C0/C1 control characters and DEL will lead to a return
00612  *      value of -1.
00613  *
00614  *    - Non-spacing and enclosing combining characters (general
00615  *      category code Mn or Me in the Unicode database) have a
00616  *      column width of 0.
00617  *
00618  *    - Other format characters (general category code Cf in the Unicode
00619  *      database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
00620  *
00621  *    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
00622  *      have a column width of 0.
00623  *
00624  *    - Spacing characters in the East Asian Wide (W) or East Asian
00625  *      FullWidth (F) category as defined in Unicode Technical
00626  *      Report #11 have a column width of 2.
00627  *
00628  *    - All remaining characters (including all printable
00629  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
00630  *      etc.) have a column width of 1.
00631  *
00632  * This implementation assumes that wchar_t characters are encoded
00633  * in ISO 10646.
00634  */
00635 
00636 static int
00637 ucs_wcwidth(pg_wchar ucs)
00638 {
00639     /* sorted list of non-overlapping intervals of non-spacing characters */
00640     static const struct mbinterval combining[] = {
00641         {0x0300, 0x034E}, {0x0360, 0x0362}, {0x0483, 0x0486},
00642         {0x0488, 0x0489}, {0x0591, 0x05A1}, {0x05A3, 0x05B9},
00643         {0x05BB, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2},
00644         {0x05C4, 0x05C4}, {0x064B, 0x0655}, {0x0670, 0x0670},
00645         {0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED},
00646         {0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A},
00647         {0x07A6, 0x07B0}, {0x0901, 0x0902}, {0x093C, 0x093C},
00648         {0x0941, 0x0948}, {0x094D, 0x094D}, {0x0951, 0x0954},
00649         {0x0962, 0x0963}, {0x0981, 0x0981}, {0x09BC, 0x09BC},
00650         {0x09C1, 0x09C4}, {0x09CD, 0x09CD}, {0x09E2, 0x09E3},
00651         {0x0A02, 0x0A02}, {0x0A3C, 0x0A3C}, {0x0A41, 0x0A42},
00652         {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A70, 0x0A71},
00653         {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC5},
00654         {0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD}, {0x0B01, 0x0B01},
00655         {0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43},
00656         {0x0B4D, 0x0B4D}, {0x0B56, 0x0B56}, {0x0B82, 0x0B82},
00657         {0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40},
00658         {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56},
00659         {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
00660         {0x0D41, 0x0D43}, {0x0D4D, 0x0D4D}, {0x0DCA, 0x0DCA},
00661         {0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6}, {0x0E31, 0x0E31},
00662         {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
00663         {0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC}, {0x0EC8, 0x0ECD},
00664         {0x0F18, 0x0F19}, {0x0F35, 0x0F35}, {0x0F37, 0x0F37},
00665         {0x0F39, 0x0F39}, {0x0F71, 0x0F7E}, {0x0F80, 0x0F84},
00666         {0x0F86, 0x0F87}, {0x0F90, 0x0F97}, {0x0F99, 0x0FBC},
00667         {0x0FC6, 0x0FC6}, {0x102D, 0x1030}, {0x1032, 0x1032},
00668         {0x1036, 0x1037}, {0x1039, 0x1039}, {0x1058, 0x1059},
00669         {0x1160, 0x11FF}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
00670         {0x17C9, 0x17D3}, {0x180B, 0x180E}, {0x18A9, 0x18A9},
00671         {0x200B, 0x200F}, {0x202A, 0x202E}, {0x206A, 0x206F},
00672         {0x20D0, 0x20E3}, {0x302A, 0x302F}, {0x3099, 0x309A},
00673         {0xFB1E, 0xFB1E}, {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF},
00674         {0xFFF9, 0xFFFB}
00675     };
00676 
00677     /* test for 8-bit control characters */
00678     if (ucs == 0)
00679         return 0;
00680 
00681     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
00682         return -1;
00683 
00684     /* binary search in table of non-spacing characters */
00685     if (mbbisearch(ucs, combining,
00686                    sizeof(combining) / sizeof(struct mbinterval) - 1))
00687         return 0;
00688 
00689     /*
00690      * if we arrive here, ucs is not a combining or C0/C1 control character
00691      */
00692 
00693     return 1 +
00694         (ucs >= 0x1100 &&
00695          (ucs <= 0x115f ||      /* Hangul Jamo init. consonants */
00696           (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
00697            ucs != 0x303f) ||    /* CJK ... Yi */
00698           (ucs >= 0xac00 && ucs <= 0xd7a3) ||   /* Hangul Syllables */
00699           (ucs >= 0xf900 && ucs <= 0xfaff) ||   /* CJK Compatibility
00700                                                  * Ideographs */
00701           (ucs >= 0xfe30 && ucs <= 0xfe6f) ||   /* CJK Compatibility Forms */
00702           (ucs >= 0xff00 && ucs <= 0xff5f) ||   /* Fullwidth Forms */
00703           (ucs >= 0xffe0 && ucs <= 0xffe6) ||
00704           (ucs >= 0x20000 && ucs <= 0x2ffff)));
00705 }
00706 
00707 /*
00708  * Convert a UTF-8 character to a Unicode code point.
00709  * This is a one-character version of pg_utf2wchar_with_len.
00710  *
00711  * No error checks here, c must point to a long-enough string.
00712  */
00713 pg_wchar
00714 utf8_to_unicode(const unsigned char *c)
00715 {
00716     if ((*c & 0x80) == 0)
00717         return (pg_wchar) c[0];
00718     else if ((*c & 0xe0) == 0xc0)
00719         return (pg_wchar) (((c[0] & 0x1f) << 6) |
00720                            (c[1] & 0x3f));
00721     else if ((*c & 0xf0) == 0xe0)
00722         return (pg_wchar) (((c[0] & 0x0f) << 12) |
00723                            ((c[1] & 0x3f) << 6) |
00724                            (c[2] & 0x3f));
00725     else if ((*c & 0xf8) == 0xf0)
00726         return (pg_wchar) (((c[0] & 0x07) << 18) |
00727                            ((c[1] & 0x3f) << 12) |
00728                            ((c[2] & 0x3f) << 6) |
00729                            (c[3] & 0x3f));
00730     else
00731         /* that is an invalid code on purpose */
00732         return 0xffffffff;
00733 }
00734 
00735 static int
00736 pg_utf_dsplen(const unsigned char *s)
00737 {
00738     return ucs_wcwidth(utf8_to_unicode(s));
00739 }
00740 
00741 /*
00742  * convert mule internal code to pg_wchar
00743  * caller should allocate enough space for "to"
00744  * len: length of from.
00745  * "from" not necessarily null terminated.
00746  */
00747 static int
00748 pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
00749 {
00750     int         cnt = 0;
00751 
00752     while (len > 0 && *from)
00753     {
00754         if (IS_LC1(*from) && len >= 2)
00755         {
00756             *to = *from++ << 16;
00757             *to |= *from++;
00758             len -= 2;
00759         }
00760         else if (IS_LCPRV1(*from) && len >= 3)
00761         {
00762             from++;
00763             *to = *from++ << 16;
00764             *to |= *from++;
00765             len -= 3;
00766         }
00767         else if (IS_LC2(*from) && len >= 3)
00768         {
00769             *to = *from++ << 16;
00770             *to |= *from++ << 8;
00771             *to |= *from++;
00772             len -= 3;
00773         }
00774         else if (IS_LCPRV2(*from) && len >= 4)
00775         {
00776             from++;
00777             *to = *from++ << 16;
00778             *to |= *from++ << 8;
00779             *to |= *from++;
00780             len -= 4;
00781         }
00782         else
00783         {                       /* assume ASCII */
00784             *to = (unsigned char) *from++;
00785             len--;
00786         }
00787         to++;
00788         cnt++;
00789     }
00790     *to = 0;
00791     return cnt;
00792 }
00793 
00794 /*
00795  * convert pg_wchar to mule internal code
00796  * caller should allocate enough space for "to"
00797  * len: length of from.
00798  * "from" not necessarily null terminated.
00799  */
00800 static int
00801 pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
00802 {
00803     int         cnt = 0;
00804 
00805     while (len > 0 && *from)
00806     {
00807         unsigned char lb;
00808 
00809         lb = (*from >> 16) & 0xff;
00810         if (IS_LC1(lb))
00811         {
00812             *to++ = lb;
00813             *to++ = *from & 0xff;
00814             cnt += 2;
00815         }
00816         else if (IS_LC2(lb))
00817         {
00818             *to++ = lb;
00819             *to++ = (*from >> 8) & 0xff;
00820             *to++ = *from & 0xff;
00821             cnt += 3;
00822         }
00823         else if (IS_LCPRV1_A_RANGE(lb))
00824         {
00825             *to++ = LCPRV1_A;
00826             *to++ = lb;
00827             *to++ = *from & 0xff;
00828             cnt += 3;
00829         }
00830         else if (IS_LCPRV1_B_RANGE(lb))
00831         {
00832             *to++ = LCPRV1_B;
00833             *to++ = lb;
00834             *to++ = *from & 0xff;
00835             cnt += 3;
00836         }
00837         else if (IS_LCPRV2_A_RANGE(lb))
00838         {
00839             *to++ = LCPRV2_A;
00840             *to++ = lb;
00841             *to++ = (*from >> 8) & 0xff;
00842             *to++ = *from & 0xff;
00843             cnt += 4;
00844         }
00845         else if (IS_LCPRV2_B_RANGE(lb))
00846         {
00847             *to++ = LCPRV2_B;
00848             *to++ = lb;
00849             *to++ = (*from >> 8) & 0xff;
00850             *to++ = *from & 0xff;
00851             cnt += 4;
00852         }
00853         else
00854         {
00855             *to++ = *from & 0xff;
00856             cnt += 1;
00857         }
00858         from++;
00859         len--;
00860     }
00861     *to = 0;
00862     return cnt;
00863 }
00864 
/*
 * Byte length of a mule internal code character, decided from its first
 * byte: LC1 -> 2, LCPRV1 -> 3, LC2 -> 3, LCPRV2 -> 4, anything else -> 1.
 */
int
pg_mule_mblen(const unsigned char *s)
{
    const unsigned char lead = *s;

    if (IS_LC1(lead))
        return 2;
    if (IS_LCPRV1(lead))
        return 3;
    if (IS_LC2(lead))
        return 3;
    if (IS_LCPRV2(lead))
        return 4;

    return 1;                   /* assume ASCII */
}
00882 
/*
 * Display width of a mule internal code character.
 *
 * Note: it's not really appropriate to assume that all multibyte charsets
 * are double-wide on screen.  But this seems an okay approximation for
 * the MULE charsets we currently support.
 */
static int
pg_mule_dsplen(const unsigned char *s)
{
    const unsigned char lead = *s;

    /* one-byte charsets (official or private) are single width */
    if (IS_LC1(lead) || IS_LCPRV1(lead))
        return 1;

    /* two-byte charsets (official or private) are double width */
    if (IS_LC2(lead) || IS_LCPRV2(lead))
        return 2;

    return 1;                   /* assume ASCII */
}
00907 
00908 /*
00909  * ISO8859-1
00910  */
00911 static int
00912 pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
00913 {
00914     int         cnt = 0;
00915 
00916     while (len > 0 && *from)
00917     {
00918         *to++ = *from++;
00919         len--;
00920         cnt++;
00921     }
00922     *to = 0;
00923     return cnt;
00924 }
00925 
00926 /*
00927  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
00928  * high bits.
00929  * caller should allocate enough space for "to"
00930  * len: length of from.
00931  * "from" not necessarily null terminated.
00932  */
00933 static int
00934 pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
00935 {
00936     int         cnt = 0;
00937 
00938     while (len > 0 && *from)
00939     {
00940         *to++ = *from++;
00941         len--;
00942         cnt++;
00943     }
00944     *to = 0;
00945     return cnt;
00946 }
00947 
/* ISO8859-1 byte length: every character is exactly one byte. */
static int
pg_latin1_mblen(const unsigned char *s)
{
    (void) s;                   /* the answer does not depend on the input */
    return 1;
}
00953 
/* ISO8859-1 display width: uses the ASCII rules for every byte. */
static int
pg_latin1_dsplen(const unsigned char *s)
{
    int         width = pg_ascii_dsplen(s);

    return width;
}
00959 
00960 /*
00961  * SJIS
00962  */
/*
 * SJIS byte length: bytes 0xa1..0xdf are one-byte characters (1 byte
 * kana?), any other high-bit byte starts a two-byte character (kanji?),
 * and everything else should be ASCII.
 */
static int
pg_sjis_mblen(const unsigned char *s)
{
    if (*s >= 0xa1 && *s <= 0xdf)
        return 1;

    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
00976 
/*
 * SJIS display width: 1 for bytes 0xa1..0xdf (1 byte kana?), 2 for any
 * other high-bit lead byte (kanji?), ASCII rules otherwise.
 */
static int
pg_sjis_dsplen(const unsigned char *s)
{
    if (*s >= 0xa1 && *s <= 0xdf)
        return 1;
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);  /* should be ASCII */
}
00990 
00991 /*
00992  * Big5
00993  */
/* Big5 byte length: 2 for a high-bit lead byte, otherwise 1 (ASCII). */
static int
pg_big5_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
01005 
/* Big5 display width: 2 for a high-bit lead byte, ASCII rules otherwise. */
static int
pg_big5_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);  /* should be ASCII */
}
01017 
01018 /*
01019  * GBK
01020  */
/* GBK byte length: 2 for a high-bit lead byte, otherwise 1 (ASCII). */
static int
pg_gbk_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
01032 
/* GBK display width: 2 for a high-bit lead byte, ASCII rules otherwise. */
static int
pg_gbk_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);  /* should be ASCII */
}
01044 
01045 /*
01046  * UHC
01047  */
/* UHC byte length: 2 for a high-bit lead byte, otherwise 1 (ASCII). */
static int
pg_uhc_mblen(const unsigned char *s)
{
    return IS_HIGHBIT_SET(*s) ? 2 : 1;
}
01059 
/* UHC display width: 2 for a high-bit lead byte, ASCII rules otherwise. */
static int
pg_uhc_dsplen(const unsigned char *s)
{
    if (IS_HIGHBIT_SET(*s))
        return 2;

    return pg_ascii_dsplen(s);  /* should be ASCII */
}
01071 
/*
 * GB18030
 *  Added by Bill Huang <[email protected]>,<[email protected]>
 */
/*
 * GB18030 byte length.
 *
 * A byte without the high bit is ASCII (length 1).  Otherwise the second
 * byte decides: 0x30..0x39 marks a four-byte character, and any other
 * value is treated as the second byte of a two-byte character.  Note that
 * s[1] is read whenever the first byte has its high bit set.
 */
static int
pg_gb18030_mblen(const unsigned char *s)
{
    if (!IS_HIGHBIT_SET(*s))
        return 1;               /* ASCII */

    if (s[1] >= 0x30 && s[1] <= 0x39)
        return 4;

    return 2;
}
01094 
/*
 * Display width of the GB18030 character starting at *s: two columns when
 * the first byte has its high bit set; otherwise the answer is delegated
 * to pg_ascii_dsplen().
 */
static int
pg_gb18030_dsplen(const unsigned char *s)
{
    if (!IS_HIGHBIT_SET(*s))
        return pg_ascii_dsplen(s);      /* ASCII */

    return 2;
}
01106 
01107 /*
01108  *-------------------------------------------------------------------
01109  * multibyte sequence validators
01110  *
01111  * These functions accept "s", a pointer to the first byte of a string,
01112  * and "len", the remaining length of the string.  If there is a validly
01113  * encoded character beginning at *s, return its length in bytes; else
01114  * return -1.
01115  *
01116  * The functions can assume that len > 0 and that *s != '\0', but they must
01117  * test for and reject zeroes in any additional bytes of a multibyte character.
01118  *
01119  * Note that this definition allows the function for a single-byte
01120  * encoding to be just "return 1".
01121  *-------------------------------------------------------------------
01122  */
01123 
/*
 * Verifier for SQL_ASCII: every character is reported as one byte long.
 * Neither argument is inspected.
 */
static int
pg_ascii_verifier(const unsigned char *s, int len)
{
    (void) s;
    (void) len;

    return 1;
}
01129 
01130 #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
01131 
01132 static int
01133 pg_eucjp_verifier(const unsigned char *s, int len)
01134 {
01135     int         l;
01136     unsigned char c1,
01137                 c2;
01138 
01139     c1 = *s++;
01140 
01141     switch (c1)
01142     {
01143         case SS2:               /* JIS X 0201 */
01144             l = 2;
01145             if (l > len)
01146                 return -1;
01147             c2 = *s++;
01148             if (c2 < 0xa1 || c2 > 0xdf)
01149                 return -1;
01150             break;
01151 
01152         case SS3:               /* JIS X 0212 */
01153             l = 3;
01154             if (l > len)
01155                 return -1;
01156             c2 = *s++;
01157             if (!IS_EUC_RANGE_VALID(c2))
01158                 return -1;
01159             c2 = *s++;
01160             if (!IS_EUC_RANGE_VALID(c2))
01161                 return -1;
01162             break;
01163 
01164         default:
01165             if (IS_HIGHBIT_SET(c1))     /* JIS X 0208? */
01166             {
01167                 l = 2;
01168                 if (l > len)
01169                     return -1;
01170                 if (!IS_EUC_RANGE_VALID(c1))
01171                     return -1;
01172                 c2 = *s++;
01173                 if (!IS_EUC_RANGE_VALID(c2))
01174                     return -1;
01175             }
01176             else
01177                 /* must be ASCII */
01178             {
01179                 l = 1;
01180             }
01181             break;
01182     }
01183 
01184     return l;
01185 }
01186 
/*
 * Verify the EUC_KR character starting at *s, with "len" bytes remaining.
 * A byte without the high bit is a one-byte ASCII character; otherwise the
 * character is two bytes long and both bytes must lie in 0xa1..0xfe.
 * Returns the length in bytes, or -1 if the character is invalid.
 */
static int
pg_euckr_verifier(const unsigned char *s, int len)
{
    unsigned char lead = s[0];

    if (!IS_HIGHBIT_SET(lead))
        return 1;               /* must be ASCII */

    if (len < 2)
        return -1;
    if (!IS_EUC_RANGE_VALID(lead) || !IS_EUC_RANGE_VALID(s[1]))
        return -1;
    return 2;
}

/* EUC-CN byte sequences are exactly the same as EUC-KR */
#define pg_euccn_verifier   pg_euckr_verifier
01218 
01219 static int
01220 pg_euctw_verifier(const unsigned char *s, int len)
01221 {
01222     int         l;
01223     unsigned char c1,
01224                 c2;
01225 
01226     c1 = *s++;
01227 
01228     switch (c1)
01229     {
01230         case SS2:               /* CNS 11643 Plane 1-7 */
01231             l = 4;
01232             if (l > len)
01233                 return -1;
01234             c2 = *s++;
01235             if (c2 < 0xa1 || c2 > 0xa7)
01236                 return -1;
01237             c2 = *s++;
01238             if (!IS_EUC_RANGE_VALID(c2))
01239                 return -1;
01240             c2 = *s++;
01241             if (!IS_EUC_RANGE_VALID(c2))
01242                 return -1;
01243             break;
01244 
01245         case SS3:               /* unused */
01246             return -1;
01247 
01248         default:
01249             if (IS_HIGHBIT_SET(c1))     /* CNS 11643 Plane 1 */
01250             {
01251                 l = 2;
01252                 if (l > len)
01253                     return -1;
01254                 /* no further range check on c1? */
01255                 c2 = *s++;
01256                 if (!IS_EUC_RANGE_VALID(c2))
01257                     return -1;
01258             }
01259             else
01260                 /* must be ASCII */
01261             {
01262                 l = 1;
01263             }
01264             break;
01265     }
01266     return l;
01267 }
01268 
/*
 * Verify the JOHAB character starting at *s, with "len" bytes remaining.
 * The length comes from pg_johab_mblen().  For a character whose first
 * byte has the high bit set, every following byte must lie in 0xa1..0xfe.
 * Returns the length in bytes, or -1 if the character is truncated or
 * invalid.
 */
static int
pg_johab_verifier(const unsigned char *s, int len)
{
    int         charlen = pg_johab_mblen(s);
    int         i;

    if (charlen > len)
        return -1;

    if (!IS_HIGHBIT_SET(s[0]))
        return charlen;

    for (i = 1; i < charlen; i++)
    {
        if (!IS_EUC_RANGE_VALID(s[i]))
            return -1;
    }
    return charlen;
}
01292 
/*
 * Verify the MULE_INTERNAL character starting at *s, with "len" bytes
 * remaining.  The length comes from pg_mule_mblen(); every byte after the
 * first must have its high bit set.  Returns the length in bytes, or -1 if
 * the character is truncated or invalid.
 */
static int
pg_mule_verifier(const unsigned char *s, int len)
{
    int         charlen = pg_mule_mblen(s);
    int         i;

    if (charlen > len)
        return -1;

    for (i = 1; i < charlen; i++)
    {
        if (!IS_HIGHBIT_SET(s[i]))
            return -1;
    }
    return charlen;
}
01313 
/*
 * Verifier used for the single-byte encodings: every character is
 * reported as one byte long.  Neither argument is inspected.
 */
static int
pg_latin1_verifier(const unsigned char *s, int len)
{
    (void) s;
    (void) len;

    return 1;
}
01319 
/*
 * Verify the SJIS character starting at *s, with "len" bytes remaining.
 * The length comes from pg_sjis_mblen().  A multi-byte character must
 * start with a byte accepted by ISSJISHEAD and continue with one accepted
 * by ISSJISTAIL.  Returns the length in bytes, or -1 if the character is
 * truncated or invalid.
 */
static int
pg_sjis_verifier(const unsigned char *s, int len)
{
    int         charlen = pg_sjis_mblen(s);

    if (charlen > len)
        return -1;

    if (charlen == 1)           /* pg_sjis_mblen already verified it */
        return charlen;

    if (ISSJISHEAD(s[0]) && ISSJISTAIL(s[1]))
        return charlen;

    return -1;
}
01342 
/*
 * Verify the BIG5 character starting at *s, with "len" bytes remaining.
 * The length comes from pg_big5_mblen(); the only extra check is that no
 * byte after the first is a zero byte.  Returns the length in bytes, or -1
 * if the character is truncated or contains an embedded zero.
 */
static int
pg_big5_verifier(const unsigned char *s, int len)
{
    int         charlen = pg_big5_mblen(s);
    int         i;

    if (charlen > len)
        return -1;

    for (i = 1; i < charlen; i++)
    {
        if (s[i] == '\0')
            return -1;
    }

    return charlen;
}
01362 
/*
 * Verify the GBK character starting at *s, with "len" bytes remaining.
 * The length comes from pg_gbk_mblen(); the only extra check is that no
 * byte after the first is a zero byte.  Returns the length in bytes, or -1
 * if the character is truncated or contains an embedded zero.
 */
static int
pg_gbk_verifier(const unsigned char *s, int len)
{
    int         charlen = pg_gbk_mblen(s);
    int         i;

    if (charlen > len)
        return -1;

    for (i = 1; i < charlen; i++)
    {
        if (s[i] == '\0')
            return -1;
    }

    return charlen;
}
01382 
/*
 * Verify the UHC character starting at *s, with "len" bytes remaining.
 * The length comes from pg_uhc_mblen(); the only extra check is that no
 * byte after the first is a zero byte.  Returns the length in bytes, or -1
 * if the character is truncated or contains an embedded zero.
 */
static int
pg_uhc_verifier(const unsigned char *s, int len)
{
    int         charlen = pg_uhc_mblen(s);
    int         i;

    if (charlen > len)
        return -1;

    for (i = 1; i < charlen; i++)
    {
        if (s[i] == '\0')
            return -1;
    }

    return charlen;
}
01402 
/*
 * Verify the GB18030 character starting at *s, with "len" bytes remaining.
 * Returns the character's length in bytes, or -1 if it is truncated or
 * contains an embedded zero byte.
 */
static int
pg_gb18030_verifier(const unsigned char *s, int len)
{
    int         l,
                mbl;

    /*
     * pg_gb18030_mblen() inspects s[1] whenever the first byte has its high
     * bit set, so make sure that byte is inside the buffer before calling
     * it.  Such a character is at least two bytes long, so fewer than two
     * remaining bytes is a truncated character in any case.
     */
    if (IS_HIGHBIT_SET(*s) && len < 2)
        return -1;

    l = mbl = pg_gb18030_mblen(s);

    if (len < l)
        return -1;

    /* reject zero bytes in the trailing bytes of the character */
    while (--l > 0)
    {
        if (*++s == '\0')
            return -1;
    }

    return mbl;
}
01422 
/*
 * Verify the UTF-8 character starting at *s, with "len" bytes remaining.
 * The length comes from pg_utf_mblen(); the bytes are then checked with
 * pg_utf8_islegal().  Returns the length in bytes, or -1 if the character
 * is truncated or not legal.
 */
static int
pg_utf8_verifier(const unsigned char *s, int len)
{
    int         charlen = pg_utf_mblen(s);

    if (charlen <= len && pg_utf8_islegal(s, charlen))
        return charlen;

    return -1;
}
01436 
/*
 * Check for validity of a single UTF-8 encoded character
 *
 * This directly implements the rules in RFC3629.  The restrictions on the
 * second byte, which depend on the first byte, ensure that there is only
 * one encoding of a given Unicode code point: a longer-than-necessary byte
 * sequence with high-order zero bits may not be used for a character that
 * would fit in fewer bytes.  Allowing that would create security hazards
 * (eg, an apparent non-ASCII character that decodes to plain ASCII).
 *
 * length is assumed to have been obtained by pg_utf_mblen(), and the
 * caller must have checked that that many bytes are present in the buffer.
 * Lengths other than 1..4 are rejected (5 and 6 are rejected for now).
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
    unsigned char lead;
    unsigned char lo;
    unsigned char hi;
    int         i;

    if (length < 1 || length > 4)
        return false;

    /* third and fourth bytes, last one first: plain continuation bytes */
    for (i = length - 1; i >= 2; i--)
    {
        if (source[i] < 0x80 || source[i] > 0xBF)
            return false;
    }

    lead = source[0];

    if (length >= 2)
    {
        /* allowed range of the second byte depends on the first byte */
        lo = 0x80;
        hi = 0xBF;
        switch (lead)
        {
            case 0xE0:
                lo = 0xA0;
                break;
            case 0xED:
                hi = 0x9F;
                break;
            case 0xF0:
                lo = 0x90;
                break;
            case 0xF4:
                hi = 0x8F;
                break;
            default:
                break;
        }
        if (source[1] < lo || source[1] > hi)
            return false;
    }

    /* finally the first byte itself */
    if (lead >= 0x80 && lead < 0xC2)
        return false;
    if (lead > 0xF4)
        return false;

    return true;
}
01507 
01508 #ifndef FRONTEND
01509 
01510 /*
01511  * Generic character incrementer function.
01512  *
01513  * Not knowing anything about the properties of the encoding in use, we just
01514  * keep incrementing the last byte until we get a validly-encoded result,
01515  * or we run out of values to try.  We don't bother to try incrementing
01516  * higher-order bytes, so there's no growth in runtime for wider characters.
01517  * (If we did try to do that, we'd need to consider the likelihood that 255
01518  * is not a valid final byte in the encoding.)
01519  */
01520 static bool
01521 pg_generic_charinc(unsigned char *charptr, int len)
01522 {
01523     unsigned char *lastbyte = charptr + len - 1;
01524     mbverifier  mbverify;
01525 
01526     /* We can just invoke the character verifier directly. */
01527     mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
01528 
01529     while (*lastbyte < (unsigned char) 255)
01530     {
01531         (*lastbyte)++;
01532         if ((*mbverify) (charptr, len) == len)
01533             return true;
01534     }
01535 
01536     return false;
01537 }
01538 
/*
 * UTF-8 character incrementer function.
 *
 * For a one-byte character less than 0x7F, we just increment the byte.
 *
 * For a multibyte character, every byte but the first must fall between 0x80
 * and 0xBF; and the first byte must be between 0xC0 and 0xF4.  We increment
 * the last byte that's not already at its maximum value.  If we can't find a
 * byte that's less than the maximum allowable value, we simply fail.  The
 * second byte has a lower maximum after a first byte of 0xED (0x9F, which
 * skips the region used for surrogate pair handling, as those should not
 * occur in valid UTF-8) or 0xF4 (0x8F).
 *
 * Note that we don't reset lower-order bytes back to their minimums, since
 * we can't afford to make an exhaustive search (see make_greater_string).
 *
 * Returns true if a byte was incremented, false otherwise.  Lengths other
 * than 1..4 are rejected (5 and 6 are rejected for now).
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
    unsigned char lead;
    unsigned char ceiling;
    int         pos;

    if (length < 1 || length > 4)
        return false;

    /* fourth and third bytes: bump the last one that is below 0xBF */
    for (pos = length - 1; pos > 1; pos--)
    {
        if (charptr[pos] < 0xBF)
        {
            charptr[pos]++;
            return true;
        }
    }

    /* second byte: its maximum depends on the first byte */
    if (length > 1)
    {
        switch (charptr[0])
        {
            case 0xED:
                ceiling = 0x9F;
                break;
            case 0xF4:
                ceiling = 0x8F;
                break;
            default:
                ceiling = 0xBF;
                break;
        }
        if (charptr[1] < ceiling)
        {
            charptr[1]++;
            return true;
        }
    }

    /* first byte: these values have no usable successor */
    lead = charptr[0];
    if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
        return false;
    charptr[0]++;

    return true;
}
01611 
01612 /*
01613  * EUC-JP character incrementer function.
01614  *
01615  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
01616  * representing JIS X 0201 characters with the second byte ranging between
01617  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
01618  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
01619  *
01620  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
01621  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
01622  * is incremented if possible, otherwise the second-to-last byte.
01623  *
01624  * If the sequence starts with a value other than the above and its MSB
01625  * is set, it must be a two-byte sequence representing JIS X 0208 characters
01626  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
01627  * incremented if possible, otherwise the second-to-last byte.
01628  *
01629  * Otherwise, the sequence is a single-byte ASCII character. It is
01630  * incremented up to 0x7f.
01631  */
01632 static bool
01633 pg_eucjp_increment(unsigned char *charptr, int length)
01634 {
01635     unsigned char c1,
01636                 c2;
01637     int         i;
01638 
01639     c1 = *charptr;
01640 
01641     switch (c1)
01642     {
01643         case SS2:               /* JIS X 0201 */
01644             if (length != 2)
01645                 return false;
01646 
01647             c2 = charptr[1];
01648 
01649             if (c2 >= 0xdf)
01650                 charptr[0] = charptr[1] = 0xa1;
01651             else if (c2 < 0xa1)
01652                 charptr[1] = 0xa1;
01653             else
01654                 charptr[1]++;
01655             break;
01656 
01657         case SS3:               /* JIS X 0212 */
01658             if (length != 3)
01659                 return false;
01660 
01661             for (i = 2; i > 0; i--)
01662             {
01663                 c2 = charptr[i];
01664                 if (c2 < 0xa1)
01665                 {
01666                     charptr[i] = 0xa1;
01667                     return true;
01668                 }
01669                 else if (c2 < 0xfe)
01670                 {
01671                     charptr[i]++;
01672                     return true;
01673                 }
01674             }
01675 
01676             /* Out of 3-byte code region */
01677             return false;
01678 
01679         default:
01680             if (IS_HIGHBIT_SET(c1))     /* JIS X 0208? */
01681             {
01682                 if (length != 2)
01683                     return false;
01684 
01685                 for (i = 1; i >= 0; i--)
01686                 {
01687                     c2 = charptr[i];
01688                     if (c2 < 0xa1)
01689                     {
01690                         charptr[i] = 0xa1;
01691                         return true;
01692                     }
01693                     else if (c2 < 0xfe)
01694                     {
01695                         charptr[i]++;
01696                         return true;
01697                     }
01698                 }
01699 
01700                 /* Out of 2 byte code region */
01701                 return false;
01702             }
01703             else
01704             {                   /* ASCII, single byte */
01705                 if (c1 > 0x7e)
01706                     return false;
01707                 (*charptr)++;
01708             }
01709             break;
01710     }
01711 
01712     return true;
01713 }
01714 #endif   /* !FRONTEND */
01715 
01716 
01717 /*
01718  *-------------------------------------------------------------------
01719  * encoding info table
01720  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
01721  *-------------------------------------------------------------------
01722  */
01723 pg_wchar_tbl pg_wchar_table[] = {
01724     {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
01725     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},    /* PG_EUC_JP */
01726     {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2},    /* PG_EUC_CN */
01727     {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},    /* PG_EUC_KR */
01728     {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4},    /* PG_EUC_TW */
01729     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},    /* PG_EUC_JIS_2004 */
01730     {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4},   /* PG_UTF8 */
01731     {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4},       /* PG_MULE_INTERNAL */
01732     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN1 */
01733     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN2 */
01734     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN3 */
01735     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN4 */
01736     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN5 */
01737     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN6 */
01738     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN7 */
01739     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN8 */
01740     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN9 */
01741     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_LATIN10 */
01742     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN1256 */
01743     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN1258 */
01744     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN866 */
01745     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN874 */
01746     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_KOI8R */
01747     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN1251 */
01748     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN1252 */
01749     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* ISO-8859-5 */
01750     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* ISO-8859-6 */
01751     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* ISO-8859-7 */
01752     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* ISO-8859-8 */
01753     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN1250 */
01754     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN1253 */
01755     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN1254 */
01756     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN1255 */
01757     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_WIN1257 */
01758     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},     /* PG_KOI8U */
01759     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
01760     {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
01761     {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},        /* PG_GBK */
01762     {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},        /* PG_UHC */
01763     {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4},    /* PG_GB18030 */
01764     {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3}, /* PG_JOHAB */
01765     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}      /* PG_SHIFT_JIS_2004 */
01766 };
01767 
/*
 * Returns the byte length of a word for mule internal code; this is a
 * public wrapper around pg_mule_mblen().
 */
int
pg_mic_mblen(const unsigned char *mbstr)
{
    int         charlen = pg_mule_mblen(mbstr);

    return charlen;
}
01774 
01775 /*
01776  * Returns the byte length of a multibyte character.
01777  */
01778 int
01779 pg_encoding_mblen(int encoding, const char *mbstr)
01780 {
01781     Assert(PG_VALID_ENCODING(encoding));
01782 
01783     return ((encoding >= 0 &&
01784              encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
01785         ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) :
01786     ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr)));
01787 }
01788 
01789 /*
01790  * Returns the display length of a multibyte character.
01791  */
01792 int
01793 pg_encoding_dsplen(int encoding, const char *mbstr)
01794 {
01795     Assert(PG_VALID_ENCODING(encoding));
01796 
01797     return ((encoding >= 0 &&
01798              encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
01799        ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) :
01800     ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr)));
01801 }
01802 
01803 /*
01804  * Verify the first multibyte character of the given string.
01805  * Return its byte length if good, -1 if bad.  (See comments above for
01806  * full details of the mbverify API.)
01807  */
01808 int
01809 pg_encoding_verifymb(int encoding, const char *mbstr, int len)
01810 {
01811     Assert(PG_VALID_ENCODING(encoding));
01812 
01813     return ((encoding >= 0 &&
01814              encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
01815             ((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) :
01816             ((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len)));
01817 }
01818 
01819 /*
01820  * fetch maximum length of a given encoding
01821  */
01822 int
01823 pg_encoding_max_length(int encoding)
01824 {
01825     Assert(PG_VALID_ENCODING(encoding));
01826 
01827     return pg_wchar_table[encoding].maxmblen;
01828 }
01829 
01830 #ifndef FRONTEND
01831 
01832 /*
01833  * fetch maximum length of the encoding for the current database
01834  */
01835 int
01836 pg_database_encoding_max_length(void)
01837 {
01838     return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
01839 }
01840 
01841 /*
01842  * get the character incrementer for the encoding for the current database
01843  */
01844 mbcharacter_incrementer
01845 pg_database_encoding_character_incrementer(void)
01846 {
01847     /*
01848      * Eventually it might be best to add a field to pg_wchar_table[], but for
01849      * now we just use a switch.
01850      */
01851     switch (GetDatabaseEncoding())
01852     {
01853         case PG_UTF8:
01854             return pg_utf8_increment;
01855 
01856         case PG_EUC_JP:
01857             return pg_eucjp_increment;
01858 
01859         default:
01860             return pg_generic_charinc;
01861     }
01862 }
01863 
01864 /*
01865  * Verify mbstr to make sure that it is validly encoded in the current
01866  * database encoding.  Otherwise same as pg_verify_mbstr().
01867  */
01868 bool
01869 pg_verifymbstr(const char *mbstr, int len, bool noError)
01870 {
01871     return
01872         pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
01873 }
01874 
01875 /*
01876  * Verify mbstr to make sure that it is validly encoded in the specified
01877  * encoding.
01878  */
01879 bool
01880 pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
01881 {
01882     return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
01883 }
01884 
01885 /*
01886  * Verify mbstr to make sure that it is validly encoded in the specified
01887  * encoding.
01888  *
01889  * mbstr is not necessarily zero terminated; length of mbstr is
01890  * specified by len.
01891  *
01892  * If OK, return length of string in the encoding.
01893  * If a problem is found, return -1 when noError is
01894  * true; when noError is false, ereport() a descriptive message.
01895  */
01896 int
01897 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
01898 {
01899     mbverifier  mbverify;
01900     int         mb_len;
01901 
01902     Assert(PG_VALID_ENCODING(encoding));
01903 
01904     /*
01905      * In single-byte encodings, we need only reject nulls (\0).
01906      */
01907     if (pg_encoding_max_length(encoding) <= 1)
01908     {
01909         const char *nullpos = memchr(mbstr, 0, len);
01910 
01911         if (nullpos == NULL)
01912             return len;
01913         if (noError)
01914             return -1;
01915         report_invalid_encoding(encoding, nullpos, 1);
01916     }
01917 
01918     /* fetch function pointer just once */
01919     mbverify = pg_wchar_table[encoding].mbverify;
01920 
01921     mb_len = 0;
01922 
01923     while (len > 0)
01924     {
01925         int         l;
01926 
01927         /* fast path for ASCII-subset characters */
01928         if (!IS_HIGHBIT_SET(*mbstr))
01929         {
01930             if (*mbstr != '\0')
01931             {
01932                 mb_len++;
01933                 mbstr++;
01934                 len--;
01935                 continue;
01936             }
01937             if (noError)
01938                 return -1;
01939             report_invalid_encoding(encoding, mbstr, len);
01940         }
01941 
01942         l = (*mbverify) ((const unsigned char *) mbstr, len);
01943 
01944         if (l < 0)
01945         {
01946             if (noError)
01947                 return -1;
01948             report_invalid_encoding(encoding, mbstr, len);
01949         }
01950 
01951         mbstr += l;
01952         len -= l;
01953         mb_len++;
01954     }
01955     return mb_len;
01956 }
01957 
01958 /*
01959  * check_encoding_conversion_args: check arguments of a conversion function
01960  *
01961  * "expected" arguments can be either an encoding ID or -1 to indicate that
01962  * the caller will check whether it accepts the ID.
01963  *
01964  * Note: the errors here are not really user-facing, so elog instead of
01965  * ereport seems sufficient.  Also, we trust that the "expected" encoding
01966  * arguments are valid encoding IDs, but we don't trust the actuals.
01967  */
01968 void
01969 check_encoding_conversion_args(int src_encoding,
01970                                int dest_encoding,
01971                                int len,
01972                                int expected_src_encoding,
01973                                int expected_dest_encoding)
01974 {
01975     if (!PG_VALID_ENCODING(src_encoding))
01976         elog(ERROR, "invalid source encoding ID: %d", src_encoding);
01977     if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
01978         elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
01979              pg_enc2name_tbl[expected_src_encoding].name,
01980              pg_enc2name_tbl[src_encoding].name);
01981     if (!PG_VALID_ENCODING(dest_encoding))
01982         elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
01983     if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
01984         elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
01985              pg_enc2name_tbl[expected_dest_encoding].name,
01986              pg_enc2name_tbl[dest_encoding].name);
01987     if (len < 0)
01988         elog(ERROR, "encoding conversion length must not be negative");
01989 }
01990 
01991 /*
01992  * report_invalid_encoding: complain about invalid multibyte character
01993  *
01994  * note: len is remaining length of string, not length of character;
01995  * len must be greater than zero, as we always examine the first byte.
01996  */
01997 void
01998 report_invalid_encoding(int encoding, const char *mbstr, int len)
01999 {
02000     int         l = pg_encoding_mblen(encoding, mbstr);
02001     char        buf[8 * 5 + 1];
02002     char       *p = buf;
02003     int         j,
02004                 jlimit;
02005 
02006     jlimit = Min(l, len);
02007     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
02008 
02009     for (j = 0; j < jlimit; j++)
02010     {
02011         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
02012         if (j < jlimit - 1)
02013             p += sprintf(p, " ");
02014     }
02015 
02016     ereport(ERROR,
02017             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
02018              errmsg("invalid byte sequence for encoding \"%s\": %s",
02019                     pg_enc2name_tbl[encoding].name,
02020                     buf)));
02021 }
02022 
02023 /*
02024  * report_untranslatable_char: complain about untranslatable character
02025  *
02026  * note: len is remaining length of string, not length of character;
02027  * len must be greater than zero, as we always examine the first byte.
02028  */
02029 void
02030 report_untranslatable_char(int src_encoding, int dest_encoding,
02031                            const char *mbstr, int len)
02032 {
02033     int         l = pg_encoding_mblen(src_encoding, mbstr);
02034     char        buf[8 * 5 + 1];
02035     char       *p = buf;
02036     int         j,
02037                 jlimit;
02038 
02039     jlimit = Min(l, len);
02040     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
02041 
02042     for (j = 0; j < jlimit; j++)
02043     {
02044         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
02045         if (j < jlimit - 1)
02046             p += sprintf(p, " ");
02047     }
02048 
02049     ereport(ERROR,
02050             (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
02051              errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
02052                     buf,
02053                     pg_enc2name_tbl[src_encoding].name,
02054                     pg_enc2name_tbl[dest_encoding].name)));
02055 }
02056 
02057 #endif   /* !FRONTEND */