Header And Logo

PostgreSQL
| The world's most advanced open source database.

big5.c

Go to the documentation of this file.
00001 /*
00002  * conversion between BIG5 and Mule Internal Code (CNS 11643-1992
00003  * plane 1 and plane 2).
00004  * This program is partially copied from lv(Multilingual file viewer)
00005  * and slightly modified. lv is written and copyrighted by NARITA Tomio
00006  * ([email protected]).
00007  *
00008  * 1999/1/15 Tatsuo Ishii
00009  *
00010  * src/backend/utils/mb/conversion_procs/euc_tw_and_big5/big5.c
00011  */
00012 
00013 /* can be used in either frontend or backend */
00014 #include "postgres_fe.h"
00015 
00016 #include "mb/pg_wchar.h"
00017 
/*
 * One entry of a range-mapping table.
 *
 * "code" is the first code point of a range in the source encoding and
 * "peer" is the code point it corresponds to in the destination encoding.
 * A range runs up to (but does not include) the "code" of the next table
 * entry.  A "peer" of zero marks a range that has no mapping.
 */
typedef struct
{
    unsigned short code;        /* first source code point of the range */
    unsigned short peer;        /* matching destination code, 0 if none */
} codes_t;
00023 
00024 /* map Big5 Level 1 to CNS 11643-1992 Plane 1 */
00025 static codes_t big5Level1ToCnsPlane1[25] = {    /* range */
00026     {0xA140, 0x2121},
00027     {0xA1F6, 0x2258},
00028     {0xA1F7, 0x2257},
00029     {0xA1F8, 0x2259},
00030     {0xA2AF, 0x2421},
00031     {0xA3C0, 0x4221},
00032     {0xa3e1, 0x0000},
00033     {0xA440, 0x4421},
00034     {0xACFE, 0x5753},
00035     {0xacff, 0x0000},
00036     {0xAD40, 0x5323},
00037     {0xAFD0, 0x5754},
00038     {0xBBC8, 0x6B51},
00039     {0xBE52, 0x6B50},
00040     {0xBE53, 0x6F5C},
00041     {0xC1AB, 0x7536},
00042     {0xC2CB, 0x7535},
00043     {0xC2CC, 0x7737},
00044     {0xC361, 0x782E},
00045     {0xC3B9, 0x7865},
00046     {0xC3BA, 0x7864},
00047     {0xC3BB, 0x7866},
00048     {0xC456, 0x782D},
00049     {0xC457, 0x7962},
00050     {0xc67f, 0x0000}
00051 };
00052 
00053 /* map CNS 11643-1992 Plane 1 to Big5 Level 1 */
00054 static codes_t cnsPlane1ToBig5Level1[26] = {    /* range */
00055     {0x2121, 0xA140},
00056     {0x2257, 0xA1F7},
00057     {0x2258, 0xA1F6},
00058     {0x2259, 0xA1F8},
00059     {0x234f, 0x0000},
00060     {0x2421, 0xA2AF},
00061     {0x2571, 0x0000},
00062     {0x4221, 0xA3C0},
00063     {0x4242, 0x0000},
00064     {0x4421, 0xA440},
00065     {0x5323, 0xAD40},
00066     {0x5753, 0xACFE},
00067     {0x5754, 0xAFD0},
00068     {0x6B50, 0xBE52},
00069     {0x6B51, 0xBBC8},
00070     {0x6F5C, 0xBE53},
00071     {0x7535, 0xC2CB},
00072     {0x7536, 0xC1AB},
00073     {0x7737, 0xC2CC},
00074     {0x782D, 0xC456},
00075     {0x782E, 0xC361},
00076     {0x7864, 0xC3BA},
00077     {0x7865, 0xC3B9},
00078     {0x7866, 0xC3BB},
00079     {0x7962, 0xC457},
00080     {0x7d4c, 0x0000}
00081 };
00082 
00083 /* map Big5 Level 2 to CNS 11643-1992 Plane 2 */
00084 static codes_t big5Level2ToCnsPlane2[48] = {    /* range */
00085     {0xC940, 0x2121},
00086     {0xc94a, 0x0000},
00087     {0xC94B, 0x212B},
00088     {0xC96C, 0x214D},
00089     {0xC9BE, 0x214C},
00090     {0xC9BF, 0x217D},
00091     {0xC9ED, 0x224E},
00092     {0xCAF7, 0x224D},
00093     {0xCAF8, 0x2439},
00094     {0xD77A, 0x3F6A},
00095     {0xD77B, 0x387E},
00096     {0xDBA7, 0x3F6B},
00097     {0xDDFC, 0x4176},
00098     {0xDDFD, 0x4424},
00099     {0xE8A3, 0x554C},
00100     {0xE976, 0x5723},
00101     {0xEB5B, 0x5A29},
00102     {0xEBF1, 0x554B},
00103     {0xEBF2, 0x5B3F},
00104     {0xECDE, 0x5722},
00105     {0xECDF, 0x5C6A},
00106     {0xEDAA, 0x5D75},
00107     {0xEEEB, 0x642F},
00108     {0xEEEC, 0x6039},
00109     {0xF056, 0x5D74},
00110     {0xF057, 0x6243},
00111     {0xF0CB, 0x5A28},
00112     {0xF0CC, 0x6337},
00113     {0xF163, 0x6430},
00114     {0xF16B, 0x6761},
00115     {0xF16C, 0x6438},
00116     {0xF268, 0x6934},
00117     {0xF269, 0x6573},
00118     {0xF2C3, 0x664E},
00119     {0xF375, 0x6762},
00120     {0xF466, 0x6935},
00121     {0xF4B5, 0x664D},
00122     {0xF4B6, 0x6962},
00123     {0xF4FD, 0x6A4C},
00124     {0xF663, 0x6A4B},
00125     {0xF664, 0x6C52},
00126     {0xF977, 0x7167},
00127     {0xF9C4, 0x7166},
00128     {0xF9C5, 0x7234},
00129     {0xF9C6, 0x7240},
00130     {0xF9C7, 0x7235},
00131     {0xF9D2, 0x7241},
00132     {0xf9d6, 0x0000}
00133 };
00134 
00135 /* map CNS 11643-1992 Plane 2 to Big5 Level 2 */
00136 static codes_t cnsPlane2ToBig5Level2[49] = {    /* range */
00137     {0x2121, 0xC940},
00138     {0x212B, 0xC94B},
00139     {0x214C, 0xC9BE},
00140     {0x214D, 0xC96C},
00141     {0x217D, 0xC9BF},
00142     {0x224D, 0xCAF7},
00143     {0x224E, 0xC9ED},
00144     {0x2439, 0xCAF8},
00145     {0x387E, 0xD77B},
00146     {0x3F6A, 0xD77A},
00147     {0x3F6B, 0xDBA7},
00148     {0x4424, 0x0000},
00149     {0x4176, 0xDDFC},
00150     {0x4177, 0x0000},
00151     {0x4424, 0xDDFD},
00152     {0x554B, 0xEBF1},
00153     {0x554C, 0xE8A3},
00154     {0x5722, 0xECDE},
00155     {0x5723, 0xE976},
00156     {0x5A28, 0xF0CB},
00157     {0x5A29, 0xEB5B},
00158     {0x5B3F, 0xEBF2},
00159     {0x5C6A, 0xECDF},
00160     {0x5D74, 0xF056},
00161     {0x5D75, 0xEDAA},
00162     {0x6039, 0xEEEC},
00163     {0x6243, 0xF057},
00164     {0x6337, 0xF0CC},
00165     {0x642F, 0xEEEB},
00166     {0x6430, 0xF163},
00167     {0x6438, 0xF16C},
00168     {0x6573, 0xF269},
00169     {0x664D, 0xF4B5},
00170     {0x664E, 0xF2C3},
00171     {0x6761, 0xF16B},
00172     {0x6762, 0xF375},
00173     {0x6934, 0xF268},
00174     {0x6935, 0xF466},
00175     {0x6962, 0xF4B6},
00176     {0x6A4B, 0xF663},
00177     {0x6A4C, 0xF4FD},
00178     {0x6C52, 0xF664},
00179     {0x7166, 0xF9C4},
00180     {0x7167, 0xF977},
00181     {0x7234, 0xF9C5},
00182     {0x7235, 0xF9C7},
00183     {0x7240, 0xF9C6},
00184     {0x7241, 0xF9D2},
00185     {0x7245, 0x0000}
00186 };
00187 
/*
 * Big5 Level 1 codes that correspond to CNS 11643-1992 Plane 4.
 *
 * Each row is {Big5 code, CNS code}.  These are matched one by one,
 * before the Plane 1 range table is consulted.
 */
static unsigned short b1c4[][2] = {
    {0xc879, 0x2123}, {0xc87b, 0x2124},
    {0xc87d, 0x212a}, {0xc8a2, 0x2152}
};
00195 
/*
 * Big5 Level 2 codes that correspond to CNS 11643-1992 Plane 3.
 *
 * Each row is {Big5 code, CNS code}.  These are matched one by one,
 * before the Plane 2 range table is consulted.
 */
static unsigned short b2c3[][2] = {
    {0xf9d6, 0x4337}, {0xf9d7, 0x4f50}, {0xf9d8, 0x444e}, {0xf9d9, 0x504a},
    {0xf9da, 0x2c5d}, {0xf9db, 0x3d7e}, {0xf9dc, 0x4b5c}
};
00206 
00207 static unsigned short BinarySearchRange
00208             (codes_t *array, int high, unsigned short code)
00209 {
00210     int         low,
00211                 mid,
00212                 distance,
00213                 tmp;
00214 
00215     low = 0;
00216     mid = high >> 1;
00217 
00218     for (; low <= high; mid = (low + high) >> 1)
00219     {
00220         if ((array[mid].code <= code) && (array[mid + 1].code > code))
00221         {
00222             if (0 == array[mid].peer)
00223                 return 0;
00224             if (code >= 0xa140U)
00225             {
00226                 /* big5 to cns */
00227                 tmp = ((code & 0xff00) - (array[mid].code & 0xff00)) >> 8;
00228                 high = code & 0x00ff;
00229                 low = array[mid].code & 0x00ff;
00230 
00231                 /*
00232                  * NOTE: big5 high_byte: 0xa1-0xfe, low_byte: 0x40-0x7e,
00233                  * 0xa1-0xfe (radicals: 0x00-0x3e, 0x3f-0x9c) big5 radix is
00234                  * 0x9d.                     [region_low, region_high] We
00235                  * should remember big5 has two different regions (above).
00236                  * There is a bias for the distance between these regions.
00237                  * 0xa1 - 0x7e + bias = 1 (Distance between 0xa1 and 0x7e is
00238                  * 1.) bias = - 0x22.
00239                  */
00240                 distance = tmp * 0x9d + high - low +
00241                     (high >= 0xa1 ? (low >= 0xa1 ? 0 : -0x22)
00242                      : (low >= 0xa1 ? +0x22 : 0));
00243 
00244                 /*
00245                  * NOTE: we have to convert the distance into a code point.
00246                  * The code point's low_byte is 0x21 plus mod_0x5e. In the
00247                  * first, we extract the mod_0x5e of the starting code point,
00248                  * subtracting 0x21, and add distance to it. Then we calculate
00249                  * again mod_0x5e of them, and restore the final codepoint,
00250                  * adding 0x21.
00251                  */
00252                 tmp = (array[mid].peer & 0x00ff) + distance - 0x21;
00253                 tmp = (array[mid].peer & 0xff00) + ((tmp / 0x5e) << 8)
00254                     + 0x21 + tmp % 0x5e;
00255                 return tmp;
00256             }
00257             else
00258             {
00259                 /* cns to big5 */
00260                 tmp = ((code & 0xff00) - (array[mid].code & 0xff00)) >> 8;
00261 
00262                 /*
00263                  * NOTE: ISO charsets ranges between 0x21-0xfe (94charset).
00264                  * Its radix is 0x5e. But there is no distance bias like big5.
00265                  */
00266                 distance = tmp * 0x5e
00267                     + ((int) (code & 0x00ff) - (int) (array[mid].code & 0x00ff));
00268 
00269                 /*
00270                  * NOTE: Similar to big5 to cns conversion, we extract
00271                  * mod_0x9d and restore mod_0x9d into a code point.
00272                  */
00273                 low = array[mid].peer & 0x00ff;
00274                 tmp = low + distance - (low >= 0xa1 ? 0x62 : 0x40);
00275                 low = tmp % 0x9d;
00276                 tmp = (array[mid].peer & 0xff00) + ((tmp / 0x9d) << 8)
00277                     + (low > 0x3e ? 0x62 : 0x40) + low;
00278                 return tmp;
00279             }
00280         }
00281         else if (array[mid].code > code)
00282             high = mid - 1;
00283         else
00284             low = mid + 1;
00285     }
00286 
00287     return 0;
00288 }
00289 
00290 
00291 unsigned short
00292 BIG5toCNS(unsigned short big5, unsigned char *lc)
00293 {
00294     unsigned short cns = 0;
00295     int         i;
00296 
00297     if (big5 < 0xc940U)
00298     {
00299         /* level 1 */
00300 
00301         for (i = 0; i < sizeof(b1c4) / (sizeof(unsigned short) * 2); i++)
00302         {
00303             if (b1c4[i][0] == big5)
00304             {
00305                 *lc = LC_CNS11643_4;
00306                 return (b1c4[i][1] | 0x8080U);
00307             }
00308         }
00309 
00310         if (0 < (cns = BinarySearchRange(big5Level1ToCnsPlane1, 23, big5)))
00311             *lc = LC_CNS11643_1;
00312     }
00313     else if (big5 == 0xc94aU)
00314     {
00315         /* level 2 */
00316         *lc = LC_CNS11643_1;
00317         cns = 0x4442;
00318     }
00319     else
00320     {
00321         /* level 2 */
00322         for (i = 0; i < sizeof(b2c3) / (sizeof(unsigned short) * 2); i++)
00323         {
00324             if (b2c3[i][0] == big5)
00325             {
00326                 *lc = LC_CNS11643_3;
00327                 return (b2c3[i][1] | 0x8080U);
00328             }
00329         }
00330 
00331         if (0 < (cns = BinarySearchRange(big5Level2ToCnsPlane2, 46, big5)))
00332             *lc = LC_CNS11643_2;
00333     }
00334 
00335     if (0 == cns)
00336     {                           /* no mapping Big5 to CNS 11643-1992 */
00337         *lc = 0;
00338         return (unsigned short) '?';
00339     }
00340 
00341     return cns | 0x8080;
00342 }
00343 
00344 unsigned short
00345 CNStoBIG5(unsigned short cns, unsigned char lc)
00346 {
00347     int         i;
00348     unsigned int big5 = 0;
00349 
00350     cns &= 0x7f7f;
00351 
00352     switch (lc)
00353     {
00354         case LC_CNS11643_1:
00355             big5 = BinarySearchRange(cnsPlane1ToBig5Level1, 24, cns);
00356             break;
00357         case LC_CNS11643_2:
00358             big5 = BinarySearchRange(cnsPlane2ToBig5Level2, 47, cns);
00359             break;
00360         case LC_CNS11643_3:
00361             for (i = 0; i < sizeof(b2c3) / (sizeof(unsigned short) * 2); i++)
00362             {
00363                 if (b2c3[i][1] == cns)
00364                     return (b2c3[i][0]);
00365             }
00366             break;
00367         case LC_CNS11643_4:
00368             for (i = 0; i < sizeof(b1c4) / (sizeof(unsigned short) * 2); i++)
00369             {
00370                 if (b1c4[i][1] == cns)
00371                     return (b1c4[i][0]);
00372             }
00373         default:
00374             break;
00375     }
00376     return big5;
00377 }