Header And Logo

PostgreSQL
| The world's most advanced open source database.

conv.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  *    Utility functions for conversion procs.
00004  *
00005  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00006  * Portions Copyright (c) 1994, Regents of the University of California
00007  *
00008  * IDENTIFICATION
00009  *    src/backend/utils/mb/conv.c
00010  *
00011  *-------------------------------------------------------------------------
00012  */
00013 #include "postgres.h"
00014 #include "mb/pg_wchar.h"
00015 
00016 
/*
 * latin2mic: convert a single-byte LATINn string to MULE internal code
 * when the local codes map directly onto MIC.
 *
 * l        - source string, len bytes long
 * p        - output area (the caller must make it large enough)
 * lc       - mule character set id written in front of every
 *            high-bit-set byte
 * encoding - PG identifier of the local encoding, used only when
 *            reporting an invalid (zero) byte
 *
 * Bytes without the high bit are copied unchanged; a high-bit-set byte
 * becomes the two-byte sequence (lc, byte).  The output is always
 * terminated with a '\0'.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
          int lc, int encoding)
{
    for (; len > 0; l++, len--)
    {
        int         ch = *l;

        /* an embedded zero byte is never valid input */
        if (ch == 0)
            report_invalid_encoding(encoding, (const char *) l, len);

        /* non-ASCII bytes get the charset id as a leading byte */
        if (IS_HIGHBIT_SET(ch))
            *p++ = lc;
        *p++ = ch;
    }
    *p = '\0';
}
00044 
00045 /*
00046  * MIC ---> LATINn when the charset's local codes map directly to MIC
00047  *
00048  * mic points to the source string of length len
00049  * p is the output area (must be large enough!)
00050  * lc is the mule character set id for the local encoding
00051  * encoding is the PG identifier for the local encoding
00052  */
00053 void
00054 mic2latin(const unsigned char *mic, unsigned char *p, int len,
00055           int lc, int encoding)
00056 {
00057     int         c1;
00058 
00059     while (len > 0)
00060     {
00061         c1 = *mic;
00062         if (c1 == 0)
00063             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
00064         if (!IS_HIGHBIT_SET(c1))
00065         {
00066             /* easy for ASCII */
00067             *p++ = c1;
00068             mic++;
00069             len--;
00070         }
00071         else
00072         {
00073             int         l = pg_mic_mblen(mic);
00074 
00075             if (len < l)
00076                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
00077                                         len);
00078             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
00079                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
00080                                            (const char *) mic, len);
00081             *p++ = mic[1];
00082             mic += 2;
00083             len -= 2;
00084         }
00085     }
00086     *p = '\0';
00087 }
00088 
00089 
00090 /*
00091  * ASCII ---> MIC
00092  *
00093  * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
00094  * characters, here we must take a hard line because we don't know
00095  * the appropriate MIC equivalent.
00096  */
00097 void
00098 pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
00099 {
00100     int         c1;
00101 
00102     while (len > 0)
00103     {
00104         c1 = *l;
00105         if (c1 == 0 || IS_HIGHBIT_SET(c1))
00106             report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
00107         *p++ = c1;
00108         l++;
00109         len--;
00110     }
00111     *p = '\0';
00112 }
00113 
00114 /*
00115  * MIC ---> ASCII
00116  */
00117 void
00118 pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
00119 {
00120     int         c1;
00121 
00122     while (len > 0)
00123     {
00124         c1 = *mic;
00125         if (c1 == 0 || IS_HIGHBIT_SET(c1))
00126             report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
00127                                        (const char *) mic, len);
00128         *p++ = c1;
00129         mic++;
00130         len--;
00131     }
00132     *p = '\0';
00133 }
00134 
00135 /*
00136  * latin2mic_with_table: a generic single byte charset encoding
00137  * conversion from a local charset to the mule internal code.
00138  *
00139  * l points to the source string of length len
00140  * p is the output area (must be large enough!)
00141  * lc is the mule character set id for the local encoding
00142  * encoding is the PG identifier for the local encoding
00143  * tab holds conversion entries for the local charset
00144  * starting from 128 (0x80). each entry in the table
00145  * holds the corresponding code point for the mule internal code.
00146  */
00147 void
00148 latin2mic_with_table(const unsigned char *l,
00149                      unsigned char *p,
00150                      int len,
00151                      int lc,
00152                      int encoding,
00153                      const unsigned char *tab)
00154 {
00155     unsigned char c1,
00156                 c2;
00157 
00158     while (len > 0)
00159     {
00160         c1 = *l;
00161         if (c1 == 0)
00162             report_invalid_encoding(encoding, (const char *) l, len);
00163         if (!IS_HIGHBIT_SET(c1))
00164             *p++ = c1;
00165         else
00166         {
00167             c2 = tab[c1 - HIGHBIT];
00168             if (c2)
00169             {
00170                 *p++ = lc;
00171                 *p++ = c2;
00172             }
00173             else
00174                 report_untranslatable_char(encoding, PG_MULE_INTERNAL,
00175                                            (const char *) l, len);
00176         }
00177         l++;
00178         len--;
00179     }
00180     *p = '\0';
00181 }
00182 
00183 /*
00184  * mic2latin_with_table: a generic single byte charset encoding
00185  * conversion from the mule internal code to a local charset.
00186  *
00187  * mic points to the source string of length len
00188  * p is the output area (must be large enough!)
00189  * lc is the mule character set id for the local encoding
00190  * encoding is the PG identifier for the local encoding
00191  * tab holds conversion entries for the mule internal code's
00192  * second byte, starting from 128 (0x80). each entry in the table
00193  * holds the corresponding code point for the local charset.
00194  */
00195 void
00196 mic2latin_with_table(const unsigned char *mic,
00197                      unsigned char *p,
00198                      int len,
00199                      int lc,
00200                      int encoding,
00201                      const unsigned char *tab)
00202 {
00203     unsigned char c1,
00204                 c2;
00205 
00206     while (len > 0)
00207     {
00208         c1 = *mic;
00209         if (c1 == 0)
00210             report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
00211         if (!IS_HIGHBIT_SET(c1))
00212         {
00213             /* easy for ASCII */
00214             *p++ = c1;
00215             mic++;
00216             len--;
00217         }
00218         else
00219         {
00220             int         l = pg_mic_mblen(mic);
00221 
00222             if (len < l)
00223                 report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
00224                                         len);
00225             if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
00226                 (c2 = tab[mic[1] - HIGHBIT]) == 0)
00227             {
00228                 report_untranslatable_char(PG_MULE_INTERNAL, encoding,
00229                                            (const char *) mic, len);
00230                 break;          /* keep compiler quiet */
00231             }
00232             *p++ = c2;
00233             mic += 2;
00234             len -= 2;
00235         }
00236     }
00237     *p = '\0';
00238 }
00239 
00240 /*
00241  * comparison routine for bsearch()
00242  * this routine is intended for UTF8 -> local code
00243  */
00244 static int
00245 compare1(const void *p1, const void *p2)
00246 {
00247     uint32      v1,
00248                 v2;
00249 
00250     v1 = *(const uint32 *) p1;
00251     v2 = ((const pg_utf_to_local *) p2)->utf;
00252     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
00253 }
00254 
00255 /*
00256  * comparison routine for bsearch()
00257  * this routine is intended for local code -> UTF8
00258  */
00259 static int
00260 compare2(const void *p1, const void *p2)
00261 {
00262     uint32      v1,
00263                 v2;
00264 
00265     v1 = *(const uint32 *) p1;
00266     v2 = ((const pg_local_to_utf *) p2)->code;
00267     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
00268 }
00269 
00270 /*
00271  * comparison routine for bsearch()
00272  * this routine is intended for combined UTF8 -> local code
00273  */
00274 static int
00275 compare3(const void *p1, const void *p2)
00276 {
00277     uint32      s1,
00278                 s2,
00279                 d1,
00280                 d2;
00281 
00282     s1 = *(const uint32 *) p1;
00283     s2 = *((const uint32 *) p1 + 1);
00284     d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
00285     d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
00286     return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
00287 }
00288 
00289 /*
00290  * comparison routine for bsearch()
00291  * this routine is intended for local code -> combined UTF8
00292  */
00293 static int
00294 compare4(const void *p1, const void *p2)
00295 {
00296     uint32      v1,
00297                 v2;
00298 
00299     v1 = *(const uint32 *) p1;
00300     v2 = ((const pg_local_to_utf_combined *) p2)->code;
00301     return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
00302 }
00303 
00304 /*
00305  * convert 32bit wide character to mutibye stream pointed to by iso
00306  */
00307 static unsigned char *
00308 set_iso_code(unsigned char *iso, uint32 code)
00309 {
00310     if (code & 0xff000000)
00311         *iso++ = code >> 24;
00312     if (code & 0x00ff0000)
00313         *iso++ = (code & 0x00ff0000) >> 16;
00314     if (code & 0x0000ff00)
00315         *iso++ = (code & 0x0000ff00) >> 8;
00316     if (code & 0x000000ff)
00317         *iso++ = code & 0x000000ff;
00318     return iso;
00319 }
00320 
00321 /*
00322  * UTF8 ---> local code
00323  *
00324  * utf: input UTF8 string (need not be null-terminated).
00325  * iso: pointer to the output area (must be large enough!)
00326  * map: the conversion map.
00327  * cmap: the conversion map for combined characters.
00328  *        (optional)
00329  * size1: the size of the conversion map.
00330  * size2: the size of the conversion map for combined characters
00331  *        (optional)
00332  * encoding: the PG identifier for the local encoding.
00333  * len: length of input string.
00334  */
00335 void
00336 UtfToLocal(const unsigned char *utf, unsigned char *iso,
00337            const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
00338            int size1, int size2, int encoding, int len)
00339 {
00340     uint32      iutf;
00341     uint32      cutf[2];
00342     uint32      code;
00343     pg_utf_to_local *p;
00344     pg_utf_to_local_combined *cp;
00345     int         l;
00346 
00347     for (; len > 0; len -= l)
00348     {
00349         /* "break" cases all represent errors */
00350         if (*utf == '\0')
00351             break;
00352 
00353         l = pg_utf_mblen(utf);
00354 
00355         if (len < l)
00356             break;
00357 
00358         if (!pg_utf8_islegal(utf, l))
00359             break;
00360 
00361         if (l == 1)
00362         {
00363             /* ASCII case is easy */
00364             *iso++ = *utf++;
00365             continue;
00366         }
00367         else if (l == 2)
00368         {
00369             iutf = *utf++ << 8;
00370             iutf |= *utf++;
00371         }
00372         else if (l == 3)
00373         {
00374             iutf = *utf++ << 16;
00375             iutf |= *utf++ << 8;
00376             iutf |= *utf++;
00377         }
00378         else if (l == 4)
00379         {
00380             iutf = *utf++ << 24;
00381             iutf |= *utf++ << 16;
00382             iutf |= *utf++ << 8;
00383             iutf |= *utf++;
00384         }
00385 
00386         /*
00387          * first, try with combined map if possible
00388          */
00389         if (cmap && len > l)
00390         {
00391             const unsigned char *utf_save = utf;
00392             int         len_save = len;
00393             int         l_save = l;
00394 
00395             len -= l;
00396 
00397             l = pg_utf_mblen(utf);
00398             if (len < l)
00399                 break;
00400 
00401             if (!pg_utf8_islegal(utf, l))
00402                 break;
00403 
00404             cutf[0] = iutf;
00405 
00406             if (l == 1)
00407             {
00408                 if (len_save > 1)
00409                 {
00410                     p = bsearch(&cutf[0], map, size1,
00411                                 sizeof(pg_utf_to_local), compare1);
00412                     if (p == NULL)
00413                         report_untranslatable_char(PG_UTF8, encoding,
00414                                (const char *) (utf_save - l_save), len_save);
00415                     iso = set_iso_code(iso, p->code);
00416                 }
00417 
00418                 /* ASCII case is easy */
00419                 *iso++ = *utf++;
00420                 continue;
00421             }
00422             else if (l == 2)
00423             {
00424                 iutf = *utf++ << 8;
00425                 iutf |= *utf++;
00426             }
00427             else if (l == 3)
00428             {
00429                 iutf = *utf++ << 16;
00430                 iutf |= *utf++ << 8;
00431                 iutf |= *utf++;
00432             }
00433             else if (l == 4)
00434             {
00435                 iutf = *utf++ << 24;
00436                 iutf |= *utf++ << 16;
00437                 iutf |= *utf++ << 8;
00438                 iutf |= *utf++;
00439             }
00440 
00441             cutf[1] = iutf;
00442             cp = bsearch(cutf, cmap, size2,
00443                          sizeof(pg_utf_to_local_combined), compare3);
00444             if (cp)
00445                 code = cp->code;
00446             else
00447             {
00448                 /* not found in combined map. try with ordinary map */
00449                 p = bsearch(&cutf[0], map, size1,
00450                             sizeof(pg_utf_to_local), compare1);
00451                 if (p == NULL)
00452                     report_untranslatable_char(PG_UTF8, encoding,
00453                                (const char *) (utf_save - l_save), len_save);
00454                 iso = set_iso_code(iso, p->code);
00455 
00456                 p = bsearch(&cutf[1], map, size1,
00457                             sizeof(pg_utf_to_local), compare1);
00458                 if (p == NULL)
00459                     report_untranslatable_char(PG_UTF8, encoding,
00460                                                (const char *) (utf - l), len);
00461                 code = p->code;
00462             }
00463         }
00464         else    /* no cmap or no remaining data */
00465         {
00466             p = bsearch(&iutf, map, size1,
00467                         sizeof(pg_utf_to_local), compare1);
00468             if (p == NULL)
00469                 report_untranslatable_char(PG_UTF8, encoding,
00470                                            (const char *) (utf - l), len);
00471             code = p->code;
00472         }
00473         iso = set_iso_code(iso, code);
00474     }
00475 
00476     if (len > 0)
00477         report_invalid_encoding(PG_UTF8, (const char *) utf, len);
00478 
00479     *iso = '\0';
00480 }
00481 
00482 /*
00483  * local code ---> UTF8
00484  *
00485  * iso: input local string (need not be null-terminated).
00486  * utf: pointer to the output area (must be large enough!)
00487  * map: the conversion map.
00488  * cmap: the conversion map for combined characters.
00489  *        (optional)
00490  * size1: the size of the conversion map.
00491  * size2: the size of the conversion map for combined characters
00492  *        (optional)
00493  * encoding: the PG identifier for the local encoding.
00494  * len: length of input string.
00495  */
00496 void
00497 LocalToUtf(const unsigned char *iso, unsigned char *utf,
00498            const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
00499            int size1, int size2, int encoding, int len)
00500 {
00501     unsigned int iiso;
00502     int         l;
00503     pg_local_to_utf *p;
00504     pg_local_to_utf_combined *cp;
00505 
00506     if (!PG_VALID_ENCODING(encoding))
00507         ereport(ERROR,
00508                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00509                  errmsg("invalid encoding number: %d", encoding)));
00510 
00511     for (; len > 0; len -= l)
00512     {
00513         /* "break" cases all represent errors */
00514         if (*iso == '\0')
00515             break;
00516 
00517         if (!IS_HIGHBIT_SET(*iso))
00518         {
00519             /* ASCII case is easy */
00520             *utf++ = *iso++;
00521             l = 1;
00522             continue;
00523         }
00524 
00525         l = pg_encoding_verifymb(encoding, (const char *) iso, len);
00526         if (l < 0)
00527             break;
00528 
00529         if (l == 1)
00530             iiso = *iso++;
00531         else if (l == 2)
00532         {
00533             iiso = *iso++ << 8;
00534             iiso |= *iso++;
00535         }
00536         else if (l == 3)
00537         {
00538             iiso = *iso++ << 16;
00539             iiso |= *iso++ << 8;
00540             iiso |= *iso++;
00541         }
00542         else if (l == 4)
00543         {
00544             iiso = *iso++ << 24;
00545             iiso |= *iso++ << 16;
00546             iiso |= *iso++ << 8;
00547             iiso |= *iso++;
00548         }
00549 
00550         p = bsearch(&iiso, map, size1,
00551                     sizeof(pg_local_to_utf), compare2);
00552 
00553         if (p == NULL)
00554         {
00555             /*
00556              * not found in the ordinary map. if there's a combined character
00557              * map, try with it
00558              */
00559             if (cmap)
00560             {
00561                 cp = bsearch(&iiso, cmap, size2,
00562                              sizeof(pg_local_to_utf_combined), compare4);
00563 
00564                 if (cp)
00565                 {
00566                     if (cp->utf1 & 0xff000000)
00567                         *utf++ = cp->utf1 >> 24;
00568                     if (cp->utf1 & 0x00ff0000)
00569                         *utf++ = (cp->utf1 & 0x00ff0000) >> 16;
00570                     if (cp->utf1 & 0x0000ff00)
00571                         *utf++ = (cp->utf1 & 0x0000ff00) >> 8;
00572                     if (cp->utf1 & 0x000000ff)
00573                         *utf++ = cp->utf1 & 0x000000ff;
00574 
00575                     if (cp->utf2 & 0xff000000)
00576                         *utf++ = cp->utf2 >> 24;
00577                     if (cp->utf2 & 0x00ff0000)
00578                         *utf++ = (cp->utf2 & 0x00ff0000) >> 16;
00579                     if (cp->utf2 & 0x0000ff00)
00580                         *utf++ = (cp->utf2 & 0x0000ff00) >> 8;
00581                     if (cp->utf2 & 0x000000ff)
00582                         *utf++ = cp->utf2 & 0x000000ff;
00583 
00584                     continue;
00585                 }
00586             }
00587 
00588             report_untranslatable_char(encoding, PG_UTF8,
00589                                        (const char *) (iso - l), len);
00590 
00591         }
00592         else
00593         {
00594             if (p->utf & 0xff000000)
00595                 *utf++ = p->utf >> 24;
00596             if (p->utf & 0x00ff0000)
00597                 *utf++ = (p->utf & 0x00ff0000) >> 16;
00598             if (p->utf & 0x0000ff00)
00599                 *utf++ = (p->utf & 0x0000ff00) >> 8;
00600             if (p->utf & 0x000000ff)
00601                 *utf++ = p->utf & 0x000000ff;
00602         }
00603     }
00604 
00605     if (len > 0)
00606         report_invalid_encoding(encoding, (const char *) iso, len);
00607 
00608     *utf = '\0';
00609 }