Header And Logo

PostgreSQL
| The world's most advanced open source database.

utf8_and_iso8859_1.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  *    ISO8859_1 <--> UTF8
00004  *
00005  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00006  * Portions Copyright (c) 1994, Regents of the University of California
00007  *
00008  * IDENTIFICATION
00009  *    src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c
00010  *
00011  *-------------------------------------------------------------------------
00012  */
00013 
00014 #include "postgres.h"
00015 #include "fmgr.h"
00016 #include "mb/pg_wchar.h"
00017 
00018 PG_MODULE_MAGIC;
00019 
00020 PG_FUNCTION_INFO_V1(iso8859_1_to_utf8);
00021 PG_FUNCTION_INFO_V1(utf8_to_iso8859_1);
00022 
00023 extern Datum iso8859_1_to_utf8(PG_FUNCTION_ARGS);
00024 extern Datum utf8_to_iso8859_1(PG_FUNCTION_ARGS);
00025 
00026 /* ----------
00027  * conv_proc(
00028  *      INTEGER,    -- source encoding id
00029  *      INTEGER,    -- destination encoding id
00030  *      CSTRING,    -- source string (null terminated C string)
00031  *      CSTRING,    -- destination string (null terminated C string)
00032  *      INTEGER     -- source string length
00033  * ) returns VOID;
00034  * ----------
00035  */
00036 
00037 Datum
00038 iso8859_1_to_utf8(PG_FUNCTION_ARGS)
00039 {
00040     unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
00041     unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
00042     int         len = PG_GETARG_INT32(4);
00043     unsigned short c;
00044 
00045     CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN1, PG_UTF8);
00046 
00047     while (len > 0)
00048     {
00049         c = *src;
00050         if (c == 0)
00051             report_invalid_encoding(PG_LATIN1, (const char *) src, len);
00052         if (!IS_HIGHBIT_SET(c))
00053             *dest++ = c;
00054         else
00055         {
00056             *dest++ = (c >> 6) | 0xc0;
00057             *dest++ = (c & 0x003f) | HIGHBIT;
00058         }
00059         src++;
00060         len--;
00061     }
00062     *dest = '\0';
00063 
00064     PG_RETURN_VOID();
00065 }
00066 
00067 Datum
00068 utf8_to_iso8859_1(PG_FUNCTION_ARGS)
00069 {
00070     unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
00071     unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
00072     int         len = PG_GETARG_INT32(4);
00073     unsigned short c,
00074                 c1;
00075 
00076     CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_LATIN1);
00077 
00078     while (len > 0)
00079     {
00080         c = *src;
00081         if (c == 0)
00082             report_invalid_encoding(PG_UTF8, (const char *) src, len);
00083         /* fast path for ASCII-subset characters */
00084         if (!IS_HIGHBIT_SET(c))
00085         {
00086             *dest++ = c;
00087             src++;
00088             len--;
00089         }
00090         else
00091         {
00092             int         l = pg_utf_mblen(src);
00093 
00094             if (l > len || !pg_utf8_islegal(src, l))
00095                 report_invalid_encoding(PG_UTF8, (const char *) src, len);
00096             if (l != 2)
00097                 report_untranslatable_char(PG_UTF8, PG_LATIN1,
00098                                            (const char *) src, len);
00099             c1 = src[1] & 0x3f;
00100             c = ((c & 0x1f) << 6) | c1;
00101             if (c >= 0x80 && c <= 0xff)
00102             {
00103                 *dest++ = (unsigned char) c;
00104                 src += 2;
00105                 len -= 2;
00106             }
00107             else
00108                 report_untranslatable_char(PG_UTF8, PG_LATIN1,
00109                                            (const char *) src, len);
00110         }
00111     }
00112     *dest = '\0';
00113 
00114     PG_RETURN_VOID();
00115 }