Header And Logo

PostgreSQL
| The world's most advanced open source database.

encode.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * encode.c
00004  *    Various data encoding/decoding things.
00005  *
00006  * Copyright (c) 2001-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/utils/adt/encode.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 #include "postgres.h"
00015 
00016 #include <ctype.h>
00017 
00018 #include "utils/builtins.h"
00019 
00020 
/*
 * Callback table for one named encoding.
 *
 * binary_encode() and binary_decode() call the *_len member first and
 * allocate a result buffer of the returned size, then call encode/decode,
 * which write into that buffer and return the number of bytes produced.
 * The callers treat a produced count larger than the *_len value as a
 * memory overrun.
 */
struct pg_encoding
{
    unsigned    (*encode_len) (const char *data, unsigned dlen);
    unsigned    (*decode_len) (const char *data, unsigned dlen);
    unsigned    (*encode) (const char *data, unsigned dlen, char *res);
    unsigned    (*decode) (const char *data, unsigned dlen, char *res);
};

/* Look up an encoding by name; yields NULL when the name is not known. */
static const struct pg_encoding *pg_find_encoding(const char *name);
00030 
00031 /*
00032  * SQL functions.
00033  */
00034 
00035 Datum
00036 binary_encode(PG_FUNCTION_ARGS)
00037 {
00038     bytea      *data = PG_GETARG_BYTEA_P(0);
00039     Datum       name = PG_GETARG_DATUM(1);
00040     text       *result;
00041     char       *namebuf;
00042     int         datalen,
00043                 resultlen,
00044                 res;
00045     const struct pg_encoding *enc;
00046 
00047     datalen = VARSIZE(data) - VARHDRSZ;
00048 
00049     namebuf = TextDatumGetCString(name);
00050 
00051     enc = pg_find_encoding(namebuf);
00052     if (enc == NULL)
00053         ereport(ERROR,
00054                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00055                  errmsg("unrecognized encoding: \"%s\"", namebuf)));
00056 
00057     resultlen = enc->encode_len(VARDATA(data), datalen);
00058     result = palloc(VARHDRSZ + resultlen);
00059 
00060     res = enc->encode(VARDATA(data), datalen, VARDATA(result));
00061 
00062     /* Make this FATAL 'cause we've trodden on memory ... */
00063     if (res > resultlen)
00064         elog(FATAL, "overflow - encode estimate too small");
00065 
00066     SET_VARSIZE(result, VARHDRSZ + res);
00067 
00068     PG_RETURN_TEXT_P(result);
00069 }
00070 
00071 Datum
00072 binary_decode(PG_FUNCTION_ARGS)
00073 {
00074     text       *data = PG_GETARG_TEXT_P(0);
00075     Datum       name = PG_GETARG_DATUM(1);
00076     bytea      *result;
00077     char       *namebuf;
00078     int         datalen,
00079                 resultlen,
00080                 res;
00081     const struct pg_encoding *enc;
00082 
00083     datalen = VARSIZE(data) - VARHDRSZ;
00084 
00085     namebuf = TextDatumGetCString(name);
00086 
00087     enc = pg_find_encoding(namebuf);
00088     if (enc == NULL)
00089         ereport(ERROR,
00090                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00091                  errmsg("unrecognized encoding: \"%s\"", namebuf)));
00092 
00093     resultlen = enc->decode_len(VARDATA(data), datalen);
00094     result = palloc(VARHDRSZ + resultlen);
00095 
00096     res = enc->decode(VARDATA(data), datalen, VARDATA(result));
00097 
00098     /* Make this FATAL 'cause we've trodden on memory ... */
00099     if (res > resultlen)
00100         elog(FATAL, "overflow - decode estimate too small");
00101 
00102     SET_VARSIZE(result, VARHDRSZ + res);
00103 
00104     PG_RETURN_BYTEA_P(result);
00105 }
00106 
00107 
00108 /*
00109  * HEX
00110  */
00111 
/* Lower-case hex digit characters, indexed by 4-bit value (0..15). */
static const char hextbl[] = "0123456789abcdef";
00113 
/*
 * Hex digit value for each 7-bit character code, or -1 when the character
 * is not a hex digit.  '0'-'9' map to 0..9; both 'A'-'F' and 'a'-'f' map
 * to 10..15.  Each row covers 16 character codes.
 */
static const int8 hexlookup[128] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
};
00124 
00125 unsigned
00126 hex_encode(const char *src, unsigned len, char *dst)
00127 {
00128     const char *end = src + len;
00129 
00130     while (src < end)
00131     {
00132         *dst++ = hextbl[(*src >> 4) & 0xF];
00133         *dst++ = hextbl[*src & 0xF];
00134         src++;
00135     }
00136     return len * 2;
00137 }
00138 
00139 static inline char
00140 get_hex(char c)
00141 {
00142     int         res = -1;
00143 
00144     if (c > 0 && c < 127)
00145         res = hexlookup[(unsigned char) c];
00146 
00147     if (res < 0)
00148         ereport(ERROR,
00149                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00150                  errmsg("invalid hexadecimal digit: \"%c\"", c)));
00151 
00152     return (char) res;
00153 }
00154 
00155 unsigned
00156 hex_decode(const char *src, unsigned len, char *dst)
00157 {
00158     const char *s,
00159                *srcend;
00160     char        v1,
00161                 v2,
00162                *p;
00163 
00164     srcend = src + len;
00165     s = src;
00166     p = dst;
00167     while (s < srcend)
00168     {
00169         if (*s == ' ' || *s == '\n' || *s == '\t' || *s == '\r')
00170         {
00171             s++;
00172             continue;
00173         }
00174         v1 = get_hex(*s++) << 4;
00175         if (s >= srcend)
00176             ereport(ERROR,
00177                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00178                   errmsg("invalid hexadecimal data: odd number of digits")));
00179 
00180         v2 = get_hex(*s++);
00181         *p++ = v1 | v2;
00182     }
00183 
00184     return p - dst;
00185 }
00186 
/*
 * hex_enc_len
 *
 * Size of the hex encoding of srclen bytes: two characters per byte.
 * The data itself (src) is not examined.
 */
static unsigned
hex_enc_len(const char *src, unsigned srclen)
{
    return srclen * 2;
}
00192 
/*
 * hex_dec_len
 *
 * Result size used for decoding srclen hex characters: one byte per two
 * characters, rounded down.  The data itself (src) is not examined.
 */
static unsigned
hex_dec_len(const char *src, unsigned srclen)
{
    return srclen / 2;
}
00198 
00199 /*
00200  * BASE64
00201  */
00202 
/* Base64 output characters, indexed by 6-bit value (0..63). */
static const char _base64[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
00205 
/*
 * 6-bit value for each 7-bit character code, or -1 when the character is
 * not in the base64 alphabet: 'A'-'Z' map to 0..25, 'a'-'z' to 26..51,
 * '0'-'9' to 52..61, '+' to 62 and '/' to 63.  The padding character '='
 * has no entry here.  Each row covers 16 character codes.
 */
static const int8 b64lookup[128] = {
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
};
00216 
00217 static unsigned
00218 b64_encode(const char *src, unsigned len, char *dst)
00219 {
00220     char       *p,
00221                *lend = dst + 76;
00222     const char *s,
00223                *end = src + len;
00224     int         pos = 2;
00225     uint32      buf = 0;
00226 
00227     s = src;
00228     p = dst;
00229 
00230     while (s < end)
00231     {
00232         buf |= (unsigned char) *s << (pos << 3);
00233         pos--;
00234         s++;
00235 
00236         /* write it out */
00237         if (pos < 0)
00238         {
00239             *p++ = _base64[(buf >> 18) & 0x3f];
00240             *p++ = _base64[(buf >> 12) & 0x3f];
00241             *p++ = _base64[(buf >> 6) & 0x3f];
00242             *p++ = _base64[buf & 0x3f];
00243 
00244             pos = 2;
00245             buf = 0;
00246         }
00247         if (p >= lend)
00248         {
00249             *p++ = '\n';
00250             lend = p + 76;
00251         }
00252     }
00253     if (pos != 2)
00254     {
00255         *p++ = _base64[(buf >> 18) & 0x3f];
00256         *p++ = _base64[(buf >> 12) & 0x3f];
00257         *p++ = (pos == 0) ? _base64[(buf >> 6) & 0x3f] : '=';
00258         *p++ = '=';
00259     }
00260 
00261     return p - dst;
00262 }
00263 
00264 static unsigned
00265 b64_decode(const char *src, unsigned len, char *dst)
00266 {
00267     const char *srcend = src + len,
00268                *s = src;
00269     char       *p = dst;
00270     char        c;
00271     int         b = 0;
00272     uint32      buf = 0;
00273     int         pos = 0,
00274                 end = 0;
00275 
00276     while (s < srcend)
00277     {
00278         c = *s++;
00279 
00280         if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
00281             continue;
00282 
00283         if (c == '=')
00284         {
00285             /* end sequence */
00286             if (!end)
00287             {
00288                 if (pos == 2)
00289                     end = 1;
00290                 else if (pos == 3)
00291                     end = 2;
00292                 else
00293                     ereport(ERROR,
00294                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00295                              errmsg("unexpected \"=\"")));
00296             }
00297             b = 0;
00298         }
00299         else
00300         {
00301             b = -1;
00302             if (c > 0 && c < 127)
00303                 b = b64lookup[(unsigned char) c];
00304             if (b < 0)
00305                 ereport(ERROR,
00306                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00307                          errmsg("invalid symbol")));
00308         }
00309         /* add it to buffer */
00310         buf = (buf << 6) + b;
00311         pos++;
00312         if (pos == 4)
00313         {
00314             *p++ = (buf >> 16) & 255;
00315             if (end == 0 || end > 1)
00316                 *p++ = (buf >> 8) & 255;
00317             if (end == 0 || end > 2)
00318                 *p++ = buf & 255;
00319             buf = 0;
00320             pos = 0;
00321         }
00322     }
00323 
00324     if (pos != 0)
00325         ereport(ERROR,
00326                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00327                  errmsg("invalid end sequence")));
00328 
00329     return p - dst;
00330 }
00331 
00332 
/*
 * b64_enc_len
 *
 * Result size used for base64-encoding srclen bytes.  The data itself
 * (src) is not examined.  All arithmetic is done in unsigned.
 */
static unsigned
b64_enc_len(const char *src, unsigned srclen)
{
    /* 3 input bytes become 4 output characters */
    unsigned    chars = (srclen + 2) * 4 / 3;

    /* one linefeed per 76 output characters, i.e. per 76 * 3 / 4 bytes */
    unsigned    linefeeds = srclen / (76 * 3 / 4);

    return chars + linefeeds;
}
00339 
/*
 * b64_dec_len
 *
 * Result size used for base64-decoding srclen characters: three bytes per
 * four characters, rounded down.  The data itself (src) is not examined.
 */
static unsigned
b64_dec_len(const char *src, unsigned srclen)
{
    return (srclen * 3) / 4;
}
00345 
00346 /*
00347  * Escape
00348  * Minimally escape bytea to text.
00349  * De-escape text to bytea.
00350  *
00351  * We must escape zero bytes and high-bit-set bytes to avoid generating
00352  * text that might be invalid in the current encoding, or that might
00353  * change to something else if passed through an encoding conversion
00354  * (leading to failing to de-escape to the original bytea value).
00355  * Also of course backslash itself has to be escaped.
00356  *
00357  * De-escaping processes \\ and any \### octal
00358  */
00359 
/* VAL: digit character -> its numeric value.  DIG: numeric value -> digit character. */
#define VAL(CH)         ((CH) - '0')
#define DIG(VAL)        ((VAL) + '0')
00362 
/*
 * esc_encode
 *
 * Writes the escaped form of the srclen bytes at src to dst: a zero byte
 * or a byte with the high bit set becomes a backslash followed by three
 * octal digits, a backslash becomes two backslashes, and every other byte
 * is copied unchanged.  Returns the number of characters written.
 */
static unsigned
esc_encode(const char *src, unsigned srclen, char *dst)
{
    const char *in;
    const char *inend = src + srclen;
    char       *out = dst;

    for (in = src; in < inend; in++)
    {
        unsigned char byte = (unsigned char) *in;

        if (byte == '\\')
        {
            *out++ = '\\';
            *out++ = '\\';
        }
        else if (byte == '\0' || IS_HIGHBIT_SET(byte))
        {
            /* \ooo: bits 7-6, then bits 5-3, then bits 2-0 */
            *out++ = '\\';
            *out++ = DIG(byte >> 6);
            *out++ = DIG((byte >> 3) & 7);
            *out++ = DIG(byte & 7);
        }
        else
            *out++ = byte;
    }

    return out - dst;
}
00401 
00402 static unsigned
00403 esc_decode(const char *src, unsigned srclen, char *dst)
00404 {
00405     const char *end = src + srclen;
00406     char       *rp = dst;
00407     int         len = 0;
00408 
00409     while (src < end)
00410     {
00411         if (src[0] != '\\')
00412             *rp++ = *src++;
00413         else if (src + 3 < end &&
00414                  (src[1] >= '0' && src[1] <= '3') &&
00415                  (src[2] >= '0' && src[2] <= '7') &&
00416                  (src[3] >= '0' && src[3] <= '7'))
00417         {
00418             int         val;
00419 
00420             val = VAL(src[1]);
00421             val <<= 3;
00422             val += VAL(src[2]);
00423             val <<= 3;
00424             *rp++ = val + VAL(src[3]);
00425             src += 4;
00426         }
00427         else if (src + 1 < end &&
00428                  (src[1] == '\\'))
00429         {
00430             *rp++ = '\\';
00431             src += 2;
00432         }
00433         else
00434         {
00435             /*
00436              * One backslash, not followed by ### valid octal. Should never
00437              * get here, since esc_dec_len does same check.
00438              */
00439             ereport(ERROR,
00440                     (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
00441                      errmsg("invalid input syntax for type bytea")));
00442         }
00443 
00444         len++;
00445     }
00446 
00447     return len;
00448 }
00449 
/*
 * esc_enc_len
 *
 * Counts the characters needed for the escaped form of the srclen bytes
 * at src: 4 for a zero byte or a byte with the high bit set, 2 for a
 * backslash, 1 for anything else.
 */
static unsigned
esc_enc_len(const char *src, unsigned srclen)
{
    const char *cur;
    const char *stop = src + srclen;
    int         total = 0;

    for (cur = src; cur < stop; cur++)
    {
        if (*cur == '\\')
            total += 2;
        else if (*cur == '\0' || IS_HIGHBIT_SET(*cur))
            total += 4;
        else
            total += 1;
    }

    return total;
}
00470 
00471 static unsigned
00472 esc_dec_len(const char *src, unsigned srclen)
00473 {
00474     const char *end = src + srclen;
00475     int         len = 0;
00476 
00477     while (src < end)
00478     {
00479         if (src[0] != '\\')
00480             src++;
00481         else if (src + 3 < end &&
00482                  (src[1] >= '0' && src[1] <= '3') &&
00483                  (src[2] >= '0' && src[2] <= '7') &&
00484                  (src[3] >= '0' && src[3] <= '7'))
00485         {
00486             /*
00487              * backslash + valid octal
00488              */
00489             src += 4;
00490         }
00491         else if (src + 1 < end &&
00492                  (src[1] == '\\'))
00493         {
00494             /*
00495              * two backslashes = backslash
00496              */
00497             src += 2;
00498         }
00499         else
00500         {
00501             /*
00502              * one backslash, not followed by ### valid octal
00503              */
00504             ereport(ERROR,
00505                     (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
00506                      errmsg("invalid input syntax for type bytea")));
00507         }
00508 
00509         len++;
00510     }
00511     return len;
00512 }
00513 
00514 /*
00515  * Common
00516  */
00517 
/*
 * Table of supported encodings, searched by pg_find_encoding().  Each
 * entry pairs a name with its callbacks, listed in struct pg_encoding
 * member order: encode_len, decode_len, encode, decode.  The entry with a
 * NULL name marks the end of the table.
 */
static const struct
{
    const char *name;
    struct pg_encoding enc;
}   enclist[] =

{
    {
        "hex",
        {
            hex_enc_len, hex_dec_len, hex_encode, hex_decode
        }
    },
    {
        "base64",
        {
            b64_enc_len, b64_dec_len, b64_encode, b64_decode
        }
    },
    {
        "escape",
        {
            esc_enc_len, esc_dec_len, esc_encode, esc_decode
        }
    },
    {
        /* end-of-table sentinel */
        NULL,
        {
            NULL, NULL, NULL, NULL
        }
    }
};
00550 
00551 static const struct pg_encoding *
00552 pg_find_encoding(const char *name)
00553 {
00554     int         i;
00555 
00556     for (i = 0; enclist[i].name; i++)
00557         if (pg_strcasecmp(enclist[i].name, name) == 0)
00558             return &enclist[i].enc;
00559 
00560     return NULL;
00561 }