Header And Logo

PostgreSQL
| The world's most advanced open source database.

mbprint.c

Go to the documentation of this file.
00001 /*
00002  * psql - the PostgreSQL interactive terminal
00003  *
00004  * Copyright (c) 2000-2013, PostgreSQL Global Development Group
00005  *
00006  * src/bin/psql/mbprint.c
00007  *
00008  * XXX this file does not really belong in psql/.  Perhaps move to libpq?
00009  * It also seems that the mbvalidate function is redundant with existing
00010  * functionality.
00011  */
00012 
00013 #include "postgres_fe.h"
00014 #include "mbprint.h"
00015 #ifndef PGSCRIPTS
00016 #include "settings.h"
00017 #endif
00018 
00019 /*
00020  * To avoid version-skew problems, this file must not use declarations
00021  * from pg_wchar.h: the encoding IDs we are dealing with are determined
00022  * by the libpq.so we are linked with, and that might not match the
00023  * numbers we see at compile time.  (If this file were inside libpq,
00024  * the problem would go away...)
00025  *
00026  * Hence, we have our own definition of pg_wchar, and we get the values
00027  * of any needed encoding IDs on-the-fly.
00028  */
00029 
/* Local stand-in for the wide-character type; see the note above. */
typedef unsigned int pg_wchar;

/*
 * Return the encoding ID that pg_char_to_encoding() reports for "utf8".
 *
 * The answer is remembered in a function-local static, so the lookup is
 * only repeated while the remembered value is still negative.
 */
static int
pg_get_utf8_id(void)
{
    static int  cached_id = -1; /* negative: nothing usable cached yet */

    if (cached_id >= 0)
        return cached_id;

    cached_id = pg_char_to_encoding("utf8");
    return cached_id;
}

/* Spelled like a constant, but evaluated through the lookup above. */
#define PG_UTF8     pg_get_utf8_id()
00043 
00044 
/*
 * Decode the single UTF-8 sequence starting at c into its code point.
 * This is a one-character version of pg_utf2wchar_with_len.
 *
 * Only the lead byte decides how many bytes are consumed; the trailing
 * bytes are not validated, so c must point to a long-enough string.
 * A lead byte that does not start a 1..4 byte sequence yields 0xffffffff.
 */
static pg_wchar
utf8_to_unicode(const unsigned char *c)
{
    const unsigned char lead = c[0];
    pg_wchar    ucs;

    /* plain ASCII */
    if (lead < 0x80)
        return (pg_wchar) lead;

    /* 110xxxxx 10xxxxxx */
    if ((lead & 0xe0) == 0xc0)
    {
        ucs = (pg_wchar) (lead & 0x1f);
        ucs = (ucs << 6) | (pg_wchar) (c[1] & 0x3f);
        return ucs;
    }

    /* 1110xxxx 10xxxxxx 10xxxxxx */
    if ((lead & 0xf0) == 0xe0)
    {
        ucs = (pg_wchar) (lead & 0x0f);
        ucs = (ucs << 6) | (pg_wchar) (c[1] & 0x3f);
        ucs = (ucs << 6) | (pg_wchar) (c[2] & 0x3f);
        return ucs;
    }

    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    if ((lead & 0xf8) == 0xf0)
    {
        ucs = (pg_wchar) (lead & 0x07);
        ucs = (ucs << 6) | (pg_wchar) (c[1] & 0x3f);
        ucs = (ucs << 6) | (pg_wchar) (c[2] & 0x3f);
        ucs = (ucs << 6) | (pg_wchar) (c[3] & 0x3f);
        return ucs;
    }

    /* that is an invalid code on purpose */
    return 0xffffffff;
}
00072 
00073 
/*
 * Validate the UTF-8 sequence starting at c and report its length.
 *
 * Returns the number of bytes (1..4) in a well-formed sequence, or -1 if
 * the sequence is malformed or encodes a value we refuse to accept:
 *   - overlong encodings
 *   - UTF-16 surrogates, U+D800..U+DFFF
 *   - the noncharacters U+FDD0..U+FDEF
 *   - the noncharacters U+xxFFFE and U+xxFFFF of every plane
 *   - anything above U+10FFFF
 *
 * c must point into a NUL-terminated string and *c must not be the NUL.
 * Trailing bytes are tested strictly left to right and testing stops at
 * the first one that is not a continuation byte (which a NUL is not), so
 * nothing beyond the terminator is ever read.
 */
static int
utf_charcheck(const unsigned char *c)
{
    if ((*c & 0x80) == 0)
        return 1;
    else if ((*c & 0xe0) == 0xc0)
    {
        /* two-byte char; lead bytes 0xC0 and 0xC1 would be overlong */
        if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
            return 2;
        return -1;
    }
    else if ((*c & 0xf0) == 0xe0)
    {
        unsigned int ucs;

        /* three-byte char: both trailing bytes must be 10xxxxxx */
        if ((c[1] & 0xc0) != 0x80 || (c[2] & 0xc0) != 0x80)
            return -1;

        ucs = ((unsigned int) (c[0] & 0x0f) << 12) |
            ((unsigned int) (c[1] & 0x3f) << 6) |
            (unsigned int) (c[2] & 0x3f);

        /* values below U+0800 fit in fewer bytes, hence overlong */
        if (ucs < 0x0800)
            return -1;
        /* surrogates */
        if (ucs >= 0xd800 && ucs <= 0xdfff)
            return -1;
        /* noncharacter block inside the Arabic presentation forms */
        if (ucs >= 0xfdd0 && ucs <= 0xfdef)
            return -1;
        /* U+FFFE and U+FFFF */
        if ((ucs & 0xfffe) == 0xfffe)
            return -1;
        return 3;
    }
    else if ((*c & 0xf8) == 0xf0)
    {
        unsigned int ucs;

        /* four-byte char: all three trailing bytes must be 10xxxxxx */
        if ((c[1] & 0xc0) != 0x80 ||
            (c[2] & 0xc0) != 0x80 ||
            (c[3] & 0xc0) != 0x80)
            return -1;

        ucs = ((unsigned int) (c[0] & 0x07) << 18) |
            ((unsigned int) (c[1] & 0x3f) << 12) |
            ((unsigned int) (c[2] & 0x3f) << 6) |
            (unsigned int) (c[3] & 0x3f);

        /* below U+10000 is overlong, above U+10FFFF is out of range */
        if (ucs < 0x10000 || ucs > 0x10ffff)
            return -1;
        /* U+xxFFFE and U+xxFFFF of the supplementary planes */
        if ((ucs & 0xfffe) == 0xfffe)
            return -1;
        return 4;
    }

    /* stray continuation byte, or a lead byte for 5+ byte sequences */
    return -1;
}
00132 
00133 
/*
 * Squeeze every byte that utf_charcheck() rejects out of the
 * NUL-terminated string pwcs, in place.
 *
 * Accepted sequences are kept in their original order and slid down over
 * the gaps left by rejected bytes.  A rejected byte is dropped on its own:
 * scanning resumes at the very next byte, not after a whole sequence.
 */
static void
mb_utf_validate(unsigned char *pwcs)
{
    unsigned char *src = pwcs;  /* next byte to examine */
    unsigned char *dst = pwcs;  /* where the next kept byte goes */

    while (*src != '\0')
    {
        int         seqlen = utf_charcheck(src);

        if (seqlen <= 0)
        {
            /* we skip the char */
            src++;
            continue;
        }

        /* keep the sequence; the stores are no-ops until a gap exists */
        for (; seqlen > 0; seqlen--)
        {
            if (dst != src)
                *dst = *src;
            dst++;
            src++;
        }
    }

    /* re-terminate only when something was actually removed */
    if (dst != src)
        *dst = '\0';
}
00165 
00166 /*
00167  * public functions : wcswidth and mbvalidate
00168  */
00169 
/*
 * pg_wcswidth is the dumb display-width function.
 *
 * It adds up the PQdsplen() of each character in the first len bytes of
 * pwcs, as if everything appeared on one line; characters whose reported
 * width is not positive contribute nothing.  Counting stops early, with
 * whatever has been accumulated, when PQmblen() reports a character
 * longer than the bytes that remain.
 *
 * OTOH it is easier to use than pg_wcssize if this applies to you.
 */
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
    const char *cur = pwcs;
    size_t      remaining = len;
    int         total = 0;

    while (remaining != 0)
    {
        int         nbytes = PQmblen(cur, encoding);
        int         cols;

        if ((size_t) nbytes > remaining)
            break;              /* Invalid string */

        cols = PQdsplen(cur, encoding);
        if (cols > 0)
            total += cols;

        cur += nbytes;
        remaining -= nbytes;
    }

    return total;
}
00198 
/*
 * pg_wcssize takes the given string in the given encoding and returns three
 * values:
 *    result_width: Width in display characters of the longest line in string
 *    result_height: Number of lines in display output
 *    result_format_size: Number of bytes required to store formatted
 *      representation of string
 *
 * Any of the three result pointers may be NULL if the caller does not need
 * that value.
 *
 * At most len bytes of pwcs are examined; scanning also stops at a NUL byte
 * or at a character that PQmblen() reports as longer than the bytes left.
 *
 * This MUST be kept in sync with pg_wcsformat!
 */
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
           int *result_width, int *result_height, int *result_format_size)
{
    int         w,
                chlen = 0,
                linewidth = 0;
    int         width = 0;
    int         height = 1;
    int         format_size = 0;

    /*
     * Test len before dereferencing pwcs, so that a buffer holding exactly
     * len bytes and no terminator is never read past its end.
     */
    for (; len > 0 && *pwcs; pwcs += chlen)
    {
        chlen = PQmblen((const char *) pwcs, encoding);
        if (len < (size_t) chlen)
            break;
        w = PQdsplen((const char *) pwcs, encoding);

        if (chlen == 1)         /* single-byte char */
        {
            if (*pwcs == '\n')  /* Newline */
            {
                if (linewidth > width)
                    width = linewidth;
                linewidth = 0;
                height += 1;
                format_size += 1;       /* For NUL char */
            }
            else if (*pwcs == '\r')     /* Carriage return, shown as \r */
            {
                linewidth += 2;
                format_size += 2;
            }
            else if (*pwcs == '\t')     /* Tab */
            {
                /* at least one column, then up to the next multiple of 8 */
                do
                {
                    linewidth++;
                    format_size++;
                } while (linewidth % 8 != 0);
            }
            else if (w < 0)     /* Other control char, shown as \xNN */
            {
                linewidth += 4;
                format_size += 4;
            }
            else    /* Output it as-is */
            {
                linewidth += w;
                format_size += 1;
            }
        }
        else if (w < 0)         /* Non-ascii control char */
        {
            linewidth += 6;     /* \u0000 */
            format_size += 6;
        }
        else    /* All other chars */
        {
            linewidth += w;
            format_size += chlen;
        }
        len -= chlen;
    }

    /* the last line has no newline to trigger the width comparison */
    if (linewidth > width)
        width = linewidth;
    format_size += 1;           /* For NUL char */

    /* Set results */
    if (result_width)
        *result_width = width;
    if (result_height)
        *result_height = height;
    if (result_format_size)
        *result_format_size = format_size;
}
00285 
00286 /*
00287  *  Format a string into one or more "struct lineptr" lines.
00288  *  lines[i].ptr == NULL indicates the end of the array.
00289  *
00290  * This MUST be kept in sync with pg_wcssize!
00291  */
00292 void
00293 pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
00294              struct lineptr * lines, int count)
00295 {
00296     int         w,
00297                 chlen = 0;
00298     int         linewidth = 0;
00299     unsigned char *ptr = lines->ptr;    /* Pointer to data area */
00300 
00301     for (; *pwcs && len > 0; pwcs += chlen)
00302     {
00303         chlen = PQmblen((const char *) pwcs, encoding);
00304         if (len < (size_t) chlen)
00305             break;
00306         w = PQdsplen((const char *) pwcs, encoding);
00307 
00308         if (chlen == 1)         /* single-byte char */
00309         {
00310             if (*pwcs == '\n')  /* Newline */
00311             {
00312                 *ptr++ = '\0';
00313                 lines->width = linewidth;
00314                 linewidth = 0;
00315                 lines++;
00316                 count--;
00317                 if (count <= 0)
00318                     exit(1);    /* Screwup */
00319 
00320                 /* make next line point to remaining memory */
00321                 lines->ptr = ptr;
00322             }
00323             else if (*pwcs == '\r')     /* Linefeed */
00324             {
00325                 strcpy((char *) ptr, "\\r");
00326                 linewidth += 2;
00327                 ptr += 2;
00328             }
00329             else if (*pwcs == '\t')     /* Tab */
00330             {
00331                 do
00332                 {
00333                     *ptr++ = ' ';
00334                     linewidth++;
00335                 } while (linewidth % 8 != 0);
00336             }
00337             else if (w < 0)     /* Other control char */
00338             {
00339                 sprintf((char *) ptr, "\\x%02X", *pwcs);
00340                 linewidth += 4;
00341                 ptr += 4;
00342             }
00343             else    /* Output it as-is */
00344             {
00345                 linewidth += w;
00346                 *ptr++ = *pwcs;
00347             }
00348         }
00349         else if (w < 0)         /* Non-ascii control char */
00350         {
00351             if (encoding == PG_UTF8)
00352                 sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
00353             else
00354             {
00355                 /*
00356                  * This case cannot happen in the current code because only
00357                  * UTF-8 signals multibyte control characters. But we may need
00358                  * to support it at some stage
00359                  */
00360                 sprintf((char *) ptr, "\\u????");
00361             }
00362             ptr += 6;
00363             linewidth += 6;
00364         }
00365         else    /* All other chars */
00366         {
00367             int         i;
00368 
00369             for (i = 0; i < chlen; i++)
00370                 *ptr++ = pwcs[i];
00371             linewidth += w;
00372         }
00373         len -= chlen;
00374     }
00375     lines->width = linewidth;
00376     *ptr++ = '\0';              /* Terminate formatted string */
00377 
00378     if (count <= 0)
00379         exit(1);                /* Screwup */
00380 
00381     (lines + 1)->ptr = NULL;    /* terminate line array */
00382 }
00383 
00384 unsigned char *
00385 mbvalidate(unsigned char *pwcs, int encoding)
00386 {
00387     if (encoding == PG_UTF8)
00388         mb_utf_validate(pwcs);
00389     else
00390     {
00391         /*
00392          * other encodings needing validation should add their own routines
00393          * here
00394          */
00395     }
00396 
00397     return pwcs;
00398 }