Main Page | Modules | Class Hierarchy | Class List | Directories | File List | Class Members | File Members | Related Pages

unicode.c

00001 /*****************************************************************************
00002  * unicode.c: UTF8 <-> locale functions
00003  *****************************************************************************
00004  * Copyright (C) 2005 the VideoLAN team
00005  * $Id: unicode.c 12751 2005-10-02 16:52:52Z courmisch $
00006  *
00007  * Authors: Rémi Denis-Courmont <rem # videolan.org>
00008  *
00009  * This program is free software; you can redistribute it and/or modify
00010  * it under the terms of the GNU General Public License as published by
00011  * the Free Software Foundation; either version 2 of the License, or
00012  * (at your option) any later version.
00013  *
00014  * This program is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  * GNU General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU General Public License
00020  * along with this program; if not, write to the Free Software
00021  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
00022  *****************************************************************************/
00023 
00024 /*****************************************************************************
00025  * Preamble
00026  *****************************************************************************/
00027 #include <vlc/vlc.h>
00028 #include "charset.h"
00029 
/*****************************************************************************
 * EnsureUTF8: replaces invalid/overlong UTF-8 sequences with question marks
 *****************************************************************************
 * The string is patched in place and keeps its length: each byte that does
 * not start an accepted sequence is overwritten with '?'. Control characters
 * other than TAB, LF and CR are treated as invalid as well.
 * Returns str.
 *
 * Not a goal: converting Latin-1 to UTF-8 on the fly. That cannot be done in
 * place because the UTF-8 form needs more space.
 *****************************************************************************/
#define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF))
char *EnsureUTF8( char *str )
{
    unsigned char *cur = (unsigned char *)str;

    while( *cur != '\0' )
    {
        const unsigned char lead = *cur;
        unsigned char lo = 0x80, hi = 0xBF; /* range allowed for 2nd byte */
        unsigned seqlen;                    /* 0 means "invalid lead byte" */
        unsigned i;
        int ok;

        /* Step 1: classify the lead byte */
        if( ( lead == 0x09 ) || ( lead == 0x0A ) || ( lead == 0x0D )
         || ( ( lead >= 0x20 ) && ( lead <= 0x7F ) ) )
        {
            seqlen = 1; /* US-ASCII */
        }
        else if( lead < 0xC2 )
        {
            /* other control codes, stray continuation bytes, and the
             * 0xC0/0xC1 leads that could only encode overlong forms */
            seqlen = 0;
        }
        else if( lead <= 0xDF )
        {
            seqlen = 2;
        }
        else if( lead <= 0xEF )
        {
            seqlen = 3;
            if( lead == 0xE0 )
                lo = 0xA0; /* below that would be overlong */
            if( lead == 0xED )
                hi = 0x9F; /* above that would be a UTF-16 surrogate */
        }
        else if( lead <= 0xF4 )
        {
            seqlen = 4;
            if( lead == 0xF0 )
                lo = 0x90; /* below that would be overlong */
            if( lead == 0xF4 )
                hi = 0x8F; /* above that would exceed U+10FFFF */
        }
        else
        {
            seqlen = 0;
        }

        /* Step 2: validate the trailing bytes, strictly left to right.
         * A byte is only inspected once the previous one was accepted
         * (hence non-NUL), so the terminator is never read past. */
        ok = ( seqlen != 0 );
        if( ok && ( seqlen > 1 ) )
            ok = ( cur[1] >= lo ) && ( cur[1] <= hi );
        for( i = 2; ok && ( i < seqlen ); i++ )
            ok = isutf8cont( cur[i] );

        /* Step 3: accept the whole sequence, or blank out the lead byte
         * only and resume the scan on the very next byte */
        if( ok )
            cur += seqlen;
        else
            *cur++ = '?';
    }

    return str;
}
00177 
00178 /**********************************************************************
00179  * UTF32toUTF8: converts an array from UTF-32 to UTF-8
00180  *********************************************************************/
00181 char *UTF32toUTF8( const wchar_t *src, size_t len, size_t *newlen )
00182 {
00183     char *res, *out;
00184 
00185     /* allocate memory */
00186     out = res = (char *)malloc( 6 * len );
00187     if( res == NULL )
00188         return NULL;
00189 
00190     while( len > 0 )
00191     {
00192         uint32_t uv = *src++;
00193         len--;
00194 
00195         if( uv < 0x80 )
00196         {
00197             *out++ = uv;
00198             continue;
00199         }
00200         else
00201         if( uv < 0x800 )
00202         {
00203             *out++ = (( uv >>  6)         | 0xc0);
00204             *out++ = (( uv        & 0x3f) | 0x80);
00205             continue;
00206         }
00207         else
00208         if( uv < 0x10000 )
00209         {
00210             *out++ = (( uv >> 12)         | 0xe0);
00211             *out++ = (((uv >>  6) & 0x3f) | 0x80);
00212             *out++ = (( uv        & 0x3f) | 0x80);
00213             continue;
00214         }
00215         else
00216         {
00217             *out++ = (( uv >> 18)         | 0xf0);
00218             *out++ = (((uv >> 12) & 0x3f) | 0x80);
00219             *out++ = (((uv >>  6) & 0x3f) | 0x80);
00220             *out++ = (( uv        & 0x3f) | 0x80);
00221             continue;
00222         }
00223     }
00224     len = out - res;
00225     res = realloc( res, len );
00226     if( newlen != NULL )
00227         *newlen = len;
00228     return res;
00229 }
00230 
/**********************************************************************
 * FromUTF32: converts an UTF-32 string to UTF-8
 **********************************************************************
 * src must be terminated by a zero element.
 * The result must be free()'d. NULL on error (including a NULL src).
 *********************************************************************/
char *FromUTF32( const wchar_t *src )
{
    size_t len;
    const wchar_t *in;

    if( src == NULL )
        return NULL;

    /* Determine the size of the string, terminator included, so that
     * the terminator gets converted too and the result ends with a NUL
     * byte. Whole elements are compared against zero: looking at only
     * part of an element would mistake values such as U+10000, whose
     * low-order bits are all zero, for the end of the string. */
    for( len = 1, in = src; *in != 0; len++ )
        in++;

    return UTF32toUTF8( src, len, NULL );
}

Generated on Tue Dec 20 10:15:00 2005 for vlc-0.8.4a by  doxygen 1.4.2