Main Page | Modules | Class Hierarchy | Class List | Directories | File List | Class Members | File Members | Related Pages

charset.c

00001 /*****************************************************************************
00002  * charset.c: Determine a canonical name for the current locale's character
00003  *            encoding.
00004  *****************************************************************************
00005  * Copyright (C) 2003-2005 the VideoLAN team
00006  * $Id: charset.c 12751 2005-10-02 16:52:52Z courmisch $
00007  *
00008  * Author: Derk-Jan Hartman <thedj at users.sf.net>
00009  *
00010  * vlc_current_charset() an adaption of mp_locale_charset():
00011  *
00012  *  Copyright (C) 2001-2003 The Mape Project
00013  *  Written by Karel Zak  <[email protected]>.
00014  *
00015  * This program is free software; you can redistribute it and/or modify
00016  * it under the terms of the GNU General Public License as published by
00017  * the Free Software Foundation; either version 2 of the License, or
00018  * (at your option) any later version.
00019  *
00020  * This program is distributed in the hope that it will be useful,
00021  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00022  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00023  * GNU General Public License for more details.
00024  *
00025  * You should have received a copy of the GNU General Public License
00026  * along with this program; if not, write to the Free Software
00027  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
00028  *****************************************************************************/
00029 
00030 #include <stdlib.h>
00031 #include <stdio.h>
00032 #include <vlc/vlc.h>
00033 
00034 #if !defined WIN32
00035 # if HAVE_LANGINFO_CODESET
00036 #  include <langinfo.h>
00037 # else
00038 #  if HAVE_SETLOCALE
00039 #   include <locale.h>
00040 #  endif
00041 # endif
00042 #elif defined WIN32
00043 # include <windows.h>
00044 #endif
00045 
00046 #ifdef SYS_DARWIN
00047 #   include <errno.h>
00048 #   include <string.h>
00049 #endif
00050 
00051 #include "charset.h"
00052 
/* One entry of a charset alias table: a charset name as reported by the
 * platform, and the name to use in its place. The tables built from this
 * type are terminated by an entry whose two members are NULL. The strings
 * are never modified, hence the const qualifiers. */
typedef struct VLCCharsetAlias
{
    const char *psz_alias;  /* platform-specific name to look for */
    const char *psz_name;   /* name handed back to the caller instead */
} VLCCharsetAlias;
00057 
00058 /*
00059  * libcharset loads all of its aliases from an external text file, which is
00060  * a strange and slow solution; we rather use array(s) compiled into the
00061  * source. With a "good" libc this is not needed -- for example on Linux.
00062  *
00063  * Please put only exotic aliases into this function. The libc 'iconv' knows
00064  * a lot of basic aliases (check it first with iconv -l).
00065  *
00066  */
#if defined WIN32 || defined OS2 || !HAVE_LANGINFO_CODESET
/*
 * Guesses a character encoding from the language a locale name starts with.
 *
 * l is a locale name such as "ru", "zh_TW" or "fr_FR@euro"; NULL is
 * accepted. Language (and, for Chinese, country) codes are compared with
 * the START of the name only. Searching the whole name would let text that
 * merely follows the language be mistaken for one: "fr_FR@euro" contains
 * "ro" and would be taken for Romanian.
 *
 * Returns a pointer to a constant string, never NULL. "ISO-8859-1" is
 * returned when l is NULL or no code matches.
 */
static const char* vlc_encoding_from_language( const char *l )
{
    /* The first matching prefix wins, so the country-specific Chinese
     * entries have to stay ahead of the plain "zh" one. */
    static const struct
    {
        const char *psz_prefix;
        const char *psz_charset;
    } p_languages[] =
    {
        { "zh_TW", "Big5" },
        { "zh_HK", "Big5HKSCS" },   /* no MIME charset */
        { "zh",    "GB2312" },
        { "th",    "TIS-620" },
        { "ja",    "EUC-JP" },
        { "ko",    "EUC-KR" },
        { "ru",    "KOI8-R" },
        { "uk",    "KOI8-U" },
        { "pl",    "ISO-8859-2" },
        { "hr",    "ISO-8859-2" },
        { "hu",    "ISO-8859-2" },
        { "cs",    "ISO-8859-2" },
        { "sk",    "ISO-8859-2" },
        { "sl",    "ISO-8859-2" },
        { "eo",    "ISO-8859-3" },
        { "mt",    "ISO-8859-3" },
        { "lt",    "ISO-8859-4" },
        { "la",    "ISO-8859-4" },
        { "bg",    "ISO-8859-5" },
        { "be",    "ISO-8859-5" },
        { "mk",    "ISO-8859-5" },
        { "ar",    "ISO-8859-6" },
        { "el",    "ISO-8859-7" },
        { "he",    "ISO-8859-8" },
        { "iw",    "ISO-8859-8" },
        { "tr",    "ISO-8859-9" },
        { "lv",    "ISO-8859-13" },
        { "cy",    "ISO-8859-14" },
        { "et",    "ISO-8859-15" }, /* all latin1 could be iso15 as well */
        { "ro",    "ISO-8859-2" },  /* or ISO-8859-16 */
        { "am",    "UTF-8" },
        { "vi",    "UTF-8" },
    };
    size_t i;

    if( l != NULL )
    {
        for( i = 0; i < sizeof( p_languages ) / sizeof( p_languages[0] ); i++ )
        {
            const char *psz_prefix = p_languages[i].psz_prefix;

            if( strncmp( l, psz_prefix, strlen( psz_prefix ) ) == 0 )
                return p_languages[i].psz_charset;
        }
    }

    /* We don't know. Go to the default. */
    return "ISO-8859-1";
}
#endif
00100 
00101 static const char* vlc_charset_aliases( const char *psz_name )
00102 {
00103     VLCCharsetAlias     *a;
00104 
00105 #if defined WIN32
00106     VLCCharsetAlias aliases[] =
00107     {
00108         { "CP936",      "GBK" },
00109         { "CP1361",     "JOHAB" },
00110         { "CP20127",    "ASCII" },
00111         { "CP20866",    "KOI8-R" },
00112         { "CP21866",    "KOI8-RU" },
00113         { "CP28591",    "ISO-8859-1" },
00114         { "CP28592",    "ISO-8859-2" },
00115         { "CP28593",    "ISO-8859-3" },
00116         { "CP28594",    "ISO-8859-4" },
00117         { "CP28595",    "ISO-8859-5" },
00118         { "CP28596",    "ISO-8859-6" },
00119         { "CP28597",    "ISO-8859-7" },
00120         { "CP28598",    "ISO-8859-8" },
00121         { "CP28599",    "ISO-8859-9" },
00122         { "CP28605",    "ISO-8859-15" },
00123         { NULL,         NULL }
00124     };
00125 #elif SYS_AIX
00126     VLCCharsetAlias aliases[] =
00127     {
00128         { "IBM-850",    "CP850" },
00129         { "IBM-856",    "CP856" },
00130         { "IBM-921",    "ISO-8859-13" },
00131         { "IBM-922",    "CP922" },
00132         { "IBM-932",    "CP932" },
00133         { "IBM-943",    "CP943" },
00134         { "IBM-1046",   "CP1046" },
00135         { "IBM-1124",   "CP1124" },
00136         { "IBM-1129",   "CP1129" },
00137         { "IBM-1252",   "CP1252" },
00138         { "IBM-EUCCN",  "GB2312" },
00139         { "IBM-EUCJP",  "EUC-JP" },
00140         { "IBM-EUCKR",  "EUC-KR" },
00141         { "IBM-EUCTW",  "EUC-TW" },
00142         { NULL, NULL }
00143     };
00144 #elif SYS_HPUX
00145     VLCCharsetAlias aliases[] =
00146     {
00147         { "ROMAN8",     "HP-ROMAN8" },
00148         { "ARABIC8",    "HP-ARABIC8" },
00149         { "GREEK8",     "HP-GREEK8" },
00150         { "HEBREW8",    "HP-HEBREW8" },
00151         { "TURKISH8",   "HP-TURKISH8" },
00152         { "KANA8",      "HP-KANA8" },
00153         { "HP15CN",     "GB2312" },
00154         { NULL, NULL }
00155     };
00156 #elif SYS_IRIX
00157     VLCCharsetAlias aliases[] =
00158     {
00159         { "EUCCN",      "GB2312" },
00160         { NULL, NULL }
00161     };
00162 #elif SYS_OSF
00163     VLCCharsetAlias aliases[] =
00164     {
00165         { "KSC5601",    "CP949" },
00166         { "SDECKANJI",  "EUC-JP" },
00167         { "TACTIS",     "TIS-620" },
00168         { NULL, NULL }
00169     };
00170 #elif SYS_SOLARIS
00171     VLCCharsetAlias aliases[] =
00172     {
00173         { "646",        "ASCII" },
00174         { "CNS11643",   "EUC-TW" },
00175         { "5601",       "EUC-KR" },
00176         { "JOHAP92",    "JOHAB" },
00177         { "PCK",        "SHIFT_JIS" },
00178         { "2533",       "TIS-620" },
00179         { NULL, NULL }
00180     };
00181 #elif SYS_BSD
00182     VLCCharsetAlias aliases[] =
00183     {
00184         { "646", " ASCII" },
00185         { "EUCCN", "GB2312" },
00186         { NULL, NULL }
00187     };
00188 #else
00189     VLCCharsetAlias aliases[] = {{NULL, NULL}};
00190 #endif
00191 
00192     if( aliases )
00193     {
00194         for (a = aliases; a->psz_alias; a++)
00195             if (strcasecmp (a->psz_alias, psz_name) == 0)
00196                 return a->psz_name;
00197     }
00198 
00199     /* we return original name beacuse iconv() probably will know
00200      * something better about name if we don't know it :-) */
00201     return psz_name;
00202 }
00203 
00204 /* Returns charset from "language_COUNTRY.charset@modifier" string */
#if defined WIN32 || defined OS2 || !HAVE_LANGINFO_CODESET
/*
 * Extracts the charset from a "language_COUNTRY.charset@modifier" string.
 *
 * psz_locale is the locale name; NULL is accepted. psz_charset is the
 * output buffer and must have room for 2 + 10 + 1 bytes, which is the size
 * of the buffers the callers in this file pass.
 *
 * The text between the first '.' and the '@' (or the end of the string) is
 * copied to psz_charset when it is not empty and fits in the buffer. In
 * every other case the charset is guessed from the language with
 * vlc_encoding_from_language(). A NULL locale yields an empty string.
 *
 * The charset part used to be copied with an unbounded strcpy() when no
 * '@' was present, so a long value (the locale name may come straight from
 * the environment) overflowed the caller's buffer.
 */
static void vlc_encoding_from_locale( const char *psz_locale, char *psz_charset )
{
    /* Capacity of psz_charset, terminating NUL included. */
    enum { CHARSET_BUFFER_SIZE = 2 + 10 + 1 };
    const char *psz_dot;

    if( psz_locale == NULL )
    {
        psz_charset[0] = '\0';
        return;
    }

    psz_dot = strchr( psz_locale, '.' );
    if( psz_dot != NULL )
    {
        const char *psz_begin = psz_dot + 1;
        /* Length up to the possible @... trailer, which is dropped. */
        size_t i_length = strcspn( psz_begin, "@" );

        if( i_length > 0 && i_length < (size_t)CHARSET_BUFFER_SIZE )
        {
            memcpy( psz_charset, psz_begin, i_length );
            psz_charset[ i_length ] = '\0';
            return;
        }
    }

    /* try language mapping; the names it returns are short enough for
     * the buffer */
    strcpy( psz_charset, vlc_encoding_from_language( psz_locale ) );
}
#endif
00236 
00237 vlc_bool_t vlc_current_charset( char **psz_charset )
00238 {
00239     const char *psz_codeset;
00240 
00241 #if !(defined WIN32 || defined OS2 || defined SYS_DARWIN)
00242 
00243 # if HAVE_LANGINFO_CODESET
00244     /* Most systems support nl_langinfo( CODESET ) nowadays.  */
00245     psz_codeset = nl_langinfo( CODESET );
00246     if( !strcmp( psz_codeset, "ANSI_X3.4-1968" ) )
00247         psz_codeset = "ASCII";
00248 # else
00249     /* On old systems which lack it, use setlocale or getenv.  */
00250     const char *psz_locale = NULL;
00251     char buf[2 + 10 + 1];
00252 
00253     /* But most old systems don't have a complete set of locales.  Some
00254      * (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
00255      * use setlocale here; it would return "C" when it doesn't support the
00256      * locale name the user has set. Darwin's setlocale is broken. */
00257 #  if HAVE_SETLOCALE && !SYS_DARWIN
00258     psz_locale = setlocale( LC_ALL, NULL );
00259 #  endif
00260     if( psz_locale == NULL || psz_locale[0] == '\0' )
00261     {
00262         psz_locale = getenv( "LC_ALL" );
00263         if( psz_locale == NULL || psz_locale[0] == '\0' )
00264         {
00265             psz_locale = getenv( "LC_CTYPE" );
00266             if( psz_locale == NULL || psz_locale[0] == '\0')
00267                 psz_locale = getenv( "LANG" );
00268         }
00269     }
00270 
00271     /* On some old systems, one used to set locale = "iso8859_1". On others,
00272      * you set it to "language_COUNTRY.charset". Darwin only has LANG :( */
00273     vlc_encoding_from_locale( (char *)psz_locale, buf );
00274     psz_codeset =  buf;
00275 # endif /* HAVE_LANGINFO_CODESET */
00276 
00277 #elif defined SYS_DARWIN
00278 
00279     /* Darwin is always using UTF-8 internally. */
00280     psz_codeset = "UTF-8";
00281 
00282 #elif defined WIN32
00283 
00284     char buf[2 + 10 + 1];
00285 
00286     /* Woe32 has a function returning the locale's codepage as a number.  */
00287     sprintf( buf, "CP%u", GetACP() );
00288     psz_codeset = buf;
00289 
00290 #elif defined OS2
00291 
00292     const char *psz_locale;
00293     char buf[2 + 10 + 1];
00294     ULONG cp[3];
00295     ULONG cplen;
00296 
00297     /* Allow user to override the codeset, as set in the operating system,
00298      * with standard language environment variables. */
00299     psz_locale = getenv( "LC_ALL" );
00300     if( psz_locale == NULL || psz_locale[0] == '\0' )
00301     {
00302         psz+locale = getenv( "LC_CTYPE" );
00303         if( psz_locale == NULL || locale[0] == '\0' )
00304             locale = getenv( "LANG" );
00305     }
00306     if( psz_locale != NULL && psz_locale[0] != '\0' )
00307         vlc_encoding_from_locale( psz_locale, buf );
00308         psz_codeset = buf;
00309     else
00310     {
00311         /* OS/2 has a function returning the locale's codepage as a number. */
00312         if( DosQueryCp( sizeof( cp ), cp, &cplen ) )
00313             psz_codeset = "";
00314         else
00315         {
00316             sprintf( buf, "CP%u", cp[0] );
00317             psz_codeset = buf;
00318         }
00319     }
00320 #endif
00321     if( psz_codeset == NULL )
00322         /* The canonical name cannot be determined. */
00323         psz_codeset = "";
00324     else
00325         psz_codeset = vlc_charset_aliases( psz_codeset );
00326 
00327     /* Don't return an empty string.  GNU libc and GNU libiconv interpret
00328      * the empty string as denoting "the locale's character encoding",
00329      * thus GNU libiconv would call this function a second time. */
00330     if( psz_codeset[0] == '\0' )
00331     {
00332         /* Last possibility is 'CHARSET' enviroment variable */
00333         if( !( psz_codeset = getenv( "CHARSET" ) ) )
00334             psz_codeset = "ISO-8859-1";
00335     }
00336 
00337     if( psz_charset )
00338         *psz_charset = strdup(psz_codeset);
00339 
00340     if( !strcasecmp(psz_codeset, "UTF8") || !strcasecmp(psz_codeset, "UTF-8") )
00341         return VLC_TRUE;
00342 
00343     return VLC_FALSE;
00344 }
00345 
00346 char *__vlc_fix_readdir_charset( vlc_object_t *p_this, const char *psz_string )
00347 {
00348 #ifdef SYS_DARWIN
00349     if ( p_this->p_libvlc->iconv_macosx != (vlc_iconv_t)-1 )
00350     {
00351         const char *psz_in = psz_string;
00352         size_t i_in = strlen(psz_in);
00353         size_t i_out = i_in * 2;
00354         char *psz_utf8 = malloc(i_out + 1);
00355         char *psz_out = psz_utf8;
00356 
00357         vlc_mutex_lock( &p_this->p_libvlc->iconv_lock );
00358         size_t i_ret = vlc_iconv( p_this->p_libvlc->iconv_macosx,
00359                                   &psz_in, &i_in, &psz_out, &i_out );
00360         vlc_mutex_unlock( &p_this->p_libvlc->iconv_lock );
00361         if( i_ret == (size_t)-1 || i_in )
00362         {
00363             msg_Warn( p_this,
00364                       "failed to convert \"%s\" from HFS+ charset (%s)",
00365                       psz_string, strerror(errno) );
00366             free( psz_utf8 );
00367             return strdup( psz_string );
00368         }
00369 
00370         *psz_out = '\0';
00371         return psz_utf8;
00372     }
00373 #endif
00374 
00375     return strdup( psz_string );
00376 }

Generated on Tue Dec 20 10:15:00 2005 for vlc-0.8.4a by  doxygen 1.4.2