csutil/csuctransform.h

Go to the documentation of this file.
00001 /*
00002     Copyright (C) 2003 by Frank Richter
00003 
00004     This library is free software; you can redistribute it and/or
00005     modify it under the terms of the GNU Library General Public
00006     License as published by the Free Software Foundation; either
00007     version 2 of the License, or (at your option) any later version.
00008 
00009     This library is distributed in the hope that it will be useful,
00010     but WITHOUT ANY WARRANTY; without even the implied warranty of
00011     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00012     Library General Public License for more details.
00013 
00014     You should have received a copy of the GNU Library General Public
00015     License along with this library; if not, write to the Free
00016     Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00017 */
00018 
00019 #ifndef __CS_CSUCTRANSFORM_H__
00020 #define __CS_CSUCTRANSFORM_H__
00021 
00022 #include "csunicode.h"
00023 
00035 #define CS_UC_MAX_UTF8_ENCODED          4  /* 6 to encode 32 bit */
00036 
00040 #define CS_UC_MAX_UTF16_ENCODED         2
00041 
00045 #define CS_UC_MAX_UTF32_ENCODED         1
00046 #if (CS_WCHAR_T_SIZE == 1)
00047 #define CS_UC_MAX_WCHAR_T_ENCODED       CS_UC_MAX_UTF8_ENCODED
00048 #elif (CS_WCHAR_T_SIZE == 2)
00049 
00053 #define CS_UC_MAX_WCHAR_T_ENCODED       CS_UC_MAX_UTF16_ENCODED
00054 #else
00055 #define CS_UC_MAX_WCHAR_T_ENCODED       CS_UC_MAX_UTF32_ENCODED
00056 #endif
00057 
00061 #define CS_UC_MAX_MAPPED                3
00062 
/**
 * Flag bits.
 * NOTE(review): presumably meant for the 'flags' argument of MapToUpper(),
 * MapToLower() and MapToFold(); those are only declared in this header, so
 * the exact effect of a flag cannot be confirmed from here.
 */
enum
{
  /** Bit 0 of the flags mask. */
  csUcMapSimple = 0x1
};
00075 
00079 class CS_CRYSTALSPACE_EXPORT csUnicodeTransform
00080 {
00081 public:
/*
 * Helper macros for the UTF*Decode() methods below. They operate on the
 * parameters/locals 'str', 'strlen', 'ch', 'isValid' and 'chUsed' of the
 * function they are expanded in and are #undef'd after the decoders.
 */

/* Failure exit: flag invalid (if requested), hand back the replacement
 * character in 'ch' and return 'ret' from the enclosing function. */
#define FAIL(ret)                               \
  {                                             \
    if (isValid != 0) *isValid = false;         \
    ch = CS_UC_CHAR_REPLACER;                   \
    return ret;                                 \
  }

/* Success exit: flag valid (if requested) and return the number of code
 * units consumed so far. */
#define SUCCEED                                 \
  {                                             \
    if (isValid != 0) *isValid = true;          \
    return chUsed;                              \
  }

/* Fetch the next code unit into 'next'. Fails (without consuming anything
 * further) when the input is exhausted or the next unit is a zero
 * terminator; otherwise advances 'str' and counts the unit in 'chUsed'. */
#define GET_NEXT(next)                          \
  if (strlen == (size_t)chUsed)                 \
  {                                             \
    FAIL(chUsed);                               \
  }                                             \
  next = *str++;                                \
  if (next == 0)                                \
  {                                             \
    FAIL(chUsed);                               \
  }                                             \
  chUsed++;
00104   
00123   inline static int UTF8Decode (const utf8_char* str, size_t strlen, 
00124     utf32_char& ch, bool* isValid = 0, bool returnNonChar = false)
00125   {
00126     if (str == 0)
00127     {
00128       FAIL(0);
00129     }
00130     int chUsed = 0;
00131     
00132     utf8_char curCh;
00133     GET_NEXT(curCh);
00134     if ((curCh & 0x80) == 0)
00135     {
00136       // easy case
00137       ch = curCh;
00138       SUCCEED;
00139     }
00140     else
00141     {
00142       // Count with how many bytes this char is encoded.
00143       int n = 0;
00144       while ((n < 7) && ((curCh & (1 << (7 - n))) != 0)) { n++; }
00145 
00146       if ((n < 2) || (n > 6))
00147       {
00148         // Invalid code: first char of a "sequence" must have
00149         // at least two and at most six MSBs set
00150         FAIL(1);
00151       }
00152 
00153       ch = (curCh & ((1 << (8 - n)) - 1));
00154       
00155       for (int i = 1; i < n; i++)
00156       {
00157         GET_NEXT(curCh);
00158         if ((curCh & 0xc0) != 0x80)
00159         {
00160           FAIL(chUsed);
00161         }
00162         else
00163         {
00164           ch <<= 6;
00165           ch |= (curCh & 0x3f);
00166         }
00167       }
00168       
00169       // Check if in Unicode range.
00170       if (ch > CS_UC_LAST_CHAR)
00171       {
00172         FAIL(chUsed);
00173       }
00174 
00175       // Check for "overlong" codes.
00176       if ((ch < 0x80) && (n > 0))
00177       {
00178         FAIL(chUsed);
00179       }
00180       else if ((ch < 0x800) && (n > 2))
00181       {
00182         FAIL(chUsed);
00183       }
00184       else if ((ch < 0x10000) && (n > 3))
00185       {
00186         FAIL(chUsed);
00187       }
00188       else if ((ch < 0x200000) && (n > 4))
00189       {
00190         FAIL(chUsed);
00191       }
00192       /* 
00193       else if ((ch < 0x4000000) && (n > 5))
00194       {
00195         FAIL(chUsed);
00196       }
00197       else if ((ch < 0x80000000) && (n > 6))
00198       {
00199         FAIL(chUsed);
00200       }
00201       */
00202       
00203       if (!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 
00204         || CS_UC_IS_SURROGATE(ch)))
00205         FAIL(chUsed);
00206       SUCCEED;
00207     }
00208   }
00209   
00214   inline static int UTF16Decode (const utf16_char* str, size_t strlen, 
00215     utf32_char& ch, bool* isValid = 0, bool returnNonChar = false)
00216   {
00217     if (str == 0)
00218     {
00219       FAIL(0);
00220     }
00221     int chUsed = 0;
00222     
00223     utf16_char curCh;
00224     GET_NEXT(curCh);
00225     // Decode surrogate
00226     if (CS_UC_IS_SURROGATE (curCh))
00227     {
00228       // Invalid code
00229       if (!CS_UC_IS_HIGH_SURROGATE (curCh))
00230       {
00231         FAIL(chUsed);
00232       }
00233       ch = 0x10000 + ((curCh & 0x03ff) << 10);
00234       GET_NEXT(curCh);
00235       // Invalid code
00236       if (!CS_UC_IS_LOW_SURROGATE (curCh))
00237       {
00238         // Fail with 1 so the char is handled upon the next Decode.
00239         FAIL(1);
00240       }
00241       ch |= (curCh & 0x3ff);
00242     }
00243     else
00244     {
00245       ch = curCh;
00246     }
00247     if (!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 
00248       || CS_UC_IS_SURROGATE(ch)))
00249       FAIL(chUsed);
00250     SUCCEED;
00251   }
00252   
00257   inline static int UTF32Decode (const utf32_char* str, size_t strlen, 
00258     utf32_char& ch, bool* isValid = 0, bool returnNonChar = false)
00259   {
00260     if (str == 0)
00261     {
00262       FAIL(0);
00263     }
00264     int chUsed = 0;
00265     
00266     GET_NEXT(ch);
00267     if ((!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 
00268       || CS_UC_IS_SURROGATE(ch))) || (ch > CS_UC_LAST_CHAR))
00269       FAIL(chUsed);
00270     SUCCEED;
00271   }
00272 
00277   inline static int Decode (const utf8_char* str, size_t strlen, 
00278     utf32_char& ch, bool* isValid = 0, bool returnNonChar = false)
00279   {
00280     return UTF8Decode (str, strlen, ch, isValid, returnNonChar);
00281   }
00286   inline static int Decode (const utf16_char* str, size_t strlen, 
00287     utf32_char& ch, bool* isValid = 0, bool returnNonChar = false)
00288   {
00289     return UTF16Decode (str, strlen, ch, isValid, returnNonChar);
00290   }
00295   inline static int Decode (const utf32_char* str, size_t strlen, 
00296     utf32_char& ch, bool* isValid = 0, bool returnNonChar = false)
00297   {
00298     return UTF32Decode (str, strlen, ch, isValid, returnNonChar);
00299   }
00300 
00302 #undef FAIL
00303 #undef SUCCEED
00304 #undef GET_NEXT
00305 
/*
 * Append one code unit to an output buffer.
 *
 * Expects the locals 'bufRemaining' (units still free) and 'encodedLen'
 * (units produced so far) in the expanding function. The unit is only stored
 * while room is left and 'buf' is non-null, but 'encodedLen' is always
 * advanced, so the final count is the size the full encoding needs.
 *
 * Wrapped in do { } while (0) so an invocation is a single statement even
 * when used as the unbraced body of an if/else/for.
 */
#define _OUTPUT_CHAR(buf, chr)                          \
  do                                                    \
  {                                                     \
    if (bufRemaining > 0)                               \
    {                                                   \
      if (buf) *buf++ = (chr);                          \
      bufRemaining--;                                   \
    }                                                   \
    encodedLen++;                                       \
  } while (0)

/* Shorthand writing to the 'buf' parameter of the Encode*() methods. */
#define OUTPUT_CHAR(chr) _OUTPUT_CHAR(buf, chr)
00317   
00331   inline static int EncodeUTF8 (const utf32_char ch, utf8_char* buf, 
00332     size_t bufsize, bool allowNonchars = false)
00333   {
00334     if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 
00335       || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR))
00336       return 0;
00337     size_t bufRemaining = bufsize;
00338     int encodedLen = 0;
00339     
00340     if (ch < 0x80)
00341     {
00342       OUTPUT_CHAR ((utf8_char)ch);
00343     }
00344     else if (ch < 0x800)
00345     {
00346       OUTPUT_CHAR ((utf8_char)(0xc0 | (ch >> 6)));
00347       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00348     }
00349     else if (ch < 0x10000)
00350     {
00351       OUTPUT_CHAR ((utf8_char)(0xe0 | (ch >> 12)));
00352       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00353       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00354     }
00355     else if (ch < 0x200000)
00356     {
00357       OUTPUT_CHAR ((utf8_char)(0xf0 | (ch >> 18)));
00358       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00359       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00360       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00361     }
00362     /*
00363     else if (ch < 0x4000000)
00364     {
00365       OUTPUT_CHAR ((utf8_char)(0xf8 | (ch >> 24)));
00366       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f)));
00367       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00368       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00369       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00370     }
00371     else if (ch < 0x80000000)
00372     {
00373       OUTPUT_CHAR ((utf8_char)(0xfc | (ch >> 30)));
00374       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 24) & 0x3f)));
00375       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f)));
00376       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f)));
00377       OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f)));
00378       OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f)));
00379     }
00380     */
00381     return encodedLen;
00382   }
00383     
00388   inline static int EncodeUTF16 (const utf32_char ch, utf16_char* buf, 
00389     size_t bufsize, bool allowNonchars = false)
00390   {
00391     if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 
00392       || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR))
00393       return 0;
00394     size_t bufRemaining = bufsize;
00395     int encodedLen = 0;
00396     
00397     if (ch < 0x10000)
00398     {
00399       OUTPUT_CHAR((utf16_char)ch);
00400     }
00401     else if (ch < 0x100000)
00402     {
00403       utf32_char ch_shifted = ch - 0x10000;
00404       OUTPUT_CHAR((utf16_char)((ch_shifted >> 10) 
00405         | CS_UC_CHAR_HIGH_SURROGATE_FIRST));
00406       OUTPUT_CHAR((utf16_char)((ch_shifted & 0x3ff) 
00407         | CS_UC_CHAR_LOW_SURROGATE_FIRST));
00408     }
00409     else
00410       return 0;
00411     
00412     return encodedLen;
00413   }
00414 
00419   inline static int EncodeUTF32 (const utf32_char ch, utf32_char* buf, 
00420     size_t bufsize, bool allowNonchars = false)
00421   {
00422     if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 
00423       || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR))
00424       return 0;
00425     size_t bufRemaining = bufsize;
00426     int encodedLen = 0;
00427     
00428     OUTPUT_CHAR(ch);
00429     
00430     return encodedLen;
00431   }
00432 
00437   inline static int Encode (const utf32_char ch, utf8_char* buf, 
00438     size_t bufsize, bool allowNonchars = false)
00439   {
00440     return EncodeUTF8 (ch, buf, bufsize, allowNonchars);
00441   }
00446   inline static int Encode (const utf32_char ch, utf16_char* buf, 
00447     size_t bufsize, bool allowNonchars = false)
00448   {
00449     return EncodeUTF16 (ch, buf, bufsize, allowNonchars);
00450   }
00455   inline static int Encode (const utf32_char ch, utf32_char* buf, 
00456     size_t bufsize, bool allowNonchars = false)
00457   {
00458     return EncodeUTF32 (ch, buf, bufsize, allowNonchars);
00459   }
00461 #undef OUTPUT_CHAR
00462   
00465 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(dest, chr)
00466   
/*
 * Generates a string conversion method
 *   size_t funcName (toType* dest, size_t destSize,
 *                    const fromType* source, size_t srcSize = (size_t)-1)
 * that decodes 'source' one code point at a time with 'decoder' and
 * re-encodes it into 'dest' with 'encoder'.
 *
 * - destSize is the capacity of dest in code units; one unit is reserved
 *   for the zero terminator. dest may be null to only compute the size.
 * - srcSize is the number of source code units; (size_t)-1 means "up to the
 *   zero terminator".
 * - A code point the encoder refuses is replaced by CS_UC_CHAR_REPLACER.
 * - Returns the number of code units the complete output needs, terminator
 *   included (this can exceed destSize). Returns 0, leaving dest untouched,
 *   if source is null or srcSize is 0.
 * - The terminator is only stored when dest actually has room for it, i.e.
 *   never for destSize == 0.
 */
#define UCTF_CONVERTER(funcName, fromType, decoder, toType, encoder)    \
  inline static size_t funcName (toType* dest, size_t destSize,         \
    const fromType* source, size_t srcSize = (size_t)-1)                \
  {                                                                     \
    if ((srcSize == 0) || (source == 0))                                \
      return 0;                                                         \
                                                                        \
    /* Keep one unit free for the terminator. */                        \
    size_t bufRemaining = (destSize > 0) ? destSize - 1 : 0;            \
    size_t encodedLen = 0;                                              \
                                                                        \
    size_t srcChars = srcSize;                                          \
                                                                        \
    if (srcSize == (size_t)-1)                                          \
    {                                                                   \
      /* Measure the zero-terminated source. */                         \
      srcChars = 0;                                                     \
      const fromType* sptr = source;                                    \
      while (*sptr++ != 0) srcChars++;                                  \
    }                                                                   \
                                                                        \
    while (srcChars > 0)                                                \
    {                                                                   \
      utf32_char ch;                                                    \
      int scnt = decoder (source, srcChars, ch, 0);                     \
      if (scnt == 0) break;                                             \
      int dcnt = encoder (ch, dest, bufRemaining);                      \
      if (dcnt == 0)                                                    \
      {                                                                 \
        dcnt = encoder (CS_UC_CHAR_REPLACER, dest, bufRemaining);       \
      }                                                                 \
                                                                        \
      if ((size_t)dcnt >= bufRemaining)                                 \
      {                                                                 \
        /* Output is full from here on; keep counting only. */          \
        if (dest && (destSize > 0)) dest += bufRemaining;               \
        bufRemaining = 0;                                               \
      }                                                                 \
      else                                                              \
      {                                                                 \
        bufRemaining -= dcnt;                                           \
        if (dest && (destSize > 0)) dest += dcnt;                       \
      }                                                                 \
      encodedLen += dcnt;                                               \
      if ((size_t)scnt >= srcChars) break;                              \
      srcChars -= scnt;                                                 \
      source += scnt;                                                   \
    }                                                                   \
                                                                        \
    /* Only terminate when the reserved unit really exists. */          \
    if (dest && (destSize > 0)) *dest = 0;                              \
                                                                        \
    return encodedLen + 1;                                              \
  }
00517 
00533   UCTF_CONVERTER (UTF8to16, utf8_char, UTF8Decode, utf16_char, EncodeUTF16);
00538   UCTF_CONVERTER (UTF8to32, utf8_char, UTF8Decode, utf32_char, EncodeUTF32);
00539 
00544   UCTF_CONVERTER (UTF16to8, utf16_char, UTF16Decode, utf8_char, EncodeUTF8);
00549   UCTF_CONVERTER (UTF16to32, utf16_char, UTF16Decode, utf32_char, EncodeUTF32);
00550   
00555   UCTF_CONVERTER (UTF32to8, utf32_char, UTF32Decode, utf8_char, EncodeUTF8);
00560   UCTF_CONVERTER (UTF32to16, utf32_char, UTF32Decode, utf16_char, EncodeUTF16);
00563 #undef UCTF_CONVERTER
00564 #undef OUTPUT_CHAR
00565 #undef _OUTPUT_CHAR
00566 
00567 #if (CS_WCHAR_T_SIZE == 1)
00568   inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 
00569     const utf8_char* source, size_t srcSize)
00570   {
00571     size_t srcChars = srcSize;                                          
00572     if (srcSize == (size_t)-1)                                          
00573     {                                                                   
00574       srcChars = 0;                                                     
00575       const utf8_char* sptr = source;                                   
00576       while (*sptr++ != 0) srcChars++;                                  
00577     }                           
00578     if ((dest != 0) && (destSize != 0))
00579     {
00580       size_t len = MIN (destSize - 1, srcChars);
00581       memcpy (dest, source, size * sizeof (wchar_t));
00582       *(dest + len) = 0;
00583     }
00584     return srcChars + 1;
00585   };
00586 
00587   inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 
00588     const utf16_char* source, size_t srcSize)
00589   {
00590     return UTF16to8 ((utf8_char*)dest, destSize, source, srcSize);
00591   };
00592 
00593   inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 
00594     const utf32_char* source, size_t srcSize)
00595   {
00596     return UTF32to8 ((utf8_char*)dest, destSize, source, srcSize);
00597   };
00598   
00599   inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 
00600     const wchar_t* source, size_t srcSize)
00601   {
00602     size_t srcChars = srcSize;                                          
00603     if (srcSize == (size_t)-1)                                          
00604     {                                                                   
00605       srcChars = 0;                                                     
00606       const wchar_t* sptr = source;                                     
00607       while (*sptr++ != 0) srcChars++;                                  
00608     }                           
00609     if ((dest != 0) && (destSize != 0))
00610     {
00611       size_t len = MIN (destSize - 1, srcChars);
00612       memcpy (dest, source, len * sizeof (wchar_t));
00613       *(dest + len) = 0;
00614     }
00615     return srcChars + 1;
00616   };
00617 
00618   inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 
00619     const wchar_t* source, size_t srcSize)
00620   {
00621     return UTF8to16 (dest, destSize, source, srcSize);
00622   };
00623 
00624   inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 
00625     const wchar_t* source, size_t srcSize)
00626   {
00627     return UTF8to32 (dest, destSize, source, srcSize);
00628   };
00629 
00630   inline static int Decode (const wchar_t* str, size_t strlen, 
00631     utf32_char& ch, bool* isValid = 0, bool returnNonChar = false)
00632   {
00633     return UTF8Decode ((utf8_char*)str, strlen, ch, isValid, returnNonChar);
00634   }
00635   inline static int Encode (const utf32_char ch, wchar_t* buf, 
00636     size_t bufsize, bool allowNonchars = false)
00637   {
00638     return EncodeUTF8 (ch, (utf8_char*)buf, bufsize, allowNonchars);
00639   }
00640 #elif (CS_WCHAR_T_SIZE == 2)
00641   // Methods below for doxygen documentation are here as the size '2' is 
00642   // default.
00643   
00650   inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 
00651     const utf8_char* source, size_t srcSize)
00652   {
00653     return UTF8to16 ((utf16_char*)dest, destSize, source, srcSize);
00654   };
00655 
00660   inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 
00661     const utf16_char* source, size_t srcSize)
00662   {
00663     size_t srcChars = srcSize;                                          
00664     if (srcSize == (size_t)-1)                                          
00665     {                                                                   
00666       srcChars = 0;                                                     
00667       const utf16_char* sptr = source;                                  
00668       while (*sptr++ != 0) srcChars++;                                  
00669     }                           
00670     if ((dest != 0) && (destSize != 0))
00671     {
00672       size_t len = MIN (destSize - 1, srcChars);
00673       memcpy (dest, source, len * sizeof (wchar_t));
00674       *(dest + len) = 0;
00675     }
00676     return srcChars + 1;
00677   };
00678 
00683   inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 
00684     const utf32_char* source, size_t srcSize)
00685   {
00686     return UTF32to16 ((utf16_char*)dest, destSize, source, srcSize);
00687   };
00688   
00693   inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 
00694     const wchar_t* source, size_t srcSize)
00695   {
00696     return UTF16to8 (dest, destSize, (utf16_char*)source, srcSize);
00697   };
00698 
00703   inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 
00704     const wchar_t* source, size_t srcSize)
00705   {
00706     size_t srcChars = srcSize;                                          
00707     if (srcSize == (size_t)-1)                                          
00708     {                                                                   
00709       srcChars = 0;                                                     
00710       const wchar_t* sptr = source;                                     
00711       while (*sptr++ != 0) srcChars++;                                  
00712     }                           
00713     if ((dest != 0) && (destSize != 0))
00714     {
00715       size_t len = MIN (destSize - 1, srcChars);
00716       memcpy (dest, source, len * sizeof (wchar_t));
00717       *(dest + len) = 0;
00718     }
00719     return srcChars + 1;
00720   };
00721 
00726   inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 
00727     const wchar_t* source, size_t srcSize)
00728   {
00729     return UTF16to32 (dest, destSize, (utf16_char*)source, srcSize);
00730   };
00731 
00732   /* Decode()/Encode() overloads for wchar_t.
00733    * - On VC7+, wchar_t may be an unsigned short or the special type __wchar_t.
00734    * - On VC6 wchar_t is always an unsigned short. __wchar_t does not exist.
00735    * Now there may be conflicts with the utf16_char overloads if wchar_t is
00736    * an unsigned short. On the other hand, we would like to support VC7+'s
00737    * built-in wchar_t as well.
00738    * So: on VC7+, provide overloads for __wchar_t, on VC6, don't compile this
00739    * code at all, on other compilers, provide overloads for wchar_t instead
00740    * (by re-#defining __wchar_t). 
00741    */
00742 #if !defined(CS_COMPILER_MSVC) || (_MSC_VER > 1300)
00743 #if !defined(CS_COMPILER_MSVC)
00744   #define __wchar_t wchar_t
00745 #endif  
00746 
00750   inline static int Decode (const __wchar_t* str, size_t strlen, 
00751     utf32_char& ch, bool* isValid = 0, bool returnNonChar = false)
00752   {
00753     return UTF16Decode ((utf16_char*)str, strlen, ch, isValid, returnNonChar);
00754   }
00759   inline static int Encode (const utf32_char ch, __wchar_t* buf, 
00760     size_t bufsize, bool allowNonchars = false)
00761   {
00762     return EncodeUTF16 (ch, (utf16_char*)buf, bufsize, allowNonchars);
00763   }
00764 #ifdef __wchar_t
00765   #undef __wchar_t
00766 #endif
00767 #endif
00768 
00769 #elif (CS_WCHAR_T_SIZE == 4)
00770   inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 
00771     const utf8_char* source, size_t srcSize)
00772   {
00773     return UTF8to32 ((utf32_char*)dest, destSize, source, srcSize);
00774   };
00775 
00776   inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 
00777     const utf16_char* source, size_t srcSize)
00778   {
00779     return UTF16to32 ((utf32_char*)dest, destSize, source, srcSize);
00780   };
00781 
00782   inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 
00783     const utf32_char* source,  size_t srcSize)
00784   {
00785     size_t srcChars = srcSize;                                          
00786     if (srcSize == (size_t)-1)                                          
00787     {                                                                   
00788       srcChars = 0;                                                     
00789       const utf32_char* sptr = source;                                  
00790       while (*sptr++ != 0) srcChars++;                                  
00791     }                           
00792     if ((dest != 0) && (destSize != 0))
00793     {
00794       size_t len = MIN (destSize - 1, srcChars);
00795       memcpy (dest, source, len * sizeof (wchar_t));
00796       *(dest + len) = 0;
00797     }
00798     return srcChars + 1;
00799   };
00800   
00801   inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 
00802     const wchar_t* source, size_t srcSize)
00803   {
00804     return UTF32to8 (dest, destSize, (utf32_char*)source, srcSize);
00805   };
00806 
00807   inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 
00808     const wchar_t* source, size_t srcSize)
00809   {
00810     return UTF32to16 (dest, destSize, (utf32_char*)source, srcSize);
00811   };
00812 
00813   inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 
00814     const wchar_t* source, size_t srcSize)
00815   {
00816     size_t srcChars = srcSize;                                          
00817     if (srcSize == (size_t)-1)                                          
00818     {                                                                   
00819       srcChars = 0;                                                     
00820       const wchar_t* sptr = source;                                     
00821       while (*sptr++ != 0) srcChars++;                                  
00822     }                           
00823     if ((dest != 0) && (destSize != 0))
00824     {
00825       size_t len = MIN (destSize - 1, srcChars);
00826       memcpy (dest, source, len * sizeof (wchar_t));
00827       *(dest + len) = 0;
00828     }
00829     return srcChars + 1;
00830   };
00831 
00832   inline static int Decode (const wchar_t* str, size_t strlen, 
00833     utf32_char& ch, bool* isValid = 0, bool returnNonChar = false)
00834   {
00835     return UTF32Decode ((utf32_char*)str, strlen, ch, isValid, returnNonChar);
00836   }
00837   inline static int Encode (const utf32_char ch, wchar_t* buf, 
00838     size_t bufsize, bool allowNonchars = false)
00839   {
00840     return EncodeUTF32 (ch, (utf32_char*)buf, bufsize, allowNonchars);
00841   }
00842 #else
00843   #error Odd-sized, unsupported wchar_t!
00844 #endif
00845 
00858   inline static int UTF8Skip (const utf8_char* str, size_t maxSkip)
00859   {
00860     if (maxSkip < 1) return 0;
00861   
00862     if ((*str & 0x80) == 0)
00863     {
00864       return 1;
00865     }
00866     else
00867     {
00868       int n = 0;
00869       while ((n < 7) && ((*str & (1 << (7 - n))) != 0)) { n++; }
00870 
00871       if ((n < 2) || (n > 6))
00872       {
00873         return 1;
00874       }
00875 
00876       int skip = 1;
00877       
00878       for (; skip < n; skip++)
00879       {
00880         if (((str[skip] & 0xc0) != 0x80) || ((size_t)skip > maxSkip))
00881         {
00882           break;
00883         }
00884       }
00885       return skip;
00886     }
00887   }
00888   
00899   inline static int UTF8Rewind (const utf8_char* str, size_t maxRew)
00900   {
00901     if (maxRew < 1) return 0;
00902     
00903     const utf8_char* pos = str - 1;
00904     
00905     if ((*pos & 0x80) == 0)
00906     {
00907       return 1;
00908     }
00909     
00910     // Skip backward to the first byte of the sequence.
00911     int skip = 1;
00912     while (((*pos & 0xc0) == 0x80) && ((size_t)skip < maxRew))
00913     {
00914       skip++;
00915       pos--;
00916     }
00917     
00918     return skip;
00919   }
00920   
00926   inline static int UTF16Skip (const utf16_char* str, size_t maxSkip)
00927   {
00928     if (CS_UC_IS_HIGH_SURROGATE (*str))
00929       return (int)(MIN(maxSkip, 2));
00930     else
00931       return (int)(MIN(maxSkip, 1));
00932   }
00933   
00939   inline static int UTF16Rewind (const utf16_char* str, size_t maxRew)
00940   {
00941     if (maxRew < 1) return 0;
00942     
00943     const utf16_char* pos = str - 1;
00944     if (!CS_UC_IS_SURROGATE(*pos)) 
00945       return 1;
00946     else
00947     {
00948       if ((maxRew > 1) && (CS_UC_IS_HIGH_SURROGATE(*(pos - 1))))
00949         return 2;
00950       else
00951         return 1;
00952     }
00953   }
00954   
00960   inline static int UTF32Skip (const utf32_char* str, size_t maxSkip)
00961   {
00962     (void)str; // silence gcc
00963     return (int)(MIN(maxSkip, 1));
00964   }
00965 
00971   inline static int UTF32Rewind (const utf32_char* str, size_t maxRew)
00972   {
00973     (void)str; // silence gcc
00974     if (maxRew < 1) return 0;
00975     return 1;
00976   }
00991   static size_t MapToUpper (const utf32_char ch, utf32_char* dest, 
00992     size_t destSize, uint flags = 0);
00997   static size_t MapToLower (const utf32_char ch, utf32_char* dest, 
00998     size_t destSize, uint flags = 0);
01004   static size_t MapToFold (const utf32_char ch, utf32_char* dest, 
01005     size_t destSize, uint flags = 0);
01007 };
01008 
01011 #endif
01012
Generated for Crystal Space by doxygen 1.4.7
	Public API Reference
Public API Reference

csutil/csuctransform.h