csutil/csuctransform.h
Go to the documentation of this file.00001 /* 00002 Copyright (C) 2003 by Frank Richter 00003 00004 This library is free software; you can redistribute it and/or 00005 modify it under the terms of the GNU Library General Public 00006 License as published by the Free Software Foundation; either 00007 version 2 of the License, or (at your option) any later version. 00008 00009 This library is distributed in the hope that it will be useful, 00010 but WITHOUT ANY WARRANTY; without even the implied warranty of 00011 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00012 Library General Public License for more details. 00013 00014 You should have received a copy of the GNU Library General Public 00015 License along with this library; if not, write to the Free 00016 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 00017 */ 00018 00019 #ifndef __CS_CSUCTRANSFORM_H__ 00020 #define __CS_CSUCTRANSFORM_H__ 00021 00022 #include "csunicode.h" 00023 00035 #define CS_UC_MAX_UTF8_ENCODED 4 /* 6 to encode 32 bit */ 00036 00040 #define CS_UC_MAX_UTF16_ENCODED 2 00041 00045 #define CS_UC_MAX_UTF32_ENCODED 1 00046 #if (CS_WCHAR_T_SIZE == 1) 00047 #define CS_UC_MAX_WCHAR_T_ENCODED CS_UC_MAX_UTF8_ENCODED 00048 #elif (CS_WCHAR_T_SIZE == 2) 00049 00053 #define CS_UC_MAX_WCHAR_T_ENCODED CS_UC_MAX_UTF16_ENCODED 00054 #else 00055 #define CS_UC_MAX_WCHAR_T_ENCODED CS_UC_MAX_UTF32_ENCODED 00056 #endif 00057 00061 #define CS_UC_MAX_MAPPED 3 00062 00066 enum 00067 { 00073 csUcMapSimple = (1 << 0) 00074 }; 00075 00079 class CS_CRYSTALSPACE_EXPORT csUnicodeTransform 00080 { 00081 public: 00082 #define FAIL(ret) \ 00083 { \ 00084 if (isValid) *isValid = false; \ 00085 ch = CS_UC_CHAR_REPLACER; \ 00086 return ret; \ 00087 } 00088 00089 #define SUCCEED \ 00090 if (isValid) *isValid = true; \ 00091 return chUsed; 00092 00093 #define GET_NEXT(next) \ 00094 if ((size_t)chUsed == strlen) \ 00095 { \ 00096 FAIL(chUsed); \ 00097 } \ 00098 next = *str++; \ 00099 if (next == 0) \ 00100 { \ 00101 FAIL(chUsed); \ 00102 } \ 00103 chUsed++; 00104 00123 inline static int UTF8Decode (const utf8_char* str, size_t strlen, 00124 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00125 { 00126 if (str == 0) 00127 { 00128 FAIL(0); 00129 } 00130 int chUsed = 0; 00131 00132 utf8_char curCh; 00133 GET_NEXT(curCh); 00134 if ((curCh & 0x80) == 0) 00135 { 00136 // easy case 00137 ch = curCh; 00138 SUCCEED; 00139 } 00140 else 00141 { 00142 // Count with how many bytes this char is encoded. 00143 int n = 0; 00144 while ((n < 7) && ((curCh & (1 << (7 - n))) != 0)) { n++; } 00145 00146 if ((n < 2) || (n > 6)) 00147 { 00148 // Invalid code: first char of a "sequence" must have 00149 // at least two and at most six MSBs set 00150 FAIL(1); 00151 } 00152 00153 ch = (curCh & ((1 << (8 - n)) - 1)); 00154 00155 for (int i = 1; i < n; i++) 00156 { 00157 GET_NEXT(curCh); 00158 if ((curCh & 0xc0) != 0x80) 00159 { 00160 FAIL(chUsed); 00161 } 00162 else 00163 { 00164 ch <<= 6; 00165 ch |= (curCh & 0x3f); 00166 } 00167 } 00168 00169 // Check if in Unicode range. 00170 if (ch > CS_UC_LAST_CHAR) 00171 { 00172 FAIL(chUsed); 00173 } 00174 00175 // Check for "overlong" codes. 00176 if ((ch < 0x80) && (n > 0)) 00177 { 00178 FAIL(chUsed); 00179 } 00180 else if ((ch < 0x800) && (n > 2)) 00181 { 00182 FAIL(chUsed); 00183 } 00184 else if ((ch < 0x10000) && (n > 3)) 00185 { 00186 FAIL(chUsed); 00187 } 00188 else if ((ch < 0x200000) && (n > 4)) 00189 { 00190 FAIL(chUsed); 00191 } 00192 /* 00193 else if ((ch < 0x4000000) && (n > 5)) 00194 { 00195 FAIL(chUsed); 00196 } 00197 else if ((ch < 0x80000000) && (n > 6)) 00198 { 00199 FAIL(chUsed); 00200 } 00201 */ 00202 00203 if (!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 00204 || CS_UC_IS_SURROGATE(ch))) 00205 FAIL(chUsed); 00206 SUCCEED; 00207 } 00208 } 00209 00214 inline static int UTF16Decode (const utf16_char* str, size_t strlen, 00215 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00216 { 00217 if (str == 0) 00218 { 00219 FAIL(0); 00220 } 00221 int chUsed = 0; 00222 00223 utf16_char curCh; 00224 GET_NEXT(curCh); 00225 // Decode surrogate 00226 if (CS_UC_IS_SURROGATE (curCh)) 00227 { 00228 // Invalid code 00229 if (!CS_UC_IS_HIGH_SURROGATE (curCh)) 00230 { 00231 FAIL(chUsed); 00232 } 00233 ch = 0x10000 + ((curCh & 0x03ff) << 10); 00234 GET_NEXT(curCh); 00235 // Invalid code 00236 if (!CS_UC_IS_LOW_SURROGATE (curCh)) 00237 { 00238 // Fail with 1 so the char is handled upon the next Decode. 00239 FAIL(1); 00240 } 00241 ch |= (curCh & 0x3ff); 00242 } 00243 else 00244 { 00245 ch = curCh; 00246 } 00247 if (!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 00248 || CS_UC_IS_SURROGATE(ch))) 00249 FAIL(chUsed); 00250 SUCCEED; 00251 } 00252 00257 inline static int UTF32Decode (const utf32_char* str, size_t strlen, 00258 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00259 { 00260 if (str == 0) 00261 { 00262 FAIL(0); 00263 } 00264 int chUsed = 0; 00265 00266 GET_NEXT(ch); 00267 if ((!returnNonChar && (CS_UC_IS_NONCHARACTER(ch) 00268 || CS_UC_IS_SURROGATE(ch))) || (ch > CS_UC_LAST_CHAR)) 00269 FAIL(chUsed); 00270 SUCCEED; 00271 } 00272 00277 inline static int Decode (const utf8_char* str, size_t strlen, 00278 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00279 { 00280 return UTF8Decode (str, strlen, ch, isValid, returnNonChar); 00281 } 00286 inline static int Decode (const utf16_char* str, size_t strlen, 00287 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00288 { 00289 return UTF16Decode (str, strlen, ch, isValid, returnNonChar); 00290 } 00295 inline static int Decode (const utf32_char* str, size_t strlen, 00296 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00297 { 00298 return UTF32Decode (str, strlen, ch, isValid, returnNonChar); 00299 } 00300 00302 #undef FAIL 00303 #undef SUCCEED 00304 #undef GET_NEXT 00305 00308 #define _OUTPUT_CHAR(buf, chr) \ 00309 if (bufRemaining > 0) \ 00310 { \ 00311 if(buf) *buf++ = chr; \ 00312 bufRemaining--; \ 00313 } \ 00314 encodedLen++; 00315 00316 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(buf, chr) 00317 00331 inline static int EncodeUTF8 (const utf32_char ch, utf8_char* buf, 00332 size_t bufsize, bool allowNonchars = false) 00333 { 00334 if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 00335 || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR)) 00336 return 0; 00337 size_t bufRemaining = bufsize; 00338 int encodedLen = 0; 00339 00340 if (ch < 0x80) 00341 { 00342 OUTPUT_CHAR ((utf8_char)ch); 00343 } 00344 else if (ch < 0x800) 00345 { 00346 OUTPUT_CHAR ((utf8_char)(0xc0 | (ch >> 6))); 00347 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00348 } 00349 else if (ch < 0x10000) 00350 { 00351 OUTPUT_CHAR ((utf8_char)(0xe0 | (ch >> 12))); 00352 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00353 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00354 } 00355 else if (ch < 0x200000) 00356 { 00357 OUTPUT_CHAR ((utf8_char)(0xf0 | (ch >> 18))); 00358 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00359 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00360 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00361 } 00362 /* 00363 else if (ch < 0x4000000) 00364 { 00365 OUTPUT_CHAR ((utf8_char)(0xf8 | (ch >> 24))); 00366 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f))); 00367 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00368 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00369 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00370 } 00371 else if (ch < 0x80000000) 00372 { 00373 OUTPUT_CHAR ((utf8_char)(0xfc | (ch >> 30))); 00374 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 24) & 0x3f))); 00375 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 18) & 0x3f))); 00376 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 12) & 0x3f))); 00377 OUTPUT_CHAR ((utf8_char)(0x80 | ((ch >> 6) & 0x3f))); 00378 OUTPUT_CHAR ((utf8_char)(0x80 | (ch & 0x3f))); 00379 } 00380 */ 00381 return encodedLen; 00382 } 00383 00388 inline static int EncodeUTF16 (const utf32_char ch, utf16_char* buf, 00389 size_t bufsize, bool allowNonchars = false) 00390 { 00391 if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 00392 || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR)) 00393 return 0; 00394 size_t bufRemaining = bufsize; 00395 int encodedLen = 0; 00396 00397 if (ch < 0x10000) 00398 { 00399 OUTPUT_CHAR((utf16_char)ch); 00400 } 00401 else if (ch < 0x100000) 00402 { 00403 utf32_char ch_shifted = ch - 0x10000; 00404 OUTPUT_CHAR((utf16_char)((ch_shifted >> 10) 00405 | CS_UC_CHAR_HIGH_SURROGATE_FIRST)); 00406 OUTPUT_CHAR((utf16_char)((ch_shifted & 0x3ff) 00407 | CS_UC_CHAR_LOW_SURROGATE_FIRST)); 00408 } 00409 else 00410 return 0; 00411 00412 return encodedLen; 00413 } 00414 00419 inline static int EncodeUTF32 (const utf32_char ch, utf32_char* buf, 00420 size_t bufsize, bool allowNonchars = false) 00421 { 00422 if ((!allowNonchars && ((CS_UC_IS_NONCHARACTER(ch)) 00423 || (CS_UC_IS_SURROGATE(ch)))) || (ch > CS_UC_LAST_CHAR)) 00424 return 0; 00425 size_t bufRemaining = bufsize; 00426 int encodedLen = 0; 00427 00428 OUTPUT_CHAR(ch); 00429 00430 return encodedLen; 00431 } 00432 00437 inline static int Encode (const utf32_char ch, utf8_char* buf, 00438 size_t bufsize, bool allowNonchars = false) 00439 { 00440 return EncodeUTF8 (ch, buf, bufsize, allowNonchars); 00441 } 00446 inline static int Encode (const utf32_char ch, utf16_char* buf, 00447 size_t bufsize, bool allowNonchars = false) 00448 { 00449 return EncodeUTF16 (ch, buf, bufsize, allowNonchars); 00450 } 00455 inline static int Encode (const utf32_char ch, utf32_char* buf, 00456 size_t bufsize, bool allowNonchars = false) 00457 { 00458 return EncodeUTF32 (ch, buf, bufsize, allowNonchars); 00459 } 00461 #undef OUTPUT_CHAR 00462 00465 #define OUTPUT_CHAR(chr) _OUTPUT_CHAR(dest, chr) 00466 00467 #define UCTF_CONVERTER(funcName, fromType, decoder, toType, encoder) \ 00468 inline static size_t funcName (toType* dest, size_t destSize, \ 00469 const fromType* source, size_t srcSize = (size_t)-1) \ 00470 { \ 00471 if ((srcSize == 0) || (source == 0)) \ 00472 return 0; \ 00473 \ 00474 size_t bufRemaining = (destSize > 0) ? destSize - 1 : 0; \ 00475 size_t encodedLen = 0; \ 00476 \ 00477 size_t srcChars = srcSize; \ 00478 \ 00479 if (srcSize == (size_t)-1) \ 00480 { \ 00481 srcChars = 0; \ 00482 const fromType* sptr = source; \ 00483 while (*sptr++ != 0) srcChars++; \ 00484 } \ 00485 \ 00486 while (srcChars > 0) \ 00487 { \ 00488 utf32_char ch; \ 00489 int scnt = decoder (source, srcChars, ch, 0); \ 00490 if (scnt == 0) break; \ 00491 int dcnt = encoder (ch, dest, bufRemaining); \ 00492 if (dcnt == 0) \ 00493 { \ 00494 dcnt = encoder (CS_UC_CHAR_REPLACER, dest, bufRemaining); \ 00495 } \ 00496 \ 00497 if ((size_t)dcnt >= bufRemaining) \ 00498 { \ 00499 if (dest && (destSize > 0)) dest += bufRemaining; \ 00500 bufRemaining = 0; \ 00501 } \ 00502 else \ 00503 { \ 00504 bufRemaining -= dcnt; \ 00505 if (dest && (destSize > 0)) dest += dcnt; \ 00506 } \ 00507 encodedLen += dcnt; \ 00508 if ((size_t)scnt >= srcChars) break; \ 00509 srcChars -= scnt; \ 00510 source += scnt; \ 00511 } \ 00512 \ 00513 if (dest) *dest = 0; \ 00514 \ 00515 return encodedLen + 1; \ 00516 } 00517 00533 UCTF_CONVERTER (UTF8to16, utf8_char, UTF8Decode, utf16_char, EncodeUTF16); 00538 UCTF_CONVERTER (UTF8to32, utf8_char, UTF8Decode, utf32_char, EncodeUTF32); 00539 00544 UCTF_CONVERTER (UTF16to8, utf16_char, UTF16Decode, utf8_char, EncodeUTF8); 00549 UCTF_CONVERTER (UTF16to32, utf16_char, UTF16Decode, utf32_char, EncodeUTF32); 00550 00555 UCTF_CONVERTER (UTF32to8, utf32_char, UTF32Decode, utf8_char, EncodeUTF8); 00560 UCTF_CONVERTER (UTF32to16, utf32_char, UTF32Decode, utf16_char, EncodeUTF16); 00563 #undef UCTF_CONVERTER 00564 #undef OUTPUT_CHAR 00565 #undef _OUTPUT_CHAR 00566 00567 #if (CS_WCHAR_T_SIZE == 1) 00568 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00569 const utf8_char* source, size_t srcSize) 00570 { 00571 size_t srcChars = srcSize; 00572 if (srcSize == (size_t)-1) 00573 { 00574 srcChars = 0; 00575 const utf8_char* sptr = source; 00576 while (*sptr++ != 0) srcChars++; 00577 } 00578 if ((dest != 0) && (destSize != 0)) 00579 { 00580 size_t len = MIN (destSize - 1, srcChars); 00581 memcpy (dest, source, size * sizeof (wchar_t)); 00582 *(dest + len) = 0; 00583 } 00584 return srcChars + 1; 00585 }; 00586 00587 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00588 const utf16_char* source, size_t srcSize) 00589 { 00590 return UTF16to8 ((utf8_char*)dest, destSize, source, srcSize); 00591 }; 00592 00593 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00594 const utf32_char* source, size_t srcSize) 00595 { 00596 return UTF32to8 ((utf8_char*)dest, destSize, source, srcSize); 00597 }; 00598 00599 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00600 const wchar_t* source, size_t srcSize) 00601 { 00602 size_t srcChars = srcSize; 00603 if (srcSize == (size_t)-1) 00604 { 00605 srcChars = 0; 00606 const wchar_t* sptr = source; 00607 while (*sptr++ != 0) srcChars++; 00608 } 00609 if ((dest != 0) && (destSize != 0)) 00610 { 00611 size_t len = MIN (destSize - 1, srcChars); 00612 memcpy (dest, source, len * sizeof (wchar_t)); 00613 *(dest + len) = 0; 00614 } 00615 return srcChars + 1; 00616 }; 00617 00618 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00619 const wchar_t* source, size_t srcSize) 00620 { 00621 return UTF8to16 (dest, destSize, source, srcSize); 00622 }; 00623 00624 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00625 const wchar_t* source, size_t srcSize) 00626 { 00627 return UTF8to32 (dest, destSize, source, srcSize); 00628 }; 00629 00630 inline static int Decode (const wchar_t* str, size_t strlen, 00631 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00632 { 00633 return UTF8Decode ((utf8_char*)str, strlen, ch, isValid, returnNonChar); 00634 } 00635 inline static int Encode (const utf32_char ch, wchar_t* buf, 00636 size_t bufsize, bool allowNonchars = false) 00637 { 00638 return EncodeUTF8 (ch, (utf8_char*)buf, bufsize, allowNonchars); 00639 } 00640 #elif (CS_WCHAR_T_SIZE == 2) 00641 // Methods below for doxygen documentation are here as the size '2' is 00642 // default. 00643 00650 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00651 const utf8_char* source, size_t srcSize) 00652 { 00653 return UTF8to16 ((utf16_char*)dest, destSize, source, srcSize); 00654 }; 00655 00660 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00661 const utf16_char* source, size_t srcSize) 00662 { 00663 size_t srcChars = srcSize; 00664 if (srcSize == (size_t)-1) 00665 { 00666 srcChars = 0; 00667 const utf16_char* sptr = source; 00668 while (*sptr++ != 0) srcChars++; 00669 } 00670 if ((dest != 0) && (destSize != 0)) 00671 { 00672 size_t len = MIN (destSize - 1, srcChars); 00673 memcpy (dest, source, len * sizeof (wchar_t)); 00674 *(dest + len) = 0; 00675 } 00676 return srcChars + 1; 00677 }; 00678 00683 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00684 const utf32_char* source, size_t srcSize) 00685 { 00686 return UTF32to16 ((utf16_char*)dest, destSize, source, srcSize); 00687 }; 00688 00693 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00694 const wchar_t* source, size_t srcSize) 00695 { 00696 return UTF16to8 (dest, destSize, (utf16_char*)source, srcSize); 00697 }; 00698 00703 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00704 const wchar_t* source, size_t srcSize) 00705 { 00706 size_t srcChars = srcSize; 00707 if (srcSize == (size_t)-1) 00708 { 00709 srcChars = 0; 00710 const wchar_t* sptr = source; 00711 while (*sptr++ != 0) srcChars++; 00712 } 00713 if ((dest != 0) && (destSize != 0)) 00714 { 00715 size_t len = MIN (destSize - 1, srcChars); 00716 memcpy (dest, source, len * sizeof (wchar_t)); 00717 *(dest + len) = 0; 00718 } 00719 return srcChars + 1; 00720 }; 00721 00726 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00727 const wchar_t* source, size_t srcSize) 00728 { 00729 return UTF16to32 (dest, destSize, (utf16_char*)source, srcSize); 00730 }; 00731 00732 /* Decode()/Encode() overloads for wchar_t. 00733 * - On VC7+, wchar_t may be an unsigned short or the special type __wchar_t. 00734 * - On VC6 wchar_t is always an unsigned short. __wchar_t does not exist. 00735 * Now there may be conflicts with the utf16_char overloads if wchar_t is 00736 * an unsigned short. On the other hand, we would like to support VC7+'s 00737 * built-in wchar_t as well. 00738 * So: on VC7+, provide overloads for __wchar_t, on VC6, don't compile this 00739 * code at all, on other compilers, provide overloads for wchar_t instead 00740 * (by re#definining __wchar_t). 00741 */ 00742 #if !defined(CS_COMPILER_MSVC) || (_MSC_VER > 1300) 00743 #if !defined(CS_COMPILER_MSVC) 00744 #define __wchar_t wchar_t 00745 #endif 00746 00750 inline static int Decode (const __wchar_t* str, size_t strlen, 00751 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00752 { 00753 return UTF16Decode ((utf16_char*)str, strlen, ch, isValid, returnNonChar); 00754 } 00759 inline static int Encode (const utf32_char ch, __wchar_t* buf, 00760 size_t bufsize, bool allowNonchars = false) 00761 { 00762 return EncodeUTF16 (ch, (utf16_char*)buf, bufsize, allowNonchars); 00763 } 00764 #ifdef __wchar_t 00765 #undef __wchar_t 00766 #endif 00767 #endif 00768 00769 #elif (CS_WCHAR_T_SIZE == 4) 00770 inline static size_t UTF8toWC (wchar_t* dest, size_t destSize, 00771 const utf8_char* source, size_t srcSize) 00772 { 00773 return UTF8to32 ((utf32_char*)dest, destSize, source, srcSize); 00774 }; 00775 00776 inline static size_t UTF16toWC (wchar_t* dest, size_t destSize, 00777 const utf16_char* source, size_t srcSize) 00778 { 00779 return UTF16to32 ((utf32_char*)dest, destSize, source, srcSize); 00780 }; 00781 00782 inline static size_t UTF32toWC (wchar_t* dest, size_t destSize, 00783 const utf32_char* source, size_t srcSize) 00784 { 00785 size_t srcChars = srcSize; 00786 if (srcSize == (size_t)-1) 00787 { 00788 srcChars = 0; 00789 const utf32_char* sptr = source; 00790 while (*sptr++ != 0) srcChars++; 00791 } 00792 if ((dest != 0) && (destSize != 0)) 00793 { 00794 size_t len = MIN (destSize - 1, srcChars); 00795 memcpy (dest, source, len * sizeof (wchar_t)); 00796 *(dest + len) = 0; 00797 } 00798 return srcChars + 1; 00799 }; 00800 00801 inline static size_t WCtoUTF8 (utf8_char* dest, size_t destSize, 00802 const wchar_t* source, size_t srcSize) 00803 { 00804 return UTF32to8 (dest, destSize, (utf32_char*)source, srcSize); 00805 }; 00806 00807 inline static size_t WCtoUTF16 (utf16_char* dest, size_t destSize, 00808 const wchar_t* source, size_t srcSize) 00809 { 00810 return UTF32to16 (dest, destSize, (utf32_char*)source, srcSize); 00811 }; 00812 00813 inline static size_t WCtoUTF32 (utf32_char* dest, size_t destSize, 00814 const wchar_t* source, size_t srcSize) 00815 { 00816 size_t srcChars = srcSize; 00817 if (srcSize == (size_t)-1) 00818 { 00819 srcChars = 0; 00820 const wchar_t* sptr = source; 00821 while (*sptr++ != 0) srcChars++; 00822 } 00823 if ((dest != 0) && (destSize != 0)) 00824 { 00825 size_t len = MIN (destSize - 1, srcChars); 00826 memcpy (dest, source, len * sizeof (wchar_t)); 00827 *(dest + len) = 0; 00828 } 00829 return srcChars + 1; 00830 }; 00831 00832 inline static int Decode (const wchar_t* str, size_t strlen, 00833 utf32_char& ch, bool* isValid = 0, bool returnNonChar = false) 00834 { 00835 return UTF32Decode ((utf32_char*)str, strlen, ch, isValid, returnNonChar); 00836 } 00837 inline static int Encode (const utf32_char ch, wchar_t* buf, 00838 size_t bufsize, bool allowNonchars = false) 00839 { 00840 return EncodeUTF32 (ch, (utf32_char*)buf, bufsize, allowNonchars); 00841 } 00842 #else 00843 #error Odd-sized, unsupported wchar_t! 00844 #endif 00845 00858 inline static int UTF8Skip (const utf8_char* str, size_t maxSkip) 00859 { 00860 if (maxSkip < 1) return 0; 00861 00862 if ((*str & 0x80) == 0) 00863 { 00864 return 1; 00865 } 00866 else 00867 { 00868 int n = 0; 00869 while ((n < 7) && ((*str & (1 << (7 - n))) != 0)) { n++; } 00870 00871 if ((n < 2) || (n > 6)) 00872 { 00873 return 1; 00874 } 00875 00876 int skip = 1; 00877 00878 for (; skip < n; skip++) 00879 { 00880 if (((str[skip] & 0xc0) != 0x80) || ((size_t)skip > maxSkip)) 00881 { 00882 break; 00883 } 00884 } 00885 return skip; 00886 } 00887 } 00888 00899 inline static int UTF8Rewind (const utf8_char* str, size_t maxRew) 00900 { 00901 if (maxRew < 1) return 0; 00902 00903 const utf8_char* pos = str - 1; 00904 00905 if ((*pos & 0x80) == 0) 00906 { 00907 return 1; 00908 } 00909 00910 // Skip backward to the first byte of the sequence. 00911 int skip = 1; 00912 while (((*pos & 0xc0) == 0x80) && ((size_t)skip < maxRew)) 00913 { 00914 skip++; 00915 pos--; 00916 } 00917 00918 return skip; 00919 } 00920 00926 inline static int UTF16Skip (const utf16_char* str, size_t maxSkip) 00927 { 00928 if (CS_UC_IS_HIGH_SURROGATE (*str)) 00929 return (int)(MIN(maxSkip, 2)); 00930 else 00931 return (int)(MIN(maxSkip, 1)); 00932 } 00933 00939 inline static int UTF16Rewind (const utf16_char* str, size_t maxRew) 00940 { 00941 if (maxRew < 1) return 0; 00942 00943 const utf16_char* pos = str - 1; 00944 if (!CS_UC_IS_SURROGATE(*pos)) 00945 return 1; 00946 else 00947 { 00948 if ((maxRew > 1) && (CS_UC_IS_HIGH_SURROGATE(*(pos - 1)))) 00949 return 2; 00950 else 00951 return 1; 00952 } 00953 } 00954 00960 inline static int UTF32Skip (const utf32_char* str, size_t maxSkip) 00961 { 00962 (void)str; // silence gcc 00963 return (int)(MIN(maxSkip, 1)); 00964 } 00965 00971 inline static int UTF32Rewind (const utf32_char* str, size_t maxRew) 00972 { 00973 (void)str; // silence gcc 00974 if (maxRew < 1) return 0; 00975 return 1; 00976 } 00991 static size_t MapToUpper (const utf32_char ch, utf32_char* dest, 00992 size_t destSize, uint flags = 0); 00997 static size_t MapToLower (const utf32_char ch, utf32_char* dest, 00998 size_t destSize, uint flags = 0); 01004 static size_t MapToFold (const utf32_char ch, utf32_char* dest, 01005 size_t destSize, uint flags = 0); 01007 }; 01008 01011 #endif 01012
Generated for Crystal Space by doxygen 1.4.7