LLVM API Documentation
00001 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== 00002 * 00003 * The LLVM Compiler Infrastructure 00004 * 00005 * This file is distributed under the University of Illinois Open Source 00006 * License. See LICENSE.TXT for details. 00007 * 00008 *===------------------------------------------------------------------------=*/ 00009 /* 00010 * Copyright 2001-2004 Unicode, Inc. 00011 * 00012 * Disclaimer 00013 * 00014 * This source code is provided as is by Unicode, Inc. No claims are 00015 * made as to fitness for any particular purpose. No warranties of any 00016 * kind are expressed or implied. The recipient agrees to determine 00017 * applicability of information provided. If this file has been 00018 * purchased on magnetic or optical media from Unicode, Inc., the 00019 * sole remedy for any claim will be exchange of defective media 00020 * within 90 days of receipt. 00021 * 00022 * Limitations on Rights to Redistribute This Code 00023 * 00024 * Unicode, Inc. hereby grants the right to freely use the information 00025 * supplied in this file in the creation of products supporting the 00026 * Unicode Standard, and to make copies of this file in any form 00027 * for internal or external distribution as long as this notice 00028 * remains attached. 00029 */ 00030 00031 /* --------------------------------------------------------------------- 00032 00033 Conversions between UTF32, UTF-16, and UTF-8. Source code file. 00034 Author: Mark E. Davis, 1994. 00035 Rev History: Rick McGowan, fixes & updates May 2001. 00036 Sept 2001: fixed const & error conditions per 00037 mods suggested by S. Parent & A. Lillich. 00038 June 2002: Tim Dodd added detection and handling of incomplete 00039 source sequences, enhanced error detection, added casts 00040 to eliminate compiler warnings. 00041 July 2003: slight mods to back out aggressive FFFE detection. 00042 Jan 2004: updated switches in from-UTF8 conversions. 00043 Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. 00044 00045 See the header file "ConvertUTF.h" for complete documentation. 00046 00047 ------------------------------------------------------------------------ */ 00048 00049 00050 #include "llvm/Support/ConvertUTF.h" 00051 #ifdef CVTUTF_DEBUG 00052 #include <stdio.h> 00053 #endif 00054 #include <assert.h> 00055 00056 static const int halfShift = 10; /* used for shifting by 10 bits */ 00057 00058 static const UTF32 halfBase = 0x0010000UL; 00059 static const UTF32 halfMask = 0x3FFUL; 00060 00061 #define UNI_SUR_HIGH_START (UTF32)0xD800 00062 #define UNI_SUR_HIGH_END (UTF32)0xDBFF 00063 #define UNI_SUR_LOW_START (UTF32)0xDC00 00064 #define UNI_SUR_LOW_END (UTF32)0xDFFF 00065 #define false 0 00066 #define true 1 00067 00068 /* --------------------------------------------------------------------- */ 00069 00070 /* 00071 * Index into the table below with the first byte of a UTF-8 sequence to 00072 * get the number of trailing bytes that are supposed to follow it. 00073 * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 00074 * left as-is for anyone who may want to do such conversion, which was 00075 * allowed in earlier algorithms. 00076 */ 00077 static const char trailingBytesForUTF8[256] = { 00078 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00079 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00080 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00081 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00082 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00083 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 00084 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00085 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 00086 }; 00087 00088 /* 00089 * Magic values subtracted from a buffer value during UTF8 conversion. 00090 * This table contains as many values as there might be trailing bytes 00091 * in a UTF-8 sequence. 00092 */ 00093 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 00094 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 00095 00096 /* 00097 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 00098 * into the first byte, depending on how many bytes follow. There are 00099 * as many entries in this table as there are UTF-8 sequence types. 00100 * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 00101 * for *legal* UTF-8 will be 4 or fewer bytes total. 00102 */ 00103 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 00104 00105 /* --------------------------------------------------------------------- */ 00106 00107 /* The interface converts a whole buffer to avoid function-call overhead. 00108 * Constants have been gathered. Loops & conditionals have been removed as 00109 * much as possible for efficiency, in favor of drop-through switches. 00110 * (See "Note A" at the bottom of the file for equivalent code.) 00111 * If your compiler supports it, the "isLegalUTF8" call can be turned 00112 * into an inline function. 00113 */ 00114 00115 00116 /* --------------------------------------------------------------------- */ 00117 00118 ConversionResult ConvertUTF32toUTF16 ( 00119 const UTF32** sourceStart, const UTF32* sourceEnd, 00120 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 00121 ConversionResult result = conversionOK; 00122 const UTF32* source = *sourceStart; 00123 UTF16* target = *targetStart; 00124 while (source < sourceEnd) { 00125 UTF32 ch; 00126 if (target >= targetEnd) { 00127 result = targetExhausted; break; 00128 } 00129 ch = *source++; 00130 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 00131 /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 00132 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 00133 if (flags == strictConversion) { 00134 --source; /* return to the illegal value itself */ 00135 result = sourceIllegal; 00136 break; 00137 } else { 00138 *target++ = UNI_REPLACEMENT_CHAR; 00139 } 00140 } else { 00141 *target++ = (UTF16)ch; /* normal case */ 00142 } 00143 } else if (ch > UNI_MAX_LEGAL_UTF32) { 00144 if (flags == strictConversion) { 00145 result = sourceIllegal; 00146 } else { 00147 *target++ = UNI_REPLACEMENT_CHAR; 00148 } 00149 } else { 00150 /* target is a character in range 0xFFFF - 0x10FFFF. */ 00151 if (target + 1 >= targetEnd) { 00152 --source; /* Back up source pointer! */ 00153 result = targetExhausted; break; 00154 } 00155 ch -= halfBase; 00156 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 00157 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 00158 } 00159 } 00160 *sourceStart = source; 00161 *targetStart = target; 00162 return result; 00163 } 00164 00165 /* --------------------------------------------------------------------- */ 00166 00167 ConversionResult ConvertUTF16toUTF32 ( 00168 const UTF16** sourceStart, const UTF16* sourceEnd, 00169 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 00170 ConversionResult result = conversionOK; 00171 const UTF16* source = *sourceStart; 00172 UTF32* target = *targetStart; 00173 UTF32 ch, ch2; 00174 while (source < sourceEnd) { 00175 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 00176 ch = *source++; 00177 /* If we have a surrogate pair, convert to UTF32 first. */ 00178 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 00179 /* If the 16 bits following the high surrogate are in the source buffer... */ 00180 if (source < sourceEnd) { 00181 ch2 = *source; 00182 /* If it's a low surrogate, convert to UTF32. */ 00183 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 00184 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 00185 + (ch2 - UNI_SUR_LOW_START) + halfBase; 00186 ++source; 00187 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 00188 --source; /* return to the illegal value itself */ 00189 result = sourceIllegal; 00190 break; 00191 } 00192 } else { /* We don't have the 16 bits following the high surrogate. */ 00193 --source; /* return to the high surrogate */ 00194 result = sourceExhausted; 00195 break; 00196 } 00197 } else if (flags == strictConversion) { 00198 /* UTF-16 surrogate values are illegal in UTF-32 */ 00199 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 00200 --source; /* return to the illegal value itself */ 00201 result = sourceIllegal; 00202 break; 00203 } 00204 } 00205 if (target >= targetEnd) { 00206 source = oldSource; /* Back up source pointer! */ 00207 result = targetExhausted; break; 00208 } 00209 *target++ = ch; 00210 } 00211 *sourceStart = source; 00212 *targetStart = target; 00213 #ifdef CVTUTF_DEBUG 00214 if (result == sourceIllegal) { 00215 fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 00216 fflush(stderr); 00217 } 00218 #endif 00219 return result; 00220 } 00221 ConversionResult ConvertUTF16toUTF8 ( 00222 const UTF16** sourceStart, const UTF16* sourceEnd, 00223 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 00224 ConversionResult result = conversionOK; 00225 const UTF16* source = *sourceStart; 00226 UTF8* target = *targetStart; 00227 while (source < sourceEnd) { 00228 UTF32 ch; 00229 unsigned short bytesToWrite = 0; 00230 const UTF32 byteMask = 0xBF; 00231 const UTF32 byteMark = 0x80; 00232 const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 00233 ch = *source++; 00234 /* If we have a surrogate pair, convert to UTF32 first. */ 00235 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 00236 /* If the 16 bits following the high surrogate are in the source buffer... */ 00237 if (source < sourceEnd) { 00238 UTF32 ch2 = *source; 00239 /* If it's a low surrogate, convert to UTF32. */ 00240 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 00241 ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 00242 + (ch2 - UNI_SUR_LOW_START) + halfBase; 00243 ++source; 00244 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 00245 --source; /* return to the illegal value itself */ 00246 result = sourceIllegal; 00247 break; 00248 } 00249 } else { /* We don't have the 16 bits following the high surrogate. */ 00250 --source; /* return to the high surrogate */ 00251 result = sourceExhausted; 00252 break; 00253 } 00254 } else if (flags == strictConversion) { 00255 /* UTF-16 surrogate values are illegal in UTF-32 */ 00256 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 00257 --source; /* return to the illegal value itself */ 00258 result = sourceIllegal; 00259 break; 00260 } 00261 } 00262 /* Figure out how many bytes the result will require */ 00263 if (ch < (UTF32)0x80) { bytesToWrite = 1; 00264 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 00265 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 00266 } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 00267 } else { bytesToWrite = 3; 00268 ch = UNI_REPLACEMENT_CHAR; 00269 } 00270 00271 target += bytesToWrite; 00272 if (target > targetEnd) { 00273 source = oldSource; /* Back up source pointer! */ 00274 target -= bytesToWrite; result = targetExhausted; break; 00275 } 00276 switch (bytesToWrite) { /* note: everything falls through. */ 00277 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 00278 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 00279 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 00280 case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 00281 } 00282 target += bytesToWrite; 00283 } 00284 *sourceStart = source; 00285 *targetStart = target; 00286 return result; 00287 } 00288 00289 /* --------------------------------------------------------------------- */ 00290 00291 ConversionResult ConvertUTF32toUTF8 ( 00292 const UTF32** sourceStart, const UTF32* sourceEnd, 00293 UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 00294 ConversionResult result = conversionOK; 00295 const UTF32* source = *sourceStart; 00296 UTF8* target = *targetStart; 00297 while (source < sourceEnd) { 00298 UTF32 ch; 00299 unsigned short bytesToWrite = 0; 00300 const UTF32 byteMask = 0xBF; 00301 const UTF32 byteMark = 0x80; 00302 ch = *source++; 00303 if (flags == strictConversion ) { 00304 /* UTF-16 surrogate values are illegal in UTF-32 */ 00305 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 00306 --source; /* return to the illegal value itself */ 00307 result = sourceIllegal; 00308 break; 00309 } 00310 } 00311 /* 00312 * Figure out how many bytes the result will require. Turn any 00313 * illegally large UTF32 things (> Plane 17) into replacement chars. 00314 */ 00315 if (ch < (UTF32)0x80) { bytesToWrite = 1; 00316 } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 00317 } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 00318 } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 00319 } else { bytesToWrite = 3; 00320 ch = UNI_REPLACEMENT_CHAR; 00321 result = sourceIllegal; 00322 } 00323 00324 target += bytesToWrite; 00325 if (target > targetEnd) { 00326 --source; /* Back up source pointer! */ 00327 target -= bytesToWrite; result = targetExhausted; break; 00328 } 00329 switch (bytesToWrite) { /* note: everything falls through. */ 00330 case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 00331 case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 00332 case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 00333 case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 00334 } 00335 target += bytesToWrite; 00336 } 00337 *sourceStart = source; 00338 *targetStart = target; 00339 return result; 00340 } 00341 00342 /* --------------------------------------------------------------------- */ 00343 00344 /* 00345 * Utility routine to tell whether a sequence of bytes is legal UTF-8. 00346 * This must be called with the length pre-determined by the first byte. 00347 * If not calling this from ConvertUTF8to*, then the length can be set by: 00348 * length = trailingBytesForUTF8[*source]+1; 00349 * and the sequence is illegal right away if there aren't that many bytes 00350 * available. 00351 * If presented with a length > 4, this returns false. The Unicode 00352 * definition of UTF-8 goes up to 4-byte sequences. 00353 */ 00354 00355 static Boolean isLegalUTF8(const UTF8 *source, int length) { 00356 UTF8 a; 00357 const UTF8 *srcptr = source+length; 00358 switch (length) { 00359 default: return false; 00360 /* Everything else falls through when "true"... */ 00361 case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 00362 case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 00363 case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 00364 00365 switch (*source) { 00366 /* no fall-through in this inner switch */ 00367 case 0xE0: if (a < 0xA0) return false; break; 00368 case 0xED: if (a > 0x9F) return false; break; 00369 case 0xF0: if (a < 0x90) return false; break; 00370 case 0xF4: if (a > 0x8F) return false; break; 00371 default: if (a < 0x80) return false; 00372 } 00373 00374 case 1: if (*source >= 0x80 && *source < 0xC2) return false; 00375 } 00376 if (*source > 0xF4) return false; 00377 return true; 00378 } 00379 00380 /* --------------------------------------------------------------------- */ 00381 00382 /* 00383 * Exported function to return whether a UTF-8 sequence is legal or not. 00384 * This is not used here; it's just exported. 00385 */ 00386 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 00387 int length = trailingBytesForUTF8[*source]+1; 00388 if (length > sourceEnd - source) { 00389 return false; 00390 } 00391 return isLegalUTF8(source, length); 00392 } 00393 00394 /* --------------------------------------------------------------------- */ 00395 00396 static unsigned 00397 findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, 00398 const UTF8 *sourceEnd) { 00399 UTF8 b1, b2, b3; 00400 00401 assert(!isLegalUTF8Sequence(source, sourceEnd)); 00402 00403 /* 00404 * Unicode 6.3.0, D93b: 00405 * 00406 * Maximal subpart of an ill-formed subsequence: The longest code unit 00407 * subsequence starting at an unconvertible offset that is either: 00408 * a. the initial subsequence of a well-formed code unit sequence, or 00409 * b. a subsequence of length one. 00410 */ 00411 00412 if (source == sourceEnd) 00413 return 0; 00414 00415 /* 00416 * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 00417 * Byte Sequences. 00418 */ 00419 00420 b1 = *source; 00421 ++source; 00422 if (b1 >= 0xC2 && b1 <= 0xDF) { 00423 /* 00424 * First byte is valid, but we know that this code unit sequence is 00425 * invalid, so the maximal subpart has to end after the first byte. 00426 */ 00427 return 1; 00428 } 00429 00430 if (source == sourceEnd) 00431 return 1; 00432 00433 b2 = *source; 00434 ++source; 00435 00436 if (b1 == 0xE0) { 00437 return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; 00438 } 00439 if (b1 >= 0xE1 && b1 <= 0xEC) { 00440 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 00441 } 00442 if (b1 == 0xED) { 00443 return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; 00444 } 00445 if (b1 >= 0xEE && b1 <= 0xEF) { 00446 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 00447 } 00448 if (b1 == 0xF0) { 00449 if (b2 >= 0x90 && b2 <= 0xBF) { 00450 if (source == sourceEnd) 00451 return 2; 00452 00453 b3 = *source; 00454 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 00455 } 00456 return 1; 00457 } 00458 if (b1 >= 0xF1 && b1 <= 0xF3) { 00459 if (b2 >= 0x80 && b2 <= 0xBF) { 00460 if (source == sourceEnd) 00461 return 2; 00462 00463 b3 = *source; 00464 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 00465 } 00466 return 1; 00467 } 00468 if (b1 == 0xF4) { 00469 if (b2 >= 0x80 && b2 <= 0x8F) { 00470 if (source == sourceEnd) 00471 return 2; 00472 00473 b3 = *source; 00474 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 00475 } 00476 return 1; 00477 } 00478 00479 assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); 00480 /* 00481 * There are no valid sequences that start with these bytes. Maximal subpart 00482 * is defined to have length 1 in these cases. 00483 */ 00484 return 1; 00485 } 00486 00487 /* --------------------------------------------------------------------- */ 00488 00489 /* 00490 * Exported function to return the total number of bytes in a codepoint 00491 * represented in UTF-8, given the value of the first byte. 00492 */ 00493 unsigned getNumBytesForUTF8(UTF8 first) { 00494 return trailingBytesForUTF8[first] + 1; 00495 } 00496 00497 /* --------------------------------------------------------------------- */ 00498 00499 /* 00500 * Exported function to return whether a UTF-8 string is legal or not. 00501 * This is not used here; it's just exported. 00502 */ 00503 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { 00504 while (*source != sourceEnd) { 00505 int length = trailingBytesForUTF8[**source] + 1; 00506 if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) 00507 return false; 00508 *source += length; 00509 } 00510 return true; 00511 } 00512 00513 /* --------------------------------------------------------------------- */ 00514 00515 ConversionResult ConvertUTF8toUTF16 ( 00516 const UTF8** sourceStart, const UTF8* sourceEnd, 00517 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 00518 ConversionResult result = conversionOK; 00519 const UTF8* source = *sourceStart; 00520 UTF16* target = *targetStart; 00521 while (source < sourceEnd) { 00522 UTF32 ch = 0; 00523 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 00524 if (extraBytesToRead >= sourceEnd - source) { 00525 result = sourceExhausted; break; 00526 } 00527 /* Do this check whether lenient or strict */ 00528 if (!isLegalUTF8(source, extraBytesToRead+1)) { 00529 result = sourceIllegal; 00530 break; 00531 } 00532 /* 00533 * The cases all fall through. See "Note A" below. 00534 */ 00535 switch (extraBytesToRead) { 00536 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 00537 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 00538 case 3: ch += *source++; ch <<= 6; 00539 case 2: ch += *source++; ch <<= 6; 00540 case 1: ch += *source++; ch <<= 6; 00541 case 0: ch += *source++; 00542 } 00543 ch -= offsetsFromUTF8[extraBytesToRead]; 00544 00545 if (target >= targetEnd) { 00546 source -= (extraBytesToRead+1); /* Back up source pointer! */ 00547 result = targetExhausted; break; 00548 } 00549 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 00550 /* UTF-16 surrogate values are illegal in UTF-32 */ 00551 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 00552 if (flags == strictConversion) { 00553 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 00554 result = sourceIllegal; 00555 break; 00556 } else { 00557 *target++ = UNI_REPLACEMENT_CHAR; 00558 } 00559 } else { 00560 *target++ = (UTF16)ch; /* normal case */ 00561 } 00562 } else if (ch > UNI_MAX_UTF16) { 00563 if (flags == strictConversion) { 00564 result = sourceIllegal; 00565 source -= (extraBytesToRead+1); /* return to the start */ 00566 break; /* Bail out; shouldn't continue */ 00567 } else { 00568 *target++ = UNI_REPLACEMENT_CHAR; 00569 } 00570 } else { 00571 /* target is a character in range 0xFFFF - 0x10FFFF. */ 00572 if (target + 1 >= targetEnd) { 00573 source -= (extraBytesToRead+1); /* Back up source pointer! */ 00574 result = targetExhausted; break; 00575 } 00576 ch -= halfBase; 00577 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 00578 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 00579 } 00580 } 00581 *sourceStart = source; 00582 *targetStart = target; 00583 return result; 00584 } 00585 00586 /* --------------------------------------------------------------------- */ 00587 00588 static ConversionResult ConvertUTF8toUTF32Impl( 00589 const UTF8** sourceStart, const UTF8* sourceEnd, 00590 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, 00591 Boolean InputIsPartial) { 00592 ConversionResult result = conversionOK; 00593 const UTF8* source = *sourceStart; 00594 UTF32* target = *targetStart; 00595 while (source < sourceEnd) { 00596 UTF32 ch = 0; 00597 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 00598 if (extraBytesToRead >= sourceEnd - source) { 00599 if (flags == strictConversion || InputIsPartial) { 00600 result = sourceExhausted; 00601 break; 00602 } else { 00603 result = sourceIllegal; 00604 00605 /* 00606 * Replace the maximal subpart of ill-formed sequence with 00607 * replacement character. 00608 */ 00609 source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 00610 sourceEnd); 00611 *target++ = UNI_REPLACEMENT_CHAR; 00612 continue; 00613 } 00614 } 00615 if (target >= targetEnd) { 00616 result = targetExhausted; break; 00617 } 00618 00619 /* Do this check whether lenient or strict */ 00620 if (!isLegalUTF8(source, extraBytesToRead+1)) { 00621 result = sourceIllegal; 00622 if (flags == strictConversion) { 00623 /* Abort conversion. */ 00624 break; 00625 } else { 00626 /* 00627 * Replace the maximal subpart of ill-formed sequence with 00628 * replacement character. 00629 */ 00630 source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 00631 sourceEnd); 00632 *target++ = UNI_REPLACEMENT_CHAR; 00633 continue; 00634 } 00635 } 00636 /* 00637 * The cases all fall through. See "Note A" below. 00638 */ 00639 switch (extraBytesToRead) { 00640 case 5: ch += *source++; ch <<= 6; 00641 case 4: ch += *source++; ch <<= 6; 00642 case 3: ch += *source++; ch <<= 6; 00643 case 2: ch += *source++; ch <<= 6; 00644 case 1: ch += *source++; ch <<= 6; 00645 case 0: ch += *source++; 00646 } 00647 ch -= offsetsFromUTF8[extraBytesToRead]; 00648 00649 if (ch <= UNI_MAX_LEGAL_UTF32) { 00650 /* 00651 * UTF-16 surrogate values are illegal in UTF-32, and anything 00652 * over Plane 17 (> 0x10FFFF) is illegal. 00653 */ 00654 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 00655 if (flags == strictConversion) { 00656 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 00657 result = sourceIllegal; 00658 break; 00659 } else { 00660 *target++ = UNI_REPLACEMENT_CHAR; 00661 } 00662 } else { 00663 *target++ = ch; 00664 } 00665 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 00666 result = sourceIllegal; 00667 *target++ = UNI_REPLACEMENT_CHAR; 00668 } 00669 } 00670 *sourceStart = source; 00671 *targetStart = target; 00672 return result; 00673 } 00674 00675 ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, 00676 const UTF8 *sourceEnd, 00677 UTF32 **targetStart, 00678 UTF32 *targetEnd, 00679 ConversionFlags flags) { 00680 return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 00681 flags, /*InputIsPartial=*/true); 00682 } 00683 00684 ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, 00685 const UTF8 *sourceEnd, UTF32 **targetStart, 00686 UTF32 *targetEnd, ConversionFlags flags) { 00687 return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 00688 flags, /*InputIsPartial=*/false); 00689 } 00690 00691 /* --------------------------------------------------------------------- 00692 00693 Note A. 00694 The fall-through switches in UTF-8 reading code save a 00695 temp variable, some decrements & conditionals. The switches 00696 are equivalent to the following loop: 00697 { 00698 int tmpBytesToRead = extraBytesToRead+1; 00699 do { 00700 ch += *source++; 00701 --tmpBytesToRead; 00702 if (tmpBytesToRead) ch <<= 6; 00703 } while (tmpBytesToRead > 0); 00704 } 00705 In UTF-8 writing code, the switches on "bytesToWrite" are 00706 similarly unrolled loops. 00707 00708 --------------------------------------------------------------------- */