LLVM API Documentation

ConvertUTF.c
Go to the documentation of this file.
00001 /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
00002  *
00003  *                     The LLVM Compiler Infrastructure
00004  *
00005  * This file is distributed under the University of Illinois Open Source
00006  * License. See LICENSE.TXT for details.
00007  *
00008  *===------------------------------------------------------------------------=*/
00009 /*
00010  * Copyright 2001-2004 Unicode, Inc.
00011  * 
00012  * Disclaimer
00013  * 
00014  * This source code is provided as is by Unicode, Inc. No claims are
00015  * made as to fitness for any particular purpose. No warranties of any
00016  * kind are expressed or implied. The recipient agrees to determine
00017  * applicability of information provided. If this file has been
00018  * purchased on magnetic or optical media from Unicode, Inc., the
00019  * sole remedy for any claim will be exchange of defective media
00020  * within 90 days of receipt.
00021  * 
00022  * Limitations on Rights to Redistribute This Code
00023  * 
00024  * Unicode, Inc. hereby grants the right to freely use the information
00025  * supplied in this file in the creation of products supporting the
00026  * Unicode Standard, and to make copies of this file in any form
00027  * for internal or external distribution as long as this notice
00028  * remains attached.
00029  */
00030 
00031 /* ---------------------------------------------------------------------
00032 
00033     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
00034     Author: Mark E. Davis, 1994.
00035     Rev History: Rick McGowan, fixes & updates May 2001.
00036     Sept 2001: fixed const & error conditions per
00037         mods suggested by S. Parent & A. Lillich.
00038     June 2002: Tim Dodd added detection and handling of incomplete
00039         source sequences, enhanced error detection, added casts
00040         to eliminate compiler warnings.
00041     July 2003: slight mods to back out aggressive FFFE detection.
00042     Jan 2004: updated switches in from-UTF8 conversions.
00043     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
00044 
00045     See the header file "ConvertUTF.h" for complete documentation.
00046 
00047 ------------------------------------------------------------------------ */
00048 
00049 
00050 #include "llvm/Support/ConvertUTF.h"
00051 #ifdef CVTUTF_DEBUG
00052 #include <stdio.h>
00053 #endif
00054 #include <assert.h>
00055 
00056 static const int halfShift  = 10; /* used for shifting by 10 bits */
00057 
00058 static const UTF32 halfBase = 0x0010000UL;
00059 static const UTF32 halfMask = 0x3FFUL;
00060 
00061 #define UNI_SUR_HIGH_START  (UTF32)0xD800
00062 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
00063 #define UNI_SUR_LOW_START   (UTF32)0xDC00
00064 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
00065 #define false      0
00066 #define true        1
00067 
00068 /* --------------------------------------------------------------------- */
00069 
/*
 * trailingBytesForUTF8[b] is the number of bytes expected to follow a UTF-8
 * sequence whose first byte is b.  Lead bytes 0xF8..0xFF map to 4 or 5
 * trailing bytes; such sequences are not legal UTF-8, and the entries are
 * kept only for code that wants to handle the older, longer forms.
 * Bytes 0x80..0xBF map to 0, i.e. they are treated as one-byte sequences
 * here and rejected later by isLegalUTF8().
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
00087 
00088 /*
00089  * Magic values subtracted from a buffer value during UTF8 conversion.
00090  * This table contains as many values as there might be trailing bytes
00091  * in a UTF-8 sequence.
00092  */
00093 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
00094                      0x03C82080UL, 0xFA082080UL, 0x82082080UL };
00095 
00096 /*
00097  * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
00098  * into the first byte, depending on how many bytes follow.  There are
00099  * as many entries in this table as there are UTF-8 sequence types.
00100  * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
00101  * for *legal* UTF-8 will be 4 or fewer bytes total.
00102  */
00103 static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00104 
00105 /* --------------------------------------------------------------------- */
00106 
00107 /* The interface converts a whole buffer to avoid function-call overhead.
00108  * Constants have been gathered. Loops & conditionals have been removed as
00109  * much as possible for efficiency, in favor of drop-through switches.
00110  * (See "Note A" at the bottom of the file for equivalent code.)
00111  * If your compiler supports it, the "isLegalUTF8" call can be turned
00112  * into an inline function.
00113  */
00114 
00115 
00116 /* --------------------------------------------------------------------- */
00117 
00118 ConversionResult ConvertUTF32toUTF16 (
00119         const UTF32** sourceStart, const UTF32* sourceEnd, 
00120         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
00121     ConversionResult result = conversionOK;
00122     const UTF32* source = *sourceStart;
00123     UTF16* target = *targetStart;
00124     while (source < sourceEnd) {
00125         UTF32 ch;
00126         if (target >= targetEnd) {
00127             result = targetExhausted; break;
00128         }
00129         ch = *source++;
00130         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
00131             /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
00132             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00133                 if (flags == strictConversion) {
00134                     --source; /* return to the illegal value itself */
00135                     result = sourceIllegal;
00136                     break;
00137                 } else {
00138                     *target++ = UNI_REPLACEMENT_CHAR;
00139                 }
00140             } else {
00141                 *target++ = (UTF16)ch; /* normal case */
00142             }
00143         } else if (ch > UNI_MAX_LEGAL_UTF32) {
00144             if (flags == strictConversion) {
00145                 result = sourceIllegal;
00146             } else {
00147                 *target++ = UNI_REPLACEMENT_CHAR;
00148             }
00149         } else {
00150             /* target is a character in range 0xFFFF - 0x10FFFF. */
00151             if (target + 1 >= targetEnd) {
00152                 --source; /* Back up source pointer! */
00153                 result = targetExhausted; break;
00154             }
00155             ch -= halfBase;
00156             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00157             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
00158         }
00159     }
00160     *sourceStart = source;
00161     *targetStart = target;
00162     return result;
00163 }
00164 
00165 /* --------------------------------------------------------------------- */
00166 
00167 ConversionResult ConvertUTF16toUTF32 (
00168         const UTF16** sourceStart, const UTF16* sourceEnd, 
00169         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
00170     ConversionResult result = conversionOK;
00171     const UTF16* source = *sourceStart;
00172     UTF32* target = *targetStart;
00173     UTF32 ch, ch2;
00174     while (source < sourceEnd) {
00175         const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
00176         ch = *source++;
00177         /* If we have a surrogate pair, convert to UTF32 first. */
00178         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
00179             /* If the 16 bits following the high surrogate are in the source buffer... */
00180             if (source < sourceEnd) {
00181                 ch2 = *source;
00182                 /* If it's a low surrogate, convert to UTF32. */
00183                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
00184                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00185                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
00186                     ++source;
00187                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
00188                     --source; /* return to the illegal value itself */
00189                     result = sourceIllegal;
00190                     break;
00191                 }
00192             } else { /* We don't have the 16 bits following the high surrogate. */
00193                 --source; /* return to the high surrogate */
00194                 result = sourceExhausted;
00195                 break;
00196             }
00197         } else if (flags == strictConversion) {
00198             /* UTF-16 surrogate values are illegal in UTF-32 */
00199             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
00200                 --source; /* return to the illegal value itself */
00201                 result = sourceIllegal;
00202                 break;
00203             }
00204         }
00205         if (target >= targetEnd) {
00206             source = oldSource; /* Back up source pointer! */
00207             result = targetExhausted; break;
00208         }
00209         *target++ = ch;
00210     }
00211     *sourceStart = source;
00212     *targetStart = target;
00213 #ifdef CVTUTF_DEBUG
00214 if (result == sourceIllegal) {
00215     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
00216     fflush(stderr);
00217 }
00218 #endif
00219     return result;
00220 }
00221 ConversionResult ConvertUTF16toUTF8 (
00222         const UTF16** sourceStart, const UTF16* sourceEnd, 
00223         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
00224     ConversionResult result = conversionOK;
00225     const UTF16* source = *sourceStart;
00226     UTF8* target = *targetStart;
00227     while (source < sourceEnd) {
00228         UTF32 ch;
00229         unsigned short bytesToWrite = 0;
00230         const UTF32 byteMask = 0xBF;
00231         const UTF32 byteMark = 0x80; 
00232         const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
00233         ch = *source++;
00234         /* If we have a surrogate pair, convert to UTF32 first. */
00235         if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
00236             /* If the 16 bits following the high surrogate are in the source buffer... */
00237             if (source < sourceEnd) {
00238                 UTF32 ch2 = *source;
00239                 /* If it's a low surrogate, convert to UTF32. */
00240                 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
00241                     ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
00242                         + (ch2 - UNI_SUR_LOW_START) + halfBase;
00243                     ++source;
00244                 } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
00245                     --source; /* return to the illegal value itself */
00246                     result = sourceIllegal;
00247                     break;
00248                 }
00249             } else { /* We don't have the 16 bits following the high surrogate. */
00250                 --source; /* return to the high surrogate */
00251                 result = sourceExhausted;
00252                 break;
00253             }
00254         } else if (flags == strictConversion) {
00255             /* UTF-16 surrogate values are illegal in UTF-32 */
00256             if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
00257                 --source; /* return to the illegal value itself */
00258                 result = sourceIllegal;
00259                 break;
00260             }
00261         }
00262         /* Figure out how many bytes the result will require */
00263         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
00264         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
00265         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
00266         } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
00267         } else {                            bytesToWrite = 3;
00268                                             ch = UNI_REPLACEMENT_CHAR;
00269         }
00270 
00271         target += bytesToWrite;
00272         if (target > targetEnd) {
00273             source = oldSource; /* Back up source pointer! */
00274             target -= bytesToWrite; result = targetExhausted; break;
00275         }
00276         switch (bytesToWrite) { /* note: everything falls through. */
00277             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00278             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00279             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00280             case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
00281         }
00282         target += bytesToWrite;
00283     }
00284     *sourceStart = source;
00285     *targetStart = target;
00286     return result;
00287 }
00288 
00289 /* --------------------------------------------------------------------- */
00290 
00291 ConversionResult ConvertUTF32toUTF8 (
00292         const UTF32** sourceStart, const UTF32* sourceEnd, 
00293         UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
00294     ConversionResult result = conversionOK;
00295     const UTF32* source = *sourceStart;
00296     UTF8* target = *targetStart;
00297     while (source < sourceEnd) {
00298         UTF32 ch;
00299         unsigned short bytesToWrite = 0;
00300         const UTF32 byteMask = 0xBF;
00301         const UTF32 byteMark = 0x80; 
00302         ch = *source++;
00303         if (flags == strictConversion ) {
00304             /* UTF-16 surrogate values are illegal in UTF-32 */
00305             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00306                 --source; /* return to the illegal value itself */
00307                 result = sourceIllegal;
00308                 break;
00309             }
00310         }
00311         /*
00312          * Figure out how many bytes the result will require. Turn any
00313          * illegally large UTF32 things (> Plane 17) into replacement chars.
00314          */
00315         if (ch < (UTF32)0x80) {      bytesToWrite = 1;
00316         } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
00317         } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
00318         } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
00319         } else {                            bytesToWrite = 3;
00320                                             ch = UNI_REPLACEMENT_CHAR;
00321                                             result = sourceIllegal;
00322         }
00323         
00324         target += bytesToWrite;
00325         if (target > targetEnd) {
00326             --source; /* Back up source pointer! */
00327             target -= bytesToWrite; result = targetExhausted; break;
00328         }
00329         switch (bytesToWrite) { /* note: everything falls through. */
00330             case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00331             case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00332             case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
00333             case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
00334         }
00335         target += bytesToWrite;
00336     }
00337     *sourceStart = source;
00338     *targetStart = target;
00339     return result;
00340 }
00341 
00342 /* --------------------------------------------------------------------- */
00343 
00344 /*
00345  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
00346  * This must be called with the length pre-determined by the first byte.
00347  * If not calling this from ConvertUTF8to*, then the length can be set by:
00348  *  length = trailingBytesForUTF8[*source]+1;
00349  * and the sequence is illegal right away if there aren't that many bytes
00350  * available.
00351  * If presented with a length > 4, this returns false.  The Unicode
00352  * definition of UTF-8 goes up to 4-byte sequences.
00353  */
00354 
00355 static Boolean isLegalUTF8(const UTF8 *source, int length) {
00356     UTF8 a;
00357     const UTF8 *srcptr = source+length;
00358     switch (length) {
00359     default: return false;
00360         /* Everything else falls through when "true"... */
00361     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00362     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00363     case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
00364 
00365         switch (*source) {
00366             /* no fall-through in this inner switch */
00367             case 0xE0: if (a < 0xA0) return false; break;
00368             case 0xED: if (a > 0x9F) return false; break;
00369             case 0xF0: if (a < 0x90) return false; break;
00370             case 0xF4: if (a > 0x8F) return false; break;
00371             default:   if (a < 0x80) return false;
00372         }
00373 
00374     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
00375     }
00376     if (*source > 0xF4) return false;
00377     return true;
00378 }
00379 
00380 /* --------------------------------------------------------------------- */
00381 
00382 /*
00383  * Exported function to return whether a UTF-8 sequence is legal or not.
00384  * This is not used here; it's just exported.
00385  */
00386 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
00387     int length = trailingBytesForUTF8[*source]+1;
00388     if (length > sourceEnd - source) {
00389         return false;
00390     }
00391     return isLegalUTF8(source, length);
00392 }
00393 
00394 /* --------------------------------------------------------------------- */
00395 
00396 static unsigned
00397 findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
00398                                           const UTF8 *sourceEnd) {
00399   UTF8 b1, b2, b3;
00400 
00401   assert(!isLegalUTF8Sequence(source, sourceEnd));
00402 
00403   /*
00404    * Unicode 6.3.0, D93b:
00405    *
00406    *   Maximal subpart of an ill-formed subsequence: The longest code unit
00407    *   subsequence starting at an unconvertible offset that is either:
00408    *   a. the initial subsequence of a well-formed code unit sequence, or
00409    *   b. a subsequence of length one.
00410    */
00411 
00412   if (source == sourceEnd)
00413     return 0;
00414 
00415   /*
00416    * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
00417    * Byte Sequences.
00418    */
00419 
00420   b1 = *source;
00421   ++source;
00422   if (b1 >= 0xC2 && b1 <= 0xDF) {
00423     /*
00424      * First byte is valid, but we know that this code unit sequence is
00425      * invalid, so the maximal subpart has to end after the first byte.
00426      */
00427     return 1;
00428   }
00429 
00430   if (source == sourceEnd)
00431     return 1;
00432 
00433   b2 = *source;
00434   ++source;
00435 
00436   if (b1 == 0xE0) {
00437     return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
00438   }
00439   if (b1 >= 0xE1 && b1 <= 0xEC) {
00440     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
00441   }
00442   if (b1 == 0xED) {
00443     return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
00444   }
00445   if (b1 >= 0xEE && b1 <= 0xEF) {
00446     return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
00447   }
00448   if (b1 == 0xF0) {
00449     if (b2 >= 0x90 && b2 <= 0xBF) {
00450       if (source == sourceEnd)
00451         return 2;
00452 
00453       b3 = *source;
00454       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
00455     }
00456     return 1;
00457   }
00458   if (b1 >= 0xF1 && b1 <= 0xF3) {
00459     if (b2 >= 0x80 && b2 <= 0xBF) {
00460       if (source == sourceEnd)
00461         return 2;
00462 
00463       b3 = *source;
00464       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
00465     }
00466     return 1;
00467   }
00468   if (b1 == 0xF4) {
00469     if (b2 >= 0x80 && b2 <= 0x8F) {
00470       if (source == sourceEnd)
00471         return 2;
00472 
00473       b3 = *source;
00474       return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
00475     }
00476     return 1;
00477   }
00478 
00479   assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
00480   /*
00481    * There are no valid sequences that start with these bytes.  Maximal subpart
00482    * is defined to have length 1 in these cases.
00483    */
00484   return 1;
00485 }
00486 
00487 /* --------------------------------------------------------------------- */
00488 
00489 /*
00490  * Exported function to return the total number of bytes in a codepoint
00491  * represented in UTF-8, given the value of the first byte.
00492  */
00493 unsigned getNumBytesForUTF8(UTF8 first) {
00494   return trailingBytesForUTF8[first] + 1;
00495 }
00496 
00497 /* --------------------------------------------------------------------- */
00498 
00499 /*
00500  * Exported function to return whether a UTF-8 string is legal or not.
00501  * This is not used here; it's just exported.
00502  */
00503 Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
00504     while (*source != sourceEnd) {
00505         int length = trailingBytesForUTF8[**source] + 1;
00506         if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
00507             return false;
00508         *source += length;
00509     }
00510     return true;
00511 }
00512 
00513 /* --------------------------------------------------------------------- */
00514 
00515 ConversionResult ConvertUTF8toUTF16 (
00516         const UTF8** sourceStart, const UTF8* sourceEnd, 
00517         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
00518     ConversionResult result = conversionOK;
00519     const UTF8* source = *sourceStart;
00520     UTF16* target = *targetStart;
00521     while (source < sourceEnd) {
00522         UTF32 ch = 0;
00523         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00524         if (extraBytesToRead >= sourceEnd - source) {
00525             result = sourceExhausted; break;
00526         }
00527         /* Do this check whether lenient or strict */
00528         if (!isLegalUTF8(source, extraBytesToRead+1)) {
00529             result = sourceIllegal;
00530             break;
00531         }
00532         /*
00533          * The cases all fall through. See "Note A" below.
00534          */
00535         switch (extraBytesToRead) {
00536             case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
00537             case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
00538             case 3: ch += *source++; ch <<= 6;
00539             case 2: ch += *source++; ch <<= 6;
00540             case 1: ch += *source++; ch <<= 6;
00541             case 0: ch += *source++;
00542         }
00543         ch -= offsetsFromUTF8[extraBytesToRead];
00544 
00545         if (target >= targetEnd) {
00546             source -= (extraBytesToRead+1); /* Back up source pointer! */
00547             result = targetExhausted; break;
00548         }
00549         if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
00550             /* UTF-16 surrogate values are illegal in UTF-32 */
00551             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00552                 if (flags == strictConversion) {
00553                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
00554                     result = sourceIllegal;
00555                     break;
00556                 } else {
00557                     *target++ = UNI_REPLACEMENT_CHAR;
00558                 }
00559             } else {
00560                 *target++ = (UTF16)ch; /* normal case */
00561             }
00562         } else if (ch > UNI_MAX_UTF16) {
00563             if (flags == strictConversion) {
00564                 result = sourceIllegal;
00565                 source -= (extraBytesToRead+1); /* return to the start */
00566                 break; /* Bail out; shouldn't continue */
00567             } else {
00568                 *target++ = UNI_REPLACEMENT_CHAR;
00569             }
00570         } else {
00571             /* target is a character in range 0xFFFF - 0x10FFFF. */
00572             if (target + 1 >= targetEnd) {
00573                 source -= (extraBytesToRead+1); /* Back up source pointer! */
00574                 result = targetExhausted; break;
00575             }
00576             ch -= halfBase;
00577             *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
00578             *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
00579         }
00580     }
00581     *sourceStart = source;
00582     *targetStart = target;
00583     return result;
00584 }
00585 
00586 /* --------------------------------------------------------------------- */
00587 
00588 static ConversionResult ConvertUTF8toUTF32Impl(
00589         const UTF8** sourceStart, const UTF8* sourceEnd, 
00590         UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
00591         Boolean InputIsPartial) {
00592     ConversionResult result = conversionOK;
00593     const UTF8* source = *sourceStart;
00594     UTF32* target = *targetStart;
00595     while (source < sourceEnd) {
00596         UTF32 ch = 0;
00597         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
00598         if (extraBytesToRead >= sourceEnd - source) {
00599             if (flags == strictConversion || InputIsPartial) {
00600                 result = sourceExhausted;
00601                 break;
00602             } else {
00603                 result = sourceIllegal;
00604 
00605                 /*
00606                  * Replace the maximal subpart of ill-formed sequence with
00607                  * replacement character.
00608                  */
00609                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
00610                                                                     sourceEnd);
00611                 *target++ = UNI_REPLACEMENT_CHAR;
00612                 continue;
00613             }
00614         }
00615         if (target >= targetEnd) {
00616             result = targetExhausted; break;
00617         }
00618 
00619         /* Do this check whether lenient or strict */
00620         if (!isLegalUTF8(source, extraBytesToRead+1)) {
00621             result = sourceIllegal;
00622             if (flags == strictConversion) {
00623                 /* Abort conversion. */
00624                 break;
00625             } else {
00626                 /*
00627                  * Replace the maximal subpart of ill-formed sequence with
00628                  * replacement character.
00629                  */
00630                 source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
00631                                                                     sourceEnd);
00632                 *target++ = UNI_REPLACEMENT_CHAR;
00633                 continue;
00634             }
00635         }
00636         /*
00637          * The cases all fall through. See "Note A" below.
00638          */
00639         switch (extraBytesToRead) {
00640             case 5: ch += *source++; ch <<= 6;
00641             case 4: ch += *source++; ch <<= 6;
00642             case 3: ch += *source++; ch <<= 6;
00643             case 2: ch += *source++; ch <<= 6;
00644             case 1: ch += *source++; ch <<= 6;
00645             case 0: ch += *source++;
00646         }
00647         ch -= offsetsFromUTF8[extraBytesToRead];
00648 
00649         if (ch <= UNI_MAX_LEGAL_UTF32) {
00650             /*
00651              * UTF-16 surrogate values are illegal in UTF-32, and anything
00652              * over Plane 17 (> 0x10FFFF) is illegal.
00653              */
00654             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
00655                 if (flags == strictConversion) {
00656                     source -= (extraBytesToRead+1); /* return to the illegal value itself */
00657                     result = sourceIllegal;
00658                     break;
00659                 } else {
00660                     *target++ = UNI_REPLACEMENT_CHAR;
00661                 }
00662             } else {
00663                 *target++ = ch;
00664             }
00665         } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
00666             result = sourceIllegal;
00667             *target++ = UNI_REPLACEMENT_CHAR;
00668         }
00669     }
00670     *sourceStart = source;
00671     *targetStart = target;
00672     return result;
00673 }
00674 
00675 ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
00676                                            const UTF8 *sourceEnd,
00677                                            UTF32 **targetStart,
00678                                            UTF32 *targetEnd,
00679                                            ConversionFlags flags) {
00680   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
00681                                 flags, /*InputIsPartial=*/true);
00682 }
00683 
00684 ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
00685                                     const UTF8 *sourceEnd, UTF32 **targetStart,
00686                                     UTF32 *targetEnd, ConversionFlags flags) {
00687   return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
00688                                 flags, /*InputIsPartial=*/false);
00689 }
00690 
00691 /* ---------------------------------------------------------------------
00692 
00693     Note A.
00694     The fall-through switches in UTF-8 reading code save a
00695     temp variable, some decrements & conditionals.  The switches
00696     are equivalent to the following loop:
00697         {
00698             int tmpBytesToRead = extraBytesToRead+1;
00699             do {
00700                 ch += *source++;
00701                 --tmpBytesToRead;
00702                 if (tmpBytesToRead) ch <<= 6;
00703             } while (tmpBytesToRead > 0);
00704         }
00705     In UTF-8 writing code, the switches on "bytesToWrite" are
00706     similarly unrolled loops.
00707 
00708    --------------------------------------------------------------------- */