LLVM API Documentation
/*===--- ConvertUTF.h - Universal Character Names conversions ---------------===
 *
 *                     The LLVM Compiler Infrastructure
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *
 *==------------------------------------------------------------------------==*/
/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversions between UTF32, UTF-16, and UTF-8.  Header file.

    Several functions are included here, forming a complete set of
    conversions between the three formats.  UTF-7 is not included
    here, but is handled in a separate source file.

    Each of these routines takes pointers to input buffers and output
    buffers.  The input buffers are const.

    Each routine converts the text between *sourceStart and sourceEnd,
    putting the result into the buffer between *targetStart and
    targetEnd.  Note: the end pointers are *after* the last item: e.g.
    *(sourceEnd - 1) is the last item.

    The return result indicates whether the conversion was successful,
    and if not, whether the problem was in the source or target buffers.
    (Only the first encountered problem is indicated.)

    After the conversion, *sourceStart and *targetStart are both
    updated to point to the end of last text successfully converted in
    the respective buffers.

    Input parameters:
        sourceStart - pointer to a pointer to the source buffer.
                The contents of this are modified on return so that
                it points at the next thing to be converted.
        targetStart - similarly, pointer to pointer to the target buffer.
        sourceEnd, targetEnd - respectively pointers to the ends of the
                two buffers, for overflow checking only.

    These conversion functions take a ConversionFlags argument.  When this
    flag is set to strict, both irregular sequences and isolated surrogates
    will cause an error.  When the flag is set to lenient, both irregular
    sequences and isolated surrogates are converted.

    Whether the flag is strict or lenient, all illegal sequences will cause
    an error return.  This includes sequences such as: <F4 90 80 80>, <C0 80>,
    or <A0> in UTF-8, and values above 0x10FFFF in UTF-32.  Conformant code
    must check for illegal sequences.

    When the flag is set to lenient, characters over 0x10FFFF are converted
    to the replacement character; otherwise (when the flag is set to strict)
    they constitute an error.

    Output parameters:
        The value "sourceIllegal" is returned from some routines if the input
        sequence is malformed.  When "sourceIllegal" is returned, the source
        value will point to the illegal value that caused the problem.  E.g.,
        in UTF-8 when a sequence is malformed, it points to the start of the
        malformed sequence.

    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
         Fixes & updates, Sept 2001.

------------------------------------------------------------------------ */

#ifndef LLVM_SUPPORT_CONVERTUTF_H
#define LLVM_SUPPORT_CONVERTUTF_H

/* ---------------------------------------------------------------------
    The following 4 definitions are compiler-specific.
    The C standard does not guarantee that wchar_t has at least
    16 bits, so wchar_t is no less portable than unsigned short!
    All should be unsigned values to avoid sign extension during
    bit mask & shift operations.
------------------------------------------------------------------------ */

typedef unsigned int    UTF32;   /* at least 32 bits */
typedef unsigned short  UTF16;   /* at least 16 bits */
typedef unsigned char   UTF8;    /* typically 8 bits */
typedef unsigned char   Boolean; /* 0 or 1 */

/* Some fundamental constants.
 * Each expansion is fully parenthesized so that the cast cannot be split
 * from its operand by a neighbouring operator (e.g. `sizeof UNI_MAX_BMP`). */
#define UNI_REPLACEMENT_CHAR ((UTF32)0x0000FFFD)
#define UNI_MAX_BMP          ((UTF32)0x0000FFFF)
#define UNI_MAX_UTF16        ((UTF32)0x0010FFFF)
#define UNI_MAX_UTF32        ((UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32  ((UTF32)0x0010FFFF)

#define UNI_MAX_UTF8_BYTES_PER_CODE_POINT 4

#define UNI_UTF16_BYTE_ORDER_MARK_NATIVE  0xFEFF
#define UNI_UTF16_BYTE_ORDER_MARK_SWAPPED 0xFFFE

typedef enum {
  conversionOK,           /* conversion successful */
  sourceExhausted,        /* partial character in source, but hit end */
  targetExhausted,        /* insuff. room in target for conversion */
  sourceIllegal           /* source sequence is illegal/malformed */
} ConversionResult;

typedef enum {
  strictConversion = 0,
  lenientConversion
} ConversionFlags;

/* This is for C++ and does no harm in C */
#ifdef __cplusplus
extern "C" {
#endif

/*
 * All Convert* routines below follow the buffer contract described in the
 * comment at the top of this file: they read from [*sourceStart, sourceEnd),
 * write to [*targetStart, targetEnd), and advance both start pointers.
 */

ConversionResult ConvertUTF8toUTF16 (
  const UTF8** sourceStart, const UTF8* sourceEnd,
  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);

/**
 * Convert a partial UTF8 sequence to UTF32.  If the sequence ends in an
 * incomplete code unit sequence, returns \c sourceExhausted.
 */
ConversionResult ConvertUTF8toUTF32Partial(
  const UTF8** sourceStart, const UTF8* sourceEnd,
  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);

/**
 * Convert a UTF8 sequence to UTF32.  Unlike \c ConvertUTF8toUTF32Partial,
 * if the sequence ends in an incomplete code unit sequence, returns
 * \c sourceIllegal.
 */
ConversionResult ConvertUTF8toUTF32(
  const UTF8** sourceStart, const UTF8* sourceEnd,
  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF16toUTF8 (
  const UTF16** sourceStart, const UTF16* sourceEnd,
  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF32toUTF8 (
  const UTF32** sourceStart, const UTF32* sourceEnd,
  UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF16toUTF32 (
  const UTF16** sourceStart, const UTF16* sourceEnd,
  UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);

ConversionResult ConvertUTF32toUTF16 (
  const UTF32** sourceStart, const UTF32* sourceEnd,
  UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);

Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);

Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);

unsigned getNumBytesForUTF8(UTF8 firstByte);

#ifdef __cplusplus
}

/*************************************************************************/
/* Below are LLVM-specific wrappers of the functions above. */

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"

/* Included explicitly rather than relied on transitively: the wrappers below
 * use std::ptrdiff_t and std::string. */
#include <cstddef>
#include <string>

namespace llvm {

/**
 * Convert a UTF8 StringRef to UTF8, UTF16, or UTF32 depending on
 * WideCharWidth. The converted data is written to ResultPtr, which needs to
 * point to at least WideCharWidth * (Source.size() + 1) bytes. On success,
 * ResultPtr will point one after the end of the copied string. On failure,
 * ResultPtr will not be changed, and ErrorPtr will be set to the location of
 * the first character which could not be converted.
 * \return true on success.
 */
bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
                       char *&ResultPtr, const UTF8 *&ErrorPtr);

/**
 * Convert a Unicode code point to a UTF8 sequence.
 *
 * \param Source a Unicode code point.
 * \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
 * \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes.  On success \c ResultPtr is
 * updated one past end of the converted sequence.
 *
 * \returns true on success.
 */
bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);

/**
 * Convert the first UTF8 sequence in the given source buffer to a UTF32
 * code point.
 *
 * \param [in,out] source A pointer to the source buffer. If the conversion
 * succeeds, this pointer will be updated to point to the byte just past the
 * end of the converted sequence.
 * \param sourceEnd A pointer just past the end of the source buffer.
 * \param [out] target The converted code point.
 * \param flags Whether the conversion is strict or lenient.
 *
 * \returns conversionOK on success; sourceExhausted if the buffer is empty or
 * ends before the sequence announced by its first byte is complete; otherwise
 * whatever ConvertUTF8toUTF32 returns for that sequence.
 *
 * \sa ConvertUTF8toUTF32
 */
static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
                                                   const UTF8 *sourceEnd,
                                                   UTF32 *target,
                                                   ConversionFlags flags) {
  if (*source == sourceEnd)
    return sourceExhausted;
  /* The first byte determines how many bytes the sequence should occupy. */
  unsigned size = getNumBytesForUTF8(**source);
  if ((std::ptrdiff_t)size > sourceEnd - *source)
    return sourceExhausted;
  /* Restrict the conversion to exactly one sequence and one output slot.
   * &target is the address of the local parameter copy, so the caller's
   * target pointer is not advanced. */
  return ConvertUTF8toUTF32(source, *source + size, &target, target + 1, flags);
}

/**
 * Returns true if a blob of text starts with a UTF-16 big or little endian byte
 * order mark.
 */
bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);

/**
 * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
 *
 * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
 * \param [out] Out Converted UTF-8 is stored here on success.
 * \returns true on success
 */
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);

} /* end namespace llvm */

#endif

/* --------------------------------------------------------------------- */

#endif