clang API Documentation
00001 //===--- Encoding.h - Format C++ code -------------------------------------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 /// 00010 /// \file 00011 /// \brief Contains functions for text encoding manipulation. Supports UTF-8, 00012 /// 8-bit encodings and escape sequences in C++ string literals. 00013 /// 00014 //===----------------------------------------------------------------------===// 00015 00016 #ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H 00017 #define LLVM_CLANG_LIB_FORMAT_ENCODING_H 00018 00019 #include "clang/Basic/LLVM.h" 00020 #include "llvm/Support/ConvertUTF.h" 00021 #include "llvm/Support/Unicode.h" 00022 00023 namespace clang { 00024 namespace format { 00025 namespace encoding { 00026 00027 enum Encoding { 00028 Encoding_UTF8, 00029 Encoding_Unknown // We treat all other encodings as 8-bit encodings. 00030 }; 00031 00032 /// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8, 00033 /// it is considered UTF8, otherwise we treat it as some 8-bit encoding. 00034 inline Encoding detectEncoding(StringRef Text) { 00035 const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin()); 00036 const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end()); 00037 if (::isLegalUTF8String(&Ptr, BufEnd)) 00038 return Encoding_UTF8; 00039 return Encoding_Unknown; 00040 } 00041 00042 inline unsigned getCodePointCountUTF8(StringRef Text) { 00043 unsigned CodePoints = 0; 00044 for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) { 00045 ++CodePoints; 00046 } 00047 return CodePoints; 00048 } 00049 00050 /// \brief Gets the number of code points in the Text using the specified 00051 /// Encoding. 00052 inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) { 00053 switch (Encoding) { 00054 case Encoding_UTF8: 00055 return getCodePointCountUTF8(Text); 00056 default: 00057 return Text.size(); 00058 } 00059 } 00060 00061 /// \brief Returns the number of columns required to display the \p Text on a 00062 /// generic Unicode-capable terminal. Text is assumed to use the specified 00063 /// \p Encoding. 00064 inline unsigned columnWidth(StringRef Text, Encoding Encoding) { 00065 if (Encoding == Encoding_UTF8) { 00066 int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text); 00067 // FIXME: Figure out the correct way to handle this in the presence of both 00068 // printable and unprintable multi-byte UTF-8 characters. Falling back to 00069 // returning the number of bytes may cause problems, as columnWidth suddenly 00070 // becomes non-additive. 00071 if (ContentWidth >= 0) 00072 return ContentWidth; 00073 } 00074 return Text.size(); 00075 } 00076 00077 /// \brief Returns the number of columns required to display the \p Text, 00078 /// starting from the \p StartColumn on a terminal with the \p TabWidth. The 00079 /// text is assumed to use the specified \p Encoding. 00080 inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, 00081 unsigned TabWidth, Encoding Encoding) { 00082 unsigned TotalWidth = 0; 00083 StringRef Tail = Text; 00084 for (;;) { 00085 StringRef::size_type TabPos = Tail.find('\t'); 00086 if (TabPos == StringRef::npos) 00087 return TotalWidth + columnWidth(Tail, Encoding); 00088 TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding); 00089 TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth; 00090 Tail = Tail.substr(TabPos + 1); 00091 } 00092 } 00093 00094 /// \brief Gets the number of bytes in a sequence representing a single 00095 /// codepoint and starting with FirstChar in the specified Encoding. 00096 inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) { 00097 switch (Encoding) { 00098 case Encoding_UTF8: 00099 return getNumBytesForUTF8(FirstChar); 00100 default: 00101 return 1; 00102 } 00103 } 00104 00105 inline bool isOctDigit(char c) { return '0' <= c && c <= '7'; } 00106 00107 inline bool isHexDigit(char c) { 00108 return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || 00109 ('A' <= c && c <= 'F'); 00110 } 00111 00112 /// \brief Gets the length of an escape sequence inside a C++ string literal. 00113 /// Text should span from the beginning of the escape sequence (starting with a 00114 /// backslash) to the end of the string literal. 00115 inline unsigned getEscapeSequenceLength(StringRef Text) { 00116 assert(Text[0] == '\\'); 00117 if (Text.size() < 2) 00118 return 1; 00119 00120 switch (Text[1]) { 00121 case 'u': 00122 return 6; 00123 case 'U': 00124 return 10; 00125 case 'x': { 00126 unsigned I = 2; // Point after '\x'. 00127 while (I < Text.size() && isHexDigit(Text[I])) 00128 ++I; 00129 return I; 00130 } 00131 default: 00132 if (isOctDigit(Text[1])) { 00133 unsigned I = 1; 00134 while (I < Text.size() && I < 4 && isOctDigit(Text[I])) 00135 ++I; 00136 return I; 00137 } 00138 return 2; 00139 } 00140 } 00141 00142 } // namespace encoding 00143 } // namespace format 00144 } // namespace clang 00145 00146 #endif