clang: Encoding.h Source File

Go to the documentation of this file.
00001 //===--- Encoding.h - Format C++ code -------------------------------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 ///
00010 /// \file
00011 /// \brief Contains functions for text encoding manipulation. Supports UTF-8,
00012 /// 8-bit encodings and escape sequences in C++ string literals.
00013 ///
00014 //===----------------------------------------------------------------------===//
00015 
00016 #ifndef LLVM_CLANG_LIB_FORMAT_ENCODING_H
00017 #define LLVM_CLANG_LIB_FORMAT_ENCODING_H
00018 
00019 #include "clang/Basic/LLVM.h"
00020 #include "llvm/Support/ConvertUTF.h"
00021 #include "llvm/Support/Unicode.h"
00022 
00023 namespace clang {
00024 namespace format {
00025 namespace encoding {
00026 
/// \brief Text encodings that the helpers in this header tell apart.
enum Encoding {
  /// The text is handled as UTF-8.
  Encoding_UTF8,

  /// Anything else. We treat all other encodings as 8-bit encodings.
  Encoding_Unknown
};
00031 
00032 /// \brief Detects encoding of the Text. If the Text can be decoded using UTF-8,
00033 /// it is considered UTF8, otherwise we treat it as some 8-bit encoding.
00034 inline Encoding detectEncoding(StringRef Text) {
00035   const UTF8 *Ptr = reinterpret_cast<const UTF8 *>(Text.begin());
00036   const UTF8 *BufEnd = reinterpret_cast<const UTF8 *>(Text.end());
00037   if (::isLegalUTF8String(&Ptr, BufEnd))
00038     return Encoding_UTF8;
00039   return Encoding_Unknown;
00040 }
00041 
00042 inline unsigned getCodePointCountUTF8(StringRef Text) {
00043   unsigned CodePoints = 0;
00044   for (size_t i = 0, e = Text.size(); i < e; i += getNumBytesForUTF8(Text[i])) {
00045     ++CodePoints;
00046   }
00047   return CodePoints;
00048 }
00049 
00050 /// \brief Gets the number of code points in the Text using the specified
00051 /// Encoding.
00052 inline unsigned getCodePointCount(StringRef Text, Encoding Encoding) {
00053   switch (Encoding) {
00054   case Encoding_UTF8:
00055     return getCodePointCountUTF8(Text);
00056   default:
00057     return Text.size();
00058   }
00059 }
00060 
00061 /// \brief Returns the number of columns required to display the \p Text on a
00062 /// generic Unicode-capable terminal. Text is assumed to use the specified
00063 /// \p Encoding.
00064 inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
00065   if (Encoding == Encoding_UTF8) {
00066     int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
00067     // FIXME: Figure out the correct way to handle this in the presence of both
00068     // printable and unprintable multi-byte UTF-8 characters. Falling back to
00069     // returning the number of bytes may cause problems, as columnWidth suddenly
00070     // becomes non-additive.
00071     if (ContentWidth >= 0)
00072       return ContentWidth;
00073   }
00074   return Text.size();
00075 }
00076 
00077 /// \brief Returns the number of columns required to display the \p Text,
00078 /// starting from the \p StartColumn on a terminal with the \p TabWidth. The
00079 /// text is assumed to use the specified \p Encoding.
00080 inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
00081                                     unsigned TabWidth, Encoding Encoding) {
00082   unsigned TotalWidth = 0;
00083   StringRef Tail = Text;
00084   for (;;) {
00085     StringRef::size_type TabPos = Tail.find('\t');
00086     if (TabPos == StringRef::npos)
00087       return TotalWidth + columnWidth(Tail, Encoding);
00088     TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
00089     TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
00090     Tail = Tail.substr(TabPos + 1);
00091   }
00092 }
00093 
00094 /// \brief Gets the number of bytes in a sequence representing a single
00095 /// codepoint and starting with FirstChar in the specified Encoding.
00096 inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
00097   switch (Encoding) {
00098   case Encoding_UTF8:
00099     return getNumBytesForUTF8(FirstChar);
00100   default:
00101     return 1;
00102   }
00103 }
00104 
/// \brief Returns true if \p c is one of the octal digits '0' through '7'.
inline bool isOctDigit(char c) {
  return c >= '0' && c <= '7';
}
00106 
/// \brief Returns true if \p c is a hexadecimal digit: '0'-'9', 'a'-'f' or
/// 'A'-'F'.
inline bool isHexDigit(char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'a' && c <= 'f')
    return true;
  return c >= 'A' && c <= 'F';
}
00111 
00112 /// \brief Gets the length of an escape sequence inside a C++ string literal.
00113 /// Text should span from the beginning of the escape sequence (starting with a
00114 /// backslash) to the end of the string literal.
00115 inline unsigned getEscapeSequenceLength(StringRef Text) {
00116   assert(Text[0] == '\\');
00117   if (Text.size() < 2)
00118     return 1;
00119 
00120   switch (Text[1]) {
00121   case 'u':
00122     return 6;
00123   case 'U':
00124     return 10;
00125   case 'x': {
00126     unsigned I = 2; // Point after '\x'.
00127     while (I < Text.size() && isHexDigit(Text[I]))
00128       ++I;
00129     return I;
00130   }
00131   default:
00132     if (isOctDigit(Text[1])) {
00133       unsigned I = 1;
00134       while (I < Text.size() && I < 4 && isOctDigit(Text[I]))
00135         ++I;
00136       return I;
00137     }
00138     return 2;
00139   }
00140 }
00141 
00142 } // namespace encoding
00143 } // namespace format
00144 } // namespace clang
00145 
00146 #endif