// clang API Documentation
00001 //===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file implements the TokenConcatenation class. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 00014 #include "clang/Lex/TokenConcatenation.h" 00015 #include "clang/Basic/CharInfo.h" 00016 #include "clang/Lex/Preprocessor.h" 00017 #include "llvm/Support/ErrorHandling.h" 00018 using namespace clang; 00019 00020 00021 /// IsStringPrefix - Return true if Str is a string prefix. 00022 /// 'L', 'u', 'U', or 'u8'. Including raw versions. 00023 static bool IsStringPrefix(StringRef Str, bool CPlusPlus11) { 00024 00025 if (Str[0] == 'L' || 00026 (CPlusPlus11 && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) { 00027 00028 if (Str.size() == 1) 00029 return true; // "L", "u", "U", and "R" 00030 00031 // Check for raw flavors. Need to make sure the first character wasn't 00032 // already R. Need CPlusPlus11 check for "LR". 00033 if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus11) 00034 return true; // "LR", "uR", "UR" 00035 00036 // Check for "u8" and "u8R" 00037 if (Str[0] == 'u' && Str[1] == '8') { 00038 if (Str.size() == 2) return true; // "u8" 00039 if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R" 00040 } 00041 } 00042 00043 return false; 00044 } 00045 00046 /// IsIdentifierStringPrefix - Return true if the spelling of the token 00047 /// is literally 'L', 'u', 'U', or 'u8'. Including raw versions. 
00048 bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const { 00049 const LangOptions &LangOpts = PP.getLangOpts(); 00050 00051 if (!Tok.needsCleaning()) { 00052 if (Tok.getLength() < 1 || Tok.getLength() > 3) 00053 return false; 00054 SourceManager &SM = PP.getSourceManager(); 00055 const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); 00056 return IsStringPrefix(StringRef(Ptr, Tok.getLength()), 00057 LangOpts.CPlusPlus11); 00058 } 00059 00060 if (Tok.getLength() < 256) { 00061 char Buffer[256]; 00062 const char *TokPtr = Buffer; 00063 unsigned length = PP.getSpelling(Tok, TokPtr); 00064 return IsStringPrefix(StringRef(TokPtr, length), LangOpts.CPlusPlus11); 00065 } 00066 00067 return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus11); 00068 } 00069 00070 TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) { 00071 memset(TokenInfo, 0, sizeof(TokenInfo)); 00072 00073 // These tokens have custom code in AvoidConcat. 00074 TokenInfo[tok::identifier ] |= aci_custom; 00075 TokenInfo[tok::numeric_constant] |= aci_custom_firstchar; 00076 TokenInfo[tok::period ] |= aci_custom_firstchar; 00077 TokenInfo[tok::amp ] |= aci_custom_firstchar; 00078 TokenInfo[tok::plus ] |= aci_custom_firstchar; 00079 TokenInfo[tok::minus ] |= aci_custom_firstchar; 00080 TokenInfo[tok::slash ] |= aci_custom_firstchar; 00081 TokenInfo[tok::less ] |= aci_custom_firstchar; 00082 TokenInfo[tok::greater ] |= aci_custom_firstchar; 00083 TokenInfo[tok::pipe ] |= aci_custom_firstchar; 00084 TokenInfo[tok::percent ] |= aci_custom_firstchar; 00085 TokenInfo[tok::colon ] |= aci_custom_firstchar; 00086 TokenInfo[tok::hash ] |= aci_custom_firstchar; 00087 TokenInfo[tok::arrow ] |= aci_custom_firstchar; 00088 00089 // These tokens have custom code in C++11 mode. 
00090 if (PP.getLangOpts().CPlusPlus11) { 00091 TokenInfo[tok::string_literal ] |= aci_custom; 00092 TokenInfo[tok::wide_string_literal ] |= aci_custom; 00093 TokenInfo[tok::utf8_string_literal ] |= aci_custom; 00094 TokenInfo[tok::utf16_string_literal] |= aci_custom; 00095 TokenInfo[tok::utf32_string_literal] |= aci_custom; 00096 TokenInfo[tok::char_constant ] |= aci_custom; 00097 TokenInfo[tok::wide_char_constant ] |= aci_custom; 00098 TokenInfo[tok::utf16_char_constant ] |= aci_custom; 00099 TokenInfo[tok::utf32_char_constant ] |= aci_custom; 00100 } 00101 00102 // These tokens have custom code in C++1z mode. 00103 if (PP.getLangOpts().CPlusPlus1z) 00104 TokenInfo[tok::utf8_char_constant] |= aci_custom; 00105 00106 // These tokens change behavior if followed by an '='. 00107 TokenInfo[tok::amp ] |= aci_avoid_equal; // &= 00108 TokenInfo[tok::plus ] |= aci_avoid_equal; // += 00109 TokenInfo[tok::minus ] |= aci_avoid_equal; // -= 00110 TokenInfo[tok::slash ] |= aci_avoid_equal; // /= 00111 TokenInfo[tok::less ] |= aci_avoid_equal; // <= 00112 TokenInfo[tok::greater ] |= aci_avoid_equal; // >= 00113 TokenInfo[tok::pipe ] |= aci_avoid_equal; // |= 00114 TokenInfo[tok::percent ] |= aci_avoid_equal; // %= 00115 TokenInfo[tok::star ] |= aci_avoid_equal; // *= 00116 TokenInfo[tok::exclaim ] |= aci_avoid_equal; // != 00117 TokenInfo[tok::lessless ] |= aci_avoid_equal; // <<= 00118 TokenInfo[tok::greatergreater] |= aci_avoid_equal; // >>= 00119 TokenInfo[tok::caret ] |= aci_avoid_equal; // ^= 00120 TokenInfo[tok::equal ] |= aci_avoid_equal; // == 00121 } 00122 00123 /// GetFirstChar - Get the first character of the token \arg Tok, 00124 /// avoiding calls to getSpelling where possible. 00125 static char GetFirstChar(Preprocessor &PP, const Token &Tok) { 00126 if (IdentifierInfo *II = Tok.getIdentifierInfo()) { 00127 // Avoid spelling identifiers, the most common form of token. 
00128 return II->getNameStart()[0]; 00129 } else if (!Tok.needsCleaning()) { 00130 if (Tok.isLiteral() && Tok.getLiteralData()) { 00131 return *Tok.getLiteralData(); 00132 } else { 00133 SourceManager &SM = PP.getSourceManager(); 00134 return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); 00135 } 00136 } else if (Tok.getLength() < 256) { 00137 char Buffer[256]; 00138 const char *TokPtr = Buffer; 00139 PP.getSpelling(Tok, TokPtr); 00140 return TokPtr[0]; 00141 } else { 00142 return PP.getSpelling(Tok)[0]; 00143 } 00144 } 00145 00146 /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause 00147 /// the two individual tokens to be lexed as a single token, return true 00148 /// (which causes a space to be printed between them). This allows the output 00149 /// of -E mode to be lexed to the same token stream as lexing the input 00150 /// directly would. 00151 /// 00152 /// This code must conservatively return true if it doesn't want to be 100% 00153 /// accurate. This will cause the output to include extra space characters, 00154 /// but the resulting output won't have incorrect concatenations going on. 00155 /// Examples include "..", which we print with a space between, because we 00156 /// don't want to track enough to tell "x.." from "...". 00157 bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, 00158 const Token &PrevTok, 00159 const Token &Tok) const { 00160 // First, check to see if the tokens were directly adjacent in the original 00161 // source. If they were, it must be okay to stick them together: if there 00162 // were an issue, the tokens would have been lexed differently. 
00163 SourceManager &SM = PP.getSourceManager(); 00164 SourceLocation PrevSpellLoc = SM.getSpellingLoc(PrevTok.getLocation()); 00165 SourceLocation SpellLoc = SM.getSpellingLoc(Tok.getLocation()); 00166 if (PrevSpellLoc.getLocWithOffset(PrevTok.getLength()) == SpellLoc) 00167 return false; 00168 00169 tok::TokenKind PrevKind = PrevTok.getKind(); 00170 if (!PrevTok.isAnnotation() && PrevTok.getIdentifierInfo()) 00171 PrevKind = tok::identifier; // Language keyword or named operator. 00172 00173 // Look up information on when we should avoid concatenation with prevtok. 00174 unsigned ConcatInfo = TokenInfo[PrevKind]; 00175 00176 // If prevtok never causes a problem for anything after it, return quickly. 00177 if (ConcatInfo == 0) return false; 00178 00179 if (ConcatInfo & aci_avoid_equal) { 00180 // If the next token is '=' or '==', avoid concatenation. 00181 if (Tok.is(tok::equal) || Tok.is(tok::equalequal)) 00182 return true; 00183 ConcatInfo &= ~aci_avoid_equal; 00184 } 00185 if (Tok.isAnnotation()) { 00186 // Modules annotation can show up when generated automatically for includes. 00187 assert((Tok.is(tok::annot_module_include) || 00188 Tok.is(tok::annot_module_begin) || 00189 Tok.is(tok::annot_module_end)) && 00190 "unexpected annotation in AvoidConcat"); 00191 ConcatInfo = 0; 00192 } 00193 00194 if (ConcatInfo == 0) return false; 00195 00196 // Basic algorithm: we look at the first character of the second token, and 00197 // determine whether it, if appended to the first token, would form (or 00198 // would contribute) to a larger token if concatenated. 00199 char FirstChar = 0; 00200 if (ConcatInfo & aci_custom) { 00201 // If the token does not need to know the first character, don't get it. 
00202 } else { 00203 FirstChar = GetFirstChar(PP, Tok); 00204 } 00205 00206 switch (PrevKind) { 00207 default: 00208 llvm_unreachable("InitAvoidConcatTokenInfo built wrong"); 00209 00210 case tok::raw_identifier: 00211 llvm_unreachable("tok::raw_identifier in non-raw lexing mode!"); 00212 00213 case tok::string_literal: 00214 case tok::wide_string_literal: 00215 case tok::utf8_string_literal: 00216 case tok::utf16_string_literal: 00217 case tok::utf32_string_literal: 00218 case tok::char_constant: 00219 case tok::wide_char_constant: 00220 case tok::utf8_char_constant: 00221 case tok::utf16_char_constant: 00222 case tok::utf32_char_constant: 00223 if (!PP.getLangOpts().CPlusPlus11) 00224 return false; 00225 00226 // In C++11, a string or character literal followed by an identifier is a 00227 // single token. 00228 if (Tok.getIdentifierInfo()) 00229 return true; 00230 00231 // A ud-suffix is an identifier. If the previous token ends with one, treat 00232 // it as an identifier. 00233 if (!PrevTok.hasUDSuffix()) 00234 return false; 00235 // FALL THROUGH. 00236 case tok::identifier: // id+id or id+number or id+L"foo". 00237 // id+'.'... will not append. 00238 if (Tok.is(tok::numeric_constant)) 00239 return GetFirstChar(PP, Tok) != '.'; 00240 00241 if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) || 00242 Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) || 00243 Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) || 00244 Tok.is(tok::utf8_char_constant) || Tok.is(tok::utf16_char_constant) || 00245 Tok.is(tok::utf32_char_constant)) 00246 return true; 00247 00248 // If this isn't identifier + string, we're done. 00249 if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal)) 00250 return false; 00251 00252 // Otherwise, this is a narrow character or string. If the *identifier* 00253 // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo". 
00254 return IsIdentifierStringPrefix(PrevTok); 00255 00256 case tok::numeric_constant: 00257 return isPreprocessingNumberBody(FirstChar) || 00258 FirstChar == '+' || FirstChar == '-'; 00259 case tok::period: // ..., .*, .1234 00260 return (FirstChar == '.' && PrevPrevTok.is(tok::period)) || 00261 isDigit(FirstChar) || 00262 (PP.getLangOpts().CPlusPlus && FirstChar == '*'); 00263 case tok::amp: // && 00264 return FirstChar == '&'; 00265 case tok::plus: // ++ 00266 return FirstChar == '+'; 00267 case tok::minus: // --, ->, ->* 00268 return FirstChar == '-' || FirstChar == '>'; 00269 case tok::slash: //, /*, // 00270 return FirstChar == '*' || FirstChar == '/'; 00271 case tok::less: // <<, <<=, <:, <% 00272 return FirstChar == '<' || FirstChar == ':' || FirstChar == '%'; 00273 case tok::greater: // >>, >>= 00274 return FirstChar == '>'; 00275 case tok::pipe: // || 00276 return FirstChar == '|'; 00277 case tok::percent: // %>, %: 00278 return FirstChar == '>' || FirstChar == ':'; 00279 case tok::colon: // ::, :> 00280 return FirstChar == '>' || 00281 (PP.getLangOpts().CPlusPlus && FirstChar == ':'); 00282 case tok::hash: // ##, #@, %:%: 00283 return FirstChar == '#' || FirstChar == '@' || FirstChar == '%'; 00284 case tok::arrow: // ->* 00285 return PP.getLangOpts().CPlusPlus && FirstChar == '*'; 00286 } 00287 }