clang: TokenConcatenation.cpp Source File

Go to the documentation of this file.
00001 //===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file implements the TokenConcatenation class.
00011 //
00012 //===----------------------------------------------------------------------===//
00013 
00014 #include "clang/Lex/TokenConcatenation.h"
00015 #include "clang/Basic/CharInfo.h"
00016 #include "clang/Lex/Preprocessor.h"
00017 #include "llvm/Support/ErrorHandling.h"
00018 using namespace clang;
00019 
00020 
00021 /// IsStringPrefix - Return true if Str is a string prefix.
00022 /// 'L', 'u', 'U', or 'u8'. Including raw versions.
00023 static bool IsStringPrefix(StringRef Str, bool CPlusPlus11) {
00024 
00025   if (Str[0] == 'L' ||
00026       (CPlusPlus11 && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) {
00027 
00028     if (Str.size() == 1)
00029       return true; // "L", "u", "U", and "R"
00030 
00031     // Check for raw flavors. Need to make sure the first character wasn't
00032     // already R. Need CPlusPlus11 check for "LR".
00033     if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus11)
00034       return true; // "LR", "uR", "UR"
00035 
00036     // Check for "u8" and "u8R"
00037     if (Str[0] == 'u' && Str[1] == '8') {
00038       if (Str.size() == 2) return true; // "u8"
00039       if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R"
00040     }
00041   }
00042 
00043   return false;
00044 }
00045 
00046 /// IsIdentifierStringPrefix - Return true if the spelling of the token
00047 /// is literally 'L', 'u', 'U', or 'u8'. Including raw versions.
00048 bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const {
00049   const LangOptions &LangOpts = PP.getLangOpts();
00050 
00051   if (!Tok.needsCleaning()) {
00052     if (Tok.getLength() < 1 || Tok.getLength() > 3)
00053       return false;
00054     SourceManager &SM = PP.getSourceManager();
00055     const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
00056     return IsStringPrefix(StringRef(Ptr, Tok.getLength()),
00057                           LangOpts.CPlusPlus11);
00058   }
00059 
00060   if (Tok.getLength() < 256) {
00061     char Buffer[256];
00062     const char *TokPtr = Buffer;
00063     unsigned length = PP.getSpelling(Tok, TokPtr);
00064     return IsStringPrefix(StringRef(TokPtr, length), LangOpts.CPlusPlus11);
00065   }
00066 
00067   return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus11);
00068 }
00069 
00070 TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) {
00071   memset(TokenInfo, 0, sizeof(TokenInfo));
00072 
00073   // These tokens have custom code in AvoidConcat.
00074   TokenInfo[tok::identifier      ] |= aci_custom;
00075   TokenInfo[tok::numeric_constant] |= aci_custom_firstchar;
00076   TokenInfo[tok::period          ] |= aci_custom_firstchar;
00077   TokenInfo[tok::amp             ] |= aci_custom_firstchar;
00078   TokenInfo[tok::plus            ] |= aci_custom_firstchar;
00079   TokenInfo[tok::minus           ] |= aci_custom_firstchar;
00080   TokenInfo[tok::slash           ] |= aci_custom_firstchar;
00081   TokenInfo[tok::less            ] |= aci_custom_firstchar;
00082   TokenInfo[tok::greater         ] |= aci_custom_firstchar;
00083   TokenInfo[tok::pipe            ] |= aci_custom_firstchar;
00084   TokenInfo[tok::percent         ] |= aci_custom_firstchar;
00085   TokenInfo[tok::colon           ] |= aci_custom_firstchar;
00086   TokenInfo[tok::hash            ] |= aci_custom_firstchar;
00087   TokenInfo[tok::arrow           ] |= aci_custom_firstchar;
00088 
00089   // These tokens have custom code in C++11 mode.
00090   if (PP.getLangOpts().CPlusPlus11) {
00091     TokenInfo[tok::string_literal      ] |= aci_custom;
00092     TokenInfo[tok::wide_string_literal ] |= aci_custom;
00093     TokenInfo[tok::utf8_string_literal ] |= aci_custom;
00094     TokenInfo[tok::utf16_string_literal] |= aci_custom;
00095     TokenInfo[tok::utf32_string_literal] |= aci_custom;
00096     TokenInfo[tok::char_constant       ] |= aci_custom;
00097     TokenInfo[tok::wide_char_constant  ] |= aci_custom;
00098     TokenInfo[tok::utf16_char_constant ] |= aci_custom;
00099     TokenInfo[tok::utf32_char_constant ] |= aci_custom;
00100   }
00101 
00102   // These tokens have custom code in C++1z mode.
00103   if (PP.getLangOpts().CPlusPlus1z)
00104     TokenInfo[tok::utf8_char_constant] |= aci_custom;
00105 
00106   // These tokens change behavior if followed by an '='.
00107   TokenInfo[tok::amp         ] |= aci_avoid_equal;           // &=
00108   TokenInfo[tok::plus        ] |= aci_avoid_equal;           // +=
00109   TokenInfo[tok::minus       ] |= aci_avoid_equal;           // -=
00110   TokenInfo[tok::slash       ] |= aci_avoid_equal;           // /=
00111   TokenInfo[tok::less        ] |= aci_avoid_equal;           // <=
00112   TokenInfo[tok::greater     ] |= aci_avoid_equal;           // >=
00113   TokenInfo[tok::pipe        ] |= aci_avoid_equal;           // |=
00114   TokenInfo[tok::percent     ] |= aci_avoid_equal;           // %=
00115   TokenInfo[tok::star        ] |= aci_avoid_equal;           // *=
00116   TokenInfo[tok::exclaim     ] |= aci_avoid_equal;           // !=
00117   TokenInfo[tok::lessless    ] |= aci_avoid_equal;           // <<=
00118   TokenInfo[tok::greatergreater] |= aci_avoid_equal;         // >>=
00119   TokenInfo[tok::caret       ] |= aci_avoid_equal;           // ^=
00120   TokenInfo[tok::equal       ] |= aci_avoid_equal;           // ==
00121 }
00122 
00123 /// GetFirstChar - Get the first character of the token \arg Tok,
00124 /// avoiding calls to getSpelling where possible.
00125 static char GetFirstChar(Preprocessor &PP, const Token &Tok) {
00126   if (IdentifierInfo *II = Tok.getIdentifierInfo()) {
00127     // Avoid spelling identifiers, the most common form of token.
00128     return II->getNameStart()[0];
00129   } else if (!Tok.needsCleaning()) {
00130     if (Tok.isLiteral() && Tok.getLiteralData()) {
00131       return *Tok.getLiteralData();
00132     } else {
00133       SourceManager &SM = PP.getSourceManager();
00134       return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
00135     }
00136   } else if (Tok.getLength() < 256) {
00137     char Buffer[256];
00138     const char *TokPtr = Buffer;
00139     PP.getSpelling(Tok, TokPtr);
00140     return TokPtr[0];
00141   } else {
00142     return PP.getSpelling(Tok)[0];
00143   }
00144 }
00145 
00146 /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause
00147 /// the two individual tokens to be lexed as a single token, return true
00148 /// (which causes a space to be printed between them).  This allows the output
00149 /// of -E mode to be lexed to the same token stream as lexing the input
00150 /// directly would.
00151 ///
00152 /// This code must conservatively return true if it doesn't want to be 100%
00153 /// accurate.  This will cause the output to include extra space characters,
00154 /// but the resulting output won't have incorrect concatenations going on.
00155 /// Examples include "..", which we print with a space between, because we
00156 /// don't want to track enough to tell "x.." from "...".
00157 bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok,
00158                                      const Token &PrevTok,
00159                                      const Token &Tok) const {
00160   // First, check to see if the tokens were directly adjacent in the original
00161   // source.  If they were, it must be okay to stick them together: if there
00162   // were an issue, the tokens would have been lexed differently.
00163   SourceManager &SM = PP.getSourceManager();
00164   SourceLocation PrevSpellLoc = SM.getSpellingLoc(PrevTok.getLocation());
00165   SourceLocation SpellLoc = SM.getSpellingLoc(Tok.getLocation());
00166   if (PrevSpellLoc.getLocWithOffset(PrevTok.getLength()) == SpellLoc)
00167     return false;
00168 
00169   tok::TokenKind PrevKind = PrevTok.getKind();
00170   if (!PrevTok.isAnnotation() && PrevTok.getIdentifierInfo())
00171     PrevKind = tok::identifier; // Language keyword or named operator.
00172 
00173   // Look up information on when we should avoid concatenation with prevtok.
00174   unsigned ConcatInfo = TokenInfo[PrevKind];
00175 
00176   // If prevtok never causes a problem for anything after it, return quickly.
00177   if (ConcatInfo == 0) return false;
00178 
00179   if (ConcatInfo & aci_avoid_equal) {
00180     // If the next token is '=' or '==', avoid concatenation.
00181     if (Tok.is(tok::equal) || Tok.is(tok::equalequal))
00182       return true;
00183     ConcatInfo &= ~aci_avoid_equal;
00184   }
00185   if (Tok.isAnnotation()) {
00186     // Modules annotation can show up when generated automatically for includes.
00187     assert((Tok.is(tok::annot_module_include) ||
00188             Tok.is(tok::annot_module_begin) ||
00189             Tok.is(tok::annot_module_end)) &&
00190            "unexpected annotation in AvoidConcat");
00191     ConcatInfo = 0;
00192   }
00193 
00194   if (ConcatInfo == 0) return false;
00195 
00196   // Basic algorithm: we look at the first character of the second token, and
00197   // determine whether it, if appended to the first token, would form (or
00198   // would contribute) to a larger token if concatenated.
00199   char FirstChar = 0;
00200   if (ConcatInfo & aci_custom) {
00201     // If the token does not need to know the first character, don't get it.
00202   } else {
00203     FirstChar = GetFirstChar(PP, Tok);
00204   }
00205 
00206   switch (PrevKind) {
00207   default:
00208     llvm_unreachable("InitAvoidConcatTokenInfo built wrong");
00209 
00210   case tok::raw_identifier:
00211     llvm_unreachable("tok::raw_identifier in non-raw lexing mode!");
00212 
00213   case tok::string_literal:
00214   case tok::wide_string_literal:
00215   case tok::utf8_string_literal:
00216   case tok::utf16_string_literal:
00217   case tok::utf32_string_literal:
00218   case tok::char_constant:
00219   case tok::wide_char_constant:
00220   case tok::utf8_char_constant:
00221   case tok::utf16_char_constant:
00222   case tok::utf32_char_constant:
00223     if (!PP.getLangOpts().CPlusPlus11)
00224       return false;
00225 
00226     // In C++11, a string or character literal followed by an identifier is a
00227     // single token.
00228     if (Tok.getIdentifierInfo())
00229       return true;
00230 
00231     // A ud-suffix is an identifier. If the previous token ends with one, treat
00232     // it as an identifier.
00233     if (!PrevTok.hasUDSuffix())
00234       return false;
00235     // FALL THROUGH.
00236   case tok::identifier:   // id+id or id+number or id+L"foo".
00237     // id+'.'... will not append.
00238     if (Tok.is(tok::numeric_constant))
00239       return GetFirstChar(PP, Tok) != '.';
00240 
00241     if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) ||
00242         Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) ||
00243         Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) ||
00244         Tok.is(tok::utf8_char_constant) || Tok.is(tok::utf16_char_constant) ||
00245         Tok.is(tok::utf32_char_constant))
00246       return true;
00247 
00248     // If this isn't identifier + string, we're done.
00249     if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
00250       return false;
00251 
00252     // Otherwise, this is a narrow character or string.  If the *identifier*
00253     // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo".
00254     return IsIdentifierStringPrefix(PrevTok);
00255 
00256   case tok::numeric_constant:
00257     return isPreprocessingNumberBody(FirstChar) ||
00258            FirstChar == '+' || FirstChar == '-';
00259   case tok::period:          // ..., .*, .1234
00260     return (FirstChar == '.' && PrevPrevTok.is(tok::period)) ||
00261            isDigit(FirstChar) ||
00262            (PP.getLangOpts().CPlusPlus && FirstChar == '*');
00263   case tok::amp:             // &&
00264     return FirstChar == '&';
00265   case tok::plus:            // ++
00266     return FirstChar == '+';
00267   case tok::minus:           // --, ->, ->*
00268     return FirstChar == '-' || FirstChar == '>';
00269   case tok::slash:           //, /*, //
00270     return FirstChar == '*' || FirstChar == '/';
00271   case tok::less:            // <<, <<=, <:, <%
00272     return FirstChar == '<' || FirstChar == ':' || FirstChar == '%';
00273   case tok::greater:         // >>, >>=
00274     return FirstChar == '>';
00275   case tok::pipe:            // ||
00276     return FirstChar == '|';
00277   case tok::percent:         // %>, %:
00278     return FirstChar == '>' || FirstChar == ':';
00279   case tok::colon:           // ::, :>
00280     return FirstChar == '>' ||
00281     (PP.getLangOpts().CPlusPlus && FirstChar == ':');
00282   case tok::hash:            // ##, #@, %:%:
00283     return FirstChar == '#' || FirstChar == '@' || FirstChar == '%';
00284   case tok::arrow:           // ->*
00285     return PP.getLangOpts().CPlusPlus && FirstChar == '*';
00286   }
00287 }