//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Lexer.h"
#include "UnicodeCharSets.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Lex/Preprocessor.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstring>
using namespace clang;

//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//

/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  if (IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}

/// getObjCKeywordID - Return the ObjC keyword kind.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}


//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//

void Lexer::anchor() { }

void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
      .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;
  IsAtPhysicalStartOfLine = true;

  HasLeadingSpace = false;
  HasLeadingEmptyMacro = false;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0"
  // block or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process.  This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
  : PreprocessorLexer(&PP, FID),
    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
    LangOpts(PP.getLangOpts()) {

  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
            InputFile->getBufferEnd());

  resetExtendedTokenMode();
}

void Lexer::resetExtendedTokenMode() {
  assert(PP && "Cannot reset token mode without a preprocessor");
  if (LangOpts.TraditionalCPP)
    SetKeepWhitespaceMode(true);
  else
    SetCommentRetentionState(PP->getCommentRetentionState());
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd)
  : FileLoc(fileloc), LangOpts(langOpts) {

  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
             const SourceManager &SM, const LangOptions &langOpts)
  : FileLoc(SM.getLocForStartOfFile(FID)), LangOpts(langOpts) {

  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
            FromFile->getBufferEnd());

  // We *are* in raw mode.
  LexingRawMode = true;
}
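
// Illustrative usage sketch (not part of the original Lexer.cpp): driving a
// raw lexer over a file buffer.  Raw mode performs no macro expansion and
// emits no diagnostics; identifiers come back as tok::raw_identifier.  The
// helper name rawLexExample is hypothetical.
static void rawLexExample(const SourceManager &SM, FileID FID,
                          const LangOptions &LangOpts) {
  const llvm::MemoryBuffer *Buf = SM.getBuffer(FID);
  Lexer RawLex(FID, Buf, SM, LangOpts);
  Token Tok;
  do {
    RawLex.LexFromRawLexer(Tok);
    // Inspect Tok.getKind(), Tok.getLocation(), Tok.getLength() here.
  } while (Tok.isNot(tok::eof));
}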
/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion.  This has a variety of magic semantics that this method
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
///
/// On entrance to this routine, TokStartLoc is a macro location which has a
/// spelling loc that indicates the bytes to be lexed for the token and an
/// expansion location that indicates where all lexed tokens should be
/// "expanded from".
///
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
/// normal lexer that remaps tokens as they fly by.  This would require making
/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
                                 SourceLocation ExpansionLocStart,
                                 SourceLocation ExpansionLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {
  SourceManager &SM = PP.getSourceManager();

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want.  This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData+TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information.  This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
                                     ExpansionLocStart,
                                     ExpansionLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOD token.
  L->ParsingPreprocessorDirective = true;

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}


/// Stringify - Convert the specified string into a C string, with surrounding
/// ""'s, and with escaped \ and " characters.
std::string Lexer::Stringify(const std::string &Str, bool Charify) {
  std::string Result = Str;
  char Quote = Charify ? '\'' : '"';
  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
    if (Result[i] == '\\' || Result[i] == Quote) {
      Result.insert(Result.begin()+i, '\\');
      ++i; ++e;
    }
  }
  return Result;
}

/// Stringify - Convert the specified string into a C string by escaping '\'
/// and " characters.  This does not add surrounding ""'s to the string.
void Lexer::Stringify(SmallVectorImpl<char> &Str) {
  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
    if (Str[i] == '\\' || Str[i] == '"') {
      Str.insert(Str.begin()+i, '\\');
      ++i; ++e;
    }
  }
}
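
// Illustrative sketch (not part of the original Lexer.cpp): the escaping
// behavior of Lexer::Stringify.  Given the text  a\b"c  it produces  a\\b\"c ,
// suitable for embedding in a string literal (no surrounding quotes are
// added).  The helper name stringifyExample is hypothetical.
static void stringifyExample() {
  std::string Escaped = Lexer::Stringify(std::string("a\\b\"c"),
                                         /*Charify=*/false);
  assert(Escaped == "a\\\\b\\\"c" && "backslash and quote are escaped");
  (void)Escaped;
}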
//===----------------------------------------------------------------------===//
// Token Spelling
//===----------------------------------------------------------------------===//

/// \brief Slow case of getSpelling. Extract the characters comprising the
/// spelling of this token from the provided input buffer.
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
                              const LangOptions &LangOpts, char *Spelling) {
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");

  size_t Length = 0;
  const char *BufEnd = BufPtr + Tok.getLength();

  if (Tok.is(tok::string_literal)) {
    // Munch the encoding-prefix and opening double-quote.
    while (BufPtr < BufEnd) {
      unsigned Size;
      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
      BufPtr += Size;

      if (Spelling[Length - 1] == '"')
        break;
    }

    // Raw string literals need special handling; trigraph expansion and line
    // splicing do not occur within their d-char-sequence nor within their
    // r-char-sequence.
    if (Length >= 2 &&
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
      // Search backwards from the end of the token to find the matching
      // closing quote.
      const char *RawEnd = BufEnd;
      do --RawEnd; while (*RawEnd != '"');
      size_t RawLength = RawEnd - BufPtr + 1;

      // Everything between the quotes is included verbatim in the spelling.
      memcpy(Spelling + Length, BufPtr, RawLength);
      Length += RawLength;
      BufPtr += RawLength;

      // The rest of the token is lexed normally.
    }
  }

  while (BufPtr < BufEnd) {
    unsigned Size;
    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
    BufPtr += Size;
  }

  assert(Length < Tok.getLength() &&
         "NeedsCleaning flag set on token that didn't need cleaning!");
  return Length;
}

/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token is the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs,
/// UCNs, etc.
StringRef Lexer::getSpelling(SourceLocation loc,
                             SmallVectorImpl<char> &buffer,
                             const SourceManager &SM,
                             const LangOptions &options,
                             bool *invalid) {
  // Break down the source location.
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return StringRef();
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case: no need for cleaning.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string.
  buffer.resize(length);
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
  return StringRef(buffer.data(), buffer.size());
}
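
// Illustrative usage sketch (not part of the original Lexer.cpp): retrieving
// a token's spelling at a known location.  The buffer is only written to in
// the rare case where the token needs cleaning (trigraphs or escaped
// newlines).  The helper name spellingAtExample is hypothetical.
static StringRef spellingAtExample(SourceLocation Loc, const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  SmallVector<char, 64> Buffer;
  bool Invalid = false;
  StringRef Spelling = Lexer::getSpelling(Loc, Buffer, SM, LangOpts, &Invalid);
  return Invalid ? StringRef() : Spelling;
}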
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token is the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs,
/// UCNs, etc.
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                               const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  bool CharDataInvalid = false;
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                    &CharDataInvalid);
  if (Invalid)
    *Invalid = CharDataInvalid;
  if (CharDataInvalid)
    return std::string();

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning())
    return std::string(TokStart, TokStart + Tok.getLength());

  std::string Result;
  Result.resize(Tok.getLength());
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
  return Result;
}

/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string.  The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long.  The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy).  The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}
/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
                                   const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  Token TheTok;
  if (getRawToken(Loc, TheTok, SM, LangOpts))
    return 0;
  return TheTok.getLength();
}

/// \brief Relex the token at the specified location.
/// \returns true if there was a failure, false on success.
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
                        const SourceManager &SM,
                        const LangOptions &LangOpts,
                        bool IgnoreWhiteSpace) {
  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
  // all obviously single-char tokens.  This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.

  // If this comes from a macro expansion, we really do want the macro name,
  // not the token this macro expanded to.
  Loc = SM.getExpansionLoc(Loc);
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return true;

  const char *StrData = Buffer.data()+LocInfo.second;

  if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
    return true;

  // Create a lexer starting at the beginning of this token.
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
                 Buffer.begin(), StrData, Buffer.end());
  TheLexer.SetCommentRetentionState(true);
  TheLexer.LexFromRawLexer(Result);
  return false;
}
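
// Illustrative usage sketch (not part of the original Lexer.cpp): computing
// the character range covered by the token starting at Loc, e.g. when
// building a fix-it hint.  The helper name tokenRangeExample is hypothetical.
static CharSourceRange tokenRangeExample(SourceLocation Loc,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  return CharSourceRange::getCharRange(Loc, Loc.getLocWithOffset(Len));
}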
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *BufStart = Buffer.data();
  if (LocInfo.second >= Buffer.size())
    return Loc;

  const char *StrData = BufStart+LocInfo.second;
  if (StrData[0] == '\n' || StrData[0] == '\r')
    return Loc;

  const char *LexStart = StrData;
  while (LexStart != BufStart) {
    if (LexStart[0] == '\n' || LexStart[0] == '\r') {
      ++LexStart;
      break;
    }

    --LexStart;
  }

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source
  // location.
  return Loc;
}

SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo
      = SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}

namespace {
  enum PreambleDirectiveKind {
    PDK_Skipped,
    PDK_StartIf,
    PDK_EndIf,
    PDK_Unknown
  };
}

std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
                                                 const LangOptions &LangOpts,
                                                 unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const unsigned StartOffset = 1;
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // StartLoc will differ from FileLoc if there is a BOM that was skipped.
  SourceLocation StartLoc = TheLexer.getSourceLocation();

  bool InPreprocessorDirective = false;
  Token TheTok;
  Token IfStartTok;
  unsigned IfCount = 0;
  SourceLocation ActiveCommentLoc;

  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor
      // directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_StartIf)
              .Case("ifdef", PDK_StartIf)
              .Case("ifndef", PDK_StartIf)
              .Case("elif", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_EndIf)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_StartIf:
          if (IfCount == 0)
            IfStartTok = HashTok;

          ++IfCount;
          continue;

        case PDK_EndIf:
          // Mismatched #endif. The preamble ends here.
          if (IfCount == 0)
            break;

          --IfCount;
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      InPreprocessorDirective = false;
      TheTok = HashTok;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (IfCount)
    End = IfStartTok.getLocation();
  else if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
                        IfCount? IfStartTok.isAtStartOfLine()
                               : TheTok.isAtStartOfLine());
}
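
// Illustrative usage sketch (not part of the original Lexer.cpp): measuring a
// precompiled preamble.  The returned pair is (size of the preamble in bytes,
// whether the preamble ends at the start of a line).  The helper name
// preambleExample is hypothetical.
static void preambleExample(StringRef FileContents,
                            const LangOptions &LangOpts) {
  std::pair<unsigned, bool> Preamble =
      Lexer::ComputePreamble(FileContents, LangOpts, /*MaxLines=*/0);
  StringRef PreambleText = FileContents.substr(0, Preamble.first);
  (void)PreambleText;
}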
/// AdvanceToTokenCharacter - Given a location that specifies the start of a
/// token, return a new location that specifies a character within the token.
SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
                                              unsigned CharNo,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is.  This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return TokStart;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting.  Skip
  // over the uninteresting characters.  If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return TokStart.getLocWithOffset(PhysOffset);
    ++TokPtr, --CharNo, ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    unsigned Size;
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
    TokPtr += Size;
    PhysOffset += Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token.  For example foo<newline>bar
  // advanced by 3 should return the location of b, not of \\.  One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return TokStart.getLocWithOffset(PhysOffset);
}

/// \brief Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different from what it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return SourceLocation();

  if (Loc.isMacroID()) {
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return SourceLocation(); // Points inside the macro expansion.
  }

  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}
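
// Illustrative usage sketch (not part of the original Lexer.cpp): the classic
// use of getLocForEndOfToken is inserting text right after a token, e.g. a
// missing semicolon after the token at ExprEnd.  The helper name
// insertionAfterTokenExample is hypothetical.
static FixItHint insertionAfterTokenExample(SourceLocation ExprEnd,
                                            const SourceManager &SM,
                                            const LangOptions &LangOpts) {
  SourceLocation AfterTok = Lexer::getLocForEndOfToken(ExprEnd, /*Offset=*/0,
                                                       SM, LangOpts);
  if (AfterTok.isInvalid())
    return FixItHint(); // e.g. the token ends inside a macro body.
  return FixItHint::CreateInsertion(AfterTok, ";");
}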
/// \brief Returns true if the given MacroID location points at the first
/// token of the macro expansion.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation expansionLoc;
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}

/// \brief Returns true if the given MacroID location points at the last
/// token of the macro expansion.
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
                                    const SourceManager &SM,
                                    const LangOptions &LangOpts,
                                    SourceLocation *MacroEnd) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation spellLoc = SM.getSpellingLoc(loc);
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
  if (tokLen == 0)
    return false;

  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
  SourceLocation expansionLoc;
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions.
    if (MacroEnd)
      *MacroEnd = expansionLoc;
    return true;
  }

  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
}
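
// Illustrative sketch (not part of the original Lexer.cpp): deciding whether
// a token range written entirely inside a macro expansion covers the full
// expansion, in which case it can safely be rewritten at the expansion site.
// The helper name coversFullMacroExpansionExample is hypothetical.
static bool coversFullMacroExpansionExample(SourceLocation Begin,
                                            SourceLocation End,
                                            const SourceManager &SM,
                                            const LangOptions &LangOpts) {
  SourceLocation MacroBegin, MacroEnd;
  return Begin.isMacroID() && End.isMacroID() &&
         Lexer::isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
         Lexer::isAtEndOfMacroExpansion(End, SM, LangOpts, &MacroEnd);
}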
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  assert(Begin.isFileID() && End.isFileID());
  if (Range.isTokenRange()) {
    End = Lexer::getLocForEndOfToken(End, 0, SM, LangOpts);
    if (End.isInvalid())
      return CharSourceRange();
  }

  // Break down the source locations.
  FileID FID;
  unsigned BeginOffs;
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return CharSourceRange();

  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return CharSourceRange();

  return CharSourceRange::getCharRange(Begin, End);
}

CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return CharSourceRange();

  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return CharSourceRange();
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  if (Begin.isFileID() && End.isMacroID()) {
    if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                          &End)) ||
        (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                           &End)))
      return CharSourceRange();
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return CharSourceRange();

  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                        &Invalid);
    if (Invalid)
      return CharSourceRange();

    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return CharSourceRange();
}
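
// Illustrative usage sketch (not part of the original Lexer.cpp): grabbing
// the source text behind an arbitrary (possibly macro-crossing) token range,
// as a rewriter or a lint check would.  The helper name sourceTextExample is
// hypothetical.
static StringRef sourceTextExample(SourceRange R, const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  bool Invalid = false;
  StringRef Text = Lexer::getSourceText(CharSourceRange::getTokenRange(R),
                                        SM, LangOpts, &Invalid);
  return Invalid ? StringRef() : Text;
}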
StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}

StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (1) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).first;
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of
  // the name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
  return isIdentifierBody(c, LangOpts.DollarIdents);
}
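
// Illustrative usage sketch (not part of the original Lexer.cpp): naming the
// macro responsible for a diagnostic location, the way diagnostic notes do
// ("expanded from macro 'X'").  The helper name noteMacroNameExample is
// hypothetical.
static void noteMacroNameExample(SourceLocation Loc, const SourceManager &SM,
                                 const LangOptions &LangOpts) {
  if (Loc.isMacroID()) {
    StringRef MacroName = Lexer::getImmediateMacroName(Loc, SM, LangOpts);
    (void)MacroName; // e.g. render into "expanded from macro '<name>'".
  }
}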
//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//

/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend
/// the lexer buffer was all expanded at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be a macro expansion");

  // Otherwise, we're lexing "mapped tokens".  This is used for things like
  // _Pragma handling.  Combine the expansion location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  std::pair<SourceLocation,SourceLocation> II =
    SM.getImmediateExpansionRange(FileLoc);

  return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics.  This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  char Res = GetTrigraphCharForLetter(*CP);
  if (!Res || !L) return Res;

  if (!L->getLangOpts().Trigraphs) {
    if (!L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    return 0;
  }

  if (!L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}
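
// Illustrative sketch (not part of the original Lexer.cpp): the trigraph
// mapping above decodes, e.g., "??=" to '#' and "??/" to '\'.  Sequences that
// are not trigraphs yield 0.  The helper name trigraphExample is hypothetical.
static void trigraphExample() {
  assert(GetTrigraphCharForLetter('=') == '#');
  assert(GetTrigraphCharForLetter('/') == '\\');
  assert(GetTrigraphCharForLetter('x') == 0 && "??x is not a trigraph");
}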
/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline.  P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (1) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}
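
// Illustrative sketch (not part of the original Lexer.cpp): given the text
// "\<newline>  x", SkipEscapedNewLines starting at the backslash skips the
// escaped newline and stops at the first character that does not begin
// another escape (here, a plain space; ordinary whitespace is not skipped).
// The helper name skipEscapedNewLinesExample is hypothetical.
static void skipEscapedNewLinesExample() {
  const char *Text = "\\\n  x";
  const char *P = Lexer::SkipEscapedNewLines(Text);
  assert(P == Text + 2 && "points just past the backslash-newline pair");
  (void)P;
}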
/// \brief Checks that the given token is the first token that occurs after
/// the given location (this excludes comments and whitespace). Returns the
/// location immediately after the specified token. If the token is not found
/// or the location is inside a macro, the returned source location will be
/// invalid.
SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
                                        tok::TokenKind TKind,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts,
                                        bool SkipTrailingWhitespaceAndNewLine) {
  if (Loc.isMacroID()) {
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return SourceLocation();
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return SourceLocation();

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  if (Tok.isNot(TKind))
    return SourceLocation();
  SourceLocation TokenLoc = Tok.getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) +
                           Tok.getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
}

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
  Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // If the char that we finally got was a \n, then we must have had
      // something like <newline><newline>.  We don't want to consume the
      // second newline.
      if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
        return ' ';

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}


/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into
/// Size, and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &LangOpts) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
  Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // If the char that we finally got was a \n, then we must have had
      // something like <newline><newline>.  We don't want to consume the
      // second newline.
      if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
        return ' ';

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
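
// Illustrative sketch (not part of the original Lexer.cpp):
// getCharAndSizeNoWarn folds escaped newlines and (if enabled) trigraphs into
// a single logical character.  For the text "a\<newline>b" it returns 'b'
// with Size == 3 (backslash, newline, 'b') when peeking at offset 1.  The
// helper name getCharAndSizeExample is hypothetical.
static void getCharAndSizeExample(const LangOptions &LangOpts) {
  const char *Text = "a\\\nb";
  unsigned Size = 0;
  char C = Lexer::getCharAndSizeNoWarn(Text + 1, Size, LangOpts);
  assert(C == 'b' && Size == 3 && "escaped newline folded into 'b'");
  (void)C;
}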
//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// \brief Routine that indiscriminately skips bytes in the source file.
void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
  BufferPtr += Bytes;
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  // FIXME: What exactly does the StartOfLine bit mean?  There are two
  // possible meanings for the "start" of the line: the first token on the
  // unexpanded line, or the first token on the expanded line.
  IsAtStartOfLine = StartOfLine;
  IsAtPhysicalStartOfLine = StartOfLine;
}

static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
  if (LangOpts.CPlusPlus11 || LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
        C11AllowedIDCharRanges);
    return C11AllowedIDChars.contains(C);
  } else if (LangOpts.CPlusPlus) {
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
        CXX03AllowedIDCharRanges);
    return CXX03AllowedIDChars.contains(C);
  } else {
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    return C99AllowedIDChars.contains(C);
  }
}

static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
  assert(isAllowedIDChar(C, LangOpts));
  if (LangOpts.CPlusPlus11 || LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
        C11DisallowedInitialIDCharRanges);
    return !C11DisallowedInitialIDChars.contains(C);
  } else if (LangOpts.CPlusPlus) {
    return true;
  } else {
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    return !C99DisallowedInitialIDChars.contains(C);
  }
}

static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                            const char *End) {
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
                                       L.getSourceLocation(End));
}

static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
        << Range
        << CannotStartIdentifier;
    }
  }

  // Check C++98 compatibility.
  if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
        CXX03AllowedIDCharRanges);
    if (!CXX03AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
        << Range;
    }
  }
}

bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);

  Result.setFlag(Token::HasUCN);
  if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  const char *UnicodePtr = CurPtr;
  UTF32 CodePoint;
  ConversionResult Result =
      llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
                                (const UTF8 *)BufferEnd,
                                &CodePoint,
                                strictConversion);
  if (Result != conversionOK ||
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
    return false;

  if (!isLexingRawMode())
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);

  CurPtr = UnicodePtr;
  return true;
}

bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into an InfoTable flag to make the
  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return true;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      return PP->HandleIdentifier(Result);

    return true;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
01521 if (!LangOpts.DollarIdents) goto FinishIdentifier;
01522 
01523 // Otherwise, emit a diagnostic and continue.
01524 if (!isLexingRawMode())
01525 Diag(CurPtr, diag::ext_dollar_in_identifier);
01526 CurPtr = ConsumeChar(CurPtr, Size, Result);
01527 C = getCharAndSize(CurPtr, Size);
01528 continue;
01529 
01530 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
01531 C = getCharAndSize(CurPtr, Size);
01532 continue;
01533 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
01534 C = getCharAndSize(CurPtr, Size);
01535 continue;
01536 } else if (!isIdentifierBody(C)) {
01537 goto FinishIdentifier;
01538 }
01539 
01540 // Otherwise, this character is good, consume it.
01541 CurPtr = ConsumeChar(CurPtr, Size, Result);
01542 
01543 C = getCharAndSize(CurPtr, Size);
01544 while (isIdentifierBody(C)) {
01545 CurPtr = ConsumeChar(CurPtr, Size, Result);
01546 C = getCharAndSize(CurPtr, Size);
01547 }
01548 }
01549 }
01550 
01551 /// isHexaLiteral - Return true if Start points to a hex constant, even in
01552 /// Microsoft mode (where this is supposed to be several different tokens).
01553 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
01554 unsigned Size;
01555 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
01556 if (C1 != '0')
01557 return false;
01558 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
01559 return (C2 == 'x' || C2 == 'X');
01560 }
01561 
01562 /// LexNumericConstant - Lex the remainder of an integer or floating point
01563 /// constant. From[-1] is the first character lexed. Return the end of the
01564 /// constant.
01565 bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
01566 unsigned Size;
01567 char C = getCharAndSize(CurPtr, Size);
01568 char PrevCh = 0;
01569 while (isPreprocessingNumberBody(C)) {
01570 CurPtr = ConsumeChar(CurPtr, Size, Result);
01571 PrevCh = C;
01572 C = getCharAndSize(CurPtr, Size);
01573 }
01574 
01575 // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
01576 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
01577 // If we are in Microsoft mode, don't continue if the constant is hex.
01578 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
01579 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
01580 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
01581 }
01582 
01583 // If we have a hex FP constant, continue.
01584 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
01585 // Outside C99, we accept hexadecimal floating point numbers as a
01586 // not-quite-conforming extension. Only do so if this looks like it's
01587 // actually meant to be a hexfloat, and not if it has a ud-suffix.
01588 bool IsHexFloat = true;
01589 if (!LangOpts.C99) {
01590 if (!isHexaLiteral(BufferPtr, LangOpts))
01591 IsHexFloat = false;
01592 else if (std::find(BufferPtr, CurPtr, '_') != CurPtr)
01593 IsHexFloat = false;
01594 }
01595 if (IsHexFloat)
01596 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
01597 }
01598 
01599 // If we have a digit separator, continue.
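// [Editor's sketch] The pp-number rule being applied above, in miniature: a
// preprocessing number keeps absorbing identifier-ish characters and '.',
// and takes '+'/'-' only right after an exponent letter. Standalone toy
// (scanPPNumber is hypothetical; the real loop also handles the digit
// separators treated just below, UCNs, and the MS hex quirk shown above):
#include <cctype>
static const char *scanPPNumber(const char *P) {
  ++P;                                       // caller matched the first digit
  while (true) {
    const char C = *P;
    if (std::isalnum(static_cast<unsigned char>(C)) || C == '.' || C == '_')
      ++P;                                   // 123, 1.5f, 0x1p3 keep going
    else if ((C == '+' || C == '-') &&
             (P[-1] == 'e' || P[-1] == 'E' || P[-1] == 'p' || P[-1] == 'P'))
      ++P;                                   // 1e+12 and 0x1.8p-3 continue
    else
      return P;                              // anything else ends the number
  }
}
// scanPPNumber("1e+12;") stops at ';'; scanPPNumber("1+2") stops at '+'.
// [End sketch]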
01600 if (C == '\'' && getLangOpts().CPlusPlus14) { 01601 unsigned NextSize; 01602 char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts()); 01603 if (isIdentifierBody(Next)) { 01604 if (!isLexingRawMode()) 01605 Diag(CurPtr, diag::warn_cxx11_compat_digit_separator); 01606 CurPtr = ConsumeChar(CurPtr, Size, Result); 01607 CurPtr = ConsumeChar(CurPtr, NextSize, Result); 01608 return LexNumericConstant(Result, CurPtr); 01609 } 01610 } 01611 01612 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 01613 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 01614 return LexNumericConstant(Result, CurPtr); 01615 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 01616 return LexNumericConstant(Result, CurPtr); 01617 01618 // Update the location of token as well as BufferPtr. 01619 const char *TokStart = BufferPtr; 01620 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 01621 Result.setLiteralData(TokStart); 01622 return true; 01623 } 01624 01625 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 01626 /// in C++11, or warn on a ud-suffix in C++98. 01627 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 01628 bool IsStringLiteral) { 01629 assert(getLangOpts().CPlusPlus); 01630 01631 // Maximally munch an identifier. 01632 unsigned Size; 01633 char C = getCharAndSize(CurPtr, Size); 01634 bool Consumed = false; 01635 01636 if (!isIdentifierHead(C)) { 01637 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 01638 Consumed = true; 01639 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 01640 Consumed = true; 01641 else 01642 return CurPtr; 01643 } 01644 01645 if (!getLangOpts().CPlusPlus11) { 01646 if (!isLexingRawMode()) 01647 Diag(CurPtr, 01648 C == '_' ? diag::warn_cxx11_compat_user_defined_literal 01649 : diag::warn_cxx11_compat_reserved_user_defined_literal) 01650 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 01651 return CurPtr; 01652 } 01653 01654 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 01655 // that does not start with an underscore is ill-formed. As a conforming 01656 // extension, we treat all such suffixes as if they had whitespace before 01657 // them. We assume a suffix beginning with a UCN or UTF-8 character is more 01658 // likely to be a ud-suffix than a macro, however, and accept that. 01659 if (!Consumed) { 01660 bool IsUDSuffix = false; 01661 if (C == '_') 01662 IsUDSuffix = true; 01663 else if (IsStringLiteral && getLangOpts().CPlusPlus14) { 01664 // In C++1y, we need to look ahead a few characters to see if this is a 01665 // valid suffix for a string literal or a numeric literal (this could be 01666 // the 'operator""if' defining a numeric literal operator). 01667 const unsigned MaxStandardSuffixLength = 3; 01668 char Buffer[MaxStandardSuffixLength] = { C }; 01669 unsigned Consumed = Size; 01670 unsigned Chars = 1; 01671 while (true) { 01672 unsigned NextSize; 01673 char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, 01674 getLangOpts()); 01675 if (!isIdentifierBody(Next)) { 01676 // End of suffix. Check whether this is on the whitelist. 01677 IsUDSuffix = (Chars == 1 && Buffer[0] == 's') || 01678 NumericLiteralParser::isValidUDSuffix( 01679 getLangOpts(), StringRef(Buffer, Chars)); 01680 break; 01681 } 01682 01683 if (Chars == MaxStandardSuffixLength) 01684 // Too long: can't be a standard suffix. 
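// [Editor's sketch] What the three-character lookahead above is deciding:
// after a C++14 string literal, a short suffix with no leading underscore may
// still be one of the *standard* literal suffixes rather than an ill-formed
// ud-suffix. Toy whitelist check (isStandardSuffixSketch is hypothetical, and
// this list is illustrative, not the exact isValidUDSuffix set):
#include <cstring>
static bool isStandardSuffixSketch(const char *Suffix, unsigned Len) {
  static const char *const Known[] = {"s", "if", "i", "il", "h", "min"};
  for (const char *K : Known)
    if (std::strlen(K) == Len && std::strncmp(K, Suffix, Len) == 0)
      return true;
  return false;
}
// "s"   -> true: operator""s (std::string / std::chrono::seconds)
// "foo" -> false: unknown, diagnosed as a reserved ud-suffix
// [End sketch]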
01685 break; 01686 01687 Buffer[Chars++] = Next; 01688 Consumed += NextSize; 01689 } 01690 } 01691 01692 if (!IsUDSuffix) { 01693 if (!isLexingRawMode()) 01694 Diag(CurPtr, getLangOpts().MSVCCompat 01695 ? diag::ext_ms_reserved_user_defined_literal 01696 : diag::ext_reserved_user_defined_literal) 01697 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 01698 return CurPtr; 01699 } 01700 01701 CurPtr = ConsumeChar(CurPtr, Size, Result); 01702 } 01703 01704 Result.setFlag(Token::HasUDSuffix); 01705 while (true) { 01706 C = getCharAndSize(CurPtr, Size); 01707 if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); } 01708 else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {} 01709 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {} 01710 else break; 01711 } 01712 01713 return CurPtr; 01714 } 01715 01716 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 01717 /// either " or L" or u8" or u" or U". 01718 bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 01719 tok::TokenKind Kind) { 01720 // Does this string contain the \0 character? 01721 const char *NulCharacter = nullptr; 01722 01723 if (!isLexingRawMode() && 01724 (Kind == tok::utf8_string_literal || 01725 Kind == tok::utf16_string_literal || 01726 Kind == tok::utf32_string_literal)) 01727 Diag(BufferPtr, getLangOpts().CPlusPlus 01728 ? diag::warn_cxx98_compat_unicode_literal 01729 : diag::warn_c99_compat_unicode_literal); 01730 01731 char C = getAndAdvanceChar(CurPtr, Result); 01732 while (C != '"') { 01733 // Skip escaped characters. Escaped newlines will already be processed by 01734 // getAndAdvanceChar. 01735 if (C == '\\') 01736 C = getAndAdvanceChar(CurPtr, Result); 01737 01738 if (C == '\n' || C == '\r' || // Newline. 01739 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 01740 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 01741 Diag(BufferPtr, diag::ext_unterminated_string); 01742 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01743 return true; 01744 } 01745 01746 if (C == 0) { 01747 if (isCodeCompletionPoint(CurPtr-1)) { 01748 PP->CodeCompleteNaturalLanguage(); 01749 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01750 cutOffLexing(); 01751 return true; 01752 } 01753 01754 NulCharacter = CurPtr-1; 01755 } 01756 C = getAndAdvanceChar(CurPtr, Result); 01757 } 01758 01759 // If we are in C++11, lex the optional ud-suffix. 01760 if (getLangOpts().CPlusPlus) 01761 CurPtr = LexUDSuffix(Result, CurPtr, true); 01762 01763 // If a nul character existed in the string, warn about it. 01764 if (NulCharacter && !isLexingRawMode()) 01765 Diag(NulCharacter, diag::null_in_string); 01766 01767 // Update the location of the token as well as the BufferPtr instance var. 01768 const char *TokStart = BufferPtr; 01769 FormTokenWithChars(Result, CurPtr, Kind); 01770 Result.setLiteralData(TokStart); 01771 return true; 01772 } 01773 01774 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 01775 /// having lexed R", LR", u8R", uR", or UR". 01776 bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 01777 tok::TokenKind Kind) { 01778 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 01779 // Between the initial and final double quote characters of the raw string, 01780 // any transformations performed in phases 1 and 2 (trigraphs, 01781 // universal-character-names, and line splicing) are reverted. 
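// [Editor's sketch] The string-literal scanning loop above, reduced to its
// skeleton: walk until an unescaped closing quote, treating a bare newline or
// end of buffer as an unterminated literal (the real loop also funnels bytes
// through getAndAdvanceChar and checks for code-completion points):
static const char *scanStringBody(const char *P, const char *End,
                                  bool &Terminated) {
  while (P != End && *P != '"') {
    if (*P == '\\' && P + 1 != End) { P += 2; continue; }   // skip escape
    if (*P == '\n' || *P == '\r') { Terminated = false; return P; }
    ++P;
  }
  Terminated = (P != End);
  return Terminated ? P + 1 : P;   // one past the closing quote on success
}
// Fed the tail `abc"` it succeeds; fed `abc` + newline it reports
// unterminated, matching the ext_unterminated_string path above.
// [End sketch]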
01782 01783 if (!isLexingRawMode()) 01784 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 01785 01786 unsigned PrefixLen = 0; 01787 01788 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 01789 ++PrefixLen; 01790 01791 // If the last character was not a '(', then we didn't lex a valid delimiter. 01792 if (CurPtr[PrefixLen] != '(') { 01793 if (!isLexingRawMode()) { 01794 const char *PrefixEnd = &CurPtr[PrefixLen]; 01795 if (PrefixLen == 16) { 01796 Diag(PrefixEnd, diag::err_raw_delim_too_long); 01797 } else { 01798 Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 01799 << StringRef(PrefixEnd, 1); 01800 } 01801 } 01802 01803 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 01804 // it's possible the '"' was intended to be part of the raw string, but 01805 // there's not much we can do about that. 01806 while (1) { 01807 char C = *CurPtr++; 01808 01809 if (C == '"') 01810 break; 01811 if (C == 0 && CurPtr-1 == BufferEnd) { 01812 --CurPtr; 01813 break; 01814 } 01815 } 01816 01817 FormTokenWithChars(Result, CurPtr, tok::unknown); 01818 return true; 01819 } 01820 01821 // Save prefix and move CurPtr past it 01822 const char *Prefix = CurPtr; 01823 CurPtr += PrefixLen + 1; // skip over prefix and '(' 01824 01825 while (1) { 01826 char C = *CurPtr++; 01827 01828 if (C == ')') { 01829 // Check for prefix match and closing quote. 01830 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 01831 CurPtr += PrefixLen + 1; // skip over prefix and '"' 01832 break; 01833 } 01834 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 01835 if (!isLexingRawMode()) 01836 Diag(BufferPtr, diag::err_unterminated_raw_string) 01837 << StringRef(Prefix, PrefixLen); 01838 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01839 return true; 01840 } 01841 } 01842 01843 // If we are in C++11, lex the optional ud-suffix. 01844 if (getLangOpts().CPlusPlus) 01845 CurPtr = LexUDSuffix(Result, CurPtr, true); 01846 01847 // Update the location of token as well as BufferPtr. 01848 const char *TokStart = BufferPtr; 01849 FormTokenWithChars(Result, CurPtr, Kind); 01850 Result.setLiteralData(TokStart); 01851 return true; 01852 } 01853 01854 /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 01855 /// after having lexed the '<' character. This is used for #include filenames. 01856 bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 01857 // Does this string contain the \0 character? 01858 const char *NulCharacter = nullptr; 01859 const char *AfterLessPos = CurPtr; 01860 char C = getAndAdvanceChar(CurPtr, Result); 01861 while (C != '>') { 01862 // Skip escaped characters. 01863 if (C == '\\') { 01864 // Skip the escaped character. 01865 getAndAdvanceChar(CurPtr, Result); 01866 } else if (C == '\n' || C == '\r' || // Newline. 01867 (C == 0 && (CurPtr-1 == BufferEnd || // End of file. 01868 isCodeCompletionPoint(CurPtr-1)))) { 01869 // If the filename is unterminated, then it must just be a lone < 01870 // character. Return this as such. 01871 FormTokenWithChars(Result, AfterLessPos, tok::less); 01872 return true; 01873 } else if (C == 0) { 01874 NulCharacter = CurPtr-1; 01875 } 01876 C = getAndAdvanceChar(CurPtr, Result); 01877 } 01878 01879 // If a nul character existed in the string, warn about it. 01880 if (NulCharacter && !isLexingRawMode()) 01881 Diag(NulCharacter, diag::null_in_string); 01882 01883 // Update the location of token as well as BufferPtr. 
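// [Editor's sketch] The raw-string close search above as a standalone helper:
// inside R"prefix( ... )prefix", the literal ends only at ')' followed by the
// saved prefix and a '"' (findRawStringEnd is a hypothetical name):
#include <cstring>
static const char *findRawStringEnd(const char *P, const char *End,
                                    const char *Prefix, size_t PrefixLen) {
  for (; P + PrefixLen + 1 < End; ++P)
    if (*P == ')' && std::memcmp(P + 1, Prefix, PrefixLen) == 0 &&
        P[PrefixLen + 1] == '"')
      return P + PrefixLen + 2;   // one past the closing '"'
  return nullptr;                 // unterminated raw string
}
// In R"x(a)y)x" the first ')' is skipped because it is followed by "y", not
// by x"; the scan ends at )x".
// [End sketch]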
01884 const char *TokStart = BufferPtr; 01885 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); 01886 Result.setLiteralData(TokStart); 01887 return true; 01888 } 01889 01890 01891 /// LexCharConstant - Lex the remainder of a character constant, after having 01892 /// lexed either ' or L' or u8' or u' or U'. 01893 bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, 01894 tok::TokenKind Kind) { 01895 // Does this character contain the \0 character? 01896 const char *NulCharacter = nullptr; 01897 01898 if (!isLexingRawMode()) { 01899 if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant) 01900 Diag(BufferPtr, getLangOpts().CPlusPlus 01901 ? diag::warn_cxx98_compat_unicode_literal 01902 : diag::warn_c99_compat_unicode_literal); 01903 else if (Kind == tok::utf8_char_constant) 01904 Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); 01905 } 01906 01907 char C = getAndAdvanceChar(CurPtr, Result); 01908 if (C == '\'') { 01909 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 01910 Diag(BufferPtr, diag::ext_empty_character); 01911 FormTokenWithChars(Result, CurPtr, tok::unknown); 01912 return true; 01913 } 01914 01915 while (C != '\'') { 01916 // Skip escaped characters. 01917 if (C == '\\') 01918 C = getAndAdvanceChar(CurPtr, Result); 01919 01920 if (C == '\n' || C == '\r' || // Newline. 01921 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 01922 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 01923 Diag(BufferPtr, diag::ext_unterminated_char); 01924 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01925 return true; 01926 } 01927 01928 if (C == 0) { 01929 if (isCodeCompletionPoint(CurPtr-1)) { 01930 PP->CodeCompleteNaturalLanguage(); 01931 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 01932 cutOffLexing(); 01933 return true; 01934 } 01935 01936 NulCharacter = CurPtr-1; 01937 } 01938 C = getAndAdvanceChar(CurPtr, Result); 01939 } 01940 01941 // If we are in C++11, lex the optional ud-suffix. 01942 if (getLangOpts().CPlusPlus) 01943 CurPtr = LexUDSuffix(Result, CurPtr, false); 01944 01945 // If a nul character existed in the character, warn about it. 01946 if (NulCharacter && !isLexingRawMode()) 01947 Diag(NulCharacter, diag::null_in_char); 01948 01949 // Update the location of token as well as BufferPtr. 01950 const char *TokStart = BufferPtr; 01951 FormTokenWithChars(Result, CurPtr, Kind); 01952 Result.setLiteralData(TokStart); 01953 return true; 01954 } 01955 01956 /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 01957 /// Update BufferPtr to point to the next non-whitespace character and return. 01958 /// 01959 /// This method forms a token and returns true if KeepWhitespaceMode is enabled. 01960 /// 01961 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, 01962 bool &TokAtPhysicalStartOfLine) { 01963 // Whitespace - Skip it, then return the token after the whitespace. 01964 bool SawNewline = isVerticalWhitespace(CurPtr[-1]); 01965 01966 unsigned char Char = *CurPtr; 01967 01968 // Skip consecutive spaces efficiently. 01969 while (1) { 01970 // Skip horizontal whitespace very aggressively. 01971 while (isHorizontalWhitespace(Char)) 01972 Char = *++CurPtr; 01973 01974 // Otherwise if we have something other than whitespace, we're done. 01975 if (!isVerticalWhitespace(Char)) 01976 break; 01977 01978 if (ParsingPreprocessorDirective) { 01979 // End of preprocessor directive line, let LexTokenInternal handle this. 
01980 BufferPtr = CurPtr;
01981 return false;
01982 }
01983 
01984 // OK, but handle newline.
01985 SawNewline = true;
01986 Char = *++CurPtr;
01987 }
01988 
01989 // If the client wants us to return whitespace, return it now.
01990 if (isKeepWhitespaceMode()) {
01991 FormTokenWithChars(Result, CurPtr, tok::unknown);
01992 if (SawNewline) {
01993 IsAtStartOfLine = true;
01994 IsAtPhysicalStartOfLine = true;
01995 }
01996 // FIXME: The next token will not have LeadingSpace set.
01997 return true;
01998 }
01999 
02000 // If this isn't immediately after a newline, there is leading space.
02001 char PrevChar = CurPtr[-1];
02002 bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
02003 
02004 Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
02005 if (SawNewline) {
02006 Result.setFlag(Token::StartOfLine);
02007 TokAtPhysicalStartOfLine = true;
02008 }
02009 
02010 BufferPtr = CurPtr;
02011 return false;
02012 }
02013 
02014 /// We have just read the // characters from input. Skip until we find the
02015 /// newline character that terminates the comment. Then update BufferPtr and
02016 /// return.
02017 ///
02018 /// If we're in KeepCommentMode or any CommentHandler has inserted
02019 /// some tokens, this will store the first token and return true.
02020 bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
02021 bool &TokAtPhysicalStartOfLine) {
02022 // If Line comments aren't explicitly enabled for this language, emit an
02023 // extension warning.
02024 if (!LangOpts.LineComment && !isLexingRawMode()) {
02025 Diag(BufferPtr, diag::ext_line_comment);
02026 
02027 // Mark them enabled so we only emit one warning for this translation
02028 // unit.
02029 LangOpts.LineComment = true;
02030 }
02031 
02032 // Scan over the body of the comment. The common case, when scanning, is that
02033 // the comment contains normal ascii characters with nothing interesting in
02034 // them. As such, optimize for this case with the inner loop.
02035 char C;
02036 do {
02037 C = *CurPtr;
02038 // Skip over characters in the fast loop.
02039 while (C != 0 && // Potentially EOF.
02040 C != '\n' && C != '\r') // Newline or DOS-style newline.
02041 C = *++CurPtr;
02042 
02043 const char *NextLine = CurPtr;
02044 if (C != 0) {
02045 // We found a newline, see if it's escaped.
02046 const char *EscapePtr = CurPtr-1;
02047 bool HasSpace = false;
02048 while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
02049 --EscapePtr;
02050 HasSpace = true;
02051 }
02052 
02053 if (*EscapePtr == '\\') // Escaped newline.
02054 CurPtr = EscapePtr;
02055 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
02056 EscapePtr[-2] == '?') // Trigraph-escaped newline.
02057 CurPtr = EscapePtr-2;
02058 else
02059 break; // This is a newline, we're done.
02060 
02061 // If there was space between the backslash and newline, warn about it.
02062 if (HasSpace && !isLexingRawMode())
02063 Diag(EscapePtr, diag::backslash_newline_space);
02064 }
02065 
02066 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
02067 // properly decode the character. Read it in raw mode to avoid emitting
02068 // diagnostics about things like trigraphs. If we see an escaped newline,
02069 // we'll handle it below.
02070 const char *OldPtr = CurPtr;
02071 bool OldRawMode = isLexingRawMode();
02072 LexingRawMode = true;
02073 C = getAndAdvanceChar(CurPtr, Result);
02074 LexingRawMode = OldRawMode;
02075 
02076 // If we read only one character, then no special handling is needed.
02077 // We're done and can skip forward to the newline. 02078 if (C != 0 && CurPtr == OldPtr+1) { 02079 CurPtr = NextLine; 02080 break; 02081 } 02082 02083 // If we read multiple characters, and one of those characters was a \r or 02084 // \n, then we had an escaped newline within the comment. Emit diagnostic 02085 // unless the next line is also a // comment. 02086 if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { 02087 for (; OldPtr != CurPtr; ++OldPtr) 02088 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 02089 // Okay, we found a // comment that ends in a newline, if the next 02090 // line is also a // comment, but has spaces, don't emit a diagnostic. 02091 if (isWhitespace(C)) { 02092 const char *ForwardPtr = CurPtr; 02093 while (isWhitespace(*ForwardPtr)) // Skip whitespace. 02094 ++ForwardPtr; 02095 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 02096 break; 02097 } 02098 02099 if (!isLexingRawMode()) 02100 Diag(OldPtr-1, diag::ext_multi_line_line_comment); 02101 break; 02102 } 02103 } 02104 02105 if (CurPtr == BufferEnd+1) { 02106 --CurPtr; 02107 break; 02108 } 02109 02110 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 02111 PP->CodeCompleteNaturalLanguage(); 02112 cutOffLexing(); 02113 return false; 02114 } 02115 02116 } while (C != '\n' && C != '\r'); 02117 02118 // Found but did not consume the newline. Notify comment handlers about the 02119 // comment unless we're in a #if 0 block. 02120 if (PP && !isLexingRawMode() && 02121 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 02122 getSourceLocation(CurPtr)))) { 02123 BufferPtr = CurPtr; 02124 return true; // A token has to be returned. 02125 } 02126 02127 // If we are returning comments as tokens, return this comment as a token. 02128 if (inKeepCommentMode()) 02129 return SaveLineComment(Result, CurPtr); 02130 02131 // If we are inside a preprocessor directive and we see the end of line, 02132 // return immediately, so that the lexer can return this as an EOD token. 02133 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 02134 BufferPtr = CurPtr; 02135 return false; 02136 } 02137 02138 // Otherwise, eat the \n character. We don't care if this is a \n\r or 02139 // \r\n sequence. This is an efficiency hack (because we know the \n can't 02140 // contribute to another token), it isn't needed for correctness. Note that 02141 // this is ok even in KeepWhitespaceMode, because we would have returned the 02142 /// comment above in that mode. 02143 ++CurPtr; 02144 02145 // The next returned token is at the start of the line. 02146 Result.setFlag(Token::StartOfLine); 02147 TokAtPhysicalStartOfLine = true; 02148 // No leading whitespace seen so far. 02149 Result.clearFlag(Token::LeadingSpace); 02150 BufferPtr = CurPtr; 02151 return false; 02152 } 02153 02154 /// If in save-comment mode, package up this Line comment in an appropriate 02155 /// way and return it. 02156 bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) { 02157 // If we're not in a preprocessor directive, just return the // comment 02158 // directly. 02159 FormTokenWithChars(Result, CurPtr, tok::comment); 02160 02161 if (!ParsingPreprocessorDirective || LexingRawMode) 02162 return true; 02163 02164 // If this Line-style comment is in a macro definition, transmogrify it into 02165 // a C-style block comment. 
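// [Editor's sketch] The transmogrification described above, in miniature:
// re-lexed inside a macro definition, a line comment would swallow the rest
// of the directive, so it is rewritten as a block comment. Standalone toy of
// the string surgery performed just below:
#include <string>
static std::string lineToBlockComment(std::string Spelling) {
  // Precondition mirrors the assert below: Spelling starts with "//".
  Spelling[1] = '*';   // "//" -> "/*"
  Spelling += "*/";    // append the terminator
  return Spelling;
}
// lineToBlockComment("// note") == "/* note*/"
// [End sketch]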
02166 bool Invalid = false; 02167 std::string Spelling = PP->getSpelling(Result, &Invalid); 02168 if (Invalid) 02169 return true; 02170 02171 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); 02172 Spelling[1] = '*'; // Change prefix to "/*". 02173 Spelling += "*/"; // add suffix. 02174 02175 Result.setKind(tok::comment); 02176 PP->CreateString(Spelling, Result, 02177 Result.getLocation(), Result.getLocation()); 02178 return true; 02179 } 02180 02181 /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 02182 /// character (either \\n or \\r) is part of an escaped newline sequence. Issue 02183 /// a diagnostic if so. We know that the newline is inside of a block comment. 02184 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, 02185 Lexer *L) { 02186 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 02187 02188 // Back up off the newline. 02189 --CurPtr; 02190 02191 // If this is a two-character newline sequence, skip the other character. 02192 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 02193 // \n\n or \r\r -> not escaped newline. 02194 if (CurPtr[0] == CurPtr[1]) 02195 return false; 02196 // \n\r or \r\n -> skip the newline. 02197 --CurPtr; 02198 } 02199 02200 // If we have horizontal whitespace, skip over it. We allow whitespace 02201 // between the slash and newline. 02202 bool HasSpace = false; 02203 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 02204 --CurPtr; 02205 HasSpace = true; 02206 } 02207 02208 // If we have a slash, we know this is an escaped newline. 02209 if (*CurPtr == '\\') { 02210 if (CurPtr[-1] != '*') return false; 02211 } else { 02212 // It isn't a slash, is it the ?? / trigraph? 02213 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || 02214 CurPtr[-3] != '*') 02215 return false; 02216 02217 // This is the trigraph ending the comment. Emit a stern warning! 02218 CurPtr -= 2; 02219 02220 // If no trigraphs are enabled, warn that we ignored this trigraph and 02221 // ignore this * character. 02222 if (!L->getLangOpts().Trigraphs) { 02223 if (!L->isLexingRawMode()) 02224 L->Diag(CurPtr, diag::trigraph_ignored_block_comment); 02225 return false; 02226 } 02227 if (!L->isLexingRawMode()) 02228 L->Diag(CurPtr, diag::trigraph_ends_block_comment); 02229 } 02230 02231 // Warn about having an escaped newline between the */ characters. 02232 if (!L->isLexingRawMode()) 02233 L->Diag(CurPtr, diag::escaped_newline_block_comment_end); 02234 02235 // If there was space between the backslash and newline, warn about it. 02236 if (HasSpace && !L->isLexingRawMode()) 02237 L->Diag(CurPtr, diag::backslash_newline_space); 02238 02239 return true; 02240 } 02241 02242 #ifdef __SSE2__ 02243 #include <emmintrin.h> 02244 #elif __ALTIVEC__ 02245 #include <altivec.h> 02246 #undef bool 02247 #endif 02248 02249 /// We have just read from input the / and * characters that started a comment. 02250 /// Read until we find the * and / characters that terminate the comment. 02251 /// Note that we don't bother decoding trigraphs or escaped newlines in block 02252 /// comments, because they cannot cause the comment to end. The only thing 02253 /// that can happen is the comment could end with an escaped newline between 02254 /// the terminating * and /. 02255 /// 02256 /// If we're in KeepCommentMode or any CommentHandler has inserted 02257 /// some tokens, this will store the first token and return true. 
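// [Editor's sketch] The walk-back that isEndOfBlockCommentWithEscapedNewLine
// above performs, boiled down: starting from a newline, step back over
// horizontal whitespace and test for the '\' (or '??/' trigraph) that splices
// the line. Standalone toy (assumes the newline is several bytes into the
// buffer, as the real caller guarantees):
static bool newlineIsSpliced(const char *NL, const char *BufStart) {
  const char *P = NL - 1;
  while (P > BufStart && (*P == ' ' || *P == '\t'))
    --P;                                  // whitespace before the newline
  if (*P == '\\')
    return true;                          // "...\<spaces>\n" is spliced
  return P - 2 >= BufStart && P[0] == '/' && P[-1] == '?' && P[-2] == '?';
}
// [End sketch]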
02258 bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, 02259 bool &TokAtPhysicalStartOfLine) { 02260 // Scan one character past where we should, looking for a '/' character. Once 02261 // we find it, check to see if it was preceded by a *. This common 02262 // optimization helps people who like to put a lot of * characters in their 02263 // comments. 02264 02265 // The first character we get with newlines and trigraphs skipped to handle 02266 // the degenerate /*/ case below correctly if the * has an escaped newline 02267 // after it. 02268 unsigned CharSize; 02269 unsigned char C = getCharAndSize(CurPtr, CharSize); 02270 CurPtr += CharSize; 02271 if (C == 0 && CurPtr == BufferEnd+1) { 02272 if (!isLexingRawMode()) 02273 Diag(BufferPtr, diag::err_unterminated_block_comment); 02274 --CurPtr; 02275 02276 // KeepWhitespaceMode should return this broken comment as a token. Since 02277 // it isn't a well formed comment, just return it as an 'unknown' token. 02278 if (isKeepWhitespaceMode()) { 02279 FormTokenWithChars(Result, CurPtr, tok::unknown); 02280 return true; 02281 } 02282 02283 BufferPtr = CurPtr; 02284 return false; 02285 } 02286 02287 // Check to see if the first character after the '/*' is another /. If so, 02288 // then this slash does not end the block comment, it is part of it. 02289 if (C == '/') 02290 C = *CurPtr++; 02291 02292 while (1) { 02293 // Skip over all non-interesting characters until we find end of buffer or a 02294 // (probably ending) '/' character. 02295 if (CurPtr + 24 < BufferEnd && 02296 // If there is a code-completion point avoid the fast scan because it 02297 // doesn't check for '\0'. 02298 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 02299 // While not aligned to a 16-byte boundary. 02300 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 02301 C = *CurPtr++; 02302 02303 if (C == '/') goto FoundSlash; 02304 02305 #ifdef __SSE2__ 02306 __m128i Slashes = _mm_set1_epi8('/'); 02307 while (CurPtr+16 <= BufferEnd) { 02308 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 02309 Slashes)); 02310 if (cmp != 0) { 02311 // Adjust the pointer to point directly after the first slash. It's 02312 // not necessary to set C here, it will be overwritten at the end of 02313 // the outer loop. 02314 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1; 02315 goto FoundSlash; 02316 } 02317 CurPtr += 16; 02318 } 02319 #elif __ALTIVEC__ 02320 __vector unsigned char Slashes = { 02321 '/', '/', '/', '/', '/', '/', '/', '/', 02322 '/', '/', '/', '/', '/', '/', '/', '/' 02323 }; 02324 while (CurPtr+16 <= BufferEnd && 02325 !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes)) 02326 CurPtr += 16; 02327 #else 02328 // Scan for '/' quickly. Many block comments are very large. 02329 while (CurPtr[0] != '/' && 02330 CurPtr[1] != '/' && 02331 CurPtr[2] != '/' && 02332 CurPtr[3] != '/' && 02333 CurPtr+4 < BufferEnd) { 02334 CurPtr += 4; 02335 } 02336 #endif 02337 02338 // It has to be one of the bytes scanned, increment to it and read one. 02339 C = *CurPtr++; 02340 } 02341 02342 // Loop to scan the remainder. 02343 while (C != '/' && C != '\0') 02344 C = *CurPtr++; 02345 02346 if (C == '/') { 02347 FoundSlash: 02348 if (CurPtr[-2] == '*') // We found the final */. We're done! 02349 break; 02350 02351 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 02352 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 02353 // We found the final */, though it had an escaped newline between the 02354 // * and /. We're done! 
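// [Editor's sketch] A portable stand-in for the SSE2/AltiVec scan above: the
// goal is simply "find the next '/' fast", which std::memchr expresses
// directly (and which libc implementations typically vectorize the same way):
#include <cstring>
static const char *findSlashFast(const char *P, const char *End) {
  const void *Hit = std::memchr(P, '/', static_cast<size_t>(End - P));
  // Return one past the slash, matching how CurPtr is left above.
  return Hit ? static_cast<const char *>(Hit) + 1 : End;
}
// The real loop keeps the scalar pre-loop so the SIMD reads stay 16-byte
// aligned, and falls back entirely when a code-completion NUL may be present.
// [End sketch]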
02355 break; 02356 } 02357 } 02358 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 02359 // If this is a /* inside of the comment, emit a warning. Don't do this 02360 // if this is a /*/, which will end the comment. This misses cases with 02361 // embedded escaped newlines, but oh well. 02362 if (!isLexingRawMode()) 02363 Diag(CurPtr-1, diag::warn_nested_block_comment); 02364 } 02365 } else if (C == 0 && CurPtr == BufferEnd+1) { 02366 if (!isLexingRawMode()) 02367 Diag(BufferPtr, diag::err_unterminated_block_comment); 02368 // Note: the user probably forgot a */. We could continue immediately 02369 // after the /*, but this would involve lexing a lot of what really is the 02370 // comment, which surely would confuse the parser. 02371 --CurPtr; 02372 02373 // KeepWhitespaceMode should return this broken comment as a token. Since 02374 // it isn't a well formed comment, just return it as an 'unknown' token. 02375 if (isKeepWhitespaceMode()) { 02376 FormTokenWithChars(Result, CurPtr, tok::unknown); 02377 return true; 02378 } 02379 02380 BufferPtr = CurPtr; 02381 return false; 02382 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 02383 PP->CodeCompleteNaturalLanguage(); 02384 cutOffLexing(); 02385 return false; 02386 } 02387 02388 C = *CurPtr++; 02389 } 02390 02391 // Notify comment handlers about the comment unless we're in a #if 0 block. 02392 if (PP && !isLexingRawMode() && 02393 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 02394 getSourceLocation(CurPtr)))) { 02395 BufferPtr = CurPtr; 02396 return true; // A token has to be returned. 02397 } 02398 02399 // If we are returning comments as tokens, return this comment as a token. 02400 if (inKeepCommentMode()) { 02401 FormTokenWithChars(Result, CurPtr, tok::comment); 02402 return true; 02403 } 02404 02405 // It is common for the tokens immediately after a /**/ comment to be 02406 // whitespace. Instead of going through the big switch, handle it 02407 // efficiently now. This is safe even in KeepWhitespaceMode because we would 02408 // have already returned above with the comment as a token. 02409 if (isHorizontalWhitespace(*CurPtr)) { 02410 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 02411 return false; 02412 } 02413 02414 // Otherwise, just return so that the next character will be lexed as a token. 02415 BufferPtr = CurPtr; 02416 Result.setFlag(Token::LeadingSpace); 02417 return false; 02418 } 02419 02420 //===----------------------------------------------------------------------===// 02421 // Primary Lexing Entry Points 02422 //===----------------------------------------------------------------------===// 02423 02424 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 02425 /// uninterpreted string. This switches the lexer out of directive mode. 02426 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 02427 assert(ParsingPreprocessorDirective && ParsingFilename == false && 02428 "Must be in a preprocessing directive!"); 02429 Token Tmp; 02430 02431 // CurPtr - Cache BufferPtr in an automatic variable. 02432 const char *CurPtr = BufferPtr; 02433 while (1) { 02434 char Char = getAndAdvanceChar(CurPtr, Tmp); 02435 switch (Char) { 02436 default: 02437 if (Result) 02438 Result->push_back(Char); 02439 break; 02440 case 0: // Null. 02441 // Found end of file? 02442 if (CurPtr-1 != BufferEnd) { 02443 if (isCodeCompletionPoint(CurPtr-1)) { 02444 PP->CodeCompleteNaturalLanguage(); 02445 cutOffLexing(); 02446 return; 02447 } 02448 02449 // Nope, normal character, continue. 
02450 if (Result) 02451 Result->push_back(Char); 02452 break; 02453 } 02454 // FALL THROUGH. 02455 case '\r': 02456 case '\n': 02457 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 02458 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 02459 BufferPtr = CurPtr-1; 02460 02461 // Next, lex the character, which should handle the EOD transition. 02462 Lex(Tmp); 02463 if (Tmp.is(tok::code_completion)) { 02464 if (PP) 02465 PP->CodeCompleteNaturalLanguage(); 02466 Lex(Tmp); 02467 } 02468 assert(Tmp.is(tok::eod) && "Unexpected token!"); 02469 02470 // Finally, we're done; 02471 return; 02472 } 02473 } 02474 } 02475 02476 /// LexEndOfFile - CurPtr points to the end of this file. Handle this 02477 /// condition, reporting diagnostics and handling other edge cases as required. 02478 /// This returns true if Result contains a token, false if PP.Lex should be 02479 /// called again. 02480 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 02481 // If we hit the end of the file while parsing a preprocessor directive, 02482 // end the preprocessor directive first. The next token returned will 02483 // then be the end of file. 02484 if (ParsingPreprocessorDirective) { 02485 // Done parsing the "line". 02486 ParsingPreprocessorDirective = false; 02487 // Update the location of token as well as BufferPtr. 02488 FormTokenWithChars(Result, CurPtr, tok::eod); 02489 02490 // Restore comment saving mode, in case it was disabled for directive. 02491 if (PP) 02492 resetExtendedTokenMode(); 02493 return true; // Have a token. 02494 } 02495 02496 // If we are in raw mode, return this event as an EOF token. Let the caller 02497 // that put us in raw mode handle the event. 02498 if (isLexingRawMode()) { 02499 Result.startToken(); 02500 BufferPtr = BufferEnd; 02501 FormTokenWithChars(Result, BufferEnd, tok::eof); 02502 return true; 02503 } 02504 02505 // Issue diagnostics for unterminated #if and missing newline. 02506 02507 // If we are in a #if directive, emit an error. 02508 while (!ConditionalStack.empty()) { 02509 if (PP->getCodeCompletionFileLoc() != FileLoc) 02510 PP->Diag(ConditionalStack.back().IfLoc, 02511 diag::err_pp_unterminated_conditional); 02512 ConditionalStack.pop_back(); 02513 } 02514 02515 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 02516 // a pedwarn. 02517 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 02518 DiagnosticsEngine &Diags = PP->getDiagnostics(); 02519 SourceLocation EndLoc = getSourceLocation(BufferEnd); 02520 unsigned DiagID; 02521 02522 if (LangOpts.CPlusPlus11) { 02523 // C++11 [lex.phases] 2.2 p2 02524 // Prefer the C++98 pedantic compatibility warning over the generic, 02525 // non-extension, user-requested "missing newline at EOF" warning. 02526 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 02527 DiagID = diag::warn_cxx98_compat_no_newline_eof; 02528 } else { 02529 DiagID = diag::warn_no_newline_eof; 02530 } 02531 } else { 02532 DiagID = diag::ext_no_newline_eof; 02533 } 02534 02535 Diag(BufferEnd, DiagID) 02536 << FixItHint::CreateInsertion(EndLoc, "\n"); 02537 } 02538 02539 BufferPtr = CurPtr; 02540 02541 // Finally, let the preprocessor handle this. 
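// [Editor's sketch] The C99 5.1.1.2p2 test above, isolated: a non-empty file
// whose last byte is not a line ending earns the pedwarn plus a "\n" fix-it:
static bool missingFinalNewline(const char *BufStart, const char *BufEnd) {
  return BufEnd != BufStart && BufEnd[-1] != '\n' && BufEnd[-1] != '\r';
}
// "int x;"   -> true  (diagnosed)
// "int x;\n" -> false (ok)
// [End sketch]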
02542 return PP->HandleEndOfFile(Result, isPragmaLexer()); 02543 } 02544 02545 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 02546 /// the specified lexer will return a tok::l_paren token, 0 if it is something 02547 /// else and 2 if there are no more tokens in the buffer controlled by the 02548 /// lexer. 02549 unsigned Lexer::isNextPPTokenLParen() { 02550 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 02551 02552 // Switch to 'skipping' mode. This will ensure that we can lex a token 02553 // without emitting diagnostics, disables macro expansion, and will cause EOF 02554 // to return an EOF token instead of popping the include stack. 02555 LexingRawMode = true; 02556 02557 // Save state that can be changed while lexing so that we can restore it. 02558 const char *TmpBufferPtr = BufferPtr; 02559 bool inPPDirectiveMode = ParsingPreprocessorDirective; 02560 bool atStartOfLine = IsAtStartOfLine; 02561 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 02562 bool leadingSpace = HasLeadingSpace; 02563 02564 Token Tok; 02565 Lex(Tok); 02566 02567 // Restore state that may have changed. 02568 BufferPtr = TmpBufferPtr; 02569 ParsingPreprocessorDirective = inPPDirectiveMode; 02570 HasLeadingSpace = leadingSpace; 02571 IsAtStartOfLine = atStartOfLine; 02572 IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 02573 02574 // Restore the lexer back to non-skipping mode. 02575 LexingRawMode = false; 02576 02577 if (Tok.is(tok::eof)) 02578 return 2; 02579 return Tok.is(tok::l_paren); 02580 } 02581 02582 /// \brief Find the end of a version control conflict marker. 02583 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 02584 ConflictMarkerKind CMK) { 02585 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 02586 size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 02587 StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen); 02588 size_t Pos = RestOfBuffer.find(Terminator); 02589 while (Pos != StringRef::npos) { 02590 // Must occur at start of line. 02591 if (RestOfBuffer[Pos-1] != '\r' && 02592 RestOfBuffer[Pos-1] != '\n') { 02593 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 02594 Pos = RestOfBuffer.find(Terminator); 02595 continue; 02596 } 02597 return RestOfBuffer.data()+Pos; 02598 } 02599 return nullptr; 02600 } 02601 02602 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 02603 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 02604 /// and recover nicely. This returns true if it is a conflict marker and false 02605 /// if not. 02606 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 02607 // Only a conflict marker if it starts at the beginning of a line. 02608 if (CurPtr != BufferStart && 02609 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 02610 return false; 02611 02612 // Check to see if we have <<<<<<< or >>>>. 02613 if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") && 02614 (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> ")) 02615 return false; 02616 02617 // If we have a situation where we don't care about conflict markers, ignore 02618 // it. 02619 if (CurrentConflictMarkerState || isLexingRawMode()) 02620 return false; 02621 02622 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 02623 02624 // Check to see if there is an ending marker somewhere in the buffer at the 02625 // start of a line to terminate this conflict marker. 
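// [Editor's sketch] The two-part conflict-marker test above, condensed: a
// marker counts only at the start of a line, and only when a terminator also
// sits at a line start later in the buffer. Simplified to the normal
// "<<<<<<<" form (the real code also handles Perforce ">>>> " markers):
#include <string>
static bool looksLikeConflictStart(const std::string &Buf, size_t Pos) {
  if (Pos != 0 && Buf[Pos - 1] != '\n' && Buf[Pos - 1] != '\r')
    return false;                       // must begin a line
  return Buf.compare(Pos, 7, "<<<<<<<") == 0 &&
         Buf.find("\n>>>>>>>", Pos) != std::string::npos;
}
// [End sketch]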
02626 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 02627 // We found a match. We are really in a conflict marker. 02628 // Diagnose this, and ignore to the end of line. 02629 Diag(CurPtr, diag::err_conflict_marker); 02630 CurrentConflictMarkerState = Kind; 02631 02632 // Skip ahead to the end of line. We know this exists because the 02633 // end-of-conflict marker starts with \r or \n. 02634 while (*CurPtr != '\r' && *CurPtr != '\n') { 02635 assert(CurPtr != BufferEnd && "Didn't find end of line"); 02636 ++CurPtr; 02637 } 02638 BufferPtr = CurPtr; 02639 return true; 02640 } 02641 02642 // No end of conflict marker found. 02643 return false; 02644 } 02645 02646 02647 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 02648 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 02649 /// is the end of a conflict marker. Handle it by ignoring up until the end of 02650 /// the line. This returns true if it is a conflict marker and false if not. 02651 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 02652 // Only a conflict marker if it starts at the beginning of a line. 02653 if (CurPtr != BufferStart && 02654 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 02655 return false; 02656 02657 // If we have a situation where we don't care about conflict markers, ignore 02658 // it. 02659 if (!CurrentConflictMarkerState || isLexingRawMode()) 02660 return false; 02661 02662 // Check to see if we have the marker (4 characters in a row). 02663 for (unsigned i = 1; i != 4; ++i) 02664 if (CurPtr[i] != CurPtr[0]) 02665 return false; 02666 02667 // If we do have it, search for the end of the conflict marker. This could 02668 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 02669 // be the end of conflict marker. 02670 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 02671 CurrentConflictMarkerState)) { 02672 CurPtr = End; 02673 02674 // Skip ahead to the end of line. 02675 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 02676 ++CurPtr; 02677 02678 BufferPtr = CurPtr; 02679 02680 // No longer in the conflict marker. 
02681 CurrentConflictMarkerState = CMK_None; 02682 return true; 02683 } 02684 02685 return false; 02686 } 02687 02688 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 02689 if (PP && PP->isCodeCompletionEnabled()) { 02690 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 02691 return Loc == PP->getCodeCompletionLoc(); 02692 } 02693 02694 return false; 02695 } 02696 02697 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 02698 Token *Result) { 02699 unsigned CharSize; 02700 char Kind = getCharAndSize(StartPtr, CharSize); 02701 02702 unsigned NumHexDigits; 02703 if (Kind == 'u') 02704 NumHexDigits = 4; 02705 else if (Kind == 'U') 02706 NumHexDigits = 8; 02707 else 02708 return 0; 02709 02710 if (!LangOpts.CPlusPlus && !LangOpts.C99) { 02711 if (Result && !isLexingRawMode()) 02712 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 02713 return 0; 02714 } 02715 02716 const char *CurPtr = StartPtr + CharSize; 02717 const char *KindLoc = &CurPtr[-1]; 02718 02719 uint32_t CodePoint = 0; 02720 for (unsigned i = 0; i < NumHexDigits; ++i) { 02721 char C = getCharAndSize(CurPtr, CharSize); 02722 02723 unsigned Value = llvm::hexDigitValue(C); 02724 if (Value == -1U) { 02725 if (Result && !isLexingRawMode()) { 02726 if (i == 0) { 02727 Diag(BufferPtr, diag::warn_ucn_escape_no_digits) 02728 << StringRef(KindLoc, 1); 02729 } else { 02730 Diag(BufferPtr, diag::warn_ucn_escape_incomplete); 02731 02732 // If the user wrote \U1234, suggest a fixit to \u. 02733 if (i == 4 && NumHexDigits == 8) { 02734 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 02735 Diag(KindLoc, diag::note_ucn_four_not_eight) 02736 << FixItHint::CreateReplacement(URange, "u"); 02737 } 02738 } 02739 } 02740 02741 return 0; 02742 } 02743 02744 CodePoint <<= 4; 02745 CodePoint += Value; 02746 02747 CurPtr += CharSize; 02748 } 02749 02750 if (Result) { 02751 Result->setFlag(Token::HasUCN); 02752 if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2) 02753 StartPtr = CurPtr; 02754 else 02755 while (StartPtr != CurPtr) 02756 (void)getAndAdvanceChar(StartPtr, *Result); 02757 } else { 02758 StartPtr = CurPtr; 02759 } 02760 02761 // Don't apply C family restrictions to UCNs in assembly mode 02762 if (LangOpts.AsmPreprocessor) 02763 return CodePoint; 02764 02765 // C99 6.4.3p2: A universal character name shall not specify a character whose 02766 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or 02767 // 0060 (`), nor one in the range D800 through DFFF inclusive.) 02768 // C++11 [lex.charset]p2: If the hexadecimal value for a 02769 // universal-character-name corresponds to a surrogate code point (in the 02770 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 02771 // if the hexadecimal value for a universal-character-name outside the 02772 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 02773 // string literal corresponds to a control character (in either of the 02774 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 02775 // basic source character set, the program is ill-formed. 02776 if (CodePoint < 0xA0) { 02777 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) 02778 return CodePoint; 02779 02780 // We don't use isLexingRawMode() here because we need to warn about bad 02781 // UCNs even when skipping preprocessing tokens in a #if block. 
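// [Editor's sketch] The two rejection rules enforced here, as one predicate:
// below U+00A0 only '$', '@', and '`' are legal UCNs, and the surrogate block
// U+D800..U+DFFF never is (C99 6.4.3p2, C++11 [lex.charset]p2).
// isValidUCNCodePoint is a hypothetical name:
#include <cstdint>
static bool isValidUCNCodePoint(uint32_t CP) {
  if (CP < 0xA0)
    return CP == 0x24 || CP == 0x40 || CP == 0x60;   // '$' '@' '`'
  return CP < 0xD800 || CP > 0xDFFF;                 // reject surrogates
}
// \u0041 ('A') is rejected: members of the basic source character set must
// be spelled directly, which is exactly the err_ucn_escape_basic_scs case.
// [End sketch]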
02782 if (Result && PP) { 02783 if (CodePoint < 0x20 || CodePoint >= 0x7F) 02784 Diag(BufferPtr, diag::err_ucn_control_character); 02785 else { 02786 char C = static_cast<char>(CodePoint); 02787 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 02788 } 02789 } 02790 02791 return 0; 02792 02793 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 02794 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 02795 // We don't use isLexingRawMode() here because we need to diagnose bad 02796 // UCNs even when skipping preprocessing tokens in a #if block. 02797 if (Result && PP) { 02798 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 02799 Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 02800 else 02801 Diag(BufferPtr, diag::err_ucn_escape_invalid); 02802 } 02803 return 0; 02804 } 02805 02806 return CodePoint; 02807 } 02808 02809 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 02810 const char *CurPtr) { 02811 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 02812 UnicodeWhitespaceCharRanges); 02813 if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 02814 UnicodeWhitespaceChars.contains(C)) { 02815 Diag(BufferPtr, diag::ext_unicode_whitespace) 02816 << makeCharRange(*this, BufferPtr, CurPtr); 02817 02818 Result.setFlag(Token::LeadingSpace); 02819 return true; 02820 } 02821 return false; 02822 } 02823 02824 bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { 02825 if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) { 02826 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 02827 !PP->isPreprocessedOutput()) { 02828 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 02829 makeCharRange(*this, BufferPtr, CurPtr), 02830 /*IsFirst=*/true); 02831 } 02832 02833 MIOpt.ReadToken(); 02834 return LexIdentifier(Result, CurPtr); 02835 } 02836 02837 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 02838 !PP->isPreprocessedOutput() && 02839 !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) { 02840 // Non-ASCII characters tend to creep into source code unintentionally. 02841 // Instead of letting the parser complain about the unknown token, 02842 // just drop the character. 02843 // Note that we can /only/ do this when the non-ASCII character is actually 02844 // spelled as Unicode, not written as a UCN. The standard requires that 02845 // we not throw away any possible preprocessor tokens, but there's a 02846 // loophole in the mapping of Unicode characters to basic character set 02847 // characters that allows us to map these particular characters to, say, 02848 // whitespace. 02849 Diag(BufferPtr, diag::err_non_ascii) 02850 << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr)); 02851 02852 BufferPtr = CurPtr; 02853 return false; 02854 } 02855 02856 // Otherwise, we have an explicit UCN or a character that's unlikely to show 02857 // up by accident. 02858 MIOpt.ReadToken(); 02859 FormTokenWithChars(Result, CurPtr, tok::unknown); 02860 return true; 02861 } 02862 02863 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 02864 IsAtStartOfLine = Result.isAtStartOfLine(); 02865 HasLeadingSpace = Result.hasLeadingSpace(); 02866 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 02867 // Note that this doesn't affect IsAtPhysicalStartOfLine. 02868 } 02869 02870 bool Lexer::Lex(Token &Result) { 02871 // Start a new token. 02872 Result.startToken(); 02873 02874 // Set up misc whitespace flags for LexTokenInternal. 
02875 if (IsAtStartOfLine) { 02876 Result.setFlag(Token::StartOfLine); 02877 IsAtStartOfLine = false; 02878 } 02879 02880 if (HasLeadingSpace) { 02881 Result.setFlag(Token::LeadingSpace); 02882 HasLeadingSpace = false; 02883 } 02884 02885 if (HasLeadingEmptyMacro) { 02886 Result.setFlag(Token::LeadingEmptyMacro); 02887 HasLeadingEmptyMacro = false; 02888 } 02889 02890 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 02891 IsAtPhysicalStartOfLine = false; 02892 bool isRawLex = isLexingRawMode(); 02893 (void) isRawLex; 02894 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 02895 // (After the LexTokenInternal call, the lexer might be destroyed.) 02896 assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 02897 return returnedToken; 02898 } 02899 02900 /// LexTokenInternal - This implements a simple C family lexer. It is an 02901 /// extremely performance critical piece of code. This assumes that the buffer 02902 /// has a null character at the end of the file. This returns a preprocessing 02903 /// token, not a normal token, as such, it is an internal interface. It assumes 02904 /// that the Flags of result have been cleared before calling this. 02905 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 02906 LexNextToken: 02907 // New token, can't need cleaning yet. 02908 Result.clearFlag(Token::NeedsCleaning); 02909 Result.setIdentifierInfo(nullptr); 02910 02911 // CurPtr - Cache BufferPtr in an automatic variable. 02912 const char *CurPtr = BufferPtr; 02913 02914 // Small amounts of horizontal whitespace is very common between tokens. 02915 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 02916 ++CurPtr; 02917 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 02918 ++CurPtr; 02919 02920 // If we are keeping whitespace and other tokens, just return what we just 02921 // skipped. The next lexer invocation will return the token after the 02922 // whitespace. 02923 if (isKeepWhitespaceMode()) { 02924 FormTokenWithChars(Result, CurPtr, tok::unknown); 02925 // FIXME: The next token will not have LeadingSpace set. 02926 return true; 02927 } 02928 02929 BufferPtr = CurPtr; 02930 Result.setFlag(Token::LeadingSpace); 02931 } 02932 02933 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 02934 02935 // Read a character, advancing over it. 02936 char Char = getAndAdvanceChar(CurPtr, Result); 02937 tok::TokenKind Kind; 02938 02939 switch (Char) { 02940 case 0: // Null. 02941 // Found end of file? 02942 if (CurPtr-1 == BufferEnd) 02943 return LexEndOfFile(Result, CurPtr-1); 02944 02945 // Check if we are performing code completion. 02946 if (isCodeCompletionPoint(CurPtr-1)) { 02947 // Return the code-completion token. 02948 Result.startToken(); 02949 FormTokenWithChars(Result, CurPtr, tok::code_completion); 02950 return true; 02951 } 02952 02953 if (!isLexingRawMode()) 02954 Diag(CurPtr-1, diag::null_in_file); 02955 Result.setFlag(Token::LeadingSpace); 02956 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 02957 return true; // KeepWhitespaceMode 02958 02959 // We know the lexer hasn't changed, so just try again with this lexer. 02960 // (We manually eliminate the tail call to avoid recursion.) 02961 goto LexNextToken; 02962 02963 case 26: // DOS & CP/M EOF: "^Z". 02964 // If we're in Microsoft extensions mode, treat this as end of file. 02965 if (LangOpts.MicrosoftExt) 02966 return LexEndOfFile(Result, CurPtr-1); 02967 02968 // If Microsoft extensions are disabled, this is just random garbage. 
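// [Editor's sketch] Why 'case 0' above can mean three different things: the
// buffer is guaranteed to end in a NUL sentinel (see InitLexer's assert), so
// the hot loops never bounds-check, and a 0 byte is disambiguated only after
// the fact. Standalone model of that triage:
enum class NulKind { EndOfFile, CodeCompletion, StrayNul };

static NulKind classifyNul(const char *AfterNul, const char *BufferEnd,
                           bool AtCompletionPoint) {
  if (AfterNul - 1 == BufferEnd) return NulKind::EndOfFile;
  if (AtCompletionPoint)         return NulKind::CodeCompletion;
  return NulKind::StrayNul;      // diagnosed as null_in_file, then skipped
}
// [End sketch]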
02969 Kind = tok::unknown; 02970 break; 02971 02972 case '\n': 02973 case '\r': 02974 // If we are inside a preprocessor directive and we see the end of line, 02975 // we know we are done with the directive, so return an EOD token. 02976 if (ParsingPreprocessorDirective) { 02977 // Done parsing the "line". 02978 ParsingPreprocessorDirective = false; 02979 02980 // Restore comment saving mode, in case it was disabled for directive. 02981 if (PP) 02982 resetExtendedTokenMode(); 02983 02984 // Since we consumed a newline, we are back at the start of a line. 02985 IsAtStartOfLine = true; 02986 IsAtPhysicalStartOfLine = true; 02987 02988 Kind = tok::eod; 02989 break; 02990 } 02991 02992 // No leading whitespace seen so far. 02993 Result.clearFlag(Token::LeadingSpace); 02994 02995 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 02996 return true; // KeepWhitespaceMode 02997 02998 // We only saw whitespace, so just try again with this lexer. 02999 // (We manually eliminate the tail call to avoid recursion.) 03000 goto LexNextToken; 03001 case ' ': 03002 case '\t': 03003 case '\f': 03004 case '\v': 03005 SkipHorizontalWhitespace: 03006 Result.setFlag(Token::LeadingSpace); 03007 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 03008 return true; // KeepWhitespaceMode 03009 03010 SkipIgnoredUnits: 03011 CurPtr = BufferPtr; 03012 03013 // If the next token is obviously a // or /* */ comment, skip it efficiently 03014 // too (without going through the big switch stmt). 03015 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 03016 LangOpts.LineComment && 03017 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 03018 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 03019 return true; // There is a token to return. 03020 goto SkipIgnoredUnits; 03021 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 03022 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 03023 return true; // There is a token to return. 03024 goto SkipIgnoredUnits; 03025 } else if (isHorizontalWhitespace(*CurPtr)) { 03026 goto SkipHorizontalWhitespace; 03027 } 03028 // We only saw whitespace, so just try again with this lexer. 03029 // (We manually eliminate the tail call to avoid recursion.) 03030 goto LexNextToken; 03031 03032 // C99 6.4.4.1: Integer Constants. 03033 // C99 6.4.4.2: Floating Constants. 03034 case '0': case '1': case '2': case '3': case '4': 03035 case '5': case '6': case '7': case '8': case '9': 03036 // Notify MIOpt that we read a non-whitespace/non-comment token. 03037 MIOpt.ReadToken(); 03038 return LexNumericConstant(Result, CurPtr); 03039 03040 case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal 03041 // Notify MIOpt that we read a non-whitespace/non-comment token. 
03042 MIOpt.ReadToken(); 03043 03044 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 03045 Char = getCharAndSize(CurPtr, SizeTmp); 03046 03047 // UTF-16 string literal 03048 if (Char == '"') 03049 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 03050 tok::utf16_string_literal); 03051 03052 // UTF-16 character constant 03053 if (Char == '\'') 03054 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 03055 tok::utf16_char_constant); 03056 03057 // UTF-16 raw string literal 03058 if (Char == 'R' && LangOpts.CPlusPlus11 && 03059 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 03060 return LexRawStringLiteral(Result, 03061 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 03062 SizeTmp2, Result), 03063 tok::utf16_string_literal); 03064 03065 if (Char == '8') { 03066 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 03067 03068 // UTF-8 string literal 03069 if (Char2 == '"') 03070 return LexStringLiteral(Result, 03071 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 03072 SizeTmp2, Result), 03073 tok::utf8_string_literal); 03074 if (Char2 == '\'' && LangOpts.CPlusPlus1z) 03075 return LexCharConstant( 03076 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 03077 SizeTmp2, Result), 03078 tok::utf8_char_constant); 03079 03080 if (Char2 == 'R' && LangOpts.CPlusPlus11) { 03081 unsigned SizeTmp3; 03082 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 03083 // UTF-8 raw string literal 03084 if (Char3 == '"') { 03085 return LexRawStringLiteral(Result, 03086 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 03087 SizeTmp2, Result), 03088 SizeTmp3, Result), 03089 tok::utf8_string_literal); 03090 } 03091 } 03092 } 03093 } 03094 03095 // treat u like the start of an identifier. 03096 return LexIdentifier(Result, CurPtr); 03097 03098 case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal 03099 // Notify MIOpt that we read a non-whitespace/non-comment token. 03100 MIOpt.ReadToken(); 03101 03102 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 03103 Char = getCharAndSize(CurPtr, SizeTmp); 03104 03105 // UTF-32 string literal 03106 if (Char == '"') 03107 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 03108 tok::utf32_string_literal); 03109 03110 // UTF-32 character constant 03111 if (Char == '\'') 03112 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 03113 tok::utf32_char_constant); 03114 03115 // UTF-32 raw string literal 03116 if (Char == 'R' && LangOpts.CPlusPlus11 && 03117 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 03118 return LexRawStringLiteral(Result, 03119 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 03120 SizeTmp2, Result), 03121 tok::utf32_string_literal); 03122 } 03123 03124 // treat U like the start of an identifier. 03125 return LexIdentifier(Result, CurPtr); 03126 03127 case 'R': // Identifier or C++0x raw string literal 03128 // Notify MIOpt that we read a non-whitespace/non-comment token. 03129 MIOpt.ReadToken(); 03130 03131 if (LangOpts.CPlusPlus11) { 03132 Char = getCharAndSize(CurPtr, SizeTmp); 03133 03134 if (Char == '"') 03135 return LexRawStringLiteral(Result, 03136 ConsumeChar(CurPtr, SizeTmp, Result), 03137 tok::string_literal); 03138 } 03139 03140 // treat R like the start of an identifier. 03141 return LexIdentifier(Result, CurPtr); 03142 03143 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 03144 // Notify MIOpt that we read a non-whitespace/non-comment token. 
  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                              ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                          SizeTmp2, Result),
                              tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr, tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
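//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of Lexer.cpp): the maximal-munch pattern that
// all of the punctuator cases below follow. The lexer peeks at the next
// character(s) and takes the longest sequence that still forms a token, so
// "++" becomes one token rather than two '+' tokens. This is a hypothetical
// toy over a NUL-terminated buffer; the real code peeks with getCharAndSize
// so escaped newlines and trigraphs are handled too.
//===----------------------------------------------------------------------===//

namespace munch_sketch {
enum class Punct { Plus, PlusPlus, PlusEqual, Minus, Arrow, MinusMinus };

// P points at a '+' or '-'; Len receives the token length in characters.
inline Punct classify(const char *P, unsigned &Len) {
  if (P[0] == '+') {
    if (P[1] == '+') { Len = 2; return Punct::PlusPlus; }
    if (P[1] == '=') { Len = 2; return Punct::PlusEqual; }
    Len = 1; return Punct::Plus;
  }
  if (P[1] == '-') { Len = 2; return Punct::MinusMinus; }
  if (P[1] == '>') { Len = 2; return Punct::Arrow; }
  Len = 1; return Punct::Minus;
}
// classify("+=", Len) == Punct::PlusEqual with Len == 2;
// classify("+;", Len) == Punct::Plus with Len == 1.
} // namespace munch_sketch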
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
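//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of Lexer.cpp): why the cases above peek with
// getCharAndSize/ConsumeChar instead of plain CurPtr[0]/CurPtr++. The second
// character of a token such as "+=" may be separated from the first by a
// backslash-newline splice, so a correct "peek" must report both the
// effective character and how many bytes it occupies. This is a hypothetical
// toy (peek_sketch, getCharAndSizeToy); the real routine also folds trigraphs
// and records that the token needs cleaning.
//===----------------------------------------------------------------------===//

namespace peek_sketch {
// Returns the effective character at P, setting Size to the bytes spanned.
inline char getCharAndSizeToy(const char *P, unsigned &Size) {
  Size = 0;
  // Skip any number of backslash-newline splices.
  while (P[0] == '\\' && (P[1] == '\n' || P[1] == '\r')) {
    char EOL = P[1];
    P += 2;
    Size += 2;
    // Consume the second half of a \r\n or \n\r pair, if present.
    if ((P[0] == '\n' || P[0] == '\r') && P[0] != EOL) {
      ++P;
      ++Size;
    }
  }
  Size += 1;
  return P[0];
}
// getCharAndSizeToy("\\\n=", Size) returns '=' with Size == 3, so "+\<NL>="
// still lexes as the single token "+=".
} // namespace peek_sketch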
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment. There is one problem with this though:
      // in one particular corner case, it can change the behavior of the
      // resulting program. For example, in "foo //**/ bar", C89 lexes this as
      // "foo / bar", while languages with line comments lex it as "foo".
      // Check to see if the character after the second slash is a '*'. If so,
      // we will lex that as a "/" instead of the start of a comment. However,
      // we never do this if we are just preprocessing.
      bool TreatAsComment = LangOpts.LineComment &&
                            (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line). Instead of going
        // through the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) { // %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character. If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive. Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
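//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of Lexer.cpp): the digraph mappings that the
// '%' case above and the '<' and ':' cases below implement piecewise
// (C99 6.4.6p3, C++ [lex.digraph]), collected into one table. The helper
// name (digraph_sketch::digraphMeaning) is hypothetical.
//===----------------------------------------------------------------------===//

#include <cstring>
#include <utility>

namespace digraph_sketch {
// Maps a digraph spelling to the punctuator it denotes, or nullptr if the
// spelling is not a digraph.
inline const char *digraphMeaning(const char *Spelling) {
  static const std::pair<const char *, const char *> Map[] = {
      {"<:", "["}, {":>", "]"}, {"<%", "{"},
      {"%>", "}"}, {"%:", "#"}, {"%:%:", "##"},
  };
  for (const auto &Entry : Map)
    if (std::strcmp(Spelling, Entry.first) == 0)
      return Entry.second;
  return nullptr;
}
// digraphMeaning("%:%:") returns "##"; digraphMeaning("%=") returns nullptr.
// Note the lexer only recognizes these when LangOpts.Digraphs is enabled.
} // namespace digraph_sketch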
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++11 [lex.pptoken]p3:
        //   Otherwise, if the next three characters are <:: and the
        //   subsequent character is neither : nor >, the < is treated as a
        //   preprocessor token by itself and not as the first character of
        //   the alternative token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      Kind = tok::caret;
    }
    break;
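//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of Lexer.cpp): the C++11 "<::" rule applied
// in the '<' case above. "a<::b" must lex as "a < ::b" (less, coloncolon)
// rather than "a[:b" via the <: digraph, unless the "<::" is followed by ':'
// or '>'. A hypothetical classifier over a NUL-terminated buffer.
//===----------------------------------------------------------------------===//

namespace lesscolon_sketch {
enum class LessKind { Less, LSquareDigraph };

// P points at a '<' whose next character is ':' and digraphs are enabled.
inline LessKind classifyLessColon(const char *P, bool CPlusPlus11) {
  if (CPlusPlus11 && P[2] == ':' && P[3] != ':' && P[3] != '>')
    return LessKind::Less; // "<::" not followed by ':' or '>': '<' alone
  return LessKind::LSquareDigraph; // "<:" is the digraph for '['
}
// classifyLessColon("<::b", true) == LessKind::Less, so
// "vector<::std::string>" parses as intended in C++11;
// classifyLessColon("<:::", true) == LessKind::LSquareDigraph.
} // namespace lesscolon_sketch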
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square;                            // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.CPlusPlus && Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character. If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive. Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective-C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC1)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }

      return LexUnicode(Result, CodePoint, CurPtr);
    }

    Kind = tok::unknown;
    break;
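//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of Lexer.cpp): the core of reading a UCN
// (C99 6.4.3) as tryReadUCN does above: "\uXXXX" supplies four hex digits,
// "\UXXXXXXXX" supplies eight. The names (ucn_sketch, readUCNToy) are
// hypothetical; the real routine additionally validates the code point
// against the allowed ranges and peeks through escaped newlines.
//===----------------------------------------------------------------------===//

#include <cstdint>

namespace ucn_sketch {
inline int hexDigitValue(char C) {
  if (C >= '0' && C <= '9') return C - '0';
  if (C >= 'a' && C <= 'f') return C - 'a' + 10;
  if (C >= 'A' && C <= 'F') return C - 'A' + 10;
  return -1;
}

// P points just past the backslash. Returns the decoded code point, or 0 if
// this is not a well-formed UCN (mirroring the way the caller above treats a
// zero return from tryReadUCN as "not a UCN").
inline uint32_t readUCNToy(const char *P) {
  unsigned NumDigits;
  if (*P == 'u')
    NumDigits = 4;
  else if (*P == 'U')
    NumDigits = 8;
  else
    return 0;
  ++P;
  uint32_t CodePoint = 0;
  for (unsigned I = 0; I != NumDigits; ++I) {
    int V = hexDigitValue(P[I]);
    if (V < 0)
      return 0; // too few hex digits: not a UCN
    CodePoint = (CodePoint << 4) | (uint32_t)V;
  }
  return CodePoint;
}
// readUCNToy("u0041") == 0x41 ('A'); readUCNToy("u00ZZ") == 0.
} // namespace ucn_sketch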
  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    ConversionResult Status =
        llvm::convertUTF8Sequence((const UTF8 **)&CurPtr,
                                  (const UTF8 *)BufferEnd,
                                  &CodePoint,
                                  strictConversion);
    if (Status == conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicode(Result, CodePoint, CurPtr);
    }

    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of the token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  PP->HandleDirective(Result);

  if (PP->hadModuleLoaderFatalFailure()) {
    // With a fatal failure in the module loader, we abort parsing.
    assert(Result.is(tok::eof) && "Preprocessor did not set tok::eof");
    return true;
  }

  // We parsed the directive; lex a token with the new state.
  return false;
}
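//===----------------------------------------------------------------------===//
// Illustrative sketch (not part of Lexer.cpp): strict decoding of one UTF-8
// sequence, the job the default case above delegates to
// llvm::convertUTF8Sequence with strictConversion. "Strict" means overlong
// encodings, UTF-16 surrogates, and out-of-range values are rejected, which
// is why stray bytes such as 0xC0 fall through to the invalid-UTF-8
// diagnostic. Hypothetical names (utf8_sketch, decodeUTF8Strict); assumes a
// NUL-terminated buffer, as the lexer guarantees.
//===----------------------------------------------------------------------===//

#include <cstdint>

namespace utf8_sketch {
// Decodes one UTF-8 sequence starting at *Ptr. On success advances *Ptr past
// the sequence and returns the code point; on any malformation returns -1
// and leaves *Ptr unchanged.
inline int32_t decodeUTF8Strict(const char **Ptr) {
  const unsigned char *P = (const unsigned char *)*Ptr;
  uint32_t CP;
  unsigned Len;
  if (P[0] < 0x80)              { CP = P[0];        Len = 1; }
  else if ((P[0] & 0xE0) == 0xC0) { CP = P[0] & 0x1F; Len = 2; }
  else if ((P[0] & 0xF0) == 0xE0) { CP = P[0] & 0x0F; Len = 3; }
  else if ((P[0] & 0xF8) == 0xF0) { CP = P[0] & 0x07; Len = 4; }
  else return -1; // continuation byte or invalid lead byte
  for (unsigned I = 1; I != Len; ++I) {
    if ((P[I] & 0xC0) != 0x80)
      return -1; // missing continuation byte (the NUL sentinel also stops us)
    CP = (CP << 6) | (P[I] & 0x3F);
  }
  // Reject overlong encodings, UTF-16 surrogates, and values past U+10FFFF.
  static const uint32_t MinForLen[] = {0, 0, 0x80, 0x800, 0x10000};
  if (CP < MinForLen[Len] || (CP >= 0xD800 && CP <= 0xDFFF) || CP > 0x10FFFF)
    return -1;
  *Ptr += Len;
  return (int32_t)CP;
}
// For the two-byte sequence "\xC3\xA9" this returns 0xE9 and advances the
// pointer by two; for the overlong "\xC0\x80" it returns -1.
} // namespace utf8_sketch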