// clang API Documentation
00001 //===--- Lexer.h - C Language Family Lexer ----------------------*- C++ -*-===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file defines the Lexer interface. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 00014 #ifndef LLVM_CLANG_LEX_LEXER_H 00015 #define LLVM_CLANG_LEX_LEXER_H 00016 00017 #include "clang/Basic/LangOptions.h" 00018 #include "clang/Lex/PreprocessorLexer.h" 00019 #include "llvm/ADT/SmallVector.h" 00020 #include <cassert> 00021 #include <string> 00022 00023 namespace clang { 00024 class DiagnosticsEngine; 00025 class SourceManager; 00026 class Preprocessor; 00027 class DiagnosticBuilder; 00028 00029 /// ConflictMarkerKind - Kinds of conflict marker which the lexer might be 00030 /// recovering from. 00031 enum ConflictMarkerKind { 00032 /// Not within a conflict marker. 00033 CMK_None, 00034 /// A normal or diff3 conflict marker, initiated by at least 7 "<"s, 00035 /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s. 00036 CMK_Normal, 00037 /// A Perforce-style conflict marker, initiated by 4 ">"s, 00038 /// separated by 4 "="s, and terminated by 4 "<"s. 00039 CMK_Perforce 00040 }; 00041 00042 /// Lexer - This provides a simple interface that turns a text buffer into a 00043 /// stream of tokens. This provides no support for file reading or buffering, 00044 /// or buffering/seeking of tokens, only forward lexing is supported. It relies 00045 /// on the specified Preprocessor object to handle preprocessor directives, etc. 
00046 class Lexer : public PreprocessorLexer { 00047 void anchor() override; 00048 00049 //===--------------------------------------------------------------------===// 00050 // Constant configuration values for this lexer. 00051 const char *BufferStart; // Start of the buffer. 00052 const char *BufferEnd; // End of the buffer. 00053 SourceLocation FileLoc; // Location for start of file. 00054 LangOptions LangOpts; // LangOpts enabled by this language (cache). 00055 bool Is_PragmaLexer; // True if lexer for _Pragma handling. 00056 00057 //===--------------------------------------------------------------------===// 00058 // Context-specific lexing flags set by the preprocessor. 00059 // 00060 00061 /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace 00062 /// and return them as tokens. This is used for -C and -CC modes, and 00063 /// whitespace preservation can be useful for some clients that want to lex 00064 /// the file in raw mode and get every character from the file. 00065 /// 00066 /// When this is set to 2 it returns comments and whitespace. When set to 1 00067 /// it returns comments, when it is set to 0 it returns normal tokens only. 00068 unsigned char ExtendedTokenMode; 00069 00070 //===--------------------------------------------------------------------===// 00071 // Context that changes as the file is lexed. 00072 // NOTE: any state that mutates when in raw mode must have save/restore code 00073 // in Lexer::isNextPPTokenLParen. 00074 00075 // BufferPtr - Current pointer into the buffer. This is the next character 00076 // to be lexed. 00077 const char *BufferPtr; 00078 00079 // IsAtStartOfLine - True if the next lexed token should get the "start of 00080 // line" flag set on it. 00081 bool IsAtStartOfLine; 00082 00083 bool IsAtPhysicalStartOfLine; 00084 00085 bool HasLeadingSpace; 00086 00087 bool HasLeadingEmptyMacro; 00088 00089 // CurrentConflictMarkerState - The kind of conflict marker we are handling. 
00090 ConflictMarkerKind CurrentConflictMarkerState; 00091 00092 Lexer(const Lexer &) LLVM_DELETED_FUNCTION; 00093 void operator=(const Lexer &) LLVM_DELETED_FUNCTION; 00094 friend class Preprocessor; 00095 00096 void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd); 00097 public: 00098 00099 /// Lexer constructor - Create a new lexer object for the specified buffer 00100 /// with the specified preprocessor managing the lexing process. This lexer 00101 /// assumes that the associated file buffer and Preprocessor objects will 00102 /// outlive it, so it doesn't take ownership of either of them. 00103 Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer, Preprocessor &PP); 00104 00105 /// Lexer constructor - Create a new raw lexer object. This object is only 00106 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the 00107 /// text range will outlive it, so it doesn't take ownership of it. 00108 Lexer(SourceLocation FileLoc, const LangOptions &LangOpts, 00109 const char *BufStart, const char *BufPtr, const char *BufEnd); 00110 00111 /// Lexer constructor - Create a new raw lexer object. This object is only 00112 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the 00113 /// text range will outlive it, so it doesn't take ownership of it. 00114 Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer, 00115 const SourceManager &SM, const LangOptions &LangOpts); 00116 00117 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 00118 /// _Pragma expansion. This has a variety of magic semantics that this method 00119 /// sets up. It returns a new'd Lexer that must be delete'd when done. 00120 static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc, 00121 SourceLocation ExpansionLocStart, 00122 SourceLocation ExpansionLocEnd, 00123 unsigned TokLen, Preprocessor &PP); 00124 00125 00126 /// getLangOpts - Return the language features currently enabled. 
00127 /// NOTE: this lexer modifies features as a file is parsed! 00128 const LangOptions &getLangOpts() const { return LangOpts; } 00129 00130 /// getFileLoc - Return the File Location for the file we are lexing out of. 00131 /// The physical location encodes the location where the characters come from, 00132 /// the virtual location encodes where we should *claim* the characters came 00133 /// from. Currently this is only used by _Pragma handling. 00134 SourceLocation getFileLoc() const { return FileLoc; } 00135 00136 private: 00137 /// Lex - Return the next token in the file. If this is the end of file, it 00138 /// return the tok::eof token. This implicitly involves the preprocessor. 00139 bool Lex(Token &Result); 00140 00141 public: 00142 /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma. 00143 bool isPragmaLexer() const { return Is_PragmaLexer; } 00144 00145 private: 00146 /// IndirectLex - An indirect call to 'Lex' that can be invoked via 00147 /// the PreprocessorLexer interface. 00148 void IndirectLex(Token &Result) override { Lex(Result); } 00149 00150 public: 00151 /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no 00152 /// associated preprocessor object. Return true if the 'next character to 00153 /// read' pointer points at the end of the lexer buffer, false otherwise. 00154 bool LexFromRawLexer(Token &Result) { 00155 assert(LexingRawMode && "Not already in raw mode!"); 00156 Lex(Result); 00157 // Note that lexing to the end of the buffer doesn't implicitly delete the 00158 // lexer when in raw mode. 00159 return BufferPtr == BufferEnd; 00160 } 00161 00162 /// isKeepWhitespaceMode - Return true if the lexer should return tokens for 00163 /// every character in the file, including whitespace and comments. This 00164 /// should only be used in raw mode, as the preprocessor is not prepared to 00165 /// deal with the excess tokens. 
00166 bool isKeepWhitespaceMode() const { 00167 return ExtendedTokenMode > 1; 00168 } 00169 00170 /// SetKeepWhitespaceMode - This method lets clients enable or disable 00171 /// whitespace retention mode. 00172 void SetKeepWhitespaceMode(bool Val) { 00173 assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) && 00174 "Can only retain whitespace in raw mode or -traditional-cpp"); 00175 ExtendedTokenMode = Val ? 2 : 0; 00176 } 00177 00178 /// inKeepCommentMode - Return true if the lexer should return comments as 00179 /// tokens. 00180 bool inKeepCommentMode() const { 00181 return ExtendedTokenMode > 0; 00182 } 00183 00184 /// SetCommentRetentionMode - Change the comment retention mode of the lexer 00185 /// to the specified mode. This is really only useful when lexing in raw 00186 /// mode, because otherwise the lexer needs to manage this. 00187 void SetCommentRetentionState(bool Mode) { 00188 assert(!isKeepWhitespaceMode() && 00189 "Can't play with comment retention state when retaining whitespace"); 00190 ExtendedTokenMode = Mode ? 1 : 0; 00191 } 00192 00193 /// Sets the extended token mode back to its initial value, according to the 00194 /// language options and preprocessor. This controls whether the lexer 00195 /// produces comment and whitespace tokens. 00196 /// 00197 /// This requires the lexer to have an associated preprocessor. A standalone 00198 /// lexer has nothing to reset to. 00199 void resetExtendedTokenMode(); 00200 00201 /// Gets source code buffer. 00202 StringRef getBuffer() const { 00203 return StringRef(BufferStart, BufferEnd - BufferStart); 00204 } 00205 00206 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 00207 /// uninterpreted string. This switches the lexer out of directive mode. 00208 void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr); 00209 00210 00211 /// Diag - Forwarding function for diagnostics. 
This translate a source 00212 /// position in the current buffer into a SourceLocation object for rendering. 00213 DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const; 00214 00215 /// getSourceLocation - Return a source location identifier for the specified 00216 /// offset in the current file. 00217 SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const; 00218 00219 /// getSourceLocation - Return a source location for the next character in 00220 /// the current file. 00221 SourceLocation getSourceLocation() override { 00222 return getSourceLocation(BufferPtr); 00223 } 00224 00225 /// \brief Return the current location in the buffer. 00226 const char *getBufferLocation() const { return BufferPtr; } 00227 00228 /// Stringify - Convert the specified string into a C string by escaping '\' 00229 /// and " characters. This does not add surrounding ""'s to the string. 00230 /// If Charify is true, this escapes the ' character instead of ". 00231 static std::string Stringify(const std::string &Str, bool Charify = false); 00232 00233 /// Stringify - Convert the specified string into a C string by escaping '\' 00234 /// and " characters. This does not add surrounding ""'s to the string. 00235 static void Stringify(SmallVectorImpl<char> &Str); 00236 00237 00238 /// getSpelling - This method is used to get the spelling of a token into a 00239 /// preallocated buffer, instead of as an std::string. The caller is required 00240 /// to allocate enough space for the token, which is guaranteed to be at least 00241 /// Tok.getLength() bytes long. The length of the actual result is returned. 00242 /// 00243 /// Note that this method may do two possible things: it may either fill in 00244 /// the buffer specified with characters, or it may *change the input pointer* 00245 /// to point to a constant buffer with the data already in it (avoiding a 00246 /// copy). 
The caller is not allowed to modify the returned buffer pointer 00247 /// if an internal buffer is returned. 00248 static unsigned getSpelling(const Token &Tok, const char *&Buffer, 00249 const SourceManager &SourceMgr, 00250 const LangOptions &LangOpts, 00251 bool *Invalid = nullptr); 00252 00253 /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of a 00254 /// token is the characters used to represent the token in the source file 00255 /// after trigraph expansion and escaped-newline folding. In particular, this 00256 /// wants to get the true, uncanonicalized, spelling of things like digraphs 00257 /// UCNs, etc. 00258 static std::string getSpelling(const Token &Tok, 00259 const SourceManager &SourceMgr, 00260 const LangOptions &LangOpts, 00261 bool *Invalid = nullptr); 00262 00263 /// getSpelling - This method is used to get the spelling of the 00264 /// token at the given source location. If, as is usually true, it 00265 /// is not necessary to copy any data, then the returned string may 00266 /// not point into the provided buffer. 00267 /// 00268 /// This method lexes at the expansion depth of the given 00269 /// location and does not jump to the expansion or spelling 00270 /// location. 00271 static StringRef getSpelling(SourceLocation loc, 00272 SmallVectorImpl<char> &buffer, 00273 const SourceManager &SourceMgr, 00274 const LangOptions &LangOpts, 00275 bool *invalid = nullptr); 00276 00277 /// MeasureTokenLength - Relex the token at the specified location and return 00278 /// its length in bytes in the input file. If the token needs cleaning (e.g. 00279 /// includes a trigraph or an escaped newline) then this count includes bytes 00280 /// that are part of that. 00281 static unsigned MeasureTokenLength(SourceLocation Loc, 00282 const SourceManager &SM, 00283 const LangOptions &LangOpts); 00284 00285 /// \brief Relex the token at the specified location. 00286 /// \returns true if there was a failure, false on success. 
00287 static bool getRawToken(SourceLocation Loc, Token &Result, 00288 const SourceManager &SM, 00289 const LangOptions &LangOpts, 00290 bool IgnoreWhiteSpace = false); 00291 00292 /// \brief Given a location any where in a source buffer, find the location 00293 /// that corresponds to the beginning of the token in which the original 00294 /// source location lands. 00295 static SourceLocation GetBeginningOfToken(SourceLocation Loc, 00296 const SourceManager &SM, 00297 const LangOptions &LangOpts); 00298 00299 /// AdvanceToTokenCharacter - If the current SourceLocation specifies a 00300 /// location at the start of a token, return a new location that specifies a 00301 /// character within the token. This handles trigraphs and escaped newlines. 00302 static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart, 00303 unsigned Character, 00304 const SourceManager &SM, 00305 const LangOptions &LangOpts); 00306 00307 /// \brief Computes the source location just past the end of the 00308 /// token at this source location. 00309 /// 00310 /// This routine can be used to produce a source location that 00311 /// points just past the end of the token referenced by \p Loc, and 00312 /// is generally used when a diagnostic needs to point just after a 00313 /// token where it expected something different that it received. If 00314 /// the returned source location would not be meaningful (e.g., if 00315 /// it points into a macro), this routine returns an invalid 00316 /// source location. 00317 /// 00318 /// \param Offset an offset from the end of the token, where the source 00319 /// location should refer to. The default offset (0) produces a source 00320 /// location pointing just past the end of the token; an offset of 1 produces 00321 /// a source location pointing to the last character in the token, etc. 
00322 static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 00323 const SourceManager &SM, 00324 const LangOptions &LangOpts); 00325 00326 /// \brief Returns true if the given MacroID location points at the first 00327 /// token of the macro expansion. 00328 /// 00329 /// \param MacroBegin If non-null and function returns true, it is set to 00330 /// begin location of the macro. 00331 static bool isAtStartOfMacroExpansion(SourceLocation loc, 00332 const SourceManager &SM, 00333 const LangOptions &LangOpts, 00334 SourceLocation *MacroBegin = nullptr); 00335 00336 /// \brief Returns true if the given MacroID location points at the last 00337 /// token of the macro expansion. 00338 /// 00339 /// \param MacroEnd If non-null and function returns true, it is set to 00340 /// end location of the macro. 00341 static bool isAtEndOfMacroExpansion(SourceLocation loc, 00342 const SourceManager &SM, 00343 const LangOptions &LangOpts, 00344 SourceLocation *MacroEnd = nullptr); 00345 00346 /// \brief Accepts a range and returns a character range with file locations. 00347 /// 00348 /// Returns a null range if a part of the range resides inside a macro 00349 /// expansion or the range does not reside on the same FileID. 00350 /// 00351 /// This function is trying to deal with macros and return a range based on 00352 /// file locations. 
The cases where it can successfully handle macros are: 00353 /// 00354 /// -begin or end range lies at the start or end of a macro expansion, in 00355 /// which case the location will be set to the expansion point, e.g: 00356 /// \#define M 1 2 00357 /// a M 00358 /// If you have a range [a, 2] (where 2 came from the macro), the function 00359 /// will return a range for "a M" 00360 /// if you have range [a, 1], the function will fail because the range 00361 /// overlaps with only a part of the macro 00362 /// 00363 /// -The macro is a function macro and the range can be mapped to the macro 00364 /// arguments, e.g: 00365 /// \#define M 1 2 00366 /// \#define FM(x) x 00367 /// FM(a b M) 00368 /// if you have range [b, 2], the function will return the file range "b M" 00369 /// inside the macro arguments. 00370 /// if you have range [a, 2], the function will return the file range 00371 /// "FM(a b M)" since the range includes all of the macro expansion. 00372 static CharSourceRange makeFileCharRange(CharSourceRange Range, 00373 const SourceManager &SM, 00374 const LangOptions &LangOpts); 00375 00376 /// \brief Returns a string for the source that the range encompasses. 00377 static StringRef getSourceText(CharSourceRange Range, 00378 const SourceManager &SM, 00379 const LangOptions &LangOpts, 00380 bool *Invalid = nullptr); 00381 00382 /// \brief Retrieve the name of the immediate macro expansion. 00383 /// 00384 /// This routine starts from a source location, and finds the name of the macro 00385 /// responsible for its immediate expansion. It looks through any intervening 00386 /// macro argument expansions to compute this. It returns a StringRef which 00387 /// refers to the SourceManager-owned buffer of the source where that macro 00388 /// name is spelled. Thus, the result shouldn't out-live that SourceManager. 
00389 static StringRef getImmediateMacroName(SourceLocation Loc, 00390 const SourceManager &SM, 00391 const LangOptions &LangOpts); 00392 00393 /// \brief Compute the preamble of the given file. 00394 /// 00395 /// The preamble of a file contains the initial comments, include directives, 00396 /// and other preprocessor directives that occur before the code in this 00397 /// particular file actually begins. The preamble of the main source file is 00398 /// a potential prefix header. 00399 /// 00400 /// \param Buffer The memory buffer containing the file's contents. 00401 /// 00402 /// \param MaxLines If non-zero, restrict the length of the preamble 00403 /// to fewer than this number of lines. 00404 /// 00405 /// \returns The offset into the file where the preamble ends and the rest 00406 /// of the file begins along with a boolean value indicating whether 00407 /// the preamble ends at the beginning of a new line. 00408 static std::pair<unsigned, bool> ComputePreamble(StringRef Buffer, 00409 const LangOptions &LangOpts, 00410 unsigned MaxLines = 0); 00411 00412 /// \brief Checks that the given token is the first token that occurs after 00413 /// the given location (this excludes comments and whitespace). Returns the 00414 /// location immediately after the specified token. If the token is not found 00415 /// or the location is inside a macro, the returned source location will be 00416 /// invalid. 00417 static SourceLocation findLocationAfterToken(SourceLocation loc, 00418 tok::TokenKind TKind, 00419 const SourceManager &SM, 00420 const LangOptions &LangOpts, 00421 bool SkipTrailingWhitespaceAndNewLine); 00422 00423 /// \brief Returns true if the given character could appear in an identifier. 00424 static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts); 00425 00426 /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever 00427 /// emit a warning. 
00428 static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size, 00429 const LangOptions &LangOpts) { 00430 // If this is not a trigraph and not a UCN or escaped newline, return 00431 // quickly. 00432 if (isObviouslySimpleCharacter(Ptr[0])) { 00433 Size = 1; 00434 return *Ptr; 00435 } 00436 00437 Size = 0; 00438 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 00439 } 00440 00441 //===--------------------------------------------------------------------===// 00442 // Internal implementation interfaces. 00443 private: 00444 00445 /// LexTokenInternal - Internal interface to lex a preprocessing token. Called 00446 /// by Lex. 00447 /// 00448 bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine); 00449 00450 bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr); 00451 00452 /// Given that a token begins with the Unicode character \p C, figure out 00453 /// what kind of token it is and dispatch to the appropriate lexing helper 00454 /// function. 00455 bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr); 00456 00457 /// FormTokenWithChars - When we lex a token, we have identified a span 00458 /// starting at BufferPtr, going to TokEnd that forms the token. This method 00459 /// takes that range and assigns it to the token as its location and size. In 00460 /// addition, since tokens cannot overlap, this also updates BufferPtr to be 00461 /// TokEnd. 00462 void FormTokenWithChars(Token &Result, const char *TokEnd, 00463 tok::TokenKind Kind) { 00464 unsigned TokLen = TokEnd-BufferPtr; 00465 Result.setLength(TokLen); 00466 Result.setLocation(getSourceLocation(BufferPtr, TokLen)); 00467 Result.setKind(Kind); 00468 BufferPtr = TokEnd; 00469 } 00470 00471 /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a 00472 /// tok::l_paren token, 0 if it is something else and 2 if there are no more 00473 /// tokens in the buffer controlled by this lexer. 
00474 unsigned isNextPPTokenLParen(); 00475 00476 //===--------------------------------------------------------------------===// 00477 // Lexer character reading interfaces. 00478 00479 // This lexer is built on two interfaces for reading characters, both of which 00480 // automatically provide phase 1/2 translation. getAndAdvanceChar is used 00481 // when we know that we will be reading a character from the input buffer and 00482 // that this character will be part of the result token. This occurs in (f.e.) 00483 // string processing, because we know we need to read until we find the 00484 // closing '"' character. 00485 // 00486 // The second interface is the combination of getCharAndSize with 00487 // ConsumeChar. getCharAndSize reads a phase 1/2 translated character, 00488 // returning it and its size. If the lexer decides that this character is 00489 // part of the current token, it calls ConsumeChar on it. This two stage 00490 // approach allows us to emit diagnostics for characters (e.g. warnings about 00491 // trigraphs), knowing that they only are emitted if the character is 00492 // consumed. 00493 00494 /// isObviouslySimpleCharacter - Return true if the specified character is 00495 /// obviously the same in translation phase 1 and translation phase 3. This 00496 /// can return false for characters that end up being the same, but it will 00497 /// never return true for something that needs to be mapped. 00498 static bool isObviouslySimpleCharacter(char C) { 00499 return C != '?' && C != '\\'; 00500 } 00501 00502 /// getAndAdvanceChar - Read a single 'character' from the specified buffer, 00503 /// advance over it, and return it. This is tricky in several cases. Here we 00504 /// just handle the trivial case and fall-back to the non-inlined 00505 /// getCharAndSizeSlow method to handle the hard case. 
00506 inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) { 00507 // If this is not a trigraph and not a UCN or escaped newline, return 00508 // quickly. 00509 if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++; 00510 00511 unsigned Size = 0; 00512 char C = getCharAndSizeSlow(Ptr, Size, &Tok); 00513 Ptr += Size; 00514 return C; 00515 } 00516 00517 /// ConsumeChar - When a character (identified by getCharAndSize) is consumed 00518 /// and added to a given token, check to see if there are diagnostics that 00519 /// need to be emitted or flags that need to be set on the token. If so, do 00520 /// it. 00521 const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) { 00522 // Normal case, we consumed exactly one token. Just return it. 00523 if (Size == 1) 00524 return Ptr+Size; 00525 00526 // Otherwise, re-lex the character with a current token, allowing 00527 // diagnostics to be emitted and flags to be set. 00528 Size = 0; 00529 getCharAndSizeSlow(Ptr, Size, &Tok); 00530 return Ptr+Size; 00531 } 00532 00533 /// getCharAndSize - Peek a single 'character' from the specified buffer, 00534 /// get its size, and return it. This is tricky in several cases. Here we 00535 /// just handle the trivial case and fall-back to the non-inlined 00536 /// getCharAndSizeSlow method to handle the hard case. 00537 inline char getCharAndSize(const char *Ptr, unsigned &Size) { 00538 // If this is not a trigraph and not a UCN or escaped newline, return 00539 // quickly. 00540 if (isObviouslySimpleCharacter(Ptr[0])) { 00541 Size = 1; 00542 return *Ptr; 00543 } 00544 00545 Size = 0; 00546 return getCharAndSizeSlow(Ptr, Size); 00547 } 00548 00549 /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize 00550 /// method. 00551 char getCharAndSizeSlow(const char *Ptr, unsigned &Size, 00552 Token *Tok = nullptr); 00553 00554 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 00555 /// or 0 if it is not an escaped newline. 
P[-1] is known to be a "\" on entry 00556 /// to this function. 00557 static unsigned getEscapedNewLineSize(const char *P); 00558 00559 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 00560 /// them), skip over them and return the first non-escaped-newline found, 00561 /// otherwise return P. 00562 static const char *SkipEscapedNewLines(const char *P); 00563 00564 /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a 00565 /// diagnostic. 00566 static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 00567 const LangOptions &LangOpts); 00568 00569 //===--------------------------------------------------------------------===// 00570 // Other lexer functions. 00571 00572 void SkipBytes(unsigned Bytes, bool StartOfLine); 00573 00574 void PropagateLineStartLeadingSpaceInfo(Token &Result); 00575 00576 const char *LexUDSuffix(Token &Result, const char *CurPtr, 00577 bool IsStringLiteral); 00578 00579 // Helper functions to lex the remainder of a token of the specific type. 
00580 bool LexIdentifier (Token &Result, const char *CurPtr); 00581 bool LexNumericConstant (Token &Result, const char *CurPtr); 00582 bool LexStringLiteral (Token &Result, const char *CurPtr, 00583 tok::TokenKind Kind); 00584 bool LexRawStringLiteral (Token &Result, const char *CurPtr, 00585 tok::TokenKind Kind); 00586 bool LexAngledStringLiteral(Token &Result, const char *CurPtr); 00587 bool LexCharConstant (Token &Result, const char *CurPtr, 00588 tok::TokenKind Kind); 00589 bool LexEndOfFile (Token &Result, const char *CurPtr); 00590 bool SkipWhitespace (Token &Result, const char *CurPtr, 00591 bool &TokAtPhysicalStartOfLine); 00592 bool SkipLineComment (Token &Result, const char *CurPtr, 00593 bool &TokAtPhysicalStartOfLine); 00594 bool SkipBlockComment (Token &Result, const char *CurPtr, 00595 bool &TokAtPhysicalStartOfLine); 00596 bool SaveLineComment (Token &Result, const char *CurPtr); 00597 00598 bool IsStartOfConflictMarker(const char *CurPtr); 00599 bool HandleEndOfConflictMarker(const char *CurPtr); 00600 00601 bool isCodeCompletionPoint(const char *CurPtr) const; 00602 void cutOffLexing() { BufferPtr = BufferEnd; } 00603 00604 bool isHexaLiteral(const char *Start, const LangOptions &LangOpts); 00605 00606 00607 /// Read a universal character name. 00608 /// 00609 /// \param CurPtr The position in the source buffer after the initial '\'. 00610 /// If the UCN is syntactically well-formed (but not necessarily 00611 /// valid), this parameter will be updated to point to the 00612 /// character after the UCN. 00613 /// \param SlashLoc The position in the source buffer of the '\'. 00614 /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics 00615 /// and handle token formation in the caller. 00616 /// 00617 /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is 00618 /// invalid. 
00619 uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok); 00620 00621 /// \brief Try to consume a UCN as part of an identifier at the current 00622 /// location. 00623 /// \param CurPtr Initially points to the range of characters in the source 00624 /// buffer containing the '\'. Updated to point past the end of 00625 /// the UCN on success. 00626 /// \param Size The number of characters occupied by the '\' (including 00627 /// trigraphs and escaped newlines). 00628 /// \param Result The token being produced. Marked as containing a UCN on 00629 /// success. 00630 /// \return \c true if a UCN was lexed and it produced an acceptable 00631 /// identifier character, \c false otherwise. 00632 bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, 00633 Token &Result); 00634 00635 /// \brief Try to consume an identifier character encoded in UTF-8. 00636 /// \param CurPtr Points to the start of the (potential) UTF-8 code unit 00637 /// sequence. On success, updated to point past the end of it. 00638 /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier 00639 /// character was lexed, \c false otherwise. 00640 bool tryConsumeIdentifierUTF8Char(const char *&CurPtr); 00641 }; 00642 00643 } // end namespace clang 00644 00645 #endif