clang API Documentation
00001 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 // 00010 // This file defines lexer for structured comments and supporting token class. 00011 // 00012 //===----------------------------------------------------------------------===// 00013 00014 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H 00015 #define LLVM_CLANG_AST_COMMENTLEXER_H 00016 00017 #include "clang/Basic/Diagnostic.h" 00018 #include "clang/Basic/SourceManager.h" 00019 #include "llvm/ADT/SmallString.h" 00020 #include "llvm/ADT/SmallVector.h" 00021 #include "llvm/ADT/StringRef.h" 00022 #include "llvm/Support/Allocator.h" 00023 #include "llvm/Support/raw_ostream.h" 00024 00025 namespace clang { 00026 namespace comments { 00027 00028 class Lexer; 00029 class TextTokenRetokenizer; 00030 struct CommandInfo; 00031 class CommandTraits; 00032 00033 namespace tok { 00034 enum TokenKind { 00035 eof, 00036 newline, 00037 text, 00038 unknown_command, // Command that does not have an ID. 00039 backslash_command, // Command with an ID, that used backslash marker. 00040 at_command, // Command with an ID, that used 'at' marker. 00041 verbatim_block_begin, 00042 verbatim_block_line, 00043 verbatim_block_end, 00044 verbatim_line_name, 00045 verbatim_line_text, 00046 html_start_tag, // <tag 00047 html_ident, // attr 00048 html_equals, // = 00049 html_quoted_string, // "blah\"blah" or 'blah\'blah' 00050 html_greater, // > 00051 html_slash_greater, // /> 00052 html_end_tag // </tag 00053 }; 00054 } // end namespace tok 00055 00056 /// \brief Comment token. 00057 class Token { 00058 friend class Lexer; 00059 friend class TextTokenRetokenizer; 00060 00061 /// The location of the token. 00062 SourceLocation Loc; 00063 00064 /// The actual kind of the token. 00065 tok::TokenKind Kind; 00066 00067 /// Length of the token spelling in comment. Can be 0 for synthenized 00068 /// tokens. 00069 unsigned Length; 00070 00071 /// Contains text value associated with a token. 00072 const char *TextPtr; 00073 00074 /// Integer value associated with a token. 00075 /// 00076 /// If the token is a konwn command, contains command ID and TextPtr is 00077 /// unused (command spelling can be found with CommandTraits). Otherwise, 00078 /// contains the length of the string that starts at TextPtr. 00079 unsigned IntVal; 00080 00081 public: 00082 SourceLocation getLocation() const LLVM_READONLY { return Loc; } 00083 void setLocation(SourceLocation SL) { Loc = SL; } 00084 00085 SourceLocation getEndLocation() const LLVM_READONLY { 00086 if (Length == 0 || Length == 1) 00087 return Loc; 00088 return Loc.getLocWithOffset(Length - 1); 00089 } 00090 00091 tok::TokenKind getKind() const LLVM_READONLY { return Kind; } 00092 void setKind(tok::TokenKind K) { Kind = K; } 00093 00094 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } 00095 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 00096 00097 unsigned getLength() const LLVM_READONLY { return Length; } 00098 void setLength(unsigned L) { Length = L; } 00099 00100 StringRef getText() const LLVM_READONLY { 00101 assert(is(tok::text)); 00102 return StringRef(TextPtr, IntVal); 00103 } 00104 00105 void setText(StringRef Text) { 00106 assert(is(tok::text)); 00107 TextPtr = Text.data(); 00108 IntVal = Text.size(); 00109 } 00110 00111 StringRef getUnknownCommandName() const LLVM_READONLY { 00112 assert(is(tok::unknown_command)); 00113 return StringRef(TextPtr, IntVal); 00114 } 00115 00116 void setUnknownCommandName(StringRef Name) { 00117 assert(is(tok::unknown_command)); 00118 TextPtr = Name.data(); 00119 IntVal = Name.size(); 00120 } 00121 00122 unsigned getCommandID() const LLVM_READONLY { 00123 assert(is(tok::backslash_command) || is(tok::at_command)); 00124 return IntVal; 00125 } 00126 00127 void setCommandID(unsigned ID) { 00128 assert(is(tok::backslash_command) || is(tok::at_command)); 00129 IntVal = ID; 00130 } 00131 00132 unsigned getVerbatimBlockID() const LLVM_READONLY { 00133 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 00134 return IntVal; 00135 } 00136 00137 void setVerbatimBlockID(unsigned ID) { 00138 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 00139 IntVal = ID; 00140 } 00141 00142 StringRef getVerbatimBlockText() const LLVM_READONLY { 00143 assert(is(tok::verbatim_block_line)); 00144 return StringRef(TextPtr, IntVal); 00145 } 00146 00147 void setVerbatimBlockText(StringRef Text) { 00148 assert(is(tok::verbatim_block_line)); 00149 TextPtr = Text.data(); 00150 IntVal = Text.size(); 00151 } 00152 00153 unsigned getVerbatimLineID() const LLVM_READONLY { 00154 assert(is(tok::verbatim_line_name)); 00155 return IntVal; 00156 } 00157 00158 void setVerbatimLineID(unsigned ID) { 00159 assert(is(tok::verbatim_line_name)); 00160 IntVal = ID; 00161 } 00162 00163 StringRef getVerbatimLineText() const LLVM_READONLY { 00164 assert(is(tok::verbatim_line_text)); 00165 return StringRef(TextPtr, IntVal); 00166 } 00167 00168 void setVerbatimLineText(StringRef Text) { 00169 assert(is(tok::verbatim_line_text)); 00170 TextPtr = Text.data(); 00171 IntVal = Text.size(); 00172 } 00173 00174 StringRef getHTMLTagStartName() const LLVM_READONLY { 00175 assert(is(tok::html_start_tag)); 00176 return StringRef(TextPtr, IntVal); 00177 } 00178 00179 void setHTMLTagStartName(StringRef Name) { 00180 assert(is(tok::html_start_tag)); 00181 TextPtr = Name.data(); 00182 IntVal = Name.size(); 00183 } 00184 00185 StringRef getHTMLIdent() const LLVM_READONLY { 00186 assert(is(tok::html_ident)); 00187 return StringRef(TextPtr, IntVal); 00188 } 00189 00190 void setHTMLIdent(StringRef Name) { 00191 assert(is(tok::html_ident)); 00192 TextPtr = Name.data(); 00193 IntVal = Name.size(); 00194 } 00195 00196 StringRef getHTMLQuotedString() const LLVM_READONLY { 00197 assert(is(tok::html_quoted_string)); 00198 return StringRef(TextPtr, IntVal); 00199 } 00200 00201 void setHTMLQuotedString(StringRef Str) { 00202 assert(is(tok::html_quoted_string)); 00203 TextPtr = Str.data(); 00204 IntVal = Str.size(); 00205 } 00206 00207 StringRef getHTMLTagEndName() const LLVM_READONLY { 00208 assert(is(tok::html_end_tag)); 00209 return StringRef(TextPtr, IntVal); 00210 } 00211 00212 void setHTMLTagEndName(StringRef Name) { 00213 assert(is(tok::html_end_tag)); 00214 TextPtr = Name.data(); 00215 IntVal = Name.size(); 00216 } 00217 00218 void dump(const Lexer &L, const SourceManager &SM) const; 00219 }; 00220 00221 /// \brief Comment lexer. 00222 class Lexer { 00223 private: 00224 Lexer(const Lexer &) LLVM_DELETED_FUNCTION; 00225 void operator=(const Lexer &) LLVM_DELETED_FUNCTION; 00226 00227 /// Allocator for strings that are semantic values of tokens and have to be 00228 /// computed (for example, resolved decimal character references). 00229 llvm::BumpPtrAllocator &Allocator; 00230 00231 DiagnosticsEngine &Diags; 00232 00233 const CommandTraits &Traits; 00234 00235 const char *const BufferStart; 00236 const char *const BufferEnd; 00237 SourceLocation FileLoc; 00238 00239 const char *BufferPtr; 00240 00241 /// One past end pointer for the current comment. For BCPL comments points 00242 /// to newline or BufferEnd, for C comments points to star in '*/'. 00243 const char *CommentEnd; 00244 00245 enum LexerCommentState { 00246 LCS_BeforeComment, 00247 LCS_InsideBCPLComment, 00248 LCS_InsideCComment, 00249 LCS_BetweenComments 00250 }; 00251 00252 /// Low-level lexer state, track if we are inside or outside of comment. 00253 LexerCommentState CommentState; 00254 00255 enum LexerState { 00256 /// Lexing normal comment text 00257 LS_Normal, 00258 00259 /// Finished lexing verbatim block beginning command, will lex first body 00260 /// line. 00261 LS_VerbatimBlockFirstLine, 00262 00263 /// Lexing verbatim block body line-by-line, skipping line-starting 00264 /// decorations. 00265 LS_VerbatimBlockBody, 00266 00267 /// Finished lexing verbatim line beginning command, will lex text (one 00268 /// line). 00269 LS_VerbatimLineText, 00270 00271 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 00272 LS_HTMLStartTag, 00273 00274 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 00275 LS_HTMLEndTag 00276 }; 00277 00278 /// Current lexing mode. 00279 LexerState State; 00280 00281 /// If State is LS_VerbatimBlock, contains the name of verbatim end 00282 /// command, including command marker. 00283 SmallString<16> VerbatimBlockEndCommandName; 00284 00285 /// Given a character reference name (e.g., "lt"), return the character that 00286 /// it stands for (e.g., "<"). 00287 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 00288 00289 /// Given a Unicode codepoint as base-10 integer, return the character. 00290 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 00291 00292 /// Given a Unicode codepoint as base-16 integer, return the character. 00293 StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 00294 00295 void formTokenWithChars(Token &Result, const char *TokEnd, 00296 tok::TokenKind Kind); 00297 00298 void formTextToken(Token &Result, const char *TokEnd) { 00299 StringRef Text(BufferPtr, TokEnd - BufferPtr); 00300 formTokenWithChars(Result, TokEnd, tok::text); 00301 Result.setText(Text); 00302 } 00303 00304 SourceLocation getSourceLocation(const char *Loc) const { 00305 assert(Loc >= BufferStart && Loc <= BufferEnd && 00306 "Location out of range for this buffer!"); 00307 00308 const unsigned CharNo = Loc - BufferStart; 00309 return FileLoc.getLocWithOffset(CharNo); 00310 } 00311 00312 DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { 00313 return Diags.Report(Loc, DiagID); 00314 } 00315 00316 /// Eat string matching regexp \code \s*\* \endcode. 00317 void skipLineStartingDecorations(); 00318 00319 /// Lex stuff inside comments. CommentEnd should be set correctly. 00320 void lexCommentText(Token &T); 00321 00322 void setupAndLexVerbatimBlock(Token &T, 00323 const char *TextBegin, 00324 char Marker, const CommandInfo *Info); 00325 00326 void lexVerbatimBlockFirstLine(Token &T); 00327 00328 void lexVerbatimBlockBody(Token &T); 00329 00330 void setupAndLexVerbatimLine(Token &T, const char *TextBegin, 00331 const CommandInfo *Info); 00332 00333 void lexVerbatimLineText(Token &T); 00334 00335 void lexHTMLCharacterReference(Token &T); 00336 00337 void setupAndLexHTMLStartTag(Token &T); 00338 00339 void lexHTMLStartTag(Token &T); 00340 00341 void setupAndLexHTMLEndTag(Token &T); 00342 00343 void lexHTMLEndTag(Token &T); 00344 00345 public: 00346 Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 00347 const CommandTraits &Traits, 00348 SourceLocation FileLoc, 00349 const char *BufferStart, const char *BufferEnd); 00350 00351 void lex(Token &T); 00352 00353 StringRef getSpelling(const Token &Tok, 00354 const SourceManager &SourceMgr, 00355 bool *Invalid = nullptr) const; 00356 }; 00357 00358 } // end namespace comments 00359 } // end namespace clang 00360 00361 #endif 00362