clang API Documentation

CommentLexer.h
Go to the documentation of this file.
00001 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 //  This file defines the lexer for structured comments and its token class.
00011 //
00012 //===----------------------------------------------------------------------===//
00013 
00014 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
00015 #define LLVM_CLANG_AST_COMMENTLEXER_H
00016 
00017 #include "clang/Basic/Diagnostic.h"
00018 #include "clang/Basic/SourceManager.h"
00019 #include "llvm/ADT/SmallString.h"
00020 #include "llvm/ADT/SmallVector.h"
00021 #include "llvm/ADT/StringRef.h"
00022 #include "llvm/Support/Allocator.h"
00023 #include "llvm/Support/raw_ostream.h"
00024 
00025 namespace clang {
00026 namespace comments {
00027 
00028 class Lexer;                // Defined below; Token befriends it.
00029 class TextTokenRetokenizer; // Not defined in this header; Token befriends it.
00030 struct CommandInfo;         // Not defined here; passed by pointer to Lexer helpers.
00031 class CommandTraits;        // Not defined here; Lexer holds a const reference to it.
00032 
00033 namespace tok { // Token kinds; a Token stores one of these in Token::Kind.
00034 enum TokenKind {
00035   eof,
00036   newline,
00037   text,              // Carries text, see Token::getText().
00038   unknown_command,   // Command that does not have an ID; carries its name.
00039   backslash_command, // Command with an ID, that used backslash marker.
00040   at_command,        // Command with an ID, that used 'at' marker.
00041   verbatim_block_begin, // Carries an ID, see Token::getVerbatimBlockID().
00042   verbatim_block_line,  // Carries text, see Token::getVerbatimBlockText().
00043   verbatim_block_end,   // Carries an ID, see Token::getVerbatimBlockID().
00044   verbatim_line_name,   // Carries an ID, see Token::getVerbatimLineID().
00045   verbatim_line_text,   // Carries text, see Token::getVerbatimLineText().
00046   html_start_tag,     // <tag
00047   html_ident,         // attr
00048   html_equals,        // =
00049   html_quoted_string, // "blah\"blah" or 'blah\'blah'
00050   html_greater,       // >
00051   html_slash_greater, // />
00052   html_end_tag        // </tag
00053 };
00054 } // end namespace tok
00055 
00056 /// \brief Comment token: a kind, a location, a spelling length and a payload.
00057 class Token {
00058   friend class Lexer;
00059   friend class TextTokenRetokenizer;
00060 
00061   /// The location of the token.
00062   SourceLocation Loc;
00063 
00064   /// The actual kind of the token.
00065   tok::TokenKind Kind;
00066 
00067   /// Length of the token spelling in comment.  Can be 0 for synthesized
00068   /// tokens.
00069   unsigned Length;
00070 
00071   /// Contains text value associated with a token (its length is in IntVal).
00072   const char *TextPtr;
00073 
00074   /// Integer value associated with a token.
00075   ///
00076   /// If the token is a known command, contains command ID and TextPtr is
00077   /// unused (command spelling can be found with CommandTraits).  Otherwise,
00078   /// contains the length of the string that starts at TextPtr.
00079   unsigned IntVal;
00080   
00081 public: // Each payload accessor below asserts the matching token kind.
00082   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
00083   void setLocation(SourceLocation SL) { Loc = SL; }
00084 
00085   SourceLocation getEndLocation() const LLVM_READONLY {
00086     if (Length == 0 || Length == 1) // Empty or one-character spelling.
00087       return Loc;
00088     return Loc.getLocWithOffset(Length - 1); // Loc moved forward by Length - 1.
00089   }
00090 
00091   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
00092   void setKind(tok::TokenKind K) { Kind = K; }
00093 
00094   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
00095   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
00096 
00097   unsigned getLength() const LLVM_READONLY { return Length; }
00098   void setLength(unsigned L) { Length = L; }
00099 
00100   StringRef getText() const LLVM_READONLY {
00101     assert(is(tok::text));
00102     return StringRef(TextPtr, IntVal);
00103   }
00104 
00105   void setText(StringRef Text) {
00106     assert(is(tok::text));
00107     TextPtr = Text.data(); // Only the pointer and size are stored.
00108     IntVal = Text.size();
00109   }
00110 
00111   StringRef getUnknownCommandName() const LLVM_READONLY {
00112     assert(is(tok::unknown_command));
00113     return StringRef(TextPtr, IntVal);
00114   }
00115 
00116   void setUnknownCommandName(StringRef Name) {
00117     assert(is(tok::unknown_command));
00118     TextPtr = Name.data();
00119     IntVal = Name.size();
00120   }
00121 
00122   unsigned getCommandID() const LLVM_READONLY {
00123     assert(is(tok::backslash_command) || is(tok::at_command));
00124     return IntVal; // For these kinds IntVal holds the command ID.
00125   }
00126 
00127   void setCommandID(unsigned ID) {
00128     assert(is(tok::backslash_command) || is(tok::at_command));
00129     IntVal = ID;
00130   }
00131 
00132   unsigned getVerbatimBlockID() const LLVM_READONLY {
00133     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
00134     return IntVal;
00135   }
00136 
00137   void setVerbatimBlockID(unsigned ID) {
00138     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
00139     IntVal = ID;
00140   }
00141 
00142   StringRef getVerbatimBlockText() const LLVM_READONLY {
00143     assert(is(tok::verbatim_block_line));
00144     return StringRef(TextPtr, IntVal);
00145   }
00146 
00147   void setVerbatimBlockText(StringRef Text) {
00148     assert(is(tok::verbatim_block_line));
00149     TextPtr = Text.data();
00150     IntVal = Text.size();
00151   }
00152 
00153   unsigned getVerbatimLineID() const LLVM_READONLY {
00154     assert(is(tok::verbatim_line_name));
00155     return IntVal;
00156   }
00157 
00158   void setVerbatimLineID(unsigned ID) {
00159     assert(is(tok::verbatim_line_name));
00160     IntVal = ID;
00161   }
00162 
00163   StringRef getVerbatimLineText() const LLVM_READONLY {
00164     assert(is(tok::verbatim_line_text));
00165     return StringRef(TextPtr, IntVal);
00166   }
00167 
00168   void setVerbatimLineText(StringRef Text) {
00169     assert(is(tok::verbatim_line_text));
00170     TextPtr = Text.data();
00171     IntVal = Text.size();
00172   }
00173 
00174   StringRef getHTMLTagStartName() const LLVM_READONLY {
00175     assert(is(tok::html_start_tag));
00176     return StringRef(TextPtr, IntVal);
00177   }
00178 
00179   void setHTMLTagStartName(StringRef Name) {
00180     assert(is(tok::html_start_tag));
00181     TextPtr = Name.data();
00182     IntVal = Name.size();
00183   }
00184 
00185   StringRef getHTMLIdent() const LLVM_READONLY {
00186     assert(is(tok::html_ident));
00187     return StringRef(TextPtr, IntVal);
00188   }
00189 
00190   void setHTMLIdent(StringRef Name) {
00191     assert(is(tok::html_ident));
00192     TextPtr = Name.data();
00193     IntVal = Name.size();
00194   }
00195 
00196   StringRef getHTMLQuotedString() const LLVM_READONLY {
00197     assert(is(tok::html_quoted_string));
00198     return StringRef(TextPtr, IntVal);
00199   }
00200 
00201   void setHTMLQuotedString(StringRef Str) {
00202     assert(is(tok::html_quoted_string));
00203     TextPtr = Str.data();
00204     IntVal = Str.size();
00205   }
00206 
00207   StringRef getHTMLTagEndName() const LLVM_READONLY {
00208     assert(is(tok::html_end_tag));
00209     return StringRef(TextPtr, IntVal);
00210   }
00211 
00212   void setHTMLTagEndName(StringRef Name) {
00213     assert(is(tok::html_end_tag));
00214     TextPtr = Name.data();
00215     IntVal = Name.size();
00216   }
00217 
00218   void dump(const Lexer &L, const SourceManager &SM) const; // Not defined here.
00219 };
00220 
00221 /// \brief Comment lexer.  Tokens are obtained through lex().
00222 class Lexer {
00223 private:
00224   Lexer(const Lexer &) LLVM_DELETED_FUNCTION;          // Copying is private.
00225   void operator=(const Lexer &) LLVM_DELETED_FUNCTION; // Assignment is private.
00226 
00227   /// Allocator for strings that are semantic values of tokens and have to be
00228   /// computed (for example, resolved decimal character references).
00229   llvm::BumpPtrAllocator &Allocator;
00230 
00231   DiagnosticsEngine &Diags; // Diag() reports through this engine.
00232   
00233   const CommandTraits &Traits;
00234 
00235   const char *const BufferStart; // Buffer bounds, see getSourceLocation().
00236   const char *const BufferEnd;
00237   SourceLocation FileLoc; // Source location that corresponds to BufferStart.
00238 
00239   const char *BufferPtr; // Start of the next text token, see formTextToken().
00240 
00241   /// One past end pointer for the current comment.  For BCPL comments points
00242   /// to newline or BufferEnd, for C comments points to star in '*/'.
00243   const char *CommentEnd;
00244 
00245   enum LexerCommentState {
00246     LCS_BeforeComment,
00247     LCS_InsideBCPLComment,
00248     LCS_InsideCComment,
00249     LCS_BetweenComments
00250   };
00251 
00252   /// Low-level lexer state, track if we are inside or outside of comment.
00253   LexerCommentState CommentState;
00254 
00255   enum LexerState {
00256     /// Lexing normal comment text
00257     LS_Normal,
00258 
00259     /// Finished lexing verbatim block beginning command, will lex first body
00260     /// line.
00261     LS_VerbatimBlockFirstLine,
00262 
00263     /// Lexing verbatim block body line-by-line, skipping line-starting
00264     /// decorations.
00265     LS_VerbatimBlockBody,
00266 
00267     /// Finished lexing verbatim line beginning command, will lex text (one
00268     /// line).
00269     LS_VerbatimLineText,
00270 
00271     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
00272     LS_HTMLStartTag,
00273 
00274     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
00275     LS_HTMLEndTag
00276   };
00277 
00278   /// Current lexing mode.
00279   LexerState State;
00280 
00281   /// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains
00282   /// the name of verbatim end command, including command marker.
00283   SmallString<16> VerbatimBlockEndCommandName;
00284 
00285   /// Given a character reference name (e.g., "lt"), return the character that
00286   /// it stands for (e.g., "<").
00287   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
00288 
00289   /// Given a Unicode codepoint as base-10 integer, return the character.
00290   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
00291 
00292   /// Given a Unicode codepoint as base-16 integer, return the character.
00293   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
00294 
00295   void formTokenWithChars(Token &Result, const char *TokEnd,
00296                           tok::TokenKind Kind);
00297 
00298   void formTextToken(Token &Result, const char *TokEnd) {
00299     StringRef Text(BufferPtr, TokEnd - BufferPtr); // Spans [BufferPtr, TokEnd).
00300     formTokenWithChars(Result, TokEnd, tok::text); // NOTE(review): presumably moves BufferPtr; confirm.
00301     Result.setText(Text); // setText() asserts the kind is tok::text.
00302   }
00303 
00304   SourceLocation getSourceLocation(const char *Loc) const {
00305     assert(Loc >= BufferStart && Loc <= BufferEnd &&
00306            "Location out of range for this buffer!");
00307 
00308     const unsigned CharNo = Loc - BufferStart; // Offset of Loc in the buffer.
00309     return FileLoc.getLocWithOffset(CharNo);
00310   }
00311 
00312   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
00313     return Diags.Report(Loc, DiagID); // Thin wrapper over Diags.Report().
00314   }
00315 
00316   /// Eat string matching regexp \code \s*\* \endcode.
00317   void skipLineStartingDecorations();
00318 
00319   /// Lex stuff inside comments.  CommentEnd should be set correctly.
00320   void lexCommentText(Token &T);
00321 
00322   void setupAndLexVerbatimBlock(Token &T,
00323                                 const char *TextBegin,
00324                                 char Marker, const CommandInfo *Info);
00325 
00326   void lexVerbatimBlockFirstLine(Token &T);
00327 
00328   void lexVerbatimBlockBody(Token &T);
00329 
00330   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
00331                                const CommandInfo *Info);
00332 
00333   void lexVerbatimLineText(Token &T);
00334 
00335   void lexHTMLCharacterReference(Token &T);
00336 
00337   void setupAndLexHTMLStartTag(Token &T);
00338 
00339   void lexHTMLStartTag(Token &T);
00340 
00341   void setupAndLexHTMLEndTag(Token &T);
00342 
00343   void lexHTMLEndTag(Token &T);
00344 
00345 public:
00346   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
00347         const CommandTraits &Traits,
00348         SourceLocation FileLoc,
00349         const char *BufferStart, const char *BufferEnd);
00350 
00351   void lex(Token &T);
00352 
00353   StringRef getSpelling(const Token &Tok,
00354                         const SourceManager &SourceMgr,
00355                         bool *Invalid = nullptr) const; // Invalid may be omitted.
00356 };
00357 
00358 } // end namespace comments
00359 } // end namespace clang
00360 
00361 #endif
00362