clang: CommentLexer.cpp Source File

Go to the documentation of this file.
00001 #include "clang/AST/CommentLexer.h"
00002 #include "clang/AST/CommentCommandTraits.h"
00003 #include "clang/AST/CommentDiagnostic.h"
00004 #include "clang/Basic/CharInfo.h"
00005 #include "llvm/ADT/StringExtras.h"
00006 #include "llvm/ADT/StringSwitch.h"
00007 #include "llvm/Support/ConvertUTF.h"
00008 #include "llvm/Support/ErrorHandling.h"
00009 
00010 namespace clang {
00011 namespace comments {
00012 
00013 void Token::dump(const Lexer &L, const SourceManager &SM) const {
00014   llvm::errs() << "comments::Token Kind=" << Kind << " ";
00015   Loc.dump(SM);
00016   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
00017 }
00018 
00019 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
00020   return isLetter(C);
00021 }
00022 
00023 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
00024   return isDigit(C);
00025 }
00026 
00027 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
00028   return isHexDigit(C);
00029 }
00030 
00031 static inline StringRef convertCodePointToUTF8(
00032                                       llvm::BumpPtrAllocator &Allocator,
00033                                       unsigned CodePoint) {
00034   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
00035   char *ResolvedPtr = Resolved;
00036   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
00037     return StringRef(Resolved, ResolvedPtr - Resolved);
00038   else
00039     return StringRef();
00040 }
00041 
00042 namespace {
00043 
00044 #include "clang/AST/CommentHTMLTags.inc"
00045 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
00046 
00047 } // unnamed namespace
00048 
00049 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
00050   // Fast path, first check a few most widely used named character references.
00051   return llvm::StringSwitch<StringRef>(Name)
00052       .Case("amp", "&")
00053       .Case("lt", "<")
00054       .Case("gt", ">")
00055       .Case("quot", "\"")
00056       .Case("apos", "\'")
00057       // Slow path.
00058       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
00059 }
00060 
00061 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
00062   unsigned CodePoint = 0;
00063   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
00064     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
00065     CodePoint *= 10;
00066     CodePoint += Name[i] - '0';
00067   }
00068   return convertCodePointToUTF8(Allocator, CodePoint);
00069 }
00070 
00071 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
00072   unsigned CodePoint = 0;
00073   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
00074     CodePoint *= 16;
00075     const char C = Name[i];
00076     assert(isHTMLHexCharacterReferenceCharacter(C));
00077     CodePoint += llvm::hexDigitValue(C);
00078   }
00079   return convertCodePointToUTF8(Allocator, CodePoint);
00080 }
00081 
00082 void Lexer::skipLineStartingDecorations() {
00083   // This function should be called only for C comments
00084   assert(CommentState == LCS_InsideCComment);
00085 
00086   if (BufferPtr == CommentEnd)
00087     return;
00088 
00089   switch (*BufferPtr) {
00090   case ' ':
00091   case '\t':
00092   case '\f':
00093   case '\v': {
00094     const char *NewBufferPtr = BufferPtr;
00095     NewBufferPtr++;
00096     if (NewBufferPtr == CommentEnd)
00097       return;
00098 
00099     char C = *NewBufferPtr;
00100     while (isHorizontalWhitespace(C)) {
00101       NewBufferPtr++;
00102       if (NewBufferPtr == CommentEnd)
00103         return;
00104       C = *NewBufferPtr;
00105     }
00106     if (C == '*')
00107       BufferPtr = NewBufferPtr + 1;
00108     break;
00109   }
00110   case '*':
00111     BufferPtr++;
00112     break;
00113   }
00114 }
00115 
00116 namespace {
00117 /// Returns pointer to the first newline character in the string.
00118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
00119   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
00120     if (isVerticalWhitespace(*BufferPtr))
00121       return BufferPtr;
00122   }
00123   return BufferEnd;
00124 }
00125 
/// Skip one newline sequence ("\n", "\r" or "\r\n") starting at \p BufferPtr.
///
/// \returns the pointer just past the newline; \p BufferPtr itself when the
/// range is empty.  A non-empty range must start with '\n' or '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char *Next = BufferPtr + 1;
  if (*BufferPtr == '\n')
    return Next;

  assert(*BufferPtr == '\r');
  // A carriage return may be followed by a line feed that belongs to the
  // same newline.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
00140 
00141 const char *skipNamedCharacterReference(const char *BufferPtr,
00142                                         const char *BufferEnd) {
00143   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
00144     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
00145       return BufferPtr;
00146   }
00147   return BufferEnd;
00148 }
00149 
00150 const char *skipDecimalCharacterReference(const char *BufferPtr,
00151                                           const char *BufferEnd) {
00152   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
00153     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
00154       return BufferPtr;
00155   }
00156   return BufferEnd;
00157 }
00158 
00159 const char *skipHexCharacterReference(const char *BufferPtr,
00160                                       const char *BufferEnd) {
00161   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
00162     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
00163       return BufferPtr;
00164   }
00165   return BufferEnd;
00166 }
00167 
00168 bool isHTMLIdentifierStartingCharacter(char C) {
00169   return isLetter(C);
00170 }
00171 
00172 bool isHTMLIdentifierCharacter(char C) {
00173   return isAlphanumeric(C);
00174 }
00175 
00176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
00177   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
00178     if (!isHTMLIdentifierCharacter(*BufferPtr))
00179       return BufferPtr;
00180   }
00181   return BufferEnd;
00182 }
00183 
/// Skip an HTML string quoted in single or double quotes.  A quote character
/// that directly follows a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// \returns pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated inside the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool IsUnescapedDelimiter =
        *Cur == Delimiter && *(Cur - 1) != '\\';
    if (IsUnescapedDelimiter)
      break;
    ++Cur;
  }
  return Cur;
}
00201 
00202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
00203   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
00204     if (!isWhitespace(*BufferPtr))
00205       return BufferPtr;
00206   }
00207   return BufferEnd;
00208 }
00209 
00210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
00211   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
00212 }
00213 
00214 bool isCommandNameStartCharacter(char C) {
00215   return isLetter(C);
00216 }
00217 
00218 bool isCommandNameCharacter(char C) {
00219   return isAlphanumeric(C);
00220 }
00221 
00222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
00223   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
00224     if (!isCommandNameCharacter(*BufferPtr))
00225       return BufferPtr;
00226   }
00227   return BufferEnd;
00228 }
00229 
00230 /// Return the one past end pointer for BCPL comments.
00231 /// Handles newlines escaped with backslash or trigraph for backslahs.
00232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
00233   const char *CurPtr = BufferPtr;
00234   while (CurPtr != BufferEnd) {
00235     while (!isVerticalWhitespace(*CurPtr)) {
00236       CurPtr++;
00237       if (CurPtr == BufferEnd)
00238         return BufferEnd;
00239     }
00240     // We found a newline, check if it is escaped.
00241     const char *EscapePtr = CurPtr - 1;
00242     while(isHorizontalWhitespace(*EscapePtr))
00243       EscapePtr--;
00244 
00245     if (*EscapePtr == '\\' ||
00246         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
00247          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
00248       // We found an escaped newline.
00249       CurPtr = skipNewline(CurPtr, BufferEnd);
00250     } else
00251       return CurPtr; // Not an escaped newline.
00252   }
00253   return BufferEnd;
00254 }
00255 
00256 /// Return the one past end pointer for C comments.
00257 /// Very dumb, does not handle escaped newlines or trigraphs.
00258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
00259   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
00260     if (*BufferPtr == '*') {
00261       assert(BufferPtr + 1 != BufferEnd);
00262       if (*(BufferPtr + 1) == '/')
00263         return BufferPtr;
00264     }
00265   }
00266   llvm_unreachable("buffer end hit before '*/' was seen");
00267 }
00268     
00269 } // unnamed namespace
00270 
00271 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
00272                                tok::TokenKind Kind) {
00273   const unsigned TokLen = TokEnd - BufferPtr;
00274   Result.setLocation(getSourceLocation(BufferPtr));
00275   Result.setKind(Kind);
00276   Result.setLength(TokLen);
00277 #ifndef NDEBUG
00278   Result.TextPtr = "<UNSET>";
00279   Result.IntVal = 7;
00280 #endif
00281   BufferPtr = TokEnd;
00282 }
00283 
/// Lex a single token from the text of the current comment into \p T.
///
/// Must be called while inside a BCPL or C comment and with
/// BufferPtr < CommentEnd.  If the lexer is in one of the special states
/// (verbatim block/line, HTML start/end tag) the work is delegated to the
/// matching helper.  In the normal state the first character at BufferPtr
/// decides what is produced: a command or escape sequence ('\\' or '@'), an
/// HTML character reference ('&'), an HTML tag ('<'), a newline, or a run of
/// plain text.
00284 void Lexer::lexCommentText(Token &T) {
00285   assert(CommentState == LCS_InsideBCPLComment ||
00286          CommentState == LCS_InsideCComment);
00287 
00288   switch (State) {
00289   case LS_Normal:
00290     break;
00291   case LS_VerbatimBlockFirstLine:
00292     lexVerbatimBlockFirstLine(T);
00293     return;
00294   case LS_VerbatimBlockBody:
00295     lexVerbatimBlockBody(T);
00296     return;
00297   case LS_VerbatimLineText:
00298     lexVerbatimLineText(T);
00299     return;
00300   case LS_HTMLStartTag:
00301     lexHTMLStartTag(T);
00302     return;
00303   case LS_HTMLEndTag:
00304     lexHTMLEndTag(T);
00305     return;
00306   }
00307 
00308   assert(State == LS_Normal);
00309 
00310   const char *TokenPtr = BufferPtr;
00311   assert(TokenPtr < CommentEnd);
  // Every case of the switch below returns, so the loop body runs at most
  // once per call.
00312   while (TokenPtr != CommentEnd) {
00313     switch(*TokenPtr) {
00314       case '\\':
00315       case '@': {
00316         // Commands that start with a backslash and commands that start with
00317         // 'at' have equivalent semantics.  But we keep information about the
00318         // exact syntax in AST for comments.
00319         tok::TokenKind CommandKind =
00320             (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
00321         TokenPtr++;
        // A marker that is the last character of the comment is handed to
        // formTextToken().
00322         if (TokenPtr == CommentEnd) {
00323           formTextToken(T, TokenPtr);
00324           return;
00325         }
00326         char C = *TokenPtr;
00327         switch (C) {
00328         default:
00329           break;
00330 
00331         case '\\': case '@': case '&': case '$':
00332         case '#':  case '<': case '>': case '%':
00333         case '\"': case '.': case ':':
00334           // This is one of \\ \@ \& \$ etc escape sequences.
00335           TokenPtr++;
00336           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
00337             // This is the \:: escape sequence.
00338             TokenPtr++;
00339           }
          // The token covers the marker too, but its text is only the
          // characters after the marker.
00340           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
00341           formTokenWithChars(T, TokenPtr, tok::text);
00342           T.setText(UnescapedText);
00343           return;
00344         }
00345 
00346         // Don't make zero-length commands.
00347         if (!isCommandNameStartCharacter(*TokenPtr)) {
00348           formTextToken(T, TokenPtr);
00349           return;
00350         }
00351 
00352         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
        // Length of the command name, not counting the leading marker.
00353         unsigned Length = TokenPtr - (BufferPtr + 1);
00354 
00355         // Hardcoded support for lexing LaTeX formula commands
00356         // \f$ \f[ \f] \f{ \f} as a single command.
00357         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
00358           C = *TokenPtr;
00359           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
00360             TokenPtr++;
00361             Length++;
00362           }
00363         }
00364 
00365         StringRef CommandName(BufferPtr + 1, Length);
00366 
00367         const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
00368         if (!Info) {
          // Unknown name: if a typo correction is available, warn with a
          // fix-it and continue with the corrected command; otherwise emit an
          // unknown_command token.
00369           if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
00370             StringRef CorrectedName = Info->Name;
00371             SourceLocation Loc = getSourceLocation(BufferPtr);
00372             SourceRange CommandRange(Loc.getLocWithOffset(1),
00373                                      getSourceLocation(TokenPtr));
00374             Diag(Loc, diag::warn_correct_comment_command_name)
00375               << CommandName << CorrectedName
00376               << FixItHint::CreateReplacement(CommandRange, CorrectedName);
00377           } else {
00378             formTokenWithChars(T, TokenPtr, tok::unknown_command);
00379             T.setUnknownCommandName(CommandName);
00380             Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
00381             return;
00382           }
00383         }
00384         if (Info->IsVerbatimBlockCommand) {
          // *BufferPtr is still the marker character ('\\' or '@') here.
00385           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
00386           return;
00387         }
00388         if (Info->IsVerbatimLineCommand) {
00389           setupAndLexVerbatimLine(T, TokenPtr, Info);
00390           return;
00391         }
00392         formTokenWithChars(T, TokenPtr, CommandKind);
00393         T.setCommandID(Info->getID());
00394         return;
00395       }
00396 
00397       case '&':
00398         lexHTMLCharacterReference(T);
00399         return;
00400 
00401       case '<': {
00402         TokenPtr++;
00403         if (TokenPtr == CommentEnd) {
00404           formTextToken(T, TokenPtr);
00405           return;
00406         }
        // The character after '<' selects start tag, end tag or plain text.
00407         const char C = *TokenPtr;
00408         if (isHTMLIdentifierStartingCharacter(C))
00409           setupAndLexHTMLStartTag(T);
00410         else if (C == '/')
00411           setupAndLexHTMLEndTag(T);
00412         else
00413           formTextToken(T, TokenPtr);
00414 
00415         return;
00416       }
00417 
00418       case '\n':
00419       case '\r':
00420         TokenPtr = skipNewline(TokenPtr, CommentEnd);
00421         formTokenWithChars(T, TokenPtr, tok::newline);
00422 
        // In a C comment, also drop the decoration (e.g. " * ") that starts
        // the next line.
00423         if (CommentState == LCS_InsideCComment)
00424           skipLineStartingDecorations();
00425         return;
00426 
00427       default: {
        // Plain text: extend the token up to the next character that needs
        // special handling, or to the end of the comment.
00428         size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
00429                          find_first_of("\n\r\\@&<");
00430         if (End != StringRef::npos)
00431           TokenPtr += End;
00432         else
00433           TokenPtr = CommentEnd;
00434         formTextToken(T, TokenPtr);
00435         return;
00436       }
00437     }
00438   }
00439 }
00440 
00441 void Lexer::setupAndLexVerbatimBlock(Token &T,
00442                                      const char *TextBegin,
00443                                      char Marker, const CommandInfo *Info) {
00444   assert(Info->IsVerbatimBlockCommand);
00445 
00446   VerbatimBlockEndCommandName.clear();
00447   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
00448   VerbatimBlockEndCommandName.append(Info->EndCommandName);
00449 
00450   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
00451   T.setVerbatimBlockID(Info->getID());
00452 
00453   // If there is a newline following the verbatim opening command, skip the
00454   // newline so that we don't create an tok::verbatim_block_line with empty
00455   // text content.
00456   if (BufferPtr != CommentEnd &&
00457       isVerticalWhitespace(*BufferPtr)) {
00458     BufferPtr = skipNewline(BufferPtr, CommentEnd);
00459     State = LS_VerbatimBlockBody;
00460     return;
00461   }
00462 
00463   State = LS_VerbatimBlockFirstLine;
00464 }
00465 
00466 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
00467 again:
00468   assert(BufferPtr < CommentEnd);
00469 
00470   // FIXME: It would be better to scan the text once, finding either the block
00471   // end command or newline.
00472   //
00473   // Extract current line.
00474   const char *Newline = findNewline(BufferPtr, CommentEnd);
00475   StringRef Line(BufferPtr, Newline - BufferPtr);
00476 
00477   // Look for end command in current line.
00478   size_t Pos = Line.find(VerbatimBlockEndCommandName);
00479   const char *TextEnd;
00480   const char *NextLine;
00481   if (Pos == StringRef::npos) {
00482     // Current line is completely verbatim.
00483     TextEnd = Newline;
00484     NextLine = skipNewline(Newline, CommentEnd);
00485   } else if (Pos == 0) {
00486     // Current line contains just an end command.
00487     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
00488     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
00489     formTokenWithChars(T, End, tok::verbatim_block_end);
00490     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
00491     State = LS_Normal;
00492     return;
00493   } else {
00494     // There is some text, followed by end command.  Extract text first.
00495     TextEnd = BufferPtr + Pos;
00496     NextLine = TextEnd;
00497     // If there is only whitespace before end command, skip whitespace.
00498     if (isWhitespace(BufferPtr, TextEnd)) {
00499       BufferPtr = TextEnd;
00500       goto again;
00501     }
00502   }
00503 
00504   StringRef Text(BufferPtr, TextEnd - BufferPtr);
00505   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
00506   T.setVerbatimBlockText(Text);
00507 
00508   State = LS_VerbatimBlockBody;
00509 }
00510 
00511 void Lexer::lexVerbatimBlockBody(Token &T) {
00512   assert(State == LS_VerbatimBlockBody);
00513 
00514   if (CommentState == LCS_InsideCComment)
00515     skipLineStartingDecorations();
00516 
00517   lexVerbatimBlockFirstLine(T);
00518 }
00519 
00520 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
00521                                     const CommandInfo *Info) {
00522   assert(Info->IsVerbatimLineCommand);
00523   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
00524   T.setVerbatimLineID(Info->getID());
00525 
00526   State = LS_VerbatimLineText;
00527 }
00528 
00529 void Lexer::lexVerbatimLineText(Token &T) {
00530   assert(State == LS_VerbatimLineText);
00531 
00532   // Extract current line.
00533   const char *Newline = findNewline(BufferPtr, CommentEnd);
00534   StringRef Text(BufferPtr, Newline - BufferPtr);
00535   formTokenWithChars(T, Newline, tok::verbatim_line_text);
00536   T.setVerbatimLineText(Text);
00537 
00538   State = LS_Normal;
00539 }
00540 
00541 void Lexer::lexHTMLCharacterReference(Token &T) {
00542   const char *TokenPtr = BufferPtr;
00543   assert(*TokenPtr == '&');
00544   TokenPtr++;
00545   if (TokenPtr == CommentEnd) {
00546     formTextToken(T, TokenPtr);
00547     return;
00548   }
00549   const char *NamePtr;
00550   bool isNamed = false;
00551   bool isDecimal = false;
00552   char C = *TokenPtr;
00553   if (isHTMLNamedCharacterReferenceCharacter(C)) {
00554     NamePtr = TokenPtr;
00555     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
00556     isNamed = true;
00557   } else if (C == '#') {
00558     TokenPtr++;
00559     if (TokenPtr == CommentEnd) {
00560       formTextToken(T, TokenPtr);
00561       return;
00562     }
00563     C = *TokenPtr;
00564     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
00565       NamePtr = TokenPtr;
00566       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
00567       isDecimal = true;
00568     } else if (C == 'x' || C == 'X') {
00569       TokenPtr++;
00570       NamePtr = TokenPtr;
00571       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
00572     } else {
00573       formTextToken(T, TokenPtr);
00574       return;
00575     }
00576   } else {
00577     formTextToken(T, TokenPtr);
00578     return;
00579   }
00580   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
00581       *TokenPtr != ';') {
00582     formTextToken(T, TokenPtr);
00583     return;
00584   }
00585   StringRef Name(NamePtr, TokenPtr - NamePtr);
00586   TokenPtr++; // Skip semicolon.
00587   StringRef Resolved;
00588   if (isNamed)
00589     Resolved = resolveHTMLNamedCharacterReference(Name);
00590   else if (isDecimal)
00591     Resolved = resolveHTMLDecimalCharacterReference(Name);
00592   else
00593     Resolved = resolveHTMLHexCharacterReference(Name);
00594 
00595   if (Resolved.empty()) {
00596     formTextToken(T, TokenPtr);
00597     return;
00598   }
00599   formTokenWithChars(T, TokenPtr, tok::text);
00600   T.setText(Resolved);
00601   return;
00602 }
00603 
00604 void Lexer::setupAndLexHTMLStartTag(Token &T) {
00605   assert(BufferPtr[0] == '<' &&
00606          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
00607   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
00608   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
00609   if (!isHTMLTagName(Name)) {
00610     formTextToken(T, TagNameEnd);
00611     return;
00612   }
00613 
00614   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
00615   T.setHTMLTagStartName(Name);
00616 
00617   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
00618 
00619   const char C = *BufferPtr;
00620   if (BufferPtr != CommentEnd &&
00621       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
00622     State = LS_HTMLStartTag;
00623 }
00624 
00625 void Lexer::lexHTMLStartTag(Token &T) {
00626   assert(State == LS_HTMLStartTag);
00627 
00628   const char *TokenPtr = BufferPtr;
00629   char C = *TokenPtr;
00630   if (isHTMLIdentifierCharacter(C)) {
00631     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
00632     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
00633     formTokenWithChars(T, TokenPtr, tok::html_ident);
00634     T.setHTMLIdent(Ident);
00635   } else {
00636     switch (C) {
00637     case '=':
00638       TokenPtr++;
00639       formTokenWithChars(T, TokenPtr, tok::html_equals);
00640       break;
00641     case '\"':
00642     case '\'': {
00643       const char *OpenQuote = TokenPtr;
00644       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
00645       const char *ClosingQuote = TokenPtr;
00646       if (TokenPtr != CommentEnd) // Skip closing quote.
00647         TokenPtr++;
00648       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
00649       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
00650                                       ClosingQuote - (OpenQuote + 1)));
00651       break;
00652     }
00653     case '>':
00654       TokenPtr++;
00655       formTokenWithChars(T, TokenPtr, tok::html_greater);
00656       State = LS_Normal;
00657       return;
00658     case '/':
00659       TokenPtr++;
00660       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
00661         TokenPtr++;
00662         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
00663       } else
00664         formTextToken(T, TokenPtr);
00665 
00666       State = LS_Normal;
00667       return;
00668     }
00669   }
00670 
00671   // Now look ahead and return to normal state if we don't see any HTML tokens
00672   // ahead.
00673   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
00674   if (BufferPtr == CommentEnd) {
00675     State = LS_Normal;
00676     return;
00677   }
00678 
00679   C = *BufferPtr;
00680   if (!isHTMLIdentifierStartingCharacter(C) &&
00681       C != '=' && C != '\"' && C != '\'' && C != '>') {
00682     State = LS_Normal;
00683     return;
00684   }
00685 }
00686 
00687 void Lexer::setupAndLexHTMLEndTag(Token &T) {
00688   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
00689 
00690   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
00691   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
00692   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
00693   if (!isHTMLTagName(Name)) {
00694     formTextToken(T, TagNameEnd);
00695     return;
00696   }
00697 
00698   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
00699 
00700   formTokenWithChars(T, End, tok::html_end_tag);
00701   T.setHTMLTagEndName(Name);
00702 
00703   if (BufferPtr != CommentEnd && *BufferPtr == '>')
00704     State = LS_HTMLEndTag;
00705 }
00706 
00707 void Lexer::lexHTMLEndTag(Token &T) {
00708   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
00709 
00710   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
00711   State = LS_Normal;
00712 }
00713 
00714 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
00715              const CommandTraits &Traits,
00716              SourceLocation FileLoc,
00717              const char *BufferStart, const char *BufferEnd):
00718     Allocator(Allocator), Diags(Diags), Traits(Traits),
00719     BufferStart(BufferStart), BufferEnd(BufferEnd),
00720     FileLoc(FileLoc), BufferPtr(BufferStart),
00721     CommentState(LCS_BeforeComment), State(LS_Normal) {
00722 }
00723 
/// Lex the next token into \p T.
///
/// This is the top-level state machine, driven by CommentState:
///  - LCS_BeforeComment: produce tok::eof at the end of the buffer; otherwise
///    skip the comment opener ("//" or "/*"), an optional Doxygen marker and
///    an optional '<', compute CommentEnd and continue inside the comment.
///  - LCS_BetweenComments: turn everything up to the next '/' into a single
///    tok::newline token.
///  - LCS_InsideBCPLComment / LCS_InsideCComment: lex comment text until
///    CommentEnd is reached; at the end of a C comment skip "*/" and produce a
///    synthesized tok::newline.
00724 void Lexer::lex(Token &T) {
00725 again:
00726   switch (CommentState) {
00727   case LCS_BeforeComment:
00728     if (BufferPtr == BufferEnd) {
00729       formTokenWithChars(T, BufferPtr, tok::eof);
00730       return;
00731     }
00732 
00733     assert(*BufferPtr == '/');
00734     BufferPtr++; // Skip first slash.
00735     switch(*BufferPtr) {
00736     case '/': { // BCPL comment.
00737       BufferPtr++; // Skip second slash.
00738 
00739       if (BufferPtr != BufferEnd) {
00740         // Skip Doxygen magic marker, if it is present.
00741         // It might be missing because of a typo //< or /*<, or because we
00742         // merged this non-Doxygen comment into a bunch of Doxygen comments
00743         // around it: /** ... */ /* ... */ /** ... */
00744         const char C = *BufferPtr;
00745         if (C == '/' || C == '!')
00746           BufferPtr++;
00747       }
00748 
00749       // Skip less-than symbol that marks trailing comments.
00750       // Skip it even if the comment is not a Doxygen one, because //< and /*<
00751       // are frequent typos.
00752       if (BufferPtr != BufferEnd && *BufferPtr == '<')
00753         BufferPtr++;
00754 
00755       CommentState = LCS_InsideBCPLComment;
      // The verbatim block states are kept when a new BCPL comment starts;
      // any other state is reset to normal.
00756       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
00757         State = LS_Normal;
00758       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
00759       goto again;
00760     }
00761     case '*': { // C comment.
00762       BufferPtr++; // Skip star.
00763 
00764       // Skip Doxygen magic marker.
      // A '*' directly followed by '/' is the comment terminator, not a
      // marker, so it is left in place.
00765       const char C = *BufferPtr;
00766       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
00767         BufferPtr++;
00768 
00769       // Skip less-than symbol that marks trailing comments.
00770       if (BufferPtr != BufferEnd && *BufferPtr == '<')
00771         BufferPtr++;
00772 
00773       CommentState = LCS_InsideCComment;
00774       State = LS_Normal;
00775       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
00776       goto again;
00777     }
00778     default:
00779       llvm_unreachable("second character of comment should be '/' or '*'");
00780     }
00781 
00782   case LCS_BetweenComments: {
00783     // Consecutive comments are extracted only if there is only whitespace
00784     // between them.  So we can search for the start of the next comment.
00785     const char *EndWhitespace = BufferPtr;
00786     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
00787       EndWhitespace++;
00788 
00789     // Turn any whitespace between comments (and there is only whitespace
00790     // between them -- guaranteed by comment extraction) into a newline.  We
00791     // have two newlines between C comments in total (first one was synthesized
00792     // after a comment).
00793     formTokenWithChars(T, EndWhitespace, tok::newline);
00794 
00795     CommentState = LCS_BeforeComment;
00796     break;
00797   }
00798 
00799   case LCS_InsideBCPLComment:
00800   case LCS_InsideCComment:
00801     if (BufferPtr != CommentEnd) {
00802       lexCommentText(T);
00803       break;
00804     } else {
00805       // Skip C comment closing sequence.
00806       if (CommentState == LCS_InsideCComment) {
00807         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
00808         BufferPtr += 2;
00809         assert(BufferPtr <= BufferEnd);
00810 
00811         // Synthesize newline just after the C comment, regardless if there is
00812         // actually a newline.
00813         formTokenWithChars(T, BufferPtr, tok::newline);
00814 
00815         CommentState = LCS_BetweenComments;
00816         break;
00817       } else {
00818         // Don't synthesize a newline after BCPL comment.
00819         CommentState = LCS_BetweenComments;
00820         goto again;
00821       }
00822     }
00823   }
00824 }
00825 
00826 StringRef Lexer::getSpelling(const Token &Tok,
00827                              const SourceManager &SourceMgr,
00828                              bool *Invalid) const {
00829   SourceLocation Loc = Tok.getLocation();
00830   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
00831 
00832   bool InvalidTemp = false;
00833   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
00834   if (InvalidTemp) {
00835     *Invalid = true;
00836     return StringRef();
00837   }
00838 
00839   const char *Begin = File.data() + LocInfo.second;
00840   return StringRef(Begin, Tok.getLength());
00841 }
00842 
00843 } // end namespace comments
00844 } // end namespace clang
00845