clang: Parser.cpp Source File

Go to the documentation of this file.
00001 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 ///
00010 /// \file
00011 /// \brief Recursive parser implementation for the matcher expression grammar.
00012 ///
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "clang/ASTMatchers/Dynamic/Parser.h"
00016 #include "clang/ASTMatchers/Dynamic/Registry.h"
00017 #include "clang/Basic/CharInfo.h"
00018 #include "llvm/ADT/Optional.h"
00019 #include "llvm/ADT/Twine.h"
00020 #include "llvm/Support/ManagedStatic.h"
00021 #include <string>
00022 #include <vector>
00023 
00024 namespace clang {
00025 namespace ast_matchers {
00026 namespace dynamic {
00027 
00028 /// \brief Simple structure to hold information for one token from the parser.
00029 struct Parser::TokenInfo {
00030   /// \brief Different possible tokens.
00031   enum TokenKind {
00032     TK_Eof,
00033     TK_OpenParen,
00034     TK_CloseParen,
00035     TK_Comma,
00036     TK_Period,
00037     TK_Literal,
00038     TK_Ident,
00039     TK_InvalidChar,
00040     TK_Error,
00041     TK_CodeCompletion
00042   };
00043 
00044   /// \brief Some known identifiers.
00045   static const char* const ID_Bind;
00046 
00047   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
00048 
00049   StringRef Text;
00050   TokenKind Kind;
00051   SourceRange Range;
00052   VariantValue Value;
00053 };
00054 
00055 const char* const Parser::TokenInfo::ID_Bind = "bind";
00056 
00057 /// \brief Simple tokenizer for the parser.
00058 class Parser::CodeTokenizer {
00059 public:
00060   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
00061       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
00062         CodeCompletionLocation(nullptr) {
00063     NextToken = getNextToken();
00064   }
00065 
00066   CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
00067                 unsigned CodeCompletionOffset)
00068       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
00069         CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
00070     NextToken = getNextToken();
00071   }
00072 
00073   /// \brief Returns but doesn't consume the next token.
00074   const TokenInfo &peekNextToken() const { return NextToken; }
00075 
00076   /// \brief Consumes and returns the next token.
00077   TokenInfo consumeNextToken() {
00078     TokenInfo ThisToken = NextToken;
00079     NextToken = getNextToken();
00080     return ThisToken;
00081   }
00082 
00083   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
00084 
00085 private:
00086   TokenInfo getNextToken() {
00087     consumeWhitespace();
00088     TokenInfo Result;
00089     Result.Range.Start = currentLocation();
00090 
00091     if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
00092       Result.Kind = TokenInfo::TK_CodeCompletion;
00093       Result.Text = StringRef(CodeCompletionLocation, 0);
00094       CodeCompletionLocation = nullptr;
00095       return Result;
00096     }
00097 
00098     if (Code.empty()) {
00099       Result.Kind = TokenInfo::TK_Eof;
00100       Result.Text = "";
00101       return Result;
00102     }
00103 
00104     switch (Code[0]) {
00105     case ',':
00106       Result.Kind = TokenInfo::TK_Comma;
00107       Result.Text = Code.substr(0, 1);
00108       Code = Code.drop_front();
00109       break;
00110     case '.':
00111       Result.Kind = TokenInfo::TK_Period;
00112       Result.Text = Code.substr(0, 1);
00113       Code = Code.drop_front();
00114       break;
00115     case '(':
00116       Result.Kind = TokenInfo::TK_OpenParen;
00117       Result.Text = Code.substr(0, 1);
00118       Code = Code.drop_front();
00119       break;
00120     case ')':
00121       Result.Kind = TokenInfo::TK_CloseParen;
00122       Result.Text = Code.substr(0, 1);
00123       Code = Code.drop_front();
00124       break;
00125 
00126     case '"':
00127     case '\'':
00128       // Parse a string literal.
00129       consumeStringLiteral(&Result);
00130       break;
00131 
00132     case '0': case '1': case '2': case '3': case '4':
00133     case '5': case '6': case '7': case '8': case '9':
00134       // Parse an unsigned literal.
00135       consumeUnsignedLiteral(&Result);
00136       break;
00137 
00138     default:
00139       if (isAlphanumeric(Code[0])) {
00140         // Parse an identifier
00141         size_t TokenLength = 1;
00142         while (1) {
00143           // A code completion location in/immediately after an identifier will
00144           // cause the portion of the identifier before the code completion
00145           // location to become a code completion token.
00146           if (CodeCompletionLocation == Code.data() + TokenLength) {
00147             CodeCompletionLocation = nullptr;
00148             Result.Kind = TokenInfo::TK_CodeCompletion;
00149             Result.Text = Code.substr(0, TokenLength);
00150             Code = Code.drop_front(TokenLength);
00151             return Result;
00152           }
00153           if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
00154             break;
00155           ++TokenLength;
00156         }
00157         Result.Kind = TokenInfo::TK_Ident;
00158         Result.Text = Code.substr(0, TokenLength);
00159         Code = Code.drop_front(TokenLength);
00160       } else {
00161         Result.Kind = TokenInfo::TK_InvalidChar;
00162         Result.Text = Code.substr(0, 1);
00163         Code = Code.drop_front(1);
00164       }
00165       break;
00166     }
00167 
00168     Result.Range.End = currentLocation();
00169     return Result;
00170   }
00171 
00172   /// \brief Consume an unsigned literal.
00173   void consumeUnsignedLiteral(TokenInfo *Result) {
00174     unsigned Length = 1;
00175     if (Code.size() > 1) {
00176       // Consume the 'x' or 'b' radix modifier, if present.
00177       switch (toLowercase(Code[1])) {
00178       case 'x': case 'b': Length = 2;
00179       }
00180     }
00181     while (Length < Code.size() && isHexDigit(Code[Length]))
00182       ++Length;
00183 
00184     Result->Text = Code.substr(0, Length);
00185     Code = Code.drop_front(Length);
00186 
00187     unsigned Value;
00188     if (!Result->Text.getAsInteger(0, Value)) {
00189       Result->Kind = TokenInfo::TK_Literal;
00190       Result->Value = Value;
00191     } else {
00192       SourceRange Range;
00193       Range.Start = Result->Range.Start;
00194       Range.End = currentLocation();
00195       Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
00196       Result->Kind = TokenInfo::TK_Error;
00197     }
00198   }
00199 
00200   /// \brief Consume a string literal.
00201   ///
00202   /// \c Code must be positioned at the start of the literal (the opening
00203   /// quote). Consumed until it finds the same closing quote character.
00204   void consumeStringLiteral(TokenInfo *Result) {
00205     bool InEscape = false;
00206     const char Marker = Code[0];
00207     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
00208       if (InEscape) {
00209         InEscape = false;
00210         continue;
00211       }
00212       if (Code[Length] == '\\') {
00213         InEscape = true;
00214         continue;
00215       }
00216       if (Code[Length] == Marker) {
00217         Result->Kind = TokenInfo::TK_Literal;
00218         Result->Text = Code.substr(0, Length + 1);
00219         Result->Value = Code.substr(1, Length - 1).str();
00220         Code = Code.drop_front(Length + 1);
00221         return;
00222       }
00223     }
00224 
00225     StringRef ErrorText = Code;
00226     Code = Code.drop_front(Code.size());
00227     SourceRange Range;
00228     Range.Start = Result->Range.Start;
00229     Range.End = currentLocation();
00230     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
00231     Result->Kind = TokenInfo::TK_Error;
00232   }
00233 
00234   /// \brief Consume all leading whitespace from \c Code.
00235   void consumeWhitespace() {
00236     while (!Code.empty() && isWhitespace(Code[0])) {
00237       if (Code[0] == '\n') {
00238         ++Line;
00239         StartOfLine = Code.drop_front();
00240       }
00241       Code = Code.drop_front();
00242     }
00243   }
00244 
00245   SourceLocation currentLocation() {
00246     SourceLocation Location;
00247     Location.Line = Line;
00248     Location.Column = Code.data() - StartOfLine.data() + 1;
00249     return Location;
00250   }
00251 
00252   StringRef Code;
00253   StringRef StartOfLine;
00254   unsigned Line;
00255   Diagnostics *Error;
00256   TokenInfo NextToken;
00257   const char *CodeCompletionLocation;
00258 };
00259 
00260 Parser::Sema::~Sema() {}
00261 
00262 std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes(
00263     llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
00264   return std::vector<ArgKind>();
00265 }
00266 
00267 std::vector<MatcherCompletion>
00268 Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) {
00269   return std::vector<MatcherCompletion>();
00270 }
00271 
00272 struct Parser::ScopedContextEntry {
00273   Parser *P;
00274 
00275   ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) {
00276     P->ContextStack.push_back(std::make_pair(C, 0u));
00277   }
00278 
00279   ~ScopedContextEntry() {
00280     P->ContextStack.pop_back();
00281   }
00282 
00283   void nextArg() {
00284     ++P->ContextStack.back().second;
00285   }
00286 };
00287 
00288 /// \brief Parse expressions that start with an identifier.
00289 ///
00290 /// This function can parse named values and matchers.
00291 /// In case of failure it will try to determine the user's intent to give
00292 /// an appropriate error message.
00293 bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
00294   const TokenInfo NameToken = Tokenizer->consumeNextToken();
00295 
00296   if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
00297     // Parse as a named value.
00298     if (const VariantValue NamedValue =
00299             NamedValues ? NamedValues->lookup(NameToken.Text)
00300                         : VariantValue()) {
00301       *Value = NamedValue;
00302       return true;
00303     }
00304     // If the syntax is correct and the name is not a matcher either, report
00305     // unknown named value.
00306     if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
00307          Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
00308          Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
00309         !S->lookupMatcherCtor(NameToken.Text)) {
00310       Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
00311           << NameToken.Text;
00312       return false;
00313     }
00314     // Otherwise, fallback to the matcher parser.
00315   }
00316 
00317   // Parse as a matcher expression.
00318   return parseMatcherExpressionImpl(NameToken, Value);
00319 }
00320 
00321 /// \brief Parse and validate a matcher expression.
00322 /// \return \c true on success, in which case \c Value has the matcher parsed.
00323 ///   If the input is malformed, or some argument has an error, it
00324 ///   returns \c false.
00325 bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
00326                                         VariantValue *Value) {
00327   assert(NameToken.Kind == TokenInfo::TK_Ident);
00328   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
00329   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
00330     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
00331         << OpenToken.Text;
00332     return false;
00333   }
00334 
00335   llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
00336 
00337   if (!Ctor) {
00338     Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
00339         << NameToken.Text;
00340     // Do not return here. We need to continue to give completion suggestions.
00341   }
00342 
00343   std::vector<ParserValue> Args;
00344   TokenInfo EndToken;
00345 
00346   {
00347     ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
00348 
00349     while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
00350       if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
00351         // End of args.
00352         EndToken = Tokenizer->consumeNextToken();
00353         break;
00354       }
00355       if (Args.size() > 0) {
00356         // We must find a , token to continue.
00357         const TokenInfo CommaToken = Tokenizer->consumeNextToken();
00358         if (CommaToken.Kind != TokenInfo::TK_Comma) {
00359           Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
00360               << CommaToken.Text;
00361           return false;
00362         }
00363       }
00364 
00365       Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
00366                                NameToken.Text, NameToken.Range,
00367                                Args.size() + 1);
00368       ParserValue ArgValue;
00369       ArgValue.Text = Tokenizer->peekNextToken().Text;
00370       ArgValue.Range = Tokenizer->peekNextToken().Range;
00371       if (!parseExpressionImpl(&ArgValue.Value)) {
00372         return false;
00373       }
00374 
00375       Args.push_back(ArgValue);
00376       SCE.nextArg();
00377     }
00378   }
00379 
00380   if (EndToken.Kind == TokenInfo::TK_Eof) {
00381     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
00382     return false;
00383   }
00384 
00385   std::string BindID;
00386   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
00387     // Parse .bind("foo")
00388     Tokenizer->consumeNextToken();  // consume the period.
00389     const TokenInfo BindToken = Tokenizer->consumeNextToken();
00390     if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
00391       addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1));
00392       return false;
00393     }
00394 
00395     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
00396     const TokenInfo IDToken = Tokenizer->consumeNextToken();
00397     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
00398 
00399     // TODO: We could use different error codes for each/some to be more
00400     //       explicit about the syntax error.
00401     if (BindToken.Kind != TokenInfo::TK_Ident ||
00402         BindToken.Text != TokenInfo::ID_Bind) {
00403       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
00404       return false;
00405     }
00406     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
00407       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
00408       return false;
00409     }
00410     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
00411       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
00412       return false;
00413     }
00414     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
00415       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
00416       return false;
00417     }
00418     BindID = IDToken.Value.getString();
00419   }
00420 
00421   if (!Ctor)
00422     return false;
00423 
00424   // Merge the start and end infos.
00425   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
00426                            NameToken.Text, NameToken.Range);
00427   SourceRange MatcherRange = NameToken.Range;
00428   MatcherRange.End = EndToken.Range.End;
00429   VariantMatcher Result = S->actOnMatcherExpression(
00430       *Ctor, MatcherRange, BindID, Args, Error);
00431   if (Result.isNull()) return false;
00432 
00433   *Value = Result;
00434   return true;
00435 }
00436 
00437 // If the prefix of this completion matches the completion token, add it to
00438 // Completions minus the prefix.
00439 void Parser::addCompletion(const TokenInfo &CompToken,
00440                            const MatcherCompletion& Completion) {
00441   if (StringRef(Completion.TypedText).startswith(CompToken.Text) &&
00442       Completion.Specificity > 0) {
00443     Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()),
00444                              Completion.MatcherDecl, Completion.Specificity);
00445   }
00446 }
00447 
00448 std::vector<MatcherCompletion> Parser::getNamedValueCompletions(
00449     ArrayRef<ArgKind> AcceptedTypes) {
00450   if (!NamedValues) return std::vector<MatcherCompletion>();
00451   std::vector<MatcherCompletion> Result;
00452   for (const auto &Entry : *NamedValues) {
00453     unsigned Specificity;
00454     if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) {
00455       std::string Decl =
00456           (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str();
00457       Result.emplace_back(Entry.getKey(), Decl, Specificity);
00458     }
00459   }
00460   return Result;
00461 }
00462 
00463 void Parser::addExpressionCompletions() {
00464   const TokenInfo CompToken = Tokenizer->consumeNextToken();
00465   assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
00466 
00467   // We cannot complete code if there is an invalid element on the context
00468   // stack.
00469   for (ContextStackTy::iterator I = ContextStack.begin(),
00470                                 E = ContextStack.end();
00471        I != E; ++I) {
00472     if (!I->first)
00473       return;
00474   }
00475 
00476   auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack);
00477   for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) {
00478     addCompletion(CompToken, Completion);
00479   }
00480 
00481   for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) {
00482     addCompletion(CompToken, Completion);
00483   }
00484 }
00485 
00486 /// \brief Parse an <Expresssion>
00487 bool Parser::parseExpressionImpl(VariantValue *Value) {
00488   switch (Tokenizer->nextTokenKind()) {
00489   case TokenInfo::TK_Literal:
00490     *Value = Tokenizer->consumeNextToken().Value;
00491     return true;
00492 
00493   case TokenInfo::TK_Ident:
00494     return parseIdentifierPrefixImpl(Value);
00495 
00496   case TokenInfo::TK_CodeCompletion:
00497     addExpressionCompletions();
00498     return false;
00499 
00500   case TokenInfo::TK_Eof:
00501     Error->addError(Tokenizer->consumeNextToken().Range,
00502                     Error->ET_ParserNoCode);
00503     return false;
00504 
00505   case TokenInfo::TK_Error:
00506     // This error was already reported by the tokenizer.
00507     return false;
00508 
00509   case TokenInfo::TK_OpenParen:
00510   case TokenInfo::TK_CloseParen:
00511   case TokenInfo::TK_Comma:
00512   case TokenInfo::TK_Period:
00513   case TokenInfo::TK_InvalidChar:
00514     const TokenInfo Token = Tokenizer->consumeNextToken();
00515     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
00516     return false;
00517   }
00518 
00519   llvm_unreachable("Unknown token kind.");
00520 }
00521 
00522 static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema;
00523 
00524 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
00525                const NamedValueMap *NamedValues, Diagnostics *Error)
00526     : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema),
00527       NamedValues(NamedValues), Error(Error) {}
00528 
00529 Parser::RegistrySema::~RegistrySema() {}
00530 
00531 llvm::Optional<MatcherCtor>
00532 Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
00533   return Registry::lookupMatcherCtor(MatcherName);
00534 }
00535 
00536 VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
00537     MatcherCtor Ctor, const SourceRange &NameRange, StringRef BindID,
00538     ArrayRef<ParserValue> Args, Diagnostics *Error) {
00539   if (BindID.empty()) {
00540     return Registry::constructMatcher(Ctor, NameRange, Args, Error);
00541   } else {
00542     return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
00543                                            Error);
00544   }
00545 }
00546 
00547 std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
00548     ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
00549   return Registry::getAcceptedCompletionTypes(Context);
00550 }
00551 
00552 std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
00553     ArrayRef<ArgKind> AcceptedTypes) {
00554   return Registry::getMatcherCompletions(AcceptedTypes);
00555 }
00556 
00557 bool Parser::parseExpression(StringRef Code, Sema *S,
00558                              const NamedValueMap *NamedValues,
00559                              VariantValue *Value, Diagnostics *Error) {
00560   CodeTokenizer Tokenizer(Code, Error);
00561   if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value))
00562     return false;
00563   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
00564     Error->addError(Tokenizer.peekNextToken().Range,
00565                     Error->ET_ParserTrailingCode);
00566     return false;
00567   }
00568   return true;
00569 }
00570 
00571 std::vector<MatcherCompletion>
00572 Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S,
00573                            const NamedValueMap *NamedValues) {
00574   Diagnostics Error;
00575   CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
00576   Parser P(&Tokenizer, S, NamedValues, &Error);
00577   VariantValue Dummy;
00578   P.parseExpressionImpl(&Dummy);
00579 
00580   // Sort by specificity, then by name.
00581   std::sort(P.Completions.begin(), P.Completions.end(),
00582             [](const MatcherCompletion &A, const MatcherCompletion &B) {
00583     if (A.Specificity != B.Specificity)
00584       return A.Specificity > B.Specificity;
00585     return A.TypedText < B.TypedText;
00586   });
00587 
00588   return P.Completions;
00589 }
00590 
00591 llvm::Optional<DynTypedMatcher>
00592 Parser::parseMatcherExpression(StringRef Code, Sema *S,
00593                                const NamedValueMap *NamedValues,
00594                                Diagnostics *Error) {
00595   VariantValue Value;
00596   if (!parseExpression(Code, S, NamedValues, &Value, Error))
00597     return llvm::Optional<DynTypedMatcher>();
00598   if (!Value.isMatcher()) {
00599     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
00600     return llvm::Optional<DynTypedMatcher>();
00601   }
00602   llvm::Optional<DynTypedMatcher> Result =
00603       Value.getMatcher().getSingleMatcher();
00604   if (!Result.hasValue()) {
00605     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
00606         << Value.getTypeAsString();
00607   }
00608   return Result;
00609 }
00610 
00611 }  // namespace dynamic
00612 }  // namespace ast_matchers
00613 }  // namespace clang