// clang API Documentation
00001 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===// 00002 // 00003 // The LLVM Compiler Infrastructure 00004 // 00005 // This file is distributed under the University of Illinois Open Source 00006 // License. See LICENSE.TXT for details. 00007 // 00008 //===----------------------------------------------------------------------===// 00009 /// 00010 /// \file 00011 /// \brief Recursive parser implementation for the matcher expression grammar. 00012 /// 00013 //===----------------------------------------------------------------------===// 00014 00015 #include "clang/ASTMatchers/Dynamic/Parser.h" 00016 #include "clang/ASTMatchers/Dynamic/Registry.h" 00017 #include "clang/Basic/CharInfo.h" 00018 #include "llvm/ADT/Optional.h" 00019 #include "llvm/ADT/Twine.h" 00020 #include "llvm/Support/ManagedStatic.h" 00021 #include <string> 00022 #include <vector> 00023 00024 namespace clang { 00025 namespace ast_matchers { 00026 namespace dynamic { 00027 00028 /// \brief Simple structure to hold information for one token from the parser. 00029 struct Parser::TokenInfo { 00030 /// \brief Different possible tokens. 00031 enum TokenKind { 00032 TK_Eof, 00033 TK_OpenParen, 00034 TK_CloseParen, 00035 TK_Comma, 00036 TK_Period, 00037 TK_Literal, 00038 TK_Ident, 00039 TK_InvalidChar, 00040 TK_Error, 00041 TK_CodeCompletion 00042 }; 00043 00044 /// \brief Some known identifiers. 00045 static const char* const ID_Bind; 00046 00047 TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {} 00048 00049 StringRef Text; 00050 TokenKind Kind; 00051 SourceRange Range; 00052 VariantValue Value; 00053 }; 00054 00055 const char* const Parser::TokenInfo::ID_Bind = "bind"; 00056 00057 /// \brief Simple tokenizer for the parser. 
00058 class Parser::CodeTokenizer { 00059 public: 00060 explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) 00061 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error), 00062 CodeCompletionLocation(nullptr) { 00063 NextToken = getNextToken(); 00064 } 00065 00066 CodeTokenizer(StringRef MatcherCode, Diagnostics *Error, 00067 unsigned CodeCompletionOffset) 00068 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error), 00069 CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) { 00070 NextToken = getNextToken(); 00071 } 00072 00073 /// \brief Returns but doesn't consume the next token. 00074 const TokenInfo &peekNextToken() const { return NextToken; } 00075 00076 /// \brief Consumes and returns the next token. 00077 TokenInfo consumeNextToken() { 00078 TokenInfo ThisToken = NextToken; 00079 NextToken = getNextToken(); 00080 return ThisToken; 00081 } 00082 00083 TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } 00084 00085 private: 00086 TokenInfo getNextToken() { 00087 consumeWhitespace(); 00088 TokenInfo Result; 00089 Result.Range.Start = currentLocation(); 00090 00091 if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) { 00092 Result.Kind = TokenInfo::TK_CodeCompletion; 00093 Result.Text = StringRef(CodeCompletionLocation, 0); 00094 CodeCompletionLocation = nullptr; 00095 return Result; 00096 } 00097 00098 if (Code.empty()) { 00099 Result.Kind = TokenInfo::TK_Eof; 00100 Result.Text = ""; 00101 return Result; 00102 } 00103 00104 switch (Code[0]) { 00105 case ',': 00106 Result.Kind = TokenInfo::TK_Comma; 00107 Result.Text = Code.substr(0, 1); 00108 Code = Code.drop_front(); 00109 break; 00110 case '.': 00111 Result.Kind = TokenInfo::TK_Period; 00112 Result.Text = Code.substr(0, 1); 00113 Code = Code.drop_front(); 00114 break; 00115 case '(': 00116 Result.Kind = TokenInfo::TK_OpenParen; 00117 Result.Text = Code.substr(0, 1); 00118 Code = Code.drop_front(); 00119 break; 00120 
case ')': 00121 Result.Kind = TokenInfo::TK_CloseParen; 00122 Result.Text = Code.substr(0, 1); 00123 Code = Code.drop_front(); 00124 break; 00125 00126 case '"': 00127 case '\'': 00128 // Parse a string literal. 00129 consumeStringLiteral(&Result); 00130 break; 00131 00132 case '0': case '1': case '2': case '3': case '4': 00133 case '5': case '6': case '7': case '8': case '9': 00134 // Parse an unsigned literal. 00135 consumeUnsignedLiteral(&Result); 00136 break; 00137 00138 default: 00139 if (isAlphanumeric(Code[0])) { 00140 // Parse an identifier 00141 size_t TokenLength = 1; 00142 while (1) { 00143 // A code completion location in/immediately after an identifier will 00144 // cause the portion of the identifier before the code completion 00145 // location to become a code completion token. 00146 if (CodeCompletionLocation == Code.data() + TokenLength) { 00147 CodeCompletionLocation = nullptr; 00148 Result.Kind = TokenInfo::TK_CodeCompletion; 00149 Result.Text = Code.substr(0, TokenLength); 00150 Code = Code.drop_front(TokenLength); 00151 return Result; 00152 } 00153 if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength])) 00154 break; 00155 ++TokenLength; 00156 } 00157 Result.Kind = TokenInfo::TK_Ident; 00158 Result.Text = Code.substr(0, TokenLength); 00159 Code = Code.drop_front(TokenLength); 00160 } else { 00161 Result.Kind = TokenInfo::TK_InvalidChar; 00162 Result.Text = Code.substr(0, 1); 00163 Code = Code.drop_front(1); 00164 } 00165 break; 00166 } 00167 00168 Result.Range.End = currentLocation(); 00169 return Result; 00170 } 00171 00172 /// \brief Consume an unsigned literal. 00173 void consumeUnsignedLiteral(TokenInfo *Result) { 00174 unsigned Length = 1; 00175 if (Code.size() > 1) { 00176 // Consume the 'x' or 'b' radix modifier, if present. 
00177 switch (toLowercase(Code[1])) { 00178 case 'x': case 'b': Length = 2; 00179 } 00180 } 00181 while (Length < Code.size() && isHexDigit(Code[Length])) 00182 ++Length; 00183 00184 Result->Text = Code.substr(0, Length); 00185 Code = Code.drop_front(Length); 00186 00187 unsigned Value; 00188 if (!Result->Text.getAsInteger(0, Value)) { 00189 Result->Kind = TokenInfo::TK_Literal; 00190 Result->Value = Value; 00191 } else { 00192 SourceRange Range; 00193 Range.Start = Result->Range.Start; 00194 Range.End = currentLocation(); 00195 Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text; 00196 Result->Kind = TokenInfo::TK_Error; 00197 } 00198 } 00199 00200 /// \brief Consume a string literal. 00201 /// 00202 /// \c Code must be positioned at the start of the literal (the opening 00203 /// quote). Consumed until it finds the same closing quote character. 00204 void consumeStringLiteral(TokenInfo *Result) { 00205 bool InEscape = false; 00206 const char Marker = Code[0]; 00207 for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { 00208 if (InEscape) { 00209 InEscape = false; 00210 continue; 00211 } 00212 if (Code[Length] == '\\') { 00213 InEscape = true; 00214 continue; 00215 } 00216 if (Code[Length] == Marker) { 00217 Result->Kind = TokenInfo::TK_Literal; 00218 Result->Text = Code.substr(0, Length + 1); 00219 Result->Value = Code.substr(1, Length - 1).str(); 00220 Code = Code.drop_front(Length + 1); 00221 return; 00222 } 00223 } 00224 00225 StringRef ErrorText = Code; 00226 Code = Code.drop_front(Code.size()); 00227 SourceRange Range; 00228 Range.Start = Result->Range.Start; 00229 Range.End = currentLocation(); 00230 Error->addError(Range, Error->ET_ParserStringError) << ErrorText; 00231 Result->Kind = TokenInfo::TK_Error; 00232 } 00233 00234 /// \brief Consume all leading whitespace from \c Code. 
00235 void consumeWhitespace() { 00236 while (!Code.empty() && isWhitespace(Code[0])) { 00237 if (Code[0] == '\n') { 00238 ++Line; 00239 StartOfLine = Code.drop_front(); 00240 } 00241 Code = Code.drop_front(); 00242 } 00243 } 00244 00245 SourceLocation currentLocation() { 00246 SourceLocation Location; 00247 Location.Line = Line; 00248 Location.Column = Code.data() - StartOfLine.data() + 1; 00249 return Location; 00250 } 00251 00252 StringRef Code; 00253 StringRef StartOfLine; 00254 unsigned Line; 00255 Diagnostics *Error; 00256 TokenInfo NextToken; 00257 const char *CodeCompletionLocation; 00258 }; 00259 00260 Parser::Sema::~Sema() {} 00261 00262 std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes( 00263 llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) { 00264 return std::vector<ArgKind>(); 00265 } 00266 00267 std::vector<MatcherCompletion> 00268 Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) { 00269 return std::vector<MatcherCompletion>(); 00270 } 00271 00272 struct Parser::ScopedContextEntry { 00273 Parser *P; 00274 00275 ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) { 00276 P->ContextStack.push_back(std::make_pair(C, 0u)); 00277 } 00278 00279 ~ScopedContextEntry() { 00280 P->ContextStack.pop_back(); 00281 } 00282 00283 void nextArg() { 00284 ++P->ContextStack.back().second; 00285 } 00286 }; 00287 00288 /// \brief Parse expressions that start with an identifier. 00289 /// 00290 /// This function can parse named values and matchers. 00291 /// In case of failure it will try to determine the user's intent to give 00292 /// an appropriate error message. 00293 bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) { 00294 const TokenInfo NameToken = Tokenizer->consumeNextToken(); 00295 00296 if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) { 00297 // Parse as a named value. 00298 if (const VariantValue NamedValue = 00299 NamedValues ? 
NamedValues->lookup(NameToken.Text) 00300 : VariantValue()) { 00301 *Value = NamedValue; 00302 return true; 00303 } 00304 // If the syntax is correct and the name is not a matcher either, report 00305 // unknown named value. 00306 if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma || 00307 Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen || 00308 Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) && 00309 !S->lookupMatcherCtor(NameToken.Text)) { 00310 Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound) 00311 << NameToken.Text; 00312 return false; 00313 } 00314 // Otherwise, fallback to the matcher parser. 00315 } 00316 00317 // Parse as a matcher expression. 00318 return parseMatcherExpressionImpl(NameToken, Value); 00319 } 00320 00321 /// \brief Parse and validate a matcher expression. 00322 /// \return \c true on success, in which case \c Value has the matcher parsed. 00323 /// If the input is malformed, or some argument has an error, it 00324 /// returns \c false. 00325 bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken, 00326 VariantValue *Value) { 00327 assert(NameToken.Kind == TokenInfo::TK_Ident); 00328 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 00329 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 00330 Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen) 00331 << OpenToken.Text; 00332 return false; 00333 } 00334 00335 llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text); 00336 00337 if (!Ctor) { 00338 Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound) 00339 << NameToken.Text; 00340 // Do not return here. We need to continue to give completion suggestions. 00341 } 00342 00343 std::vector<ParserValue> Args; 00344 TokenInfo EndToken; 00345 00346 { 00347 ScopedContextEntry SCE(this, Ctor ? 
*Ctor : nullptr); 00348 00349 while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { 00350 if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { 00351 // End of args. 00352 EndToken = Tokenizer->consumeNextToken(); 00353 break; 00354 } 00355 if (Args.size() > 0) { 00356 // We must find a , token to continue. 00357 const TokenInfo CommaToken = Tokenizer->consumeNextToken(); 00358 if (CommaToken.Kind != TokenInfo::TK_Comma) { 00359 Error->addError(CommaToken.Range, Error->ET_ParserNoComma) 00360 << CommaToken.Text; 00361 return false; 00362 } 00363 } 00364 00365 Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error, 00366 NameToken.Text, NameToken.Range, 00367 Args.size() + 1); 00368 ParserValue ArgValue; 00369 ArgValue.Text = Tokenizer->peekNextToken().Text; 00370 ArgValue.Range = Tokenizer->peekNextToken().Range; 00371 if (!parseExpressionImpl(&ArgValue.Value)) { 00372 return false; 00373 } 00374 00375 Args.push_back(ArgValue); 00376 SCE.nextArg(); 00377 } 00378 } 00379 00380 if (EndToken.Kind == TokenInfo::TK_Eof) { 00381 Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen); 00382 return false; 00383 } 00384 00385 std::string BindID; 00386 if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) { 00387 // Parse .bind("foo") 00388 Tokenizer->consumeNextToken(); // consume the period. 00389 const TokenInfo BindToken = Tokenizer->consumeNextToken(); 00390 if (BindToken.Kind == TokenInfo::TK_CodeCompletion) { 00391 addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1)); 00392 return false; 00393 } 00394 00395 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 00396 const TokenInfo IDToken = Tokenizer->consumeNextToken(); 00397 const TokenInfo CloseToken = Tokenizer->consumeNextToken(); 00398 00399 // TODO: We could use different error codes for each/some to be more 00400 // explicit about the syntax error. 
00401 if (BindToken.Kind != TokenInfo::TK_Ident || 00402 BindToken.Text != TokenInfo::ID_Bind) { 00403 Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr); 00404 return false; 00405 } 00406 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 00407 Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr); 00408 return false; 00409 } 00410 if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) { 00411 Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr); 00412 return false; 00413 } 00414 if (CloseToken.Kind != TokenInfo::TK_CloseParen) { 00415 Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr); 00416 return false; 00417 } 00418 BindID = IDToken.Value.getString(); 00419 } 00420 00421 if (!Ctor) 00422 return false; 00423 00424 // Merge the start and end infos. 00425 Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error, 00426 NameToken.Text, NameToken.Range); 00427 SourceRange MatcherRange = NameToken.Range; 00428 MatcherRange.End = EndToken.Range.End; 00429 VariantMatcher Result = S->actOnMatcherExpression( 00430 *Ctor, MatcherRange, BindID, Args, Error); 00431 if (Result.isNull()) return false; 00432 00433 *Value = Result; 00434 return true; 00435 } 00436 00437 // If the prefix of this completion matches the completion token, add it to 00438 // Completions minus the prefix. 
00439 void Parser::addCompletion(const TokenInfo &CompToken, 00440 const MatcherCompletion& Completion) { 00441 if (StringRef(Completion.TypedText).startswith(CompToken.Text) && 00442 Completion.Specificity > 0) { 00443 Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()), 00444 Completion.MatcherDecl, Completion.Specificity); 00445 } 00446 } 00447 00448 std::vector<MatcherCompletion> Parser::getNamedValueCompletions( 00449 ArrayRef<ArgKind> AcceptedTypes) { 00450 if (!NamedValues) return std::vector<MatcherCompletion>(); 00451 std::vector<MatcherCompletion> Result; 00452 for (const auto &Entry : *NamedValues) { 00453 unsigned Specificity; 00454 if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) { 00455 std::string Decl = 00456 (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str(); 00457 Result.emplace_back(Entry.getKey(), Decl, Specificity); 00458 } 00459 } 00460 return Result; 00461 } 00462 00463 void Parser::addExpressionCompletions() { 00464 const TokenInfo CompToken = Tokenizer->consumeNextToken(); 00465 assert(CompToken.Kind == TokenInfo::TK_CodeCompletion); 00466 00467 // We cannot complete code if there is an invalid element on the context 00468 // stack. 
00469 for (ContextStackTy::iterator I = ContextStack.begin(), 00470 E = ContextStack.end(); 00471 I != E; ++I) { 00472 if (!I->first) 00473 return; 00474 } 00475 00476 auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack); 00477 for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) { 00478 addCompletion(CompToken, Completion); 00479 } 00480 00481 for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) { 00482 addCompletion(CompToken, Completion); 00483 } 00484 } 00485 00486 /// \brief Parse an <Expresssion> 00487 bool Parser::parseExpressionImpl(VariantValue *Value) { 00488 switch (Tokenizer->nextTokenKind()) { 00489 case TokenInfo::TK_Literal: 00490 *Value = Tokenizer->consumeNextToken().Value; 00491 return true; 00492 00493 case TokenInfo::TK_Ident: 00494 return parseIdentifierPrefixImpl(Value); 00495 00496 case TokenInfo::TK_CodeCompletion: 00497 addExpressionCompletions(); 00498 return false; 00499 00500 case TokenInfo::TK_Eof: 00501 Error->addError(Tokenizer->consumeNextToken().Range, 00502 Error->ET_ParserNoCode); 00503 return false; 00504 00505 case TokenInfo::TK_Error: 00506 // This error was already reported by the tokenizer. 00507 return false; 00508 00509 case TokenInfo::TK_OpenParen: 00510 case TokenInfo::TK_CloseParen: 00511 case TokenInfo::TK_Comma: 00512 case TokenInfo::TK_Period: 00513 case TokenInfo::TK_InvalidChar: 00514 const TokenInfo Token = Tokenizer->consumeNextToken(); 00515 Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text; 00516 return false; 00517 } 00518 00519 llvm_unreachable("Unknown token kind."); 00520 } 00521 00522 static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema; 00523 00524 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, 00525 const NamedValueMap *NamedValues, Diagnostics *Error) 00526 : Tokenizer(Tokenizer), S(S ? 
S : &*DefaultRegistrySema), 00527 NamedValues(NamedValues), Error(Error) {} 00528 00529 Parser::RegistrySema::~RegistrySema() {} 00530 00531 llvm::Optional<MatcherCtor> 00532 Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) { 00533 return Registry::lookupMatcherCtor(MatcherName); 00534 } 00535 00536 VariantMatcher Parser::RegistrySema::actOnMatcherExpression( 00537 MatcherCtor Ctor, const SourceRange &NameRange, StringRef BindID, 00538 ArrayRef<ParserValue> Args, Diagnostics *Error) { 00539 if (BindID.empty()) { 00540 return Registry::constructMatcher(Ctor, NameRange, Args, Error); 00541 } else { 00542 return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args, 00543 Error); 00544 } 00545 } 00546 00547 std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes( 00548 ArrayRef<std::pair<MatcherCtor, unsigned>> Context) { 00549 return Registry::getAcceptedCompletionTypes(Context); 00550 } 00551 00552 std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions( 00553 ArrayRef<ArgKind> AcceptedTypes) { 00554 return Registry::getMatcherCompletions(AcceptedTypes); 00555 } 00556 00557 bool Parser::parseExpression(StringRef Code, Sema *S, 00558 const NamedValueMap *NamedValues, 00559 VariantValue *Value, Diagnostics *Error) { 00560 CodeTokenizer Tokenizer(Code, Error); 00561 if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value)) 00562 return false; 00563 if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) { 00564 Error->addError(Tokenizer.peekNextToken().Range, 00565 Error->ET_ParserTrailingCode); 00566 return false; 00567 } 00568 return true; 00569 } 00570 00571 std::vector<MatcherCompletion> 00572 Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S, 00573 const NamedValueMap *NamedValues) { 00574 Diagnostics Error; 00575 CodeTokenizer Tokenizer(Code, &Error, CompletionOffset); 00576 Parser P(&Tokenizer, S, NamedValues, &Error); 00577 VariantValue Dummy; 00578 
P.parseExpressionImpl(&Dummy); 00579 00580 // Sort by specificity, then by name. 00581 std::sort(P.Completions.begin(), P.Completions.end(), 00582 [](const MatcherCompletion &A, const MatcherCompletion &B) { 00583 if (A.Specificity != B.Specificity) 00584 return A.Specificity > B.Specificity; 00585 return A.TypedText < B.TypedText; 00586 }); 00587 00588 return P.Completions; 00589 } 00590 00591 llvm::Optional<DynTypedMatcher> 00592 Parser::parseMatcherExpression(StringRef Code, Sema *S, 00593 const NamedValueMap *NamedValues, 00594 Diagnostics *Error) { 00595 VariantValue Value; 00596 if (!parseExpression(Code, S, NamedValues, &Value, Error)) 00597 return llvm::Optional<DynTypedMatcher>(); 00598 if (!Value.isMatcher()) { 00599 Error->addError(SourceRange(), Error->ET_ParserNotAMatcher); 00600 return llvm::Optional<DynTypedMatcher>(); 00601 } 00602 llvm::Optional<DynTypedMatcher> Result = 00603 Value.getMatcher().getSingleMatcher(); 00604 if (!Result.hasValue()) { 00605 Error->addError(SourceRange(), Error->ET_ParserOverloadedType) 00606 << Value.getTypeAsString(); 00607 } 00608 return Result; 00609 } 00610 00611 } // namespace dynamic 00612 } // namespace ast_matchers 00613 } // namespace clang