clang API Documentation

ScanfFormatString.cpp
Go to the documentation of this file.
00001 //= ScanfFormatString.cpp - Analysis of scanf format strings ---*- C++ -*-===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // Handling of format strings in scanf and friends.  The structure of format
00011 // strings for fscanf() is described in C99 7.19.6.2.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "clang/Analysis/Analyses/FormatString.h"
00016 #include "FormatStringParsing.h"
00017 #include "clang/Basic/TargetInfo.h"
00018 
00019 using clang::analyze_format_string::ArgType;
00020 using clang::analyze_format_string::FormatStringHandler;
00021 using clang::analyze_format_string::LengthModifier;
00022 using clang::analyze_format_string::OptionalAmount;
00023 using clang::analyze_format_string::ConversionSpecifier;
00024 using clang::analyze_scanf::ScanfConversionSpecifier;
00025 using clang::analyze_scanf::ScanfSpecifier;
00026 using clang::UpdateOnReturn;
00027 using namespace clang;
00028 // Result type returned by ParseScanfSpecifier for one parsing step.
00029 using ScanfSpecifierResult =
00030     clang::analyze_format_string::SpecifierResult<ScanfSpecifier>;
00031 // Parses the body of a "%[" scan list.  Returns true if it is unterminated.
00032 static bool ParseScanList(FormatStringHandler &H,
00033                           ScanfConversionSpecifier &CS,
00034                           const char *&Beg, const char *E) {
00035   // The caller passes Beg just past the '[', so Beg - 1 is the '[' itself.
00036   const char *Cur = Beg;
00037   const char *const Bracket = Cur - 1;
00038   UpdateOnReturn <const char*> SyncBeg(Beg, Cur);
00039 
00040   // Reports a scan list that runs off the end of the format string.
00041   auto Unterminated = [&](const char *Last) {
00042     H.HandleIncompleteScanList(Bracket, Last);
00043     return true;
00044   };
00045 
00046   // Nothing at all after the '['.
00047   if (Cur == E)
00048     return Unterminated(Cur);
00049 
00050   // A ']' in first position does not end the list; step over it.
00051   if (*Cur == ']' && ++Cur == E)
00052     return Unterminated(Cur - 1);
00053 
00054   // Likewise for the ']' in a leading "^]".
00055   if (Cur + 1 != E && Cur[0] == '^' && Cur[1] == ']') {
00056     Cur += 2;
00057     if (Cur == E)
00058       return Unterminated(Cur - 1);
00059   }
00060 
00061   // Advance to the ']' that closes the scan list.
00062   for (;;) {
00063     if (*Cur == ']')
00064       break;
00065     if (++Cur == E)
00066       return Unterminated(Cur - 1);
00067   }
00068 
00069   // Record where the scan list ends; Cur is left on the closing ']'.
00070   CS.setEndScanList(Cur);
00071   return false;
00072 }
00073 // Parses the next conversion specification in [Beg, E), if there is one.
00074 // FIXME: Much of this is copy-paste from ParsePrintfSpecifier.
00075 // We can possibly refactor.
00076 static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H,
00077                                                 const char *&Beg,
00078                                                 const char *E,
00079                                                 unsigned &argIndex,
00080                                                 const LangOptions &LO,
00081                                                 const TargetInfo &Target) {
00082 
00083   using namespace clang::analyze_scanf;
00084   const char *I = Beg;
00085   const char *Start = nullptr;  // Stays null until a '%' is seen.
00086   UpdateOnReturn <const char*> UpdateBeg(Beg, I);
00087   // NOTE(review): UpdateBeg presumably stores I back into Beg on every return.
00088   // Look for a '%' character that indicates the start of a format specifier.
00089   for ( ; I != E ; ++I) {
00090     char c = *I;
00091     if (c == '\0') {
00092       // Detect spurious null characters, which are likely errors.
00093       H.HandleNullChar(I);
00094       return true;
00095     }
00096     if (c == '%') {
00097       Start = I++;  // Record the start of the format specifier.
00098       break;
00099     }
00100   }
00101 
00102   // No format specifier found?  Then the scan above ran all the way to E.
00103   if (!Start)
00104     return false;
00105 
00106   if (I == E) {
00107     // No more characters left after the '%'?
00108     H.HandleIncompleteSpecifier(Start, E - Start);
00109     return true;
00110   }
00111 
00112   ScanfSpecifier FS;
00113   if (ParseArgPosition(H, FS, Start, I, E))
00114     return true;
00115 
00116   if (I == E) {
00117     // No more characters left after the argument position?
00118     H.HandleIncompleteSpecifier(Start, E - Start);
00119     return true;
00120   }
00121 
00122   // Look for the '*' (assignment suppression) flag if it is present.
00123   if (*I == '*') {
00124     FS.setSuppressAssignment(I);
00125     if (++I == E) {
00126       H.HandleIncompleteSpecifier(Start, E - Start);
00127       return true;
00128     }
00129   }
00130 
00131   // Look for the field width (if any).  Unlike printf, this is either
00132   // a fixed integer or isn't present.
00133   const OptionalAmount &Amt = clang::analyze_format_string::ParseAmount(I, E);
00134   if (Amt.getHowSpecified() != OptionalAmount::NotSpecified) {
00135     assert(Amt.getHowSpecified() == OptionalAmount::Constant);
00136     FS.setFieldWidth(Amt);
00137 
00138     if (I == E) {
00139       // No more characters left after the field width?
00140       H.HandleIncompleteSpecifier(Start, E - Start);
00141       return true;
00142     }
00143   }
00144 
00145   // Look for the length modifier.
00146   if (ParseLengthModifier(FS, I, E, LO, /*scanf=*/true) && I == E) {
00147     // A length modifier was parsed, but nothing follows it.
00148     H.HandleIncompleteSpecifier(Start, E - Start);
00149     return true;
00150   }
00151 
00152   // Detect spurious null characters, which are likely errors.
00153   if (*I == '\0') {
00154     H.HandleNullChar(I);
00155     return true;
00156   }
00157 
00158   // Finally, look for the conversion specifier.
00159   const char *conversionPosition = I++;
00160   ScanfConversionSpecifier::Kind k = ScanfConversionSpecifier::InvalidSpecifier;
00161   switch (*conversionPosition) {
00162     default:
00163       break;
00164     case '%': k = ConversionSpecifier::PercentArg;   break;
00165     case 'A': k = ConversionSpecifier::AArg; break;
00166     case 'E': k = ConversionSpecifier::EArg; break;
00167     case 'F': k = ConversionSpecifier::FArg; break;
00168     case 'G': k = ConversionSpecifier::GArg; break;
00169     case 'X': k = ConversionSpecifier::XArg; break;
00170     case 'a': k = ConversionSpecifier::aArg; break;
00171     case 'd': k = ConversionSpecifier::dArg; break;
00172     case 'e': k = ConversionSpecifier::eArg; break;
00173     case 'f': k = ConversionSpecifier::fArg; break;
00174     case 'g': k = ConversionSpecifier::gArg; break;
00175     case 'i': k = ConversionSpecifier::iArg; break;
00176     case 'n': k = ConversionSpecifier::nArg; break;
00177     case 'c': k = ConversionSpecifier::cArg; break;
00178     case 'C': k = ConversionSpecifier::CArg; break;
00179     case 'S': k = ConversionSpecifier::SArg; break;
00180     case '[': k = ConversionSpecifier::ScanListArg; break;
00181     case 'u': k = ConversionSpecifier::uArg; break;
00182     case 'x': k = ConversionSpecifier::xArg; break;
00183     case 'o': k = ConversionSpecifier::oArg; break;
00184     case 's': k = ConversionSpecifier::sArg; break;
00185     case 'p': k = ConversionSpecifier::pArg; break;
00186     // Apple extensions: accepted only when the target OS is Darwin; on any
00187     // other target k is left as InvalidSpecifier.
00188     case 'D':
00189       if (Target.getTriple().isOSDarwin())
00190         k = ConversionSpecifier::DArg;
00191       break;
00192     case 'O':
00193       if (Target.getTriple().isOSDarwin())
00194         k = ConversionSpecifier::OArg;
00195       break;
00196     case 'U':
00197       if (Target.getTriple().isOSDarwin())
00198         k = ConversionSpecifier::UArg;
00199       break;
00200   }
00201   ScanfConversionSpecifier CS(conversionPosition, k);
00202   if (k == ScanfConversionSpecifier::ScanListArg) {
00203     if (ParseScanList(H, CS, I, E))
00204       return true;
00205   }
00206   FS.setConversionSpecifier(CS);
00207   if (CS.consumesDataArgument() && !FS.getSuppressAssignment()
00208       && !FS.usesPositionalArg())
00209     FS.setArgIndex(argIndex++);
00210 
00211   // FIXME: '%' and '*' together don't make sense.  Issue a warning.
00212   // FIXME: 'ConsumedSoFar' and '*' together don't make sense.
00213 
00214   if (k == ScanfConversionSpecifier::InvalidSpecifier) {
00215     // Assume the conversion takes one argument.
00216     return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, I - Beg);
00217   }
00218   return ScanfSpecifierResult(Start, FS);
00219 }
00220 // Maps the conversion specifier plus length modifier to an expected ArgType.
00221 ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const {
00222   const ScanfConversionSpecifier &CS = getConversionSpecifier();
00223 
00224   if (!CS.consumesDataArgument())
00225     return ArgType::Invalid();
00226 
00227   switch(CS.getKind()) {
00228     // Signed int.
00229     case ConversionSpecifier::dArg:
00230     case ConversionSpecifier::DArg:
00231     case ConversionSpecifier::iArg:
00232       switch (LM.getKind()) {
00233         case LengthModifier::None:
00234           return ArgType::PtrTo(Ctx.IntTy);
00235         case LengthModifier::AsChar:
00236           return ArgType::PtrTo(ArgType::AnyCharTy);
00237         case LengthModifier::AsShort:
00238           return ArgType::PtrTo(Ctx.ShortTy);
00239         case LengthModifier::AsLong:
00240           return ArgType::PtrTo(Ctx.LongTy);
00241         case LengthModifier::AsLongLong:
00242         case LengthModifier::AsQuad:
00243           return ArgType::PtrTo(Ctx.LongLongTy);
00244         case LengthModifier::AsInt64:
00245           return ArgType::PtrTo(ArgType(Ctx.LongLongTy, "__int64"));
00246         case LengthModifier::AsIntMax:
00247           return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t"));
00248         case LengthModifier::AsSizeT:
00249           // FIXME: ssize_t.
00250           return ArgType();
00251         case LengthModifier::AsPtrDiff:
00252           return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"));
00253         case LengthModifier::AsLongDouble:
00254           // GNU extension.
00255           return ArgType::PtrTo(Ctx.LongLongTy);
00256         case LengthModifier::AsAllocate:
00257         case LengthModifier::AsMAllocate:
00258         case LengthModifier::AsInt32:
00259         case LengthModifier::AsInt3264:
00260         case LengthModifier::AsWide:
00261           return ArgType::Invalid();
00262       }
00263       // NOTE(review): no default above; an unlisted kind would fall into the next group.
00264     // Unsigned int.
00265     case ConversionSpecifier::oArg:
00266     case ConversionSpecifier::OArg:
00267     case ConversionSpecifier::uArg:
00268     case ConversionSpecifier::UArg:
00269     case ConversionSpecifier::xArg:
00270     case ConversionSpecifier::XArg:
00271       switch (LM.getKind()) {
00272         case LengthModifier::None:
00273           return ArgType::PtrTo(Ctx.UnsignedIntTy);
00274         case LengthModifier::AsChar:
00275           return ArgType::PtrTo(Ctx.UnsignedCharTy);
00276         case LengthModifier::AsShort:
00277           return ArgType::PtrTo(Ctx.UnsignedShortTy);
00278         case LengthModifier::AsLong:
00279           return ArgType::PtrTo(Ctx.UnsignedLongTy);
00280         case LengthModifier::AsLongLong:
00281         case LengthModifier::AsQuad:
00282           return ArgType::PtrTo(Ctx.UnsignedLongLongTy);
00283         case LengthModifier::AsInt64:
00284           return ArgType::PtrTo(ArgType(Ctx.UnsignedLongLongTy, "unsigned __int64"));
00285         case LengthModifier::AsIntMax:
00286           return ArgType::PtrTo(ArgType(Ctx.getUIntMaxType(), "uintmax_t"));
00287         case LengthModifier::AsSizeT:
00288           return ArgType::PtrTo(ArgType(Ctx.getSizeType(), "size_t"));
00289         case LengthModifier::AsPtrDiff:
00290           // FIXME: Unsigned version of ptrdiff_t?
00291           return ArgType();
00292         case LengthModifier::AsLongDouble:
00293           // GNU extension.
00294           return ArgType::PtrTo(Ctx.UnsignedLongLongTy);
00295         case LengthModifier::AsAllocate:
00296         case LengthModifier::AsMAllocate:
00297         case LengthModifier::AsInt32:
00298         case LengthModifier::AsInt3264:
00299         case LengthModifier::AsWide:
00300           return ArgType::Invalid();
00301       }
00302       // NOTE(review): as above, this inner switch has no default.
00303     // Float.
00304     case ConversionSpecifier::aArg:
00305     case ConversionSpecifier::AArg:
00306     case ConversionSpecifier::eArg:
00307     case ConversionSpecifier::EArg:
00308     case ConversionSpecifier::fArg:
00309     case ConversionSpecifier::FArg:
00310     case ConversionSpecifier::gArg:
00311     case ConversionSpecifier::GArg:
00312       switch (LM.getKind()) {
00313         case LengthModifier::None:
00314           return ArgType::PtrTo(Ctx.FloatTy);
00315         case LengthModifier::AsLong:
00316           return ArgType::PtrTo(Ctx.DoubleTy);
00317         case LengthModifier::AsLongDouble:
00318           return ArgType::PtrTo(Ctx.LongDoubleTy);
00319         default:
00320           return ArgType::Invalid();
00321       }
00322 
00323     // Char, string and scanlist.
00324     case ConversionSpecifier::cArg:
00325     case ConversionSpecifier::sArg:
00326     case ConversionSpecifier::ScanListArg:
00327       switch (LM.getKind()) {
00328         case LengthModifier::None:
00329           return ArgType::PtrTo(ArgType::AnyCharTy);
00330         case LengthModifier::AsLong:
00331         case LengthModifier::AsWide:
00332           return ArgType::PtrTo(ArgType(Ctx.getWideCharType(), "wchar_t"));
00333         case LengthModifier::AsAllocate:
00334         case LengthModifier::AsMAllocate:
00335           return ArgType::PtrTo(ArgType::CStrTy);
00336         case LengthModifier::AsShort:
00337           if (Ctx.getTargetInfo().getTriple().isOSMSVCRT())
00338             return ArgType::PtrTo(ArgType::AnyCharTy);
00339         default:  // Also reached by fall-through from AsShort off MSVCRT.
00340           return ArgType::Invalid();
00341       }
00342     case ConversionSpecifier::CArg:
00343     case ConversionSpecifier::SArg:
00344       // FIXME: Mac OS X specific?
00345       switch (LM.getKind()) {
00346         case LengthModifier::None:
00347         case LengthModifier::AsWide:
00348           return ArgType::PtrTo(ArgType(Ctx.getWideCharType(), "wchar_t"));
00349         case LengthModifier::AsAllocate:
00350         case LengthModifier::AsMAllocate:
00351           return ArgType::PtrTo(ArgType(ArgType::WCStrTy, "wchar_t *"));
00352         case LengthModifier::AsShort:
00353           if (Ctx.getTargetInfo().getTriple().isOSMSVCRT())
00354             return ArgType::PtrTo(ArgType::AnyCharTy);
00355         default:  // Also reached by fall-through from AsShort off MSVCRT.
00356           return ArgType::Invalid();
00357       }
00358 
00359     // Pointer.
00360     case ConversionSpecifier::pArg:
00361       return ArgType::PtrTo(ArgType::CPointerTy);
00362 
00363     // Write-back.
00364     case ConversionSpecifier::nArg:
00365       switch (LM.getKind()) {
00366         case LengthModifier::None:
00367           return ArgType::PtrTo(Ctx.IntTy);
00368         case LengthModifier::AsChar:
00369           return ArgType::PtrTo(Ctx.SignedCharTy);
00370         case LengthModifier::AsShort:
00371           return ArgType::PtrTo(Ctx.ShortTy);
00372         case LengthModifier::AsLong:
00373           return ArgType::PtrTo(Ctx.LongTy);
00374         case LengthModifier::AsLongLong:
00375         case LengthModifier::AsQuad:
00376           return ArgType::PtrTo(Ctx.LongLongTy);
00377         case LengthModifier::AsInt64:
00378           return ArgType::PtrTo(ArgType(Ctx.LongLongTy, "__int64"));
00379         case LengthModifier::AsIntMax:
00380           return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t"));
00381         case LengthModifier::AsSizeT:
00382           return ArgType(); // FIXME: ssize_t
00383         case LengthModifier::AsPtrDiff:
00384           return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"));
00385         case LengthModifier::AsLongDouble:
00386           return ArgType(); // FIXME: Is this a known extension?
00387         case LengthModifier::AsAllocate:
00388         case LengthModifier::AsMAllocate:
00389         case LengthModifier::AsInt32:
00390         case LengthModifier::AsInt3264:
00391         case LengthModifier::AsWide:
00392           return ArgType::Invalid();
00393         }
00394 
00395     default:
00396       break;
00397   }
00398   // Conversion kinds with no case above end up here.
00399   return ArgType();
00400 }
00401 // Tries to adapt this specifier to an argument of type QT; true if it did.
00402 bool ScanfSpecifier::fixType(QualType QT, QualType RawQT,
00403                              const LangOptions &LangOpt,
00404                              ASTContext &Ctx) {
00405   // %n is different from other conversion specifiers; don't try to fix it.
00406   if (CS.getKind() == ConversionSpecifier::nArg)
00407     return false;
00408 
00409   if (!QT->isPointerType())
00410     return false;
00411 
00412   QualType PT = QT->getPointeeType();
00413 
00414   // If it's an enum, get its underlying type.  An incomplete enum does not
00415   // have one yet (getIntegerType() is null), so there is nothing to fix.
00416   if (const EnumType *ETy = PT->getAs<EnumType>()) {
00417     if (!ETy->getDecl()->isComplete())
00418       return false;
00419     PT = ETy->getDecl()->getIntegerType();
00420   }
00421 
00422   const BuiltinType *BT = PT->getAs<BuiltinType>();
00423   if (!BT)
00424     return false;
00425 
00426   // Pointer to a character: always suggest a string conversion.
00427   if (PT->isAnyCharacterType()) {
00428     CS.setKind(ConversionSpecifier::sArg);
00429     LM.setKind(PT->isWideCharType() ? LengthModifier::AsWideChar
00430                                     : LengthModifier::None);
00431 
00432     // If we know the target array length, use length - 1 as the field width.
00433     if (const ConstantArrayType *CAT = Ctx.getAsConstantArrayType(RawQT)) {
00434       if (CAT->getSizeModifier() == ArrayType::Normal)
00435         FieldWidth = OptionalAmount(OptionalAmount::Constant,
00436                                     CAT->getSize().getZExtValue() - 1,
00437                                     "", 0, false);
00438     }
00439     return true;
00440   }
00441 
00442   // Figure out the length modifier.
00443   switch (BT->getKind()) {
00444     // no modifier
00445     case BuiltinType::UInt:
00446     case BuiltinType::Int:
00447     case BuiltinType::Float:
00448       LM.setKind(LengthModifier::None);
00449       break;
00450 
00451     // hh
00452     case BuiltinType::Char_U:
00453     case BuiltinType::UChar:
00454     case BuiltinType::Char_S:
00455     case BuiltinType::SChar:
00456       LM.setKind(LengthModifier::AsChar);
00457       break;
00458 
00459     // h
00460     case BuiltinType::Short:
00461     case BuiltinType::UShort:
00462       LM.setKind(LengthModifier::AsShort);
00463       break;
00464 
00465     // l
00466     case BuiltinType::Long:
00467     case BuiltinType::ULong:
00468     case BuiltinType::Double:
00469       LM.setKind(LengthModifier::AsLong);
00470       break;
00471 
00472     // ll
00473     case BuiltinType::LongLong:
00474     case BuiltinType::ULongLong:
00475       LM.setKind(LengthModifier::AsLongLong);
00476       break;
00477 
00478     // L
00479     case BuiltinType::LongDouble:
00480       LM.setKind(LengthModifier::AsLongDouble);
00481       break;
00482 
00483     // Don't know.
00484     default:
00485       return false;
00486   }
00487 
00488   // Handle size_t, ptrdiff_t, etc. that have dedicated length modifiers in C99.
00489   if (isa<TypedefType>(PT) && (LangOpt.C99 || LangOpt.CPlusPlus11))
00490     namedTypeToLengthModifier(PT, LM);
00491 
00492   // If fixing the length modifier was enough, we are done.
00493   if (hasValidLengthModifier(Ctx.getTargetInfo())) {
00494     const analyze_scanf::ArgType &AT = getArgType(Ctx);
00495     if (AT.isValid() && AT.matchesType(Ctx, QT))
00496       return true;
00497   }
00498 
00499   // Figure out the conversion specifier.
00500   if (PT->isRealFloatingType())
00501     CS.setKind(ConversionSpecifier::fArg);
00502   else if (PT->isSignedIntegerType())
00503     CS.setKind(ConversionSpecifier::dArg);
00504   else if (PT->isUnsignedIntegerType())
00505     CS.setKind(ConversionSpecifier::uArg);
00506   else
00507     llvm_unreachable("Unexpected type");
00508 
00509   return true;
00510 }
00511 // Prints this specifier to os, starting with '%' and ending with the conversion.
00512 void ScanfSpecifier::toString(raw_ostream &os) const {
00513   os << "%";
00514   if (usesPositionalArg()) {
00515     os << getPositionalArgIndex();
00516     os << "$";
00517   }
00518   if (SuppressAssignment)
00519     os << "*";
00520   // Field width, then length modifier, then conversion specifier.
00521   FieldWidth.toString(os);
00522   os << LM.toString() << CS.toString();
00523 }
00524 
00525 bool clang::analyze_format_string::ParseScanfString(FormatStringHandler &H,
00526                                                     const char *I,
00527                                                     const char *E,
00528                                                     const LangOptions &LO,
00529                                                     const TargetInfo &Target) {
00530   
00531   unsigned argIndex = 0;
00532   
00533   // Keep looking for a format specifier until we have exhausted the string.
00534   while (I != E) {
00535     const ScanfSpecifierResult &FSR = ParseScanfSpecifier(H, I, E, argIndex,
00536                                                           LO, Target);
00537     // Did a fail-stop error of any kind occur when parsing the specifier?
00538     // If so, don't do any more processing.
00539     if (FSR.shouldStop())
00540       return true;
00541       // Did we exhaust the string or encounter an error that
00542       // we can recover from?
00543     if (!FSR.hasValue())
00544       continue;
00545       // We have a format specifier.  Pass it to the callback.
00546     if (!H.HandleScanfSpecifier(FSR.getValue(), FSR.getStart(),
00547                                 I - FSR.getStart())) {
00548       return true;
00549     }
00550   }
00551   assert(I == E && "Format string not exhausted");
00552   return false;
00553 }