Header And Logo

PostgreSQL
| The world's most advanced open source database.

nodeBitmapHeapscan.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * nodeBitmapHeapscan.c
00004  *    Routines to support bitmapped scans of relations
00005  *
00006  * NOTE: it is critical that this plan type only be used with MVCC-compliant
00007  * snapshots (ie, regular snapshots, not SnapshotNow or one of the other
00008  * special snapshots).  The reason is that since index and heap scans are
00009  * decoupled, there can be no assurance that the index tuple prompting a
00010  * visit to a particular heap TID still exists when the visit is made.
00011  * Therefore the tuple might not exist anymore either (which is OK because
00012  * heap_fetch will cope) --- but worse, the tuple slot could have been
00013  * re-used for a newer tuple.  With an MVCC snapshot the newer tuple is
00014  * certain to fail the time qual and so it will not be mistakenly returned.
00015  * With SnapshotNow we might return a tuple that doesn't meet the required
00016  * index qual conditions.
00017  *
00018  *
00019  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00020  * Portions Copyright (c) 1994, Regents of the University of California
00021  *
00022  *
00023  * IDENTIFICATION
00024  *    src/backend/executor/nodeBitmapHeapscan.c
00025  *
00026  *-------------------------------------------------------------------------
00027  */
00028 /*
00029  * INTERFACE ROUTINES
00030  *      ExecBitmapHeapScan          scans a relation using bitmap info
00031  *      ExecBitmapHeapNext          workhorse for above
00032  *      ExecInitBitmapHeapScan      creates and initializes state info.
00033  *      ExecReScanBitmapHeapScan    prepares to rescan the plan.
00034  *      ExecEndBitmapHeapScan       releases all storage.
00035  */
00036 #include "postgres.h"
00037 
00038 #include "access/relscan.h"
00039 #include "access/transam.h"
00040 #include "executor/execdebug.h"
00041 #include "executor/nodeBitmapHeapscan.h"
00042 #include "pgstat.h"
00043 #include "storage/bufmgr.h"
00044 #include "storage/predicate.h"
00045 #include "utils/memutils.h"
00046 #include "utils/rel.h"
00047 #include "utils/snapmgr.h"
00048 #include "utils/tqual.h"
00049 
00050 
00051 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
00052 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
00053 
00054 
00055 /* ----------------------------------------------------------------
00056  *      BitmapHeapNext
00057  *
00058  *      Retrieve next tuple from the BitmapHeapScan node's currentRelation
00059  * ----------------------------------------------------------------
00060  */
00061 static TupleTableSlot *
00062 BitmapHeapNext(BitmapHeapScanState *node)
00063 {
00064     ExprContext *econtext;
00065     HeapScanDesc scan;
00066     TIDBitmap  *tbm;
00067     TBMIterator *tbmiterator;
00068     TBMIterateResult *tbmres;
00069 
00070 #ifdef USE_PREFETCH
00071     TBMIterator *prefetch_iterator;
00072 #endif
00073     OffsetNumber targoffset;
00074     TupleTableSlot *slot;
00075 
00076     /*
00077      * extract necessary information from index scan node
00078      */
00079     econtext = node->ss.ps.ps_ExprContext;
00080     slot = node->ss.ss_ScanTupleSlot;
00081     scan = node->ss.ss_currentScanDesc;
00082     tbm = node->tbm;
00083     tbmiterator = node->tbmiterator;
00084     tbmres = node->tbmres;
00085 #ifdef USE_PREFETCH
00086     prefetch_iterator = node->prefetch_iterator;
00087 #endif
00088 
00089     /*
00090      * If we haven't yet performed the underlying index scan, do it, and begin
00091      * the iteration over the bitmap.
00092      *
00093      * For prefetching, we use *two* iterators, one for the pages we are
00094      * actually scanning and another that runs ahead of the first for
00095      * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
00096      * the prefetch iterator is.  Also, node->prefetch_target tracks the
00097      * desired prefetch distance, which starts small and increases up to the
00098      * GUC-controlled maximum, target_prefetch_pages.  This is to avoid doing
00099      * a lot of prefetching in a scan that stops after a few tuples because of
00100      * a LIMIT.
00101      */
00102     if (tbm == NULL)
00103     {
00104         tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
00105 
00106         if (!tbm || !IsA(tbm, TIDBitmap))
00107             elog(ERROR, "unrecognized result from subplan");
00108 
00109         node->tbm = tbm;
00110         node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
00111         node->tbmres = tbmres = NULL;
00112 
00113 #ifdef USE_PREFETCH
00114         if (target_prefetch_pages > 0)
00115         {
00116             node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
00117             node->prefetch_pages = 0;
00118             node->prefetch_target = -1;
00119         }
00120 #endif   /* USE_PREFETCH */
00121     }
00122 
00123     for (;;)
00124     {
00125         Page        dp;
00126         ItemId      lp;
00127 
00128         /*
00129          * Get next page of results if needed
00130          */
00131         if (tbmres == NULL)
00132         {
00133             node->tbmres = tbmres = tbm_iterate(tbmiterator);
00134             if (tbmres == NULL)
00135             {
00136                 /* no more entries in the bitmap */
00137                 break;
00138             }
00139 
00140 #ifdef USE_PREFETCH
00141             if (node->prefetch_pages > 0)
00142             {
00143                 /* The main iterator has closed the distance by one page */
00144                 node->prefetch_pages--;
00145             }
00146             else if (prefetch_iterator)
00147             {
00148                 /* Do not let the prefetch iterator get behind the main one */
00149                 TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
00150 
00151                 if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
00152                     elog(ERROR, "prefetch and main iterators are out of sync");
00153             }
00154 #endif   /* USE_PREFETCH */
00155 
00156             /*
00157              * Ignore any claimed entries past what we think is the end of the
00158              * relation.  (This is probably not necessary given that we got at
00159              * least AccessShareLock on the table before performing any of the
00160              * indexscans, but let's be safe.)
00161              */
00162             if (tbmres->blockno >= scan->rs_nblocks)
00163             {
00164                 node->tbmres = tbmres = NULL;
00165                 continue;
00166             }
00167 
00168             /*
00169              * Fetch the current heap page and identify candidate tuples.
00170              */
00171             bitgetpage(scan, tbmres);
00172 
00173             /*
00174              * Set rs_cindex to first slot to examine
00175              */
00176             scan->rs_cindex = 0;
00177 
00178 #ifdef USE_PREFETCH
00179 
00180             /*
00181              * Increase prefetch target if it's not yet at the max.  Note that
00182              * we will increase it to zero after fetching the very first
00183              * page/tuple, then to one after the second tuple is fetched, then
00184              * it doubles as later pages are fetched.
00185              */
00186             if (node->prefetch_target >= target_prefetch_pages)
00187                  /* don't increase any further */ ;
00188             else if (node->prefetch_target >= target_prefetch_pages / 2)
00189                 node->prefetch_target = target_prefetch_pages;
00190             else if (node->prefetch_target > 0)
00191                 node->prefetch_target *= 2;
00192             else
00193                 node->prefetch_target++;
00194 #endif   /* USE_PREFETCH */
00195         }
00196         else
00197         {
00198             /*
00199              * Continuing in previously obtained page; advance rs_cindex
00200              */
00201             scan->rs_cindex++;
00202 
00203 #ifdef USE_PREFETCH
00204 
00205             /*
00206              * Try to prefetch at least a few pages even before we get to the
00207              * second page if we don't stop reading after the first tuple.
00208              */
00209             if (node->prefetch_target < target_prefetch_pages)
00210                 node->prefetch_target++;
00211 #endif   /* USE_PREFETCH */
00212         }
00213 
00214         /*
00215          * Out of range?  If so, nothing more to look at on this page
00216          */
00217         if (scan->rs_cindex < 0 || scan->rs_cindex >= scan->rs_ntuples)
00218         {
00219             node->tbmres = tbmres = NULL;
00220             continue;
00221         }
00222 
00223 #ifdef USE_PREFETCH
00224 
00225         /*
00226          * We issue prefetch requests *after* fetching the current page to try
00227          * to avoid having prefetching interfere with the main I/O. Also, this
00228          * should happen only when we have determined there is still something
00229          * to do on the current page, else we may uselessly prefetch the same
00230          * page we are just about to request for real.
00231          */
00232         if (prefetch_iterator)
00233         {
00234             while (node->prefetch_pages < node->prefetch_target)
00235             {
00236                 TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
00237 
00238                 if (tbmpre == NULL)
00239                 {
00240                     /* No more pages to prefetch */
00241                     tbm_end_iterate(prefetch_iterator);
00242                     node->prefetch_iterator = prefetch_iterator = NULL;
00243                     break;
00244                 }
00245                 node->prefetch_pages++;
00246                 PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
00247             }
00248         }
00249 #endif   /* USE_PREFETCH */
00250 
00251         /*
00252          * Okay to fetch the tuple
00253          */
00254         targoffset = scan->rs_vistuples[scan->rs_cindex];
00255         dp = (Page) BufferGetPage(scan->rs_cbuf);
00256         lp = PageGetItemId(dp, targoffset);
00257         Assert(ItemIdIsNormal(lp));
00258 
00259         scan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
00260         scan->rs_ctup.t_len = ItemIdGetLength(lp);
00261         ItemPointerSet(&scan->rs_ctup.t_self, tbmres->blockno, targoffset);
00262 
00263         pgstat_count_heap_fetch(scan->rs_rd);
00264 
00265         /*
00266          * Set up the result slot to point to this tuple. Note that the slot
00267          * acquires a pin on the buffer.
00268          */
00269         ExecStoreTuple(&scan->rs_ctup,
00270                        slot,
00271                        scan->rs_cbuf,
00272                        false);
00273 
00274         /*
00275          * If we are using lossy info, we have to recheck the qual conditions
00276          * at every tuple.
00277          */
00278         if (tbmres->recheck)
00279         {
00280             econtext->ecxt_scantuple = slot;
00281             ResetExprContext(econtext);
00282 
00283             if (!ExecQual(node->bitmapqualorig, econtext, false))
00284             {
00285                 /* Fails recheck, so drop it and loop back for another */
00286                 InstrCountFiltered2(node, 1);
00287                 ExecClearTuple(slot);
00288                 continue;
00289             }
00290         }
00291 
00292         /* OK to return this tuple */
00293         return slot;
00294     }
00295 
00296     /*
00297      * if we get here it means we are at the end of the scan..
00298      */
00299     return ExecClearTuple(slot);
00300 }
00301 
00302 /*
00303  * bitgetpage - subroutine for BitmapHeapNext()
00304  *
00305  * This routine reads and pins the specified page of the relation, then
00306  * builds an array indicating which tuples on the page are both potentially
00307  * interesting according to the bitmap, and visible according to the snapshot.
00308  */
00309 static void
00310 bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
00311 {
00312     BlockNumber page = tbmres->blockno;
00313     Buffer      buffer;
00314     Snapshot    snapshot;
00315     int         ntup;
00316 
00317     /*
00318      * Acquire pin on the target heap page, trading in any pin we held before.
00319      */
00320     Assert(page < scan->rs_nblocks);
00321 
00322     scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
00323                                          scan->rs_rd,
00324                                          page);
00325     buffer = scan->rs_cbuf;
00326     snapshot = scan->rs_snapshot;
00327 
00328     ntup = 0;
00329 
00330     /*
00331      * Prune and repair fragmentation for the whole page, if possible.
00332      */
00333     Assert(TransactionIdIsValid(RecentGlobalXmin));
00334     heap_page_prune_opt(scan->rs_rd, buffer, RecentGlobalXmin);
00335 
00336     /*
00337      * We must hold share lock on the buffer content while examining tuple
00338      * visibility.  Afterwards, however, the tuples we have found to be
00339      * visible are guaranteed good as long as we hold the buffer pin.
00340      */
00341     LockBuffer(buffer, BUFFER_LOCK_SHARE);
00342 
00343     /*
00344      * We need two separate strategies for lossy and non-lossy cases.
00345      */
00346     if (tbmres->ntuples >= 0)
00347     {
00348         /*
00349          * Bitmap is non-lossy, so we just look through the offsets listed in
00350          * tbmres; but we have to follow any HOT chain starting at each such
00351          * offset.
00352          */
00353         int         curslot;
00354 
00355         for (curslot = 0; curslot < tbmres->ntuples; curslot++)
00356         {
00357             OffsetNumber offnum = tbmres->offsets[curslot];
00358             ItemPointerData tid;
00359             HeapTupleData heapTuple;
00360 
00361             ItemPointerSet(&tid, page, offnum);
00362             if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
00363                                        &heapTuple, NULL, true))
00364                 scan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
00365         }
00366     }
00367     else
00368     {
00369         /*
00370          * Bitmap is lossy, so we must examine each item pointer on the page.
00371          * But we can ignore HOT chains, since we'll check each tuple anyway.
00372          */
00373         Page        dp = (Page) BufferGetPage(buffer);
00374         OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
00375         OffsetNumber offnum;
00376 
00377         for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
00378         {
00379             ItemId      lp;
00380             HeapTupleData loctup;
00381             bool        valid;
00382 
00383             lp = PageGetItemId(dp, offnum);
00384             if (!ItemIdIsNormal(lp))
00385                 continue;
00386             loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
00387             loctup.t_len = ItemIdGetLength(lp);
00388             loctup.t_tableOid = scan->rs_rd->rd_id;
00389             ItemPointerSet(&loctup.t_self, page, offnum);
00390             valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
00391             if (valid)
00392             {
00393                 scan->rs_vistuples[ntup++] = offnum;
00394                 PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
00395             }
00396             CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
00397                                             buffer, snapshot);
00398         }
00399     }
00400 
00401     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
00402 
00403     Assert(ntup <= MaxHeapTuplesPerPage);
00404     scan->rs_ntuples = ntup;
00405 }
00406 
00407 /*
00408  * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
00409  */
00410 static bool
00411 BitmapHeapRecheck(BitmapHeapScanState *node, TupleTableSlot *slot)
00412 {
00413     ExprContext *econtext;
00414 
00415     /*
00416      * extract necessary information from index scan node
00417      */
00418     econtext = node->ss.ps.ps_ExprContext;
00419 
00420     /* Does the tuple meet the original qual conditions? */
00421     econtext->ecxt_scantuple = slot;
00422 
00423     ResetExprContext(econtext);
00424 
00425     return ExecQual(node->bitmapqualorig, econtext, false);
00426 }
00427 
00428 /* ----------------------------------------------------------------
00429  *      ExecBitmapHeapScan(node)
00430  * ----------------------------------------------------------------
00431  */
00432 TupleTableSlot *
00433 ExecBitmapHeapScan(BitmapHeapScanState *node)
00434 {
00435     return ExecScan(&node->ss,
00436                     (ExecScanAccessMtd) BitmapHeapNext,
00437                     (ExecScanRecheckMtd) BitmapHeapRecheck);
00438 }
00439 
00440 /* ----------------------------------------------------------------
00441  *      ExecReScanBitmapHeapScan(node)
00442  * ----------------------------------------------------------------
00443  */
00444 void
00445 ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
00446 {
00447     /* rescan to release any page pin */
00448     heap_rescan(node->ss.ss_currentScanDesc, NULL);
00449 
00450     if (node->tbmiterator)
00451         tbm_end_iterate(node->tbmiterator);
00452     if (node->prefetch_iterator)
00453         tbm_end_iterate(node->prefetch_iterator);
00454     if (node->tbm)
00455         tbm_free(node->tbm);
00456     node->tbm = NULL;
00457     node->tbmiterator = NULL;
00458     node->tbmres = NULL;
00459     node->prefetch_iterator = NULL;
00460 
00461     ExecScanReScan(&node->ss);
00462 
00463     /*
00464      * if chgParam of subnode is not null then plan will be re-scanned by
00465      * first ExecProcNode.
00466      */
00467     if (node->ss.ps.lefttree->chgParam == NULL)
00468         ExecReScan(node->ss.ps.lefttree);
00469 }
00470 
00471 /* ----------------------------------------------------------------
00472  *      ExecEndBitmapHeapScan
00473  * ----------------------------------------------------------------
00474  */
00475 void
00476 ExecEndBitmapHeapScan(BitmapHeapScanState *node)
00477 {
00478     Relation    relation;
00479     HeapScanDesc scanDesc;
00480 
00481     /*
00482      * extract information from the node
00483      */
00484     relation = node->ss.ss_currentRelation;
00485     scanDesc = node->ss.ss_currentScanDesc;
00486 
00487     /*
00488      * Free the exprcontext
00489      */
00490     ExecFreeExprContext(&node->ss.ps);
00491 
00492     /*
00493      * clear out tuple table slots
00494      */
00495     ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
00496     ExecClearTuple(node->ss.ss_ScanTupleSlot);
00497 
00498     /*
00499      * close down subplans
00500      */
00501     ExecEndNode(outerPlanState(node));
00502 
00503     /*
00504      * release bitmap if any
00505      */
00506     if (node->tbmiterator)
00507         tbm_end_iterate(node->tbmiterator);
00508     if (node->prefetch_iterator)
00509         tbm_end_iterate(node->prefetch_iterator);
00510     if (node->tbm)
00511         tbm_free(node->tbm);
00512 
00513     /*
00514      * close heap scan
00515      */
00516     heap_endscan(scanDesc);
00517 
00518     /*
00519      * close the heap relation.
00520      */
00521     ExecCloseScanRelation(relation);
00522 }
00523 
00524 /* ----------------------------------------------------------------
00525  *      ExecInitBitmapHeapScan
00526  *
00527  *      Initializes the scan's state information.
00528  * ----------------------------------------------------------------
00529  */
00530 BitmapHeapScanState *
00531 ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
00532 {
00533     BitmapHeapScanState *scanstate;
00534     Relation    currentRelation;
00535 
00536     /* check for unsupported flags */
00537     Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
00538 
00539     /*
00540      * Assert caller didn't ask for an unsafe snapshot --- see comments at
00541      * head of file.
00542      */
00543     Assert(IsMVCCSnapshot(estate->es_snapshot));
00544 
00545     /*
00546      * create state structure
00547      */
00548     scanstate = makeNode(BitmapHeapScanState);
00549     scanstate->ss.ps.plan = (Plan *) node;
00550     scanstate->ss.ps.state = estate;
00551 
00552     scanstate->tbm = NULL;
00553     scanstate->tbmiterator = NULL;
00554     scanstate->tbmres = NULL;
00555     scanstate->prefetch_iterator = NULL;
00556     scanstate->prefetch_pages = 0;
00557     scanstate->prefetch_target = 0;
00558 
00559     /*
00560      * Miscellaneous initialization
00561      *
00562      * create expression context for node
00563      */
00564     ExecAssignExprContext(estate, &scanstate->ss.ps);
00565 
00566     scanstate->ss.ps.ps_TupFromTlist = false;
00567 
00568     /*
00569      * initialize child expressions
00570      */
00571     scanstate->ss.ps.targetlist = (List *)
00572         ExecInitExpr((Expr *) node->scan.plan.targetlist,
00573                      (PlanState *) scanstate);
00574     scanstate->ss.ps.qual = (List *)
00575         ExecInitExpr((Expr *) node->scan.plan.qual,
00576                      (PlanState *) scanstate);
00577     scanstate->bitmapqualorig = (List *)
00578         ExecInitExpr((Expr *) node->bitmapqualorig,
00579                      (PlanState *) scanstate);
00580 
00581     /*
00582      * tuple table initialization
00583      */
00584     ExecInitResultTupleSlot(estate, &scanstate->ss.ps);
00585     ExecInitScanTupleSlot(estate, &scanstate->ss);
00586 
00587     /*
00588      * open the base relation and acquire appropriate lock on it.
00589      */
00590     currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
00591 
00592     scanstate->ss.ss_currentRelation = currentRelation;
00593 
00594     /*
00595      * Even though we aren't going to do a conventional seqscan, it is useful
00596      * to create a HeapScanDesc --- most of the fields in it are usable.
00597      */
00598     scanstate->ss.ss_currentScanDesc = heap_beginscan_bm(currentRelation,
00599                                                          estate->es_snapshot,
00600                                                          0,
00601                                                          NULL);
00602 
00603     /*
00604      * get the scan type from the relation descriptor.
00605      */
00606     ExecAssignScanType(&scanstate->ss, RelationGetDescr(currentRelation));
00607 
00608     /*
00609      * Initialize result tuple type and projection info.
00610      */
00611     ExecAssignResultTypeFromTL(&scanstate->ss.ps);
00612     ExecAssignScanProjectionInfo(&scanstate->ss);
00613 
00614     /*
00615      * initialize child nodes
00616      *
00617      * We do this last because the child nodes will open indexscans on our
00618      * relation's indexes, and we want to be sure we have acquired a lock on
00619      * the relation first.
00620      */
00621     outerPlanState(scanstate) = ExecInitNode(outerPlan(node), estate, eflags);
00622 
00623     /*
00624      * all done.
00625      */
00626     return scanstate;
00627 }