Header And Logo

PostgreSQL
| The world's most advanced open source database.

pruneheap.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * pruneheap.c
00004  *    heap page pruning and HOT-chain management code
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  * Portions Copyright (c) 1994, Regents of the University of California
00008  *
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/access/heap/pruneheap.c
00012  *
00013  *-------------------------------------------------------------------------
00014  */
00015 #include "postgres.h"
00016 
00017 #include "access/heapam.h"
00018 #include "access/heapam_xlog.h"
00019 #include "access/transam.h"
00020 #include "access/htup_details.h"
00021 #include "miscadmin.h"
00022 #include "pgstat.h"
00023 #include "storage/bufmgr.h"
00024 #include "utils/rel.h"
00025 #include "utils/tqual.h"
00026 
00027 
/*
 * Working data for heap_page_prune and subroutines.
 *
 * The prune pass over a page only *plans* changes; they are collected here
 * and applied later inside a critical section (see heap_page_prune).
 */
typedef struct
{
    TransactionId new_prune_xid;    /* new prune hint value for page */
    TransactionId latestRemovedXid;     /* latest xid to be removed by this
                                         * prune */
    int         nredirected;    /* numbers of entries in arrays below */
    int         ndead;
    int         nunused;
    /*
     * Arrays that accumulate indexes of items to be changed.  redirected[]
     * holds pairs: redirected[2*i] is the item to redirect and
     * redirected[2*i+1] is its new target (see heap_prune_record_redirect).
     */
    OffsetNumber redirected[MaxHeapTuplesPerPage * 2];
    OffsetNumber nowdead[MaxHeapTuplesPerPage];
    OffsetNumber nowunused[MaxHeapTuplesPerPage];
    /*
     * marked[i] is TRUE if item i is entered in one of the above arrays.
     * Indexed by 1-based offset number, hence the extra array slot.
     */
    bool        marked[MaxHeapTuplesPerPage + 1];
} PruneState;
00044 
00045 /* Local functions */
00046 static int heap_prune_chain(Relation relation, Buffer buffer,
00047                  OffsetNumber rootoffnum,
00048                  TransactionId OldestXmin,
00049                  PruneState *prstate);
00050 static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid);
00051 static void heap_prune_record_redirect(PruneState *prstate,
00052                            OffsetNumber offnum, OffsetNumber rdoffnum);
00053 static void heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum);
00054 static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum);
00055 
00056 
00057 /*
00058  * Optionally prune and repair fragmentation in the specified page.
00059  *
00060  * This is an opportunistic function.  It will perform housekeeping
00061  * only if the page heuristically looks like a candidate for pruning and we
00062  * can acquire buffer cleanup lock without blocking.
00063  *
00064  * Note: this is called quite often.  It's important that it fall out quickly
00065  * if there's not any use in pruning.
00066  *
00067  * Caller must have pin on the buffer, and must *not* have a lock on it.
00068  *
00069  * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
00070  * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
00071  */
00072 void
00073 heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
00074 {
00075     Page        page = BufferGetPage(buffer);
00076     Size        minfree;
00077 
00078     /*
00079      * Let's see if we really need pruning.
00080      *
00081      * Forget it if page is not hinted to contain something prunable that's
00082      * older than OldestXmin.
00083      */
00084     if (!PageIsPrunable(page, OldestXmin))
00085         return;
00086 
00087     /*
00088      * We can't write WAL in recovery mode, so there's no point trying to
00089      * clean the page. The master will likely issue a cleaning WAL record soon
00090      * anyway, so this is no particular loss.
00091      */
00092     if (RecoveryInProgress())
00093         return;
00094 
00095     /*
00096      * We prune when a previous UPDATE failed to find enough space on the page
00097      * for a new tuple version, or when free space falls below the relation's
00098      * fill-factor target (but not less than 10%).
00099      *
00100      * Checking free space here is questionable since we aren't holding any
00101      * lock on the buffer; in the worst case we could get a bogus answer. It's
00102      * unlikely to be *seriously* wrong, though, since reading either pd_lower
00103      * or pd_upper is probably atomic.  Avoiding taking a lock seems more
00104      * important than sometimes getting a wrong answer in what is after all
00105      * just a heuristic estimate.
00106      */
00107     minfree = RelationGetTargetPageFreeSpace(relation,
00108                                              HEAP_DEFAULT_FILLFACTOR);
00109     minfree = Max(minfree, BLCKSZ / 10);
00110 
00111     if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
00112     {
00113         /* OK, try to get exclusive buffer lock */
00114         if (!ConditionalLockBufferForCleanup(buffer))
00115             return;
00116 
00117         /*
00118          * Now that we have buffer lock, get accurate information about the
00119          * page's free space, and recheck the heuristic about whether to
00120          * prune. (We needn't recheck PageIsPrunable, since no one else could
00121          * have pruned while we hold pin.)
00122          */
00123         if (PageIsFull(page) || PageGetHeapFreeSpace(page) < minfree)
00124         {
00125             TransactionId ignore = InvalidTransactionId;        /* return value not
00126                                                                  * needed */
00127 
00128             /* OK to prune */
00129             (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore);
00130         }
00131 
00132         /* And release buffer lock */
00133         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
00134     }
00135 }
00136 
00137 
/*
 * Prune and repair fragmentation in the specified page.
 *
 * Caller must have pin and buffer cleanup lock on the page.
 *
 * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD
 * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum).
 *
 * If report_stats is true then we send the number of reclaimed heap-only
 * tuples to pgstats.  (This must be FALSE during vacuum, since vacuum will
 * send its own new total to pgstats, and we don't want this delta applied
 * on top of that.)
 *
 * Returns the number of tuples deleted from the page and sets
 * latestRemovedXid (which is read as a starting value and advanced, never
 * moved backward).
 */
int
heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
                bool report_stats, TransactionId *latestRemovedXid)
{
    int         ndeleted = 0;
    Page        page = BufferGetPage(buffer);
    OffsetNumber offnum,
                maxoff;
    PruneState  prstate;

    /*
     * Our strategy is to scan the page and make lists of items to change,
     * then apply the changes within a critical section.  This keeps as much
     * logic as possible out of the critical section, and also ensures that
     * WAL replay will work the same as the normal case.
     *
     * First, initialize the new pd_prune_xid value to zero (indicating no
     * prunable tuples).  If we find any tuples which may soon become
     * prunable, we will save the lowest relevant XID in new_prune_xid. Also
     * initialize the rest of our working state.
     */
    prstate.new_prune_xid = InvalidTransactionId;
    /* seed from the caller's current value; pruning only advances it */
    prstate.latestRemovedXid = *latestRemovedXid;
    prstate.nredirected = prstate.ndead = prstate.nunused = 0;
    memset(prstate.marked, 0, sizeof(prstate.marked));

    /* Scan the page */
    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid;

        /* Ignore items already processed as part of an earlier chain */
        if (prstate.marked[offnum])
            continue;

        /* Nothing to do if slot is empty or already dead */
        itemid = PageGetItemId(page, offnum);
        if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid))
            continue;

        /* Process this item or chain of items */
        ndeleted += heap_prune_chain(relation, buffer, offnum,
                                     OldestXmin,
                                     &prstate);
    }

    /* Any error while applying the changes is critical */
    START_CRIT_SECTION();

    /* Have we found any prunable items? */
    if (prstate.nredirected > 0 || prstate.ndead > 0 || prstate.nunused > 0)
    {
        /*
         * Apply the planned item changes, then repair page fragmentation, and
         * update the page's hint bit about whether it has free line pointers.
         */
        heap_page_prune_execute(buffer,
                                prstate.redirected, prstate.nredirected,
                                prstate.nowdead, prstate.ndead,
                                prstate.nowunused, prstate.nunused);

        /*
         * Update the page's pd_prune_xid field to either zero, or the lowest
         * XID of any soon-prunable tuple.
         */
        ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;

        /*
         * Also clear the "page is full" flag, since there's no point in
         * repeating the prune/defrag process until something else happens to
         * the page.
         */
        PageClearFull(page);

        /* Dirty the buffer *before* computing and stamping the WAL LSN */
        MarkBufferDirty(buffer);

        /*
         * Emit a WAL HEAP_CLEAN record showing what we did
         */
        if (RelationNeedsWAL(relation))
        {
            XLogRecPtr  recptr;

            recptr = log_heap_clean(relation, buffer,
                                    prstate.redirected, prstate.nredirected,
                                    prstate.nowdead, prstate.ndead,
                                    prstate.nowunused, prstate.nunused,
                                    prstate.latestRemovedXid);

            PageSetLSN(BufferGetPage(buffer), recptr);
        }
    }
    else
    {
        /*
         * If we didn't prune anything, but have found a new value for the
         * pd_prune_xid field, update it and mark the buffer dirty. This is
         * treated as a non-WAL-logged hint.
         *
         * Also clear the "page is full" flag if it is set, since there's no
         * point in repeating the prune/defrag process until something else
         * happens to the page.
         */
        if (((PageHeader) page)->pd_prune_xid != prstate.new_prune_xid ||
            PageIsFull(page))
        {
            ((PageHeader) page)->pd_prune_xid = prstate.new_prune_xid;
            PageClearFull(page);
            /* hint-level dirtying only; no WAL record is required here */
            MarkBufferDirtyHint(buffer);
        }
    }

    END_CRIT_SECTION();

    /*
     * If requested, report the number of tuples reclaimed to pgstats. This is
     * ndeleted minus ndead, because we don't want to count a now-DEAD root
     * item as a deletion for this purpose.
     */
    if (report_stats && ndeleted > prstate.ndead)
        pgstat_update_heap_dead_tuples(relation, ndeleted - prstate.ndead);

    /* Hand the (possibly advanced) horizon back to the caller */
    *latestRemovedXid = prstate.latestRemovedXid;

    /*
     * XXX Should we update the FSM information of this page ?
     *
     * There are two schools of thought here. We may not want to update FSM
     * information so that the page is not used for unrelated UPDATEs/INSERTs
     * and any free space in this page will remain available for further
     * UPDATEs in *this* page, thus improving chances for doing HOT updates.
     *
     * But for a large table and where a page does not receive further UPDATEs
     * for a long time, we might waste this space by not updating the FSM
     * information. The relation may get extended and fragmented further.
     *
     * One possibility is to leave "fillfactor" worth of space in this page
     * and update FSM with the remaining space.
     */

    return ndeleted;
}
00299 
00300 
/*
 * Prune specified item pointer or a HOT chain originating at that item.
 *
 * If the item is an index-referenced tuple (i.e. not a heap-only tuple),
 * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT
 * chain.  We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple.
 * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really
 * DEAD, the OldestXmin test is just too coarse to detect it.
 *
 * The root line pointer is redirected to the tuple immediately after the
 * latest DEAD tuple.  If all tuples in the chain are DEAD, the root line
 * pointer is marked LP_DEAD.  (This includes the case of a DEAD simple
 * tuple, which we treat as a chain of length 1.)
 *
 * OldestXmin is the cutoff XID used to identify dead tuples.
 *
 * We don't actually change the page here, except perhaps for hint-bit updates
 * caused by HeapTupleSatisfiesVacuum.  We just add entries to the arrays in
 * prstate showing the changes to be made.  Items to be redirected are added
 * to the redirected[] array (two entries per redirection); items to be set to
 * LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
 * state are added to nowunused[].
 *
 * Returns the number of tuples (to be) deleted from the page.
 */
static int
heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
                 TransactionId OldestXmin,
                 PruneState *prstate)
{
    int         ndeleted = 0;
    Page        dp = (Page) BufferGetPage(buffer);
    TransactionId priorXmax = InvalidTransactionId;
    ItemId      rootlp;
    HeapTupleHeader htup;
    OffsetNumber latestdead = InvalidOffsetNumber,
                maxoff = PageGetMaxOffsetNumber(dp),
                offnum;
    /* chainitems[i] is the offset number of the i'th chain member visited */
    OffsetNumber chainitems[MaxHeapTuplesPerPage];
    int         nchain = 0,
                i;

    rootlp = PageGetItemId(dp, rootoffnum);

    /*
     * If it's a heap-only tuple, then it is not the start of a HOT chain.
     */
    if (ItemIdIsNormal(rootlp))
    {
        htup = (HeapTupleHeader) PageGetItem(dp, rootlp);
        if (HeapTupleHeaderIsHeapOnly(htup))
        {
            /*
             * If the tuple is DEAD and doesn't chain to anything else, mark
             * it unused immediately.  (If it does chain, we can only remove
             * it as part of pruning its chain.)
             *
             * We need this primarily to handle aborted HOT updates, that is,
             * XMIN_INVALID heap-only tuples.  Those might not be linked to by
             * any chain, since the parent tuple might be re-updated before
             * any pruning occurs.  So we have to be able to reap them
             * separately from chain-pruning.  (Note that
             * HeapTupleHeaderIsHotUpdated will never return true for an
             * XMIN_INVALID tuple, so this code will work even when there were
             * sequential updates within the aborted transaction.)
             *
             * Note that we might first arrive at a dead heap-only tuple
             * either here or while following a chain below.  Whichever path
             * gets there first will mark the tuple unused.
             */
            if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer)
                == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
            {
                heap_prune_record_unused(prstate, rootoffnum);
                HeapTupleHeaderAdvanceLatestRemovedXid(htup,
                                                 &prstate->latestRemovedXid);
                ndeleted++;
            }

            /* Nothing more to do */
            return ndeleted;
        }
    }

    /* Start from the root tuple */
    offnum = rootoffnum;

    /* while not end of the chain */
    for (;;)
    {
        ItemId      lp;
        bool        tupdead,
                    recent_dead;

        /* Some sanity checks */
        if (offnum < FirstOffsetNumber || offnum > maxoff)
            break;

        /* If item is already processed, stop --- it must not be same chain */
        if (prstate->marked[offnum])
            break;

        lp = PageGetItemId(dp, offnum);

        /* Unused item obviously isn't part of the chain */
        if (!ItemIdIsUsed(lp))
            break;

        /*
         * If we are looking at the redirected root line pointer, jump to the
         * first normal tuple in the chain.  If we find a redirect somewhere
         * else, stop --- it must not be same chain.
         */
        if (ItemIdIsRedirected(lp))
        {
            if (nchain > 0)
                break;          /* not at start of chain */
            /* the redirect line pointer itself is chain member zero */
            chainitems[nchain++] = offnum;
            /* (lp == rootlp here, since this can only be the chain start) */
            offnum = ItemIdGetRedirect(rootlp);
            continue;
        }

        /*
         * Likewise, a dead item pointer can't be part of the chain. (We
         * already eliminated the case of dead root tuple outside this
         * function.)
         */
        if (ItemIdIsDead(lp))
            break;

        Assert(ItemIdIsNormal(lp));
        htup = (HeapTupleHeader) PageGetItem(dp, lp);

        /*
         * Check the tuple XMIN against prior XMAX, if any.  A mismatch
         * means the chain was broken (e.g. the member was replaced), so
         * stop here rather than follow a bogus link.
         */
        if (TransactionIdIsValid(priorXmax) &&
            !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
            break;

        /*
         * OK, this tuple is indeed a member of the chain.
         */
        chainitems[nchain++] = offnum;

        /*
         * Check tuple's visibility status.
         */
        tupdead = recent_dead = false;

        switch (HeapTupleSatisfiesVacuum(htup, OldestXmin, buffer))
        {
            case HEAPTUPLE_DEAD:
                tupdead = true;
                break;

            case HEAPTUPLE_RECENTLY_DEAD:
                recent_dead = true;

                /*
                 * This tuple may soon become DEAD.  Update the hint field so
                 * that the page is reconsidered for pruning in future.
                 */
                heap_prune_record_prunable(prstate,
                                           HeapTupleHeaderGetUpdateXid(htup));
                break;

            case HEAPTUPLE_DELETE_IN_PROGRESS:

                /*
                 * This tuple may soon become DEAD.  Update the hint field so
                 * that the page is reconsidered for pruning in future.
                 */
                heap_prune_record_prunable(prstate,
                                           HeapTupleHeaderGetUpdateXid(htup));
                break;

            case HEAPTUPLE_LIVE:
            case HEAPTUPLE_INSERT_IN_PROGRESS:

                /*
                 * If we wanted to optimize for aborts, we might consider
                 * marking the page prunable when we see INSERT_IN_PROGRESS.
                 * But we don't.  See related decisions about when to mark the
                 * page prunable in heapam.c.
                 */
                break;

            default:
                elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                break;
        }

        /*
         * Remember the last DEAD tuple seen.  We will advance past
         * RECENTLY_DEAD tuples just in case there's a DEAD one after them;
         * but we can't advance past anything else.  (XXX is it really worth
         * continuing to scan beyond RECENTLY_DEAD?  The case where we will
         * find another DEAD tuple is a fairly unusual corner case.)
         */
        if (tupdead)
        {
            latestdead = offnum;
            HeapTupleHeaderAdvanceLatestRemovedXid(htup,
                                                 &prstate->latestRemovedXid);
        }
        else if (!recent_dead)
            break;

        /*
         * If the tuple is not HOT-updated, then we are at the end of this
         * HOT-update chain.
         */
        if (!HeapTupleHeaderIsHotUpdated(htup))
            break;

        /*
         * Advance to next chain member.
         */
        Assert(ItemPointerGetBlockNumber(&htup->t_ctid) ==
               BufferGetBlockNumber(buffer));
        offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
        priorXmax = HeapTupleHeaderGetUpdateXid(htup);
    }

    /*
     * If we found a DEAD tuple in the chain, adjust the HOT chain so that all
     * the DEAD tuples at the start of the chain are removed and the root line
     * pointer is appropriately redirected.
     */
    if (OffsetNumberIsValid(latestdead))
    {
        /*
         * Mark as unused each intermediate item that we are able to remove
         * from the chain.
         *
         * When the previous item is the last dead tuple seen, we are at the
         * right candidate for redirection.
         */
        for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++)
        {
            heap_prune_record_unused(prstate, chainitems[i]);
            ndeleted++;
        }

        /*
         * If the root entry had been a normal tuple, we are deleting it, so
         * count it in the result.  But changing a redirect (even to DEAD
         * state) doesn't count.
         */
        if (ItemIdIsNormal(rootlp))
            ndeleted++;

        /*
         * If the DEAD tuple is at the end of the chain, the entire chain is
         * dead and the root line pointer can be marked dead.  Otherwise just
         * redirect the root to the correct chain member.  (Here i is the
         * index of the first chain member that survives, if any.)
         */
        if (i >= nchain)
            heap_prune_record_dead(prstate, rootoffnum);
        else
            heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]);
    }
    else if (nchain < 2 && ItemIdIsRedirected(rootlp))
    {
        /*
         * We found a redirect item that doesn't point to a valid follow-on
         * item.  This can happen if the loop in heap_page_prune caused us to
         * visit the dead successor of a redirect item before visiting the
         * redirect item.  We can clean up by setting the redirect item to
         * DEAD state.
         */
        heap_prune_record_dead(prstate, rootoffnum);
    }

    return ndeleted;
}
00578 
00579 /* Record lowest soon-prunable XID */
00580 static void
00581 heap_prune_record_prunable(PruneState *prstate, TransactionId xid)
00582 {
00583     /*
00584      * This should exactly match the PageSetPrunable macro.  We can't store
00585      * directly into the page header yet, so we update working state.
00586      */
00587     Assert(TransactionIdIsNormal(xid));
00588     if (!TransactionIdIsValid(prstate->new_prune_xid) ||
00589         TransactionIdPrecedes(xid, prstate->new_prune_xid))
00590         prstate->new_prune_xid = xid;
00591 }
00592 
00593 /* Record item pointer to be redirected */
00594 static void
00595 heap_prune_record_redirect(PruneState *prstate,
00596                            OffsetNumber offnum, OffsetNumber rdoffnum)
00597 {
00598     Assert(prstate->nredirected < MaxHeapTuplesPerPage);
00599     prstate->redirected[prstate->nredirected * 2] = offnum;
00600     prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum;
00601     prstate->nredirected++;
00602     Assert(!prstate->marked[offnum]);
00603     prstate->marked[offnum] = true;
00604     Assert(!prstate->marked[rdoffnum]);
00605     prstate->marked[rdoffnum] = true;
00606 }
00607 
00608 /* Record item pointer to be marked dead */
00609 static void
00610 heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum)
00611 {
00612     Assert(prstate->ndead < MaxHeapTuplesPerPage);
00613     prstate->nowdead[prstate->ndead] = offnum;
00614     prstate->ndead++;
00615     Assert(!prstate->marked[offnum]);
00616     prstate->marked[offnum] = true;
00617 }
00618 
00619 /* Record item pointer to be marked unused */
00620 static void
00621 heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
00622 {
00623     Assert(prstate->nunused < MaxHeapTuplesPerPage);
00624     prstate->nowunused[prstate->nunused] = offnum;
00625     prstate->nunused++;
00626     Assert(!prstate->marked[offnum]);
00627     prstate->marked[offnum] = true;
00628 }
00629 
00630 
00631 /*
00632  * Perform the actual page changes needed by heap_page_prune.
00633  * It is expected that the caller has suitable pin and lock on the
00634  * buffer, and is inside a critical section.
00635  *
00636  * This is split out because it is also used by heap_xlog_clean()
00637  * to replay the WAL record when needed after a crash.  Note that the
00638  * arguments are identical to those of log_heap_clean().
00639  */
00640 void
00641 heap_page_prune_execute(Buffer buffer,
00642                         OffsetNumber *redirected, int nredirected,
00643                         OffsetNumber *nowdead, int ndead,
00644                         OffsetNumber *nowunused, int nunused)
00645 {
00646     Page        page = (Page) BufferGetPage(buffer);
00647     OffsetNumber *offnum;
00648     int         i;
00649 
00650     /* Update all redirected line pointers */
00651     offnum = redirected;
00652     for (i = 0; i < nredirected; i++)
00653     {
00654         OffsetNumber fromoff = *offnum++;
00655         OffsetNumber tooff = *offnum++;
00656         ItemId      fromlp = PageGetItemId(page, fromoff);
00657 
00658         ItemIdSetRedirect(fromlp, tooff);
00659     }
00660 
00661     /* Update all now-dead line pointers */
00662     offnum = nowdead;
00663     for (i = 0; i < ndead; i++)
00664     {
00665         OffsetNumber off = *offnum++;
00666         ItemId      lp = PageGetItemId(page, off);
00667 
00668         ItemIdSetDead(lp);
00669     }
00670 
00671     /* Update all now-unused line pointers */
00672     offnum = nowunused;
00673     for (i = 0; i < nunused; i++)
00674     {
00675         OffsetNumber off = *offnum++;
00676         ItemId      lp = PageGetItemId(page, off);
00677 
00678         ItemIdSetUnused(lp);
00679     }
00680 
00681     /*
00682      * Finally, repair any fragmentation, and update the page's hint bit about
00683      * whether it has free pointers.
00684      */
00685     PageRepairFragmentation(page);
00686 }
00687 
00688 
/*
 * For all items in this page, find their respective root line pointers.
 * If item k is part of a HOT-chain with root at item j, then we set
 * root_offsets[k - 1] = j.
 *
 * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries.
 * We zero out all unused entries.
 *
 * The function must be called with at least share lock on the buffer, to
 * prevent concurrent prune operations.
 *
 * Note: The information collected here is valid only as long as the caller
 * holds a pin on the buffer. Once pin is released, a tuple might be pruned
 * and reused by a completely unrelated tuple.
 */
void
heap_get_root_tuples(Page page, OffsetNumber *root_offsets)
{
    OffsetNumber offnum,
                maxoff;

    /* Initialize all entries to zero, meaning "no root known" */
    MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber));

    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
    {
        ItemId      lp = PageGetItemId(page, offnum);
        HeapTupleHeader htup;
        OffsetNumber nextoffnum;
        TransactionId priorXmax;

        /* skip unused and dead items */
        if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp))
            continue;

        if (ItemIdIsNormal(lp))
        {
            htup = (HeapTupleHeader) PageGetItem(page, lp);

            /*
             * Check if this tuple is part of a HOT-chain rooted at some other
             * tuple. If so, skip it for now; we'll process it when we find
             * its root.
             */
            if (HeapTupleHeaderIsHeapOnly(htup))
                continue;

            /*
             * This is either a plain tuple or the root of a HOT-chain.
             * Remember it in the mapping.  (root_offsets is indexed by the
             * 0-based equivalent of the 1-based offset number.)
             */
            root_offsets[offnum - 1] = offnum;

            /* If it's not the start of a HOT-chain, we're done with it */
            if (!HeapTupleHeaderIsHotUpdated(htup))
                continue;

            /* Set up to scan the HOT-chain */
            nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
            priorXmax = HeapTupleHeaderGetUpdateXid(htup);
        }
        else
        {
            /* Must be a redirect item. We do not set its root_offsets entry */
            Assert(ItemIdIsRedirected(lp));
            /* Set up to scan the HOT-chain */
            nextoffnum = ItemIdGetRedirect(lp);
            priorXmax = InvalidTransactionId;
        }

        /*
         * Now follow the HOT-chain and collect other tuples in the chain.
         *
         * Note: Even though this is a nested loop, the complexity of the
         * function is O(N) because a tuple in the page should be visited not
         * more than twice, once in the outer loop and once in HOT-chain
         * chases.
         */
        for (;;)
        {
            lp = PageGetItemId(page, nextoffnum);

            /* Check for broken chains */
            if (!ItemIdIsNormal(lp))
                break;

            htup = (HeapTupleHeader) PageGetItem(page, lp);

            /* xmin must match prior xmax, or the chain link is stale */
            if (TransactionIdIsValid(priorXmax) &&
                !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup)))
                break;

            /* Remember the root line pointer for this item */
            root_offsets[nextoffnum - 1] = offnum;

            /* Advance to next chain member, if any */
            if (!HeapTupleHeaderIsHotUpdated(htup))
                break;

            nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
            priorXmax = HeapTupleHeaderGetUpdateXid(htup);
        }
    }
}