vacuumlazy.c

00001 /*-------------------------------------------------------------------------
00002  *
00003  * vacuumlazy.c
00004  *    Concurrent ("lazy") vacuuming.
00005  *
00006  *
00007  * The major space usage for LAZY VACUUM is storage for the array of dead
00008  * tuple TIDs, with the next biggest need being storage for per-disk-page
00009  * free space info.  We want to ensure we can vacuum even the very largest
00010  * relations with finite memory space usage.  To do that, we set upper bounds
00011  * on the number of tuples and pages we will keep track of at once.
00012  *
00013  * We are willing to use at most maintenance_work_mem memory space to keep
00014  * track of dead tuples.  We initially allocate an array of TIDs of that size,
00015  * with an upper limit that depends on table size (this limit ensures we don't
00016  * allocate a huge area uselessly for vacuuming small tables).  If the array
00017  * threatens to overflow, we suspend the heap scan phase and perform a pass of
00018  * index cleanup and page compaction, then resume the heap scan with an empty
00019  * TID array.
00020  *
00021  * If we're processing a table with no indexes, we can just vacuum each page
00022  * as we go; there's no need to save up multiple tuples to minimize the number
00023  * of index scans performed.  So we don't use maintenance_work_mem memory for
00024  * the TID array, just enough to hold as many heap tuples as fit on one page.
00025  *
00026  *
00027  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00028  * Portions Copyright (c) 1994, Regents of the University of California
00029  *
00030  *
00031  * IDENTIFICATION
00032  *    src/backend/commands/vacuumlazy.c
00033  *
00034  *-------------------------------------------------------------------------
00035  */
00036 #include "postgres.h"
00037 
00038 #include <math.h>
00039 
00040 #include "access/genam.h"
00041 #include "access/heapam.h"
00042 #include "access/heapam_xlog.h"
00043 #include "access/htup_details.h"
00044 #include "access/multixact.h"
00045 #include "access/transam.h"
00046 #include "access/visibilitymap.h"
00047 #include "catalog/storage.h"
00048 #include "commands/dbcommands.h"
00049 #include "commands/vacuum.h"
00050 #include "miscadmin.h"
00051 #include "pgstat.h"
00052 #include "portability/instr_time.h"
00053 #include "postmaster/autovacuum.h"
00054 #include "storage/bufmgr.h"
00055 #include "storage/freespace.h"
00056 #include "storage/lmgr.h"
00057 #include "utils/lsyscache.h"
00058 #include "utils/memutils.h"
00059 #include "utils/pg_rusage.h"
00060 #include "utils/timestamp.h"
00061 #include "utils/tqual.h"
00062 
00063 
00064 /*
00065  * Space/time tradeoff parameters: do these need to be user-tunable?
00066  *
00067  * To consider truncating the relation, we want there to be at least
00068  * REL_TRUNCATE_MINIMUM or (relsize / REL_TRUNCATE_FRACTION) (whichever
00069  * is less) potentially-freeable pages.
00070  */
00071 #define REL_TRUNCATE_MINIMUM    1000
00072 #define REL_TRUNCATE_FRACTION   16
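/*
 * As a worked example: the test in lazy_vacuum_rel() fires once the number
 * of potentially-freeable tail pages reaches whichever of the two bounds is
 * smaller.  For an 8,000-page table that is 8000/16 = 500 pages; for a
 * 160,000-page table the flat REL_TRUNCATE_MINIMUM of 1,000 pages is the
 * smaller bound and applies instead.
 */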
00073 
00074 /*
00075  * Timing parameters for truncate locking heuristics.
00076  *
00077  * These were not exposed as user tunable GUC values because it didn't seem
00078  * that the potential for improvement was great enough to merit the cost of
00079  * supporting them.
00080  */
00081 #define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL     20  /* ms */
00082 #define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL      50  /* ms */
00083 #define VACUUM_TRUNCATE_LOCK_TIMEOUT            5000        /* ms */
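/*
 * With these defaults, the retry loop in lazy_truncate_heap() attempts the
 * conditional AccessExclusiveLock up to VACUUM_TRUNCATE_LOCK_TIMEOUT /
 * VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL = 5000/50 = 100 times, sleeping 50ms
 * between attempts, so truncation is abandoned after roughly five seconds
 * of conflicting lock requests.
 */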
00084 
00085 /*
00086  * Guesstimation of number of dead tuples per page.  This is used to
00087  * provide an upper limit to memory allocated when vacuuming small
00088  * tables.
00089  */
00090 #define LAZY_ALLOC_TUPLES       MaxHeapTuplesPerPage
00091 
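/*
 * A minimal sketch of the sizing policy described in the file header:
 * allow up to maintenance_work_mem worth of TIDs, but never more than
 * LAZY_ALLOC_TUPLES slots per heap page, so that small tables don't
 * reserve a needlessly large array.  The real logic lives in
 * lazy_space_alloc() below; the helper name and exact clamping here are
 * illustrative only and may differ from it in detail.
 */
static long
lazy_max_dead_tuples_sketch(BlockNumber relblocks, bool hasindex)
{
    long        maxtuples;

    if (!hasindex)
        return MaxHeapTuplesPerPage;    /* one-pass case: one page's worth */

    /* as many TIDs as fit in maintenance_work_mem (which is in kB)... */
    maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
    /* ...but no more than a single palloc chunk can hold... */
    maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
    /* ...and no more than LAZY_ALLOC_TUPLES per page of the relation... */
    if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
        maxtuples = relblocks * LAZY_ALLOC_TUPLES;
    /* ...while always leaving room for at least one page's worth of tuples */
    maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);

    return maxtuples;
}
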
00092 /*
00093  * Before we consider skipping a page that's marked as clean in the
00094  * visibility map, we must've seen at least this many clean pages.
00095  */
00096 #define SKIP_PAGES_THRESHOLD    ((BlockNumber) 32)
00097 
00098 typedef struct LVRelStats
00099 {
00100     /* hasindex = true means two-pass strategy; false means one-pass */
00101     bool        hasindex;
00102     /* Overall statistics about rel */
00103     BlockNumber old_rel_pages;  /* previous value of pg_class.relpages */
00104     BlockNumber rel_pages;      /* total number of pages */
00105     BlockNumber scanned_pages;  /* number of pages we examined */
00106     double      scanned_tuples; /* counts only tuples on scanned pages */
00107     double      old_rel_tuples; /* previous value of pg_class.reltuples */
00108     double      new_rel_tuples; /* new estimated total # of tuples */
00109     BlockNumber pages_removed;
00110     double      tuples_deleted;
00111     BlockNumber nonempty_pages; /* actually, last nonempty page + 1 */
00112     /* List of TIDs of tuples we intend to delete */
00113     /* NB: this list is ordered by TID address */
00114     int         num_dead_tuples;    /* current # of entries */
00115     int         max_dead_tuples;    /* # slots allocated in array */
00116     ItemPointer dead_tuples;    /* array of ItemPointerData */
00117     int         num_index_scans;
00118     TransactionId latestRemovedXid;
00119     bool        lock_waiter_detected;
00120 } LVRelStats;
00121 
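/*
 * Sketches of how the dead_tuples array above is typically used: the heap
 * scan appends TIDs in physical page/offset order, which keeps the array
 * sorted, and index vacuuming can then probe it with a binary search.
 * The real code is in lazy_record_dead_tuple(), lazy_tid_reaped() and
 * vac_cmp_itemptr() later in this file; the _sketch names below are
 * illustrative only and may differ from those functions in detail.
 */
static int
vac_cmp_itemptr_sketch(const void *left, const void *right)
{
    ItemPointer litem = (ItemPointer) left;
    ItemPointer ritem = (ItemPointer) right;
    BlockNumber lblk = ItemPointerGetBlockNumber(litem);
    BlockNumber rblk = ItemPointerGetBlockNumber(ritem);
    OffsetNumber loff;
    OffsetNumber roff;

    if (lblk < rblk)
        return -1;
    if (lblk > rblk)
        return 1;

    loff = ItemPointerGetOffsetNumber(litem);
    roff = ItemPointerGetOffsetNumber(ritem);
    if (loff < roff)
        return -1;
    if (loff > roff)
        return 1;
    return 0;
}

static void
lazy_record_dead_tuple_sketch(LVRelStats *vacrelstats, ItemPointer itemptr)
{
    /* The heap scan visits pages in order, so no explicit sort is needed. */
    if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
    {
        vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
        vacrelstats->num_dead_tuples++;
    }
}

static bool
lazy_tid_reaped_sketch(ItemPointer itemptr, LVRelStats *vacrelstats)
{
    /* Binary search works because the array is ordered by TID address. */
    return bsearch((void *) itemptr,
                   (void *) vacrelstats->dead_tuples,
                   vacrelstats->num_dead_tuples,
                   sizeof(ItemPointerData),
                   vac_cmp_itemptr_sketch) != NULL;
}
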
00122 
00123 /* A few variables that don't seem worth passing around as parameters */
00124 static int  elevel = -1;
00125 
00126 static TransactionId OldestXmin;
00127 static TransactionId FreezeLimit;
00128 static MultiXactId MultiXactFrzLimit;
00129 
00130 static BufferAccessStrategy vac_strategy;
00131 
00132 
00133 /* non-export function prototypes */
00134 static void lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
00135                Relation *Irel, int nindexes, bool scan_all);
00136 static void lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats);
00137 static bool lazy_check_needs_freeze(Buffer buf);
00138 static void lazy_vacuum_index(Relation indrel,
00139                   IndexBulkDeleteResult **stats,
00140                   LVRelStats *vacrelstats);
00141 static void lazy_cleanup_index(Relation indrel,
00142                    IndexBulkDeleteResult *stats,
00143                    LVRelStats *vacrelstats);
00144 static int lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
00145                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer);
00146 static void lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats);
00147 static BlockNumber count_nondeletable_pages(Relation onerel,
00148                          LVRelStats *vacrelstats);
00149 static void lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks);
00150 static void lazy_record_dead_tuple(LVRelStats *vacrelstats,
00151                        ItemPointer itemptr);
00152 static bool lazy_tid_reaped(ItemPointer itemptr, void *state);
00153 static int  vac_cmp_itemptr(const void *left, const void *right);
00154 static bool heap_page_is_all_visible(Buffer buf,
00155                          TransactionId *visibility_cutoff_xid);
00156 
00157 
00158 /*
00159  *  lazy_vacuum_rel() -- perform LAZY VACUUM for one heap relation
00160  *
00161  *      This routine vacuums a single heap, cleans out its indexes, and
00162  *      updates its relpages and reltuples statistics.
00163  *
00164  *      At entry, we have already established a transaction and opened
00165  *      and locked the relation.
00166  */
00167 void
00168 lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
00169                 BufferAccessStrategy bstrategy)
00170 {
00171     LVRelStats *vacrelstats;
00172     Relation   *Irel;
00173     int         nindexes;
00174     BlockNumber possibly_freeable;
00175     PGRUsage    ru0;
00176     TimestampTz starttime = 0;
00177     long        secs;
00178     int         usecs;
00179     double      read_rate,
00180                 write_rate;
00181     bool        scan_all;
00182     TransactionId freezeTableLimit;
00183     BlockNumber new_rel_pages;
00184     double      new_rel_tuples;
00185     BlockNumber new_rel_allvisible;
00186     TransactionId new_frozen_xid;
00187     MultiXactId new_min_multi;
00188 
00189     /* measure elapsed time iff autovacuum logging requires it */
00190     if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
00191     {
00192         pg_rusage_init(&ru0);
00193         starttime = GetCurrentTimestamp();
00194     }
00195 
00196     if (vacstmt->options & VACOPT_VERBOSE)
00197         elevel = INFO;
00198     else
00199         elevel = DEBUG2;
00200 
00201     vac_strategy = bstrategy;
00202 
00203     vacuum_set_xid_limits(vacstmt->freeze_min_age, vacstmt->freeze_table_age,
00204                           onerel->rd_rel->relisshared,
00205                           &OldestXmin, &FreezeLimit, &freezeTableLimit,
00206                           &MultiXactFrzLimit);
00207     scan_all = TransactionIdPrecedesOrEquals(onerel->rd_rel->relfrozenxid,
00208                                              freezeTableLimit);
00209 
00210     vacrelstats = (LVRelStats *) palloc0(sizeof(LVRelStats));
00211 
00212     vacrelstats->old_rel_pages = onerel->rd_rel->relpages;
00213     vacrelstats->old_rel_tuples = onerel->rd_rel->reltuples;
00214     vacrelstats->num_index_scans = 0;
00215     vacrelstats->pages_removed = 0;
00216     vacrelstats->lock_waiter_detected = false;
00217 
00218     /* Open all indexes of the relation */
00219     vac_open_indexes(onerel, RowExclusiveLock, &nindexes, &Irel);
00220     vacrelstats->hasindex = (nindexes > 0);
00221 
00222     /* Do the vacuuming */
00223     lazy_scan_heap(onerel, vacrelstats, Irel, nindexes, scan_all);
00224 
00225     /* Done with indexes */
00226     vac_close_indexes(nindexes, Irel, NoLock);
00227 
00228     /*
00229      * Optionally truncate the relation.
00230      *
00231      * Don't even think about it unless we have a shot at releasing a goodly
00232      * number of pages.  Otherwise, the time taken isn't worth it.
00233      *
00234      * Leave a populated materialized view with at least one page.
00235      */
00236     if (onerel->rd_rel->relkind == RELKIND_MATVIEW &&
00237         vacrelstats->nonempty_pages == 0)
00238         vacrelstats->nonempty_pages = 1;
00239 
00240     possibly_freeable = vacrelstats->rel_pages - vacrelstats->nonempty_pages;
00241     if (possibly_freeable > 0 &&
00242         (possibly_freeable >= REL_TRUNCATE_MINIMUM ||
00243          possibly_freeable >= vacrelstats->rel_pages / REL_TRUNCATE_FRACTION))
00244         lazy_truncate_heap(onerel, vacrelstats);
00245 
00246     /* Vacuum the Free Space Map */
00247     FreeSpaceMapVacuum(onerel);
00248 
00249     /*
00250      * Update statistics in pg_class.
00251      *
00252      * A corner case here is that if we scanned no pages at all because every
00253      * page is all-visible, we should not update relpages/reltuples, because
00254      * we have no new information to contribute.  In particular this keeps us
00255      * from replacing relpages=reltuples=0 (which means "unknown tuple
00256      * density") with nonzero relpages and reltuples=0 (which means "zero
00257      * tuple density") unless there's some actual evidence for the latter.
00258      *
00259      * We do update relallvisible even in the corner case, since if the table
00260      * is all-visible we'd definitely like to know that.  But clamp the value
00261      * to be not more than what we're setting relpages to.
00262      *
00263      * Also, don't change relfrozenxid if we skipped any pages, since then we
00264      * don't know for certain that all tuples have a newer xmin.
00265      */
00266     new_rel_pages = vacrelstats->rel_pages;
00267     new_rel_tuples = vacrelstats->new_rel_tuples;
00268     if (vacrelstats->scanned_pages == 0 && new_rel_pages > 0)
00269     {
00270         new_rel_pages = vacrelstats->old_rel_pages;
00271         new_rel_tuples = vacrelstats->old_rel_tuples;
00272     }
00273 
00274     new_rel_allvisible = visibilitymap_count(onerel);
00275     if (new_rel_allvisible > new_rel_pages)
00276         new_rel_allvisible = new_rel_pages;
00277 
00278     new_frozen_xid = FreezeLimit;
00279     if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
00280         new_frozen_xid = InvalidTransactionId;
00281 
00282     new_min_multi = MultiXactFrzLimit;
00283     if (vacrelstats->scanned_pages < vacrelstats->rel_pages)
00284         new_min_multi = InvalidMultiXactId;
00285 
00286     vac_update_relstats(onerel,
00287                         new_rel_pages,
00288                         new_rel_tuples,
00289                         new_rel_allvisible,
00290                         vacrelstats->hasindex,
00291                         new_frozen_xid,
00292                         new_min_multi);
00293 
00294     /* report results to the stats collector, too */
00295     pgstat_report_vacuum(RelationGetRelid(onerel),
00296                           onerel->rd_rel->relisshared,
00297                           new_rel_tuples);
00298 
00299     /* and log the action if appropriate */
00300     if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
00301     {
00302         TimestampTz endtime = GetCurrentTimestamp();
00303 
00304         if (Log_autovacuum_min_duration == 0 ||
00305             TimestampDifferenceExceeds(starttime, endtime,
00306                                        Log_autovacuum_min_duration))
00307         {
00308             TimestampDifference(starttime, endtime, &secs, &usecs);
00309 
00310             read_rate = 0;
00311             write_rate = 0;
00312             if ((secs > 0) || (usecs > 0))
00313             {
00314                 read_rate = (double) BLCKSZ * VacuumPageMiss / (1024 * 1024) /
00315                             (secs + usecs / 1000000.0);
00316                 write_rate = (double) BLCKSZ * VacuumPageDirty / (1024 * 1024) /
00317                             (secs + usecs / 1000000.0);
00318             }
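            /*
             * For example, with the default BLCKSZ of 8192, 1280 page misses
             * over an elapsed 2.5 seconds amount to 8192 * 1280 bytes = 10 MB,
             * i.e. a read_rate of 4.0 MB/s.
             */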
00319             ereport(LOG,
00320                     (errmsg("automatic vacuum of table \"%s.%s.%s\": index scans: %d\n"
00321                             "pages: %d removed, %d remain\n"
00322                             "tuples: %.0f removed, %.0f remain\n"
00323                             "buffer usage: %d hits, %d misses, %d dirtied\n"
00324                             "avg read rate: %.3f MB/s, avg write rate: %.3f MB/s\n"
00325                             "system usage: %s",
00326                             get_database_name(MyDatabaseId),
00327                             get_namespace_name(RelationGetNamespace(onerel)),
00328                             RelationGetRelationName(onerel),
00329                             vacrelstats->num_index_scans,
00330                             vacrelstats->pages_removed,
00331                             vacrelstats->rel_pages,
00332                             vacrelstats->tuples_deleted,
00333                             vacrelstats->new_rel_tuples,
00334                             VacuumPageHit,
00335                             VacuumPageMiss,
00336                             VacuumPageDirty,
00337                             read_rate, write_rate,
00338                             pg_rusage_show(&ru0))));
00339         }
00340     }
00341 }
00342 
00343 /*
00344  * For Hot Standby we need to know the highest transaction id that will
00345  * be removed by any change. VACUUM proceeds in a number of passes so
00346  * we need to consider how each pass operates. The first phase runs
00347  * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
00348  * progresses - these will have a latestRemovedXid on each record.
00349  * In some cases this removes all of the tuples to be removed, though
00350  * often we have dead tuples with index pointers so we must remember them
00351  * for removal in phase 3. Index records for those rows are removed
00352  * in phase 2 and index blocks do not have MVCC information attached.
00353  * So before we can allow removal of any index tuples we need to issue
00354  * a WAL record containing the latestRemovedXid of rows that will be
00355  * removed in phase 3. This allows recovery queries to block at the
00356  * correct place, i.e. before phase 2, rather than during phase 3,
00357  * which would be after the rows have become inaccessible.
00358  */
00359 static void
00360 vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
00361 {
00362     /*
00363      * Skip this for relations for which no WAL is to be written, or if we're
00364      * not trying to support archive recovery.
00365      */
00366     if (!RelationNeedsWAL(rel) || !XLogIsNeeded())
00367         return;
00368 
00369     /*
00370      * No need to write the record at all unless it contains a valid value
00371      */
00372     if (TransactionIdIsValid(vacrelstats->latestRemovedXid))
00373         (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
00374 }
00375 
00376 /*
00377  *  lazy_scan_heap() -- scan an open heap relation
00378  *
00379  *      This routine prunes each page in the heap, which will among other
00380  *      things truncate dead tuples to dead line pointers, defragment the
00381  *      page, and set commit status bits (see heap_page_prune).  It also builds
00382  *      lists of dead tuples and pages with free space, calculates statistics
00383  *      on the number of live tuples in the heap, and marks pages as
00384  *      all-visible if appropriate.  When done, or when we run low on space for
00385  *      dead-tuple TIDs, invoke vacuuming of indexes and call lazy_vacuum_heap
00386  *      to reclaim dead line pointers.
00387  *
00388  *      If there are no indexes then we can reclaim line pointers on the fly;
00389  *      dead line pointers need only be retained until all index pointers that
00390  *      reference them have been killed.
00391  */
00392 static void
00393 lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
00394                Relation *Irel, int nindexes, bool scan_all)
00395 {
00396     BlockNumber nblocks,
00397                 blkno;
00398     HeapTupleData tuple;
00399     char       *relname;
00400     BlockNumber empty_pages,
00401                 vacuumed_pages;
00402     double      num_tuples,
00403                 tups_vacuumed,
00404                 nkeep,
00405                 nunused;
00406     IndexBulkDeleteResult **indstats;
00407     int         i;
00408     PGRUsage    ru0;
00409     Buffer      vmbuffer = InvalidBuffer;
00410     BlockNumber next_not_all_visible_block;
00411     bool        skipping_all_visible_blocks;
00412 
00413     pg_rusage_init(&ru0);
00414 
00415     relname = RelationGetRelationName(onerel);
00416     ereport(elevel,
00417             (errmsg("vacuuming \"%s.%s\"",
00418                     get_namespace_name(RelationGetNamespace(onerel)),
00419                     relname)));
00420 
00421     empty_pages = vacuumed_pages = 0;
00422     num_tuples = tups_vacuumed = nkeep = nunused = 0;
00423 
00424     indstats = (IndexBulkDeleteResult **)
00425         palloc0(nindexes * sizeof(IndexBulkDeleteResult *));
00426 
00427     nblocks = RelationGetNumberOfBlocks(onerel);
00428     vacrelstats->rel_pages = nblocks;
00429     vacrelstats->scanned_pages = 0;
00430     vacrelstats->nonempty_pages = 0;
00431     vacrelstats->latestRemovedXid = InvalidTransactionId;
00432 
00433     lazy_space_alloc(vacrelstats, nblocks);
00434 
00435     /*
00436      * We want to skip pages that don't require vacuuming according to the
00437      * visibility map, but only when we can skip at least SKIP_PAGES_THRESHOLD
00438      * consecutive pages.  Since we're reading sequentially, the OS should be
00439      * doing readahead for us, so there's no gain in skipping a page now and
00440      * then; that's likely to disable readahead and so be counterproductive.
00441      * Also, skipping even a single page means that we can't update
00442      * relfrozenxid, so we only want to do it if we can skip a goodly number
00443      * of pages.
00444      *
00445      * Before entering the main loop, establish the invariant that
00446      * next_not_all_visible_block is the next block number >= blkno that's not
00447      * all-visible according to the visibility map, or nblocks if there's no
00448      * such block.  Also, we set up the skipping_all_visible_blocks flag,
00449      * which is needed because we need hysteresis in the decision: once we've
00450      * started skipping blocks, we may as well skip everything up to the next
00451      * not-all-visible block.
00452      *
00453      * Note: if scan_all is true, we won't actually skip any pages; but we
00454      * maintain next_not_all_visible_block anyway, so as to set up the
00455      * all_visible_according_to_vm flag correctly for each page.
00456      *
00457      * Note: The value returned by visibilitymap_test could be slightly
00458      * out-of-date, since we make this test before reading the corresponding
00459      * heap page or locking the buffer.  This is OK.  If we mistakenly think
00460      * that the page is all-visible when in fact the flag's just been cleared,
00461      * we might fail to vacuum the page.  But it's OK to skip pages when
00462      * scan_all is not set, so no great harm done; the next vacuum will find
00463      * them.  If we make the reverse mistake and vacuum a page unnecessarily,
00464      * it'll just be a no-op.
00465      */
00466     for (next_not_all_visible_block = 0;
00467          next_not_all_visible_block < nblocks;
00468          next_not_all_visible_block++)
00469     {
00470         if (!visibilitymap_test(onerel, next_not_all_visible_block, &vmbuffer))
00471             break;
00472         vacuum_delay_point();
00473     }
00474     if (next_not_all_visible_block >= SKIP_PAGES_THRESHOLD)
00475         skipping_all_visible_blocks = true;
00476     else
00477         skipping_all_visible_blocks = false;
00478 
00479     for (blkno = 0; blkno < nblocks; blkno++)
00480     {
00481         Buffer      buf;
00482         Page        page;
00483         OffsetNumber offnum,
00484                     maxoff;
00485         bool        tupgone,
00486                     hastup;
00487         int         prev_dead_count;
00488         OffsetNumber frozen[MaxOffsetNumber];
00489         int         nfrozen;
00490         Size        freespace;
00491         bool        all_visible_according_to_vm;
00492         bool        all_visible;
00493         bool        has_dead_tuples;
00494         TransactionId visibility_cutoff_xid = InvalidTransactionId;
00495 
00496         if (blkno == next_not_all_visible_block)
00497         {
00498             /* Time to advance next_not_all_visible_block */
00499             for (next_not_all_visible_block++;
00500                  next_not_all_visible_block < nblocks;
00501                  next_not_all_visible_block++)
00502             {
00503                 if (!visibilitymap_test(onerel, next_not_all_visible_block,
00504                                         &vmbuffer))
00505                     break;
00506                 vacuum_delay_point();
00507             }
00508 
00509             /*
00510              * We know we can't skip the current block.  But set up
00511              * skipping_all_visible_blocks to do the right thing at the
00512              * following blocks.
00513              */
00514             if (next_not_all_visible_block - blkno > SKIP_PAGES_THRESHOLD)
00515                 skipping_all_visible_blocks = true;
00516             else
00517                 skipping_all_visible_blocks = false;
00518             all_visible_according_to_vm = false;
00519         }
00520         else
00521         {
00522             /* Current block is all-visible */
00523             if (skipping_all_visible_blocks && !scan_all)
00524                 continue;
00525             all_visible_according_to_vm = true;
00526         }
00527 
00528         vacuum_delay_point();
00529 
00530         /*
00531          * If we are close to overrunning the available space for dead-tuple
00532          * TIDs, pause and do a cycle of vacuuming before we tackle this page.
00533          */
00534         if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
00535             vacrelstats->num_dead_tuples > 0)
00536         {
00537             /*
00538              * Before beginning index vacuuming, we release any pin we may
00539              * hold on the visibility map page.  This isn't necessary for
00540              * correctness, but we do it anyway to avoid holding the pin
00541              * across a lengthy, unrelated operation.
00542              */
00543             if (BufferIsValid(vmbuffer))
00544             {
00545                 ReleaseBuffer(vmbuffer);
00546                 vmbuffer = InvalidBuffer;
00547             }
00548 
00549             /* Log cleanup info before we touch indexes */
00550             vacuum_log_cleanup_info(onerel, vacrelstats);
00551 
00552             /* Remove index entries */
00553             for (i = 0; i < nindexes; i++)
00554                 lazy_vacuum_index(Irel[i],
00555                                   &indstats[i],
00556                                   vacrelstats);
00557             /* Remove tuples from heap */
00558             lazy_vacuum_heap(onerel, vacrelstats);
00559 
00560             /*
00561              * Forget the now-vacuumed tuples, and press on, but be careful
00562              * not to reset latestRemovedXid since we want that value to be
00563              * valid.
00564              */
00565             vacrelstats->num_dead_tuples = 0;
00566             vacrelstats->num_index_scans++;
00567         }
00568 
00569         /*
00570          * Pin the visibility map page in case we need to mark the page
00571          * all-visible.  In most cases this will be very cheap, because we'll
00572          * already have the correct page pinned anyway.  However, it's
00573          * possible that (a) next_not_all_visible_block is covered by a
00574          * different VM page than the current block or (b) we released our pin
00575          * and did a cycle of index vacuuming.
00576          */
00577         visibilitymap_pin(onerel, blkno, &vmbuffer);
00578 
00579         buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
00580                                  RBM_NORMAL, vac_strategy);
00581 
00582         /* We need buffer cleanup lock so that we can prune HOT chains. */
00583         if (!ConditionalLockBufferForCleanup(buf))
00584         {
00585             /*
00586              * If we're not scanning the whole relation to guard against XID
00587              * wraparound, it's OK to skip vacuuming a page.  The next vacuum
00588              * will clean it up.
00589              */
00590             if (!scan_all)
00591             {
00592                 ReleaseBuffer(buf);
00593                 continue;
00594             }
00595 
00596             /*
00597              * If this is a wraparound checking vacuum, then we read the page
00598              * with share lock to see if any xids need to be frozen. If the
00599              * page doesn't need attention we just skip and continue. If it
00600              * does, we wait for cleanup lock.
00601              *
00602              * We could defer the lock request further by remembering the page
00603              * and coming back to it later, or we could even register
00604              * ourselves for multiple buffers and then service whichever one
00605              * is received first.  For now, this seems good enough.
00606              */
00607             LockBuffer(buf, BUFFER_LOCK_SHARE);
00608             if (!lazy_check_needs_freeze(buf))
00609             {
00610                 UnlockReleaseBuffer(buf);
00611                 continue;
00612             }
00613             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
00614             LockBufferForCleanup(buf);
00615             /* drop through to normal processing */
00616         }
00617 
00618         vacrelstats->scanned_pages++;
00619 
00620         page = BufferGetPage(buf);
00621 
00622         if (PageIsNew(page))
00623         {
00624             /*
00625              * An all-zeroes page could be left over if a backend extends the
00626              * relation but crashes before initializing the page. Reclaim such
00627              * pages for use.
00628              *
00629              * We have to be careful here because we could be looking at a
00630              * page that someone has just added to the relation and has not yet
00631              * been able to initialize (see RelationGetBufferForTuple). To
00632              * protect against that, release the buffer lock, grab the
00633              * relation extension lock momentarily, and re-lock the buffer. If
00634              * the page is still uninitialized by then, it must be left over
00635              * from a crashed backend, and we can initialize it.
00636              *
00637              * We don't really need the relation lock when this is a new or
00638              * temp relation, but it's probably not worth the code space to
00639              * check that, since this surely isn't a critical path.
00640              *
00641              * Note: the comparable code in vacuum.c need not worry because
00642              * it's got exclusive lock on the whole relation.
00643              */
00644             LockBuffer(buf, BUFFER_LOCK_UNLOCK);
00645             LockRelationForExtension(onerel, ExclusiveLock);
00646             UnlockRelationForExtension(onerel, ExclusiveLock);
00647             LockBufferForCleanup(buf);
00648             if (PageIsNew(page))
00649             {
00650                 ereport(WARNING,
00651                 (errmsg("relation \"%s\" page %u is uninitialized --- fixing",
00652                         relname, blkno)));
00653                 PageInit(page, BufferGetPageSize(buf), 0);
00654                 empty_pages++;
00655             }
00656             freespace = PageGetHeapFreeSpace(page);
00657             MarkBufferDirty(buf);
00658             UnlockReleaseBuffer(buf);
00659 
00660             RecordPageWithFreeSpace(onerel, blkno, freespace);
00661             continue;
00662         }
00663 
00664         if (PageIsEmpty(page))
00665         {
00666             empty_pages++;
00667             freespace = PageGetHeapFreeSpace(page);
00668 
00669             /* empty pages are always all-visible */
00670             if (!PageIsAllVisible(page))
00671             {
00672                 PageSetAllVisible(page);
00673                 MarkBufferDirty(buf);
00674                 visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
00675                                   vmbuffer, InvalidTransactionId);
00676             }
00677 
00678             UnlockReleaseBuffer(buf);
00679             RecordPageWithFreeSpace(onerel, blkno, freespace);
00680             continue;
00681         }
00682 
00683         /*
00684          * Prune all HOT-update chains in this page.
00685          *
00686          * We count tuples removed by the pruning step as removed by VACUUM.
00687          */
00688         tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false,
00689                                          &vacrelstats->latestRemovedXid);
00690 
00691         /*
00692          * Now scan the page to collect vacuumable items and check for tuples
00693          * requiring freezing.
00694          */
00695         all_visible = true;
00696         has_dead_tuples = false;
00697         nfrozen = 0;
00698         hastup = false;
00699         prev_dead_count = vacrelstats->num_dead_tuples;
00700         maxoff = PageGetMaxOffsetNumber(page);
00701 
00702         /*
00703          * Note: If you change anything in the loop below, also look at
00704          * heap_page_is_all_visible to see if that needs to be changed.
00705          */
00706         for (offnum = FirstOffsetNumber;
00707              offnum <= maxoff;
00708              offnum = OffsetNumberNext(offnum))
00709         {
00710             ItemId      itemid;
00711 
00712             itemid = PageGetItemId(page, offnum);
00713 
00714             /* Unused items require no processing, but we count 'em */
00715             if (!ItemIdIsUsed(itemid))
00716             {
00717                 nunused += 1;
00718                 continue;
00719             }
00720 
00721             /* Redirect items mustn't be touched */
00722             if (ItemIdIsRedirected(itemid))
00723             {
00724                 hastup = true;  /* this page won't be truncatable */
00725                 continue;
00726             }
00727 
00728             ItemPointerSet(&(tuple.t_self), blkno, offnum);
00729 
00730             /*
00731              * DEAD item pointers are to be vacuumed normally; but we don't
00732              * count them in tups_vacuumed, else we'd be double-counting (at
00733              * least in the common case where heap_page_prune() just freed up
00734              * a non-HOT tuple).
00735              */
00736             if (ItemIdIsDead(itemid))
00737             {
00738                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
00739                 all_visible = false;
00740                 continue;
00741             }
00742 
00743             Assert(ItemIdIsNormal(itemid));
00744 
00745             tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
00746             tuple.t_len = ItemIdGetLength(itemid);
00747 
00748             tupgone = false;
00749 
00750             switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
00751             {
00752                 case HEAPTUPLE_DEAD:
00753 
00754                     /*
00755                      * Ordinarily, DEAD tuples would have been removed by
00756                      * heap_page_prune(), but it's possible that the tuple
00757                      * state changed since heap_page_prune() looked.  In
00758                      * particular an INSERT_IN_PROGRESS tuple could have
00759                      * changed to DEAD if the inserter aborted.  So this
00760                      * cannot be considered an error condition.
00761                      *
00762                      * If the tuple is HOT-updated then it must only be
00763                      * removed by a prune operation; so we keep it just as if
00764                      * it were RECENTLY_DEAD.  Also, if it's a heap-only
00765                      * tuple, we choose to keep it, because it'll be a lot
00766                      * cheaper to get rid of it in the next pruning pass than
00767                      * to treat it like an indexed tuple.
00768                      */
00769                     if (HeapTupleIsHotUpdated(&tuple) ||
00770                         HeapTupleIsHeapOnly(&tuple))
00771                         nkeep += 1;
00772                     else
00773                         tupgone = true; /* we can delete the tuple */
00774                     all_visible = false;
00775                     break;
00776                 case HEAPTUPLE_LIVE:
00777                     /* Tuple is good --- but let's do some validity checks */
00778                     if (onerel->rd_rel->relhasoids &&
00779                         !OidIsValid(HeapTupleGetOid(&tuple)))
00780                         elog(WARNING, "relation \"%s\" TID %u/%u: OID is invalid",
00781                              relname, blkno, offnum);
00782 
00783                     /*
00784                      * Is the tuple definitely visible to all transactions?
00785                      *
00786                      * NB: Like with per-tuple hint bits, we can't set the
00787                      * PD_ALL_VISIBLE flag if the inserter committed
00788                      * asynchronously. See SetHintBits for more info. Check
00789                      * that the HEAP_XMIN_COMMITTED hint bit is set because of
00790                      * that.
00791                      */
00792                     if (all_visible)
00793                     {
00794                         TransactionId xmin;
00795 
00796                         if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
00797                         {
00798                             all_visible = false;
00799                             break;
00800                         }
00801 
00802                         /*
00803                          * The inserter definitely committed. But is it old
00804                          * enough that everyone sees it as committed?
00805                          */
00806                         xmin = HeapTupleHeaderGetXmin(tuple.t_data);
00807                         if (!TransactionIdPrecedes(xmin, OldestXmin))
00808                         {
00809                             all_visible = false;
00810                             break;
00811                         }
00812 
00813                         /* Track newest xmin on page. */
00814                         if (TransactionIdFollows(xmin, visibility_cutoff_xid))
00815                             visibility_cutoff_xid = xmin;
00816                     }
00817                     break;
00818                 case HEAPTUPLE_RECENTLY_DEAD:
00819 
00820                     /*
00821                      * If tuple is recently deleted then we must not remove it
00822                      * from relation.
00823                      */
00824                     nkeep += 1;
00825                     all_visible = false;
00826                     break;
00827                 case HEAPTUPLE_INSERT_IN_PROGRESS:
00828                     /* This is an expected case during concurrent vacuum */
00829                     all_visible = false;
00830                     break;
00831                 case HEAPTUPLE_DELETE_IN_PROGRESS:
00832                     /* This is an expected case during concurrent vacuum */
00833                     all_visible = false;
00834                     break;
00835                 default:
00836                     elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
00837                     break;
00838             }
00839 
00840             if (tupgone)
00841             {
00842                 lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
00843                 HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
00844                                              &vacrelstats->latestRemovedXid);
00845                 tups_vacuumed += 1;
00846                 has_dead_tuples = true;
00847             }
00848             else
00849             {
00850                 num_tuples += 1;
00851                 hastup = true;
00852 
00853                 /*
00854                  * Each non-removable tuple must be checked to see if it needs
00855                  * freezing.  Note we already have exclusive buffer lock.
00856                  */
00857                 if (heap_freeze_tuple(tuple.t_data, FreezeLimit,
00858                                       MultiXactFrzLimit))
00859                     frozen[nfrozen++] = offnum;
00860             }
00861         }                       /* scan along page */
00862 
00863         /*
00864          * If we froze any tuples, mark the buffer dirty, and write a WAL
00865          * record recording the changes.  We must log the changes to be
00866          * crash-safe against future truncation of CLOG.
00867          */
00868         if (nfrozen > 0)
00869         {
00870             MarkBufferDirty(buf);
00871             if (RelationNeedsWAL(onerel))
00872             {
00873                 XLogRecPtr  recptr;
00874 
00875                 recptr = log_heap_freeze(onerel, buf, FreezeLimit,
00876                                          MultiXactFrzLimit, frozen, nfrozen);
00877                 PageSetLSN(page, recptr);
00878             }
00879         }
00880 
00881         /*
00882          * If there are no indexes then we can vacuum the page right now
00883          * instead of doing a second scan.
00884          */
00885         if (nindexes == 0 &&
00886             vacrelstats->num_dead_tuples > 0)
00887         {
00888             /* Remove tuples from heap */
00889             lazy_vacuum_page(onerel, blkno, buf, 0, vacrelstats, &vmbuffer);
00890 
00891             /*
00892              * Forget the now-vacuumed tuples, and press on, but be careful
00893              * not to reset latestRemovedXid since we want that value to be
00894              * valid.
00895              */
00896             vacrelstats->num_dead_tuples = 0;
00897             vacuumed_pages++;
00898         }
00899 
00900         freespace = PageGetHeapFreeSpace(page);
00901 
00902         /* mark page all-visible, if appropriate */
00903         if (all_visible && !all_visible_according_to_vm)
00904         {
00905             /*
00906              * It should never be the case that the visibility map bit is set
00907              * while the page-level bit is clear, but the reverse is allowed
00908              * (if checksums are not enabled).  Regardless, set both bits
00909              * so that we get back in sync.
00910              *
00911              * NB: If the heap page is all-visible but the VM bit is not set,
00912              * we don't need to dirty the heap page.  However, if checksums are
00913              * enabled, we do need to make sure that the heap page is dirtied
00914              * before passing it to visibilitymap_set(), because it may be
00915              * logged.  Given that this situation should only happen in rare
00916              * cases after a crash, it is not worth optimizing.
00917              */
00918             PageSetAllVisible(page);
00919             MarkBufferDirty(buf);
00920             visibilitymap_set(onerel, blkno, buf, InvalidXLogRecPtr,
00921                               vmbuffer, visibility_cutoff_xid);
00922         }
00923 
00924         /*
00925          * As of PostgreSQL 9.2, the visibility map bit should never be set if
00926          * the page-level bit is clear.  However, it's possible that the bit
00927          * got cleared after we checked it and before we took the buffer
00928          * content lock, so we must recheck before jumping to the conclusion
00929          * that something bad has happened.
00930          */
00931         else if (all_visible_according_to_vm && !PageIsAllVisible(page)
00932                  && visibilitymap_test(onerel, blkno, &vmbuffer))
00933         {
00934             elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
00935                  relname, blkno);
00936             visibilitymap_clear(onerel, blkno, vmbuffer);
00937         }
00938 
00939         /*
00940          * It's possible for the value returned by GetOldestXmin() to move
00941          * backwards, so it's not wrong for us to see tuples that appear to
00942          * not be visible to everyone yet, while PD_ALL_VISIBLE is already
00943          * set. The real safe xmin value never moves backwards, but
00944          * GetOldestXmin() is conservative and sometimes returns a value
00945          * that's unnecessarily small, so if we see that contradiction it just
00946          * means that the tuples that we think are not visible to everyone yet
00947          * actually are, and the PD_ALL_VISIBLE flag is correct.
00948          *
00949          * There should never be dead tuples on a page with PD_ALL_VISIBLE
00950          * set, however.
00951          */
00952         else if (PageIsAllVisible(page) && has_dead_tuples)
00953         {
00954             elog(WARNING, "page containing dead tuples is marked as all-visible in relation \"%s\" page %u",
00955                  relname, blkno);
00956             PageClearAllVisible(page);
00957             MarkBufferDirty(buf);
00958             visibilitymap_clear(onerel, blkno, vmbuffer);
00959         }
00960 
00961         UnlockReleaseBuffer(buf);
00962 
00963         /* Remember the location of the last page with nonremovable tuples */
00964         if (hastup)
00965             vacrelstats->nonempty_pages = blkno + 1;
00966 
00967         /*
00968          * If we remembered any tuples for deletion, then the page will be
00969          * visited again by lazy_vacuum_heap, which will compute and record
00970          * its post-compaction free space.  If not, then we're done with this
00971          * page, so remember its free space as-is.  (This path will always be
00972          * taken if there are no indexes.)
00973          */
00974         if (vacrelstats->num_dead_tuples == prev_dead_count)
00975             RecordPageWithFreeSpace(onerel, blkno, freespace);
00976     }
00977 
00978     /* save stats for use later */
00979     vacrelstats->scanned_tuples = num_tuples;
00980     vacrelstats->tuples_deleted = tups_vacuumed;
00981 
00982     /* now we can compute the new value for pg_class.reltuples */
00983     vacrelstats->new_rel_tuples = vac_estimate_reltuples(onerel, false,
00984                                                          nblocks,
00985                                                   vacrelstats->scanned_pages,
00986                                                          num_tuples);
00987 
00988     /*
00989      * Release any remaining pin on visibility map page.
00990      */
00991     if (BufferIsValid(vmbuffer))
00992     {
00993         ReleaseBuffer(vmbuffer);
00994         vmbuffer = InvalidBuffer;
00995     }
00996 
00997     /* If any tuples need to be deleted, perform final vacuum cycle */
00998     /* XXX put a threshold on min number of tuples here? */
00999     if (vacrelstats->num_dead_tuples > 0)
01000     {
01001         /* Log cleanup info before we touch indexes */
01002         vacuum_log_cleanup_info(onerel, vacrelstats);
01003 
01004         /* Remove index entries */
01005         for (i = 0; i < nindexes; i++)
01006             lazy_vacuum_index(Irel[i],
01007                               &indstats[i],
01008                               vacrelstats);
01009         /* Remove tuples from heap */
01010         lazy_vacuum_heap(onerel, vacrelstats);
01011         vacrelstats->num_index_scans++;
01012     }
01013 
01014     /* Do post-vacuum cleanup and statistics update for each index */
01015     for (i = 0; i < nindexes; i++)
01016         lazy_cleanup_index(Irel[i], indstats[i], vacrelstats);
01017 
01018     /* If no indexes, make log report that lazy_vacuum_heap would've made */
01019     if (vacuumed_pages)
01020         ereport(elevel,
01021                 (errmsg("\"%s\": removed %.0f row versions in %u pages",
01022                         RelationGetRelationName(onerel),
01023                         tups_vacuumed, vacuumed_pages)));
01024 
01025     ereport(elevel,
01026             (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u out of %u pages",
01027                     RelationGetRelationName(onerel),
01028                     tups_vacuumed, num_tuples,
01029                     vacrelstats->scanned_pages, nblocks),
01030              errdetail("%.0f dead row versions cannot be removed yet.\n"
01031                        "There were %.0f unused item pointers.\n"
01032                        "%u pages are entirely empty.\n"
01033                        "%s.",
01034                        nkeep,
01035                        nunused,
01036                        empty_pages,
01037                        pg_rusage_show(&ru0))));
01038 }
01039 
01040 
01041 /*
01042  *  lazy_vacuum_heap() -- second pass over the heap
01043  *
01044  *      This routine marks dead tuples as unused and compacts out free
01045  *      space on their pages.  Pages not having dead tuples recorded from
01046  *      lazy_scan_heap are not visited at all.
01047  *
01048  * Note: the reason for doing this as a second pass is we cannot remove
01049  * the tuples until we've removed their index entries, and we want to
01050  * process index entry removal in batches as large as possible.
01051  */
01052 static void
01053 lazy_vacuum_heap(Relation onerel, LVRelStats *vacrelstats)
01054 {
01055     int         tupindex;
01056     int         npages;
01057     PGRUsage    ru0;
01058     Buffer      vmbuffer = InvalidBuffer;
01059 
01060     pg_rusage_init(&ru0);
01061     npages = 0;
01062 
01063     tupindex = 0;
01064     while (tupindex < vacrelstats->num_dead_tuples)
01065     {
01066         BlockNumber tblk;
01067         Buffer      buf;
01068         Page        page;
01069         Size        freespace;
01070 
01071         vacuum_delay_point();
01072 
01073         tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
01074         buf = ReadBufferExtended(onerel, MAIN_FORKNUM, tblk, RBM_NORMAL,
01075                                  vac_strategy);
01076         if (!ConditionalLockBufferForCleanup(buf))
01077         {
01078             ReleaseBuffer(buf);
01079             ++tupindex;
01080             continue;
01081         }
01082         tupindex = lazy_vacuum_page(onerel, tblk, buf, tupindex, vacrelstats,
01083                                     &vmbuffer);
01084 
01085         /* Now that we've compacted the page, record its available space */
01086         page = BufferGetPage(buf);
01087         freespace = PageGetHeapFreeSpace(page);
01088 
01089         UnlockReleaseBuffer(buf);
01090         RecordPageWithFreeSpace(onerel, tblk, freespace);
01091         npages++;
01092     }
01093 
01094     if (BufferIsValid(vmbuffer))
01095     {
01096         ReleaseBuffer(vmbuffer);
01097         vmbuffer = InvalidBuffer;
01098     }
01099 
01100     ereport(elevel,
01101             (errmsg("\"%s\": removed %d row versions in %d pages",
01102                     RelationGetRelationName(onerel),
01103                     tupindex, npages),
01104              errdetail("%s.",
01105                        pg_rusage_show(&ru0))));
01106 }
01107 
01108 /*
01109  *  lazy_vacuum_page() -- free dead tuples on a page
01110  *                   and repair its fragmentation.
01111  *
01112  * Caller must hold pin and buffer cleanup lock on the buffer.
01113  *
01114  * tupindex is the index in vacrelstats->dead_tuples of the first dead
01115  * tuple for this page.  We assume the rest follow sequentially.
01116  * The return value is the first tupindex after the tuples of this page.
01117  */
01118 static int
01119 lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
01120                  int tupindex, LVRelStats *vacrelstats, Buffer *vmbuffer)
01121 {
01122     Page        page = BufferGetPage(buffer);
01123     OffsetNumber unused[MaxOffsetNumber];
01124     int         uncnt = 0;
01125     TransactionId   visibility_cutoff_xid;
01126 
01127     START_CRIT_SECTION();
01128 
01129     for (; tupindex < vacrelstats->num_dead_tuples; tupindex++)
01130     {
01131         BlockNumber tblk;
01132         OffsetNumber toff;
01133         ItemId      itemid;
01134 
01135         tblk = ItemPointerGetBlockNumber(&vacrelstats->dead_tuples[tupindex]);
01136         if (tblk != blkno)
01137             break;              /* past end of tuples for this block */
01138         toff = ItemPointerGetOffsetNumber(&vacrelstats->dead_tuples[tupindex]);
01139         itemid = PageGetItemId(page, toff);
01140         ItemIdSetUnused(itemid);
01141         unused[uncnt++] = toff;
01142     }
01143 
01144     PageRepairFragmentation(page);
01145 
01146     /*
01147      * Mark buffer dirty before we write WAL.
01148      *
01149      * If checksums are enabled, visibilitymap_set() may log the heap page, so
01150      * we must mark heap buffer dirty before calling visibilitymap_set().
01151      */
01152     MarkBufferDirty(buffer);
01153 
01154     /*
01155      * Now that we have removed the dead tuples from the page, once again check
01156      * if the page has become all-visible.
01157      */
01158     if (!visibilitymap_test(onerel, blkno, vmbuffer) &&
01159         heap_page_is_all_visible(buffer, &visibility_cutoff_xid))
01160     {
01161         Assert(BufferIsValid(*vmbuffer));
01162         PageSetAllVisible(page);
01163         visibilitymap_set(onerel, blkno, buffer, InvalidXLogRecPtr, *vmbuffer,
01164                 visibility_cutoff_xid);
01165     }
01166 
01167     /* XLOG stuff */
01168     if (RelationNeedsWAL(onerel))
01169     {
01170         XLogRecPtr  recptr;
01171 
01172         recptr = log_heap_clean(onerel, buffer,
01173                                 NULL, 0, NULL, 0,
01174                                 unused, uncnt,
01175                                 vacrelstats->latestRemovedXid);
01176         PageSetLSN(page, recptr);
01177     }
01178 
01179     END_CRIT_SECTION();
01180 
01181     return tupindex;
01182 }
01183 
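/*
 * lazy_vacuum_page() above re-checks all-visibility with
 * heap_page_is_all_visible(), whose body appears later in this file.  The
 * sketch below illustrates the idea under the same rules as the
 * HEAPTUPLE_LIVE checks in lazy_scan_heap(): every remaining tuple must be
 * committed and older than OldestXmin, and the newest such xmin becomes the
 * visibility cutoff recorded in the visibility map.  The _sketch name is
 * illustrative only and the real function may differ in detail.
 */
static bool
heap_page_is_all_visible_sketch(Buffer buf, TransactionId *visibility_cutoff_xid)
{
    Page        page = BufferGetPage(buf);
    OffsetNumber offnum,
                maxoff;

    *visibility_cutoff_xid = InvalidTransactionId;

    maxoff = PageGetMaxOffsetNumber(page);
    for (offnum = FirstOffsetNumber;
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemid = PageGetItemId(page, offnum);
        HeapTupleHeader htup;
        TransactionId xmin;

        /* unused and redirect line pointers don't affect visibility */
        if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
            continue;

        /* a remaining dead line pointer means the page is not all-visible */
        if (ItemIdIsDead(itemid))
            return false;

        htup = (HeapTupleHeader) PageGetItem(page, itemid);

        /* only tuples visible to everyone qualify */
        if (HeapTupleSatisfiesVacuum(htup, OldestXmin, buf) != HEAPTUPLE_LIVE)
            return false;
        if (!(htup->t_infomask & HEAP_XMIN_COMMITTED))
            return false;
        xmin = HeapTupleHeaderGetXmin(htup);
        if (!TransactionIdPrecedes(xmin, OldestXmin))
            return false;

        /* track the newest xmin on the page */
        if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
            *visibility_cutoff_xid = xmin;
    }

    return true;
}
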
01184 /*
01185  *  lazy_check_needs_freeze() -- scan page to see if any tuples
01186  *                   need to be cleaned to avoid wraparound
01187  *
01188  * Returns true if the page needs to be vacuumed using cleanup lock.
01189  */
01190 static bool
01191 lazy_check_needs_freeze(Buffer buf)
01192 {
01193     Page        page;
01194     OffsetNumber offnum,
01195                 maxoff;
01196     HeapTupleHeader tupleheader;
01197 
01198     page = BufferGetPage(buf);
01199 
01200     if (PageIsNew(page) || PageIsEmpty(page))
01201     {
01202         /* PageIsNew probably shouldn't happen... */
01203         return false;
01204     }
01205 
01206     maxoff = PageGetMaxOffsetNumber(page);
01207     for (offnum = FirstOffsetNumber;
01208          offnum <= maxoff;
01209          offnum = OffsetNumberNext(offnum))
01210     {
01211         ItemId      itemid;
01212 
01213         itemid = PageGetItemId(page, offnum);
01214 
01215         if (!ItemIdIsNormal(itemid))
01216             continue;
01217 
01218         tupleheader = (HeapTupleHeader) PageGetItem(page, itemid);
01219 
01220         if (heap_tuple_needs_freeze(tupleheader, FreezeLimit,
01221                                     MultiXactFrzLimit, buf))
01222             return true;
01223     }                           /* scan along page */
01224 
01225     return false;
01226 }
01227 
01228 
01229 /*
01230  *  lazy_vacuum_index() -- vacuum one index relation.
01231  *
01232  *      Delete all the index entries pointing to tuples listed in
01233  *      vacrelstats->dead_tuples, and update running statistics.
01234  */
01235 static void
01236 lazy_vacuum_index(Relation indrel,
01237                   IndexBulkDeleteResult **stats,
01238                   LVRelStats *vacrelstats)
01239 {
01240     IndexVacuumInfo ivinfo;
01241     PGRUsage    ru0;
01242 
01243     pg_rusage_init(&ru0);
01244 
01245     ivinfo.index = indrel;
01246     ivinfo.analyze_only = false;
01247     ivinfo.estimated_count = true;
01248     ivinfo.message_level = elevel;
01249     ivinfo.num_heap_tuples = vacrelstats->old_rel_tuples;
01250     ivinfo.strategy = vac_strategy;
01251 
01252     /* Do bulk deletion */
01253     *stats = index_bulk_delete(&ivinfo, *stats,
01254                                lazy_tid_reaped, (void *) vacrelstats);
01255 
01256     ereport(elevel,
01257             (errmsg("scanned index \"%s\" to remove %d row versions",
01258                     RelationGetRelationName(indrel),
01259                     vacrelstats->num_dead_tuples),
01260              errdetail("%s.", pg_rusage_show(&ru0))));
01261 }
01262 
01263 /*
01264  *  lazy_cleanup_index() -- do post-vacuum cleanup for one index relation.
01265  */
01266 static void
01267 lazy_cleanup_index(Relation indrel,
01268                    IndexBulkDeleteResult *stats,
01269                    LVRelStats *vacrelstats)
01270 {
01271     IndexVacuumInfo ivinfo;
01272     PGRUsage    ru0;
01273 
01274     pg_rusage_init(&ru0);
01275 
01276     ivinfo.index = indrel;
01277     ivinfo.analyze_only = false;
01278     ivinfo.estimated_count = (vacrelstats->scanned_pages < vacrelstats->rel_pages);
01279     ivinfo.message_level = elevel;
01280     ivinfo.num_heap_tuples = vacrelstats->new_rel_tuples;
01281     ivinfo.strategy = vac_strategy;
01282 
01283     stats = index_vacuum_cleanup(&ivinfo, stats);
01284 
01285     if (!stats)
01286         return;
01287 
01288     /*
01289      * Now update statistics in pg_class, but only if the index says the count
01290      * is accurate.
01291      */
01292     if (!stats->estimated_count)
01293         vac_update_relstats(indrel,
01294                             stats->num_pages,
01295                             stats->num_index_tuples,
01296                             0,
01297                             false,
01298                             InvalidTransactionId,
01299                             InvalidMultiXactId);
01300 
01301     ereport(elevel,
01302             (errmsg("index \"%s\" now contains %.0f row versions in %u pages",
01303                     RelationGetRelationName(indrel),
01304                     stats->num_index_tuples,
01305                     stats->num_pages),
01306              errdetail("%.0f index row versions were removed.\n"
01307              "%u index pages have been deleted, %u are currently reusable.\n"
01308                        "%s.",
01309                        stats->tuples_removed,
01310                        stats->pages_deleted, stats->pages_free,
01311                        pg_rusage_show(&ru0))));
01312 
01313     pfree(stats);
01314 }
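Editor's note: the two index routines above run at different points in a vacuum. lazy_vacuum_index() is called for each index whenever the dead-TID array fills up during the heap scan (and once more after the scan if any dead TIDs remain), while lazy_cleanup_index() is called exactly once per index at the end. The standalone sketch below illustrates that batching pattern only; record_dead(), flush_batch() and the tiny MAX_DEAD limit are illustrative stand-ins, not PostgreSQL APIs.

#include <stdio.h>

#define MAX_DEAD 4                      /* tiny limit to force several flushes */

static int  dead[MAX_DEAD];
static int  n_dead = 0;
static int  n_passes = 0;

/* Stand-in for "vacuum every index, then the heap, then reset the array". */
static void flush_batch(void)
{
    if (n_dead == 0)
        return;
    printf("index pass %d: bulk-delete %d TIDs from each index\n",
           ++n_passes, n_dead);
    n_dead = 0;                         /* resume the heap scan with an empty array */
}

/* Stand-in for lazy_record_dead_tuple(). */
static void record_dead(int tid)
{
    if (n_dead < MAX_DEAD)
        dead[n_dead++] = tid;
}

int main(void)
{
    for (int tid = 0; tid < 10; tid++)
    {
        /* In vacuumlazy.c this check is made in lazy_scan_heap() before each
         * page, not per tuple, but the effect is the same. */
        if (n_dead >= MAX_DEAD)
            flush_batch();
        record_dead(tid);               /* heap scan found a dead tuple */
    }
    flush_batch();                      /* final pass once the scan completes */
    printf("then the lazy_cleanup_index() step runs once per index\n");
    return 0;
}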
01315 
01316 /*
01317  * lazy_truncate_heap - try to truncate off any empty pages at the end
01318  */
01319 static void
01320 lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
01321 {
01322     BlockNumber old_rel_pages = vacrelstats->rel_pages;
01323     BlockNumber new_rel_pages;
01324     PGRUsage    ru0;
01325     int         lock_retry;
01326 
01327     pg_rusage_init(&ru0);
01328 
01329     /*
01330      * Loop until no more truncating can be done.
01331      */
01332     do
01333     {
01334         /*
01335          * We need full exclusive lock on the relation in order to do
01336          * truncation. If we can't get it, give up rather than waiting --- we
01337          * don't want to block other backends, and we don't want to deadlock
01338          * (which is quite possible considering we already hold a lower-grade
01339          * lock).
01340          */
01341         vacrelstats->lock_waiter_detected = false;
01342         lock_retry = 0;
01343         while (true)
01344         {
01345             if (ConditionalLockRelation(onerel, AccessExclusiveLock))
01346                 break;
01347 
01348             /*
01349              * Check for interrupts while trying to (re-)acquire the exclusive
01350              * lock.
01351              */
01352             CHECK_FOR_INTERRUPTS();
01353 
01354             if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT /
01355                                 VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL))
01356             {
01357                 /*
01358                  * We failed to establish the lock in the specified number of
01359                  * retries, so give up truncating.
01360                  */
01361                 vacrelstats->lock_waiter_detected = true;
01362                 ereport(elevel,
01363                         (errmsg("\"%s\": stopping truncate due to conflicting lock request",
01364                                 RelationGetRelationName(onerel))));
01365                 return;
01366             }
01367 
01368             pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL * 1000L); /* interval is in ms, pg_usleep takes microseconds */
01369         }
01370 
01371         /*
01372          * Now that we have exclusive lock, look to see if the rel has grown
01373          * whilst we were vacuuming with non-exclusive lock.  If so, give up;
01374          * the newly added pages presumably contain non-deletable tuples.
01375          */
01376         new_rel_pages = RelationGetNumberOfBlocks(onerel);
01377         if (new_rel_pages != old_rel_pages)
01378         {
01379             /*
01380              * Note: we intentionally don't update vacrelstats->rel_pages with
01381              * the new rel size here.  If we did, it would amount to assuming
01382              * that the new pages are empty, which is unlikely. Leaving the
01383              * numbers alone amounts to assuming that the new pages have the
01384              * same tuple density as existing ones, which is less unlikely.
01385              */
01386             UnlockRelation(onerel, AccessExclusiveLock);
01387             return;
01388         }
01389 
01390         /*
01391          * Scan backwards from the end to verify that the end pages actually
01392          * contain no tuples.  This is *necessary*, not optional, because
01393          * other backends could have added tuples to these pages whilst we
01394          * were vacuuming.
01395          */
01396         new_rel_pages = count_nondeletable_pages(onerel, vacrelstats);
01397 
01398         if (new_rel_pages >= old_rel_pages)
01399         {
01400             /* can't do anything after all */
01401             UnlockRelation(onerel, AccessExclusiveLock);
01402             return;
01403         }
01404 
01405         /*
01406          * Okay to truncate.
01407          */
01408         RelationTruncate(onerel, new_rel_pages);
01409 
01410         /*
01411          * We can release the exclusive lock as soon as we have truncated.
01412          * Other backends can't safely access the relation until they have
01413          * processed the smgr invalidation that smgrtruncate sent out ... but
01414          * that should happen as part of standard invalidation processing once
01415          * they acquire lock on the relation.
01416          */
01417         UnlockRelation(onerel, AccessExclusiveLock);
01418 
01419         /*
01420          * Update statistics.  Here, it *is* correct to adjust rel_pages
01421          * without also touching reltuples, since the tuple count wasn't
01422          * changed by the truncation.
01423          */
01424         vacrelstats->pages_removed += old_rel_pages - new_rel_pages;
01425         vacrelstats->rel_pages = new_rel_pages;
01426 
01427         ereport(elevel,
01428                 (errmsg("\"%s\": truncated %u to %u pages",
01429                         RelationGetRelationName(onerel),
01430                         old_rel_pages, new_rel_pages),
01431                  errdetail("%s.",
01432                            pg_rusage_show(&ru0))));
01433         old_rel_pages = new_rel_pages;
01434     } while (new_rel_pages > vacrelstats->nonempty_pages &&
01435              vacrelstats->lock_waiter_detected);
01436 }
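Editor's note: the lock-acquisition loop at the top of lazy_truncate_heap() is a bounded try-lock. It attempts the exclusive lock without blocking, and on failure sleeps for VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL and retries until the cumulative wait reaches VACUUM_TRUNCATE_LOCK_TIMEOUT, at which point truncation is simply skipped rather than making other backends queue behind us. A minimal standalone sketch of that pattern; try_lock() and the two millisecond constants here are illustrative stand-ins for ConditionalLockRelation() and the #defines near the top of the file.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define LOCK_WAIT_INTERVAL_MS 50        /* cf. VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL */
#define LOCK_TIMEOUT_MS       5000      /* illustrative total retry budget */

/* Hypothetical non-blocking lock attempt; stands in for
 * ConditionalLockRelation(onerel, AccessExclusiveLock). */
static bool try_lock(void)
{
    static int  calls = 0;

    return ++calls > 3;                 /* pretend the 4th attempt succeeds */
}

int main(void)
{
    int     lock_retry = 0;

    for (;;)
    {
        if (try_lock())
        {
            printf("exclusive lock acquired after %d retries\n", lock_retry);
            break;
        }

        if (++lock_retry > LOCK_TIMEOUT_MS / LOCK_WAIT_INTERVAL_MS)
        {
            /* corresponds to "stopping truncate due to conflicting lock request" */
            printf("giving up on truncation\n");
            return 1;
        }

        usleep(LOCK_WAIT_INTERVAL_MS * 1000);   /* usleep() takes microseconds */
    }

    /* ... re-check the relation size, truncate, release the lock ... */
    return 0;
}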
01437 
01438 /*
01439  * Rescan end pages to verify that they are (still) empty of tuples.
01440  *
01441  * Returns number of nondeletable pages (last nonempty page + 1).
01442  */
01443 static BlockNumber
01444 count_nondeletable_pages(Relation onerel, LVRelStats *vacrelstats)
01445 {
01446     BlockNumber blkno;
01447     instr_time  starttime;
01448 
01449     /* Initialize starttime; it is used below when checking for conflicting lock requests */
01450     INSTR_TIME_SET_CURRENT(starttime);
01451 
01452     /* Strange coding of loop control is needed because blkno is unsigned */
01453     blkno = vacrelstats->rel_pages;
01454     while (blkno > vacrelstats->nonempty_pages)
01455     {
01456         Buffer      buf;
01457         Page        page;
01458         OffsetNumber offnum,
01459                     maxoff;
01460         bool        hastup;
01461 
01462         /*
01463          * Check whether another process is waiting for a lock on our
01464          * relation; since we hold AccessExclusiveLock here, any such
01465          * requester is blocked on us.  We do the lock-table lookup at most
01466          * once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL, and we test whether
01467          * that interval has elapsed only once every 32 blocks, to keep the
01468          * number of system calls and shared lock table lookups to a minimum.
01469          */
01470         if ((blkno % 32) == 0)
01471         {
01472             instr_time  currenttime;
01473             instr_time  elapsed;
01474 
01475             INSTR_TIME_SET_CURRENT(currenttime);
01476             elapsed = currenttime;
01477             INSTR_TIME_SUBTRACT(elapsed, starttime);
01478             if ((INSTR_TIME_GET_MICROSEC(elapsed) / 1000)
01479                 >= VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL)
01480             {
01481                 if (LockHasWaitersRelation(onerel, AccessExclusiveLock))
01482                 {
01483                     ereport(elevel,
01484                             (errmsg("\"%s\": suspending truncate due to conflicting lock request",
01485                                     RelationGetRelationName(onerel))));
01486 
01487                     vacrelstats->lock_waiter_detected = true;
01488                     return blkno;
01489                 }
01490                 starttime = currenttime;
01491             }
01492         }
01493 
01494         /*
01495          * We don't insert a vacuum delay point here, because we have an
01496          * exclusive lock on the table which we want to hold for as short a
01497          * time as possible.  We still need to check for interrupts however.
01498          */
01499         CHECK_FOR_INTERRUPTS();
01500 
01501         blkno--;
01502 
01503         buf = ReadBufferExtended(onerel, MAIN_FORKNUM, blkno,
01504                                  RBM_NORMAL, vac_strategy);
01505 
01506         /* In this phase we only need shared access to the buffer */
01507         LockBuffer(buf, BUFFER_LOCK_SHARE);
01508 
01509         page = BufferGetPage(buf);
01510 
01511         if (PageIsNew(page) || PageIsEmpty(page))
01512         {
01513             /* PageIsNew probably shouldn't happen... */
01514             UnlockReleaseBuffer(buf);
01515             continue;
01516         }
01517 
01518         hastup = false;
01519         maxoff = PageGetMaxOffsetNumber(page);
01520         for (offnum = FirstOffsetNumber;
01521              offnum <= maxoff;
01522              offnum = OffsetNumberNext(offnum))
01523         {
01524             ItemId      itemid;
01525 
01526             itemid = PageGetItemId(page, offnum);
01527 
01528             /*
01529              * Note: any non-unused item should be taken as a reason to keep
01530              * this page.  We formerly thought that DEAD tuples could be
01531              * thrown away, but that's not so, because we'd not have cleaned
01532              * out their index entries.
01533              */
01534             if (ItemIdIsUsed(itemid))
01535             {
01536                 hastup = true;
01537                 break;          /* can stop scanning */
01538             }
01539         }                       /* scan along page */
01540 
01541         UnlockReleaseBuffer(buf);
01542 
01543         /* Done scanning if we found a tuple here */
01544         if (hastup)
01545             return blkno + 1;
01546     }
01547 
01548     /*
01549      * If we fall out of the loop, all the previously-thought-to-be-empty
01550      * pages still are; we need not bother to look at the last known-nonempty
01551      * page.
01552      */
01553     return vacrelstats->nonempty_pages;
01554 }
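Editor's note: the check near the top of the loop above is a two-level throttle. The (blkno % 32) == 0 test avoids even reading the clock on most iterations, and the elapsed-time comparison then limits the comparatively expensive lock-table probe to once per VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL. A standalone sketch of the same pattern, using clock_gettime() in place of the instr_time macros; lock_has_waiters() is a hypothetical stand-in for LockHasWaitersRelation().

#define _POSIX_C_SOURCE 200809L
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define CHECK_INTERVAL_MS 20            /* cf. VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL */

static long elapsed_ms(const struct timespec *start, const struct timespec *now)
{
    return (now->tv_sec - start->tv_sec) * 1000L +
           (now->tv_nsec - start->tv_nsec) / 1000000L;
}

/* Hypothetical stand-in for LockHasWaitersRelation(). */
static bool lock_has_waiters(void)
{
    return false;
}

int main(void)
{
    struct timespec start;

    clock_gettime(CLOCK_MONOTONIC, &start);

    for (unsigned blkno = 100000; blkno > 0; blkno--)
    {
        /* Cheap filter: only look at the clock once every 32 blocks. */
        if ((blkno % 32) == 0)
        {
            struct timespec now;

            clock_gettime(CLOCK_MONOTONIC, &now);

            /* Expensive lock-table check at most once per CHECK_INTERVAL_MS. */
            if (elapsed_ms(&start, &now) >= CHECK_INTERVAL_MS)
            {
                if (lock_has_waiters())
                {
                    printf("suspending at block %u\n", blkno);
                    return 0;
                }
                start = now;            /* restart the interval */
            }
        }
        /* ... examine block blkno ... */
    }
    return 0;
}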
01555 
01556 /*
01557  * lazy_space_alloc - space allocation decisions for lazy vacuum
01558  *
01559  * See the comments at the head of this file for rationale.
01560  */
01561 static void
01562 lazy_space_alloc(LVRelStats *vacrelstats, BlockNumber relblocks)
01563 {
01564     long        maxtuples;
01565 
01566     if (vacrelstats->hasindex)
01567     {
01568         maxtuples = (maintenance_work_mem * 1024L) / sizeof(ItemPointerData);
01569         maxtuples = Min(maxtuples, INT_MAX);
01570         maxtuples = Min(maxtuples, MaxAllocSize / sizeof(ItemPointerData));
01571 
01572         /* curious coding here to ensure the multiplication can't overflow */
01573         if ((BlockNumber) (maxtuples / LAZY_ALLOC_TUPLES) > relblocks)
01574             maxtuples = relblocks * LAZY_ALLOC_TUPLES;
01575 
01576         /* stay sane if small maintenance_work_mem */
01577         maxtuples = Max(maxtuples, MaxHeapTuplesPerPage);
01578     }
01579     else
01580     {
01581         maxtuples = MaxHeapTuplesPerPage;
01582     }
01583 
01584     vacrelstats->num_dead_tuples = 0;
01585     vacrelstats->max_dead_tuples = (int) maxtuples;
01586     vacrelstats->dead_tuples = (ItemPointer)
01587         palloc(maxtuples * sizeof(ItemPointerData));
01588 }
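Editor's note, to make the sizing above concrete: an ItemPointerData is 6 bytes, so with the default 8 kB block size (MaxHeapTuplesPerPage = 291, which is also LAZY_ALLOC_TUPLES) a maintenance_work_mem of 64 MB buys room for about 11.1 million dead-tuple TIDs before an index-vacuuming pass is forced, while a small table is capped at 291 TIDs per heap block. The standalone sketch below mirrors that arithmetic; the constants are the usual defaults rather than values read from a server, and the MaxAllocSize clamp is omitted for brevity.

#include <limits.h>
#include <stdio.h>

/* Usual defaults for an 8 kB block size; illustrative only. */
#define ITEM_POINTER_SIZE        6      /* sizeof(ItemPointerData) */
#define MAX_HEAP_TUPLES_PER_PAGE 291    /* MaxHeapTuplesPerPage == LAZY_ALLOC_TUPLES */

static long max_dead_tuples(long maintenance_work_mem_kb, long relblocks)
{
    long    maxtuples = (maintenance_work_mem_kb * 1024L) / ITEM_POINTER_SIZE;

    if (maxtuples > INT_MAX)
        maxtuples = INT_MAX;

    /* don't allocate more slots than the table could possibly need */
    if (maxtuples / MAX_HEAP_TUPLES_PER_PAGE > relblocks)
        maxtuples = relblocks * MAX_HEAP_TUPLES_PER_PAGE;

    /* stay sane if maintenance_work_mem is tiny */
    if (maxtuples < MAX_HEAP_TUPLES_PER_PAGE)
        maxtuples = MAX_HEAP_TUPLES_PER_PAGE;

    return maxtuples;
}

int main(void)
{
    /* 64 MB budget, million-block (~7.8 GB) table: memory is the limit */
    printf("%ld\n", max_dead_tuples(65536, 1000000));   /* 11184810 */

    /* same budget, 100-block table: the per-block cap is the limit */
    printf("%ld\n", max_dead_tuples(65536, 100));       /* 29100 */
    return 0;
}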
01589 
01590 /*
01591  * lazy_record_dead_tuple - remember one deletable tuple
01592  */
01593 static void
01594 lazy_record_dead_tuple(LVRelStats *vacrelstats,
01595                        ItemPointer itemptr)
01596 {
01597     /*
01598      * The array shouldn't overflow under normal behavior, but perhaps it
01599      * could if we are given a really small maintenance_work_mem. In that
01600      * case, just forget the last few tuples (we'll get 'em next time).
01601      */
01602     if (vacrelstats->num_dead_tuples < vacrelstats->max_dead_tuples)
01603     {
01604         vacrelstats->dead_tuples[vacrelstats->num_dead_tuples] = *itemptr;
01605         vacrelstats->num_dead_tuples++;
01606     }
01607 }
01608 
01609 /*
01610  *  lazy_tid_reaped() -- is a particular tid deletable?
01611  *
01612  *      This has the right signature to be an IndexBulkDeleteCallback.
01613  *
01614  *      Assumes dead_tuples array is in sorted order.
01615  */
01616 static bool
01617 lazy_tid_reaped(ItemPointer itemptr, void *state)
01618 {
01619     LVRelStats *vacrelstats = (LVRelStats *) state;
01620     ItemPointer res;
01621 
01622     res = (ItemPointer) bsearch((void *) itemptr,
01623                                 (void *) vacrelstats->dead_tuples,
01624                                 vacrelstats->num_dead_tuples,
01625                                 sizeof(ItemPointerData),
01626                                 vac_cmp_itemptr);
01627 
01628     return (res != NULL);
01629 }
01630 
01631 /*
01632  * Comparator routine for use with qsort() and bsearch().
01633  */
01634 static int
01635 vac_cmp_itemptr(const void *left, const void *right)
01636 {
01637     BlockNumber lblk,
01638                 rblk;
01639     OffsetNumber loff,
01640                 roff;
01641 
01642     lblk = ItemPointerGetBlockNumber((ItemPointer) left);
01643     rblk = ItemPointerGetBlockNumber((ItemPointer) right);
01644 
01645     if (lblk < rblk)
01646         return -1;
01647     if (lblk > rblk)
01648         return 1;
01649 
01650     loff = ItemPointerGetOffsetNumber((ItemPointer) left);
01651     roff = ItemPointerGetOffsetNumber((ItemPointer) right);
01652 
01653     if (loff < roff)
01654         return -1;
01655     if (loff > roff)
01656         return 1;
01657 
01658     return 0;
01659 }
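Editor's note: lazy_tid_reaped() works only because the dead_tuples array is kept in the order defined by this comparator, block number first and then offset, which is exactly the physical order in which lazy_scan_heap() records dead tuples, so no explicit qsort() is ever needed. A small self-contained sketch of the same lookup over a simplified TID struct; the struct and the sample data are illustrative, not PostgreSQL types.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for ItemPointerData: block number + line pointer offset. */
typedef struct
{
    uint32_t    blk;
    uint16_t    off;
} Tid;

/* Same ordering as vac_cmp_itemptr(): block first, then offset. */
static int tid_cmp(const void *left, const void *right)
{
    const Tid  *l = left;
    const Tid  *r = right;

    if (l->blk < r->blk) return -1;
    if (l->blk > r->blk) return 1;
    if (l->off < r->off) return -1;
    if (l->off > r->off) return 1;
    return 0;
}

/* Analogue of lazy_tid_reaped(): is this TID in the dead-tuple array? */
static bool tid_reaped(const Tid *tid, const Tid *dead, size_t ndead)
{
    return bsearch(tid, dead, ndead, sizeof(Tid), tid_cmp) != NULL;
}

int main(void)
{
    /* Already in heap order, as lazy_scan_heap() records them. */
    Tid     dead[] = {{0, 3}, {0, 7}, {2, 1}, {5, 4}};
    Tid     probe1 = {2, 1};
    Tid     probe2 = {2, 2};

    printf("%d %d\n",
           tid_reaped(&probe1, dead, 4),    /* 1: its index entries get deleted */
           tid_reaped(&probe2, dead, 4));   /* 0: its index entries are kept */
    return 0;
}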
01660 
01661 /*
01662  * Check if every tuple in the given page is visible to all current and future
01663  * transactions. Also return the visibility_cutoff_xid, which is the highest
01664  * xmin amongst the visible tuples.
01665  */
01666 static bool
01667 heap_page_is_all_visible(Buffer buf, TransactionId *visibility_cutoff_xid)
01668 {
01669     Page         page = BufferGetPage(buf);
01670     OffsetNumber offnum,
01671                  maxoff;
01672     bool         all_visible = true;
01673 
01674     *visibility_cutoff_xid = InvalidTransactionId;
01675 
01676     /*
01677      * This is a stripped down version of the line pointer scan in
01678      * lazy_scan_heap(). So if you change anything here, also check that
01679      * code.
01680      */
01681     maxoff = PageGetMaxOffsetNumber(page);
01682     for (offnum = FirstOffsetNumber;
01683             offnum <= maxoff && all_visible;
01684             offnum = OffsetNumberNext(offnum))
01685     {
01686         ItemId          itemid;
01687         HeapTupleData   tuple;
01688 
01689         itemid = PageGetItemId(page, offnum);
01690 
01691         /* Unused or redirect line pointers are of no interest */
01692         if (!ItemIdIsUsed(itemid) || ItemIdIsRedirected(itemid))
01693             continue;
01694 
01695         ItemPointerSet(&(tuple.t_self), BufferGetBlockNumber(buf), offnum);
01696 
01697         /*
01698          * Dead line pointers can have index pointers pointing to them, so they
01699          * can't be treated as visible.
01700          */
01701         if (ItemIdIsDead(itemid))
01702         {
01703             all_visible = false;
01704             break;
01705         }
01706 
01707         Assert(ItemIdIsNormal(itemid));
01708 
01709         tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid);
01710 
01711         switch (HeapTupleSatisfiesVacuum(tuple.t_data, OldestXmin, buf))
01712         {
01713             case HEAPTUPLE_LIVE:
01714                 {
01715                     TransactionId xmin;
01716 
01717                     /* Check comments in lazy_scan_heap. */
01718                     if (!(tuple.t_data->t_infomask & HEAP_XMIN_COMMITTED))
01719                     {
01720                         all_visible = false;
01721                         break;
01722                     }
01723 
01724                     /*
01725                      * The inserter definitely committed. But is it old
01726                      * enough that everyone sees it as committed?
01727                      */
01728                     xmin = HeapTupleHeaderGetXmin(tuple.t_data);
01729                     if (!TransactionIdPrecedes(xmin, OldestXmin))
01730                     {
01731                         all_visible = false;
01732                         break;
01733                     }
01734 
01735                     /* Track newest xmin on page. */
01736                     if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
01737                         *visibility_cutoff_xid = xmin;
01738                 }
01739                 break;
01740 
01741             case HEAPTUPLE_DEAD:
01742             case HEAPTUPLE_RECENTLY_DEAD:
01743             case HEAPTUPLE_INSERT_IN_PROGRESS:
01744             case HEAPTUPLE_DELETE_IN_PROGRESS:
01745                 all_visible = false;
01746                 break;
01747 
01748             default:
01749                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
01750                 break;
01751         }
01752     }                       /* scan along page */
01753 
01754     return all_visible;
01755 }
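Editor's note: the xmin tracking above leans on PostgreSQL's circular transaction-ID arithmetic. For normal XIDs, TransactionIdFollows() and TransactionIdPrecedes() compare the sign of the 32-bit difference rather than raw magnitudes, so "newer" stays well defined across XID wraparound (provided the two XIDs are within about two billion transactions of each other). A simplified standalone illustration of that comparison and of the visibility_cutoff_xid bookkeeping; handling of the special XIDs below FirstNormalTransactionId is glossed over here.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t TransactionId;

#define InvalidTransactionId ((TransactionId) 0)

/* Circular comparison for normal XIDs, as TransactionIdFollows() does. */
static int xid_follows(TransactionId a, TransactionId b)
{
    return (int32_t) (a - b) > 0;
}

int main(void)
{
    /* Visible tuples' xmins on a page; the last one is "past wraparound". */
    TransactionId xmins[] = {4000000100u, 4000000200u, 10u};
    TransactionId cutoff = InvalidTransactionId;

    for (int i = 0; i < 3; i++)
    {
        /* Track the newest xmin, like *visibility_cutoff_xid above. */
        if (cutoff == InvalidTransactionId || xid_follows(xmins[i], cutoff))
            cutoff = xmins[i];
    }

    printf("visibility cutoff xid = %u\n", cutoff);   /* 10: newest despite wraparound */
    return 0;
}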