/*-------------------------------------------------------------------------
 *
 * visibilitymap.c
 *    bitmap for tracking visibility of heap tuples
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/heap/visibilitymap.c
 *
 * INTERFACE ROUTINES
 *      visibilitymap_clear  - clear a bit in the visibility map
 *      visibilitymap_pin    - pin a map page for setting a bit
 *      visibilitymap_pin_ok - check whether correct map page is already pinned
 *      visibilitymap_set    - set a bit in a previously pinned page
 *      visibilitymap_test   - test if a bit is set
 *      visibilitymap_count  - count number of bits set in visibility map
 *      visibilitymap_truncate  - truncate the visibility map
 *
 * NOTES
 *
 * The visibility map is a bitmap with one bit per heap page. A set bit means
 * that all tuples on the page are known visible to all transactions, and
 * therefore the page doesn't need to be vacuumed. The map is conservative in
 * the sense that we make sure that whenever a bit is set, we know the
 * condition is true, but if a bit is not set, it might or might not be true.
 *
 * Clearing a visibility map bit is not separately WAL-logged.  The callers
 * must make sure that whenever a bit is cleared, the bit is cleared on WAL
 * replay of the updating operation as well.
 *
 * When we *set* a visibility map bit during VACUUM, we must write WAL.  This
 * may seem counterintuitive, since the bit is basically a hint: if it is
 * clear, it may still be the case that every tuple on the page is visible to
 * all transactions; we just don't know that for certain.  The difficulty is
 * that there are two bits which are typically set together: the
 * PD_ALL_VISIBLE bit on the page itself, and the visibility map bit.  If a
 * crash occurs after the visibility map page makes it to disk and before the
 * updated heap page makes it to disk, redo must set the bit on the heap page.
 * Otherwise, the next insert, update, or delete on the heap page will fail to
 * realize that the visibility map bit must be cleared, possibly causing
 * index-only scans to return wrong answers.
 *
 * VACUUM will normally skip pages for which the visibility map bit is set;
 * such pages can't contain any dead tuples and therefore don't need vacuuming.
 * The visibility map is not used for anti-wraparound vacuums, because
 * an anti-wraparound vacuum needs to freeze tuples and observe the latest xid
 * present in the table, even on pages that don't have any dead tuples.
 *
 * LOCKING
 *
 * In heapam.c, whenever a page is modified so that not all tuples on the
 * page are visible to everyone anymore, the corresponding bit in the
 * visibility map is cleared. In order to be crash-safe, we need to do this
 * while still holding a lock on the heap page and in the same critical
 * section that logs the page modification. However, we don't want to hold
 * the buffer lock over any I/O that may be required to read in the visibility
 * map page.  To avoid this, we examine the heap page before locking it;
 * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map
 * page.  Then, we lock the buffer.  But this creates a race condition: there
 * is a possibility that in the time it takes to lock the buffer, the
 * PD_ALL_VISIBLE bit gets set.  If that happens, we have to unlock the
 * buffer, pin the visibility map page, and relock the buffer.  This shouldn't
 * happen often, because only VACUUM currently sets visibility map bits,
 * and the race will only occur if VACUUM processes a given page at almost
 * exactly the same time that someone tries to further modify it.
 *
 * To set a bit, you need to hold a lock on the heap page. That prevents
 * the race condition where VACUUM sees that all tuples on the page are
 * visible to everyone, but another backend modifies the page before VACUUM
 * sets the bit in the visibility map.
 *
 * When a bit is set, the LSN of the visibility map page is updated to make
 * sure that the visibility map update doesn't get written to disk before the
 * WAL record of the changes that made it possible to set the bit is flushed.
 * But when a bit is cleared, we don't have to do that because it's always
 * safe to clear a bit in the map from a correctness point of view.
 *
 *-------------------------------------------------------------------------
 */
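
/*
 * Illustrative sketch (added for exposition; not part of the original file,
 * and the variable names are made up): the clear-side sequence described
 * under LOCKING above, as a caller in heapam.c would perform it.
 * visibilitymap_pin may do I/O, which is why it is called before the heap
 * buffer is locked, and the pin is re-checked after the lock is taken:
 *
 *    vmbuffer = InvalidBuffer;
 *    if (PageIsAllVisible(BufferGetPage(buffer)))
 *        visibilitymap_pin(relation, block, &vmbuffer);
 *    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 *    if (PageIsAllVisible(BufferGetPage(buffer)) &&
 *        !visibilitymap_pin_ok(block, vmbuffer))
 *    {
 *        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 *        visibilitymap_pin(relation, block, &vmbuffer);
 *        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 *    }
 *    ... start critical section, modify the page, write WAL ...
 *    if (PageIsAllVisible(BufferGetPage(buffer)))
 *    {
 *        PageClearAllVisible(BufferGetPage(buffer));
 *        visibilitymap_clear(relation, block, vmbuffer);
 *    }
 *    ... end critical section, unlock and release buffers ...
 */
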
#include "postgres.h"

#include "access/heapam_xlog.h"
#include "access/visibilitymap.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "utils/inval.h"


/*#define TRACE_VISIBILITYMAP */

/*
 * Size of the bitmap on each visibility map page, in bytes. There are no
 * extra headers, so the whole page minus the standard page header is
 * used for the bitmap.
 */
#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))

/* Number of bits allocated for each heap block. */
#define BITS_PER_HEAPBLOCK 1

/* Number of heap blocks we can represent in one byte. */
#define HEAPBLOCKS_PER_BYTE 8

/* Number of heap blocks we can represent in one visibility map page. */
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)

/* Mapping from heap block number to the right bit in the visibility map */
#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_MAPBIT(x) ((x) % HEAPBLOCKS_PER_BYTE)

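/*
 * Worked example (added for exposition; not part of the original file):
 * with the default 8192-byte BLCKSZ and the usual 24-byte page header,
 * MAPSIZE is 8192 - 24 = 8168 bytes, so HEAPBLOCKS_PER_PAGE is
 * 8168 * 8 = 65344 and one visibility map page covers 65344 * 8192 bytes,
 * roughly 510 MB, of heap.  Heap block 100000 then maps to map block
 * 100000 / 65344 = 1, map byte (100000 % 65344) / 8 = 4332, and map bit
 * 100000 % 8 = 0.
 */
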
/* table for fast counting of set bits */
static const uint8 number_of_ones[256] = {
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

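/*
 * For example (note added for exposition; not part of the original file),
 * number_of_ones[0xA3] is 4: 0xA3 is 10100011 in binary, which has four bits
 * set.  visibilitymap_count() below sums these per-byte counts rather than
 * testing each bit individually.
 */
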
/* prototypes for internal routines */
static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
static void vm_extend(Relation rel, BlockNumber nvmblocks);


/*
 *  visibilitymap_clear - clear a bit in the visibility map
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 */
void
visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    int         mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    int         mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    uint8       mask = 1 << mapBit;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
        elog(ERROR, "wrong buffer passed to visibilitymap_clear");

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    map = PageGetContents(BufferGetPage(buf));

    if (map[mapByte] & mask)
    {
        map[mapByte] &= ~mask;

        MarkBufferDirty(buf);
    }

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}

/*
 *  visibilitymap_pin - pin a map page for setting a bit
 *
 * Setting a bit in the visibility map is a two-phase operation. First, call
 * visibilitymap_pin, to pin the visibility map page containing the bit for
 * the heap page. Because that can require I/O to read the map page, you
 * shouldn't hold a lock on the heap page while doing that. Then, call
 * visibilitymap_set to actually set the bit.
 *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by
 * an earlier call to visibilitymap_pin or visibilitymap_test on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk.
 *
 * If the page doesn't exist in the map file yet, it is extended.
 */
void
visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);

    /* Reuse the old pinned buffer if possible */
    if (BufferIsValid(*buf))
    {
        if (BufferGetBlockNumber(*buf) == mapBlock)
            return;

        ReleaseBuffer(*buf);
    }
    *buf = vm_readbuf(rel, mapBlock, true);
}

/*
 *  visibilitymap_pin_ok - do we already have the correct page pinned?
 *
 * On entry, buf should be InvalidBuffer or a valid buffer returned by
 * an earlier call to visibilitymap_pin or visibilitymap_test on the same
 * relation.  The return value indicates whether the buffer covers the
 * given heapBlk.
 */
bool
visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);

    return BufferIsValid(buf) && BufferGetBlockNumber(buf) == mapBlock;
}

/*
 *  visibilitymap_set - set a bit on a previously pinned page
 *
 * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
 * or InvalidXLogRecPtr in normal running.  The page LSN is advanced to the
 * one provided; in normal running, we generate a new XLOG record and set the
 * page LSN to that value.  cutoff_xid is the largest xmin on the page being
 * marked all-visible; it is needed for Hot Standby, and can be
 * InvalidTransactionId if the page contains no tuples.
 *
 * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
 * this function. Except in recovery, caller should also pass the heap
 * buffer. When checksums are enabled and we're not in recovery, we must add
 * the heap buffer to the WAL chain to protect it from being torn.
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 */
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
                  XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    uint32      mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    uint8       mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    Page        page;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
    Assert(InRecovery || BufferIsValid(heapBuf));

    /* Check that we have the right heap page pinned, if present */
    if (BufferIsValid(heapBuf) && BufferGetBlockNumber(heapBuf) != heapBlk)
        elog(ERROR, "wrong heap buffer passed to visibilitymap_set");

    /* Check that we have the right VM page pinned */
    if (!BufferIsValid(vmBuf) || BufferGetBlockNumber(vmBuf) != mapBlock)
        elog(ERROR, "wrong VM buffer passed to visibilitymap_set");

    page = BufferGetPage(vmBuf);
    map = PageGetContents(page);
    LockBuffer(vmBuf, BUFFER_LOCK_EXCLUSIVE);

    if (!(map[mapByte] & (1 << mapBit)))
    {
        START_CRIT_SECTION();

        map[mapByte] |= (1 << mapBit);
        MarkBufferDirty(vmBuf);

        if (RelationNeedsWAL(rel))
        {
            if (XLogRecPtrIsInvalid(recptr))
            {
                Assert(!InRecovery);
                recptr = log_heap_visible(rel->rd_node, heapBuf, vmBuf,
                                          cutoff_xid);

                /*
                 * If data checksums are enabled, we need to protect the heap
                 * page from being torn.
                 */
                if (DataChecksumsEnabled())
                {
                    Page heapPage = BufferGetPage(heapBuf);

                    /* caller is expected to set PD_ALL_VISIBLE first */
                    Assert(PageIsAllVisible(heapPage));
                    PageSetLSN(heapPage, recptr);
                }
            }
            PageSetLSN(page, recptr);
        }

        END_CRIT_SECTION();
    }

    LockBuffer(vmBuf, BUFFER_LOCK_UNLOCK);
}

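/*
 * A minimal sketch of the set-side call sequence (added for exposition; not
 * part of the original file).  It follows the two-phase protocol described
 * above for a VACUUM-like caller; the variable names are made up:
 *
 *    vmbuffer = InvalidBuffer;
 *    visibilitymap_pin(rel, blkno, &vmbuffer);    (no heap lock held yet)
 *    LockBuffer(heapbuf, BUFFER_LOCK_EXCLUSIVE);
 *    if (... all tuples on the page are visible to everyone ...)
 *    {
 *        PageSetAllVisible(BufferGetPage(heapbuf));
 *        MarkBufferDirty(heapbuf);
 *        visibilitymap_set(rel, blkno, heapbuf, InvalidXLogRecPtr,
 *                          vmbuffer, visibility_cutoff_xid);
 *    }
 *    LockBuffer(heapbuf, BUFFER_LOCK_UNLOCK);
 *    ...
 *    if (BufferIsValid(vmbuffer))
 *        ReleaseBuffer(vmbuffer);
 */
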
/*
 *  visibilitymap_test - test if a bit is set
 *
 * Are all tuples on heapBlk visible to all, according to the visibility map?
 *
 * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
 * earlier call to visibilitymap_pin or visibilitymap_test on the same
 * relation. On return, *buf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 * releasing *buf after it's done testing and setting bits.
 *
 * NOTE: This function is typically called without a lock on the heap page,
 * so somebody else could change the bit just after we look at it.  In fact,
 * since we don't lock the visibility map page either, it's even possible that
 * someone else could have changed the bit just before we look at it, and yet
 * we might see the old value.  It is the caller's responsibility to deal with
 * all concurrency issues!
 */
bool
visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
    BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
    uint32      mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
    uint8       mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
    bool        result;
    char       *map;

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk);
#endif

    /* Reuse the old pinned buffer if possible */
    if (BufferIsValid(*buf))
    {
        if (BufferGetBlockNumber(*buf) != mapBlock)
        {
            ReleaseBuffer(*buf);
            *buf = InvalidBuffer;
        }
    }

    if (!BufferIsValid(*buf))
    {
        *buf = vm_readbuf(rel, mapBlock, false);
        if (!BufferIsValid(*buf))
            return false;
    }

    map = PageGetContents(BufferGetPage(*buf));

    /*
     * A single-bit read is atomic.  There could be memory-ordering effects
     * here, but for performance reasons we make it the caller's job to worry
     * about that.
     */
    result = (map[mapByte] & (1 << mapBit)) ? true : false;

    return result;
}

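/*
 * A minimal sketch of the read side (added for exposition; not part of the
 * original file; the variable names are made up): a caller such as an
 * index-only scan keeps the pinned map buffer across calls and releases it
 * once at the end.
 *
 *    vmbuffer = InvalidBuffer;
 *    for each index entry:
 *        if (visibilitymap_test(rel, heapblkno, &vmbuffer))
 *            ... use the index tuple without visiting the heap ...
 *        else
 *            ... fetch the heap tuple and check visibility ...
 *    if (BufferIsValid(vmbuffer))
 *        ReleaseBuffer(vmbuffer);
 */
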
/*
 *  visibilitymap_count  - count number of bits set in visibility map
 *
 * Note: we ignore the possibility of race conditions when the table is being
 * extended concurrently with the call.  New pages added to the table aren't
 * going to be marked all-visible, so they won't affect the result.
 */
BlockNumber
visibilitymap_count(Relation rel)
{
    BlockNumber result = 0;
    BlockNumber mapBlock;

    for (mapBlock = 0;; mapBlock++)
    {
        Buffer      mapBuffer;
        unsigned char *map;
        int         i;

        /*
         * Read till we fall off the end of the map.  We assume that any extra
         * bytes in the last page are zeroed, so we don't bother excluding
         * them from the count.
         */
        mapBuffer = vm_readbuf(rel, mapBlock, false);
        if (!BufferIsValid(mapBuffer))
            break;

        /*
         * We choose not to lock the page, since the result is going to be
         * immediately stale anyway if anyone is concurrently setting or
         * clearing bits, and we only really need an approximate value.
         */
        map = (unsigned char *) PageGetContents(BufferGetPage(mapBuffer));

        for (i = 0; i < MAPSIZE; i++)
        {
            result += number_of_ones[map[i]];
        }

        ReleaseBuffer(mapBuffer);
    }

    return result;
}

/*
 *  visibilitymap_truncate - truncate the visibility map
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the VM again.
 *
 * nheapblocks is the new size of the heap.
 */
void
visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
{
    BlockNumber newnblocks;

    /* last remaining block, byte, and bit */
    BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
    uint32      truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
    uint8       truncBit = HEAPBLK_TO_MAPBIT(nheapblocks);

#ifdef TRACE_VISIBILITYMAP
    elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
#endif

    RelationOpenSmgr(rel);

    /*
     * If no visibility map has been created yet for this relation, there's
     * nothing to truncate.
     */
    if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
        return;

    /*
     * Unless the new size is exactly at a visibility map page boundary, the
     * tail bits in the last remaining map page, representing truncated heap
     * blocks, need to be cleared. This is not only tidy, but also necessary
     * because we don't get a chance to clear the bits if the heap is extended
     * again.
     */
    if (truncByte != 0 || truncBit != 0)
    {
        Buffer      mapBuffer;
        Page        page;
        char       *map;

        newnblocks = truncBlock + 1;

        mapBuffer = vm_readbuf(rel, truncBlock, false);
        if (!BufferIsValid(mapBuffer))
        {
            /* nothing to do, the file was already smaller */
            return;
        }

        page = BufferGetPage(mapBuffer);
        map = PageGetContents(page);

        LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);

        /* Clear out the unwanted bytes. */
        MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));

        /*
         * Mask out the unwanted bits of the last remaining byte.
         *
         * ((1 << 0) - 1) = 00000000
         * ((1 << 1) - 1) = 00000001
         * ...
         * ((1 << 6) - 1) = 00111111
         * ((1 << 7) - 1) = 01111111
         */
        map[truncByte] &= (1 << truncBit) - 1;

        MarkBufferDirty(mapBuffer);
        UnlockReleaseBuffer(mapBuffer);
    }
    else
        newnblocks = truncBlock;

    if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
    {
        /* nothing to do, the file was already smaller than requested size */
        return;
    }

    /* Truncate the unused VM pages, and send smgr inval message */
    smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);

    /*
     * We might as well update the local smgr_vm_nblocks setting. smgrtruncate
     * sent an smgr cache inval message, which will cause other backends to
     * invalidate their copy of smgr_vm_nblocks, and this one too at the next
     * command boundary.  But this ensures it isn't outright wrong until then.
     */
    if (rel->rd_smgr)
        rel->rd_smgr->smgr_vm_nblocks = newnblocks;
}

/*
 * Read a visibility map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned; but if 'extend' is
 * true, the visibility map file is extended first so the page can be read.
 */
static Buffer
vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
{
    Buffer      buf;

    /*
     * We might not have opened the relation at the smgr level yet, or we
     * might have been forced to close it by a sinval message.  The code below
     * won't necessarily notice relation extension immediately when extend =
     * false, so we rely on sinval messages to ensure that our ideas about the
     * size of the map aren't too far out of date.
     */
    RelationOpenSmgr(rel);

    /*
     * If we haven't cached the size of the visibility map fork yet, check it
     * first.
     */
    if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber)
    {
        if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
            rel->rd_smgr->smgr_vm_nblocks = smgrnblocks(rel->rd_smgr,
                                                      VISIBILITYMAP_FORKNUM);
        else
            rel->rd_smgr->smgr_vm_nblocks = 0;
    }

    /* Handle requests beyond EOF */
    if (blkno >= rel->rd_smgr->smgr_vm_nblocks)
    {
        if (extend)
            vm_extend(rel, blkno + 1);
        else
            return InvalidBuffer;
    }

    /*
     * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
     * always safe to clear bits, so it's better to clear corrupt pages than
     * error out.
     */
    buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
                             RBM_ZERO_ON_ERROR, NULL);
    if (PageIsNew(BufferGetPage(buf)))
        PageInit(BufferGetPage(buf), BLCKSZ, 0);
    return buf;
}

/*
 * Ensure that the visibility map fork is at least vm_nblocks long, extending
 * it if necessary with zeroed pages.
 */
static void
vm_extend(Relation rel, BlockNumber vm_nblocks)
{
    BlockNumber vm_nblocks_now;
    Page        pg;

    pg = (Page) palloc(BLCKSZ);
    PageInit(pg, BLCKSZ, 0);

    /*
     * We use the relation extension lock to lock out other backends trying to
     * extend the visibility map at the same time. It also locks out extension
     * of the main fork, unnecessarily, but extending the visibility map
     * happens seldom enough that it doesn't seem worthwhile to have a
     * separate lock tag type for it.
     *
     * Note that another backend might have extended or created the relation
     * by the time we get the lock.
     */
    LockRelationForExtension(rel, ExclusiveLock);

    /* Might have to re-open if a cache flush happened */
    RelationOpenSmgr(rel);

    /*
     * Create the file first if it doesn't exist.  If smgr_vm_nblocks is
     * positive then it must exist, no need for an smgrexists call.
     */
    if ((rel->rd_smgr->smgr_vm_nblocks == 0 ||
         rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber) &&
        !smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
        smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);

    vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);

    /* Now extend the file */
    while (vm_nblocks_now < vm_nblocks)
    {
        PageSetChecksumInplace(pg, vm_nblocks_now);

        smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
                   (char *) pg, false);
        vm_nblocks_now++;
    }

    /*
     * Send a shared-inval message to force other backends to close any smgr
     * references they may have for this rel, which we are about to change.
     * This is a useful optimization because it means that backends don't have
     * to keep checking for creation or extension of the file, which happens
     * infrequently.
     */
    CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode);

    /* Update local cache with the up-to-date size */
    rel->rd_smgr->smgr_vm_nblocks = vm_nblocks_now;

    UnlockRelationForExtension(rel, ExclusiveLock);

    pfree(pg);
}