#include "postgres.h"#include <sys/file.h>#include <unistd.h>#include "catalog/catalog.h"#include "catalog/storage.h"#include "common/relpath.h"#include "executor/instrument.h"#include "miscadmin.h"#include "pg_trace.h"#include "pgstat.h"#include "postmaster/bgwriter.h"#include "storage/buf_internals.h"#include "storage/bufmgr.h"#include "storage/ipc.h"#include "storage/proc.h"#include "storage/smgr.h"#include "storage/standby.h"#include "utils/rel.h"#include "utils/resowner_private.h"#include "utils/timestamp.h"
#define BUF_REUSABLE 0x02
Definition at line 65 of file bufmgr.c.
Referenced by BgBufferSync().
#define BUF_WRITTEN 0x01
Definition at line 64 of file bufmgr.c.
Referenced by BgBufferSync(), and BufferSync().
#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
Definition at line 57 of file bufmgr.c.
Referenced by BufferAlloc(), and FlushBuffer().
#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
Definition at line 56 of file bufmgr.c.
Referenced by FlushBuffer(), and ReadBuffer_common().
#define LocalBufHdrGetBlock(bufHdr) LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
Definition at line 60 of file bufmgr.c.
Referenced by FlushRelationBuffers(), and ReadBuffer_common().
void AbortBufferIO(void)
Definition at line 3098 of file bufmgr.c.
References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_VALID, ereport, errcode(), errdetail(), errmsg(), sbufdesc::flags, buftag::forkNum, sbufdesc::io_in_progress_lock, IsForInput, LockBufHdr, LW_EXCLUSIVE, LWLockAcquire(), pfree(), relpathperm, buftag::rnode, sbufdesc::tag, TerminateBufferIO(), UnlockBufHdr, and WARNING.
Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().
{
volatile BufferDesc *buf = InProgressBuf;
if (buf)
{
/*
* Since LWLockReleaseAll has already been called, we're not holding
* the buffer's io_in_progress_lock. We have to re-acquire it so that
* we can use TerminateBufferIO. Anyone who's executing WaitIO on the
* buffer will be in a busy spin until we succeed in doing this.
*/
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
LockBufHdr(buf);
Assert(buf->flags & BM_IO_IN_PROGRESS);
if (IsForInput)
{
Assert(!(buf->flags & BM_DIRTY));
/* We'd better not think buffer is valid yet */
Assert(!(buf->flags & BM_VALID));
UnlockBufHdr(buf);
}
else
{
BufFlags sv_flags;
sv_flags = buf->flags;
Assert(sv_flags & BM_DIRTY);
UnlockBufHdr(buf);
/* Issue notice if this is not the first failure... */
if (sv_flags & BM_IO_ERROR)
{
/* Buffer is pinned, so we can read tag without spinlock */
char *path;
path = relpathperm(buf->tag.rnode, buf->tag.forkNum);
ereport(WARNING,
(errcode(ERRCODE_IO_ERROR),
errmsg("could not write block %u of %s",
buf->tag.blockNum, path),
errdetail("Multiple failures --- write error might be permanent.")));
pfree(path);
}
}
TerminateBufferIO(buf, false, BM_IO_ERROR);
}
}
void AtEOXact_Buffers(bool isCommit)
Definition at line 1708 of file bufmgr.c.
References Assert, assert_enabled, AtEOXact_LocalBuffers(), NBuffers, PrintBufferLeakWarning(), and PrivateRefCount.
Referenced by AbortTransaction(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().
{
#ifdef USE_ASSERT_CHECKING
if (assert_enabled)
{
int RefCountErrors = 0;
Buffer b;
for (b = 1; b <= NBuffers; b++)
{
if (PrivateRefCount[b - 1] != 0)
{
PrintBufferLeakWarning(b);
RefCountErrors++;
}
}
Assert(RefCountErrors == 0);
}
#endif
AtEOXact_LocalBuffers(isCommit);
}
static void AtProcExit_Buffers(int code, Datum arg) [static]
Definition at line 1751 of file bufmgr.c.
References AbortBufferIO(), Assert, assert_enabled, AtProcExit_LocalBuffers(), NBuffers, PrintBufferLeakWarning(), PrivateRefCount, and UnlockBuffers().
Referenced by InitBufferPoolBackend().
{
AbortBufferIO();
UnlockBuffers();
#ifdef USE_ASSERT_CHECKING
if (assert_enabled)
{
int RefCountErrors = 0;
Buffer b;
for (b = 1; b <= NBuffers; b++)
{
if (PrivateRefCount[b - 1] != 0)
{
PrintBufferLeakWarning(b);
RefCountErrors++;
}
}
Assert(RefCountErrors == 0);
}
#endif
/* localbuf.c needs a chance too */
AtProcExit_LocalBuffers();
}
bool BgBufferSync(void)
Definition at line 1350 of file bufmgr.c.
References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, BgWriterStats, BUF_REUSABLE, BUF_WRITTEN, CurrentResourceOwner, DEBUG1, DEBUG2, elog, PgStat_MsgBgWriter::m_buf_alloc, PgStat_MsgBgWriter::m_buf_written_clean, PgStat_MsgBgWriter::m_maxwritten_clean, NBuffers, ResourceOwnerEnlargeBuffers(), StrategySyncStart(), and SyncOneBuffer().
Referenced by BackgroundWriterMain().
{
/* info obtained from freelist.c */
int strategy_buf_id;
uint32 strategy_passes;
uint32 recent_alloc;
/*
* Information saved between calls so we can determine the strategy
* point's advance rate and avoid scanning already-cleaned buffers.
*/
static bool saved_info_valid = false;
static int prev_strategy_buf_id;
static uint32 prev_strategy_passes;
static int next_to_clean;
static uint32 next_passes;
/* Moving averages of allocation rate and clean-buffer density */
static float smoothed_alloc = 0;
static float smoothed_density = 10.0;
/* Potentially these could be tunables, but for now, not */
float smoothing_samples = 16;
float scan_whole_pool_milliseconds = 120000.0;
/* Used to compute how far we scan ahead */
long strategy_delta;
int bufs_to_lap;
int bufs_ahead;
float scans_per_alloc;
int reusable_buffers_est;
int upcoming_alloc_est;
int min_scan_buffers;
/* Variables for the scanning loop proper */
int num_to_scan;
int num_written;
int reusable_buffers;
/* Variables for final smoothed_density update */
long new_strategy_delta;
uint32 new_recent_alloc;
/*
* Find out where the freelist clock sweep currently is, and how many
* buffer allocations have happened since our last call.
*/
strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
/* Report buffer alloc counts to pgstat */
BgWriterStats.m_buf_alloc += recent_alloc;
/*
* If we're not running the LRU scan, just stop after doing the stats
* stuff. We mark the saved state invalid so that we can recover sanely
* if LRU scan is turned back on later.
*/
if (bgwriter_lru_maxpages <= 0)
{
saved_info_valid = false;
return true;
}
/*
* Compute strategy_delta = how many buffers have been scanned by the
* clock sweep since last time. If first time through, assume none. Then
* see if we are still ahead of the clock sweep, and if so, how many
* buffers we could scan before we'd catch up with it and "lap" it. Note:
* the weird-looking coding of the xxx_passes comparisons is to avoid bogus
* behavior when the passes counts wrap around.
*/
if (saved_info_valid)
{
int32 passes_delta = strategy_passes - prev_strategy_passes;
strategy_delta = strategy_buf_id - prev_strategy_buf_id;
strategy_delta += (long) passes_delta *NBuffers;
Assert(strategy_delta >= 0);
if ((int32) (next_passes - strategy_passes) > 0)
{
/* we're one pass ahead of the strategy point */
bufs_to_lap = strategy_buf_id - next_to_clean;
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
next_passes, next_to_clean,
strategy_passes, strategy_buf_id,
strategy_delta, bufs_to_lap);
#endif
}
else if (next_passes == strategy_passes &&
next_to_clean >= strategy_buf_id)
{
/* on same pass, but ahead or at least not behind */
bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
next_passes, next_to_clean,
strategy_passes, strategy_buf_id,
strategy_delta, bufs_to_lap);
#endif
}
else
{
/*
* We're behind, so skip forward to the strategy point and start
* cleaning from there.
*/
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
next_passes, next_to_clean,
strategy_passes, strategy_buf_id,
strategy_delta);
#endif
next_to_clean = strategy_buf_id;
next_passes = strategy_passes;
bufs_to_lap = NBuffers;
}
}
else
{
/*
* Initializing at startup or after LRU scanning had been off. Always
* start at the strategy point.
*/
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
strategy_passes, strategy_buf_id);
#endif
strategy_delta = 0;
next_to_clean = strategy_buf_id;
next_passes = strategy_passes;
bufs_to_lap = NBuffers;
}
/* Update saved info for next time */
prev_strategy_buf_id = strategy_buf_id;
prev_strategy_passes = strategy_passes;
saved_info_valid = true;
/*
* Compute how many buffers had to be scanned for each new allocation, ie,
* 1/density of reusable buffers, and track a moving average of that.
*
* If the strategy point didn't move, we don't update the density estimate
*/
if (strategy_delta > 0 && recent_alloc > 0)
{
scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
smoothed_density += (scans_per_alloc - smoothed_density) /
smoothing_samples;
}
/*
* Estimate how many reusable buffers there are between the current
* strategy point and where we've scanned ahead to, based on the smoothed
* density estimate.
*/
bufs_ahead = NBuffers - bufs_to_lap;
reusable_buffers_est = (float) bufs_ahead / smoothed_density;
/*
* Track a moving average of recent buffer allocations. Here, rather than
* a true average we want a fast-attack, slow-decline behavior: we
* immediately follow any increase.
*/
if (smoothed_alloc <= (float) recent_alloc)
smoothed_alloc = recent_alloc;
else
smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
smoothing_samples;
/* Scale the estimate by a GUC to allow more aggressive tuning. */
upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
/*
* If recent_alloc remains at zero for many cycles, smoothed_alloc will
* eventually underflow to zero, and the underflows produce annoying
* kernel warnings on some platforms. Once upcoming_alloc_est has gone to
* zero, there's no point in tracking smaller and smaller values of
* smoothed_alloc, so just reset it to exactly zero to avoid this
* syndrome. It will pop back up as soon as recent_alloc increases.
*/
if (upcoming_alloc_est == 0)
smoothed_alloc = 0;
/*
* Even in cases where there's been little or no buffer allocation
* activity, we want to make a small amount of progress through the buffer
* cache so that as many reusable buffers as possible are clean after an
* idle period.
*
* (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
* the BGW will be called during the scan_whole_pool time; slice the
* buffer pool into that many sections.
*/
min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
{
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
#endif
upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
}
/*
* Now write out dirty reusable buffers, working forward from the
* next_to_clean point, until we have lapped the strategy scan, or cleaned
* enough buffers to match our estimate of the next cycle's allocation
* requirements, or hit the bgwriter_lru_maxpages limit.
*/
/* Make sure we can handle the pin inside SyncOneBuffer */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
num_to_scan = bufs_to_lap;
num_written = 0;
reusable_buffers = reusable_buffers_est;
/* Execute the LRU scan */
while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
{
int buffer_state = SyncOneBuffer(next_to_clean, true);
if (++next_to_clean >= NBuffers)
{
next_to_clean = 0;
next_passes++;
}
num_to_scan--;
if (buffer_state & BUF_WRITTEN)
{
reusable_buffers++;
if (++num_written >= bgwriter_lru_maxpages)
{
BgWriterStats.m_maxwritten_clean++;
break;
}
}
else if (buffer_state & BUF_REUSABLE)
reusable_buffers++;
}
BgWriterStats.m_buf_written_clean += num_written;
#ifdef BGW_DEBUG
elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
smoothed_density, reusable_buffers_est, upcoming_alloc_est,
bufs_to_lap - num_to_scan,
num_written,
reusable_buffers - reusable_buffers_est);
#endif
/*
* Consider the above scan as being like a new allocation scan.
* Characterize its density and update the smoothed one based on it. This
* effectively halves the moving average period in cases where both the
* strategy and the background writer are doing some useful scanning,
* which is helpful because a long memory isn't as desirable on the
* density estimates.
*/
new_strategy_delta = bufs_to_lap - num_to_scan;
new_recent_alloc = reusable_buffers - reusable_buffers_est;
if (new_strategy_delta > 0 && new_recent_alloc > 0)
{
scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
smoothed_density += (scans_per_alloc - smoothed_density) /
smoothing_samples;
#ifdef BGW_DEBUG
elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
new_recent_alloc, new_strategy_delta,
scans_per_alloc, smoothed_density);
#endif
}
/* Return true if OK to hibernate */
return (bufs_to_lap == 0 && recent_alloc == 0);
}
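The interplay of the two moving averages above is easiest to see in isolation. Below is a minimal, standalone sketch of the same smoothing arithmetic BgBufferSync() applies to smoothed_alloc (fast-attack, slow-decline) and smoothed_density (plain exponential smoothing); the sample inputs and the main() wrapper are hypothetical and only illustrate the update rules.
#include <stdio.h>

int
main(void)
{
    const float smoothing_samples = 16;
    float       smoothed_alloc = 0;
    float       smoothed_density = 10.0;
    unsigned    recent_allocs[] = {120, 40, 0, 300, 10};    /* made-up samples */
    long        strategy_deltas[] = {400, 200, 0, 900, 80};

    for (int i = 0; i < 5; i++)
    {
        unsigned    recent_alloc = recent_allocs[i];
        long        strategy_delta = strategy_deltas[i];

        /* fast-attack, slow-decline estimate of the allocation rate */
        if (smoothed_alloc <= (float) recent_alloc)
            smoothed_alloc = recent_alloc;
        else
            smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
                smoothing_samples;

        /* buffers scanned per allocation (1/density), smoothed the usual way */
        if (strategy_delta > 0 && recent_alloc > 0)
        {
            float       scans_per_alloc = (float) strategy_delta / (float) recent_alloc;

            smoothed_density += (scans_per_alloc - smoothed_density) /
                smoothing_samples;
        }

        printf("cycle %d: smoothed_alloc=%.2f smoothed_density=%.2f\n",
               i, smoothed_alloc, smoothed_density);
    }
    return 0;
}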
static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr) [static]
Definition at line 532 of file bufmgr.c.
References Assert, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_JUST_DIRTIED, BM_TAG_VALID, BM_VALID, buf, sbufdesc::buf_id, BufferDescriptors, BufferGetLSN, BufFreelistLock, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), sbufdesc::content_lock, RelFileNode::dbNode, sbufdesc::flags, FlushBuffer(), INIT_BUFFERTAG, LockBufHdr, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockConditionalAcquire(), LWLockRelease(), RelFileNodeBackend::node, NULL, PinBuffer(), PinBuffer_Locked(), sbufdesc::refcount, RelFileNode::relNode, RELPERSISTENCE_PERMANENT, SMgrRelationData::smgr_rnode, RelFileNode::spcNode, StartBufferIO(), StrategyGetBuffer(), StrategyRejectBuffer(), sbufdesc::tag, UnlockBufHdr, UnpinBuffer(), sbufdesc::usage_count, and XLogNeedsFlush().
Referenced by ReadBuffer_common().
{
BufferTag newTag; /* identity of requested block */
uint32 newHash; /* hash value for newTag */
LWLockId newPartitionLock; /* buffer partition lock for it */
BufferTag oldTag; /* previous identity of selected buffer */
uint32 oldHash; /* hash value for oldTag */
LWLockId oldPartitionLock; /* buffer partition lock for it */
BufFlags oldFlags;
int buf_id;
volatile BufferDesc *buf;
bool valid;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
/* determine its hash code and partition lock ID */
newHash = BufTableHashCode(&newTag);
newPartitionLock = BufMappingPartitionLock(newHash);
/* see if the block is in the buffer pool already */
LWLockAcquire(newPartitionLock, LW_SHARED);
buf_id = BufTableLookup(&newTag, newHash);
if (buf_id >= 0)
{
/*
* Found it. Now, pin the buffer so no one can steal it from the
* buffer pool, and check to see if the correct data has been loaded
* into the buffer.
*/
buf = &BufferDescriptors[buf_id];
valid = PinBuffer(buf, strategy);
/* Can release the mapping lock as soon as we've pinned it */
LWLockRelease(newPartitionLock);
*foundPtr = TRUE;
if (!valid)
{
/*
* We can only get here if (a) someone else is still reading in
* the page, or (b) a previous read attempt failed. We have to
* wait for any active read attempt to finish, and then set up our
* own read attempt if the page is still not BM_VALID.
* StartBufferIO does it all.
*/
if (StartBufferIO(buf, true))
{
/*
* If we get here, previous attempts to read the buffer must
* have failed ... but we shall bravely try again.
*/
*foundPtr = FALSE;
}
}
return buf;
}
/*
* Didn't find it in the buffer pool. We'll have to initialize a new
* buffer. Remember to unlock the mapping lock while doing the work.
*/
LWLockRelease(newPartitionLock);
/* Loop here in case we have to try another victim buffer */
for (;;)
{
bool lock_held;
/*
* Select a victim buffer. The buffer is returned with its header
* spinlock still held! Also (in most cases) the BufFreelistLock is
* still held, since it would be bad to hold the spinlock while
* possibly waking up other processes.
*/
buf = StrategyGetBuffer(strategy, &lock_held);
Assert(buf->refcount == 0);
/* Must copy buffer flags while we still hold the spinlock */
oldFlags = buf->flags;
/* Pin the buffer and then release the buffer spinlock */
PinBuffer_Locked(buf);
/* Now it's safe to release the freelist lock */
if (lock_held)
LWLockRelease(BufFreelistLock);
/*
* If the buffer was dirty, try to write it out. There is a race
* condition here, in that someone might dirty it after we released it
* above, or even while we are writing it out (since our share-lock
* won't prevent hint-bit updates). We will recheck the dirty bit
* after re-locking the buffer header.
*/
if (oldFlags & BM_DIRTY)
{
/*
* We need a share-lock on the buffer contents to write it out
* (else we might write invalid data, eg because someone else is
* compacting the page contents while we write). We must use a
* conditional lock acquisition here to avoid deadlock. Even
* though the buffer was not pinned (and therefore surely not
* locked) when StrategyGetBuffer returned it, someone else could
* have pinned and exclusive-locked it by the time we get here. If
* we try to get the lock unconditionally, we'd block waiting for
* them; if they later block waiting for us, deadlock ensues.
* (This has been observed to happen when two backends are both
* trying to split btree index pages, and the second one just
* happens to be trying to split the page the first one got from
* StrategyGetBuffer.)
*/
if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
{
/*
* If using a nondefault strategy, and writing the buffer
* would require a WAL flush, let the strategy decide whether
* to go ahead and write/reuse the buffer or to choose another
* victim. We need lock to inspect the page LSN, so this
* can't be done inside StrategyGetBuffer.
*/
if (strategy != NULL)
{
XLogRecPtr lsn;
/* Read the LSN while holding buffer header lock */
LockBufHdr(buf);
lsn = BufferGetLSN(buf);
UnlockBufHdr(buf);
if (XLogNeedsFlush(lsn) &&
StrategyRejectBuffer(strategy, buf))
{
/* Drop lock/pin and loop around for another buffer */
LWLockRelease(buf->content_lock);
UnpinBuffer(buf, true);
continue;
}
}
/* OK, do the I/O */
TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode);
FlushBuffer(buf, NULL);
LWLockRelease(buf->content_lock);
TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode);
}
else
{
/*
* Someone else has locked the buffer, so give it up and loop
* back to get another one.
*/
UnpinBuffer(buf, true);
continue;
}
}
/*
* To change the association of a valid buffer, we'll need to have
* exclusive lock on both the old and new mapping partitions.
*/
if (oldFlags & BM_TAG_VALID)
{
/*
* Need to compute the old tag's hashcode and partition lock ID.
* XXX is it worth storing the hashcode in BufferDesc so we need
* not recompute it here? Probably not.
*/
oldTag = buf->tag;
oldHash = BufTableHashCode(&oldTag);
oldPartitionLock = BufMappingPartitionLock(oldHash);
/*
* Must lock the lower-numbered partition first to avoid
* deadlocks.
*/
if (oldPartitionLock < newPartitionLock)
{
LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
}
else if (oldPartitionLock > newPartitionLock)
{
LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
}
else
{
/* only one partition, only one lock */
LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
}
}
else
{
/* if it wasn't valid, we need only the new partition */
LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
/* these just keep the compiler quiet about uninit variables */
oldHash = 0;
oldPartitionLock = 0;
}
/*
* Try to make a hashtable entry for the buffer under its new tag.
* This could fail because while we were writing someone else
* allocated another buffer for the same block we want to read in.
* Note that we have not yet removed the hashtable entry for the old
* tag.
*/
buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
if (buf_id >= 0)
{
/*
* Got a collision. Someone has already done what we were about to
* do. We'll just handle this as if it were found in the buffer
* pool in the first place. First, give up the buffer we were
* planning to use.
*/
UnpinBuffer(buf, true);
/* Can give up that buffer's mapping partition lock now */
if ((oldFlags & BM_TAG_VALID) &&
oldPartitionLock != newPartitionLock)
LWLockRelease(oldPartitionLock);
/* remaining code should match code at top of routine */
buf = &BufferDescriptors[buf_id];
valid = PinBuffer(buf, strategy);
/* Can release the mapping lock as soon as we've pinned it */
LWLockRelease(newPartitionLock);
*foundPtr = TRUE;
if (!valid)
{
/*
* We can only get here if (a) someone else is still reading
* in the page, or (b) a previous read attempt failed. We
* have to wait for any active read attempt to finish, and
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
if (StartBufferIO(buf, true))
{
/*
* If we get here, previous attempts to read the buffer
* must have failed ... but we shall bravely try again.
*/
*foundPtr = FALSE;
}
}
return buf;
}
/*
* Need to lock the buffer header too in order to change its tag.
*/
LockBufHdr(buf);
/*
* Somebody could have pinned or re-dirtied the buffer while we were
* doing the I/O and making the new hashtable entry. If so, we can't
* recycle this buffer; we must undo everything we've done and start
* over with a new victim buffer.
*/
oldFlags = buf->flags;
if (buf->refcount == 1 && !(oldFlags & BM_DIRTY))
break;
UnlockBufHdr(buf);
BufTableDelete(&newTag, newHash);
if ((oldFlags & BM_TAG_VALID) &&
oldPartitionLock != newPartitionLock)
LWLockRelease(oldPartitionLock);
LWLockRelease(newPartitionLock);
UnpinBuffer(buf, true);
}
/*
* Okay, it's finally safe to rename the buffer.
*
* Clearing BM_VALID here is necessary, clearing the dirtybits is just
* paranoia. We also reset the usage_count since any recency of use of
* the old content is no longer relevant. (The usage_count starts out at
* 1 so that the buffer can survive one clock-sweep pass.)
*/
buf->tag = newTag;
buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT);
if (relpersistence == RELPERSISTENCE_PERMANENT)
buf->flags |= BM_TAG_VALID | BM_PERMANENT;
else
buf->flags |= BM_TAG_VALID;
buf->usage_count = 1;
UnlockBufHdr(buf);
if (oldFlags & BM_TAG_VALID)
{
BufTableDelete(&oldTag, oldHash);
if (oldPartitionLock != newPartitionLock)
LWLockRelease(oldPartitionLock);
}
LWLockRelease(newPartitionLock);
/*
* Buffer contents are currently invalid. Try to get the io_in_progress
* lock. If StartBufferIO returns false, then someone else managed to
* read it before we did, so there's nothing left for BufferAlloc() to do.
*/
if (StartBufferIO(buf, true))
*foundPtr = FALSE;
else
*foundPtr = TRUE;
return buf;
}
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition at line 1854 of file bufmgr.c.
References Assert, buftag::blockNum, BufferDescriptors, BufferIsLocal, BufferIsPinned, LocalBufferDescriptors, and sbufdesc::tag.
Referenced by _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_endpoint(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_newroot(), _bt_pagedel(), _bt_search(), _bt_split(), _bt_steppage(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_step(), allocNewBuffer(), btvacuumpage(), CheckForSerializableConflictIn(), createPostingTree(), dataPlaceToPage(), dataSplitPage(), doPickSplit(), entryPlaceToPage(), entrySplitPage(), fill_seq_with_data(), ginDataFillRoot(), ginFindParents(), ginInsertValue(), ginPageGetLinkItup(), ginRedoSplit(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistformdownlink(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistXLogSplit(), gistXLogUpdate(), heap_delete(), heap_hot_search_buffer(), heap_multi_insert(), heap_page_is_all_visible(), heap_prune_chain(), heap_update(), index_getnext(), log_heap_clean(), log_heap_freeze(), log_heap_visible(), makeSublist(), moveLeafs(), ReadBufferBI(), RelationGetBufferForTuple(), RelationPutHeapTuple(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), visibilitymap_test(), writeListPage(), XLogReadBufferExtended(), and xlogVacuumPage().
{
volatile BufferDesc *bufHdr;
Assert(BufferIsPinned(buffer));
if (BufferIsLocal(buffer))
bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
else
bufHdr = &BufferDescriptors[buffer - 1];
/* pinned, so OK to read tag without spinlock */
return bufHdr->tag.blockNum;
}
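As a usage illustration only: the buffer must already be pinned when BufferGetBlockNumber() is called, which is why callers such as ReadBufferBI() and RelationGetBufferForTuple() ask right after pinning. In the sketch below, rel and blkno are assumed to be supplied by the caller.
    Buffer      buf = ReadBuffer(rel, blkno);   /* pins the buffer */
    BlockNumber check = BufferGetBlockNumber(buf);

    Assert(check == blkno);
    /* ... use the buffer ... */
    ReleaseBuffer(buf);         /* drop the pin when done */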
XLogRecPtr BufferGetLSNAtomic(Buffer buffer)
Definition at line 2076 of file bufmgr.c.
References Assert, BufferDescriptors, BufferGetPage, BufferIsLocal, BufferIsPinned, BufferIsValid, DataChecksumsEnabled(), LockBufHdr, PageGetLSN, and UnlockBufHdr.
Referenced by gistScanPage(), and XLogCheckBuffer().
{
volatile BufferDesc *bufHdr = &BufferDescriptors[buffer - 1];
char *page = BufferGetPage(buffer);
XLogRecPtr lsn;
/*
* If we don't need locking for correctness, fastpath out.
*/
if (!DataChecksumsEnabled() || BufferIsLocal(buffer))
return PageGetLSN(page);
/* Make sure we've got a real buffer, and that we hold a pin on it. */
Assert(BufferIsValid(buffer));
Assert(BufferIsPinned(buffer));
LockBufHdr(bufHdr);
lsn = PageGetLSN(page);
UnlockBufHdr(bufHdr);
return lsn;
}
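A hedged sketch of the kind of check a caller such as XLogCheckBuffer() makes with the result; buf is assumed to be a pinned shared buffer supplied elsewhere.
    XLogRecPtr  lsn = BufferGetLSNAtomic(buf);

    if (XLogNeedsFlush(lsn))
    {
        /* writing this page out now would first force a WAL flush up to lsn */
    }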
void BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum)
Definition at line 1875 of file bufmgr.c.
References Assert, buftag::blockNum, BufferDescriptors, BufferIsLocal, BufferIsPinned, buftag::forkNum, LocalBufferDescriptors, buftag::rnode, and sbufdesc::tag.
Referenced by fsm_search_avail(), log_newpage_buffer(), and XLogCheckBuffer().
{
volatile BufferDesc *bufHdr;
/* Do the same checks as BufferGetBlockNumber. */
Assert(BufferIsPinned(buffer));
if (BufferIsLocal(buffer))
bufHdr = &(LocalBufferDescriptors[-buffer - 1]);
else
bufHdr = &BufferDescriptors[buffer - 1];
/* pinned, so OK to read tag without spinlock */
*rnode = bufHdr->tag.rnode;
*forknum = bufHdr->tag.forkNum;
*blknum = bufHdr->tag.blockNum;
}
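For illustration, a sketch of pulling a pinned buffer's identity apart, as a WAL-record builder such as XLogCheckBuffer() does; buf is assumed to be a valid, pinned Buffer.
    RelFileNode rnode;
    ForkNumber  forknum;
    BlockNumber blkno;

    BufferGetTag(buf, &rnode, &forknum, &blkno);
    elog(DEBUG1, "buffer %d holds block %u of fork %d of relation %u/%u/%u",
         buf, blkno, (int) forknum,
         rnode.spcNode, rnode.dbNode, rnode.relNode);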
bool BufferIsPermanent(Buffer buffer)
Definition at line 2046 of file bufmgr.c.
References Assert, BM_PERMANENT, BufferDescriptors, BufferIsLocal, BufferIsPinned, BufferIsValid, and sbufdesc::flags.
Referenced by SetHintBits().
{
volatile BufferDesc *bufHdr;
/* Local buffers are used only for temp relations. */
if (BufferIsLocal(buffer))
return false;
/* Make sure we've got a real buffer, and that we hold a pin on it. */
Assert(BufferIsValid(buffer));
Assert(BufferIsPinned(buffer));
/*
* BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
* need not bother with the buffer header spinlock. Even if someone else
* changes the buffer header flags while we're doing this, we assume that
* changing an aligned 2-byte BufFlags value is atomic, so we'll read the
* old value or the new value, but not random garbage.
*/
bufHdr = &BufferDescriptors[buffer - 1];
return (bufHdr->flags & BM_PERMANENT) != 0;
}
static void BufferSync(int flags) [static]
Definition at line 1210 of file bufmgr.c.
References BgWriterStats, BM_CHECKPOINT_NEEDED, BUF_WRITTEN, BufferDescriptors, CHECKPOINT_END_OF_RECOVERY, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CurrentResourceOwner, sbufdesc::flags, LockBufHdr, PgStat_MsgBgWriter::m_buf_written_checkpoints, NBuffers, NULL, ResourceOwnerEnlargeBuffers(), StrategySyncStart(), SyncOneBuffer(), and UnlockBufHdr.
Referenced by CheckPointBuffers().
{
int buf_id;
int num_to_scan;
int num_to_write;
int num_written;
int mask = BM_DIRTY;
/* Make sure we can handle the pin inside SyncOneBuffer */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
/*
* Unless this is a shutdown checkpoint, we write only permanent, dirty
* buffers. But at shutdown or end of recovery, we write all dirty buffers.
*/
if (!((flags & CHECKPOINT_IS_SHUTDOWN) || (flags & CHECKPOINT_END_OF_RECOVERY)))
mask |= BM_PERMANENT;
/*
* Loop over all buffers, and mark the ones that need to be written with
* BM_CHECKPOINT_NEEDED. Count them as we go (num_to_write), so that we
* can estimate how much work needs to be done.
*
* This allows us to write only those pages that were dirty when the
* checkpoint began, and not those that get dirtied while it proceeds.
* Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
* later in this function, or by normal backends or the bgwriter cleaning
* scan, the flag is cleared. Any buffer dirtied after this point won't
* have the flag set.
*
* Note that if we fail to write some buffer, we may leave buffers with
* BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
* certainly need to be written for the next checkpoint attempt, too.
*/
num_to_write = 0;
for (buf_id = 0; buf_id < NBuffers; buf_id++)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
/*
* Header spinlock is enough to examine BM_DIRTY, see comment in
* SyncOneBuffer.
*/
LockBufHdr(bufHdr);
if ((bufHdr->flags & mask) == mask)
{
bufHdr->flags |= BM_CHECKPOINT_NEEDED;
num_to_write++;
}
UnlockBufHdr(bufHdr);
}
if (num_to_write == 0)
return; /* nothing to do */
TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
/*
* Loop over all buffers again, and write the ones (still) marked with
* BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep point
* since we might as well dump soon-to-be-recycled buffers first.
*
* Note that we don't read the buffer alloc count here --- that should be
* left untouched till the next BgBufferSync() call.
*/
buf_id = StrategySyncStart(NULL, NULL);
num_to_scan = NBuffers;
num_written = 0;
while (num_to_scan-- > 0)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
/*
* We don't need to acquire the lock here, because we're only looking
* at a single bit. It's possible that someone else writes the buffer
* and clears the flag right after we check, but that doesn't matter
* since SyncOneBuffer will then do nothing. However, there is a
* further race condition: it's conceivable that between the time we
* examine the bit here and the time SyncOneBuffer acquires lock,
* someone else not only wrote the buffer but replaced it with another
* page and dirtied it. In that improbable case, SyncOneBuffer will
* write the buffer though we didn't need to. It doesn't seem worth
* guarding against this, though.
*/
if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
{
if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
{
TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
BgWriterStats.m_buf_written_checkpoints++;
num_written++;
/*
* We know there are at most num_to_write buffers with
* BM_CHECKPOINT_NEEDED set; so we can stop scanning if
* num_written reaches num_to_write.
*
* Note that num_written doesn't include buffers written by
* other backends, or by the bgwriter cleaning scan. That
* means that the estimate of how much progress we've made is
* conservative, and also that this test will often fail to
* trigger. But it seems worth making anyway.
*/
if (num_written >= num_to_write)
break;
/*
* Sleep to throttle our I/O rate.
*/
CheckpointWriteDelay(flags, (double) num_written / num_to_write);
}
}
if (++buf_id >= NBuffers)
buf_id = 0;
}
/*
* Update checkpoint statistics. As noted above, this doesn't include
* buffers written by other backends or bgwriter scan.
*/
CheckpointStats.ckpt_bufs_written += num_written;
TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_write);
}
void BufmgrCommit(void)
Definition at line 1840 of file bufmgr.c.
Referenced by PrepareTransaction(), and RecordTransactionCommit().
{
/* Nothing to do in bufmgr anymore... */
}
void CheckPointBuffers(int flags)
Definition at line 1823 of file bufmgr.c.
References BufferSync(), CheckpointStats, CheckpointStatsData::ckpt_sync_end_t, CheckpointStatsData::ckpt_sync_t, CheckpointStatsData::ckpt_write_t, GetCurrentTimestamp(), and smgrsync().
Referenced by CheckPointGuts().
{
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
BufferSync(flags);
CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
smgrsync();
CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
}
bool ConditionalLockBuffer(Buffer buffer)
Definition at line 2772 of file bufmgr.c.
References Assert, BufferDescriptors, BufferIsLocal, BufferIsValid, sbufdesc::content_lock, LW_EXCLUSIVE, and LWLockConditionalAcquire().
Referenced by _bt_getbuf(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().
{
volatile BufferDesc *buf;
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer))
return true; /* act as though we got it */
buf = &(BufferDescriptors[buffer - 1]);
return LWLockConditionalAcquire(buf->content_lock, LW_EXCLUSIVE);
}
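The point of the conditional form is to let callers such as _bt_getbuf() avoid blocking; a sketch of that try-lock pattern, with buf assumed to be already pinned by the caller:
    if (ConditionalLockBuffer(buf))
    {
        /* got the exclusive content lock without waiting; work on the page */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    }
    else
    {
        /* lock not immediately available; skip this buffer rather than wait */
    }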
bool ConditionalLockBufferForCleanup(Buffer buffer)
Definition at line 2900 of file bufmgr.c.
References Assert, BUFFER_LOCK_UNLOCK, BufferDescriptors, BufferIsLocal, BufferIsValid, ConditionalLockBuffer(), LocalRefCount, LockBuffer(), LockBufHdr, PrivateRefCount, sbufdesc::refcount, and UnlockBufHdr.
Referenced by heap_page_prune_opt(), lazy_scan_heap(), and lazy_vacuum_heap().
{
volatile BufferDesc *bufHdr;
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer))
{
/* There should be exactly one pin */
Assert(LocalRefCount[-buffer - 1] > 0);
if (LocalRefCount[-buffer - 1] != 1)
return false;
/* Nobody else to wait for */
return true;
}
/* There should be exactly one local pin */
Assert(PrivateRefCount[buffer - 1] > 0);
if (PrivateRefCount[buffer - 1] != 1)
return false;
/* Try to acquire lock */
if (!ConditionalLockBuffer(buffer))
return false;
bufHdr = &BufferDescriptors[buffer - 1];
LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0);
if (bufHdr->refcount == 1)
{
/* Successfully acquired exclusive lock with pincount 1 */
UnlockBufHdr(bufHdr);
return true;
}
/* Failed, so release the lock */
UnlockBufHdr(bufHdr);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
return false;
}
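A sketch of the opportunistic pattern used by the lazy-vacuum callers listed above: take the cleanup lock only if it is free, otherwise skip the page. buf is assumed to be pinned exactly once by this backend.
    if (ConditionalLockBufferForCleanup(buf))
    {
        /* exclusive lock with pin count 1: safe to prune/defragment the page */
        UnlockReleaseBuffer(buf);
    }
    else
    {
        /* someone else holds a pin or the lock; leave the page for later */
        ReleaseBuffer(buf);
    }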
void DropDatabaseBuffers(Oid dbid)
Definition at line 2284 of file bufmgr.c.
References BufferDescriptors, RelFileNode::dbNode, i, InvalidateBuffer(), LockBufHdr, NBuffers, buftag::rnode, sbufdesc::tag, and UnlockBufHdr.
Referenced by dbase_redo(), and dropdb().
{
int i;
/*
* We needn't consider local buffers, since by assumption the target
* database isn't our own.
*/
for (i = 0; i < NBuffers; i++)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
if (bufHdr->tag.rnode.dbNode != dbid)
continue;
LockBufHdr(bufHdr);
if (bufHdr->tag.rnode.dbNode == dbid)
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
}
void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition at line 2126 of file bufmgr.c.
References RelFileNodeBackend::backend, buftag::blockNum, BufferDescriptors, DropRelFileNodeLocalBuffers(), buftag::forkNum, i, InvalidateBuffer(), LockBufHdr, MyBackendId, NBuffers, RelFileNodeBackend::node, RelFileNodeBackendIsTemp, RelFileNodeEquals, buftag::rnode, sbufdesc::tag, and UnlockBufHdr.
Referenced by smgrdounlinkfork(), and smgrtruncate().
{
int i;
/* If it's a local relation, it's localbuf.c's problem. */
if (RelFileNodeBackendIsTemp(rnode))
{
if (rnode.backend == MyBackendId)
DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock);
return;
}
for (i = 0; i < NBuffers; i++)
{
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
/*
* We can make this a tad faster by prechecking the buffer tag before
* we attempt to lock the buffer; this saves a lot of lock
* acquisitions in typical cases. It should be safe because the
* caller must have AccessExclusiveLock on the relation, or some other
* reason to be certain that no one is loading new pages of the rel
* into the buffer pool. (Otherwise we might well miss such pages
* entirely.) Therefore, while the tag might be changing while we
* look at it, it can't be changing *to* a value we care about, only
* *away* from such a value. So false negatives are impossible, and
* false positives are safe because we'll recheck after getting the
* buffer lock.
*
* We could check forkNum and blockNum as well as the rnode, but the
* incremental win from doing so seems small.
*/
if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
continue;
LockBufHdr(bufHdr);
if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
bufHdr->tag.forkNum == forkNum &&
bufHdr->tag.blockNum >= firstDelBlock)
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
}
void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
Definition at line 2182 of file bufmgr.c.
References BufferDescriptors, DropRelFileNodeAllLocalBuffers(), i, InvalidateBuffer(), LockBufHdr, MyBackendId, NBuffers, RelFileNodeBackend::node, NULL, palloc(), pfree(), pg_qsort(), RelFileNodeBackendIsTemp, RelFileNodeEquals, buftag::rnode, rnode_comparator(), sbufdesc::tag, and UnlockBufHdr.
Referenced by smgrdounlink(), and smgrdounlinkall().
{
int i,
n = 0;
RelFileNode *nodes;
bool use_bsearch;
if (nnodes == 0)
return;
nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
/* If it's a local relation, it's localbuf.c's problem. */
for (i = 0; i < nnodes; i++)
{
if (RelFileNodeBackendIsTemp(rnodes[i]))
{
if (rnodes[i].backend == MyBackendId)
DropRelFileNodeAllLocalBuffers(rnodes[i].node);
}
else
nodes[n++] = rnodes[i].node;
}
/*
* If there are no non-local relations, then we're done. Release the memory
* and return.
*/
if (n == 0)
{
pfree(nodes);
return;
}
/*
* For a low number of relations to drop, just use a simple walk-through to
* save the bsearch overhead. The threshold to use is rather a guess than an
* exactly determined value, as it depends on many factors (CPU and RAM
* speeds, amount of shared buffers etc.).
*/
use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
/* sort the list of rnodes if necessary */
if (use_bsearch)
pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
for (i = 0; i < NBuffers; i++)
{
RelFileNode *rnode = NULL;
volatile BufferDesc *bufHdr = &BufferDescriptors[i];
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
if (!use_bsearch)
{
int j;
for (j = 0; j < n; j++)
{
if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
{
rnode = &nodes[j];
break;
}
}
}
else
{
rnode = bsearch((const void *) &(bufHdr->tag.rnode),
nodes, n, sizeof(RelFileNode),
rnode_comparator);
}
/* buffer doesn't belong to any of the given relfilenodes; skip it */
if (rnode == NULL)
continue;
LockBufHdr(bufHdr);
if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
InvalidateBuffer(bufHdr); /* releases spinlock */
else
UnlockBufHdr(bufHdr);
}
pfree(nodes);
}
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln) [static]
Definition at line 1914 of file bufmgr.c.
References ErrorContextCallback::arg, BufferUsage::blk_write_time, buftag::blockNum, BM_PERMANENT, BufferGetLSN, BufHdrGetBlock, ErrorContextCallback::callback, RelFileNode::dbNode, error_context_stack, sbufdesc::flags, buftag::forkNum, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, InvalidBackendId, LockBufHdr, RelFileNodeBackend::node, NULL, PageSetChecksumCopy(), pgBufferUsage, pgstat_count_buffer_write_time, ErrorContextCallback::previous, RelFileNode::relNode, buftag::rnode, BufferUsage::shared_blks_written, SMgrRelationData::smgr_rnode, smgropen(), smgrwrite(), RelFileNode::spcNode, StartBufferIO(), sbufdesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr, and XLogFlush().
Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), and SyncOneBuffer().
{
XLogRecPtr recptr;
ErrorContextCallback errcallback;
instr_time io_start,
io_time;
Block bufBlock;
char *bufToWrite;
/*
* Acquire the buffer's io_in_progress lock. If StartBufferIO returns
* false, then someone else flushed the buffer before we could, so we need
* not do anything.
*/
if (!StartBufferIO(buf, false))
return;
/* Setup error traceback support for ereport() */
errcallback.callback = shared_buffer_write_error_callback;
errcallback.arg = (void *) buf;
errcallback.previous = error_context_stack;
error_context_stack = &errcallback;
/* Find smgr relation for buffer */
if (reln == NULL)
reln = smgropen(buf->tag.rnode, InvalidBackendId);
TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum,
buf->tag.blockNum,
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode);
LockBufHdr(buf);
/*
* Run PageGetLSN while holding header lock, since we don't have the
* buffer locked exclusively in all cases.
*/
recptr = BufferGetLSN(buf);
/* To check if block content changes while flushing. - vadim 01/17/97 */
buf->flags &= ~BM_JUST_DIRTIED;
UnlockBufHdr(buf);
/*
* Force XLOG flush up to buffer's LSN. This implements the basic WAL
* rule that log updates must hit disk before any of the data-file changes
* they describe do.
*
* However, this rule does not apply to unlogged relations, which will be
* lost after a crash anyway. Most unlogged relation pages do not bear
* LSNs since we never emit WAL records for them, and therefore flushing
* up through the buffer LSN would be useless, but harmless. However, GiST
* indexes use LSNs internally to track page-splits, and therefore unlogged
* GiST pages bear "fake" LSNs generated by GetFakeLSNForUnloggedRel. It
* is unlikely but possible that the fake LSN counter could advance past
* the WAL insertion point; and if it did happen, attempting to flush WAL
* through that location would fail, with disastrous system-wide
* consequences. To make sure that can't happen, skip the flush if the
* buffer isn't permanent.
*/
if (buf->flags & BM_PERMANENT)
XLogFlush(recptr);
/*
* Now it's safe to write buffer to disk. Note that no one else should
* have been able to write it while we were busy with log flushing because
* we have the io_in_progress lock.
*/
bufBlock = BufHdrGetBlock(buf);
bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
if (track_io_timing)
INSTR_TIME_SET_CURRENT(io_start);
/*
* bufToWrite is either the shared buffer or a copy, as appropriate.
*/
smgrwrite(reln,
buf->tag.forkNum,
buf->tag.blockNum,
bufToWrite,
false);
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
INSTR_TIME_SUBTRACT(io_time, io_start);
pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
INSTR_TIME_ADD(pgBufferUsage.blk_write_time, io_time);
}
pgBufferUsage.shared_blks_written++;
/*
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
* end the io_in_progress state.
*/
TerminateBufferIO(buf, true, 0);
TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(buf->tag.forkNum,
buf->tag.blockNum,
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode);
/* Pop the error context stack */
error_context_stack = errcallback.previous;
}
void FlushDatabaseBuffers(Oid dbid)
Definition at line 2474 of file bufmgr.c.
References BM_DIRTY, BM_VALID, BufferDescriptors, sbufdesc::content_lock, CurrentResourceOwner, RelFileNode::dbNode, sbufdesc::flags, FlushBuffer(), LockBufHdr, LW_SHARED, LWLockAcquire(), LWLockRelease(), NULL, PinBuffer_Locked(), ResourceOwnerEnlargeBuffers(), buftag::rnode, sbufdesc::tag, UnlockBufHdr, and UnpinBuffer().
Referenced by dbase_redo().
{
int i;
volatile BufferDesc *bufHdr;
/* Make sure we can handle the pin inside the loop */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
for (i = 0; i < NBuffers; i++)
{
bufHdr = &BufferDescriptors[i];
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
if (bufHdr->tag.rnode.dbNode != dbid)
continue;
LockBufHdr(bufHdr);
if (bufHdr->tag.rnode.dbNode == dbid &&
(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(bufHdr->content_lock, LW_SHARED);
FlushBuffer(bufHdr, NULL);
LWLockRelease(bufHdr->content_lock);
UnpinBuffer(bufHdr, true);
}
else
UnlockBufHdr(bufHdr);
}
}
void FlushRelationBuffers(Relation rel)
Definition at line 2384 of file bufmgr.c.
References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_VALID, BufferDescriptors, ErrorContextCallback::callback, sbufdesc::content_lock, CurrentResourceOwner, error_context_stack, sbufdesc::flags, FlushBuffer(), buftag::forkNum, LocalBufferDescriptors, LocalBufHdrGetBlock, LockBufHdr, LW_SHARED, LWLockAcquire(), LWLockRelease(), NLocBuffer, PageSetChecksumInplace(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_node, RelationData::rd_smgr, RelationOpenSmgr, RelationUsesLocalBuffers, RelFileNodeEquals, ResourceOwnerEnlargeBuffers(), buftag::rnode, smgrwrite(), sbufdesc::tag, UnlockBufHdr, and UnpinBuffer().
Referenced by ATExecSetTableSpace(), and heap_sync().
{
int i;
volatile BufferDesc *bufHdr;
/* Open rel at the smgr level if not already done */
RelationOpenSmgr(rel);
if (RelationUsesLocalBuffers(rel))
{
for (i = 0; i < NLocBuffer; i++)
{
bufHdr = &LocalBufferDescriptors[i];
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
ErrorContextCallback errcallback;
Page localpage;
localpage = (char *) LocalBufHdrGetBlock(bufHdr);
/* Setup error traceback support for ereport() */
errcallback.callback = local_buffer_write_error_callback;
errcallback.arg = (void *) bufHdr;
errcallback.previous = error_context_stack;
error_context_stack = &errcallback;
PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
smgrwrite(rel->rd_smgr,
bufHdr->tag.forkNum,
bufHdr->tag.blockNum,
localpage,
false);
bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
/* Pop the error context stack */
error_context_stack = errcallback.previous;
}
}
return;
}
/* Make sure we can handle the pin inside the loop */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
for (i = 0; i < NBuffers; i++)
{
bufHdr = &BufferDescriptors[i];
/*
* As in DropRelFileNodeBuffers, an unlocked precheck should be safe
* and saves some cycles.
*/
if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node))
continue;
LockBufHdr(bufHdr);
if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) &&
(bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_DIRTY))
{
PinBuffer_Locked(bufHdr);
LWLockAcquire(bufHdr->content_lock, LW_SHARED);
FlushBuffer(bufHdr, rel->rd_smgr);
LWLockRelease(bufHdr->content_lock);
UnpinBuffer(bufHdr, true);
}
else
UnlockBufHdr(bufHdr);
}
}
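A sketch of the heap_sync()-style call sequence: flush the relation's dirty buffers to the kernel, then force them to stable storage. rel is assumed to be an open Relation supplied by the caller.
    FlushRelationBuffers(rel);  /* writes dirty pages, but does not fsync them */
    smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);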
bool HoldingBufferPinThatDelaysRecovery(void)
Definition at line 2874 of file bufmgr.c.
References GetStartupBufferPinWaitBufId(), and PrivateRefCount.
Referenced by CheckRecoveryConflictDeadlock(), and RecoveryConflictInterrupt().
{
int bufid = GetStartupBufferPinWaitBufId();
/*
* If we get woken slowly then it's possible that the Startup process was
* already woken by other backends before we got here. Also possible that
* we get here by multiple interrupts or interrupts at inappropriate
* times, so make sure we do nothing if the bufid is not set.
*/
if (bufid < 0)
return false;
if (PrivateRefCount[bufid] > 0)
return true;
return false;
}
void IncrBufferRefCount(Buffer buffer)
Definition at line 2559 of file bufmgr.c.
References Assert, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, LocalRefCount, PrivateRefCount, ResourceOwnerEnlargeBuffers(), and ResourceOwnerRememberBuffer().
Referenced by _bt_steppage(), btrestrpos(), ExecStoreTuple(), ReadBufferBI(), scanPostingTree(), and startScanEntry().
{
Assert(BufferIsPinned(buffer));
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
if (BufferIsLocal(buffer))
LocalRefCount[-buffer - 1]++;
else
PrivateRefCount[buffer - 1]++;
}
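For illustration: the extra reference must eventually be matched by its own ReleaseBuffer(), just like a pin taken with ReadBuffer(). buf is assumed to be a buffer this backend already has pinned.
    IncrBufferRefCount(buf);    /* now holding two local pins on buf */
    /* ... hand buf to code that will call ReleaseBuffer(buf) once ... */
    ReleaseBuffer(buf);         /* pair off our own extra pin */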
void InitBufferPoolBackend(void)
Definition at line 1741 of file bufmgr.c.
References AtProcExit_Buffers(), and on_shmem_exit().
Referenced by AuxiliaryProcessMain(), and InitPostgres().
{
on_shmem_exit(AtProcExit_Buffers, 0);
}
static void InvalidateBuffer(volatile BufferDesc *buf) [static]
Definition at line 887 of file bufmgr.c.
References BM_TAG_VALID, sbufdesc::buf_id, BUFFERTAGS_EQUAL, BufMappingPartitionLock, BufTableDelete(), BufTableHashCode(), CLEAR_BUFFERTAG, elog, ERROR, sbufdesc::flags, LockBufHdr, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), PrivateRefCount, sbufdesc::refcount, StrategyFreeBuffer(), sbufdesc::tag, UnlockBufHdr, sbufdesc::usage_count, and WaitIO().
Referenced by DropDatabaseBuffers(), DropRelFileNodeBuffers(), and DropRelFileNodesAllBuffers().
{
BufferTag oldTag;
uint32 oldHash; /* hash value for oldTag */
LWLockId oldPartitionLock; /* buffer partition lock for it */
BufFlags oldFlags;
/* Save the original buffer tag before dropping the spinlock */
oldTag = buf->tag;
UnlockBufHdr(buf);
/*
* Need to compute the old tag's hashcode and partition lock ID. XXX is it
* worth storing the hashcode in BufferDesc so we need not recompute it
* here? Probably not.
*/
oldHash = BufTableHashCode(&oldTag);
oldPartitionLock = BufMappingPartitionLock(oldHash);
retry:
/*
* Acquire exclusive mapping lock in preparation for changing the buffer's
* association.
*/
LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
/* Re-lock the buffer header */
LockBufHdr(buf);
/* If it's changed while we were waiting for lock, do nothing */
if (!BUFFERTAGS_EQUAL(buf->tag, oldTag))
{
UnlockBufHdr(buf);
LWLockRelease(oldPartitionLock);
return;
}
/*
* We assume the only reason for it to be pinned is that someone else is
* flushing the page out. Wait for them to finish. (This could be an
* infinite loop if the refcount is messed up... it would be nice to time
* out after awhile, but there seems no way to be sure how many loops may
* be needed. Note that if the other guy has pinned the buffer but not
* yet done StartBufferIO, WaitIO will fall through and we'll effectively
* be busy-looping here.)
*/
if (buf->refcount != 0)
{
UnlockBufHdr(buf);
LWLockRelease(oldPartitionLock);
/* safety check: should definitely not be our *own* pin */
if (PrivateRefCount[buf->buf_id] != 0)
elog(ERROR, "buffer is pinned in InvalidateBuffer");
WaitIO(buf);
goto retry;
}
/*
* Clear out the buffer's tag and flags. We must do this to ensure that
* linear scans of the buffer array don't think the buffer is valid.
*/
oldFlags = buf->flags;
CLEAR_BUFFERTAG(buf->tag);
buf->flags = 0;
buf->usage_count = 0;
UnlockBufHdr(buf);
/*
* Remove the buffer from the lookup hashtable, if it was in there.
*/
if (oldFlags & BM_TAG_VALID)
BufTableDelete(&oldTag, oldHash);
/*
* Done with mapping lock.
*/
LWLockRelease(oldPartitionLock);
/*
* Insert the buffer at the head of the list of free buffers.
*/
StrategyFreeBuffer(buf);
}
static void local_buffer_write_error_callback(void *arg) [static]
Definition at line 3170 of file bufmgr.c.
References buftag::blockNum, errcontext, buftag::forkNum, MyBackendId, NULL, pfree(), relpathbackend(), buftag::rnode, and sbufdesc::tag.
{
volatile BufferDesc *bufHdr = (volatile BufferDesc *) arg;
if (bufHdr != NULL)
{
char *path = relpathbackend(bufHdr->tag.rnode, MyBackendId,
bufHdr->tag.forkNum);
errcontext("writing block %u of relation %s",
bufHdr->tag.blockNum, path);
pfree(path);
}
}
void LockBuffer(Buffer buffer, int mode)
Definition at line 2746 of file bufmgr.c.
References Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptors, BufferIsLocal, BufferIsValid, sbufdesc::content_lock, elog, ERROR, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), and LWLockRelease().
Referenced by _bt_doinsert(), _bt_endpoint(), _bt_first(), _bt_getbuf(), _bt_getroot(), _bt_killitems(), _bt_next(), _bt_relandgetbuf(), _hash_chgbufaccess(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), acquire_sample_rows(), bitgetpage(), bt_metap(), bt_page_items(), bt_page_stats(), btvacuumpage(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), copy_heap_data(), count_nondeletable_pages(), entryGetNextItem(), fill_seq_with_data(), FreeSpaceMapTruncateRel(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetTupleForTrigger(), GetVisibilityMapPins(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertItemPointers(), ginInsertValue(), GinNewBuffer(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistbuildempty(), gistbulkdelete(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuumcleanup(), heap_delete(), heap_fetch(), heap_get_latest_tid(), heap_hot_search(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_update(), heap_xlog_newpage(), heap_xlog_visible(), heapgetpage(), heapgettup(), index_fetch_heap(), IndexBuildHeapScan(), lazy_scan_heap(), LockBufferForCleanup(), moveRightIfItNeeded(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex(), pgstatindex(), read_seq_tuple(), RelationGetBufferForTuple(), RestoreBackupBlockContents(), scanGetCandidate(), scanPendingInsert(), scanPostingTree(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), systable_recheck_tuple(), UnlockReleaseBuffer(), validate_index_heapscan(), visibilitymap_clear(), visibilitymap_set(), visibilitymap_truncate(), XLogReadBuffer(), and XLogRecordPageWithFreeSpace().
{
volatile BufferDesc *buf;
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer))
return; /* local buffers need no lock */
buf = &(BufferDescriptors[buffer - 1]);
if (mode == BUFFER_LOCK_UNLOCK)
LWLockRelease(buf->content_lock);
else if (mode == BUFFER_LOCK_SHARE)
LWLockAcquire(buf->content_lock, LW_SHARED);
else if (mode == BUFFER_LOCK_EXCLUSIVE)
LWLockAcquire(buf->content_lock, LW_EXCLUSIVE);
else
elog(ERROR, "unrecognized buffer lock mode: %d", mode);
}
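The canonical pattern most of the callers above follow is pin first (ReadBuffer), then take the content lock; a sketch, with rel and blkno assumed to come from the caller:
    Buffer      buf = ReadBuffer(rel, blkno);   /* pin */
    Page        page;

    LockBuffer(buf, BUFFER_LOCK_SHARE);         /* content lock */
    page = BufferGetPage(buf);
    /* ... examine the page while the share lock is held ... */

    /* UnlockReleaseBuffer(buf) would combine the next two calls */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);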
void LockBufferForCleanup(Buffer buffer)
Definition at line 2802 of file bufmgr.c.
References Assert, BM_PIN_COUNT_WAITER, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferDescriptors, BufferIsLocal, BufferIsValid, elog, ERROR, sbufdesc::flags, InHotStandby, LocalRefCount, LockBuffer(), LockBufHdr, MyProcPid, NULL, PrivateRefCount, ProcWaitForSignal(), sbufdesc::refcount, ResolveRecoveryConflictWithBufferPin(), SetStartupBufferPinWaitBufId(), UnlockBufHdr, and sbufdesc::wait_backend_pid.
Referenced by btree_xlog_vacuum(), btvacuumpage(), btvacuumscan(), ginVacuumPostingTreeLeaves(), heap_xlog_clean(), lazy_scan_heap(), and RestoreBackupBlockContents().
{
volatile BufferDesc *bufHdr;
Assert(BufferIsValid(buffer));
Assert(PinCountWaitBuf == NULL);
if (BufferIsLocal(buffer))
{
/* There should be exactly one pin */
if (LocalRefCount[-buffer - 1] != 1)
elog(ERROR, "incorrect local pin count: %d",
LocalRefCount[-buffer - 1]);
/* Nobody else to wait for */
return;
}
/* There should be exactly one local pin */
if (PrivateRefCount[buffer - 1] != 1)
elog(ERROR, "incorrect local pin count: %d",
PrivateRefCount[buffer - 1]);
bufHdr = &BufferDescriptors[buffer - 1];
for (;;)
{
/* Try to acquire lock */
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0);
if (bufHdr->refcount == 1)
{
/* Successfully acquired exclusive lock with pincount 1 */
UnlockBufHdr(bufHdr);
return;
}
/* Failed, so mark myself as waiting for pincount 1 */
if (bufHdr->flags & BM_PIN_COUNT_WAITER)
{
UnlockBufHdr(bufHdr);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
elog(ERROR, "multiple backends attempting to wait for pincount 1");
}
bufHdr->wait_backend_pid = MyProcPid;
bufHdr->flags |= BM_PIN_COUNT_WAITER;
PinCountWaitBuf = bufHdr;
UnlockBufHdr(bufHdr);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/* Wait to be signaled by UnpinBuffer() */
if (InHotStandby)
{
/* Publish the bufid that Startup process waits on */
SetStartupBufferPinWaitBufId(buffer - 1);
/* Set alarm and then wait to be signaled by UnpinBuffer() */
ResolveRecoveryConflictWithBufferPin();
/* Reset the published bufid */
SetStartupBufferPinWaitBufId(-1);
}
else
ProcWaitForSignal();
PinCountWaitBuf = NULL;
/* Loop back and try again */
}
}
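Usage illustration (a hedged sketch, assuming a vacuum-like caller; cleanup_one_page and the vac_strategy parameter are hypothetical): the cleanup lock is requested after pinning the page, and the buffer is released through the normal unlock path afterwards.

/*
 * Hedged sketch: obtain a cleanup lock before removing dead tuples.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
cleanup_one_page(Relation rel, BlockNumber blkno, BufferAccessStrategy vac_strategy)
{
    Buffer      buf;

    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vac_strategy);

    /*
     * Blocks until we hold an exclusive content lock and ours is the only
     * pin; the waiting side is woken from UnpinBuffer().
     */
    LockBufferForCleanup(buf);

    /* ... prune the page, MarkBufferDirty(buf), WAL-log (elided) ... */

    UnlockReleaseBuffer(buf);
}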
| void MarkBufferDirty | ( | Buffer | buffer | ) |
Definition at line 984 of file bufmgr.c.
References Assert, BM_DIRTY, BufferDescriptors, BufferIsLocal, BufferIsValid, sbufdesc::content_lock, elog, ERROR, sbufdesc::flags, LockBufHdr, LWLockHeldByMe(), MarkLocalBufferDirty(), pgBufferUsage, PrivateRefCount, sbufdesc::refcount, BufferUsage::shared_blks_dirtied, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, and VacuumPageDirty.
Referenced by _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_newroot(), _bt_pagedel(), _bt_restore_meta(), _bt_split(), _hash_addovflpage(), _hash_chgbufaccess(), _hash_wrtbuf(), addLeafTuple(), AlterSequence(), btree_xlog_delete(), btree_xlog_delete_page(), btree_xlog_insert(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_vacuum(), createPostingTree(), do_setval(), doPickSplit(), fill_seq_with_data(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginInsertValue(), ginRedoCreateIndex(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoSplit(), ginRedoUpdateMetapage(), ginRedoVacuumPage(), ginUpdateStats(), ginVacuumPostingTreeLeaves(), gistbuild(), gistbuildempty(), gistbulkdelete(), gistplacetopage(), gistRedoClearFollowRight(), gistRedoCreateIndex(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), heap_delete(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune(), heap_update(), heap_xlog_clean(), heap_xlog_delete(), heap_xlog_freeze(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_newpage(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_heap(), lazy_vacuum_page(), moveLeafs(), nextval_internal(), RestoreBackupBlockContents(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoCreateIndex(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), spgvacuumpage(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_set(), visibilitymap_truncate(), and writeListPage().
{
volatile BufferDesc *bufHdr;
if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer ID: %d", buffer);
if (BufferIsLocal(buffer))
{
MarkLocalBufferDirty(buffer);
return;
}
bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0);
/* unfortunately we can't check if the lock is held exclusively */
Assert(LWLockHeldByMe(bufHdr->content_lock));
LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0);
/*
* If the buffer was not dirty already, do vacuum accounting.
*/
if (!(bufHdr->flags & BM_DIRTY))
{
VacuumPageDirty++;
pgBufferUsage.shared_blks_dirtied++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
UnlockBufHdr(bufHdr);
}
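Usage illustration (a hedged sketch of the ordering generally used by callers: apply the change and mark the buffer dirty inside a critical section, before the WAL record is inserted; the function name modify_page is hypothetical and the WAL details are elided):

/*
 * Hedged sketch: modify an exclusively locked page and mark it dirty.
 */
#include "postgres.h"
#include "miscadmin.h"          /* START_CRIT_SECTION / END_CRIT_SECTION */
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

static void
modify_page(Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /* caller must already hold an exclusive content lock on buf */
    START_CRIT_SECTION();

    PageSetFull(page);          /* illustrative page change */

    MarkBufferDirty(buf);

    /* ... build the WAL record, XLogInsert() it, PageSetLSN(page, lsn) ... */

    END_CRIT_SECTION();
}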
| void MarkBufferDirtyHint | ( | Buffer | buffer | ) |
Definition at line 2585 of file bufmgr.c.
References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BufferDescriptors, BufferGetPage, BufferIsLocal, BufferIsValid, sbufdesc::content_lock, DataChecksumsEnabled(), PGXACT::delayChkpt, elog, ERROR, sbufdesc::flags, LockBufHdr, LWLockHeldByMe(), MarkLocalBufferDirty(), MyPgXact, PageSetLSN, PrivateRefCount, RecoveryInProgress(), sbufdesc::refcount, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().
Referenced by _bt_check_unique(), _bt_killitems(), btvacuumpage(), FreeSpaceMapTruncateRel(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), hashgettuple(), heap_page_prune(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
{
volatile BufferDesc *bufHdr;
Page page = BufferGetPage(buffer);
if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer ID: %d", buffer);
if (BufferIsLocal(buffer))
{
MarkLocalBufferDirty(buffer);
return;
}
bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0);
/* here, either share or exclusive lock is OK */
Assert(LWLockHeldByMe(bufHdr->content_lock));
/*
* This routine might get called many times on the same page, if we are
* making the first scan after commit of an xact that added/deleted many
* tuples. So, be as quick as we can if the buffer is already dirty. We do
* this by not acquiring spinlock if it looks like the status bits are
* already set. Since we make this test unlocked, there's a chance we
 * might fail to notice that the flags have just been cleared, and fail
 * to reset them, due to memory-ordering issues. But since this function
* is only intended to be used in cases where failing to write out the data
* would be harmless anyway, it doesn't really matter.
*/
if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
(BM_DIRTY | BM_JUST_DIRTIED))
{
XLogRecPtr lsn = InvalidXLogRecPtr;
bool dirtied = false;
bool delayChkpt = false;
/*
* If checksums are enabled, and the buffer is permanent, then a full
* page image may be required even for some hint bit updates to protect
* against torn pages. This full page image is only necessary if the
* hint bit update is the first change to the page since the last
* checkpoint.
*
* We don't check full_page_writes here because that logic is
* included when we call XLogInsert() since the value changes
* dynamically.
*/
if (DataChecksumsEnabled() && (bufHdr->flags & BM_PERMANENT))
{
/*
* If we're in recovery we cannot dirty a page because of a hint.
 * We can still set the hint, we just cannot dirty the page as a result,
 * so the hint is lost when we evict the page or shut down.
*
* See src/backend/storage/page/README for longer discussion.
*/
if (RecoveryInProgress())
return;
/*
* If the block is already dirty because we either made a change
* or set a hint already, then we don't need to write a full page
* image. Note that aggressive cleaning of blocks
* dirtied by hint bit setting would increase the call rate.
* Bulk setting of hint bits would reduce the call rate...
*
* We must issue the WAL record before we mark the buffer dirty.
* Otherwise we might write the page before we write the WAL.
* That causes a race condition, since a checkpoint might occur
* between writing the WAL record and marking the buffer dirty.
* We solve that with a kluge, but one that is already in use
* during transaction commit to prevent race conditions.
* Basically, we simply prevent the checkpoint WAL record from
* being written until we have marked the buffer dirty. We don't
 * start the checkpoint flush until we have marked the buffer dirty, so
 * our checkpoint must flush the change to disk successfully, or else the
 * checkpoint record never gets written and crash recovery will redo the
 * change from WAL.
*
* It's possible we may enter here without an xid, so it is
* essential that CreateCheckpoint waits for virtual transactions
* rather than full transactionids.
*/
MyPgXact->delayChkpt = delayChkpt = true;
lsn = XLogSaveBufferForHint(buffer);
}
LockBufHdr(bufHdr);
Assert(bufHdr->refcount > 0);
if (!(bufHdr->flags & BM_DIRTY))
{
dirtied = true; /* Means "will be dirtied by this action" */
/*
* Set the page LSN if we wrote a backup block. We aren't
* supposed to set this when only holding a share lock but
* as long as we serialise it somehow we're OK. We choose to
* set LSN while holding the buffer header lock, which causes
* any reader of an LSN who holds only a share lock to also
* obtain a buffer header lock before using PageGetLSN(),
* which is enforced in BufferGetLSNAtomic().
*
* If checksums are enabled, you might think we should reset the
* checksum here. That will happen when the page is written
* sometime later in this checkpoint cycle.
*/
if (!XLogRecPtrIsInvalid(lsn))
PageSetLSN(page, lsn);
}
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
UnlockBufHdr(bufHdr);
if (delayChkpt)
MyPgXact->delayChkpt = false;
if (dirtied)
{
VacuumPageDirty++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
}
}
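Usage illustration (a hedged sketch modeled on the hint-bit case; the function name set_committed_hint is hypothetical, and the caller is assumed to hold at least a shared content lock and a pin on buf):

/*
 * Hedged sketch: set a tuple hint bit under a share lock and dirty the
 * buffer "non-critically".
 */
#include "postgres.h"
#include "access/htup_details.h"
#include "storage/bufmgr.h"

static void
set_committed_hint(HeapTupleHeader tuple, Buffer buf)
{
    /* Hint bits may be set while holding only a shared content lock. */
    tuple->t_infomask |= HEAP_XMIN_COMMITTED;

    /*
     * Losing this update is harmless, so the hint path is used rather than
     * MarkBufferDirty(); during recovery with checksums it may do nothing.
     */
    MarkBufferDirtyHint(buf);
}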
| static bool PinBuffer | ( | volatile BufferDesc * | buf, | |
| BufferAccessStrategy | strategy | |||
| ) | [static] |
Definition at line 1092 of file bufmgr.c.
References Assert, BM_MAX_USAGE_COUNT, sbufdesc::buf_id, BufferDescriptorGetBuffer, CurrentResourceOwner, sbufdesc::flags, LockBufHdr, NULL, PrivateRefCount, sbufdesc::refcount, ResourceOwnerRememberBuffer(), UnlockBufHdr, and sbufdesc::usage_count.
Referenced by BufferAlloc().
{
int b = buf->buf_id;
bool result;
if (PrivateRefCount[b] == 0)
{
LockBufHdr(buf);
buf->refcount++;
if (strategy == NULL)
{
if (buf->usage_count < BM_MAX_USAGE_COUNT)
buf->usage_count++;
}
else
{
if (buf->usage_count == 0)
buf->usage_count = 1;
}
result = (buf->flags & BM_VALID) != 0;
UnlockBufHdr(buf);
}
else
{
/* If we previously pinned the buffer, it must surely be valid */
result = true;
}
PrivateRefCount[b]++;
Assert(PrivateRefCount[b] > 0);
ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(buf));
return result;
}
| static void PinBuffer_Locked | ( | volatile BufferDesc * | buf | ) | [static] |
Definition at line 1140 of file bufmgr.c.
References Assert, sbufdesc::buf_id, BufferDescriptorGetBuffer, CurrentResourceOwner, PrivateRefCount, sbufdesc::refcount, ResourceOwnerRememberBuffer(), and UnlockBufHdr.
Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), and SyncOneBuffer().
{
int b = buf->buf_id;
if (PrivateRefCount[b] == 0)
buf->refcount++;
UnlockBufHdr(buf);
PrivateRefCount[b]++;
Assert(PrivateRefCount[b] > 0);
ResourceOwnerRememberBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(buf));
}
| void PrefetchBuffer | ( | Relation | reln, | |
| ForkNumber | forkNum, | |||
| BlockNumber | blockNum | |||
| ) |
Definition at line 125 of file bufmgr.c.
References Assert, BlockNumberIsValid, BufMappingPartitionLock, BufTableHashCode(), BufTableLookup(), ereport, errcode(), errmsg(), ERROR, INIT_BUFFERTAG, LocalPrefetchBuffer(), LW_SHARED, LWLockAcquire(), LWLockRelease(), RelFileNodeBackend::node, RelationData::rd_smgr, RELATION_IS_OTHER_TEMP, RelationIsValid, RelationOpenSmgr, RelationUsesLocalBuffers, SMgrRelationData::smgr_rnode, and smgrprefetch().
Referenced by BitmapHeapNext().
{
#ifdef USE_PREFETCH
Assert(RelationIsValid(reln));
Assert(BlockNumberIsValid(blockNum));
/* Open it at the smgr level if not already done */
RelationOpenSmgr(reln);
if (RelationUsesLocalBuffers(reln))
{
/* see comments in ReadBufferExtended */
if (RELATION_IS_OTHER_TEMP(reln))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot access temporary tables of other sessions")));
/* pass it off to localbuf.c */
LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
}
else
{
BufferTag newTag; /* identity of requested block */
uint32 newHash; /* hash value for newTag */
LWLockId newPartitionLock; /* buffer partition lock for it */
int buf_id;
/* create a tag so we can lookup the buffer */
INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
forkNum, blockNum);
/* determine its hash code and partition lock ID */
newHash = BufTableHashCode(&newTag);
newPartitionLock = BufMappingPartitionLock(newHash);
/* see if the block is in the buffer pool already */
LWLockAcquire(newPartitionLock, LW_SHARED);
buf_id = BufTableLookup(&newTag, newHash);
LWLockRelease(newPartitionLock);
/* If not in buffers, initiate prefetch */
if (buf_id < 0)
smgrprefetch(reln->rd_smgr, forkNum, blockNum);
/*
* If the block *is* in buffers, we do nothing. This is not really
* ideal: the block might be just about to be evicted, which would be
* stupid since we know we are going to need it soon. But the only
* easy answer is to bump the usage_count, which does not seem like a
* great solution: when the caller does ultimately touch the block,
* usage_count would get bumped again, resulting in too much
* favoritism for blocks that are involved in a prefetch sequence. A
* real fix would involve some additional per-buffer state, and it's
* not clear that there's enough of a problem to justify that.
*/
}
#endif /* USE_PREFETCH */
}
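Usage illustration (a hedged sketch; scan_with_prefetch, nblocks and prefetch_distance are hypothetical, caller-supplied names): prefetch requests are issued a fixed distance ahead of the blocks actually read, so the OS can start the reads early.

/*
 * Hedged sketch: prefetch a few blocks ahead of a sequential read loop.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
scan_with_prefetch(Relation rel, BlockNumber nblocks, BlockNumber prefetch_distance)
{
    BlockNumber blkno;
    Buffer      buf;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        /* ask the OS to start reading a block we will need shortly */
        if (blkno + prefetch_distance < nblocks)
            PrefetchBuffer(rel, MAIN_FORKNUM, blkno + prefetch_distance);

        buf = ReadBuffer(rel, blkno);
        /* ... LockBuffer(buf, BUFFER_LOCK_SHARE), process, unlock ... */
        ReleaseBuffer(buf);
    }
}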
| void PrintBufferLeakWarning | ( | Buffer | buffer | ) |
Definition at line 1782 of file bufmgr.c.
References Assert, buftag::blockNum, buf, BufferDescriptors, BufferIsLocal, BufferIsValid, elog, sbufdesc::flags, buftag::forkNum, LocalBufferDescriptors, LocalRefCount, MyBackendId, pfree(), PrivateRefCount, sbufdesc::refcount, relpathbackend(), buftag::rnode, sbufdesc::tag, and WARNING.
Referenced by AtEOXact_Buffers(), AtEOXact_LocalBuffers(), AtProcExit_Buffers(), AtProcExit_LocalBuffers(), and ResourceOwnerReleaseInternal().
{
volatile BufferDesc *buf;
int32 loccount;
char *path;
BackendId backend;
Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer))
{
buf = &LocalBufferDescriptors[-buffer - 1];
loccount = LocalRefCount[-buffer - 1];
backend = MyBackendId;
}
else
{
buf = &BufferDescriptors[buffer - 1];
loccount = PrivateRefCount[buffer - 1];
backend = InvalidBackendId;
}
/* theoretically we should lock the bufhdr here */
path = relpathbackend(buf->tag.rnode, backend, buf->tag.forkNum);
elog(WARNING,
"buffer refcount leak: [%03d] "
"(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
buffer, path,
buf->tag.blockNum, buf->flags,
buf->refcount, loccount);
pfree(path);
}
| Buffer ReadBuffer | ( | Relation | reln, | |
| BlockNumber | blockNum | |||
| ) |
Definition at line 190 of file bufmgr.c.
References MAIN_FORKNUM, NULL, RBM_NORMAL, and ReadBufferExtended().
Referenced by _bt_getbuf(), _hash_getbuf(), bt_metap(), bt_page_items(), bt_page_stats(), fill_seq_with_data(), GetTupleForTrigger(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginPrepareFindLeafPage(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_delete(), heap_fetch(), heap_get_latest_tid(), heap_hot_search(), heap_inplace_update(), heap_lock_tuple(), heap_update(), pgstatginindex(), read_seq_tuple(), ReadBufferBI(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().
{
return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
}
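Usage illustration (a hedged sketch, assuming the relation has at least one block; the function name first_page_line_pointers is hypothetical): ReadBuffer() returns a pinned buffer whose page can be inspected once a content lock is held.

/*
 * Hedged sketch: read block 0 of the main fork and count its line pointers.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

static OffsetNumber
first_page_line_pointers(Relation rel)
{
    Buffer       buf;
    Page         page;
    OffsetNumber maxoff;

    buf = ReadBuffer(rel, 0);
    LockBuffer(buf, BUFFER_LOCK_SHARE);
    page = BufferGetPage(buf);
    maxoff = PageGetMaxOffsetNumber(page);
    UnlockReleaseBuffer(buf);

    return maxoff;
}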
| static Buffer ReadBuffer_common | ( | SMgrRelation | reln, | |
| char | relpersistence, | |||
| ForkNumber | forkNum, | |||
| BlockNumber | blockNum, | |||
| ReadBufferMode | mode, | |||
| BufferAccessStrategy | strategy, | |||
| bool * | hit | |||
| ) | [static] |
Definition at line 291 of file bufmgr.c.
References Assert, RelFileNodeBackend::backend, BufferUsage::blk_read_time, BM_VALID, BufferAlloc(), BufferDescriptorGetBuffer, BufHdrGetBlock, CurrentResourceOwner, RelFileNode::dbNode, ereport, errcode(), errhint(), errmsg(), ERROR, sbufdesc::flags, INSTR_TIME_ADD, INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, LocalBufferAlloc(), LocalBufHdrGetBlock, LockBufHdr, MemSet, RelFileNodeBackend::node, PageIsNew, PageIsVerified(), pgBufferUsage, pgstat_count_buffer_read_time, RBM_ZERO, RBM_ZERO_ON_ERROR, RelFileNode::relNode, relpath, ResourceOwnerEnlargeBuffers(), BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, SMgrRelationData::smgr_rnode, smgrextend(), SmgrIsTemp, smgrnblocks(), smgrread(), RelFileNode::spcNode, StartBufferIO(), TerminateBufferIO(), track_io_timing, UnlockBufHdr, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, VacuumPageHit, VacuumPageMiss, WARNING, and zero_damaged_pages.
Referenced by ReadBufferExtended(), and ReadBufferWithoutRelcache().
{
volatile BufferDesc *bufHdr;
Block bufBlock;
bool found;
bool isExtend;
bool isLocalBuf = SmgrIsTemp(smgr);
*hit = false;
/* Make sure we will have room to remember the buffer pin */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
isExtend = (blockNum == P_NEW);
TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
isExtend);
/* Substitute proper block number if caller asked for P_NEW */
if (isExtend)
blockNum = smgrnblocks(smgr, forkNum);
if (isLocalBuf)
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
if (found)
pgBufferUsage.local_blks_hit++;
else
pgBufferUsage.local_blks_read++;
}
else
{
/*
* lookup the buffer. IO_IN_PROGRESS is set if the requested block is
* not currently in memory.
*/
bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
strategy, &found);
if (found)
pgBufferUsage.shared_blks_hit++;
else
pgBufferUsage.shared_blks_read++;
}
/* At this point we do NOT hold any locks. */
/* if it was already in the buffer pool, we're done */
if (found)
{
if (!isExtend)
{
/* Just need to update stats before we exit */
*hit = true;
VacuumPageHit++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageHit;
TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
isExtend,
found);
return BufferDescriptorGetBuffer(bufHdr);
}
/*
* We get here only in the corner case where we are trying to extend
* the relation but we found a pre-existing buffer marked BM_VALID.
* This can happen because mdread doesn't complain about reads beyond
* EOF (when zero_damaged_pages is ON) and so a previous attempt to
* read a block beyond EOF could have left a "valid" zero-filled
* buffer. Unfortunately, we have also seen this case occurring
* because of buggy Linux kernels that sometimes return an
* lseek(SEEK_END) result that doesn't account for a recent write. In
* that situation, the pre-existing buffer would contain valid data
* that we don't want to overwrite. Since the legitimate case should
* always have left a zero-filled buffer, complain if not PageIsNew.
*/
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (!PageIsNew((Page) bufBlock))
ereport(ERROR,
(errmsg("unexpected data beyond EOF in block %u of relation %s",
blockNum, relpath(smgr->smgr_rnode, forkNum)),
errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
/*
* We *must* do smgrextend before succeeding, else the page will not
* be reserved by the kernel, and the next P_NEW call will decide to
* return the same page. Clear the BM_VALID bit, do the StartBufferIO
* call that BufferAlloc didn't, and proceed.
*/
if (isLocalBuf)
{
/* Only need to adjust flags */
Assert(bufHdr->flags & BM_VALID);
bufHdr->flags &= ~BM_VALID;
}
else
{
/*
* Loop to handle the very small possibility that someone re-sets
* BM_VALID between our clearing it and StartBufferIO inspecting
* it.
*/
do
{
LockBufHdr(bufHdr);
Assert(bufHdr->flags & BM_VALID);
bufHdr->flags &= ~BM_VALID;
UnlockBufHdr(bufHdr);
} while (!StartBufferIO(bufHdr, true));
}
}
/*
* if we have gotten to this point, we have allocated a buffer for the
* page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
* if it's a shared buffer.
*
* Note: if smgrextend fails, we will end up with a buffer that is
* allocated but not marked BM_VALID. P_NEW will still select the same
* block number (because the relation didn't get any longer on disk) and
* so future attempts to extend the relation will find the same buffer (if
* it's not been recycled) but come right back here to try smgrextend
* again.
*/
Assert(!(bufHdr->flags & BM_VALID)); /* spinlock not needed */
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (isExtend)
{
/* new buffers are zero-filled */
MemSet((char *) bufBlock, 0, BLCKSZ);
/* don't set checksum for all-zero page */
smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);
}
else
{
/*
* Read in the page, unless the caller intends to overwrite it and
* just wants us to allocate a buffer.
*/
if (mode == RBM_ZERO)
MemSet((char *) bufBlock, 0, BLCKSZ);
else
{
instr_time io_start,
io_time;
if (track_io_timing)
INSTR_TIME_SET_CURRENT(io_start);
smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
INSTR_TIME_SUBTRACT(io_time, io_start);
pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
}
/* check for garbage data */
if (!PageIsVerified((Page) bufBlock, blockNum))
{
if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
{
ereport(WARNING,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid page in block %u of relation %s; zeroing out page",
blockNum,
relpath(smgr->smgr_rnode, forkNum))));
MemSet((char *) bufBlock, 0, BLCKSZ);
}
else
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("invalid page in block %u of relation %s",
blockNum,
relpath(smgr->smgr_rnode, forkNum))));
}
}
}
if (isLocalBuf)
{
/* Only need to adjust flags */
bufHdr->flags |= BM_VALID;
}
else
{
/* Set BM_VALID, terminate IO, and wake up any waiters */
TerminateBufferIO(bufHdr, false, BM_VALID);
}
VacuumPageMiss++;
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageMiss;
TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode,
smgr->smgr_rnode.backend,
isExtend,
found);
return BufferDescriptorGetBuffer(bufHdr);
}
| Buffer ReadBufferExtended | ( | Relation | reln, | |
| ForkNumber | forkNum, | |||
| BlockNumber | blockNum, | |||
| ReadBufferMode | mode, | |||
| BufferAccessStrategy | strategy | |||
| ) |
Definition at line 228 of file bufmgr.c.
References buf, ereport, errcode(), errmsg(), ERROR, pgstat_count_buffer_hit, pgstat_count_buffer_read, RelationData::rd_rel, RelationData::rd_smgr, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationOpenSmgr.
Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), acquire_sample_rows(), btvacuumpage(), btvacuumscan(), count_nondeletable_pages(), fsm_readbuf(), get_raw_page_internal(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginScanToDelete(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistbuildempty(), gistbulkdelete(), gistvacuumcleanup(), heapgetpage(), lazy_scan_heap(), lazy_vacuum_heap(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatindex(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), spgvacuumpage(), and vm_readbuf().
{
bool hit;
Buffer buf;
/* Open it at the smgr level if not already done */
RelationOpenSmgr(reln);
/*
* Reject attempts to read non-local temporary relations; we would be
* likely to get wrong data since we have no visibility into the owning
* session's local buffers.
*/
if (RELATION_IS_OTHER_TEMP(reln))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot access temporary tables of other sessions")));
/*
* Read the buffer, and update pgstat counters to reflect a cache hit or
* miss.
*/
pgstat_count_buffer_read(reln);
buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,
forkNum, blockNum, mode, strategy, &hit);
if (hit)
pgstat_count_buffer_hit(reln);
return buf;
}
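Usage illustration (a hedged sketch; bulk_scan is a hypothetical name): a full-relation read that goes through a ring buffer access strategy, so the scan does not flood shared buffers.

/*
 * Hedged sketch: scan the main fork with a bulk-read strategy.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
bulk_scan(Relation rel)
{
    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                             RBM_NORMAL, strategy);

        /* ... LockBuffer(buf, BUFFER_LOCK_SHARE), process, unlock ... */
        ReleaseBuffer(buf);
    }

    FreeAccessStrategy(strategy);
}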
| Buffer ReadBufferWithoutRelcache | ( | RelFileNode | rnode, | |
| ForkNumber | forkNum, | |||
| BlockNumber | blockNum, | |||
| ReadBufferMode | mode, | |||
| BufferAccessStrategy | strategy | |||
| ) |
Definition at line 270 of file bufmgr.c.
References Assert, InRecovery, InvalidBackendId, ReadBuffer_common(), RELPERSISTENCE_PERMANENT, and smgropen().
Referenced by XLogReadBufferExtended().
{
bool hit;
SMgrRelation smgr = smgropen(rnode, InvalidBackendId);
Assert(InRecovery);
return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum,
mode, strategy, &hit);
}
| BlockNumber RelationGetNumberOfBlocksInFork | ( | Relation | relation, | |
| ForkNumber | forkNum | |||
| ) |
Definition at line 2032 of file bufmgr.c.
References RelationData::rd_smgr, RelationOpenSmgr, and smgrnblocks().
Referenced by _hash_getnewbuf(), and _hash_metapinit().
{
/* Open it at the smgr level if not already done */
RelationOpenSmgr(relation);
return smgrnblocks(relation->rd_smgr, forkNum);
}
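Usage illustration (a hedged sketch; report_fork_sizes is a hypothetical name, and the free-space-map fork is assumed to exist): the main-fork size is usually obtained through the RelationGetNumberOfBlocks() macro, which expands to this function with MAIN_FORKNUM, while other forks are queried directly.

/*
 * Hedged sketch: compare main-fork and FSM-fork sizes.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
report_fork_sizes(Relation rel)
{
    BlockNumber main_blocks = RelationGetNumberOfBlocks(rel);   /* MAIN_FORKNUM */
    BlockNumber fsm_blocks = RelationGetNumberOfBlocksInFork(rel, FSM_FORKNUM);

    elog(DEBUG1, "\"%s\": %u main blocks, %u fsm blocks",
         RelationGetRelationName(rel), main_blocks, fsm_blocks);
}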
| Buffer ReleaseAndReadBuffer | ( | Buffer | buffer, | |
| Relation | relation, | |||
| BlockNumber | blockNum | |||
| ) |
Definition at line 1037 of file bufmgr.c.
References Assert, buftag::blockNum, BufferDescriptors, BufferIsLocal, BufferIsValid, CurrentResourceOwner, buftag::forkNum, LocalBufferDescriptors, LocalRefCount, PrivateRefCount, RelationData::rd_node, ReadBuffer(), RelFileNodeEquals, ResourceOwnerForgetBuffer(), buftag::rnode, sbufdesc::tag, and UnpinBuffer().
Referenced by _bt_relandgetbuf(), bitgetpage(), entryGetNextItem(), ginFindLeafPage(), ginInsertValue(), index_fetch_heap(), moveRightIfItNeeded(), and scanPostingTree().
{
ForkNumber forkNum = MAIN_FORKNUM;
volatile BufferDesc *bufHdr;
if (BufferIsValid(buffer))
{
if (BufferIsLocal(buffer))
{
Assert(LocalRefCount[-buffer - 1] > 0);
bufHdr = &LocalBufferDescriptors[-buffer - 1];
if (bufHdr->tag.blockNum == blockNum &&
RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
bufHdr->tag.forkNum == forkNum)
return buffer;
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
LocalRefCount[-buffer - 1]--;
}
else
{
Assert(PrivateRefCount[buffer - 1] > 0);
bufHdr = &BufferDescriptors[buffer - 1];
/* we have pin, so it's ok to examine tag without spinlock */
if (bufHdr->tag.blockNum == blockNum &&
RelFileNodeEquals(bufHdr->tag.rnode, relation->rd_node) &&
bufHdr->tag.forkNum == forkNum)
return buffer;
UnpinBuffer(bufHdr, true);
}
}
return ReadBuffer(relation, blockNum);
}
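Usage illustration (a hedged sketch in the style of the index-scan callers; step_to_block is a hypothetical name): the old buffer must be unlocked, though it may stay pinned, before hopping to the next block.

/*
 * Hedged sketch: move from one pinned page to another with a single call.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static Buffer
step_to_block(Relation rel, Buffer obuf, BlockNumber next_blkno)
{
    Buffer      buf;

    /* the old buffer must not be content-locked at this point */
    if (BufferIsValid(obuf))
        LockBuffer(obuf, BUFFER_LOCK_UNLOCK);

    /* drops the old pin (unless the same block is requested) and pins next_blkno */
    buf = ReleaseAndReadBuffer(obuf, rel, next_blkno);

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    return buf;
}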
| void ReleaseBuffer | ( | Buffer | buffer | ) |
Definition at line 2512 of file bufmgr.c.
References Assert, BufferDescriptors, BufferIsLocal, BufferIsValid, CurrentResourceOwner, elog, ERROR, LocalRefCount, PrivateRefCount, ResourceOwnerForgetBuffer(), and UnpinBuffer().
Referenced by _bt_getbuf(), _hash_dropbuf(), AfterTriggerExecute(), btendscan(), btmarkpos(), btrescan(), btrestrpos(), entryGetNextItem(), EvalPlanQualFetch(), EvalPlanQualFetchRowMarks(), ExecClearTuple(), ExecDelete(), ExecEndIndexOnlyScan(), ExecLockRows(), ExecMaterializeSlot(), ExecStoreMinimalTuple(), ExecStoreTuple(), FreeBulkInsertState(), freeGinBtreeStack(), freeScanKeys(), fsm_vacuum_page(), get_raw_page_internal(), GetRecordedFreeSpace(), GetTupleForTrigger(), ginDeletePage(), ginFindParents(), ginInsertCleanup(), GinNewBuffer(), ginScanToDelete(), gistdoinsert(), gistFindCorrectParent(), gistNewBuffer(), heap_delete(), heap_endscan(), heap_fetch(), heap_hot_search(), heap_insert(), heap_multi_insert(), heap_rescan(), heap_restrpos(), heap_update(), heap_xlog_delete(), heap_xlog_insert(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), heapgetpage(), heapgettup(), heapgettup_pagemode(), index_endscan(), index_getnext_tid(), index_rescan(), lazy_scan_heap(), lazy_vacuum_heap(), pgstatindex(), ReadBufferBI(), RelationGetBufferForTuple(), ResourceOwnerReleaseInternal(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), TidNext(), UnlockReleaseBuffer(), visibilitymap_count(), visibilitymap_pin(), visibilitymap_test(), and XLogReadBufferExtended().
{
volatile BufferDesc *bufHdr;
if (!BufferIsValid(buffer))
elog(ERROR, "bad buffer ID: %d", buffer);
ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
if (BufferIsLocal(buffer))
{
Assert(LocalRefCount[-buffer - 1] > 0);
LocalRefCount[-buffer - 1]--;
return;
}
bufHdr = &BufferDescriptors[buffer - 1];
Assert(PrivateRefCount[buffer - 1] > 0);
if (PrivateRefCount[buffer - 1] > 1)
PrivateRefCount[buffer - 1]--;
else
UnpinBuffer(bufHdr, false);
}
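Usage illustration (a hedged sketch; peek_and_keep_pinned is a hypothetical name): ReadBuffer() hands back a pinned but unlocked buffer, and ReleaseBuffer() is the matching release when no content lock is still held. The pin alone keeps the block resident while other work proceeds.

/*
 * Hedged sketch: drop a pin that was held without a content lock.
 */
#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

static void
peek_and_keep_pinned(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* ... copy out whatever is needed from the page ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    /* ... other work; the pin keeps the block from being evicted ... */

    ReleaseBuffer(buf);         /* drop the pin; no content lock involved */
}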
| static int rnode_comparator | ( | const void * | p1, | |
| const void * | p2 | |||
| ) | [static] |
Definition at line 3189 of file bufmgr.c.
References RelFileNode::dbNode, RelFileNode::relNode, and RelFileNode::spcNode.
Referenced by DropRelFileNodesAllBuffers().
{
RelFileNode n1 = *(RelFileNode *) p1;
RelFileNode n2 = *(RelFileNode *) p2;
if (n1.relNode < n2.relNode)
return -1;
else if (n1.relNode > n2.relNode)
return 1;
if (n1.dbNode < n2.dbNode)
return -1;
else if (n1.dbNode > n2.dbNode)
return 1;
if (n1.spcNode < n2.spcNode)
return -1;
else if (n1.spcNode > n2.spcNode)
return 1;
else
return 0;
}
| static void shared_buffer_write_error_callback | ( | void * | arg | ) | [static] |
Definition at line 3151 of file bufmgr.c.
References buftag::blockNum, errcontext, buftag::forkNum, NULL, pfree(), relpathperm, buftag::rnode, and sbufdesc::tag.
{
volatile BufferDesc *bufHdr = (volatile BufferDesc *) arg;
/* Buffer is pinned, so we can read the tag without locking the spinlock */
if (bufHdr != NULL)
{
char *path = relpathperm(bufHdr->tag.rnode, bufHdr->tag.forkNum);
errcontext("writing block %u of relation %s",
bufHdr->tag.blockNum, path);
pfree(path);
}
}
| static bool StartBufferIO | ( | volatile BufferDesc * | buf, | |
| bool | forInput | |||
| ) | [static] |
Definition at line 3002 of file bufmgr.c.
References Assert, BM_DIRTY, BM_IO_IN_PROGRESS, BM_VALID, sbufdesc::flags, sbufdesc::io_in_progress_lock, IsForInput, LockBufHdr, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), UnlockBufHdr, and WaitIO().
Referenced by BufferAlloc(), FlushBuffer(), and ReadBuffer_common().
{
Assert(!InProgressBuf);
for (;;)
{
/*
* Grab the io_in_progress lock so that other processes can wait for
* me to finish the I/O.
*/
LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
LockBufHdr(buf);
if (!(buf->flags & BM_IO_IN_PROGRESS))
break;
/*
* The only way BM_IO_IN_PROGRESS could be set when the io_in_progress
* lock isn't held is if the process doing the I/O is recovering from
* an error (see AbortBufferIO). If that's the case, we must wait for
* him to get unwedged.
*/
UnlockBufHdr(buf);
LWLockRelease(buf->io_in_progress_lock);
WaitIO(buf);
}
/* Once we get here, there is definitely no I/O active on this buffer */
if (forInput ? (buf->flags & BM_VALID) : !(buf->flags & BM_DIRTY))
{
/* someone else already did the I/O */
UnlockBufHdr(buf);
LWLockRelease(buf->io_in_progress_lock);
return false;
}
buf->flags |= BM_IO_IN_PROGRESS;
UnlockBufHdr(buf);
InProgressBuf = buf;
IsForInput = forInput;
return true;
}
| static int SyncOneBuffer | ( | int | buf_id, | |
| bool | skip_recently_used | |||
| ) | [static] |
Definition at line 1652 of file bufmgr.c.
References BM_DIRTY, BM_VALID, BufferDescriptors, sbufdesc::content_lock, sbufdesc::flags, FlushBuffer(), LockBufHdr, LW_SHARED, LWLockAcquire(), LWLockRelease(), NULL, PinBuffer_Locked(), sbufdesc::refcount, UnlockBufHdr, UnpinBuffer(), and sbufdesc::usage_count.
Referenced by BgBufferSync(), and BufferSync().
{
volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id];
int result = 0;
/*
* Check whether buffer needs writing.
*
* We can make this check without taking the buffer content lock so long
* as we mark pages dirty in access methods *before* logging changes with
* XLogInsert(): if someone marks the buffer dirty just after our check we
 * don't worry, because our checkpoint.redo points before the log record for
 * the upcoming changes and so we are not required to write such a dirty buffer.
*/
LockBufHdr(bufHdr);
if (bufHdr->refcount == 0 && bufHdr->usage_count == 0)
result |= BUF_REUSABLE;
else if (skip_recently_used)
{
/* Caller told us not to write recently-used buffers */
UnlockBufHdr(bufHdr);
return result;
}
if (!(bufHdr->flags & BM_VALID) || !(bufHdr->flags & BM_DIRTY))
{
/* It's clean, so nothing to do */
UnlockBufHdr(bufHdr);
return result;
}
/*
* Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
* buffer is clean by the time we've locked it.)
*/
PinBuffer_Locked(bufHdr);
LWLockAcquire(bufHdr->content_lock, LW_SHARED);
FlushBuffer(bufHdr, NULL);
LWLockRelease(bufHdr->content_lock);
UnpinBuffer(bufHdr, true);
return result | BUF_WRITTEN;
}
| static void TerminateBufferIO | ( | volatile BufferDesc * | buf, | |
| bool | clear_dirty, | |||
| int | set_flag_bits | |||
| ) | [static] |
Definition at line 3068 of file bufmgr.c.
References Assert, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_IN_PROGRESS, BM_JUST_DIRTIED, sbufdesc::flags, sbufdesc::io_in_progress_lock, LockBufHdr, LWLockRelease(), and UnlockBufHdr.
Referenced by AbortBufferIO(), FlushBuffer(), and ReadBuffer_common().
{
Assert(buf == InProgressBuf);
LockBufHdr(buf);
Assert(buf->flags & BM_IO_IN_PROGRESS);
buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
buf->flags |= set_flag_bits;
UnlockBufHdr(buf);
InProgressBuf = NULL;
LWLockRelease(buf->io_in_progress_lock);
}
| void UnlockBuffers | ( | void | ) |
Definition at line 2720 of file bufmgr.c.
References BM_PIN_COUNT_WAITER, sbufdesc::flags, LockBufHdr, MyProcPid, UnlockBufHdr, and sbufdesc::wait_backend_pid.
Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().
{
volatile BufferDesc *buf = PinCountWaitBuf;
if (buf)
{
LockBufHdr(buf);
/*
* Don't complain if flag bit not set; it could have been reset but we
* got a cancel/die interrupt before getting the signal.
*/
if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
buf->wait_backend_pid == MyProcPid)
buf->flags &= ~BM_PIN_COUNT_WAITER;
UnlockBufHdr(buf);
PinCountWaitBuf = NULL;
}
}
| void UnlockReleaseBuffer | ( | Buffer | buffer | ) |
Definition at line 2544 of file bufmgr.c.
References BUFFER_LOCK_UNLOCK, LockBuffer(), and ReleaseBuffer().
Referenced by _bt_relbuf(), _bt_restore_meta(), _hash_relbuf(), _hash_wrtbuf(), acquire_sample_rows(), allocNewBuffer(), AlterSequence(), bt_metap(), bt_page_items(), bt_page_stats(), btree_xlog_delete(), btree_xlog_delete_get_latestRemovedXid(), btree_xlog_delete_page(), btree_xlog_insert(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_vacuum(), count_nondeletable_pages(), createPostingTree(), do_setval(), doPickSplit(), fill_seq_with_data(), FreeSpaceMapTruncateRel(), fsm_search(), fsm_set_and_search(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginContinueSplit(), ginDeletePage(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), ginRedoCreateIndex(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoSplit(), ginRedoUpdateMetapage(), ginRedoVacuumPage(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbufferinginserttuples(), gistbuild(), gistbuildempty(), gistbulkdelete(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistGetMaxLevel(), gistinserttuples(), gistplacetopage(), gistProcessItup(), gistRedoClearFollowRight(), gistRedoCreateIndex(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistScanPage(), gistvacuumcleanup(), heap_delete(), heap_get_latest_tid(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_update(), heap_xlog_clean(), heap_xlog_delete(), heap_xlog_freeze(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_newpage(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_heap(), lazy_vacuum_heap(), moveLeafs(), nextval_internal(), pg_sequence_parameters(), pgstat_gist_page(), pgstat_heap(), pgstatginindex(), ResetSequence(), RestoreBackupBlockContents(), scanGetCandidate(), scanPendingInsert(), scanPostingTree(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistUpdateMetaPage(), spgMatchNodeAction(), spgprocesspending(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoCreateIndex(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), visibilitymap_truncate(), writeListPage(), and XLogRecordPageWithFreeSpace().
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
}
| static void UnpinBuffer | ( | volatile BufferDesc * | buf, | |
| bool | fixOwner | |||
| ) | [static] |
Definition at line 1162 of file bufmgr.c.
References Assert, BM_PIN_COUNT_WAITER, sbufdesc::buf_id, BufferDescriptorGetBuffer, sbufdesc::content_lock, CurrentResourceOwner, sbufdesc::flags, sbufdesc::io_in_progress_lock, LockBufHdr, LWLockHeldByMe(), PrivateRefCount, ProcSendSignal(), sbufdesc::refcount, ResourceOwnerForgetBuffer(), UnlockBufHdr, and sbufdesc::wait_backend_pid.
Referenced by BufferAlloc(), FlushDatabaseBuffers(), FlushRelationBuffers(), ReleaseAndReadBuffer(), ReleaseBuffer(), and SyncOneBuffer().
{
int b = buf->buf_id;
if (fixOwner)
ResourceOwnerForgetBuffer(CurrentResourceOwner,
BufferDescriptorGetBuffer(buf));
Assert(PrivateRefCount[b] > 0);
PrivateRefCount[b]--;
if (PrivateRefCount[b] == 0)
{
/* I'd better not still hold any locks on the buffer */
Assert(!LWLockHeldByMe(buf->content_lock));
Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
LockBufHdr(buf);
/* Decrement the shared reference count */
Assert(buf->refcount > 0);
buf->refcount--;
/* Support LockBufferForCleanup() */
if ((buf->flags & BM_PIN_COUNT_WAITER) &&
buf->refcount == 1)
{
/* we just released the last pin other than the waiter's */
int wait_backend_pid = buf->wait_backend_pid;
buf->flags &= ~BM_PIN_COUNT_WAITER;
UnlockBufHdr(buf);
ProcSendSignal(wait_backend_pid);
}
else
UnlockBufHdr(buf);
}
}
| static void WaitIO | ( | volatile BufferDesc * | buf | ) | [static] |
Definition at line 2955 of file bufmgr.c.
References BM_IO_IN_PROGRESS, sbufdesc::flags, sbufdesc::io_in_progress_lock, LockBufHdr, LW_SHARED, LWLockAcquire(), LWLockRelease(), and UnlockBufHdr.
Referenced by InvalidateBuffer(), and StartBufferIO().
{
/*
* Changed to wait until there's no IO - Inoue 01/13/2000
*
* Note this is *necessary* because an error abort in the process doing
* I/O could release the io_in_progress_lock prematurely. See
* AbortBufferIO.
*/
for (;;)
{
BufFlags sv_flags;
/*
* It may not be necessary to acquire the spinlock to check the flag
* here, but since this test is essential for correctness, we'd better
* play it safe.
*/
LockBufHdr(buf);
sv_flags = buf->flags;
UnlockBufHdr(buf);
if (!(sv_flags & BM_IO_IN_PROGRESS))
break;
LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
LWLockRelease(buf->io_in_progress_lock);
}
}
| int bgwriter_lru_maxpages = 100 |
Definition at line 71 of file bufmgr.c.
Referenced by BgBufferSync().
| double bgwriter_lru_multiplier = 2.0 |
Definition at line 72 of file bufmgr.c.
Referenced by BgBufferSync().
volatile BufferDesc* InProgressBuf = NULL [static] |
bool IsForInput [static] |
Definition at line 84 of file bufmgr.c.
Referenced by AbortBufferIO(), and StartBufferIO().
volatile BufferDesc* PinCountWaitBuf = NULL [static] |
| int target_prefetch_pages = 0 |
Definition at line 80 of file bufmgr.c.
Referenced by assign_effective_io_concurrency(), and BitmapHeapNext().
| bool track_io_timing = false |
Definition at line 73 of file bufmgr.c.
Referenced by FlushBuffer(), and ReadBuffer_common().
| bool zero_damaged_pages = false |
Definition at line 70 of file bufmgr.c.
Referenced by mdread(), and ReadBuffer_common().