Header And Logo

PostgreSQL
| The world's most advanced open source database.

Data Structures | Defines | Typedefs | Enumerations | Functions | Variables

xlog.h File Reference

#include "access/rmgr.h"
#include "access/xlogdefs.h"
#include "datatype/timestamp.h"
#include "lib/stringinfo.h"
#include "storage/buf.h"
#include "utils/pg_crc.h"
Include dependency graph for xlog.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  XLogRecord
struct  XLogRecData
struct  CheckpointStatsData

Defines

#define SizeOfXLogRecord   MAXALIGN(sizeof(XLogRecord))
#define XLogRecGetData(record)   ((char*) (record) + SizeOfXLogRecord)
#define XLR_INFO_MASK   0x0F
#define XLR_BKP_BLOCK_MASK   0x0F
#define XLR_MAX_BKP_BLOCKS   4
#define XLR_BKP_BLOCK(iblk)   (0x08 >> (iblk))
#define SYNC_METHOD_FSYNC   0
#define SYNC_METHOD_FDATASYNC   1
#define SYNC_METHOD_OPEN   2
#define SYNC_METHOD_FSYNC_WRITETHROUGH   3
#define SYNC_METHOD_OPEN_DSYNC   4
#define InHotStandby   (standbyState >= STANDBY_SNAPSHOT_PENDING)
#define XLogArchivingActive()   (XLogArchiveMode && wal_level >= WAL_LEVEL_ARCHIVE)
#define XLogArchiveCommandSet()   (XLogArchiveCommand[0] != '\0')
#define XLogIsNeeded()   (wal_level >= WAL_LEVEL_ARCHIVE)
#define XLogStandbyInfoActive()   (wal_level >= WAL_LEVEL_HOT_STANDBY)
#define CHECKPOINT_IS_SHUTDOWN   0x0001
#define CHECKPOINT_END_OF_RECOVERY   0x0002
#define CHECKPOINT_IMMEDIATE   0x0004
#define CHECKPOINT_FORCE   0x0008
#define CHECKPOINT_WAIT   0x0010
#define CHECKPOINT_CAUSE_XLOG   0x0020
#define CHECKPOINT_CAUSE_TIME   0x0040
#define BACKUP_LABEL_FILE   "backup_label"
#define BACKUP_LABEL_OLD   "backup_label.old"

Typedefs

typedef struct XLogRecord XLogRecord
typedef struct XLogRecData XLogRecData
typedef enum WalLevel WalLevel
typedef struct CheckpointStatsData CheckpointStatsData

Enumerations

enum  HotStandbyState { STANDBY_DISABLED, STANDBY_INITIALIZED, STANDBY_SNAPSHOT_PENDING, STANDBY_SNAPSHOT_READY }
enum  RecoveryTargetType { RECOVERY_TARGET_UNSET, RECOVERY_TARGET_XID, RECOVERY_TARGET_TIME, RECOVERY_TARGET_NAME }
enum  WalLevel { WAL_LEVEL_MINIMAL = 0, WAL_LEVEL_ARCHIVE, WAL_LEVEL_HOT_STANDBY }

Functions

XLogRecPtr XLogInsert (RmgrId rmid, uint8 info, XLogRecData *rdata)
void XLogFlush (XLogRecPtr RecPtr)
bool XLogBackgroundFlush (void)
bool XLogNeedsFlush (XLogRecPtr RecPtr)
int XLogFileInit (XLogSegNo segno, bool *use_existent, bool use_lock)
int XLogFileOpen (XLogSegNo segno)
XLogRecPtr XLogSaveBufferForHint (Buffer buffer)
void CheckXLogRemoved (XLogSegNo segno, TimeLineID tli)
void XLogSetAsyncXactLSN (XLogRecPtr record)
Buffer RestoreBackupBlock (XLogRecPtr lsn, XLogRecord *record, int block_index, bool get_cleanup_lock, bool keep_buffer)
void xlog_redo (XLogRecPtr lsn, XLogRecord *record)
void xlog_desc (StringInfo buf, uint8 xl_info, char *rec)
void issue_xlog_fsync (int fd, XLogSegNo segno)
bool RecoveryInProgress (void)
bool HotStandbyActive (void)
bool XLogInsertAllowed (void)
void GetXLogReceiptTime (TimestampTz *rtime, bool *fromStream)
XLogRecPtr GetXLogReplayRecPtr (TimeLineID *replayTLI)
XLogRecPtr GetXLogInsertRecPtr (void)
XLogRecPtr GetXLogWriteRecPtr (void)
bool RecoveryIsPaused (void)
void SetRecoveryPause (bool recoveryPause)
TimestampTz GetLatestXTime (void)
TimestampTz GetCurrentChunkReplayStartTime (void)
char * XLogFileNameP (TimeLineID tli, XLogSegNo segno)
void UpdateControlFile (void)
uint64 GetSystemIdentifier (void)
bool DataChecksumsEnabled (void)
XLogRecPtr GetFakeLSNForUnloggedRel (void)
Size XLOGShmemSize (void)
void XLOGShmemInit (void)
void BootStrapXLOG (void)
void StartupXLOG (void)
void ShutdownXLOG (int code, Datum arg)
void InitXLOGAccess (void)
void CreateCheckPoint (int flags)
bool CreateRestartPoint (int flags)
void XLogPutNextOid (Oid nextOid)
XLogRecPtr XLogRestorePoint (const char *rpName)
void UpdateFullPageWrites (void)
XLogRecPtr GetRedoRecPtr (void)
XLogRecPtr GetInsertRecPtr (void)
XLogRecPtr GetFlushRecPtr (void)
void GetNextXidAndEpoch (TransactionId *xid, uint32 *epoch)
bool CheckPromoteSignal (void)
void WakeupRecovery (void)
void SetWalWriterSleeping (bool sleeping)
XLogRecPtr do_pg_start_backup (const char *backupidstr, bool fast, TimeLineID *starttli_p, char **labelfile)
XLogRecPtr do_pg_stop_backup (char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
void do_pg_abort_backup (void)

Variables

int sync_method
PGDLLIMPORT TimeLineID ThisTimeLineID
bool InRecovery
HotStandbyState standbyState
XLogRecPtr XactLastRecEnd
bool reachedConsistency
int CheckPointSegments
int wal_keep_segments
int XLOGbuffers
int XLogArchiveTimeout
bool XLogArchiveMode
char * XLogArchiveCommand
bool EnableHotStandby
bool fullPageWrites
bool log_checkpoints
int wal_level
CheckpointStatsData CheckpointStats

Define Documentation

#define BACKUP_LABEL_FILE   "backup_label"
#define BACKUP_LABEL_OLD   "backup_label.old"

Definition at line 332 of file xlog.h.

Referenced by CancelBackup(), and StartupXLOG().

#define CHECKPOINT_CAUSE_TIME   0x0040

Definition at line 236 of file xlog.h.

Referenced by LogCheckpointStart().

#define CHECKPOINT_CAUSE_XLOG   0x0020

Definition at line 235 of file xlog.h.

Referenced by CheckpointerMain(), LogCheckpointStart(), XLogPageRead(), and XLogWrite().

#define CHECKPOINT_END_OF_RECOVERY   0x0002
#define CHECKPOINT_FORCE   0x0008
#define CHECKPOINT_IMMEDIATE   0x0004
#define CHECKPOINT_IS_SHUTDOWN   0x0001
#define CHECKPOINT_WAIT   0x0010
#define InHotStandby   (standbyState >= STANDBY_SNAPSHOT_PENDING)
#define SizeOfXLogRecord   MAXALIGN(sizeof(XLogRecord))
#define SYNC_METHOD_FDATASYNC   1

Definition at line 78 of file xlog.h.

Referenced by get_sync_bit(), and issue_xlog_fsync().

#define SYNC_METHOD_FSYNC   0

Definition at line 77 of file xlog.h.

Referenced by get_sync_bit(), and issue_xlog_fsync().

#define SYNC_METHOD_FSYNC_WRITETHROUGH   3

Definition at line 80 of file xlog.h.

Referenced by get_sync_bit(), issue_xlog_fsync(), and pg_fsync().

#define SYNC_METHOD_OPEN   2

Definition at line 79 of file xlog.h.

Referenced by get_sync_bit(), issue_xlog_fsync(), and XLogWrite().

#define SYNC_METHOD_OPEN_DSYNC   4

Definition at line 81 of file xlog.h.

Referenced by get_sync_bit(), issue_xlog_fsync(), and XLogWrite().

#define XLogArchiveCommandSet (  )     (XLogArchiveCommand[0] != '\0')

Definition at line 204 of file xlog.h.

Referenced by pgarch_ArchiverCopyLoop(), and ShutdownXLOG().

#define XLogArchivingActive (  )     (XLogArchiveMode && wal_level >= WAL_LEVEL_ARCHIVE)
#define XLogIsNeeded (  )     (wal_level >= WAL_LEVEL_ARCHIVE)
#define XLogRecGetData (   record  )     ((char*) (record) + SizeOfXLogRecord)
#define XLogStandbyInfoActive (  )     (wal_level >= WAL_LEVEL_HOT_STANDBY)
#define XLR_BKP_BLOCK (   iblk  )     (0x08 >> (iblk))
#define XLR_BKP_BLOCK_MASK   0x0F
#define XLR_INFO_MASK   0x0F

Definition at line 65 of file xlog.h.

Referenced by XLogInsert().

#define XLR_MAX_BKP_BLOCKS   4

Definition at line 73 of file xlog.h.

Referenced by ValidXLogRecordHeader().


Typedef Documentation

typedef enum WalLevel WalLevel
typedef struct XLogRecData XLogRecData
typedef struct XLogRecord XLogRecord

Enumeration Type Documentation

enum HotStandbyState
Enumerator:
STANDBY_DISABLED 
STANDBY_INITIALIZED 
STANDBY_SNAPSHOT_PENDING 
STANDBY_SNAPSHOT_READY 

Definition at line 155 of file xlog.h.

{
    STANDBY_DISABLED,
    STANDBY_INITIALIZED,
    STANDBY_SNAPSHOT_PENDING,
    STANDBY_SNAPSHOT_READY
} HotStandbyState;

enum RecoveryTargetType
Enumerator:
RECOVERY_TARGET_UNSET 
RECOVERY_TARGET_XID 
RECOVERY_TARGET_TIME 
RECOVERY_TARGET_NAME 

Definition at line 171 of file xlog.h.

{
    RECOVERY_TARGET_UNSET,
    RECOVERY_TARGET_XID,
    RECOVERY_TARGET_TIME,
    RECOVERY_TARGET_NAME
} RecoveryTargetType;

enum WalLevel
Enumerator:
WAL_LEVEL_MINIMAL 
WAL_LEVEL_ARCHIVE 
WAL_LEVEL_HOT_STANDBY 

Definition at line 195 of file xlog.h.

{
    WAL_LEVEL_MINIMAL = 0,
    WAL_LEVEL_ARCHIVE,
    WAL_LEVEL_HOT_STANDBY
} WalLevel;


Function Documentation

void BootStrapXLOG ( void   ) 

Definition at line 3993 of file xlog.c.

References ALIGNOF_XLOG_BUFFER, bootstrap_data_checksum_version, BootStrapCLOG(), BootStrapMultiXact(), BootStrapSUBTRANS(), ControlFileData::checkPoint, ControlFileData::checkPointCopy, close, COMP_CRC32, ControlFileData::data_checksum_version, ereport, errcode_for_file_access(), errmsg(), FIN_CRC32, fullPageWrites, CheckPoint::fullPageWrites, gettimeofday(), INIT_CRC32, max_locks_per_xact, ControlFileData::max_locks_per_xact, max_prepared_xacts, ControlFileData::max_prepared_xacts, MaxConnections, ControlFileData::MaxConnections, MultiXactSetNextMXact(), CheckPoint::nextMulti, CheckPoint::nextMultiOffset, VariableCacheData::nextOid, CheckPoint::nextOid, VariableCacheData::nextXid, CheckPoint::nextXid, CheckPoint::nextXidEpoch, NULL, offsetof, VariableCacheData::oidCount, CheckPoint::oldestActiveXid, CheckPoint::oldestMulti, CheckPoint::oldestMultiDB, CheckPoint::oldestXid, CheckPoint::oldestXidDB, openLogFile, palloc(), PANIC, pfree(), pg_fsync(), CheckPoint::PrevTimeLineID, CheckPoint::redo, SetMultiXactIdLimit(), SetTransactionIdLimit(), ShmemVariableCache, SizeOfXLogRecord, ControlFileData::state, ControlFileData::system_identifier, CheckPoint::ThisTimeLineID, ThisTimeLineID, ControlFileData::time, CheckPoint::time, TYPEALIGN, ControlFileData::unloggedLSN, wal_level, ControlFileData::wal_level, write, WriteControlFile(), XLogRecord::xl_info, XLogRecord::xl_len, XLogRecord::xl_prev, XLogRecord::xl_rmid, XLogRecord::xl_tot_len, XLogRecord::xl_xid, XLogFileInit(), XLogRecGetData, XLogSegSize, XLogPageHeaderData::xlp_info, XLogPageHeaderData::xlp_magic, XLogPageHeaderData::xlp_pageaddr, XLogLongPageHeaderData::xlp_seg_size, XLogLongPageHeaderData::xlp_sysid, XLogPageHeaderData::xlp_tli, and XLogLongPageHeaderData::xlp_xlog_blcksz.

Referenced by AuxiliaryProcessMain().

{
    /*
     * Bootstrap the transaction log for a brand-new installation: pick a
     * system identifier, build the first XLOG page containing the initial
     * shutdown checkpoint record, write and fsync it into segment 1, create
     * pg_control, and finally bootstrap CLOG, SUBTRANS and MultiXact.
     *
     * Takes no arguments and returns nothing; any I/O failure while writing
     * the first segment is reported at PANIC level.
     */
    CheckPoint  checkPoint;
    char       *buffer;
    XLogPageHeader page;
    XLogLongPageHeader longpage;
    XLogRecord *record;
    bool        use_existent;
    uint64      sysidentifier;
    struct timeval tv;
    pg_crc32    crc;

    /*
     * Select a hopefully-unique system identifier code for this installation.
     * We use the result of gettimeofday(), including the fractional seconds
     * field, as being about as unique as we can easily get.  (Think not to
     * use random(), since it hasn't been seeded and there's no portable way
     * to seed it other than the system clock value...)  The upper half of the
     * uint64 value is just the tv_sec part, while the lower half is the XOR
     * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
     * unnecessarily if "uint64" is really only 32 bits wide.  A person
     * knowing this encoding can determine the initialization time of the
     * installation, which could perhaps be useful sometimes.
     *
     * Note that the combination must really be XOR: a bitwise OR can only
     * turn bits on, so different (tv_sec, tv_usec) pairs would collapse onto
     * the same low word and the tv_usec part could not be recovered.
     */
    gettimeofday(&tv, NULL);
    sysidentifier = ((uint64) tv.tv_sec) << 32;
    sysidentifier |= (uint32) (tv.tv_sec ^ tv.tv_usec);

    /* First timeline ID is always 1 */
    ThisTimeLineID = 1;

    /* page buffer must be aligned suitably for O_DIRECT */
    buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
    page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
    memset(page, 0, XLOG_BLCKSZ);

    /*
     * Set up information for the initial checkpoint record
     *
     * The initial checkpoint record is written to the beginning of the WAL
     * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
     * used, so that we can use 0/0 to mean "before any valid WAL segment".
     */
    checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
    checkPoint.ThisTimeLineID = ThisTimeLineID;
    checkPoint.PrevTimeLineID = ThisTimeLineID;
    checkPoint.fullPageWrites = fullPageWrites;
    checkPoint.nextXidEpoch = 0;
    checkPoint.nextXid = FirstNormalTransactionId;
    checkPoint.nextOid = FirstBootstrapObjectId;
    checkPoint.nextMulti = FirstMultiXactId;
    checkPoint.nextMultiOffset = 0;
    checkPoint.oldestXid = FirstNormalTransactionId;
    checkPoint.oldestXidDB = TemplateDbOid;
    checkPoint.oldestMulti = FirstMultiXactId;
    checkPoint.oldestMultiDB = TemplateDbOid;
    checkPoint.time = (pg_time_t) time(NULL);
    checkPoint.oldestActiveXid = InvalidTransactionId;

    /* Seed the shared-memory counters from the initial checkpoint values */
    ShmemVariableCache->nextXid = checkPoint.nextXid;
    ShmemVariableCache->nextOid = checkPoint.nextOid;
    ShmemVariableCache->oidCount = 0;
    MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);

    /* Set up the XLOG page header */
    page->xlp_magic = XLOG_PAGE_MAGIC;
    page->xlp_info = XLP_LONG_HEADER;
    page->xlp_tli = ThisTimeLineID;
    page->xlp_pageaddr = XLogSegSize;
    longpage = (XLogLongPageHeader) page;
    longpage->xlp_sysid = sysidentifier;
    longpage->xlp_seg_size = XLogSegSize;
    longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;

    /* Insert the initial checkpoint record */
    record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
    record->xl_prev = 0;
    record->xl_xid = InvalidTransactionId;
    record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
    record->xl_len = sizeof(checkPoint);
    record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
    record->xl_rmid = RM_XLOG_ID;
    memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));

    /*
     * The CRC covers the record payload first, then the record header up to
     * (but not including) the xl_crc field itself.
     */
    INIT_CRC32(crc);
    COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
    COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
    FIN_CRC32(crc);
    record->xl_crc = crc;

    /* Create first XLOG segment file */
    use_existent = false;
    openLogFile = XLogFileInit(1, &use_existent, false);

    /* Write the first page with the initial record */
    errno = 0;
    if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    {
        /* if write didn't set errno, assume problem is no disk space */
        if (errno == 0)
            errno = ENOSPC;
        ereport(PANIC,
                (errcode_for_file_access(),
              errmsg("could not write bootstrap transaction log file: %m")));
    }

    if (pg_fsync(openLogFile) != 0)
        ereport(PANIC,
                (errcode_for_file_access(),
              errmsg("could not fsync bootstrap transaction log file: %m")));

    if (close(openLogFile))
        ereport(PANIC,
                (errcode_for_file_access(),
              errmsg("could not close bootstrap transaction log file: %m")));

    openLogFile = -1;

    /* Now create pg_control */

    memset(ControlFile, 0, sizeof(ControlFileData));
    /* Initialize pg_control status fields */
    ControlFile->system_identifier = sysidentifier;
    ControlFile->state = DB_SHUTDOWNED;
    ControlFile->time = checkPoint.time;
    ControlFile->checkPoint = checkPoint.redo;
    ControlFile->checkPointCopy = checkPoint;
    ControlFile->unloggedLSN = 1;

    /* Set important parameter values for use when replaying WAL */
    ControlFile->MaxConnections = MaxConnections;
    ControlFile->max_prepared_xacts = max_prepared_xacts;
    ControlFile->max_locks_per_xact = max_locks_per_xact;
    ControlFile->wal_level = wal_level;
    ControlFile->data_checksum_version = bootstrap_data_checksum_version;

    /* some additional ControlFile fields are set in WriteControlFile() */

    WriteControlFile();

    /* Bootstrap the commit log, too */
    BootStrapCLOG();
    BootStrapSUBTRANS();
    BootStrapMultiXact();

    /* Release the page buffer (page points into it, so free the base) */
    pfree(buffer);
}

bool CheckPromoteSignal ( void   ) 

Definition at line 9936 of file xlog.c.

References FAST_PROMOTE_SIGNAL_FILE, and PROMOTE_SIGNAL_FILE.

Referenced by sigusr1_handler().

{
    struct stat probe;

    /*
     * Report whether a promotion has been requested: true as soon as either
     * the regular or the fast promote signal file can be stat()ed.  The
     * regular file is probed first; the fast one only if that fails.
     */
    return stat(PROMOTE_SIGNAL_FILE, &probe) == 0 ||
        stat(FAST_PROMOTE_SIGNAL_FILE, &probe) == 0;
}

void CheckXLogRemoved ( XLogSegNo  segno,
TimeLineID  tli 
)

Definition at line 2840 of file xlog.c.

References ereport, errcode_for_file_access(), errmsg(), ERROR, filename, XLogCtlData::info_lck, XLogCtlData::lastRemovedSegNo, SpinLockAcquire, SpinLockRelease, and XLogFileName.

Referenced by perform_base_backup(), and XLogRead().

{
    /*
     * Raise an ERROR if the requested segment is at or before the last
     * segment recorded as removed in shared memory; otherwise do nothing.
     * tli is only used to build the file name shown in the message.
     */

    /* use volatile pointer to prevent code rearrangement */
    volatile XLogCtlData *ctl = XLogCtl;
    XLogSegNo   removedUpTo;
    char        segname[MAXFNAMELEN];

    /* Take a snapshot of the shared value under the spinlock. */
    SpinLockAcquire(&ctl->info_lck);
    removedUpTo = ctl->lastRemovedSegNo;
    SpinLockRelease(&ctl->info_lck);

    /* Segment is newer than anything removed so far: nothing to report. */
    if (segno > removedUpTo)
        return;

    XLogFileName(segname, tli, segno);
    ereport(ERROR,
            (errcode_for_file_access(),
             errmsg("requested WAL segment %s has already been removed",
                    segname)));
}

void CreateCheckPoint ( int  flags  ) 

Definition at line 6764 of file xlog.c.

References AdvanceXLInsertBuffer(), XLogRecData::buffer, ControlFileData::checkPoint, CHECKPOINT_END_OF_RECOVERY, CHECKPOINT_FORCE, CHECKPOINT_IS_SHUTDOWN, ControlFileData::checkPointCopy, CheckPointGuts(), CheckpointLock, CheckpointStatsData::ckpt_bufs_written, CheckpointStatsData::ckpt_segs_added, CheckpointStatsData::ckpt_segs_recycled, CheckpointStatsData::ckpt_segs_removed, CheckpointStatsData::ckpt_start_t, XLogCtlData::ckptXid, XLogCtlData::ckptXidEpoch, ControlFileLock, XLogCtlInsert::curridx, XLogRecData::data, elog, END_CRIT_SECTION, ereport, errmsg(), ERROR, XLogCtlInsert::fullPageWrites, CheckPoint::fullPageWrites, GetCurrentTimestamp(), GetOldestActiveTransactionId(), GetOldestXmin(), GetVirtualXIDsDelayingChkpt(), HaveVirtualXIDsDelayingChkpt(), XLogCtlData::info_lck, XLogCtlData::Insert, Insert(), INSERT_FREESPACE, INSERT_RECPTR, KeepLogSeg(), XLogRecData::len, LocalSetXLogInsertAllowed(), LocalXLogInsertAllowed, log_checkpoints, LogCheckpointEnd(), LogCheckpointStart(), LogStandbySnapshot(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), MAXALIGN, MemSet, ControlFileData::minRecoveryPoint, ControlFileData::minRecoveryPointTLI, MultiXactGetCheckptMulti(), NBuffers, XLogRecData::next, CheckPoint::nextMulti, CheckPoint::nextMultiOffset, VariableCacheData::nextOid, CheckPoint::nextOid, VariableCacheData::nextXid, CheckPoint::nextXid, CheckPoint::nextXidEpoch, NULL, VariableCacheData::oidCount, OidGenLock, CheckPoint::oldestActiveXid, CheckPoint::oldestMulti, CheckPoint::oldestMultiDB, VariableCacheData::oldestXid, CheckPoint::oldestXid, VariableCacheData::oldestXidDB, CheckPoint::oldestXidDB, PANIC, pfree(), pg_usleep(), PreallocXlogFiles(), ControlFileData::prevCheckPoint, XLogCtlData::PrevTimeLineID, CheckPoint::PrevTimeLineID, ProcLastRecPtr, RecoveryInProgress(), CheckPoint::redo, XLogCtlInsert::RedoRecPtr, RedoRecPtr, RemoveOldXlogFiles(), ShmemVariableCache, SizeOfXLogRecord, smgrpostckpt(), smgrpreckpt(), SpinLockAcquire, 
SpinLockRelease, START_CRIT_SECTION, ControlFileData::state, ThisTimeLineID, CheckPoint::ThisTimeLineID, CheckPoint::time, ControlFileData::time, TruncateSUBTRANS(), XLogCtlData::ulsn_lck, XLogCtlData::unloggedLSN, ControlFileData::unloggedLSN, UpdateControlFile(), WALInsertLock, XidGenLock, XLByteToSeg, XLOG_CHECKPOINT_ONLINE, XLOG_CHECKPOINT_SHUTDOWN, XLogFlush(), XLogInsert(), and XLogStandbyInfoActive.

Referenced by CheckpointerMain(), RequestCheckpoint(), ShutdownXLOG(), and StartupXLOG().

{
    /*
     * Perform a checkpoint.
     *
     * flags is a bitmask of CHECKPOINT_* bits.  This function itself tests
     * CHECKPOINT_IS_SHUTDOWN, CHECKPOINT_END_OF_RECOVERY and
     * CHECKPOINT_FORCE; the whole mask is also handed to
     * LogCheckpointStart() and CheckPointGuts().
     *
     * Overall sequence: take CheckpointLock, determine the REDO pointer under
     * WALInsertLock, wait for backends that are delaying the checkpoint,
     * flush data via CheckPointGuts(), insert and flush the checkpoint
     * record, update pg_control, then clean up and preallocate log segments.
     * CheckpointLock is held until the very end (or until the early return
     * taken when the checkpoint is skipped).
     */
    bool        shutdown;
    CheckPoint  checkPoint;
    XLogRecPtr  recptr;
    XLogCtlInsert *Insert = &XLogCtl->Insert;
    XLogRecData rdata;
    uint32      freespace;
    XLogSegNo   _logSegNo;
    VirtualTransactionId *vxids;
    int nvxids;

    /*
     * An end-of-recovery checkpoint is really a shutdown checkpoint, just
     * issued at a different time.
     */
    if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
        shutdown = true;
    else
        shutdown = false;

    /* sanity check */
    if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
        elog(ERROR, "can't create a checkpoint during recovery");

    /*
     * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
     * (This is just pro forma, since in the present system structure there is
     * only one process that is allowed to issue checkpoints at any given
     * time.)
     */
    LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

    /*
     * Prepare to accumulate statistics.
     *
     * Note: because it is possible for log_checkpoints to change while a
     * checkpoint proceeds, we always accumulate stats, even if
     * log_checkpoints is currently off.
     */
    MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

    /*
     * Use a critical section to force system panic if we have trouble.
     */
    START_CRIT_SECTION();

    if (shutdown)
    {
        /* Record in pg_control that a shutdown is now in progress. */
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->state = DB_SHUTDOWNING;
        ControlFile->time = (pg_time_t) time(NULL);
        UpdateControlFile();
        LWLockRelease(ControlFileLock);
    }

    /*
     * Let smgr prepare for checkpoint; this has to happen before we determine
     * the REDO pointer.  Note that smgr must not do anything that'd have to
     * be undone if we decide no checkpoint is needed.
     */
    smgrpreckpt();

    /* Begin filling in the checkpoint WAL record */
    MemSet(&checkPoint, 0, sizeof(checkPoint));
    checkPoint.time = (pg_time_t) time(NULL);

    /*
     * For Hot Standby, derive the oldestActiveXid before we fix the redo
     * pointer. This allows us to begin accumulating changes to assemble our
     * starting snapshot of locks and transactions.
     */
    if (!shutdown && XLogStandbyInfoActive())
        checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
    else
        checkPoint.oldestActiveXid = InvalidTransactionId;

    /*
     * We must hold WALInsertLock while examining insert state to determine
     * the checkpoint REDO pointer.
     */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

    /*
     * If this isn't a shutdown or forced checkpoint, and we have not inserted
     * any XLOG records since the start of the last checkpoint, skip the
     * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
     * when the system is idle. That wastes log space, and more importantly it
     * exposes us to possible loss of both current and previous checkpoint
     * records if the machine crashes just as we're writing the update.
     * (Perhaps it'd make even more sense to checkpoint only when the previous
     * checkpoint record is in a different xlog page?)
     *
     * We have to make two tests to determine that nothing has happened since
     * the start of the last checkpoint: current insertion point must match
     * the end of the last checkpoint record, and its redo pointer must point
     * to itself.
     */
    if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
                  CHECKPOINT_FORCE)) == 0)
    {
        XLogRecPtr  curInsert;

        INSERT_RECPTR(curInsert, Insert, Insert->curridx);
        if (curInsert == ControlFile->checkPoint + 
            MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
            ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
        {
            /*
             * Skip path: drop both locks and leave the critical section
             * before returning; no checkpoint record is written.
             */
            LWLockRelease(WALInsertLock);
            LWLockRelease(CheckpointLock);
            END_CRIT_SECTION();
            return;
        }
    }

    /*
     * An end-of-recovery checkpoint is created before anyone is allowed to
     * write WAL. To allow us to write the checkpoint record, temporarily
     * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
     * initialized, which we need here and in AdvanceXLInsertBuffer.)
     */
    if (flags & CHECKPOINT_END_OF_RECOVERY)
        LocalSetXLogInsertAllowed();

    checkPoint.ThisTimeLineID = ThisTimeLineID;
    if (flags & CHECKPOINT_END_OF_RECOVERY)
        checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    else
        checkPoint.PrevTimeLineID = ThisTimeLineID;

    checkPoint.fullPageWrites = Insert->fullPageWrites;

    /*
     * Compute new REDO record ptr = location of next XLOG record.
     *
     * NB: this is NOT necessarily where the checkpoint record itself will be,
     * since other backends may insert more XLOG records while we're off doing
     * the buffer flush work.  Those XLOG records are logically after the
     * checkpoint, even though physically before it.  Got that?
     */
    freespace = INSERT_FREESPACE(Insert);
    if (freespace == 0)
    {
        (void) AdvanceXLInsertBuffer(false);
        /* OK to ignore update return flag, since we will do flush anyway */
        freespace = INSERT_FREESPACE(Insert);
    }
    INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);

    /*
     * Here we update the shared RedoRecPtr for future XLogInsert calls; this
     * must be done while holding the insert lock AND the info_lck.
     *
     * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
     * pointing past where it really needs to point.  This is okay; the only
     * consequence is that XLogInsert might back up whole buffers that it
     * didn't really need to.  We can't postpone advancing RedoRecPtr because
     * XLogInserts that happen while we are dumping buffers must assume that
     * their buffer changes are not included in the checkpoint.
     */
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
        SpinLockRelease(&xlogctl->info_lck);
    }

    /*
     * Now we can release WAL insert lock, allowing other xacts to proceed
     * while we are flushing disk buffers.
     */
    LWLockRelease(WALInsertLock);

    /*
     * If enabled, log checkpoint start.  We postpone this until now so as not
     * to log anything if we decided to skip the checkpoint.
     */
    if (log_checkpoints)
        LogCheckpointStart(flags, false);

    TRACE_POSTGRESQL_CHECKPOINT_START(flags);

    /*
     * In some cases there are groups of actions that must all occur on
     * one side or the other of a checkpoint record. Before flushing the
     * checkpoint record we must explicitly wait for any backend currently
     * performing those groups of actions.
     *
     * One example is end of transaction, so we must wait for any transactions
     * that are currently in commit critical sections.  If an xact inserted
     * its commit record into XLOG just before the REDO point, then a crash
     * restart from the REDO point would not replay that record, which means
     * that our flushing had better include the xact's update of pg_clog.  So
     * we wait till he's out of his commit critical section before proceeding.
     * See notes in RecordTransactionCommit().
     *
     * Because we've already released WALInsertLock, this test is a bit fuzzy:
     * it is possible that we will wait for xacts we didn't really need to
     * wait for.  But the delay should be short and it seems better to make
     * checkpoint take a bit longer than to hold locks longer than necessary.
     * (In fact, the whole reason we have this issue is that xact.c does
     * commit record XLOG insertion and clog update as two separate steps
     * protected by different locks, but again that seems best on grounds of
     * minimizing lock contention.)
     *
     * A transaction that has not yet set delayChkpt when we look cannot be at
     * risk, since he's not inserted his commit record yet; and one that's
     * already cleared it is not at risk either, since he's done fixing clog
     * and we will correctly flush the update below.  So we cannot miss any
     * xacts we need to wait for.
     */
    vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
    if (nvxids > 0)
    {
        /*
         * NOTE(review): nwaits is incremented on every poll but never read
         * anywhere in this function.
         */
        uint32  nwaits = 0;

        do
        {
            pg_usleep(10000L);  /* wait for 10 msec */
            nwaits++;
        } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
    }
    pfree(vxids);

    /*
     * Get the other info we need for the checkpoint record.
     */
    LWLockAcquire(XidGenLock, LW_SHARED);
    checkPoint.nextXid = ShmemVariableCache->nextXid;
    checkPoint.oldestXid = ShmemVariableCache->oldestXid;
    checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
    LWLockRelease(XidGenLock);

    /* Increase XID epoch if we've wrapped around since last checkpoint */
    checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
    if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
        checkPoint.nextXidEpoch++;

    /*
     * For a non-shutdown checkpoint the recorded nextOid is advanced by
     * oidCount; a shutdown checkpoint records nextOid as-is.
     */
    LWLockAcquire(OidGenLock, LW_SHARED);
    checkPoint.nextOid = ShmemVariableCache->nextOid;
    if (!shutdown)
        checkPoint.nextOid += ShmemVariableCache->oidCount;
    LWLockRelease(OidGenLock);

    MultiXactGetCheckptMulti(shutdown,
                             &checkPoint.nextMulti,
                             &checkPoint.nextMultiOffset,
                             &checkPoint.oldestMulti,
                             &checkPoint.oldestMultiDB);

    /*
     * Having constructed the checkpoint record, ensure all shmem disk buffers
     * and commit-log buffers are flushed to disk.
     *
     * This I/O could fail for various reasons.  If so, we will fail to
     * complete the checkpoint, but there is no reason to force a system
     * panic. Accordingly, exit critical section while doing it.
     */
    END_CRIT_SECTION();

    CheckPointGuts(checkPoint.redo, flags);

    /*
     * Take a snapshot of running transactions and write this to WAL. This
     * allows us to reconstruct the state of running transactions during
     * archive recovery, if required. Skip, if this info disabled.
     *
     * If we are shutting down, or Startup process is completing crash
     * recovery we don't need to write running xact data.
     */
    if (!shutdown && XLogStandbyInfoActive())
        LogStandbySnapshot();

    START_CRIT_SECTION();

    /*
     * Now insert the checkpoint record into XLOG.
     */
    rdata.data = (char *) (&checkPoint);
    rdata.len = sizeof(checkPoint);
    rdata.buffer = InvalidBuffer;
    rdata.next = NULL;

    recptr = XLogInsert(RM_XLOG_ID,
                        shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
                        XLOG_CHECKPOINT_ONLINE,
                        &rdata);

    XLogFlush(recptr);

    /*
     * We mustn't write any new WAL after a shutdown checkpoint, or it will be
     * overwritten at next startup.  No-one should even try, this just allows
     * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
     * to just temporarily disable writing until the system has exited
     * recovery.
     */
    if (shutdown)
    {
        if (flags & CHECKPOINT_END_OF_RECOVERY)
            LocalXLogInsertAllowed = -1;        /* return to "check" state */
        else
            LocalXLogInsertAllowed = 0; /* never again write WAL */
    }

    /*
     * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
     * = end of actual checkpoint record.
     */
    if (shutdown && checkPoint.redo != ProcLastRecPtr)
        ereport(PANIC,
                (errmsg("concurrent transaction log activity while database system is shutting down")));

    /*
     * Select point at which we can truncate the log, which we base on the
     * prior checkpoint's earliest info.
     */
    XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);

    /*
     * Update the control file.
     */
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    if (shutdown)
        ControlFile->state = DB_SHUTDOWNED;
    ControlFile->prevCheckPoint = ControlFile->checkPoint;
    ControlFile->checkPoint = ProcLastRecPtr;
    ControlFile->checkPointCopy = checkPoint;
    ControlFile->time = (pg_time_t) time(NULL);
    /* crash recovery should always recover to the end of WAL */
    ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
    ControlFile->minRecoveryPointTLI = 0;

    /*
     * Persist unloggedLSN value. It's reset on crash recovery, so this goes
     * unused on non-shutdown checkpoints, but seems useful to store it always
     * for debugging purposes.
     */
    SpinLockAcquire(&XLogCtl->ulsn_lck);
    ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
    SpinLockRelease(&XLogCtl->ulsn_lck);

    UpdateControlFile();
    LWLockRelease(ControlFileLock);

    /* Update shared-memory copy of checkpoint XID/epoch */
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
        xlogctl->ckptXid = checkPoint.nextXid;
        SpinLockRelease(&xlogctl->info_lck);
    }

    /*
     * We are now done with critical updates; no need for system panic if we
     * have trouble while fooling with old log segments.
     */
    END_CRIT_SECTION();

    /*
     * Let smgr do post-checkpoint cleanup (eg, deleting old files).
     */
    smgrpostckpt();

    /*
     * Delete old log files (those no longer needed even for previous
     * checkpoint or the standbys in XLOG streaming).
     *
     * NOTE(review): the zero test presumably protects the decrement below
     * from wrapping when the prior redo pointer lies in segment 0 -- confirm
     * against the definition of XLogSegNo.
     */
    if (_logSegNo)
    {
        KeepLogSeg(recptr, &_logSegNo);
        _logSegNo--;
        RemoveOldXlogFiles(_logSegNo, recptr);
    }

    /*
     * Make more log segments if needed.  (Do this after recycling old log
     * segments, since that may supply some of the needed files.)
     */
    if (!shutdown)
        PreallocXlogFiles(recptr);

    /*
     * Truncate pg_subtrans if possible.  We can throw away all data before
     * the oldest XMIN of any running transaction.  No future transaction will
     * attempt to reference any pg_subtrans entry older than that (see Asserts
     * in subtrans.c).  During recovery, though, we mustn't do this because
     * StartupSUBTRANS hasn't been called yet.
     */
    if (!RecoveryInProgress())
        TruncateSUBTRANS(GetOldestXmin(true, false));

    /* Real work is done, but log and update stats before releasing lock. */
    LogCheckpointEnd(false);

    TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                     NBuffers,
                                     CheckpointStats.ckpt_segs_added,
                                     CheckpointStats.ckpt_segs_removed,
                                     CheckpointStats.ckpt_segs_recycled);

    LWLockRelease(CheckpointLock);
}

bool CreateRestartPoint ( int  flags  ) 

Definition at line 7326 of file xlog.c.

References XLogCtlData::archiveCleanupCommand, ControlFileData::checkPoint, CHECKPOINT_IS_SHUTDOWN, ControlFileData::checkPointCopy, CheckPointGuts(), CheckpointLock, CheckpointStatsData::ckpt_start_t, ControlFileLock, DB_IN_ARCHIVE_RECOVERY, DEBUG2, EnableHotStandby, ereport, errdetail(), errmsg(), ExecuteRecoveryCommand(), GetCurrentTimestamp(), GetLatestXTime(), GetOldestXmin(), GetWalRcvWriteRecPtr(), GetXLogReplayRecPtr(), XLogCtlData::info_lck, XLogCtlData::Insert, InvalidXLogRecPtr, KeepLogSeg(), XLogCtlData::lastCheckPoint, XLogCtlData::lastCheckPointRecPtr, LOG, log_checkpoints, LogCheckpointEnd(), LogCheckpointStart(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MemSet, NULL, PreallocXlogFiles(), ControlFileData::prevCheckPoint, RecoveryInProgress(), CheckPoint::redo, XLogCtlInsert::RedoRecPtr, RemoveOldXlogFiles(), SpinLockAcquire, SpinLockRelease, ControlFileData::state, ThisTimeLineID, ControlFileData::time, timestamptz_to_str(), TruncateSUBTRANS(), UpdateControlFile(), UpdateMinRecoveryPoint(), WALInsertLock, XLByteToSeg, and XLogRecPtrIsInvalid.

Referenced by CheckpointerMain(), and ShutdownXLOG().

/*
 * Body of CreateRestartPoint(int flags).
 *
 * Attempts to establish a restartpoint based on the checkpoint record
 * remembered in shared memory (XLogCtl->lastCheckPoint and
 * XLogCtl->lastCheckPointRecPtr).
 *
 * flags: checkpoint request flags.  CHECKPOINT_IS_SHUTDOWN is tested here to
 * decide whether pg_control is marked DB_SHUTDOWNED_IN_RECOVERY; the full
 * value is also handed to LogCheckpointStart() and CheckPointGuts().
 *
 * Returns false when the restartpoint is skipped by one of the two early
 * checks (recovery is no longer in progress, or the remembered checkpoint is
 * invalid / not newer than the one already in pg_control).  Returns true
 * otherwise, including the case where the pg_control sanity check further
 * down declines to update the control file.
 *
 * CheckpointLock is taken exclusively on entry and released on every return
 * path; on the success path it is released before archive_cleanup_command
 * is executed.
 */
{
    XLogRecPtr  lastCheckPointRecPtr;
    CheckPoint  lastCheckPoint;
    XLogSegNo   _logSegNo;
    TimestampTz xtime;

    /* use volatile pointer to prevent code rearrangement */
    volatile XLogCtlData *xlogctl = XLogCtl;

    /*
     * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
     * happens at a time.
     */
    LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

    /* Get a local copy of the last safe checkpoint record. */
    SpinLockAcquire(&xlogctl->info_lck);
    lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
    lastCheckPoint = xlogctl->lastCheckPoint;
    SpinLockRelease(&xlogctl->info_lck);

    /*
     * Check that we're still in recovery mode. It's ok if we exit recovery
     * mode after this check, the restart point is valid anyway.
     */
    if (!RecoveryInProgress())
    {
        ereport(DEBUG2,
              (errmsg("skipping restartpoint, recovery has already ended")));
        LWLockRelease(CheckpointLock);
        return false;
    }

    /*
     * If the last checkpoint record we've replayed is already our last
     * restartpoint, we can't perform a new restart point. We still update
     * minRecoveryPoint in that case, so that if this is a shutdown restart
     * point, we won't start up earlier than before. That's not strictly
     * necessary, but when hot standby is enabled, it would be rather weird if
     * the database opened up for read-only connections at a point-in-time
     * before the last shutdown. Such time travel is still possible in case of
     * immediate shutdown, though.
     *
     * We don't explicitly advance minRecoveryPoint when we do create a
     * restartpoint. It's assumed that flushing the buffers will do that as a
     * side-effect.
     *
     * NOTE(review): ControlFile->checkPointCopy.redo is read here (and again
     * for XLByteToSeg below) without holding ControlFileLock; confirm that
     * holding CheckpointLock is sufficient protection for this field.
     */
    if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
        lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
    {
        ereport(DEBUG2,
                (errmsg("skipping restartpoint, already performed at %X/%X",
                        (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo)));

        UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
        if (flags & CHECKPOINT_IS_SHUTDOWN)
        {
            /* Record the shutdown state even though no restartpoint is made. */
            LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
            ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
            ControlFile->time = (pg_time_t) time(NULL);
            UpdateControlFile();
            LWLockRelease(ControlFileLock);
        }
        LWLockRelease(CheckpointLock);
        return false;
    }

    /*
     * Update the shared RedoRecPtr so that the startup process can calculate
     * the number of segments replayed since last restartpoint, and request a
     * restartpoint if it exceeds checkpoint_segments.
     *
     * You need to hold WALInsertLock and info_lck to update it, although
     * during recovery acquiring WALInsertLock is just pro forma, because
     * there is no other processes updating Insert.RedoRecPtr.
     */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    SpinLockAcquire(&xlogctl->info_lck);
    xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
    SpinLockRelease(&xlogctl->info_lck);
    LWLockRelease(WALInsertLock);

    /*
     * Prepare to accumulate statistics.
     *
     * Note: because it is possible for log_checkpoints to change while a
     * checkpoint proceeds, we always accumulate stats, even if
     * log_checkpoints is currently off.
     */
    MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

    if (log_checkpoints)
        LogCheckpointStart(flags, true);

    CheckPointGuts(lastCheckPoint.redo, flags);

    /*
     * Select point at which we can truncate the xlog, which we base on the
     * prior checkpoint's earliest info.
     *
     * This must be computed before pg_control is updated below, because the
     * update overwrites checkPointCopy with the new checkpoint.
     */
    XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);

    /*
     * Update pg_control, using current time.  Check that it still shows
     * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
     * this is a quick hack to make sure nothing really bad happens if somehow
     * we get here after the end-of-recovery checkpoint.
     */
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
        ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
    {
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
        ControlFile->checkPoint = lastCheckPointRecPtr;
        ControlFile->checkPointCopy = lastCheckPoint;
        ControlFile->time = (pg_time_t) time(NULL);
        if (flags & CHECKPOINT_IS_SHUTDOWN)
            ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
        UpdateControlFile();
    }
    LWLockRelease(ControlFileLock);

    /*
     * Delete old log files (those no longer needed even for previous
     * checkpoint/restartpoint) to prevent the disk holding the xlog from
     * growing full.
     *
     * _logSegNo was derived above from the previous restartpoint's redo
     * pointer; when it is zero the whole cleanup/preallocation step is
     * skipped.
     */
    if (_logSegNo)
    {
        XLogRecPtr  receivePtr;
        XLogRecPtr  replayPtr;
        XLogRecPtr  endptr;

        /*
         * Get the current end of xlog replayed or received, whichever is later.
         */
        receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
        replayPtr = GetXLogReplayRecPtr(NULL);
        endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;

        /*
         * KeepLogSeg() receives _logSegNo by address and may adjust it.
         * NOTE(review): the decrement presumably converts "oldest segment to
         * keep" into "newest segment that may be removed" for
         * RemoveOldXlogFiles() -- confirm against that function.
         */
        KeepLogSeg(endptr, &_logSegNo);
        _logSegNo--;

        /*
         * Update ThisTimeLineID to the timeline we're currently replaying,
         * so that we install any recycled segments on that timeline.
         *
         * There is no guarantee that the WAL segments will be useful on the
         * current timeline; if recovery proceeds to a new timeline right
         * after this, the pre-allocated WAL segments on this timeline will
         * not be used, and will go wasted until recycled on the next
         * restartpoint. We'll live with that.
         */
        (void) GetXLogReplayRecPtr(&ThisTimeLineID);

        RemoveOldXlogFiles(_logSegNo, endptr);

        /*
         * Make more log segments if needed.  (Do this after recycling old log
         * segments, since that may supply some of the needed files.)
         */
        PreallocXlogFiles(endptr);
    }

    /*
     * Truncate pg_subtrans if possible.  We can throw away all data before
     * the oldest XMIN of any running transaction.  No future transaction will
     * attempt to reference any pg_subtrans entry older than that (see Asserts
     * in subtrans.c).  When hot standby is disabled, though, we mustn't do
     * this because StartupSUBTRANS hasn't been called yet.
     */
    if (EnableHotStandby)
        TruncateSUBTRANS(GetOldestXmin(true, false));

    /* Real work is done, but log and update before releasing lock. */
    LogCheckpointEnd(true);

    /*
     * Report the restartpoint location: at LOG level when log_checkpoints is
     * on, otherwise at DEBUG2.  The errdetail is attached only when xtime is
     * nonzero.
     */
    xtime = GetLatestXTime();
    ereport((log_checkpoints ? LOG : DEBUG2),
            (errmsg("recovery restart point at %X/%X",
                    (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
           xtime ? errdetail("last completed transaction was at log time %s",
                             timestamptz_to_str(xtime)) : 0));

    LWLockRelease(CheckpointLock);

    /*
     * Finally, execute archive_cleanup_command, if any.
     *
     * This runs after CheckpointLock has been released, so the external
     * command does not hold up other checkpoints/restartpoints.
     */
    if (XLogCtl->archiveCleanupCommand[0])
        ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
                               "archive_cleanup_command",
                               false);

    return true;
}

bool DataChecksumsEnabled ( void   ) 
void do_pg_abort_backup ( void   ) 
XLogRecPtr do_pg_start_backup ( const char *  backupidstr,
bool  fast,
TimeLineID *  starttli_p,
char **  labelfile 
)

Definition at line 8397 of file xlog.c.

References AllocateFile(), appendStringInfo(), BACKUP_LABEL_FILE, backup_started_in_recovery, BoolGetDatum, ControlFileData::checkPoint, CHECKPOINT_FORCE, CHECKPOINT_IMMEDIATE, CHECKPOINT_WAIT, ControlFileData::checkPointCopy, ControlFileLock, StringInfoData::data, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, XLogCtlInsert::exclusiveBackup, XLogCtlInsert::forcePageWrites, FreeFile(), CheckPoint::fullPageWrites, GetUserId(), has_rolreplication(), XLogCtlData::info_lck, initStringInfo(), XLogCtlData::Insert, XLogCtlInsert::lastBackupStart, XLogCtlData::lastFpwDisableRecPtr, StringInfoData::len, log_timezone, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), MAXPGPATH, XLogCtlInsert::nonExclusiveBackups, NULL, pfree(), PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, pg_fsync(), pg_localtime(), pg_start_backup_callback(), pg_strftime(), RecoveryInProgress(), CheckPoint::redo, RequestCheckpoint(), RequestXLogSwitch(), SpinLockAcquire, SpinLockRelease, superuser(), ThisTimeLineID, CheckPoint::ThisTimeLineID, WALInsertLock, XLByteToSeg, XLogFileName, and XLogIsNeeded.

Referenced by perform_base_backup(), and pg_start_backup().

/*
 * Body of do_pg_start_backup().
 *
 * Starts an online base backup: marks the backup active in shared memory,
 * forces a checkpoint (a restartpoint during recovery), and builds the
 * backup label text.
 *
 * backupidstr: user-supplied label; rejected if longer than MAXPGPATH bytes
 *      and emitted verbatim on the "LABEL:" line.
 * fast: when true, CHECKPOINT_IMMEDIATE is added to the checkpoint request.
 * starttli_p: if non-NULL, receives the timeline ID stored with the starting
 *      checkpoint in pg_control.
 * labelfile: NULL selects an exclusive backup, in which case the label is
 *      written to BACKUP_LABEL_FILE.  Otherwise the backup is non-exclusive
 *      and *labelfile is set to the label contents (the StringInfo buffer
 *      allocated here; it is not freed by this function).
 *
 * Returns the starting WAL location, i.e. the REDO pointer of the checkpoint
 * read back from pg_control.
 *
 * All failures are reported with ereport(ERROR).  Once the shared-memory
 * backup state has been changed, pg_start_backup_callback is registered via
 * PG_ENSURE_ERROR_CLEANUP so that an error undoes it.
 */
{
    bool        exclusive = (labelfile == NULL);
    bool        backup_started_in_recovery = false;
    XLogRecPtr  checkpointloc;
    XLogRecPtr  startpoint;
    TimeLineID  starttli;
    pg_time_t   stamp_time;
    char        strfbuf[128];
    char        xlogfilename[MAXFNAMELEN];
    XLogSegNo   _logSegNo;
    struct stat stat_buf;
    FILE       *fp;
    StringInfoData labelfbuf;

    backup_started_in_recovery = RecoveryInProgress();

    if (!superuser() && !has_rolreplication(GetUserId()))
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
           errmsg("must be superuser or replication role to run a backup")));

    /*
     * Currently only non-exclusive backup can be taken during recovery.
     */
    if (backup_started_in_recovery && exclusive)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("recovery is in progress"),
                 errhint("WAL control functions cannot be executed during recovery.")));

    /*
     * During recovery, we don't need to check WAL level. Because, if WAL
     * level is not sufficient, it's impossible to get here during recovery.
     */
    if (!backup_started_in_recovery && !XLogIsNeeded())
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
              errmsg("WAL level not sufficient for making an online backup"),
                 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

    if (strlen(backupidstr) > MAXPGPATH)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("backup label too long (max %d bytes)",
                        MAXPGPATH)));

    /*
     * Mark backup active in shared memory.  We must do full-page WAL writes
     * during an on-line backup even if not doing so at other times, because
     * it's quite possible for the backup dump to obtain a "torn" (partially
     * written) copy of a database page if it reads the page concurrently with
     * our write to the same page.  This can be fixed as long as the first
     * write to the page in the WAL sequence is a full-page write. Hence, we
     * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
     * are no dirty pages in shared memory that might get dumped while the
     * backup is in progress without having a corresponding WAL record.  (Once
     * the backup is complete, we need not force full-page writes anymore,
     * since we expect that any pages not modified during the backup interval
     * must have been correctly captured by the backup.)
     *
     * Note that forcePageWrites has no effect during an online backup from
     * the standby.
     *
     * We must hold WALInsertLock to change the value of forcePageWrites, to
     * ensure adequate interlocking against XLogInsert().
     */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    if (exclusive)
    {
        if (XLogCtl->Insert.exclusiveBackup)
        {
            /* Drop the lock before raising the error. */
            LWLockRelease(WALInsertLock);
            ereport(ERROR,
                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                     errmsg("a backup is already in progress"),
                     errhint("Run pg_stop_backup() and try again.")));
        }
        XLogCtl->Insert.exclusiveBackup = true;
    }
    else
        XLogCtl->Insert.nonExclusiveBackups++;
    XLogCtl->Insert.forcePageWrites = true;
    LWLockRelease(WALInsertLock);

    /* Ensure we release forcePageWrites if fail below */
    PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
    {
        bool        gotUniqueStartpoint = false;

        /*
         * Force an XLOG file switch before the checkpoint, to ensure that the
         * WAL segment the checkpoint is written to doesn't contain pages with
         * old timeline IDs.  That would otherwise happen if you called
         * pg_start_backup() right after restoring from a PITR archive: the
         * first WAL segment containing the startup checkpoint has pages in
         * the beginning with the old timeline ID.  That can cause trouble at
         * recovery: we won't have a history file covering the old timeline if
         * pg_xlog directory was not included in the base backup and the WAL
         * archive was cleared too before starting the backup.
         *
         * This also ensures that we have emitted a WAL page header that has
         * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
         * Therefore, if a WAL archiver (such as pglesslog) is trying to
         * compress out removable backup blocks, it won't remove any that
         * occur after this point.
         *
         * During recovery, we skip forcing XLOG file switch, which means that
         * the backup taken during recovery is not available for the special
         * recovery case described above.
         */
        if (!backup_started_in_recovery)
            RequestXLogSwitch();

        do
        {
            bool        checkpointfpw;

            /*
             * Force a CHECKPOINT.  Aside from being necessary to prevent torn
             * page problems, this guarantees that two successive backup runs
             * will have different checkpoint positions and hence different
             * history file names, even if nothing happened in between.
             *
             * During recovery, establish a restartpoint if possible. We use
             * the last restartpoint as the backup starting checkpoint. This
             * means that two successive backup runs can have same checkpoint
             * positions.
             *
             * Since the fact that we are executing do_pg_start_backup()
             * during recovery means that checkpointer is running, we can use
             * RequestCheckpoint() to establish a restartpoint.
             *
             * We use CHECKPOINT_IMMEDIATE only if requested by user (via
             * passing fast = true).  Otherwise this can take awhile.
             */
            RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
                              (fast ? CHECKPOINT_IMMEDIATE : 0));

            /*
             * Now we need to fetch the checkpoint record location, and also
             * its REDO pointer.  The oldest point in WAL that would be needed
             * to restore starting from the checkpoint is precisely the REDO
             * pointer.
             */
            LWLockAcquire(ControlFileLock, LW_SHARED);
            checkpointloc = ControlFile->checkPoint;
            startpoint = ControlFile->checkPointCopy.redo;
            starttli = ControlFile->checkPointCopy.ThisTimeLineID;
            checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
            LWLockRelease(ControlFileLock);

            if (backup_started_in_recovery)
            {
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;
                XLogRecPtr  recptr;

                /*
                 * Check to see if all WAL replayed during online backup
                 * (i.e., since last restartpoint used as backup starting
                 * checkpoint) contain full-page writes.
                 */
                SpinLockAcquire(&xlogctl->info_lck);
                recptr = xlogctl->lastFpwDisableRecPtr;
                SpinLockRelease(&xlogctl->info_lck);

                if (!checkpointfpw || startpoint <= recptr)
                    ereport(ERROR,
                          (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                           errmsg("WAL generated with full_page_writes=off was replayed "
                                  "since last restartpoint"),
                           errhint("This means that the backup being taken on the standby "
                                   "is corrupt and should not be used. "
                                   "Enable full_page_writes and run CHECKPOINT on the master, "
                                   "and then try an online backup again.")));

                /*
                 * During recovery, since we don't use the end-of-backup WAL
                 * record and don't write the backup history file, the
                 * starting WAL location doesn't need to be unique. This means
                 * that two base backups started at the same time might use
                 * the same checkpoint as starting locations.
                 */
                gotUniqueStartpoint = true;
            }

            /*
             * If two base backups are started at the same time (in WAL sender
             * processes), we need to make sure that they use different
             * checkpoints as starting locations, because we use the starting
             * WAL location as a unique identifier for the base backup in the
             * end-of-backup WAL record and when we write the backup history
             * file. Perhaps it would be better generate a separate unique ID
             * for each backup instead of forcing another checkpoint, but
             * taking a checkpoint right after another is not that expensive
             * either because only few buffers have been dirtied yet.
             *
             * The test-and-set of lastBackupStart must be done under an
             * exclusive lock: with only a shared lock, two backends could
             * both observe lastBackupStart < startpoint for the same
             * checkpoint and both conclude that their start point is unique.
             */
            LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
            if (XLogCtl->Insert.lastBackupStart < startpoint)
            {
                XLogCtl->Insert.lastBackupStart = startpoint;
                gotUniqueStartpoint = true;
            }
            LWLockRelease(WALInsertLock);
        } while (!gotUniqueStartpoint);

        /*
         * NOTE(review): the segment file name is built with ThisTimeLineID,
         * not with starttli read from pg_control above; confirm the two agree
         * when the backup is started during recovery.
         */
        XLByteToSeg(startpoint, _logSegNo);
        XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);

        /*
         * Construct backup label file
         */
        initStringInfo(&labelfbuf);

        /* Use the log timezone here, not the session timezone */
        stamp_time = (pg_time_t) time(NULL);
        pg_strftime(strfbuf, sizeof(strfbuf),
                    "%Y-%m-%d %H:%M:%S %Z",
                    pg_localtime(&stamp_time, log_timezone));
        appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
                         (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
        appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
                         (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
        appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
                         exclusive ? "pg_start_backup" : "streamed");
        appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
                         backup_started_in_recovery ? "standby" : "master");
        appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
        appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);

        /*
         * Okay, write the file, or return its contents to caller.
         */
        if (exclusive)
        {
            /*
             * Check for existing backup label --- implies a backup is already
             * running.  (XXX given that we checked exclusiveBackup above,
             * maybe it would be OK to just unlink any such label file?)
             */
            if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
            {
                /* ENOENT is the expected case: no label file exists yet. */
                if (errno != ENOENT)
                    ereport(ERROR,
                            (errcode_for_file_access(),
                             errmsg("could not stat file \"%s\": %m",
                                    BACKUP_LABEL_FILE)));
            }
            else
                ereport(ERROR,
                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                         errmsg("a backup is already in progress"),
                         errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
                                 BACKUP_LABEL_FILE)));

            fp = AllocateFile(BACKUP_LABEL_FILE, "w");

            if (!fp)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not create file \"%s\": %m",
                                BACKUP_LABEL_FILE)));
            /* Write, flush and fsync the label, then close it. */
            if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
                fflush(fp) != 0 ||
                pg_fsync(fileno(fp)) != 0 ||
                ferror(fp) ||
                FreeFile(fp))
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not write file \"%s\": %m",
                                BACKUP_LABEL_FILE)));
            pfree(labelfbuf.data);
        }
        else
            *labelfile = labelfbuf.data;
    }
    PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));

    /*
     * We're done.  As a convenience, return the starting WAL location.
     */
    if (starttli_p)
        *starttli_p = starttli;
    return startpoint;
}

XLogRecPtr do_pg_stop_backup ( char *  labelfile,
bool  waitforarchive,
TimeLineID *  stoptli_p 
)

Definition at line 8723 of file xlog.c.

References AllocateFile(), Assert, BACKUP_LABEL_FILE, backup_started_in_recovery, BackupHistoryFileName, BackupHistoryFilePath, XLogRecData::buffer, CHECK_FOR_INTERRUPTS, CleanupBackupHistory(), ControlFileLock, XLogRecData::data, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, XLogCtlInsert::exclusiveBackup, XLogCtlInsert::forcePageWrites, FreeFile(), GetUserId(), has_rolreplication(), XLogCtlData::info_lck, XLogCtlData::Insert, XLogCtlData::lastFpwDisableRecPtr, XLogRecData::len, log_timezone, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), ControlFileData::minRecoveryPoint, ControlFileData::minRecoveryPointTLI, XLogRecData::next, XLogCtlInsert::nonExclusiveBackups, NOTICE, NULL, palloc(), pg_localtime(), pg_strftime(), pg_usleep(), RecoveryInProgress(), remaining, RequestXLogSwitch(), SpinLockAcquire, SpinLockRelease, superuser(), ThisTimeLineID, unlink(), WALInsertLock, WARNING, XLByteToPrevSeg, XLByteToSeg, XLOG_BACKUP_END, XLogArchiveIsBusy(), XLogArchivingActive, XLogFileName, XLogInsert(), XLogIsNeeded, and XLogSegSize.

Referenced by perform_base_backup(), and pg_stop_backup().

{
    bool        exclusive = (labelfile == NULL);
    bool        backup_started_in_recovery = false;
    XLogRecPtr  startpoint;
    XLogRecPtr  stoppoint;
    TimeLineID  stoptli;
    XLogRecData rdata;
    pg_time_t   stamp_time;
    char        strfbuf[128];
    char        histfilepath[MAXPGPATH];
    char        startxlogfilename[MAXFNAMELEN];
    char        stopxlogfilename[MAXFNAMELEN];
    char        lastxlogfilename[MAXFNAMELEN];
    char        histfilename[MAXFNAMELEN];
    char        backupfrom[20];
    XLogSegNo   _logSegNo;
    FILE       *lfp;
    FILE       *fp;
    char        ch;
    int         seconds_before_warning;
    int         waits = 0;
    bool        reported_waiting = false;
    char       *remaining;
    char       *ptr;
    uint32      hi,
                lo;

    backup_started_in_recovery = RecoveryInProgress();

    if (!superuser() && !has_rolreplication(GetUserId()))
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
         (errmsg("must be superuser or replication role to run a backup"))));

    /*
     * Currently only non-exclusive backup can be taken during recovery.
     */
    if (backup_started_in_recovery && exclusive)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("recovery is in progress"),
                 errhint("WAL control functions cannot be executed during recovery.")));

    /*
     * During recovery, we don't need to check WAL level. Because, if WAL
     * level is not sufficient, it's impossible to get here during recovery.
     */
    if (!backup_started_in_recovery && !XLogIsNeeded())
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
              errmsg("WAL level not sufficient for making an online backup"),
                 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

    /*
     * OK to update backup counters and forcePageWrites
     */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    if (exclusive)
        XLogCtl->Insert.exclusiveBackup = false;
    else
    {
        /*
         * The user-visible pg_start/stop_backup() functions that operate on
         * exclusive backups can be called at any time, but for non-exclusive
         * backups, it is expected that each do_pg_start_backup() call is
         * matched by exactly one do_pg_stop_backup() call.
         */
        Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
        XLogCtl->Insert.nonExclusiveBackups--;
    }

    if (!XLogCtl->Insert.exclusiveBackup &&
        XLogCtl->Insert.nonExclusiveBackups == 0)
    {
        XLogCtl->Insert.forcePageWrites = false;
    }
    LWLockRelease(WALInsertLock);

    if (exclusive)
    {
        /*
         * Read the existing label file into memory.
         */
        struct stat statbuf;
        int         r;

        if (stat(BACKUP_LABEL_FILE, &statbuf))
        {
            if (errno != ENOENT)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not stat file \"%s\": %m",
                                BACKUP_LABEL_FILE)));
            ereport(ERROR,
                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                     errmsg("a backup is not in progress")));
        }

        lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
        if (!lfp)
        {
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read file \"%s\": %m",
                            BACKUP_LABEL_FILE)));
        }
        labelfile = palloc(statbuf.st_size + 1);
        r = fread(labelfile, statbuf.st_size, 1, lfp);
        labelfile[statbuf.st_size] = '\0';

        /*
         * Close and remove the backup label file
         */
        if (r != 1 || ferror(lfp) || FreeFile(lfp))
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read file \"%s\": %m",
                            BACKUP_LABEL_FILE)));
        if (unlink(BACKUP_LABEL_FILE) != 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not remove file \"%s\": %m",
                            BACKUP_LABEL_FILE)));
    }

    /*
     * Read and parse the START WAL LOCATION line (this code is pretty crude,
     * but we are not expecting any variability in the file format).
     */
    if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
               &hi, &lo, startxlogfilename,
               &ch) != 4 || ch != '\n')
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    startpoint = ((uint64) hi) << 32 | lo;
    remaining = strchr(labelfile, '\n') + 1;    /* %n is not portable enough */

    /*
     * Parse the BACKUP FROM line. If we are taking an online backup from the
     * standby, we confirm that the standby has not been promoted during the
     * backup.
     */
    ptr = strstr(remaining, "BACKUP FROM:");
    if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("the standby was promoted during online backup"),
                 errhint("This means that the backup being taken is corrupt "
                         "and should not be used. "
                         "Try taking another online backup.")));

    /*
     * During recovery, we don't write an end-of-backup record. We assume that
     * pg_control was backed up last and its minimum recovery point can be
     * available as the backup end location. Since we don't have an
     * end-of-backup record, we use the pg_control value to check whether
     * we've reached the end of backup when starting recovery from this
     * backup. We have no way of checking if pg_control wasn't backed up last
     * however.
     *
     * We don't force a switch to new WAL file and wait for all the required
     * files to be archived. This is okay if we use the backup to start the
     * standby. But, if it's for an archive recovery, to ensure all the
     * required files are available, a user should wait for them to be
     * archived, or include them into the backup.
     *
     * We return the current minimum recovery point as the backup end
     * location. Note that it can be greater than the exact backup end
     * location if the minimum recovery point is updated after the backup of
     * pg_control. This is harmless for current uses.
     *
     * XXX currently a backup history file is for informational and debug
     * purposes only. It's not essential for an online backup. Furthermore,
     * even if it's created, it will not be archived during recovery because
     * an archiver is not invoked. So it doesn't seem worthwhile to write a
     * backup history file during recovery.
     */
    if (backup_started_in_recovery)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;
        XLogRecPtr  recptr;

        /*
         * Check to see if all WAL replayed during online backup contain
         * full-page writes.
         */
        SpinLockAcquire(&xlogctl->info_lck);
        recptr = xlogctl->lastFpwDisableRecPtr;
        SpinLockRelease(&xlogctl->info_lck);

        if (startpoint <= recptr)
            ereport(ERROR,
                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
               errmsg("WAL generated with full_page_writes=off was replayed "
                      "during online backup"),
                 errhint("This means that the backup being taken on the standby "
                         "is corrupt and should not be used. "
                 "Enable full_page_writes and run CHECKPOINT on the master, "
                         "and then try an online backup again.")));


        LWLockAcquire(ControlFileLock, LW_SHARED);
        stoppoint = ControlFile->minRecoveryPoint;
        stoptli = ControlFile->minRecoveryPointTLI;
        LWLockRelease(ControlFileLock);

        if (stoptli_p)
            *stoptli_p = stoptli;
        return stoppoint;
    }

    /*
     * Write the backup-end xlog record
     */
    rdata.data = (char *) (&startpoint);
    rdata.len = sizeof(startpoint);
    rdata.buffer = InvalidBuffer;
    rdata.next = NULL;
    stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
    stoptli = ThisTimeLineID;

    /*
     * Force a switch to a new xlog segment file, so that the backup is valid
     * as soon as archiver moves out the current segment file.
     */
    RequestXLogSwitch();

    XLByteToPrevSeg(stoppoint, _logSegNo);
    XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);

    /* Use the log timezone here, not the session timezone */
    stamp_time = (pg_time_t) time(NULL);
    pg_strftime(strfbuf, sizeof(strfbuf),
                "%Y-%m-%d %H:%M:%S %Z",
                pg_localtime(&stamp_time, log_timezone));

    /*
     * Write the backup history file
     */
    XLByteToSeg(startpoint, _logSegNo);
    BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
                          (uint32) (startpoint % XLogSegSize));
    fp = AllocateFile(histfilepath, "w");
    if (!fp)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not create file \"%s\": %m",
                        histfilepath)));
    fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
            (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
    fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
            (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
    /* transfer remaining lines from label to history file */
    fprintf(fp, "%s", remaining);
    fprintf(fp, "STOP TIME: %s\n", strfbuf);
    if (fflush(fp) || ferror(fp) || FreeFile(fp))
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not write file \"%s\": %m",
                        histfilepath)));

    /*
     * Clean out any no-longer-needed history files.  As a side effect, this
     * will post a .ready file for the newly created history file, notifying
     * the archiver that history file may be archived immediately.
     */
    CleanupBackupHistory();

    /*
     * If archiving is enabled, wait for all the required WAL files to be
     * archived before returning. If archiving isn't enabled, the required WAL
     * needs to be transported via streaming replication (hopefully with
     * wal_keep_segments set high enough), or some more exotic mechanism like
     * polling and copying files from pg_xlog with script. We have no
     * knowledge of those mechanisms, so it's up to the user to ensure that he
     * gets all the required WAL.
     *
     * We wait until both the last WAL file filled during backup and the
     * history file have been archived, and assume that the alphabetic sorting
     * property of the WAL files ensures any earlier WAL files are safely
     * archived as well.
     *
     * We wait forever, since archive_command is supposed to work and we
     * assume the admin wanted his backup to work completely. If you don't
     * wish to wait, you can set statement_timeout.  Also, some notices are
     * issued to clue in anyone who might be doing this interactively.
     */
    if (waitforarchive && XLogArchivingActive())
    {
        XLByteToPrevSeg(stoppoint, _logSegNo);
        XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);

        XLByteToSeg(startpoint, _logSegNo);
        BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
                              (uint32) (startpoint % XLogSegSize));

        seconds_before_warning = 60;
        waits = 0;

        while (XLogArchiveIsBusy(lastxlogfilename) ||
               XLogArchiveIsBusy(histfilename))
        {
            CHECK_FOR_INTERRUPTS();

            if (!reported_waiting && waits > 5)
            {
                ereport(NOTICE,
                        (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
                reported_waiting = true;
            }

            pg_usleep(1000000L);

            if (++waits >= seconds_before_warning)
            {
                seconds_before_warning *= 2;    /* This wraps in >10 years... */
                ereport(WARNING,
                        (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
                                waits),
                         errhint("Check that your archive_command is executing properly.  "
                                 "pg_stop_backup can be canceled safely, "
                                 "but the database backup will not be usable without all the WAL segments.")));
            }
        }

        ereport(NOTICE,
                (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
    }
    else if (waitforarchive)
        ereport(NOTICE,
                (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));

    /*
     * We're done.  As a convenience, return the ending WAL location.
     */
    if (stoptli_p)
        *stoptli_p = stoptli;
    return stoppoint;
}

TimestampTz GetCurrentChunkReplayStartTime ( void   ) 

Definition at line 4753 of file xlog.c.

References XLogCtlData::currentChunkStartTime, XLogCtlData::info_lck, SpinLockAcquire, and SpinLockRelease.

Referenced by GetReplicationApplyDelay().

{
    /*
     * Go through a volatile pointer so the compiler cannot rearrange the
     * shared-memory read relative to the spinlock calls.
     */
    volatile XLogCtlData *shared = XLogCtl;
    TimestampTz chunkStart;

    /* Copy currentChunkStartTime out of shared state under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    chunkStart = shared->currentChunkStartTime;
    SpinLockRelease(&shared->info_lck);

    return chunkStart;
}

XLogRecPtr GetFakeLSNForUnloggedRel ( void   ) 

Definition at line 3813 of file xlog.c.

References SpinLockAcquire, SpinLockRelease, XLogCtlData::ulsn_lck, and XLogCtlData::unloggedLSN.

Referenced by gistGetFakeLSN().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;
    XLogRecPtr  fakeLSN;

    /*
     * Hand out the current counter value and advance the counter for the
     * next caller; both steps happen while holding ulsn_lck.
     */
    SpinLockAcquire(&shared->ulsn_lck);
    fakeLSN = shared->unloggedLSN++;
    SpinLockRelease(&shared->ulsn_lck);

    return fakeLSN;
}

XLogRecPtr GetFlushRecPtr ( void   ) 

Definition at line 6512 of file xlog.c.

References XLogwrtResult::Flush, XLogCtlData::info_lck, XLogCtlData::LogwrtResult, SpinLockAcquire, and SpinLockRelease.

Referenced by StartReplication(), and XLogSend().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;
    XLogRecPtr  flushedUpTo;

    /* Snapshot the shared LogwrtResult.Flush value under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    flushedUpTo = shared->LogwrtResult.Flush;
    SpinLockRelease(&shared->info_lck);

    return flushedUpTo;
}

XLogRecPtr GetInsertRecPtr ( void   ) 

Definition at line 6494 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::LogwrtRqst, SpinLockAcquire, SpinLockRelease, and XLogwrtRqst::Write.

Referenced by CheckpointerMain(), IdentifySystem(), and IsCheckpointOnSchedule().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;
    XLogRecPtr  writeRequest;

    /* Snapshot the shared LogwrtRqst.Write value under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    writeRequest = shared->LogwrtRqst.Write;
    SpinLockRelease(&shared->info_lck);

    return writeRequest;
}

TimestampTz GetLatestXTime ( void   ) 

Definition at line 4718 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::recoveryLastXTime, SpinLockAcquire, and SpinLockRelease.

Referenced by CreateRestartPoint(), pg_last_xact_replay_timestamp(), and StartupXLOG().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;
    TimestampTz lastXTime;

    /* Copy recoveryLastXTime out of shared state under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    lastXTime = shared->recoveryLastXTime;
    SpinLockRelease(&shared->info_lck);

    return lastXTime;
}

void GetNextXidAndEpoch ( TransactionId *xid,
uint32 *epoch 
)

Definition at line 6550 of file xlog.c.

References XLogCtlData::ckptXid, XLogCtlData::ckptXidEpoch, XLogCtlData::info_lck, ReadNewTransactionId(), SpinLockAcquire, and SpinLockRelease.

Referenced by load_xid_epoch(), ProcessStandbyHSFeedbackMessage(), and XLogWalRcvSendHSFeedback().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;
    TransactionId lastCkptXid;
    TransactionId currentXid;
    uint32      resultEpoch;

    /*
     * The checkpoint's XID and epoch have to be sampled before nextXid is
     * fetched; doing it the other way around would be a race condition.
     */
    SpinLockAcquire(&shared->info_lck);
    resultEpoch = shared->ckptXidEpoch;
    lastCkptXid = shared->ckptXid;
    SpinLockRelease(&shared->info_lck);

    /* Only now read the current nextXid. */
    currentXid = ReadNewTransactionId();

    /*
     * nextXid is logically later than ckptXid, so a numerically smaller
     * value means the counter wrapped around into the following epoch.
     */
    if (currentXid < lastCkptXid)
        resultEpoch++;

    /* Deliver both results through the caller-supplied pointers. */
    *xid = currentXid;
    *epoch = resultEpoch;
}

XLogRecPtr GetRedoRecPtr ( void   ) 

Definition at line 6471 of file xlog.c.

References Assert, XLogCtlData::info_lck, XLogCtlData::Insert, Insert(), XLogCtlInsert::RedoRecPtr, RedoRecPtr, SpinLockAcquire, and SpinLockRelease.

Referenced by InitXLOGAccess(), nextval_internal(), XLogPageRead(), XLogSaveBufferForHint(), and XLogWrite().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;

    /*
     * Refresh RedoRecPtr (declared outside this function) from the shared
     * Insert.RedoRecPtr while holding info_lck.  The assertion checks that
     * the previously held value is not ahead of the shared one.
     */
    SpinLockAcquire(&shared->info_lck);
    Assert(RedoRecPtr <= shared->Insert.RedoRecPtr);
    RedoRecPtr = shared->Insert.RedoRecPtr;
    SpinLockRelease(&shared->info_lck);

    return RedoRecPtr;
}

uint64 GetSystemIdentifier ( void   ) 

Definition at line 3787 of file xlog.c.

References Assert, NULL, and ControlFileData::system_identifier.

Referenced by IdentifySystem(), and libpqrcv_identify_system().

XLogRecPtr GetXLogInsertRecPtr ( void   ) 
void GetXLogReceiptTime ( TimestampTz *rtime,
bool *fromStream 
)

Definition at line 4771 of file xlog.c.

References Assert, InRecovery, XLogReceiptSource, and XLogReceiptTime.

Referenced by GetStandbyLimitTime().

{
    /*
     * The state read below is not exported to shared memory, so this
     * function is only usable from the startup process.
     */
    Assert(InRecovery);

    /* Report when the current WAL was received ... */
    *rtime = XLogReceiptTime;

    /* ... and whether it arrived via streaming. */
    if (XLogReceiptSource == XLOG_FROM_STREAM)
        *fromStream = true;
    else
        *fromStream = false;
}

XLogRecPtr GetXLogReplayRecPtr ( TimeLineID *replayTLI  ) 

Definition at line 9103 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::lastReplayedEndRecPtr, XLogCtlData::lastReplayedTLI, SpinLockAcquire, and SpinLockRelease.

Referenced by CreateRestartPoint(), GetReplicationApplyDelay(), GetStandbyFlushRecPtr(), pg_last_xlog_replay_location(), WalReceiverMain(), and XLogWalRcvSendReply().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;
    XLogRecPtr  replayedUpTo;
    TimeLineID  replayedTLI;

    /* Read the position and its timeline together under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    replayedUpTo = shared->lastReplayedEndRecPtr;
    replayedTLI = shared->lastReplayedTLI;
    SpinLockRelease(&shared->info_lck);

    /* The timeline is an optional output; skip it if no pointer was given. */
    if (replayTLI != NULL)
        *replayTLI = replayedTLI;

    return replayedUpTo;
}

XLogRecPtr GetXLogWriteRecPtr ( void   ) 

Definition at line 9140 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::LogwrtResult, SpinLockAcquire, SpinLockRelease, and XLogwrtResult::Write.

Referenced by pg_current_xlog_location().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;

    /*
     * Refresh the whole LogwrtResult copy (declared outside this function)
     * from shared state under info_lck, then report its Write field.
     */
    SpinLockAcquire(&shared->info_lck);
    LogwrtResult = shared->LogwrtResult;
    SpinLockRelease(&shared->info_lck);

    return LogwrtResult.Write;
}

bool HotStandbyActive ( void   ) 

Definition at line 6252 of file xlog.c.

References XLogCtlData::info_lck, LocalHotStandbyActive, XLogCtlData::SharedHotStandbyActive, SpinLockAcquire, and SpinLockRelease.

Referenced by XLogWalRcvSendHSFeedback().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;

    /*
     * Hot Standby can't be de-activated, so once the shared flag has been
     * seen true there is no reason to look at shared state again.
     */
    if (LocalHotStandbyActive)
        return true;

    /*
     * Not yet seen active: re-read the shared flag.  The spinlock is
     * essential on machines with weak memory ordering!
     */
    SpinLockAcquire(&shared->info_lck);
    LocalHotStandbyActive = shared->SharedHotStandbyActive;
    SpinLockRelease(&shared->info_lck);

    return LocalHotStandbyActive;
}

void InitXLOGAccess ( void   ) 

Definition at line 6455 of file xlog.c.

References Assert, GetRedoRecPtr(), IsBootstrapProcessingMode, XLogCtlData::ThisTimeLineID, and ThisTimeLineID.

Referenced by AuxiliaryProcessMain(), LocalSetXLogInsertAllowed(), and RecoveryInProgress().

{
    /*
     * Set up this process's local copies of ThisTimeLineID and RedoRecPtr
     * from shared state.
     */

    /* ThisTimeLineID doesn't change so we need no lock to copy it */
    ThisTimeLineID = XLogCtl->ThisTimeLineID;
    /* A zero timeline ID is tolerated only in bootstrap processing mode */
    Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());

    /*
     * Use GetRedoRecPtr to copy the RedoRecPtr safely; it is called only for
     * that side effect, so the return value is discarded.
     */
    (void) GetRedoRecPtr();
}

void issue_xlog_fsync ( int  fd,
XLogSegNo  segno 
)

Definition at line 8322 of file xlog.c.

References elog, ereport, errcode_for_file_access(), errmsg(), PANIC, pg_fdatasync(), pg_fsync_no_writethrough(), pg_fsync_writethrough(), sync_method, SYNC_METHOD_FDATASYNC, SYNC_METHOD_FSYNC, SYNC_METHOD_FSYNC_WRITETHROUGH, SYNC_METHOD_OPEN, SYNC_METHOD_OPEN_DSYNC, ThisTimeLineID, and XLogFileNameP().

Referenced by XLogWalRcvFlush(), and XLogWrite().

{
    /*
     * Sync the given log file descriptor using whichever method sync_method
     * selects.  Any sync failure is reported at PANIC level, naming the
     * segment via XLogFileNameP(ThisTimeLineID, segno).
     */
    switch (sync_method)
    {
        case SYNC_METHOD_FSYNC:
            if (pg_fsync_no_writethrough(fd) != 0)
                ereport(PANIC,
                        (errcode_for_file_access(),
                         errmsg("could not fsync log file %s: %m",
                                XLogFileNameP(ThisTimeLineID, segno))));
            break;
            /* This case exists only when HAVE_FSYNC_WRITETHROUGH is defined */
#ifdef HAVE_FSYNC_WRITETHROUGH
        case SYNC_METHOD_FSYNC_WRITETHROUGH:
            if (pg_fsync_writethrough(fd) != 0)
                ereport(PANIC,
                        (errcode_for_file_access(),
                         errmsg("could not fsync write-through log file %s: %m",
                                XLogFileNameP(ThisTimeLineID, segno))));
            break;
#endif
            /* This case exists only when HAVE_FDATASYNC is defined */
#ifdef HAVE_FDATASYNC
        case SYNC_METHOD_FDATASYNC:
            if (pg_fdatasync(fd) != 0)
                ereport(PANIC,
                        (errcode_for_file_access(),
                         errmsg("could not fdatasync log file %s: %m",
                                XLogFileNameP(ThisTimeLineID, segno))));
            break;
#endif
        case SYNC_METHOD_OPEN:
        case SYNC_METHOD_OPEN_DSYNC:
            /* write synced it already */
            break;
        default:

            /*
             * Also reached for a method whose case was compiled out by the
             * preprocessor conditionals above.
             */
            elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
            break;
    }
}

bool RecoveryInProgress ( void   ) 

Definition at line 6211 of file xlog.c.

References XLogCtlData::info_lck, InitXLOGAccess(), LocalRecoveryInProgress, XLogCtlData::SharedRecoveryInProgress, SpinLockAcquire, and SpinLockRelease.

Referenced by check_transaction_read_only(), check_XactIsoLevel(), CheckArchiveTimeout(), CheckpointerMain(), CreateCheckPoint(), CreateEndOfRecoveryRecord(), CreateRestartPoint(), do_pg_start_backup(), do_pg_stop_backup(), get_relation_info(), GetNewMultiXactId(), GetNewObjectId(), GetNewTransactionId(), GetOldestActiveTransactionId(), GetOldestXmin(), GetRunningTransactionData(), GetSerializableTransactionSnapshot(), GetSerializableTransactionSnapshotInt(), GetSnapshotData(), heap_page_prune_opt(), IdentifySystem(), InitPostgres(), InitTempTableNamespace(), InitWalSender(), IsCheckpointOnSchedule(), LockAcquireExtended(), MarkBufferDirtyHint(), OldSerXidSetActiveSerXmin(), perform_base_backup(), pg_create_restore_point(), pg_current_xlog_insert_location(), pg_current_xlog_location(), pg_is_in_recovery(), pg_is_xlog_replay_paused(), pg_switch_xlog(), pg_xlog_replay_pause(), pg_xlog_replay_resume(), pg_xlogfile_name(), pg_xlogfile_name_offset(), PreventCommandDuringRecovery(), ProcSendSignal(), ProcSleep(), sendDir(), ShutdownXLOG(), standard_ProcessUtility(), StartTransaction(), TransactionIdIsInProgress(), UpdateFullPageWrites(), WalReceiverMain(), XLogBackgroundFlush(), XLogInsertAllowed(), XLogNeedsFlush(), and XLogSend().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;

    /*
     * Recovery can't be re-entered, so once the shared flag has been seen
     * false there is no reason to consult shared state again.
     */
    if (!LocalRecoveryInProgress)
        return false;

    /*
     * Still believed to be in recovery: re-read the shared flag.  The
     * spinlock is essential on machines with weak memory ordering!
     */
    SpinLockAcquire(&shared->info_lck);
    LocalRecoveryInProgress = shared->SharedRecoveryInProgress;
    SpinLockRelease(&shared->info_lck);

    /*
     * On discovering that recovery has just finished, initialize TimeLineID
     * and RedoRecPtr.  InitPostgres() relies upon this behaviour to ensure
     * that InitXLOGAccess() is called at backend startup.  (If you change
     * this, see also LocalSetXLogInsertAllowed.)
     */
    if (!LocalRecoveryInProgress)
        InitXLOGAccess();

    return LocalRecoveryInProgress;
}

bool RecoveryIsPaused ( void   ) 

Definition at line 4672 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::recoveryPause, SpinLockAcquire, and SpinLockRelease.

Referenced by pg_is_xlog_replay_paused(), and recoveryPausesHere().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;
    bool        paused;

    /* Read the shared recoveryPause flag under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    paused = shared->recoveryPause;
    SpinLockRelease(&shared->info_lck);

    return paused;
}

Buffer RestoreBackupBlock ( XLogRecPtr  lsn,
XLogRecord *record,
int  block_index,
bool  get_cleanup_lock,
bool  keep_buffer 
)

Definition at line 3149 of file xlog.c.

References elog, ERROR, BkpBlock::hole_length, i, RestoreBackupBlockContents(), XLogRecord::xl_info, XLogRecord::xl_len, XLogRecGetData, and XLR_BKP_BLOCK.

Referenced by btree_xlog_delete(), btree_xlog_delete_page(), btree_xlog_insert(), btree_xlog_split(), btree_xlog_vacuum(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumPage(), gistRedoClearFollowRight(), gistRedoPageUpdateRecord(), heap_xlog_clean(), heap_xlog_delete(), heap_xlog_freeze(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), and spgRedoVacuumRoot().

{
    BkpBlock    hdr;
    char       *cursor;
    int         slot;

    /*
     * The backup blocks sit after the record's main data.  Walk them in slot
     * order until the requested one is reached.
     */
    cursor = (char *) XLogRecGetData(record) + record->xl_len;
    for (slot = 0; slot < XLR_MAX_BKP_BLOCKS; slot++)
    {
        if (record->xl_info & XLR_BKP_BLOCK(slot))
        {
            /* Copy the header out into a local struct. */
            memcpy(&hdr, cursor, sizeof(BkpBlock));
            cursor += sizeof(BkpBlock);

            /* Requested block: restore its contents and hand back the buffer. */
            if (slot == block_index)
                return RestoreBackupBlockContents(lsn, hdr, cursor,
                                                  get_cleanup_lock,
                                                  keep_buffer);

            /* Not this one; skip its page image, which omits the hole. */
            cursor += BLCKSZ - hdr.hole_length;
        }
    }

    /* No slot matched: the caller passed a bogus block_index. */
    elog(ERROR, "failed to restore block_index %d", block_index);
    return InvalidBuffer;       /* keep compiler quiet */
}

void SetRecoveryPause ( bool  recoveryPause  ) 

Definition at line 4686 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::recoveryPause, SpinLockAcquire, and SpinLockRelease.

Referenced by pg_xlog_replay_pause(), pg_xlog_replay_resume(), and StartupXLOG().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;

    /* Store the requested pause state into shared memory under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    shared->recoveryPause = recoveryPause;
    SpinLockRelease(&shared->info_lck);
}

void SetWalWriterSleeping ( bool  sleeping  ) 

Definition at line 9961 of file xlog.c.

References XLogCtlData::info_lck, SpinLockAcquire, SpinLockRelease, and XLogCtlData::WalWriterSleeping.

Referenced by WalWriterMain().

{
    /* volatile pointer prevents code rearrangement around the spinlock */
    volatile XLogCtlData *shared = XLogCtl;

    /* Record the given sleeping state in shared memory under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    shared->WalWriterSleeping = sleeping;
    SpinLockRelease(&shared->info_lck);
}

void ShutdownXLOG ( int  code,
Datum  arg 
)

Definition at line 6585 of file xlog.c.

References CHECKPOINT_IMMEDIATE, CHECKPOINT_IS_SHUTDOWN, CreateCheckPoint(), CreateRestartPoint(), ereport, errmsg(), LOG, RecoveryInProgress(), RequestXLogSwitch(), ShutdownCLOG(), ShutdownMultiXact(), ShutdownSUBTRANS(), XLogArchiveCommandSet, and XLogArchivingActive.

Referenced by CheckpointerMain(), and InitPostgres().

{
    /* Both the restartpoint and the checkpoint path use the same flags. */
    const int   shutdownFlags = CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE;

    ereport(LOG,
            (errmsg("shutting down")));

    if (!RecoveryInProgress())
    {
        /*
         * With archiving enabled, switch away from the last XLOG file so
         * that all remaining records get archived (postmaster wakes up the
         * archiver process one more time at the end of shutdown).  The
         * checkpoint record then lands in the next XLOG file and won't be
         * archived (yet).
         */
        if (XLogArchivingActive() && XLogArchiveCommandSet())
            RequestXLogSwitch();

        CreateCheckPoint(shutdownFlags);
    }
    else
    {
        /* Still in recovery: a restartpoint takes the checkpoint's place. */
        CreateRestartPoint(shutdownFlags);
    }

    /* Shut down the CLOG, SUBTRANS and MultiXact modules, in that order. */
    ShutdownCLOG();
    ShutdownSUBTRANS();
    ShutdownMultiXact();

    ereport(LOG,
            (errmsg("database system is shut down")));
}

void StartupXLOG ( void   ) 

Definition at line 4846 of file xlog.c.

References AllowCascadeReplication, appendStringInfo(), archiveCleanupCommand, XLogCtlData::archiveCleanupCommand, ArchiveRecoveryRequested, ErrorContextCallback::arg, Assert, BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, ControlFileData::backupEndPoint, ControlFileData::backupEndRequired, ControlFileData::backupStartPoint, bgwriterLaunched, buf, ErrorContextCallback::callback, ControlFileData::checkPoint, CHECKPOINT_END_OF_RECOVERY, CHECKPOINT_IMMEDIATE, CHECKPOINT_WAIT, ControlFileData::checkPointCopy, CheckRecoveryConsistency(), CheckRequiredParameterValues(), checkTimeLineSwitch(), XLogCtlData::ckptXid, XLogCtlData::ckptXidEpoch, close, ControlFileLock, CreateCheckPoint(), CreateEndOfRecoveryRecord(), XLogCtlData::currentChunkStartTime, XLogCtlWrite::curridx, XLogCtlInsert::currpage, XLogCtlInsert::currpos, StringInfoData::data, DataDir, DB_IN_ARCHIVE_RECOVERY, DB_IN_CRASH_RECOVERY, DB_IN_PRODUCTION, DB_SHUTDOWNED, DB_SHUTDOWNED_IN_RECOVERY, DB_SHUTDOWNING, DEBUG1, DEBUG2, DEBUG3, DeleteAllExportedSnapshotFiles(), DisownLatch(), elog, EnableHotStandby, EndRecPtr, ereport, errcode(), errcode_for_file_access(), errdetail(), errhint(), errmsg(), ERROR, error_context_stack, ExecuteRecoveryCommand(), exitArchiveRecovery(), fast_promote, FATAL, findNewestTimeLine(), XLogwrtRqst::Flush, XLogwrtResult::Flush, XLogCtlInsert::fullPageWrites, CheckPoint::fullPageWrites, GetCurrentTimestamp(), GetLatestXTime(), HandleStartupProcInterrupts(), InArchiveRecovery, XLogCtlData::info_lck, InitRecoveryTransactionEnvironment(), initStringInfo(), InRecovery, InRedo, XLogCtlData::Insert, INSERT_FREESPACE, InvalidXLogRecPtr, IsUnderPostmaster, lastFullPageWrites, LastRec, XLogCtlData::lastReplayedEndRecPtr, XLogCtlData::lastReplayedTLI, XLogCtlWrite::lastSegSwitchTime, VariableCacheData::latestCompletedXid, RunningTransactionsData::latestCompletedXid, LocalSetXLogInsertAllowed(), LocalXLogInsertAllowed, LOG, XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, LW_EXCLUSIVE, LWLockAcquire(), 
LWLockRelease(), MemSet, minRecoveryPoint, ControlFileData::minRecoveryPoint, minRecoveryPointTLI, ControlFileData::minRecoveryPointTLI, MultiXactSetNextMXact(), NextBufIdx, CheckPoint::nextMulti, CheckPoint::nextMultiOffset, VariableCacheData::nextOid, CheckPoint::nextOid, RunningTransactionsData::nextXid, VariableCacheData::nextXid, CheckPoint::nextXid, CheckPoint::nextXidEpoch, NULL, VariableCacheData::oidCount, CheckPoint::oldestActiveXid, CheckPoint::oldestMulti, CheckPoint::oldestMultiDB, RunningTransactionsData::oldestRunningXid, CheckPoint::oldestXid, CheckPoint::oldestXidDB, openLogFile, openLogOff, openLogSegNo, OwnLatch(), PANIC, pfree(), pg_usleep(), pgstat_reset_all(), PMSIGNAL_RECOVERY_STARTED, PreallocXlogFiles(), PrescanPreparedTransactions(), ControlFileData::prevCheckPoint, ErrorContextCallback::previous, XLogCtlInsert::PrevRecord, XLogCtlData::PrevTimeLineID, xl_end_of_recovery::PrevTimeLineID, CheckPoint::PrevTimeLineID, ProcArrayApplyRecoveryInfo(), ProcArrayLock, PublishStartupProcessInformation(), read_backup_label(), XLogReaderState::readBuf, ReadCheckpointRecord(), ReadControlFile(), readFile, readOff, XLogReaderState::readPageTLI, ReadRecord(), readRecoveryCommandFile(), ReadRecPtr, RecordKnownAssignedTransactionIds(), RecoverPreparedTransactions(), RECOVERY_TARGET_NAME, RECOVERY_TARGET_TIME, RECOVERY_TARGET_XID, recoveryEndCommand, XLogCtlData::recoveryLastXTime, XLogCtlData::recoveryPause, recoveryPauseAtTarget, recoveryPausesHere(), recoveryStopAfter, recoveryStopName, recoveryStopsHere(), recoveryStopTime, recoveryStopXid, recoveryTarget, recoveryTargetIsLatest, recoveryTargetName, recoveryTargetTime, recoveryTargetTLI, recoveryTargetXid, XLogCtlData::recoveryWakeupLatch, CheckPoint::redo, XLogCtlInsert::RedoRecPtr, RedoRecPtr, RedoStartLSN, RelationCacheInitFileRemove(), XLogCtlData::replayEndRecPtr, XLogCtlData::replayEndTLI, RequestCheckpoint(), ResetUnloggedRelations(), restoreTimeLineHistoryFiles(), RmgrData::rm_cleanup, 
RmgrData::rm_desc, RmgrData::rm_redo, RmgrData::rm_startup, RmgrTable, SendPostmasterSignal(), SetForwardFsyncRequests(), SetMultiXactIdLimit(), SetRecoveryPause(), SetTransactionIdLimit(), XLogCtlData::SharedRecoveryInProgress, ShmemVariableCache, ShutdownRecoveryTransactionEnvironment(), ShutdownWalRcv(), snprintf(), SpinLockAcquire, SpinLockRelease, STANDBY_DISABLED, STANDBY_INITIALIZED, StandbyMode, StandbyModeRequested, StandbyRecoverPreparedTransactions(), standbyState, StartupCLOG(), StartupMultiXact(), StartupSUBTRANS(), ControlFileData::state, str_time(), RunningTransactionsData::subxcnt, RunningTransactionsData::subxid_overflow, ControlFileData::system_identifier, XLogReaderState::system_identifier, XLogCtlData::ThisTimeLineID, xl_end_of_recovery::ThisTimeLineID, ThisTimeLineID, CheckPoint::ThisTimeLineID, CheckPoint::time, ControlFileData::time, timestamptz_to_str(), tliOfPointInHistory(), tliSwitchPoint(), trace_recovery_messages, TransactionIdAdvance, TransactionIdFollowsOrEquals(), TransactionIdIsNormal, TransactionIdIsValid, TransactionIdRetreat, TrimCLOG(), unlink(), UNLOGGED_RELATION_CLEANUP, UNLOGGED_RELATION_INIT, ControlFileData::unloggedLSN, XLogCtlData::unloggedLSN, UpdateControlFile(), UpdateFullPageWrites(), ValidateXLOGDirectoryStructure(), WalSndWakeup(), XLogCtlData::Write, XLogwrtRqst::Write, XLogwrtResult::Write, writeTimeLineHistory(), RunningTransactionsData::xcnt, XidGenLock, RunningTransactionsData::xids, XLogRecord::xl_info, XLogRecord::xl_rmid, XLogRecord::xl_xid, XLogCtlData::xlblocks, XLByteToPrevSeg, XLOG_CHECKPOINT_SHUTDOWN, XLOG_END_OF_RECOVERY, XLogFileOpen(), XLogPageRead(), XLogReaderAllocate(), XLogReaderFree(), XLogReceiptTime, XLogRecGetData, XLogRecPtrIsInvalid, XLogReportParameters(), XLogSegSize, and XRecOffIsValid.

Referenced by InitPostgres(), and StartupProcessMain().

{
    XLogCtlInsert *Insert;
    CheckPoint  checkPoint;
    bool        wasShutdown;
    bool        reachedStopPoint = false;
    bool        haveBackupLabel = false;
    XLogRecPtr  RecPtr,
                checkPointLoc,
                EndOfLog;
    XLogSegNo   endLogSegNo;
    TimeLineID  PrevTimeLineID;
    XLogRecord *record;
    uint32      freespace;
    TransactionId oldestActiveXID;
    bool        backupEndRequired = false;
    bool        backupFromStandby = false;
    DBState     dbstate_at_startup;
    XLogReaderState *xlogreader;
    XLogPageReadPrivate private;
    bool        fast_promoted = false;

    /*
     * Read control file and check XLOG status looks valid.
     *
     * Note: in most control paths, *ControlFile is already valid and we need
     * not do ReadControlFile() here, but might as well do it to be sure.
     */
    ReadControlFile();

    if (ControlFile->state < DB_SHUTDOWNED ||
        ControlFile->state > DB_IN_PRODUCTION ||
        !XRecOffIsValid(ControlFile->checkPoint))
        ereport(FATAL,
                (errmsg("control file contains invalid data")));

    if (ControlFile->state == DB_SHUTDOWNED)
        ereport(LOG,
                (errmsg("database system was shut down at %s",
                        str_time(ControlFile->time))));
    else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
        ereport(LOG,
                (errmsg("database system was shut down in recovery at %s",
                        str_time(ControlFile->time))));
    else if (ControlFile->state == DB_SHUTDOWNING)
        ereport(LOG,
                (errmsg("database system shutdown was interrupted; last known up at %s",
                        str_time(ControlFile->time))));
    else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
        ereport(LOG,
           (errmsg("database system was interrupted while in recovery at %s",
                   str_time(ControlFile->time)),
            errhint("This probably means that some data is corrupted and"
                    " you will have to use the last backup for recovery.")));
    else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
        ereport(LOG,
                (errmsg("database system was interrupted while in recovery at log time %s",
                        str_time(ControlFile->checkPointCopy.time)),
                 errhint("If this has occurred more than once some data might be corrupted"
              " and you might need to choose an earlier recovery target.")));
    else if (ControlFile->state == DB_IN_PRODUCTION)
        ereport(LOG,
              (errmsg("database system was interrupted; last known up at %s",
                      str_time(ControlFile->time))));

    /* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
    if (ControlFile->state != DB_SHUTDOWNED)
        pg_usleep(60000000L);
#endif

    /*
     * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
     * someone has performed a copy for PITR, these directories may have been
     * excluded and need to be re-created.
     */
    ValidateXLOGDirectoryStructure();

    /*
     * Clear out any old relcache cache files.  This is *necessary* if we do
     * any WAL replay, since that would probably result in the cache files
     * being out of sync with database reality.  In theory we could leave them
     * in place if the database had been cleanly shut down, but it seems
     * safest to just remove them always and let them be rebuilt during the
     * first backend startup.
     */
    RelationCacheInitFileRemove();

    /*
     * Initialize on the assumption we want to recover to the same timeline
     * that's active according to pg_control.
     */
    recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;

    /*
     * Check for a recovery control file and, if one is present, set up state
     * for offline recovery.
     */
    readRecoveryCommandFile();

    /*
     * Save archive_cleanup_command in shared memory so that other processes
     * can see it.
     */
    strncpy(XLogCtl->archiveCleanupCommand,
            archiveCleanupCommand ? archiveCleanupCommand : "",
            sizeof(XLogCtl->archiveCleanupCommand));

    if (ArchiveRecoveryRequested)
    {
        if (StandbyModeRequested)
            ereport(LOG,
                    (errmsg("entering standby mode")));
        else if (recoveryTarget == RECOVERY_TARGET_XID)
            ereport(LOG,
                    (errmsg("starting point-in-time recovery to XID %u",
                            recoveryTargetXid)));
        else if (recoveryTarget == RECOVERY_TARGET_TIME)
            ereport(LOG,
                    (errmsg("starting point-in-time recovery to %s",
                            timestamptz_to_str(recoveryTargetTime))));
        else if (recoveryTarget == RECOVERY_TARGET_NAME)
            ereport(LOG,
                    (errmsg("starting point-in-time recovery to \"%s\"",
                            recoveryTargetName)));
        else
            ereport(LOG,
                    (errmsg("starting archive recovery")));
    }
    else if (ControlFile->minRecoveryPointTLI > 0)
    {
        /*
         * If the minRecoveryPointTLI is set when not in Archive Recovery
         * it means that we have crashed after ending recovery and
         * yet before we wrote a new checkpoint on the new timeline.
         * That means we are doing a crash recovery that needs to cross
         * timelines to get to our newly assigned timeline again.
         * The timeline we are headed for is exact and not 'latest'.
         * As soon as we hit a checkpoint, the minRecoveryPointTLI is
         * reset, so we will not enter crash recovery again.
         */
        Assert(ControlFile->minRecoveryPointTLI != 1);
        recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
        recoveryTargetIsLatest = false;
    }

    /*
     * Take ownership of the wakeup latch if we're going to sleep during
     * recovery.
     */
    if (StandbyModeRequested)
        OwnLatch(&XLogCtl->recoveryWakeupLatch);

    /* Set up XLOG reader facility */
    MemSet(&private, 0, sizeof(XLogPageReadPrivate));
    xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
    if (!xlogreader)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory"),
                 errdetail("Failed while allocating an XLog reading processor")));
    xlogreader->system_identifier = ControlFile->system_identifier;

    if (read_backup_label(&checkPointLoc, &backupEndRequired,
                          &backupFromStandby))
    {
        /*
         * Archive recovery was requested, and thanks to the backup label file,
         * we know how far we need to replay to reach consistency. Enter
         * archive recovery directly.
         */
        InArchiveRecovery = true;
        if (StandbyModeRequested)
            StandbyMode = true;

        /*
         * When a backup_label file is present, we want to roll forward from
         * the checkpoint it identifies, rather than using pg_control.
         */
        record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
        if (record != NULL)
        {
            memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
            wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
            ereport(DEBUG1,
                    (errmsg("checkpoint record is at %X/%X",
                            (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
            InRecovery = true;  /* force recovery even if SHUTDOWNED */

            /*
             * Make sure that REDO location exists. This may not be the case
             * if there was a crash during an online backup, which left a
             * backup_label around that references a WAL segment that's
             * already been archived.
             */
            if (checkPoint.redo < checkPointLoc)
            {
                if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
                    ereport(FATAL,
                            (errmsg("could not find redo location referenced by checkpoint record"),
                             errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
            }
        }
        else
        {
            ereport(FATAL,
                    (errmsg("could not locate required checkpoint record"),
                     errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
            wasShutdown = false;    /* keep compiler quiet */
        }
        /* set flag to delete it later */
        haveBackupLabel = true;
    }
    else
    {
        /*
         * It's possible that archive recovery was requested, but we don't
         * know how far we need to replay the WAL before we reach consistency.
         * This can happen for example if a base backup is taken from a running
         * server using an atomic filesystem snapshot, without calling
         * pg_start/stop_backup. Or if you just kill a running master server
         * and put it into archive recovery by creating a recovery.conf file.
         *
         * Our strategy in that case is to perform crash recovery first,
         * replaying all the WAL present in pg_xlog, and only enter archive
         * recovery after that.
         *
         * But usually we already know how far we need to replay the WAL (up to
         * minRecoveryPoint, up to backupEndPoint, or until we see an
         * end-of-backup record), and we can enter archive recovery directly.
         */
        if (ArchiveRecoveryRequested &&
            (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
             ControlFile->backupEndRequired ||
             ControlFile->backupEndPoint != InvalidXLogRecPtr ||
             ControlFile->state == DB_SHUTDOWNED))
        {
            InArchiveRecovery = true;
            if (StandbyModeRequested)
                StandbyMode = true;
        }

        /*
         * Get the last valid checkpoint record.  If the latest one according
         * to pg_control is broken, try the next-to-last one.
         */
        checkPointLoc = ControlFile->checkPoint;
        RedoStartLSN = ControlFile->checkPointCopy.redo;
        record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
        if (record != NULL)
        {
            ereport(DEBUG1,
                    (errmsg("checkpoint record is at %X/%X",
                            (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
        }
        else if (StandbyMode)
        {
            /*
             * The last valid checkpoint record required for a streaming
             * recovery exists in neither standby nor the primary.
             */
            ereport(PANIC,
                    (errmsg("could not locate a valid checkpoint record")));
        }
        else
        {
            checkPointLoc = ControlFile->prevCheckPoint;
            record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
            if (record != NULL)
            {
                ereport(LOG,
                        (errmsg("using previous checkpoint record at %X/%X",
                                (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
                InRecovery = true;      /* force recovery even if SHUTDOWNED */
            }
            else
                ereport(PANIC,
                     (errmsg("could not locate a valid checkpoint record")));
        }
        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
    }

    /*
     * If the location of the checkpoint record is not on the expected
     * timeline in the history of the requested timeline, we cannot proceed:
     * the backup is not part of the history of the requested timeline.
     */
    Assert(expectedTLEs); /* was initialized by reading checkpoint record */
    if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
            checkPoint.ThisTimeLineID)
    {
        XLogRecPtr switchpoint;

        /*
         * tliSwitchPoint will throw an error if the checkpoint's timeline
         * is not in expectedTLEs at all.
         */
        switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
        ereport(FATAL,
                (errmsg("requested timeline %u is not a child of this server's history",
                        recoveryTargetTLI),
                 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X",
                           (uint32) (ControlFile->checkPoint >> 32),
                           (uint32) ControlFile->checkPoint,
                           ControlFile->checkPointCopy.ThisTimeLineID,
                           (uint32) (switchpoint >> 32),
                           (uint32) switchpoint)));
    }

    /*
     * The min recovery point should be part of the requested timeline's
     * history, too.
     */
    if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
        tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
            ControlFile->minRecoveryPointTLI)
        ereport(FATAL,
                (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
                        recoveryTargetTLI,
                        (uint32) (ControlFile->minRecoveryPoint >> 32),
                        (uint32) ControlFile->minRecoveryPoint,
                        ControlFile->minRecoveryPointTLI)));

    LastRec = RecPtr = checkPointLoc;

    ereport(DEBUG1,
            (errmsg("redo record is at %X/%X; shutdown %s",
                    (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
                    wasShutdown ? "TRUE" : "FALSE")));
    ereport(DEBUG1,
            (errmsg("next transaction ID: %u/%u; next OID: %u",
                    checkPoint.nextXidEpoch, checkPoint.nextXid,
                    checkPoint.nextOid)));
    ereport(DEBUG1,
            (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
                    checkPoint.nextMulti, checkPoint.nextMultiOffset)));
    ereport(DEBUG1,
            (errmsg("oldest unfrozen transaction ID: %u, in database %u",
                    checkPoint.oldestXid, checkPoint.oldestXidDB)));
    ereport(DEBUG1,
            (errmsg("oldest MultiXactId: %u, in database %u",
                    checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
    if (!TransactionIdIsNormal(checkPoint.nextXid))
        ereport(PANIC,
                (errmsg("invalid next transaction ID")));

    /* initialize shared memory variables from the checkpoint record */
    ShmemVariableCache->nextXid = checkPoint.nextXid;
    ShmemVariableCache->nextOid = checkPoint.nextOid;
    ShmemVariableCache->oidCount = 0;
    MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
    XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
    XLogCtl->ckptXid = checkPoint.nextXid;

    /*
     * Initialize unlogged LSN. On a clean shutdown, it's restored from the
     * control file. On recovery, all unlogged relations are blown away, so
     * the unlogged LSN counter can be reset too.
     */
    if (ControlFile->state == DB_SHUTDOWNED)
        XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
    else
        XLogCtl->unloggedLSN = 1;

    /*
     * We must replay WAL entries using the same TimeLineID they were created
     * under, so temporarily adopt the TLI indicated by the checkpoint (see
     * also xlog_redo()).
     */
    ThisTimeLineID = checkPoint.ThisTimeLineID;

    /*
     * Copy any missing timeline history files between 'now' and the
     * recovery target timeline from archive to pg_xlog. While we don't need
     * those files ourselves - the history file of the recovery target
     * timeline covers all the previous timelines in the history too - a
     * cascading standby server might be interested in them. Or, if you
     * archive the WAL from this server to a different archive than the
     * master, it'd be good for all the history files to get archived there
     * after failover, so that you can use one of the old timelines as a
     * PITR target. Timeline history files are small, so it's better to copy
     * them unnecessarily than not copy them and regret later.
     */
    restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);

    lastFullPageWrites = checkPoint.fullPageWrites;

    RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;

    if (RecPtr < checkPoint.redo)
        ereport(PANIC,
                (errmsg("invalid redo in checkpoint record")));

    /*
     * Check whether we need to force recovery from WAL.  If it appears to
     * have been a clean shutdown and we did not have a recovery.conf file,
     * then assume no recovery needed.
     */
    if (checkPoint.redo < RecPtr)
    {
        if (wasShutdown)
            ereport(PANIC,
                    (errmsg("invalid redo record in shutdown checkpoint")));
        InRecovery = true;
    }
    else if (ControlFile->state != DB_SHUTDOWNED)
        InRecovery = true;
    else if (ArchiveRecoveryRequested)
    {
        /* force recovery due to presence of recovery.conf */
        InRecovery = true;
    }

    /* REDO */
    if (InRecovery)
    {
        int         rmid;

        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        /*
         * Update pg_control to show that we are recovering and to show the
         * selected checkpoint as the place we are starting from. We also mark
         * pg_control with any minimum recovery stop point obtained from a
         * backup history file.
         */
        dbstate_at_startup = ControlFile->state;
        if (InArchiveRecovery)
            ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
        else
        {
            ereport(LOG,
                    (errmsg("database system was not properly shut down; "
                            "automatic recovery in progress")));
            if (recoveryTargetTLI > 0)
                ereport(LOG,
                    (errmsg("crash recovery starts in timeline %u "
                            "and has target timeline %u",
                            ControlFile->checkPointCopy.ThisTimeLineID,
                            recoveryTargetTLI)));
            ControlFile->state = DB_IN_CRASH_RECOVERY;
        }
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
        ControlFile->checkPoint = checkPointLoc;
        ControlFile->checkPointCopy = checkPoint;
        if (InArchiveRecovery)
        {
            /* initialize minRecoveryPoint if not set yet */
            if (ControlFile->minRecoveryPoint < checkPoint.redo)
            {
                ControlFile->minRecoveryPoint = checkPoint.redo;
                ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
            }
        }

        /*
         * Set backupStartPoint if we're starting recovery from a base backup.
         *
         * Set backupEndPoint and use minRecoveryPoint as the backup end
         * location if we're starting recovery from a base backup which was
         * taken from the standby. In this case, the database system status in
         * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
         * the backup is corrupted, so we cancel recovery.
         */
        if (haveBackupLabel)
        {
            ControlFile->backupStartPoint = checkPoint.redo;
            ControlFile->backupEndRequired = backupEndRequired;

            if (backupFromStandby)
            {
                if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
                    ereport(FATAL,
                            (errmsg("backup_label contains data inconsistent with control file"),
                             errhint("This means that the backup is corrupted and you will "
                               "have to use another backup for recovery.")));
                ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
            }
        }
        ControlFile->time = (pg_time_t) time(NULL);
        /* No need to hold ControlFileLock yet, we aren't up far enough */
        UpdateControlFile();

        /* initialize our local copy of minRecoveryPoint */
        minRecoveryPoint = ControlFile->minRecoveryPoint;
        minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

        /*
         * Reset pgstat data, because it may be invalid after recovery.
         */
        pgstat_reset_all();

        /*
         * If there was a backup label file, it's done its job and the info
         * has now been propagated into pg_control.  We must get rid of the
         * label file so that if we crash during recovery, we'll pick up at
         * the latest recovery restartpoint instead of going all the way back
         * to the backup start point.  It seems prudent though to just rename
         * the file out of the way rather than delete it completely.
         */
        if (haveBackupLabel)
        {
            unlink(BACKUP_LABEL_OLD);
            if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
                ereport(FATAL,
                        (errcode_for_file_access(),
                         errmsg("could not rename file \"%s\" to \"%s\": %m",
                                BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
        }

        /* Check that the GUCs used to generate the WAL allow recovery */
        CheckRequiredParameterValues();

        /*
         * We're in recovery, so unlogged relations may be trashed and must be
         * reset.  This should be done BEFORE allowing Hot Standby
         * connections, so that read-only backends don't try to read whatever
         * garbage is left over from before.
         */
        ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);

        /*
         * Likewise, delete any saved transaction snapshot files that got left
         * behind by crashed backends.
         */
        DeleteAllExportedSnapshotFiles();

        /*
         * Initialize for Hot Standby, if enabled. We won't let backends in
         * yet, not until we've reached the min recovery point specified in
         * control file and we've established a recovery snapshot from a
         * running-xacts WAL record.
         */
        if (ArchiveRecoveryRequested && EnableHotStandby)
        {
            TransactionId *xids;
            int         nxids;

            ereport(DEBUG1,
                    (errmsg("initializing for hot standby")));

            InitRecoveryTransactionEnvironment();

            if (wasShutdown)
                oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
            else
                oldestActiveXID = checkPoint.oldestActiveXid;
            Assert(TransactionIdIsValid(oldestActiveXID));

            /*
             * Startup commit log and subtrans only. Other SLRUs are not
             * maintained during recovery and need not be started yet.
             */
            StartupCLOG();
            StartupSUBTRANS(oldestActiveXID);

            /*
             * If we're beginning at a shutdown checkpoint, we know that
             * nothing was running on the master at this point. So fake-up an
             * empty running-xacts record and use that here and now. Recover
             * additional standby state for prepared transactions.
             */
            if (wasShutdown)
            {
                RunningTransactionsData running;
                TransactionId latestCompletedXid;

                /*
                 * Construct a RunningTransactions snapshot representing a
                 * shut down server, with only prepared transactions still
                 * alive. We're never overflowed at this point because all
                 * subxids are listed with their parent prepared transactions.
                 */
                running.xcnt = nxids;
                running.subxcnt = 0;
                running.subxid_overflow = false;
                running.nextXid = checkPoint.nextXid;
                running.oldestRunningXid = oldestActiveXID;
                latestCompletedXid = checkPoint.nextXid;
                TransactionIdRetreat(latestCompletedXid);
                Assert(TransactionIdIsNormal(latestCompletedXid));
                running.latestCompletedXid = latestCompletedXid;
                running.xids = xids;

                ProcArrayApplyRecoveryInfo(&running);

                StandbyRecoverPreparedTransactions(false);
            }
        }

        /* Initialize resource managers */
        for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
        {
            if (RmgrTable[rmid].rm_startup != NULL)
                RmgrTable[rmid].rm_startup();
        }

        /*
         * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
         * recoveryLastXTime.
         *
         * This is slightly confusing if we're starting from an online
         * checkpoint; we've just read and replayed the checkpoint record, but
         * we're going to start replay from its redo pointer, which precedes
         * the location of the checkpoint record itself. So even though the
         * last record we've replayed is indeed ReadRecPtr, we haven't
         * replayed all the preceding records yet. That's OK for the current
         * use of these variables.
         */
        SpinLockAcquire(&xlogctl->info_lck);
        xlogctl->replayEndRecPtr = ReadRecPtr;
        xlogctl->replayEndTLI = ThisTimeLineID;
        xlogctl->lastReplayedEndRecPtr = EndRecPtr;
        xlogctl->lastReplayedTLI = ThisTimeLineID;
        xlogctl->recoveryLastXTime = 0;
        xlogctl->currentChunkStartTime = 0;
        xlogctl->recoveryPause = false;
        SpinLockRelease(&xlogctl->info_lck);

        /* Also ensure XLogReceiptTime has a sane value */
        XLogReceiptTime = GetCurrentTimestamp();

        /*
         * Let postmaster know we've started redo now, so that it can launch
         * checkpointer to perform restartpoints.  We don't bother during
         * crash recovery as restartpoints can only be performed during
         * archive recovery.  And we'd like to keep crash recovery simple, to
         * avoid introducing bugs that could affect you when recovering after
         * crash.
         *
         * After this point, we can no longer assume that we're the only
         * process in addition to postmaster!  Also, fsync requests are
         * subsequently to be handled by the checkpointer, not locally.
         */
        if (ArchiveRecoveryRequested && IsUnderPostmaster)
        {
            PublishStartupProcessInformation();
            SetForwardFsyncRequests();
            SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
            bgwriterLaunched = true;
        }

        /*
         * Allow read-only connections immediately if we're consistent
         * already.
         */
        CheckRecoveryConsistency();

        /*
         * Find the first record that logically follows the checkpoint --- it
         * might physically precede it, though.
         */
        if (checkPoint.redo < RecPtr)
        {
            /* back up to find the record */
            record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
        }
        else
        {
            /* just have to read next record after CheckPoint */
            record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
        }

        if (record != NULL)
        {
            bool        recoveryContinue = true;
            bool        recoveryApply = true;
            ErrorContextCallback errcallback;
            TimestampTz xtime;

            InRedo = true;

            ereport(LOG,
                    (errmsg("redo starts at %X/%X",
                            (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));

            /*
             * main redo apply loop
             */
            do
            {
                bool switchedTLI = false;
#ifdef WAL_DEBUG
                if (XLOG_DEBUG ||
                 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
                    (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
                {
                    StringInfoData buf;

                    initStringInfo(&buf);
                    appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
                                     (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
                                     (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
                    xlog_outrec(&buf, record);
                    appendStringInfo(&buf, " - ");
                    RmgrTable[record->xl_rmid].rm_desc(&buf,
                                                       record->xl_info,
                                                     XLogRecGetData(record));
                    elog(LOG, "%s", buf.data);
                    pfree(buf.data);
                }
#endif

                /* Handle interrupt signals of startup process */
                HandleStartupProcInterrupts();

                /*
                 * Pause WAL replay, if requested by a hot-standby session via
                 * SetRecoveryPause().
                 *
                 * Note that we intentionally don't take the info_lck spinlock
                 * here.  We might therefore read a slightly stale value of
                 * the recoveryPause flag, but it can't be very stale (no
                 * worse than the last spinlock we did acquire).  Since a
                 * pause request is a pretty asynchronous thing anyway,
                 * possibly responding to it one WAL record later than we
                 * otherwise would is a minor issue, so it doesn't seem worth
                 * adding another spinlock cycle to prevent that.
                 */
                if (xlogctl->recoveryPause)
                    recoveryPausesHere();

                /*
                 * Have we reached our recovery target?
                 */
                if (recoveryStopsHere(record, &recoveryApply))
                {
                    if (recoveryPauseAtTarget)
                    {
                        SetRecoveryPause(true);
                        recoveryPausesHere();
                    }
                    reachedStopPoint = true;    /* see below */
                    recoveryContinue = false;

                    /* Exit loop if we reached non-inclusive recovery target */
                    if (!recoveryApply)
                        break;
                }

                /* Setup error traceback support for ereport() */
                errcallback.callback = rm_redo_error_callback;
                errcallback.arg = (void *) record;
                errcallback.previous = error_context_stack;
                error_context_stack = &errcallback;

                /*
                 * ShmemVariableCache->nextXid must be beyond record's xid.
                 *
                 * We don't expect anyone else to modify nextXid, hence we
                 * don't need to hold a lock while examining it.  We still
                 * acquire the lock to modify it, though.
                 */
                if (TransactionIdFollowsOrEquals(record->xl_xid,
                                                 ShmemVariableCache->nextXid))
                {
                    LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
                    ShmemVariableCache->nextXid = record->xl_xid;
                    TransactionIdAdvance(ShmemVariableCache->nextXid);
                    LWLockRelease(XidGenLock);
                }

                /*
                 * Before replaying this record, check if this record
                 * causes the current timeline to change. The record is
                 * already considered to be part of the new timeline,
                 * so we update ThisTimeLineID before replaying it.
                 * That's important so that replayEndTLI, which is
                 * recorded as the minimum recovery point's TLI if
                 * recovery stops after this record, is set correctly.
                 */
                if (record->xl_rmid == RM_XLOG_ID)
                {
                    TimeLineID  newTLI = ThisTimeLineID;
                    TimeLineID  prevTLI = ThisTimeLineID;
                    uint8       info = record->xl_info & ~XLR_INFO_MASK;

                    if (info == XLOG_CHECKPOINT_SHUTDOWN)
                    {
                        CheckPoint  checkPoint;

                        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
                        newTLI = checkPoint.ThisTimeLineID;
                        prevTLI = checkPoint.PrevTimeLineID;
                    }
                    else if (info == XLOG_END_OF_RECOVERY)
                    {
                        xl_end_of_recovery  xlrec;

                        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
                        newTLI = xlrec.ThisTimeLineID;
                        prevTLI = xlrec.PrevTimeLineID;
                    }

                    if (newTLI != ThisTimeLineID)
                    {
                        /* Check that it's OK to switch to this TLI */
                        checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);

                        /* Following WAL records should be run with new TLI */
                        ThisTimeLineID = newTLI;
                        switchedTLI = true;
                    }
                }

                /*
                 * Update shared replayEndRecPtr before replaying this record,
                 * so that XLogFlush will update minRecoveryPoint correctly.
                 */
                SpinLockAcquire(&xlogctl->info_lck);
                xlogctl->replayEndRecPtr = EndRecPtr;
                xlogctl->replayEndTLI = ThisTimeLineID;
                SpinLockRelease(&xlogctl->info_lck);

                /*
                 * If we are attempting to enter Hot Standby mode, process
                 * XIDs we see
                 */
                if (standbyState >= STANDBY_INITIALIZED &&
                    TransactionIdIsValid(record->xl_xid))
                    RecordKnownAssignedTransactionIds(record->xl_xid);

                /* Now apply the WAL record itself */
                RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);

                /* Pop the error context stack */
                error_context_stack = errcallback.previous;

                /*
                 * Update lastReplayedEndRecPtr after this record has been
                 * successfully replayed.
                 */
                SpinLockAcquire(&xlogctl->info_lck);
                xlogctl->lastReplayedEndRecPtr = EndRecPtr;
                xlogctl->lastReplayedTLI = ThisTimeLineID;
                SpinLockRelease(&xlogctl->info_lck);

                /* Remember this record as the last-applied one */
                LastRec = ReadRecPtr;

                /* Allow read-only connections if we're consistent now */
                CheckRecoveryConsistency();

                /*
                 * If this record was a timeline switch, wake up any
                 * walsenders to notice that we are on a new timeline.
                 */
                if (switchedTLI && AllowCascadeReplication())
                    WalSndWakeup();

                /* Exit loop if we reached inclusive recovery target */
                if (!recoveryContinue)
                    break;

                /* Else, try to fetch the next WAL record */
                record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
            } while (record != NULL);

            /*
             * end of main redo apply loop
             */

            ereport(LOG,
                    (errmsg("redo done at %X/%X",
                            (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
            xtime = GetLatestXTime();
            if (xtime)
                ereport(LOG,
                     (errmsg("last completed transaction was at log time %s",
                             timestamptz_to_str(xtime))));
            InRedo = false;
        }
        else
        {
            /* there are no WAL records following the checkpoint */
            ereport(LOG,
                    (errmsg("redo is not required")));
        }
    }

    /*
     * Kill WAL receiver, if it's still running, before we continue to write
     * the startup checkpoint record. It will trump over the checkpoint and
     * subsequent records if it's still alive when we start writing WAL.
     */
    ShutdownWalRcv();

    /*
     * We don't need the latch anymore. It's not strictly necessary to disown
     * it, but let's do it for the sake of tidiness.
     */
    if (StandbyModeRequested)
        DisownLatch(&XLogCtl->recoveryWakeupLatch);

    /*
     * We are now done reading the xlog from stream. Turn off streaming
     * recovery to force fetching the files (which would be required at end of
     * recovery, e.g., timeline history file) from archive or pg_xlog.
     */
    StandbyMode = false;

    /*
     * Re-fetch the last valid or last applied record, so we can identify the
     * exact endpoint of what we consider the valid portion of WAL.
     */
    record = ReadRecord(xlogreader, LastRec, PANIC, false);
    EndOfLog = EndRecPtr;
    XLByteToPrevSeg(EndOfLog, endLogSegNo);

    /*
     * Complain if we did not roll forward far enough to render the backup
     * dump consistent.  Note: it is indeed okay to look at the local variable
     * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
     * be further ahead --- ControlFile->minRecoveryPoint cannot have been
     * advanced beyond the WAL we processed.
     */
    if (InRecovery &&
        (EndOfLog < minRecoveryPoint ||
         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
    {
        if (reachedStopPoint)
        {
            /* stopped because of stop request */
            ereport(FATAL,
                    (errmsg("requested recovery stop point is before consistent recovery point")));
        }

        /*
         * Ran off end of WAL before reaching end-of-backup WAL record, or
         * minRecoveryPoint. That's usually a bad sign, indicating that you
         * tried to recover from an online backup but never called
         * pg_stop_backup(), or you didn't archive all the WAL up to that
         * point. However, this also happens in crash recovery, if the system
         * crashes while an online backup is in progress. We must not treat
         * that as an error, or the database will refuse to start up.
         */
        if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
        {
            if (ControlFile->backupEndRequired)
                ereport(FATAL,
                        (errmsg("WAL ends before end of online backup"),
                         errhint("All WAL generated while online backup was taken must be available at recovery.")));
            else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
                ereport(FATAL,
                        (errmsg("WAL ends before end of online backup"),
                         errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
            else
                ereport(FATAL,
                      (errmsg("WAL ends before consistent recovery point")));
        }
    }

    /*
     * Consider whether we need to assign a new timeline ID.
     *
     * If we are doing an archive recovery, we always assign a new ID.  This
     * handles a couple of issues.  If we stopped short of the end of WAL
     * during recovery, then we are clearly generating a new timeline and must
     * assign it a unique new ID.  Even if we ran to the end, modifying the
     * current last segment is problematic because it may result in trying to
     * overwrite an already-archived copy of that segment, and we encourage
     * DBAs to make their archive_commands reject that.  We can dodge the
     * problem by making the new active segment have a new timeline ID.
     *
     * In a normal crash recovery, we can just extend the timeline we were in.
     */
    PrevTimeLineID = ThisTimeLineID;
    if (ArchiveRecoveryRequested)
    {
        char    reason[200];

        Assert(InArchiveRecovery);

        ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
        ereport(LOG,
                (errmsg("selected new timeline ID: %u", ThisTimeLineID)));

        /*
         * Create a comment for the history file to explain why and where
         * timeline changed.
         */
        if (recoveryTarget == RECOVERY_TARGET_XID)
            snprintf(reason, sizeof(reason),
                     "%s transaction %u",
                     recoveryStopAfter ? "after" : "before",
                     recoveryStopXid);
        else if (recoveryTarget == RECOVERY_TARGET_TIME)
            snprintf(reason, sizeof(reason),
                     "%s %s\n",
                     recoveryStopAfter ? "after" : "before",
                     timestamptz_to_str(recoveryStopTime));
        else if (recoveryTarget == RECOVERY_TARGET_NAME)
            snprintf(reason, sizeof(reason),
                     "at restore point \"%s\"",
                     recoveryStopName);
        else
            snprintf(reason, sizeof(reason), "no recovery target specified");

        writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
                             EndRecPtr, reason);
    }

    /* Save the selected TimeLineID in shared memory, too */
    XLogCtl->ThisTimeLineID = ThisTimeLineID;
    XLogCtl->PrevTimeLineID = PrevTimeLineID;

    /*
     * We are now done reading the old WAL.  Turn off archive fetching if it
     * was active, and make a writable copy of the last WAL segment. (Note
     * that we also have a copy of the last block of the old WAL in readBuf;
     * we will use that below.)
     */
    if (ArchiveRecoveryRequested)
        exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);

    /*
     * Prepare to write WAL starting at EndOfLog position, and init xlog
     * buffer cache using the block containing the last record from the
     * previous incarnation.
     */
    openLogSegNo = endLogSegNo;
    openLogFile = XLogFileOpen(openLogSegNo);
    openLogOff = 0;
    Insert = &XLogCtl->Insert;
    Insert->PrevRecord = LastRec;
    XLogCtl->xlblocks[0] = ((EndOfLog - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;

    /*
     * Tricky point here: readBuf contains the *last* block that the LastRec
     * record spans, not the one it starts in.  The last block is indeed the
     * one we want to use.
     */
    if (EndOfLog % XLOG_BLCKSZ == 0)
    {
        memset(Insert->currpage, 0, XLOG_BLCKSZ);
    }
    else
    {
        Assert(readOff == (XLogCtl->xlblocks[0] - XLOG_BLCKSZ) % XLogSegSize);
        memcpy((char *) Insert->currpage, xlogreader->readBuf, XLOG_BLCKSZ);
    }
    Insert->currpos = (char *) Insert->currpage +
        (EndOfLog + XLOG_BLCKSZ - XLogCtl->xlblocks[0]);

    LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;

    XLogCtl->LogwrtResult = LogwrtResult;

    XLogCtl->LogwrtRqst.Write = EndOfLog;
    XLogCtl->LogwrtRqst.Flush = EndOfLog;

    freespace = INSERT_FREESPACE(Insert);
    if (freespace > 0)
    {
        /* Make sure rest of page is zero */
        MemSet(Insert->currpos, 0, freespace);
        XLogCtl->Write.curridx = 0;
    }
    else
    {
        /*
         * Whenever LogwrtResult points to exactly the end of a page,
         * Write.curridx must point to the *next* page (see XLogWrite()).
         *
         * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
         * this is sufficient.  The first actual attempt to insert a log
         * record will advance the insert state.
         */
        XLogCtl->Write.curridx = NextBufIdx(0);
    }

    /* Pre-scan prepared transactions to find out the range of XIDs present */
    oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);

    /*
     * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
     * record before resource manager writes cleanup WAL records or checkpoint
     * record is written.
     */
    Insert->fullPageWrites = lastFullPageWrites;
    LocalSetXLogInsertAllowed();
    UpdateFullPageWrites();
    LocalXLogInsertAllowed = -1;

    if (InRecovery)
    {
        int         rmid;

        /*
         * Resource managers might need to write WAL records, eg, to record
         * index cleanup actions.  So temporarily enable XLogInsertAllowed in
         * this process only.
         */
        LocalSetXLogInsertAllowed();

        /*
         * Allow resource managers to do any required cleanup.
         */
        for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
        {
            if (RmgrTable[rmid].rm_cleanup != NULL)
                RmgrTable[rmid].rm_cleanup();
        }

        /* Disallow XLogInsert again */
        LocalXLogInsertAllowed = -1;

        /*
         * Perform a checkpoint to update all our recovery activity to disk.
         *
         * Note that we write a shutdown checkpoint rather than an on-line
         * one. This is not particularly critical, but since we may be
         * assigning a new TLI, using a shutdown checkpoint allows us to have
         * the rule that TLI only changes in shutdown checkpoints, which
         * allows some extra error checking in xlog_redo.
         *
         * In fast promotion, only create a lightweight end-of-recovery record
         * instead of a full checkpoint. A checkpoint is requested later, after
         * we're fully out of recovery mode and already accepting queries.
         */
        if (bgwriterLaunched)
        {
            if (fast_promote)
            {
                checkPointLoc = ControlFile->prevCheckPoint;

                /*
                 * Confirm the last checkpoint is available for us to recover
                 * from if we fail. Note that we don't check for the secondary
                 * checkpoint since that isn't available in most base backups.
                 */
                record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
                if (record != NULL)
                {
                    fast_promoted = true;
                    CreateEndOfRecoveryRecord();
                }
            }

            if (!fast_promoted)
                RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
                                    CHECKPOINT_IMMEDIATE |
                                    CHECKPOINT_WAIT);
        }
        else
            CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);

        /*
         * And finally, execute the recovery_end_command, if any.
         */
        if (recoveryEndCommand)
            ExecuteRecoveryCommand(recoveryEndCommand,
                                   "recovery_end_command",
                                   true);
    }

    /*
     * Preallocate additional log files, if wanted.
     */
    PreallocXlogFiles(EndOfLog);

    /*
     * Reset initial contents of unlogged relations.  This has to be done
     * AFTER recovery is complete so that any unlogged relations created
     * during recovery also get picked up.
     */
    if (InRecovery)
        ResetUnloggedRelations(UNLOGGED_RELATION_INIT);

    /*
     * Okay, we're officially UP.
     */
    InRecovery = false;

    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    ControlFile->state = DB_IN_PRODUCTION;
    ControlFile->time = (pg_time_t) time(NULL);
    UpdateControlFile();
    LWLockRelease(ControlFileLock);

    /* start the archive_timeout timer running */
    XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);

    /* also initialize latestCompletedXid, to nextXid - 1 */
    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
    TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
    LWLockRelease(ProcArrayLock);

    /*
     * Start up the commit log and subtrans, if not already done for hot
     * standby.
     */
    if (standbyState == STANDBY_DISABLED)
    {
        StartupCLOG();
        StartupSUBTRANS(oldestActiveXID);
    }

    /*
     * Perform end of recovery actions for any SLRUs that need it.
     */
    StartupMultiXact();
    TrimCLOG();

    /* Reload shared-memory state for prepared transactions */
    RecoverPreparedTransactions();

    /*
     * Shutdown the recovery environment. This must occur after
     * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
     */
    if (standbyState != STANDBY_DISABLED)
        ShutdownRecoveryTransactionEnvironment();

    /* Shut down xlogreader */
    if (readFile >= 0)
    {
        close(readFile);
        readFile = -1;
    }
    XLogReaderFree(xlogreader);

    /*
     * If any of the critical GUCs have changed, log them before we allow
     * backends to write WAL.
     */
    LocalSetXLogInsertAllowed();
    XLogReportParameters();

    /*
     * All done.  Allow backends to write WAL.  (Although the bool flag is
     * probably atomic in itself, we use the info_lck here to ensure that
     * there are no race conditions concerning visibility of other recent
     * updates to shared memory.)
     */
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        xlogctl->SharedRecoveryInProgress = false;
        SpinLockRelease(&xlogctl->info_lck);
    }

    /*
     * If there were cascading standby servers connected to us, nudge any
     * wal sender processes to notice that we've been promoted.
     */
    WalSndWakeup();

    /*
     * If this was a fast promotion, request an (online) checkpoint now. This
     * isn't required for consistency, but the last restartpoint might be far
     * back, and in case of a crash, recovering from it might take a longer
     * than is appropriate now that we're not in standby mode anymore.
     */
    if (fast_promoted)
        RequestCheckpoint(0);
}

void UpdateControlFile ( void   ) 

Definition at line 3742 of file xlog.c.

References BasicOpenFile(), close, COMP_CRC32, ControlFileData::crc, ereport, errcode_for_file_access(), errmsg(), FIN_CRC32, INIT_CRC32, offsetof, PANIC, PG_BINARY, pg_fsync(), write, and XLOG_CONTROL_FILE.

Referenced by CheckRecoveryConsistency(), CreateCheckPoint(), CreateEndOfRecoveryRecord(), CreateRestartPoint(), ReadRecord(), StartupXLOG(), UpdateMinRecoveryPoint(), xlog_redo(), and XLogReportParameters().

{
    /*
     * Write the in-memory ControlFile image out to XLOG_CONTROL_FILE.
     *
     * The CRC stored in the image is recomputed first, the file is then
     * opened, written, fsync'd and closed.  A failure in any of those steps
     * is reported at PANIC level.
     */
    const size_t ctl_size = sizeof(ControlFileData);
    int         ctl_fd;

    /* The checksum covers every byte that precedes the crc field itself */
    INIT_CRC32(ControlFile->crc);
    COMP_CRC32(ControlFile->crc,
               (char *) ControlFile,
               offsetof(ControlFileData, crc));
    FIN_CRC32(ControlFile->crc);

    ctl_fd = BasicOpenFile(XLOG_CONTROL_FILE,
                           O_RDWR | PG_BINARY,
                           S_IRUSR | S_IWUSR);
    if (ctl_fd < 0)
    {
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not open control file \"%s\": %m",
                        XLOG_CONTROL_FILE)));
    }

    /*
     * Clear errno up front so that a short write which leaves errno
     * untouched can be told apart from one that reports a specific error.
     */
    errno = 0;
    if (write(ctl_fd, ControlFile, ctl_size) != ctl_size)
    {
        /* no error code from write(): report it as running out of space */
        if (errno == 0)
        {
            errno = ENOSPC;
        }
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not write to control file: %m")));
    }

    if (pg_fsync(ctl_fd) != 0)
    {
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not fsync control file: %m")));
    }

    if (close(ctl_fd) != 0)
    {
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not close control file: %m")));
    }
}

void UpdateFullPageWrites ( void   ) 

Definition at line 7781 of file xlog.c.

References XLogRecData::buffer, XLogRecData::data, END_CRIT_SECTION, XLogCtlInsert::fullPageWrites, fullPageWrites, XLogCtlData::Insert, Insert(), XLogRecData::len, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), XLogRecData::next, RecoveryInProgress(), START_CRIT_SECTION, WALInsertLock, XLOG_FPW_CHANGE, XLogInsert(), and XLogStandbyInfoActive.

Referenced by StartupXLOG(), and UpdateSharedMemoryConfig().

{
    /*
     * Bring the shared Insert->fullPageWrites flag in line with the
     * process-local fullPageWrites setting, and write an XLOG_FPW_CHANGE
     * record describing the new value when XLogStandbyInfoActive() is true
     * and RecoveryInProgress() is false.
     *
     * Takes no arguments and returns nothing; the shared flag is only ever
     * modified while holding WALInsertLock in exclusive mode.
     */
    XLogCtlInsert *Insert = &XLogCtl->Insert;

    /*
     * Do nothing if full_page_writes has not been changed.
     *
     * It's safe to check the shared full_page_writes without the lock,
     * because we assume that there is no concurrently running process which
     * can update it.
     */
    if (fullPageWrites == Insert->fullPageWrites)
        return;

    /* Flag update and WAL record are done together in one critical section */
    START_CRIT_SECTION();

    /*
     * It's always safe to take full page images, even when not strictly
     * required, but not the other round. So if we're setting full_page_writes
     * to true, first set it true and then write the WAL record. If we're
     * setting it to false, first write the WAL record and then set the global
     * flag.
     */
    if (fullPageWrites)
    {
        /* Enabling: publish the new value before the record is written */
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
        Insert->fullPageWrites = true;
        LWLockRelease(WALInsertLock);
    }

    /*
     * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
     * full_page_writes during archive recovery, if required.
     */
    if (XLogStandbyInfoActive() && !RecoveryInProgress())
    {
        XLogRecData rdata;

        /* The record payload is just the new boolean setting */
        rdata.data = (char *) (&fullPageWrites);
        rdata.len = sizeof(bool);
        rdata.buffer = InvalidBuffer;
        rdata.next = NULL;

        XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
    }

    if (!fullPageWrites)
    {
        /* Disabling: clear the shared flag only after the record (if any) */
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
        Insert->fullPageWrites = false;
        LWLockRelease(WALInsertLock);
    }
    END_CRIT_SECTION();
}

void WakeupRecovery ( void   ) 
void xlog_desc ( StringInfo  buf,
uint8  xl_info,
char *  rec 
)

Definition at line 35 of file xlogdesc.c.

References appendStringInfo(), BkpBlock::block, xl_end_of_recovery::end_time, BkpBlock::fork, CheckPoint::fullPageWrites, xl_parameter_change::max_locks_per_xact, xl_parameter_change::max_prepared_xacts, xl_parameter_change::MaxConnections, config_enum_entry::name, CheckPoint::nextMulti, CheckPoint::nextMultiOffset, CheckPoint::nextOid, CheckPoint::nextXid, CheckPoint::nextXidEpoch, BkpBlock::node, CheckPoint::oldestActiveXid, CheckPoint::oldestMulti, CheckPoint::oldestMultiDB, CheckPoint::oldestXid, CheckPoint::oldestXidDB, xl_end_of_recovery::PrevTimeLineID, CheckPoint::PrevTimeLineID, CheckPoint::redo, relpathperm, xl_restore_point::rp_name, xl_end_of_recovery::ThisTimeLineID, CheckPoint::ThisTimeLineID, timestamptz_to_str(), config_enum_entry::val, xl_parameter_change::wal_level, XLOG_BACKUP_END, XLOG_CHECKPOINT_ONLINE, XLOG_CHECKPOINT_SHUTDOWN, XLOG_END_OF_RECOVERY, XLOG_FPW_CHANGE, XLOG_HINT, XLOG_NEXTOID, XLOG_NOOP, XLOG_PARAMETER_CHANGE, XLOG_RESTORE_POINT, and XLOG_SWITCH.

{
    /*
     * Append a human-readable description of one XLOG-rmgr record to buf.
     *
     * buf:     string buffer that receives the description
     * xl_info: the record's info byte; the bits in XLR_INFO_MASK are masked
     *          off and the remainder selects the record type
     * rec:     the record's data, interpreted according to the record type
     *
     * A record type that is not recognized is described as "UNKNOWN".
     */
    uint8       info = xl_info & ~XLR_INFO_MASK;

    if (info == XLOG_CHECKPOINT_SHUTDOWN ||
        info == XLOG_CHECKPOINT_ONLINE)
    {
        CheckPoint *checkpoint = (CheckPoint *) rec;

        /* The trailing %s tells shutdown and online checkpoints apart */
        appendStringInfo(buf, "checkpoint: redo %X/%X; "
                         "tli %u; prev tli %u; fpw %s; xid %u/%u; oid %u; multi %u; offset %u; "
                         "oldest xid %u in DB %u; oldest multi %u in DB %u; "
                         "oldest running xid %u; %s",
                         (uint32) (checkpoint->redo >> 32), (uint32) checkpoint->redo,
                         checkpoint->ThisTimeLineID,
                         checkpoint->PrevTimeLineID,
                         checkpoint->fullPageWrites ? "true" : "false",
                         checkpoint->nextXidEpoch, checkpoint->nextXid,
                         checkpoint->nextOid,
                         checkpoint->nextMulti,
                         checkpoint->nextMultiOffset,
                         checkpoint->oldestXid,
                         checkpoint->oldestXidDB,
                         checkpoint->oldestMulti,
                         checkpoint->oldestMultiDB,
                         checkpoint->oldestActiveXid,
                 (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
    }
    else if (info == XLOG_NOOP)
    {
        appendStringInfo(buf, "xlog no-op");
    }
    else if (info == XLOG_NEXTOID)
    {
        Oid         nextOid;

        /*
         * Copy the value out instead of reading it in place (presumably
         * because rec is not guaranteed to be suitably aligned).
         */
        memcpy(&nextOid, rec, sizeof(Oid));
        appendStringInfo(buf, "nextOid: %u", nextOid);
    }
    else if (info == XLOG_SWITCH)
    {
        appendStringInfo(buf, "xlog switch");
    }
    else if (info == XLOG_RESTORE_POINT)
    {
        xl_restore_point *xlrec = (xl_restore_point *) rec;

        appendStringInfo(buf, "restore point: %s", xlrec->rp_name);

    }
    else if (info == XLOG_HINT)
    {
        BkpBlock   *bkp = (BkpBlock *) rec;
        char       *path = relpathperm(bkp->node, bkp->fork);

        appendStringInfo(buf, "page hint: %s block %u",
                         path,
                         bkp->block);

        /*
         * relpathperm() returns a freshly palloc'd string; release it once
         * it has been copied into buf so describing many records does not
         * accumulate leaked path strings.
         */
        pfree(path);
    }
    else if (info == XLOG_BACKUP_END)
    {
        XLogRecPtr  startpoint;

        memcpy(&startpoint, rec, sizeof(XLogRecPtr));
        /* Printed as the high and low 32-bit halves of the pointer */
        appendStringInfo(buf, "backup end: %X/%X",
                         (uint32) (startpoint >> 32), (uint32) startpoint);
    }
    else if (info == XLOG_PARAMETER_CHANGE)
    {
        xl_parameter_change xlrec;
        const char *wal_level_str;
        const struct config_enum_entry *entry;

        memcpy(&xlrec, rec, sizeof(xl_parameter_change));

        /*
         * Find a string representation for wal_level; "?" is shown when no
         * entry of wal_level_options matches the recorded value.
         */
        wal_level_str = "?";
        for (entry = wal_level_options; entry->name; entry++)
        {
            if (entry->val == xlrec.wal_level)
            {
                wal_level_str = entry->name;
                break;
            }
        }

        appendStringInfo(buf, "parameter change: max_connections=%d max_prepared_xacts=%d max_locks_per_xact=%d wal_level=%s",
                         xlrec.MaxConnections,
                         xlrec.max_prepared_xacts,
                         xlrec.max_locks_per_xact,
                         wal_level_str);
    }
    else if (info == XLOG_FPW_CHANGE)
    {
        bool        fpw;

        memcpy(&fpw, rec, sizeof(bool));
        appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
    }
    else if (info == XLOG_END_OF_RECOVERY)
    {
        xl_end_of_recovery xlrec;

        memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
        appendStringInfo(buf, "end_of_recovery: tli %u; prev tli %u; time %s",
                         xlrec.ThisTimeLineID, xlrec.PrevTimeLineID,
                         timestamptz_to_str(xlrec.end_time));
    }
    else
        appendStringInfo(buf, "UNKNOWN");
}

void xlog_redo ( XLogRecPtr  lsn,
XLogRecord * record 
)

Definition at line 7890 of file xlog.c.

References ArchiveRecoveryRequested, Assert, ControlFileData::backupEndPoint, ControlFileData::backupEndRequired, ControlFileData::backupStartPoint, ControlFileData::checkPointCopy, CheckRequiredParameterValues(), XLogCtlData::ckptXid, XLogCtlData::ckptXidEpoch, ControlFileLock, DEBUG1, elog, ereport, errmsg(), XLogCtlData::info_lck, XLogCtlData::lastFpwDisableRecPtr, lastFullPageWrites, RunningTransactionsData::latestCompletedXid, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), xl_parameter_change::max_locks_per_xact, ControlFileData::max_locks_per_xact, xl_parameter_change::max_prepared_xacts, ControlFileData::max_prepared_xacts, xl_parameter_change::MaxConnections, ControlFileData::MaxConnections, minRecoveryPoint, ControlFileData::minRecoveryPoint, minRecoveryPointTLI, ControlFileData::minRecoveryPointTLI, MultiXactAdvanceNextMXact(), MultiXactAdvanceOldest(), MultiXactSetNextMXact(), CheckPoint::nextMulti, CheckPoint::nextMultiOffset, CheckPoint::nextOid, VariableCacheData::nextOid, RunningTransactionsData::nextXid, CheckPoint::nextXid, VariableCacheData::nextXid, CheckPoint::nextXidEpoch, VariableCacheData::oidCount, OidGenLock, CheckPoint::oldestMulti, CheckPoint::oldestMultiDB, RunningTransactionsData::oldestRunningXid, VariableCacheData::oldestXid, CheckPoint::oldestXid, CheckPoint::oldestXidDB, PANIC, PrescanPreparedTransactions(), ProcArrayApplyRecoveryInfo(), ReadRecPtr, RecoveryRestartPoint(), RestoreBackupBlockContents(), SetMultiXactIdLimit(), SetTransactionIdLimit(), ShmemVariableCache, SpinLockAcquire, SpinLockRelease, STANDBY_INITIALIZED, StandbyRecoverPreparedTransactions(), standbyState, RunningTransactionsData::subxcnt, RunningTransactionsData::subxid_overflow, xl_end_of_recovery::ThisTimeLineID, ThisTimeLineID, CheckPoint::ThisTimeLineID, TransactionIdIsNormal, TransactionIdPrecedes(), TransactionIdRetreat, UpdateControlFile(), xl_parameter_change::wal_level, ControlFileData::wal_level, RunningTransactionsData::xcnt, XidGenLock, 
RunningTransactionsData::xids, XLogRecord::xl_info, XLOG_BACKUP_END, XLOG_CHECKPOINT_ONLINE, XLOG_CHECKPOINT_SHUTDOWN, XLOG_END_OF_RECOVERY, XLOG_FPW_CHANGE, XLOG_HINT, XLOG_NEXTOID, XLOG_NOOP, XLOG_PARAMETER_CHANGE, XLOG_RESTORE_POINT, XLOG_SWITCH, XLogRecGetData, XLogRecPtrIsInvalid, and XLR_BKP_BLOCK_MASK.

/*
 * XLOG resource manager's redo routine.
 *
 * Replays a single XLOG-rmgr WAL record.  'lsn' is the WAL position passed in
 * by the caller (used below when advancing minRecoveryPoint and when restoring
 * a hint-bit page image); 'record' points to the record header, whose payload
 * is fetched with XLogRecGetData().
 *
 * The record type is taken from the bits of xl_info outside XLR_INFO_MASK and
 * dispatched through the if/else chain below.  Record types not listed in the
 * chain are silently ignored.
 */
{
    uint8       info = record->xl_info & ~XLR_INFO_MASK;

    /* Backup blocks are not used by XLOG rmgr */
    Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

    if (info == XLOG_NEXTOID)
    {
        Oid         nextOid;

        /*
         * We used to try to take the maximum of ShmemVariableCache->nextOid
         * and the recorded nextOid, but that fails if the OID counter wraps
         * around.  Since no OID allocation should be happening during replay
         * anyway, better to just believe the record exactly.  We still take
         * OidGenLock while setting the variable, just in case.
         */
        memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
        LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
        ShmemVariableCache->nextOid = nextOid;
        ShmemVariableCache->oidCount = 0;
        LWLockRelease(OidGenLock);
    }
    else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    {
        CheckPoint  checkPoint;

        /* copy out of the record; the payload may not be suitably aligned */
        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        /* In a SHUTDOWN checkpoint, believe the counters exactly */
        LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
        ShmemVariableCache->nextXid = checkPoint.nextXid;
        LWLockRelease(XidGenLock);
        LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
        ShmemVariableCache->nextOid = checkPoint.nextOid;
        ShmemVariableCache->oidCount = 0;
        LWLockRelease(OidGenLock);
        MultiXactSetNextMXact(checkPoint.nextMulti,
                              checkPoint.nextMultiOffset);
        SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
        SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);

        /*
         * If we see a shutdown checkpoint while waiting for an end-of-backup
         * record, the backup was canceled and the end-of-backup record will
         * never arrive.
         */
        if (ArchiveRecoveryRequested &&
            !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
            XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
            ereport(PANIC,
            (errmsg("online backup was canceled, recovery cannot continue")));

        /*
         * If we see a shutdown checkpoint, we know that nothing was running
         * on the master at this point. So fake-up an empty running-xacts
         * record and use that here and now. Recover additional standby state
         * for prepared transactions.
         */
        if (standbyState >= STANDBY_INITIALIZED)
        {
            TransactionId *xids;
            int         nxids;
            TransactionId oldestActiveXID;
            TransactionId latestCompletedXid;
            RunningTransactionsData running;

            oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);

            /*
             * Construct a RunningTransactions snapshot representing a shut
             * down server, with only prepared transactions still alive. We're
             * never overflowed at this point because all subxids are listed
             * with their parent prepared transactions.
             */
            running.xcnt = nxids;
            running.subxcnt = 0;
            running.subxid_overflow = false;
            running.nextXid = checkPoint.nextXid;
            running.oldestRunningXid = oldestActiveXID;
            /* latest completed XID is the one just before nextXid */
            latestCompletedXid = checkPoint.nextXid;
            TransactionIdRetreat(latestCompletedXid);
            Assert(TransactionIdIsNormal(latestCompletedXid));
            running.latestCompletedXid = latestCompletedXid;
            running.xids = xids;

            ProcArrayApplyRecoveryInfo(&running);

            StandbyRecoverPreparedTransactions(true);
        }

        /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
        ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
        ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

        /* Update shared-memory copy of checkpoint XID/epoch */
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile XLogCtlData *xlogctl = XLogCtl;

            SpinLockAcquire(&xlogctl->info_lck);
            xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
            xlogctl->ckptXid = checkPoint.nextXid;
            SpinLockRelease(&xlogctl->info_lck);
        }

        /*
         * We should've already switched to the new TLI before replaying this
         * record.
         */
        if (checkPoint.ThisTimeLineID != ThisTimeLineID)
            ereport(PANIC,
                    (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
                            checkPoint.ThisTimeLineID, ThisTimeLineID)));

        RecoveryRestartPoint(&checkPoint);
    }
    else if (info == XLOG_CHECKPOINT_ONLINE)
    {
        CheckPoint  checkPoint;

        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        /* In an ONLINE checkpoint, treat the XID counter as a minimum */
        LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
        if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
                                  checkPoint.nextXid))
            ShmemVariableCache->nextXid = checkPoint.nextXid;
        LWLockRelease(XidGenLock);
        /* ... but still treat OID counter as exact */
        LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
        ShmemVariableCache->nextOid = checkPoint.nextOid;
        ShmemVariableCache->oidCount = 0;
        LWLockRelease(OidGenLock);
        MultiXactAdvanceNextMXact(checkPoint.nextMulti,
                                  checkPoint.nextMultiOffset);
        /* only ever move the oldest-XID limit forward */
        if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
                                  checkPoint.oldestXid))
            SetTransactionIdLimit(checkPoint.oldestXid,
                                  checkPoint.oldestXidDB);
        MultiXactAdvanceOldest(checkPoint.oldestMulti,
                               checkPoint.oldestMultiDB);

        /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
        ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
        ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

        /* Update shared-memory copy of checkpoint XID/epoch */
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile XLogCtlData *xlogctl = XLogCtl;

            SpinLockAcquire(&xlogctl->info_lck);
            xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
            xlogctl->ckptXid = checkPoint.nextXid;
            SpinLockRelease(&xlogctl->info_lck);
        }

        /* TLI should not change in an on-line checkpoint */
        if (checkPoint.ThisTimeLineID != ThisTimeLineID)
            ereport(PANIC,
                    (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
                            checkPoint.ThisTimeLineID, ThisTimeLineID)));

        RecoveryRestartPoint(&checkPoint);
    }
    else if (info == XLOG_END_OF_RECOVERY)
    {
        xl_end_of_recovery xlrec;

        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));

        /*
         * For Hot Standby, we could treat this like a Shutdown Checkpoint,
         * but this case is rarer and harder to test, so the benefit doesn't
         * outweigh the potential extra cost of maintenance.
         */

        /*
         * We should've already switched to the new TLI before replaying this
         * record.  Name the record type correctly in the message so that a
         * failure here is not mistaken for a bad checkpoint record.
         */
        if (xlrec.ThisTimeLineID != ThisTimeLineID)
            ereport(PANIC,
                    (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
                            xlrec.ThisTimeLineID, ThisTimeLineID)));
    }
    else if (info == XLOG_NOOP)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_SWITCH)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_RESTORE_POINT)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_HINT)
    {
        char *data;
        BkpBlock bkpb;

        /*
         * Hint bit records contain a backup block stored "inline" in the normal
         * data since the locking when writing hint records isn't sufficient to
         * use the normal backup block mechanism, which assumes exclusive lock
         * on the buffer supplied.
         *
         * Since the only change in these backup block are hint bits, there are
         * no recovery conflicts generated.
         *
         * This also means there is no corresponding API call for this,
         * so an smgr implementation has no need to implement anything.
         * Which means nothing is needed in md.c etc
         */
        data = XLogRecGetData(record);
        memcpy(&bkpb, data, sizeof(BkpBlock));
        /* the page image follows immediately after the BkpBlock header */
        data += sizeof(BkpBlock);

        RestoreBackupBlockContents(lsn, bkpb, data, false, false);
    }
    else if (info == XLOG_BACKUP_END)
    {
        XLogRecPtr  startpoint;

        memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

        /* act only if this record ends the backup we are recovering from */
        if (ControlFile->backupStartPoint == startpoint)
        {
            /*
             * We have reached the end of base backup, the point where
             * pg_stop_backup() was done. The data on disk is now consistent.
             * Reset backupStartPoint, and update minRecoveryPoint to make
             * sure we don't allow starting up at an earlier point even if
             * recovery is stopped and restarted soon after this.
             */
            elog(DEBUG1, "end of backup reached");

            LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

            if (ControlFile->minRecoveryPoint < lsn)
            {
                ControlFile->minRecoveryPoint = lsn;
                ControlFile->minRecoveryPointTLI = ThisTimeLineID;
            }
            ControlFile->backupStartPoint = InvalidXLogRecPtr;
            ControlFile->backupEndRequired = false;
            UpdateControlFile();

            LWLockRelease(ControlFileLock);
        }
    }
    else if (info == XLOG_PARAMETER_CHANGE)
    {
        xl_parameter_change xlrec;

        /* Update our copy of the parameters in pg_control */
        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));

        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->MaxConnections = xlrec.MaxConnections;
        ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
        ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
        ControlFile->wal_level = xlrec.wal_level;

        /*
         * Update minRecoveryPoint to ensure that if recovery is aborted, we
         * recover back up to this point before allowing hot standby again.
         * This is particularly important if wal_level was set to 'archive'
         * before, and is now 'hot_standby', to ensure you don't run queries
         * against the WAL preceding the wal_level change. Same applies to
         * decreasing max_* settings.
         */
        minRecoveryPoint = ControlFile->minRecoveryPoint;
        minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
        if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
        {
            ControlFile->minRecoveryPoint = lsn;
            ControlFile->minRecoveryPointTLI = ThisTimeLineID;
        }

        UpdateControlFile();
        LWLockRelease(ControlFileLock);

        /* Check to see if any changes to max_connections give problems */
        CheckRequiredParameterValues();
    }
    else if (info == XLOG_FPW_CHANGE)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;
        bool        fpw;

        memcpy(&fpw, XLogRecGetData(record), sizeof(bool));

        /*
         * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
         * do_pg_start_backup() and do_pg_stop_backup() can check whether
         * full_page_writes has been disabled during online backup.
         */
        if (!fpw)
        {
            SpinLockAcquire(&xlogctl->info_lck);
            if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
                xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
            SpinLockRelease(&xlogctl->info_lck);
        }

        /* Keep track of full_page_writes */
        lastFullPageWrites = fpw;
    }
}

bool XLogBackgroundFlush ( void   ) 

Definition at line 2072 of file xlog.c.

References XLogCtlData::asyncXactLSN, elog, END_CRIT_SECTION, XLogwrtRqst::Flush, XLogwrtResult::Flush, XLogCtlData::info_lck, LOG, XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), openLogFile, openLogSegNo, RecoveryInProgress(), SpinLockAcquire, SpinLockRelease, START_CRIT_SECTION, WalSndWakeupProcessRequests, WALWriteLock, XLogwrtResult::Write, XLogwrtRqst::Write, XLByteInPrevSeg, XLogFileClose(), and XLogWrite().

Referenced by WalWriterMain().

/*
 * Background WAL write/flush.
 *
 * Returns false without writing when recovery is in progress or when the
 * chosen target position is already flushed; otherwise returns true only if
 * XLogWrite() was actually invoked.
 */
{
    /* use volatile pointer to prevent code rearrangement */
    volatile XLogCtlData *ctl = XLogCtl;
    XLogRecPtr  target;
    bool        allow_partial = true;
    bool        did_write = false;

    /* nothing to flush while we are still replaying WAL */
    if (RecoveryInProgress())
        return false;

    /* refresh our local LogwrtResult and pick up the current write request */
    SpinLockAcquire(&ctl->info_lck);
    LogwrtResult = ctl->LogwrtResult;
    target = ctl->LogwrtRqst.Write;
    SpinLockRelease(&ctl->info_lck);

    /* round down to the last completed page boundary */
    target -= target % XLOG_BLCKSZ;

    /*
     * Everything up to that boundary is flushed already, so aim for the
     * latest async-commit LSN instead, and insist that all of it is written.
     */
    if (target <= LogwrtResult.Flush)
    {
        SpinLockAcquire(&ctl->info_lck);
        target = ctl->asyncXactLSN;
        SpinLockRelease(&ctl->info_lck);
        allow_partial = false;
    }

    /*
     * Still nothing new to flush.  Before leaving, let go of an open log file
     * that is no longer the current one, so that it can be deleted.
     */
    if (target <= LogwrtResult.Flush)
    {
        if (openLogFile >= 0 &&
            !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
            XLogFileClose();

        return false;
    }

#ifdef WAL_DEBUG
    if (XLOG_DEBUG)
        elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
             (uint32) (target >> 32), (uint32) target,
             (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
             (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

    START_CRIT_SECTION();

    /* take the write lock, then re-check against the freshest flush position */
    LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    LogwrtResult = XLogCtl->LogwrtResult;
    if (target > LogwrtResult.Flush)
    {
        XLogwrtRqst rqst;

        rqst.Write = target;
        rqst.Flush = target;
        XLogWrite(rqst, allow_partial, false);
        did_write = true;
    }
    LWLockRelease(WALWriteLock);

    END_CRIT_SECTION();

    /* wake up walsenders now that we've released heavily contended locks */
    WalSndWakeupProcessRequests();

    return did_write;
}

int XLogFileInit ( XLogSegNo  segno,
bool * use_existent,
bool  use_lock 
)

Definition at line 2245 of file xlog.c.

References BasicOpenFile(), close, DEBUG2, elog, ereport, errcode_for_file_access(), errmsg(), ERROR, get_sync_bit(), InstallXLogFileSegment(), MAXPGPATH, palloc0(), pfree(), PG_BINARY, pg_fsync(), snprintf(), sync_method, ThisTimeLineID, unlink(), write, XLOGDIR, and XLogFilePath.

Referenced by BootStrapXLOG(), PreallocXlogFiles(), XLogWalRcvWrite(), and XLogWrite().

/*
 * Create (or open) the WAL segment file for 'logsegno' and return an open
 * file descriptor for it.
 *
 * *use_existent: on entry, true means an already-existing file may simply be
 * opened and returned.  If a new file had to be built instead, *use_existent
 * is set to false before returning.
 *
 * use_lock: passed through unchanged to InstallXLogFileSegment().
 *
 * Failures are reported with ereport(ERROR).  Every message that uses %m must
 * see the errno of the call that actually failed, so errno is saved across
 * any cleanup call (unlink/close) made before reporting.
 */
{
    char        path[MAXPGPATH];
    char        tmppath[MAXPGPATH];
    char       *zbuffer;
    XLogSegNo   installed_segno;
    int         max_advance;
    int         fd;
    int         nbytes;

    XLogFilePath(path, ThisTimeLineID, logsegno);

    /*
     * Try to use existent file (checkpoint maker may have created it already)
     */
    if (*use_existent)
    {
        fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
                           S_IRUSR | S_IWUSR);
        if (fd < 0)
        {
            /* a missing file just means we must create it below */
            if (errno != ENOENT)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not open file \"%s\": %m", path)));
        }
        else
            return fd;
    }

    /*
     * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
     * another process is doing the same thing.  If so, we will end up
     * pre-creating an extra log segment.  That seems OK, and better than
     * holding the lock throughout this lengthy process.
     */
    elog(DEBUG2, "creating and filling new WAL file");

    snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

    /* remove any stale temp file so that the O_EXCL create below can succeed */
    unlink(tmppath);

    /*
     * Allocate a buffer full of zeros. This is done before opening the file
     * so that we don't leak the file descriptor if palloc fails.
     *
     * Note: palloc zbuffer, instead of just using a local char array, to
     * ensure it is reasonably well-aligned; this may save a few cycles
     * transferring data to the kernel.
     */
    zbuffer = (char *) palloc0(XLOG_BLCKSZ);

    /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
                       S_IRUSR | S_IWUSR);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not create file \"%s\": %m", tmppath)));

    /*
     * Zero-fill the file.  We have to do this the hard way to ensure that all
     * the file space has really been allocated --- on platforms that allow
     * "holes" in files, just seeking to the end doesn't allocate intermediate
     * space.  This way, we know that we have all the space and (after the
     * fsync below) that all the indirect blocks are down on disk.  Therefore,
     * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
     * log file.
     */
    for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
    {
        errno = 0;
        if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
        {
            int         save_errno = errno;

            /*
             * If we fail to make the file, delete it to release disk space
             */
            unlink(tmppath);

            close(fd);

            /* if write didn't set errno, assume problem is no disk space */
            errno = save_errno ? save_errno : ENOSPC;

            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not write to file \"%s\": %m", tmppath)));
        }
    }
    pfree(zbuffer);

    if (pg_fsync(fd) != 0)
    {
        /*
         * close() may itself fail and overwrite errno; preserve the fsync
         * failure code so that %m below reports the real cause.
         */
        int         save_errno = errno;

        close(fd);
        errno = save_errno;
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not fsync file \"%s\": %m", tmppath)));
    }

    if (close(fd))
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not close file \"%s\": %m", tmppath)));

    /*
     * Now move the segment into place with its final name.
     *
     * If caller didn't want to use a pre-existing file, get rid of any
     * pre-existing file.  Otherwise, cope with possibility that someone else
     * has created the file while we were filling ours: if so, use ours to
     * pre-create a future log segment.
     */
    installed_segno = logsegno;
    max_advance = XLOGfileslop;
    if (!InstallXLogFileSegment(&installed_segno, tmppath,
                                *use_existent, &max_advance,
                                use_lock))
    {
        /*
         * No need for any more future segments, or InstallXLogFileSegment()
         * failed to rename the file into place. If the rename failed, opening
         * the file below will fail.
         */
        unlink(tmppath);
    }

    /* Set flag to tell caller there was no existent file */
    *use_existent = false;

    /* Now open original target segment (might not be file I just made) */
    fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
                       S_IRUSR | S_IWUSR);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));

    elog(DEBUG2, "done creating and filling new WAL file");

    return fd;
}

char* XLogFileNameP ( TimeLineID  tli,
XLogSegNo  segno 
)

Definition at line 8365 of file xlog.c.

References MAXFNAMELEN, palloc(), and XLogFileName.

Referenced by assign_xlog_sync_method(), issue_xlog_fsync(), WalReceiverMain(), XLogFileClose(), XLogRead(), XLogWalRcvWrite(), and XLogWrite().

/*
 * Build the WAL file name for (tli, segno) in a freshly palloc'd buffer of
 * MAXFNAMELEN bytes and return it.  This function does not free the buffer.
 */
{
    char       *fname;

    fname = palloc(MAXFNAMELEN);
    XLogFileName(fname, tli, segno);

    return fname;
}

int XLogFileOpen ( XLogSegNo  segno  ) 

Definition at line 2597 of file xlog.c.

References BasicOpenFile(), ereport, errcode_for_file_access(), errmsg(), get_sync_bit(), PANIC, PG_BINARY, sync_method, ThisTimeLineID, and XLogFilePath.

Referenced by StartupXLOG(), and XLogWrite().

/*
 * Open the WAL segment 'segno' on the current timeline (ThisTimeLineID) for
 * read/write and return its file descriptor.  Failure to open the file is
 * reported at PANIC level.
 */
{
    char        segpath[MAXPGPATH];
    int         open_flags;
    int         segfd;

    XLogFilePath(segpath, ThisTimeLineID, segno);

    /* include whatever open() flag the configured sync method calls for */
    open_flags = O_RDWR | PG_BINARY | get_sync_bit(sync_method);

    segfd = BasicOpenFile(segpath, open_flags, S_IRUSR | S_IWUSR);
    if (segfd >= 0)
        return segfd;

    ereport(PANIC,
            (errcode_for_file_access(),
             errmsg("could not open xlog file \"%s\": %m", segpath)));

    return segfd;
}

void XLogFlush ( XLogRecPtr  RecPtr  ) 

Definition at line 1891 of file xlog.c.

References CommitDelay, CommitSiblings, XLogCtlInsert::curridx, elog, enableFsync, END_CRIT_SECTION, ERROR, XLogwrtRqst::Flush, XLogwrtResult::Flush, XLogCtlData::info_lck, XLogCtlData::Insert, Insert(), INSERT_FREESPACE, LOG, XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, LW_EXCLUSIVE, LWLockAcquireOrWait(), LWLockConditionalAcquire(), LWLockRelease(), MinimumActiveBackends(), pg_usleep(), SpinLockAcquire, SpinLockRelease, START_CRIT_SECTION, UpdateMinRecoveryPoint(), WALInsertLock, WalSndWakeupProcessRequests, WALWriteLock, XLogwrtRqst::Write, XLogwrtResult::Write, XLogCtlData::xlblocks, XLogInsertAllowed(), and XLogWrite().

Referenced by CreateCheckPoint(), CreateEndOfRecoveryRecord(), EndPrepare(), FlushBuffer(), RecordTransactionAbortPrepared(), RecordTransactionCommit(), RecordTransactionCommitPrepared(), RelationTruncate(), SlruPhysicalWritePage(), smgr_redo(), write_relmap_file(), WriteTruncateXlogRec(), and xact_redo_commit_internal().

{
    /*
     * Make sure WAL has been written and flushed up through the position
     * "record" (the function's single argument; the prototype printed above
     * this body names it RecPtr).
     *
     * When WAL insertion is not allowed, nothing is flushed: the request is
     * handed to UpdateMinRecoveryPoint() instead.  Otherwise the function
     * loops until either another backend has flushed far enough or it gets
     * WALWriteLock and issues the write itself via XLogWrite().  If the
     * flush pointer still has not reached "record" afterwards, elog(ERROR)
     * is raised.
     */
    XLogRecPtr  WriteRqstPtr;
    XLogwrtRqst WriteRqst;

    /*
     * During REDO, we are reading not writing WAL.  Therefore, instead of
     * trying to flush the WAL, we should update minRecoveryPoint instead. We
     * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
     * to act this way too, and because when it tries to write the
     * end-of-recovery checkpoint, it should indeed flush.
     */
    if (!XLogInsertAllowed())
    {
        UpdateMinRecoveryPoint(record, false);
        return;
    }

    /* Quick exit if already known flushed */
    if (record <= LogwrtResult.Flush)
        return;

#ifdef WAL_DEBUG
    if (XLOG_DEBUG)
        elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
             (uint32) (record >> 32), (uint32) record,
             (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
             (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

    START_CRIT_SECTION();

    /*
     * Since fsync is usually a horribly expensive operation, we try to
     * piggyback as much data as we can on each fsync: if we see any more data
     * entered into the xlog buffer, we'll write and fsync that too, so that
     * the final value of LogwrtResult.Flush is as large as possible. This
     * gives us some chance of avoiding another fsync immediately after.
     */

    /* initialize to given target; may increase below */
    WriteRqstPtr = record;

    /*
     * Now wait until we get the write lock, or someone else does the flush
     * for us.
     */
    for (;;)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        /*
         * read LogwrtResult and update local state; also raise our write
         * target to the shared write request if that is further along
         */
        SpinLockAcquire(&xlogctl->info_lck);
        if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
            WriteRqstPtr = xlogctl->LogwrtRqst.Write;
        LogwrtResult = xlogctl->LogwrtResult;
        SpinLockRelease(&xlogctl->info_lck);

        /* done already? */
        if (record <= LogwrtResult.Flush)
            break;

        /*
         * Try to get the write lock. If we can't get it immediately, wait
         * until it's released, and recheck if we still need to do the flush
         * or if the backend that held the lock did it for us already. This
         * helps to maintain a good rate of group committing when the system
         * is bottlenecked by the speed of fsyncing.
         */
        if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
        {
            /*
             * The lock is now free, but we didn't acquire it yet. Before we
             * do, loop back to check if someone else flushed the record for
             * us already.
             */
            continue;
        }

        /* Got the lock; recheck whether request is satisfied */
        LogwrtResult = XLogCtl->LogwrtResult;
        if (record <= LogwrtResult.Flush)
        {
            LWLockRelease(WALWriteLock);
            break;
        }

        /*
         * Sleep before flush! By adding a delay here, we may give further
         * backends the opportunity to join the backlog of group commit
         * followers; this can significantly improve transaction throughput, at
         * the risk of increasing transaction latency.
         *
         * We do not sleep if enableFsync is not turned on, nor if there are
         * fewer than CommitSiblings other backends with active transactions.
         */
        if (CommitDelay > 0 && enableFsync &&
            MinimumActiveBackends(CommitSiblings))
            pg_usleep(CommitDelay);

        /* try to write/flush later additions to XLOG as well */
        if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
        {
            XLogCtlInsert *Insert = &XLogCtl->Insert;
            uint32      freespace = INSERT_FREESPACE(Insert);

            /*
             * Both branches start from xlblocks[curridx]; when the current
             * insert page still has free space we back off by that amount,
             * so the request stops at the data inserted so far.
             * NOTE(review): this reads as if xlblocks[] holds each page's
             * end address --- confirm against the XLogCtlData definition.
             */
            if (freespace == 0)     /* buffer is full */
                WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
            else
            {
                WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
                WriteRqstPtr -= freespace;
            }
            LWLockRelease(WALInsertLock);
            WriteRqst.Write = WriteRqstPtr;
            WriteRqst.Flush = WriteRqstPtr;
        }
        else
        {
            /*
             * Insert lock is busy: write as far as the request pointer
             * gathered above, but only insist on flushing through the
             * caller's own target.
             */
            WriteRqst.Write = WriteRqstPtr;
            WriteRqst.Flush = record;
        }
        XLogWrite(WriteRqst, false, false);

        LWLockRelease(WALWriteLock);
        /* done */
        break;
    }

    END_CRIT_SECTION();

    /* wake up walsenders now that we've released heavily contended locks */
    WalSndWakeupProcessRequests();

    /*
     * If we still haven't flushed to the request point then we have a
     * problem; most likely, the requested flush point is past end of XLOG.
     * This has been seen to occur when a disk page has a corrupted LSN.
     *
     * Formerly we treated this as a PANIC condition, but that hurts the
     * system's robustness rather than helping it: we do not want to take down
     * the whole system due to corruption on one data page.  In particular, if
     * the bad page is encountered again during recovery then we would be
     * unable to restart the database at all!  (This scenario actually
     * happened in the field several times with 7.1 releases.)  As of 8.4, bad
     * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
     * the only time we can reach here during recovery is while flushing the
     * end-of-recovery checkpoint record, and we don't expect that to have a
     * bad LSN.
     *
     * Note that for calls from xact.c, the ERROR will be promoted to PANIC
     * since xact.c calls this routine inside a critical section.  However,
     * calls from bufmgr.c are not within critical sections and so we will not
     * force a restart for a bad LSN on a data page.
     */
    if (LogwrtResult.Flush < record)
        elog(ERROR,
        "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
             (uint32) (record >> 32), (uint32) record,
             (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
}

XLogRecPtr XLogInsert ( RmgrId  rmid,
uint8  info,
XLogRecData rdata 
)

Definition at line 712 of file xlog.c.

References AdvanceXLInsertBuffer(), appendStringInfo(), Assert, buf, XLogRecData::buffer, BufferGetBlock, COMP_CRC32, XLogCtlWrite::curridx, XLogCtlInsert::curridx, XLogCtlInsert::currpage, XLogCtlInsert::currpos, StringInfoData::data, XLogRecData::data, elog, END_CRIT_SECTION, ERROR, FIN_CRC32, XLogwrtRqst::Flush, XLogwrtResult::Flush, XLogCtlInsert::forcePageWrites, XLogCtlInsert::fullPageWrites, GetCurrentTransactionIdIfAny(), BkpBlock::hole_length, BkpBlock::hole_offset, i, XLogCtlData::info_lck, INIT_CRC32, initStringInfo(), XLogCtlData::Insert, Insert(), INSERT_FREESPACE, INSERT_RECPTR, InvalidBuffer, IsBootstrapProcessingMode, XLogRecData::len, LOG, XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), malloc, MAXALIGN, MemSet, XLogRecData::next, NULL, offsetof, PANIC, pfree(), PrevBufIdx, XLogCtlInsert::PrevRecord, ProcLastRecPtr, XLogCtlInsert::RedoRecPtr, RedoRecPtr, RmgrData::rm_desc, RmgrTable, SizeOfXLogLongPHD, SizeOfXLogRecord, SpinLockAcquire, SpinLockRelease, START_CRIT_SECTION, WALInsertLock, WalSndWakeupProcessRequests, WALWriteLock, XLogwrtResult::Write, XLogCtlData::Write, XLogwrtRqst::Write, XactLastRecEnd, XLogRecord::xl_info, XLogRecord::xl_len, XLogRecord::xl_prev, XLogRecord::xl_rmid, XLogRecord::xl_tot_len, XLogRecord::xl_xid, XLogCtlData::xlblocks, XLogCheckBuffer(), XLogInsertAllowed(), XLogSegSize, XLogWrite(), XLogPageHeaderData::xlp_info, XLogPageHeaderData::xlp_rem_len, XLR_BKP_BLOCK, and XLR_INFO_MASK.

Referenced by _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_log_reuse_page(), _bt_newroot(), _bt_pagedel(), _bt_split(), addLeafTuple(), AlterSequence(), AssignTransactionId(), CreateCheckPoint(), createdb(), CreateEndOfRecoveryRecord(), CreateMultiXactId(), createPostingTree(), CreateTableSpace(), do_pg_stop_backup(), do_setval(), doPickSplit(), DropTableSpace(), EndPrepare(), fill_seq_with_data(), ginbuild(), ginDeletePage(), ginHeapTupleFastInsert(), ginInsertValue(), ginUpdateStats(), gistbuild(), gistXLogSplit(), gistXLogUpdate(), heap_delete(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), log_heap_clean(), log_heap_cleanup_info(), log_heap_freeze(), log_heap_update(), log_heap_visible(), log_newpage(), log_newpage_buffer(), log_smgrcreate(), LogAccessExclusiveLocks(), LogCurrentRunningXacts(), movedb(), moveLeafs(), nextval_internal(), RecordTransactionAbort(), RecordTransactionAbortPrepared(), RecordTransactionCommit(), RecordTransactionCommitPrepared(), RelationTruncate(), remove_dbtablespaces(), RequestXLogSwitch(), shiftList(), spgAddNodeAction(), spgbuild(), spgSplitNodeAction(), UpdateFullPageWrites(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), write_relmap_file(), writeListPage(), WriteMZeroPageXlogRec(), WriteTruncateXlogRec(), WriteZeroPageXlogRec(), XLogPutNextOid(), XLogReportParameters(), XLogRestorePoint(), XLogSaveBufferForHint(), and xlogVacuumPage().

{
    /*
     * Insert one WAL record, assembled from the rdata chain, into the WAL
     * buffers.
     *
     * rmid:  resource manager id, stored in the record header (xl_rmid).
     * info:  record flag bits.  The bits covered by XLR_INFO_MASK must be
     *        clear on entry (PANIC otherwise); this function ORs in
     *        XLR_BKP_BLOCK(i) bits for every buffer it backs up.
     * rdata: head of a chain of XLogRecData items.  Items whose buffer is
     *        InvalidBuffer are always part of the record; items attached to
     *        a buffer are dropped (data/len cleared) when that buffer's page
     *        is included as a backup block.
     *
     * The chain is modified in place: besides the clearing just described,
     * backup-block entries living in this function's local arrays are linked
     * after the last caller-supplied item, and data/len of items are advanced
     * while copying a piece that straddles a page boundary.  The added
     * entries are unlinked only on the retry paths ("goto begin").
     * NOTE(review): on normal return the caller's last item may therefore
     * still point at the local entries --- callers presumably must not walk
     * the chain again afterwards; confirm.
     *
     * Returns the position just past the inserted record (after alignment),
     * with two exceptions visible below: in bootstrap mode a non-XLOG rmgr
     * record is not logged and SizeOfXLogLongPHD is returned, and an
     * XLOG_SWITCH requested exactly at the start of a segment is not
     * inserted and the end address of the prior segment is returned.
     */
    XLogCtlInsert *Insert = &XLogCtl->Insert;
    XLogRecPtr  RecPtr;
    XLogRecPtr  WriteRqst;
    uint32      freespace;
    int         curridx;
    XLogRecData *rdt;
    XLogRecData *rdt_lastnormal;
    Buffer      dtbuf[XLR_MAX_BKP_BLOCKS];
    bool        dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
    BkpBlock    dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
    XLogRecPtr  dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
    XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
    XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
    XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
    XLogRecData hdr_rdt;
    pg_crc32    rdata_crc;
    uint32      len,
                write_len;
    unsigned    i;
    bool        updrqst;
    bool        doPageWrites;
    bool        isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
    uint8       info_orig = info;
    static XLogRecord *rechdr;

    /*
     * The record header workspace is malloc'd and zeroed on the first call
     * and then reused by every later call (it is a static pointer and is
     * never freed here).
     */
    if (rechdr == NULL)
    {
        rechdr = malloc(SizeOfXLogRecord);
        if (rechdr == NULL)
            elog(ERROR, "out of memory");
        MemSet(rechdr, 0, SizeOfXLogRecord);
    }

    /* cross-check on whether we should be here or not */
    if (!XLogInsertAllowed())
        elog(ERROR, "cannot make new WAL entries during recovery");

    /*
     * The info bits covered by XLR_INFO_MASK are reserved for use by me
     * (they carry the backup-block flags set further down), so the caller
     * must not have set any of them.
     */
    if (info & XLR_INFO_MASK)
        elog(PANIC, "invalid xlog info mask %02X", info);

    TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);

    /*
     * In bootstrap mode, we don't actually log anything but XLOG resources;
     * return a phony record pointer.
     */
    if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
    {
        RecPtr = SizeOfXLogLongPHD;     /* start of 1st chkpt record */
        return RecPtr;
    }

    /*
     * Here we scan the rdata chain, to determine which buffers must be backed
     * up.
     *
     * We may have to loop back to here if a race condition is detected below.
     * We could prevent the race by doing all this work while holding the
     * insert lock, but it seems better to avoid doing CRC calculations while
     * holding the lock.
     *
     * We add entries for backup blocks to the chain, so that they don't need
     * any special treatment in the critical section where the chunks are
     * copied into the WAL buffers. Those entries have to be unlinked from the
     * chain if we have to loop back here.
     */
begin:;
    for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
    {
        dtbuf[i] = InvalidBuffer;
        dtbuf_bkp[i] = false;
    }

    /*
     * Decide if we need to do full-page writes in this XLOG record: true if
     * full_page_writes is on or we have a PITR request for it.  Since we
     * don't yet have the insert lock, fullPageWrites and forcePageWrites
     * could change under us, but we'll recheck them once we have the lock.
     */
    doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;

    /* len accumulates the size of the regular (non-backup-block) data */
    len = 0;
    for (rdt = rdata;;)
    {
        if (rdt->buffer == InvalidBuffer)
        {
            /* Simple data, just include it */
            len += rdt->len;
        }
        else
        {
            /*
             * Find info for buffer: either the slot already assigned to it
             * by an earlier chain item, or the first unused slot.
             */
            for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
            {
                if (rdt->buffer == dtbuf[i])
                {
                    /* Buffer already referenced by earlier chain item */
                    if (dtbuf_bkp[i])
                    {
                        /* page is being backed up, so drop this item's data */
                        rdt->data = NULL;
                        rdt->len = 0;
                    }
                    else if (rdt->data)
                        len += rdt->len;
                    break;
                }
                if (dtbuf[i] == InvalidBuffer)
                {
                    /* OK, put it in this slot */
                    dtbuf[i] = rdt->buffer;
                    if (doPageWrites && XLogCheckBuffer(rdt, true,
                                        &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
                    {
                        dtbuf_bkp[i] = true;
                        rdt->data = NULL;
                        rdt->len = 0;
                    }
                    else if (rdt->data)
                        len += rdt->len;
                    break;
                }
            }
            /* loop ran off the end: more distinct buffers than slots */
            if (i >= XLR_MAX_BKP_BLOCKS)
                elog(PANIC, "can backup at most %d blocks per xlog record",
                     XLR_MAX_BKP_BLOCKS);
        }
        /* Break out of loop when rdt points to last chain item */
        if (rdt->next == NULL)
            break;
        rdt = rdt->next;
    }

    /*
     * NOTE: We disallow len == 0 because it provides a useful bit of extra
     * error checking in ReadRecord.  This means that all callers of
     * XLogInsert must supply at least some not-in-a-buffer data.  However, we
     * make an exception for XLOG SWITCH records because we don't want them to
     * ever cross a segment boundary.
     */
    if (len == 0 && !isLogSwitch)
        elog(PANIC, "invalid xlog record length %u", len);

    /*
     * Make additional rdata chain entries for the backup blocks, so that we
     * don't need to special-case them in the write loop.  This modifies the
     * original rdata chain, but we keep a pointer to the last regular entry,
     * rdt_lastnormal, so that we can undo this if we have to loop back to the
     * beginning.
     *
     * At the exit of this loop, write_len includes the backup block data.
     *
     * Also set the appropriate info bits to show which buffers were backed
     * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
     * value (ignoring InvalidBuffer) appearing in the rdata chain.
     */
    rdt_lastnormal = rdt;
    write_len = len;
    for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
    {
        BkpBlock   *bkpb;
        char       *page;

        if (!dtbuf_bkp[i])
            continue;

        info |= XLR_BKP_BLOCK(i);

        bkpb = &(dtbuf_xlg[i]);
        page = (char *) BufferGetBlock(dtbuf[i]);

        /* first added entry: the BkpBlock header itself */
        rdt->next = &(dtbuf_rdt1[i]);
        rdt = rdt->next;

        rdt->data = (char *) bkpb;
        rdt->len = sizeof(BkpBlock);
        write_len += sizeof(BkpBlock);

        /* second added entry: the page image (or its part before the hole) */
        rdt->next = &(dtbuf_rdt2[i]);
        rdt = rdt->next;

        if (bkpb->hole_length == 0)
        {
            rdt->data = page;
            rdt->len = BLCKSZ;
            write_len += BLCKSZ;
            rdt->next = NULL;
        }
        else
        {
            /* must skip the hole */
            rdt->data = page;
            rdt->len = bkpb->hole_offset;
            write_len += bkpb->hole_offset;

            /* third added entry: the part of the page after the hole */
            rdt->next = &(dtbuf_rdt3[i]);
            rdt = rdt->next;

            rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
            rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
            write_len += rdt->len;
            rdt->next = NULL;
        }
    }

    /*
     * Calculate CRC of the data, including all the backup blocks
     *
     * Note that the record header isn't added into the CRC initially since we
     * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
     * the whole record in the order: rdata, then backup blocks, then record
     * header.
     */
    INIT_CRC32(rdata_crc);
    for (rdt = rdata; rdt != NULL; rdt = rdt->next)
        COMP_CRC32(rdata_crc, rdt->data, rdt->len);

    /*
     * Construct record header (prev-link and CRC are filled in later), and
     * make that the first chunk in the chain.
     */
    rechdr->xl_xid = GetCurrentTransactionIdIfAny();
    rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
    rechdr->xl_len = len;       /* doesn't include backup blocks */
    rechdr->xl_info = info;
    rechdr->xl_rmid = rmid;

    hdr_rdt.next = rdata;
    hdr_rdt.data = (char *) rechdr;
    hdr_rdt.len = SizeOfXLogRecord;

    /* from here on write_len counts the header too */
    write_len += SizeOfXLogRecord;

    START_CRIT_SECTION();

    /* Now wait to get insert lock */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

    /*
     * Check to see if my RedoRecPtr is out of date.  If so, may have to go
     * back and recompute everything.  This can only happen just after a
     * checkpoint, so it's better to be slow in this case and fast otherwise.
     *
     * If we aren't doing full-page writes then RedoRecPtr doesn't actually
     * affect the contents of the XLOG record, so we'll update our local copy
     * but not force a recomputation.
     */
    if (RedoRecPtr != Insert->RedoRecPtr)
    {
        Assert(RedoRecPtr < Insert->RedoRecPtr);
        RedoRecPtr = Insert->RedoRecPtr;

        if (doPageWrites)
        {
            for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
            {
                if (dtbuf[i] == InvalidBuffer)
                    continue;
                if (dtbuf_bkp[i] == false &&
                    dtbuf_lsn[i] <= RedoRecPtr)
                {
                    /*
                     * Oops, this buffer now needs to be backed up, but we
                     * didn't think so above.  Start over.
                     *
                     * Before retrying, cut the backup-block entries off the
                     * caller's chain and drop the backup-block info bits.
                     */
                    LWLockRelease(WALInsertLock);
                    END_CRIT_SECTION();
                    rdt_lastnormal->next = NULL;
                    info = info_orig;
                    goto begin;
                }
            }
        }
    }

    /*
     * Also check to see if fullPageWrites or forcePageWrites was just turned
     * on; if we weren't already doing full-page writes then go back and
     * recompute. (If it was just turned off, we could recompute the record
     * without full pages, but we choose not to bother.)
     */
    if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
    {
        /* Oops, must redo it with full-page data. */
        LWLockRelease(WALInsertLock);
        END_CRIT_SECTION();
        rdt_lastnormal->next = NULL;
        info = info_orig;
        goto begin;
    }

    /*
     * If the current page is completely full, the record goes to the next
     * page, right after the page header.
     */
    updrqst = false;
    freespace = INSERT_FREESPACE(Insert);
    if (freespace == 0)
    {
        updrqst = AdvanceXLInsertBuffer(false);
        freespace = INSERT_FREESPACE(Insert);
    }

    /* Compute record's XLOG location */
    curridx = Insert->curridx;
    INSERT_RECPTR(RecPtr, Insert, curridx);

    /*
     * If the record is an XLOG_SWITCH, and we are exactly at the start of a
     * segment, we need not insert it (and don't want to because we'd like
     * consecutive switch requests to be no-ops).  Instead, make sure
     * everything is written and flushed through the end of the prior segment,
     * and return the prior segment's end address.
     */
    if (isLogSwitch && (RecPtr % XLogSegSize) == SizeOfXLogLongPHD)
    {
        /* We can release insert lock immediately */
        LWLockRelease(WALInsertLock);

        RecPtr -= SizeOfXLogLongPHD;

        LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
        LogwrtResult = XLogCtl->LogwrtResult;
        if (LogwrtResult.Flush < RecPtr)
        {
            XLogwrtRqst FlushRqst;

            FlushRqst.Write = RecPtr;
            FlushRqst.Flush = RecPtr;
            XLogWrite(FlushRqst, false, false);
        }
        LWLockRelease(WALWriteLock);

        END_CRIT_SECTION();

        /* wake up walsenders now that we've released heavily contended locks */
        WalSndWakeupProcessRequests();
        return RecPtr;
    }

    /* Finish the record header */
    rechdr->xl_prev = Insert->PrevRecord;

    /* Now we can finish computing the record's CRC */
    COMP_CRC32(rdata_crc, (char *) rechdr, offsetof(XLogRecord, xl_crc));
    FIN_CRC32(rdata_crc);
    rechdr->xl_crc = rdata_crc;

#ifdef WAL_DEBUG
    if (XLOG_DEBUG)
    {
        StringInfoData buf;

        initStringInfo(&buf);
        appendStringInfo(&buf, "INSERT @ %X/%X: ",
                         (uint32) (RecPtr >> 32), (uint32) RecPtr);
        xlog_outrec(&buf, rechdr);
        if (rdata->data != NULL)
        {
            appendStringInfo(&buf, " - ");
            RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
        }
        elog(LOG, "%s", buf.data);
        pfree(buf.data);
    }
#endif

    /* Record begin of record in appropriate places */
    ProcLastRecPtr = RecPtr;
    Insert->PrevRecord = RecPtr;

    /*
     * Append the data, including backup blocks if any
     *
     * From here rdata walks the full chain, starting at the header entry.
     */
    rdata = &hdr_rdt;
    while (write_len)
    {
        /* skip chain entries whose data was cleared above */
        while (rdata->data == NULL)
            rdata = rdata->next;

        if (freespace > 0)
        {
            if (rdata->len > freespace)
            {
                /*
                 * Entry doesn't fit: copy what fits, advance within the
                 * entry, and fall through to switch to the next buffer.
                 */
                memcpy(Insert->currpos, rdata->data, freespace);
                rdata->data += freespace;
                rdata->len -= freespace;
                write_len -= freespace;
            }
            else
            {
                /* Entry fits entirely on the current page */
                memcpy(Insert->currpos, rdata->data, rdata->len);
                freespace -= rdata->len;
                write_len -= rdata->len;
                Insert->currpos += rdata->len;
                rdata = rdata->next;
                continue;
            }
        }

        /* Use next buffer */
        updrqst = AdvanceXLInsertBuffer(false);
        curridx = Insert->curridx;
        /* Mark page header to indicate this record continues on the page */
        Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
        Insert->currpage->xlp_rem_len = write_len;
        freespace = INSERT_FREESPACE(Insert);
    }

    /* Ensure next record will be properly aligned */
    Insert->currpos = (char *) Insert->currpage +
        MAXALIGN(Insert->currpos - (char *) Insert->currpage);
    freespace = INSERT_FREESPACE(Insert);

    /*
     * The recptr I return is the beginning of the *next* record. This will be
     * stored as LSN for changed data pages...
     */
    INSERT_RECPTR(RecPtr, Insert, curridx);

    /*
     * If the record is an XLOG_SWITCH, we must now write and flush all the
     * existing data, and then forcibly advance to the start of the next
     * segment.  It's not good to do this I/O while holding the insert lock,
     * but there seems too much risk of confusion if we try to release the
     * lock sooner.  Fortunately xlog switch needn't be a high-performance
     * operation anyway...
     */
    if (isLogSwitch)
    {
        XLogwrtRqst FlushRqst;
        XLogRecPtr  OldSegEnd;

        TRACE_POSTGRESQL_XLOG_SWITCH();

        LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

        /*
         * Flush through the end of the page containing XLOG_SWITCH, and
         * perform end-of-segment actions (eg, notifying archiver).
         */
        WriteRqst = XLogCtl->xlblocks[curridx];
        FlushRqst.Write = WriteRqst;
        FlushRqst.Flush = WriteRqst;
        XLogWrite(FlushRqst, false, true);

        /* Set up the next buffer as first page of next segment */
        /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
        (void) AdvanceXLInsertBuffer(true);

        /* There should be no unwritten data */
        curridx = Insert->curridx;
        Assert(curridx == XLogCtl->Write.curridx);

        /* Compute end address of old segment */
        OldSegEnd = XLogCtl->xlblocks[curridx];
        OldSegEnd -= XLOG_BLCKSZ;

        /* Make it look like we've written and synced all of old segment */
        LogwrtResult.Write = OldSegEnd;
        LogwrtResult.Flush = OldSegEnd;

        /*
         * Update shared-memory status --- this code should match XLogWrite
         */
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile XLogCtlData *xlogctl = XLogCtl;

            SpinLockAcquire(&xlogctl->info_lck);
            xlogctl->LogwrtResult = LogwrtResult;
            if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
                xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
            if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
                xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
            SpinLockRelease(&xlogctl->info_lck);
        }

        LWLockRelease(WALWriteLock);

        updrqst = false;        /* done already */
    }
    else
    {
        /* normal case, ie not xlog switch */

        /* Need to update shared LogwrtRqst if some block was filled up */
        if (freespace == 0)
        {
            /* curridx is filled and available for writing out */
            updrqst = true;
        }
        else
        {
            /* if updrqst already set, write through end of previous buf */
            curridx = PrevBufIdx(curridx);
        }
        WriteRqst = XLogCtl->xlblocks[curridx];
    }

    LWLockRelease(WALInsertLock);

    if (updrqst)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        /* advance global request to include new block(s) */
        if (xlogctl->LogwrtRqst.Write < WriteRqst)
            xlogctl->LogwrtRqst.Write = WriteRqst;
        /* update local result copy while I have the chance */
        LogwrtResult = xlogctl->LogwrtResult;
        SpinLockRelease(&xlogctl->info_lck);
    }

    /* remember the end of the record just inserted */
    XactLastRecEnd = RecPtr;

    END_CRIT_SECTION();

    /* wake up walsenders now that we've released heavily contended locks */
    WalSndWakeupProcessRequests();

    return RecPtr;
}

bool XLogInsertAllowed ( void   ) 

Definition at line 6283 of file xlog.c.

References LocalXLogInsertAllowed, and RecoveryInProgress().

Referenced by XLogFlush(), and XLogInsert().

{
    bool        stillRecovering;

    /*
     * Report whether this process may currently insert new WAL records.
     *
     * A non-negative LocalXLogInsertAllowed is a settled answer
     * ("unconditionally true" or "unconditionally false") and is returned
     * as-is; this is the fast path once recovery is known to be over.
     */
    if (LocalXLogInsertAllowed >= 0)
        return (bool) LocalXLogInsertAllowed;

    /* No settled answer yet, so ask whether recovery is still running. */
    stillRecovering = RecoveryInProgress();

    if (!stillRecovering)
    {
        /*
         * Recovery has ended: latch the answer to "unconditionally true" so
         * later calls need not check again.
         */
        LocalXLogInsertAllowed = 1;
    }

    return !stillRecovering;
}

bool XLogNeedsFlush ( XLogRecPtr  RecPtr  ) 

Definition at line 2164 of file xlog.c.

References ControlFileLock, XLogwrtResult::Flush, XLogCtlData::info_lck, XLogCtlData::LogwrtResult, LW_SHARED, LWLockConditionalAcquire(), LWLockRelease(), ControlFileData::minRecoveryPoint, minRecoveryPoint, ControlFileData::minRecoveryPointTLI, minRecoveryPointTLI, RecoveryInProgress(), SpinLockAcquire, SpinLockRelease, and updateMinRecoveryPoint.

Referenced by BufferAlloc(), and SetHintBits().

{
    /*
     * Report whether WAL up to "record" still needs flushing.  Outside
     * recovery that means the flush pointer has not reached "record"; during
     * recovery WAL is not flushed, so the question becomes whether
     * minRecoveryPoint would have to be advanced.
     */
    if (!RecoveryInProgress())
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl;

        /* Cheap test against our local copy first */
        if (record <= LogwrtResult.Flush)
            return false;

        /* Local copy may be stale: refresh it from shared memory */
        xlogctl = XLogCtl;
        SpinLockAcquire(&xlogctl->info_lck);
        LogwrtResult = xlogctl->LogwrtResult;
        SpinLockRelease(&xlogctl->info_lck);

        /* ... and decide on the fresh value */
        return (record > LogwrtResult.Flush);
    }

    /*
     * In recovery.  Nothing to do if updates are switched off or our local
     * minRecoveryPoint already covers the record.
     */
    if (!updateMinRecoveryPoint || record <= minRecoveryPoint)
        return false;

    /*
     * Refresh the local minRecoveryPoint from the control file.  If the lock
     * cannot be had right away, answer conservatively instead of waiting.
     */
    if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
        return true;
    minRecoveryPoint = ControlFile->minRecoveryPoint;
    minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    LWLockRelease(ControlFileLock);

    /*
     * A zero (invalid) minRecoveryPoint means all WAL must be replayed, i.e.
     * crash recovery; the control file value is never changed in that case,
     * so stop checking from now on.
     */
    if (minRecoveryPoint == 0)
        updateMinRecoveryPoint = false;

    /* Same test as before, now against the refreshed values */
    return (updateMinRecoveryPoint && record > minRecoveryPoint);
}

void XLogPutNextOid ( Oid  nextOid  ) 

Definition at line 7555 of file xlog.c.

References XLogRecData::buffer, XLogRecData::data, XLogRecData::len, XLogRecData::next, XLOG_NEXTOID, and XLogInsert().

Referenced by GetNewObjectId().

{
    XLogRecData oidItem;

    /*
     * Emit an XLOG_NEXTOID record whose payload is the nextOid value.  The
     * record consists of a single chain item not tied to any buffer.
     */
    oidItem.buffer = InvalidBuffer;
    oidItem.next = NULL;
    oidItem.data = (char *) &nextOid;
    oidItem.len = sizeof(nextOid);

    (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &oidItem);

    /*
     * No immediate flush of the NEXTOID record is needed.  Any OID handed
     * out after it can only reach disk inside a tuple insert or update, and
     * that change has its own XLOG record following this one; the usual
     * buffer LSN interlock on those records therefore keeps such an OID off
     * disk until the NEXTOID record is there too.
     *
     * That reasoning covers only state "within" the database.  Using a
     * generated OID as a file or directory name bends the basic WAL rule,
     * since the filesystem change may hit disk ahead of the NEXTOID record.
     * After a crash right at that point, a restart could hand out the same
     * OID again and collide with the leftover file or directory.  Because
     * filename selection always loops until it finds a nonconflicting name,
     * this is harmless in practice.  See pgsql-hackers discussion
     * 27-Sep-2006.
     */
}

XLogRecPtr XLogRestorePoint ( const char *  rpName  ) 

Definition at line 7616 of file xlog.c.

References XLogRecData::buffer, XLogRecData::data, ereport, errmsg(), GetCurrentTimestamp(), XLogRecData::len, LOG, MAXFNAMELEN, XLogRecData::next, xl_restore_point::rp_name, xl_restore_point::rp_time, XLOG_RESTORE_POINT, and XLogInsert().

Referenced by pg_create_restore_point().

{
    /*
     * Write an XLOG_RESTORE_POINT record carrying the current timestamp and
     * the restore point name rpName, log the fact, and return the pointer
     * that XLogInsert() reports for the record.
     *
     * rpName is stored in a fixed-size field; names that do not fit in
     * MAXFNAMELEN - 1 characters are truncated.
     */
    XLogRecPtr  RecPtr;
    XLogRecData rdata;
    xl_restore_point xlrec;

    xlrec.rp_time = GetCurrentTimestamp();

    /*
     * strncpy zero-fills the remainder of the field, which keeps the bytes
     * written to WAL deterministic, but it does not terminate the string
     * when rpName has MAXFNAMELEN or more characters.  Force a terminator so
     * that anything later reading rp_name as a C string cannot run off the
     * end of the field.
     */
    strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
    xlrec.rp_name[MAXFNAMELEN - 1] = '\0';

    rdata.buffer = InvalidBuffer;
    rdata.data = (char *) &xlrec;
    rdata.len = sizeof(xl_restore_point);
    rdata.next = NULL;

    RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);

    /* Report the name as actually recorded (identical unless truncated) */
    ereport(LOG,
            (errmsg("restore point \"%s\" created at %X/%X",
                    xlrec.rp_name, (uint32) (RecPtr >> 32), (uint32) RecPtr)));

    return RecPtr;
}

XLogRecPtr XLogSaveBufferForHint ( Buffer  buffer  ) 

Definition at line 7664 of file xlog.c.

References Assert, XLogRecData::buffer, XLogRecData::buffer_std, BufferGetBlock, XLogRecData::data, PGXACT::delayChkpt, GetRedoRecPtr(), BkpBlock::hole_length, BkpBlock::hole_offset, XLogRecData::len, MyPgXact, XLogRecData::next, XLOG_HINT, XLogCheckBuffer(), and XLogInsert().

Referenced by MarkBufferDirtyHint().

{
    /*
     * If XLogCheckBuffer() says the given buffer's page must be backed up,
     * write an XLOG_HINT record containing a copy of the page (minus its
     * hole) and return the pointer reported by XLogInsert().  Otherwise
     * return InvalidXLogRecPtr without writing anything.
     */
    XLogRecData rdata[2];
    XLogRecData *hdrItem = &rdata[0];
    XLogRecData *imgItem = &rdata[1];
    BkpBlock    bkpb;
    XLogRecPtr  lsn;
    char        copied_buffer[BLCKSZ];
    char       *origdata;
    size_t      tailLen;

    /* The caller must already be holding off checkpoints. */
    Assert(MyPgXact->delayChkpt);

    /* Bring RedoRecPtr up to date so XLogCheckBuffer decides correctly. */
    GetRedoRecPtr();

    /*
     * XLogCheckBuffer only gets a phony chain item describing the buffer;
     * the same array is filled in for real below if a record is written.
     */
    hdrItem->buffer = buffer;
    hdrItem->buffer_std = true;

    /* Examine the buffer without holding an exclusive lock. */
    if (!XLogCheckBuffer(rdata, false, &lsn, &bkpb))
        return InvalidXLogRecPtr;

    /*
     * Work from a private copy so concurrent hint-bit or LSN changes cannot
     * affect what gets logged.  pd_lower/upper are assumed not to change
     * without an exclusive lock, so the contents of bkpb are not racy.  The
     * copy is packed: the bytes before the hole, then the bytes after it.
     */
    origdata = (char *) BufferGetBlock(buffer);
    tailLen = BLCKSZ - bkpb.hole_offset - bkpb.hole_length;

    memcpy(copied_buffer, origdata, bkpb.hole_offset);
    memcpy(copied_buffer + bkpb.hole_offset,
           origdata + bkpb.hole_offset + bkpb.hole_length,
           tailLen);

    /* First chain item: the backup block header. */
    hdrItem->buffer = InvalidBuffer;
    hdrItem->data = (char *) &bkpb;
    hdrItem->len = sizeof(BkpBlock);
    hdrItem->next = imgItem;

    /* Second chain item: the packed page copy. */
    imgItem->buffer = InvalidBuffer;
    imgItem->data = copied_buffer;
    imgItem->len = BLCKSZ - bkpb.hole_length;
    imgItem->next = NULL;

    return XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
}

void XLogSetAsyncXactLSN ( XLogRecPtr  record  ) 

Definition at line 1765 of file xlog.c.

References XLogCtlData::asyncXactLSN, XLogwrtResult::Flush, XLogCtlData::info_lck, XLogCtlData::LogwrtResult, ProcGlobal, SetLatch(), SpinLockAcquire, SpinLockRelease, PROC_HDR::walwriterLatch, and XLogCtlData::WalWriterSleeping.

Referenced by RecordTransactionAbort(), and RecordTransactionCommit().

{
    /*
     * Access shared state through a volatile pointer so the compiler cannot
     * move these loads and stores around the spinlock calls.
     */
    volatile XLogCtlData *shared = XLogCtl;
    XLogRecPtr  pageBoundary;
    bool        walwriterAsleep;

    /*
     * Under info_lck: refresh our local copy of LogwrtResult, note whether
     * the WAL writer is sleeping, and advance the shared asyncXactLSN if the
     * caller's value is newer.
     */
    SpinLockAcquire(&shared->info_lck);
    LogwrtResult = shared->LogwrtResult;
    walwriterAsleep = shared->WalWriterSleeping;
    if (shared->asyncXactLSN < asyncXactLSN)
        shared->asyncXactLSN = asyncXactLSN;
    SpinLockRelease(&shared->info_lck);

    /*
     * A sleeping WAL writer always gets woken so it leaves low-power mode.
     * An awake one is only nudged when at least one complete page of WAL
     * beyond the current flush point is available.
     */
    if (!walwriterAsleep)
    {
        /* round the request down to the last completed page boundary */
        pageBoundary = asyncXactLSN - (asyncXactLSN % XLOG_BLCKSZ);

        /* nothing to do if everything up to there is flushed already */
        if (pageBoundary <= LogwrtResult.Flush)
            return;
    }

    /*
     * Wake the WAL writer: either a full page is ready to be written, or it
     * must leave low-power mode so this async commit reaches disk within the
     * expected amount of time.
     */
    if (ProcGlobal->walwriterLatch)
        SetLatch(ProcGlobal->walwriterLatch);
}

void XLOGShmemInit ( void   ) 

Definition at line 3929 of file xlog.c.

References ALIGNOF_XLOG_BUFFER, Assert, XLogCtlInsert::currpage, XLogCtlData::info_lck, InitSharedLatch(), XLogCtlData::Insert, IsBootstrapProcessingMode, XLogCtlData::pages, ReadControlFile(), XLogCtlData::recoveryWakeupLatch, XLogCtlData::SharedHotStandbyActive, XLogCtlData::SharedRecoveryInProgress, ShmemInitStruct(), SpinLockInit, TYPEALIGN, XLogCtlData::ulsn_lck, XLogCtlData::WalWriterSleeping, XLogCtlData::xlblocks, XLOGbuffers, XLogCtlData::XLogCacheBlck, and XLOGShmemSize().

Referenced by CreateSharedMemoryAndSemaphores().

{
    bool        haveCFile;
    bool        haveXLog;
    char       *cursor;

    /* Look up (or create) the two shared structures. */
    ControlFile = (ControlFileData *)
        ShmemInitStruct("Control File", sizeof(ControlFileData), &haveCFile);
    XLogCtl = (XLogCtlData *)
        ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &haveXLog);

    /*
     * If either structure was reported as found there is nothing left to
     * initialize; the two are expected to be found together or not at all.
     */
    if (haveCFile || haveXLog)
    {
        Assert(haveCFile && haveXLog);
        return;
    }

    /* Start from an all-zero control struct. */
    memset(XLogCtl, 0, sizeof(XLogCtlData));

    /*
     * The xlblocks array sits immediately behind XLogCtlData.  No padding is
     * inserted: XLogCtlData contains XLogRecPtr fields, so its size should
     * already be a multiple of XLogRecPtr's alignment.
     */
    cursor = (char *) XLogCtl + sizeof(XLogCtlData);
    XLogCtl->xlblocks = (XLogRecPtr *) cursor;
    memset(cursor, 0, sizeof(XLogRecPtr) * XLOGbuffers);
    cursor += sizeof(XLogRecPtr) * XLOGbuffers;

    /*
     * The page buffers follow, starting at the next ALIGNOF_XLOG_BUFFER
     * boundary.
     */
    cursor = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, cursor);
    XLogCtl->pages = cursor;
    memset(cursor, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

    /*
     * Basic initialization of the shared XLogCtl fields; StartupXLOG fills
     * in the rest later.
     */
    XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
    XLogCtl->SharedRecoveryInProgress = true;
    XLogCtl->SharedHotStandbyActive = false;
    XLogCtl->WalWriterSleeping = false;
    XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
    SpinLockInit(&XLogCtl->info_lck);
    SpinLockInit(&XLogCtl->ulsn_lck);
    InitSharedLatch(&XLogCtl->recoveryWakeupLatch);

    /*
     * In bootstrap mode there is no pg_control to read yet.  Otherwise it
     * should already exist, so read and validate it right away (the reasons
     * are given in the comments in ReadControlFile()).
     */
    if (IsBootstrapProcessingMode())
        return;

    ReadControlFile();
}

Size XLOGShmemSize ( void   ) 

Definition at line 3891 of file xlog.c.

References add_size(), ALIGNOF_XLOG_BUFFER, Assert, buf, mul_size(), PGC_POSTMASTER, PGC_S_OVERRIDE, SetConfigOption(), snprintf(), XLOGbuffers, and XLOGChooseNumBuffers().

Referenced by CreateSharedMemoryAndSemaphores(), and XLOGShmemInit().

{
    Size        total;

    /*
     * wal_buffers = -1 means "auto-tune": replace it with the preferred
     * value now.  This is not the tidiest place for it, but it has to happen
     * after NBuffers is final and before XLOGbuffers is used for anything
     * that matters.
     */
    if (XLOGbuffers == -1)
    {
        char        numbuf[32];

        snprintf(numbuf, sizeof(numbuf), "%d", XLOGChooseNumBuffers());
        SetConfigOption("wal_buffers", numbuf, PGC_POSTMASTER, PGC_S_OVERRIDE);
    }
    Assert(XLOGbuffers > 0);

    /* the XLogCtl struct itself, followed by the xlblocks array */
    total = add_size(sizeof(XLogCtlData),
                     mul_size(sizeof(XLogRecPtr), XLOGbuffers));

    /* room to align the start of the XLOG I/O buffers */
    total = add_size(total, ALIGNOF_XLOG_BUFFER);

    /* the XLOG page buffers */
    total = add_size(total, mul_size(XLOG_BLCKSZ, XLOGbuffers));

    /*
     * ControlFileData is intentionally left out: it is covered by the "slop
     * factor" that CreateSharedMemoryAndSemaphores adds.  Leaving it out
     * means this routine can also be used to compute the actual allocation
     * size of the XLogCtl area.
     */
    return total;
}


Variable Documentation

int CheckPointSegments

Definition at line 73 of file xlog.c.

Referenced by IsCheckpointOnSchedule(), and XLogCheckpointNeeded().

CheckpointStatsData CheckpointStats

Definition at line 130 of file xlog.c.

Referenced by BufferSync(), CheckPointBuffers(), and mdsync().

bool fullPageWrites

Definition at line 80 of file xlog.c.

Referenced by BootStrapXLOG(), and UpdateFullPageWrites().

bool log_checkpoints

Definition at line 81 of file xlog.c.

Referenced by CreateCheckPoint(), CreateRestartPoint(), LogCheckpointEnd(), and mdsync().

int wal_keep_segments

Definition at line 74 of file xlog.c.

Referenced by KeepLogSeg().

int wal_level

Definition at line 83 of file xlog.c.

Referenced by BootStrapXLOG(), PostmasterMain(), and XLogReportParameters().

char* XLogArchiveCommand

Definition at line 78 of file xlog.c.

Referenced by pgarch_archiveXlog(), and show_archive_command().

bool XLogArchiveMode

Definition at line 77 of file xlog.c.

Referenced by PostmasterMain().

int XLogArchiveTimeout

Definition at line 76 of file xlog.c.

Referenced by CheckArchiveTimeout(), and CheckpointerMain().

int XLOGbuffers

Definition at line 75 of file xlog.c.

Referenced by check_wal_buffers(), XLOGShmemInit(), and XLOGShmemSize().