Header And Logo

PostgreSQL
| The world's most advanced open source database.

Data Structures | Defines | Typedefs | Enumerations | Functions | Variables

xlog.c File Reference

#include "postgres.h"
#include <ctype.h>
#include <time.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include "access/clog.h"
#include "access/multixact.h"
#include "access/subtrans.h"
#include "access/timeline.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/xlogreader.h"
#include "access/xlogutils.h"
#include "catalog/catversion.h"
#include "catalog/pg_control.h"
#include "catalog/pg_database.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/reinit.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/ps_status.h"
#include "utils/relmapper.h"
#include "utils/snapmgr.h"
#include "utils/timestamp.h"
#include "pg_trace.h"
Include dependency graph for xlog.c:

Go to the source code of this file.

Data Structures

struct  XLogwrtRqst
struct  XLogwrtResult
struct  XLogCtlInsert
struct  XLogCtlWrite
struct  XLogCtlData
struct  XLogPageReadPrivate

Defines

#define RECOVERY_COMMAND_FILE   "recovery.conf"
#define RECOVERY_COMMAND_DONE   "recovery.done"
#define PROMOTE_SIGNAL_FILE   "promote"
#define FAST_PROMOTE_SIGNAL_FILE   "fast_promote"
#define XLOGfileslop   (2*CheckPointSegments + 1)
#define INSERT_FREESPACE(Insert)   (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
#define INSERT_RECPTR(recptr, Insert, curridx)   (recptr) = XLogCtl->xlblocks[curridx] - INSERT_FREESPACE(Insert)
#define PrevBufIdx(idx)   (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
#define NextBufIdx(idx)   (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
#define RecoveryRequiresIntParameter(param_name, currValue, minValue)

Typedefs

typedef struct XLogwrtRqst XLogwrtRqst
typedef struct XLogwrtResult XLogwrtResult
typedef struct XLogCtlInsert XLogCtlInsert
typedef struct XLogCtlWrite XLogCtlWrite
typedef struct XLogCtlData XLogCtlData
typedef struct XLogPageReadPrivate XLogPageReadPrivate

Enumerations

enum  XLogSource { XLOG_FROM_ANY = 0, XLOG_FROM_ARCHIVE, XLOG_FROM_PG_XLOG, XLOG_FROM_STREAM }

Functions

static void readRecoveryCommandFile (void)
static void exitArchiveRecovery (TimeLineID endTLI, XLogSegNo endLogSegNo)
static bool recoveryStopsHere (XLogRecord *record, bool *includeThis)
static void recoveryPausesHere (void)
static void SetLatestXTime (TimestampTz xtime)
static void SetCurrentChunkStartTime (TimestampTz xtime)
static void CheckRequiredParameterValues (void)
static void XLogReportParameters (void)
static void checkTimeLineSwitch (XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
static void LocalSetXLogInsertAllowed (void)
static void CreateEndOfRecoveryRecord (void)
static void CheckPointGuts (XLogRecPtr checkPointRedo, int flags)
static void KeepLogSeg (XLogRecPtr recptr, XLogSegNo *logSegNo)
static bool XLogCheckBuffer (XLogRecData *rdata, bool holdsExclusiveLock, XLogRecPtr *lsn, BkpBlock *bkpb)
static Buffer RestoreBackupBlockContents (XLogRecPtr lsn, BkpBlock bkpb, char *blk, bool get_cleanup_lock, bool keep_buffer)
static bool AdvanceXLInsertBuffer (bool new_segment)
static bool XLogCheckpointNeeded (XLogSegNo new_segno)
static void XLogWrite (XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
static bool InstallXLogFileSegment (XLogSegNo *segno, char *tmppath, bool find_free, int *max_advance, bool use_lock)
static int XLogFileRead (XLogSegNo segno, int emode, TimeLineID tli, int source, bool notexistOk)
static int XLogFileReadAnyTLI (XLogSegNo segno, int emode, int source)
static int XLogPageRead (XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
static bool WaitForWALToBecomeAvailable (XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr)
static int emode_for_corrupt_record (int emode, XLogRecPtr RecPtr)
static void XLogFileClose (void)
static void PreallocXlogFiles (XLogRecPtr endptr)
static void RemoveOldXlogFiles (XLogSegNo segno, XLogRecPtr endptr)
static void UpdateLastRemovedPtr (char *filename)
static void ValidateXLOGDirectoryStructure (void)
static void CleanupBackupHistory (void)
static void UpdateMinRecoveryPoint (XLogRecPtr lsn, bool force)
static XLogRecord * ReadRecord (XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, bool fetching_ckpt)
static void CheckRecoveryConsistency (void)
static XLogRecord * ReadCheckpointRecord (XLogReaderState *xlogreader, XLogRecPtr RecPtr, int whichChkpti, bool report)
static bool rescanLatestTimeLine (void)
static void WriteControlFile (void)
static void ReadControlFile (void)
static char * str_time (pg_time_t tnow)
static bool CheckForStandbyTrigger (void)
static void pg_start_backup_callback (int code, Datum arg)
static bool read_backup_label (XLogRecPtr *checkPointLoc, bool *backupEndRequired, bool *backupFromStandby)
static void rm_redo_error_callback (void *arg)
static int get_sync_bit (int method)
XLogRecPtr XLogInsert (RmgrId rmid, uint8 info, XLogRecData *rdata)
void XLogSetAsyncXactLSN (XLogRecPtr asyncXactLSN)
void XLogFlush (XLogRecPtr record)
bool XLogBackgroundFlush (void)
bool XLogNeedsFlush (XLogRecPtr record)
int XLogFileInit (XLogSegNo logsegno, bool *use_existent, bool use_lock)
static void XLogFileCopy (XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
int XLogFileOpen (XLogSegNo segno)
void CheckXLogRemoved (XLogSegNo segno, TimeLineID tli)
Buffer RestoreBackupBlock (XLogRecPtr lsn, XLogRecord *record, int block_index, bool get_cleanup_lock, bool keep_buffer)
void UpdateControlFile (void)
uint64 GetSystemIdentifier (void)
bool DataChecksumsEnabled (void)
XLogRecPtr GetFakeLSNForUnloggedRel (void)
static int XLOGChooseNumBuffers (void)
bool check_wal_buffers (int *newval, void **extra, GucSource source)
Size XLOGShmemSize (void)
void XLOGShmemInit (void)
void BootStrapXLOG (void)
bool RecoveryIsPaused (void)
void SetRecoveryPause (bool recoveryPause)
TimestampTz GetLatestXTime (void)
TimestampTz GetCurrentChunkReplayStartTime (void)
void GetXLogReceiptTime (TimestampTz *rtime, bool *fromStream)
void StartupXLOG (void)
bool RecoveryInProgress (void)
bool HotStandbyActive (void)
bool XLogInsertAllowed (void)
void InitXLOGAccess (void)
XLogRecPtr GetRedoRecPtr (void)
XLogRecPtr GetInsertRecPtr (void)
XLogRecPtr GetFlushRecPtr (void)
pg_time_t GetLastSegSwitchTime (void)
void GetNextXidAndEpoch (TransactionId *xid, uint32 *epoch)
void ShutdownXLOG (int code, Datum arg)
static void LogCheckpointStart (int flags, bool restartpoint)
static void LogCheckpointEnd (bool restartpoint)
void CreateCheckPoint (int flags)
static void RecoveryRestartPoint (const CheckPoint *checkPoint)
bool CreateRestartPoint (int flags)
void XLogPutNextOid (Oid nextOid)
XLogRecPtr RequestXLogSwitch (void)
XLogRecPtr XLogRestorePoint (const char *rpName)
XLogRecPtr XLogSaveBufferForHint (Buffer buffer)
void UpdateFullPageWrites (void)
void xlog_redo (XLogRecPtr lsn, XLogRecord *record)
void assign_xlog_sync_method (int new_sync_method, void *extra)
void issue_xlog_fsync (int fd, XLogSegNo segno)
char * XLogFileNameP (TimeLineID tli, XLogSegNo segno)
XLogRecPtr do_pg_start_backup (const char *backupidstr, bool fast, TimeLineID *starttli_p, char **labelfile)
XLogRecPtr do_pg_stop_backup (char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
void do_pg_abort_backup (void)
XLogRecPtr GetXLogReplayRecPtr (TimeLineID *replayTLI)
XLogRecPtr GetXLogInsertRecPtr (void)
XLogRecPtr GetXLogWriteRecPtr (void)
void GetOldestRestartPoint (XLogRecPtr *oldrecptr, TimeLineID *oldtli)
bool BackupInProgress (void)
void CancelBackup (void)
bool CheckPromoteSignal (void)
void WakeupRecovery (void)
void SetWalWriterSleeping (bool sleeping)

Variables

uint32 bootstrap_data_checksum_version
int CheckPointSegments = 3
int wal_keep_segments = 0
int XLOGbuffers = -1
int XLogArchiveTimeout = 0
bool XLogArchiveMode = false
char * XLogArchiveCommand = NULL
bool EnableHotStandby = false
bool fullPageWrites = true
bool log_checkpoints = false
int sync_method = DEFAULT_SYNC_METHOD
int wal_level = WAL_LEVEL_MINIMAL
int CommitDelay = 0
int CommitSiblings = 5
struct config_enum_entry sync_method_options []
CheckpointStatsData CheckpointStats
TimeLineID ThisTimeLineID = 0
bool InRecovery = false
HotStandbyState standbyState = STANDBY_DISABLED
static XLogRecPtr LastRec
static XLogRecPtr receivedUpto = 0
static TimeLineID receiveTLI = 0
static bool lastFullPageWrites
static bool LocalRecoveryInProgress = true
static bool LocalHotStandbyActive = false
static int LocalXLogInsertAllowed = -1
bool ArchiveRecoveryRequested = false
bool InArchiveRecovery = false
static bool restoredFromArchive = false
char * recoveryRestoreCommand = NULL
static char * recoveryEndCommand = NULL
static char * archiveCleanupCommand = NULL
static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET
static bool recoveryTargetInclusive = true
static bool recoveryPauseAtTarget = true
static TransactionId recoveryTargetXid
static TimestampTz recoveryTargetTime
static char * recoveryTargetName
static bool StandbyModeRequested = false
static char * PrimaryConnInfo = NULL
static char * TriggerFile = NULL
bool StandbyMode = false
static bool fast_promote = false
static TransactionId recoveryStopXid
static TimestampTz recoveryStopTime
static char recoveryStopName [MAXFNAMELEN]
static bool recoveryStopAfter
static TimeLineID recoveryTargetTLI
static bool recoveryTargetIsLatest = false
static List * expectedTLEs
static TimeLineID curFileTLI
static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr
XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr
static XLogRecPtr RedoRecPtr
static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr
static XLogCtlData * XLogCtl = NULL
static ControlFileData * ControlFile = NULL
static XLogwrtResult LogwrtResult = {0, 0}
static const char * xlogSourceNames [] = { "any", "archive", "pg_xlog", "stream" }
static int openLogFile = -1
static XLogSegNo openLogSegNo = 0
static uint32 openLogOff = 0
static int readFile = -1
static XLogSegNo readSegNo = 0
static uint32 readOff = 0
static uint32 readLen = 0
static XLogSource readSource = 0
static XLogSource currentSource = 0
static bool lastSourceFailed = false
static TimestampTz XLogReceiptTime = 0
static XLogSource XLogReceiptSource = 0
static XLogRecPtr ReadRecPtr
static XLogRecPtr EndRecPtr
static XLogRecPtr minRecoveryPoint
static TimeLineID minRecoveryPointTLI
static bool updateMinRecoveryPoint = true
bool reachedConsistency = false
static bool InRedo = false
static bool bgwriterLaunched = false

Define Documentation

#define FAST_PROMOTE_SIGNAL_FILE   "fast_promote"

Definition at line 69 of file xlog.c.

Referenced by CheckForStandbyTrigger(), and CheckPromoteSignal().

#define INSERT_FREESPACE (   Insert  )     (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))

Definition at line 526 of file xlog.c.

Referenced by CreateCheckPoint(), StartupXLOG(), XLogFlush(), and XLogInsert().

#define INSERT_RECPTR (   recptr,
  Insert,
  curridx 
)    (recptr) = XLogCtl->xlblocks[curridx] - INSERT_FREESPACE(Insert)

Definition at line 530 of file xlog.c.

Referenced by CreateCheckPoint(), GetXLogInsertRecPtr(), and XLogInsert().

#define NextBufIdx (   idx  )     (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))

Definition at line 536 of file xlog.c.

Referenced by AdvanceXLInsertBuffer(), StartupXLOG(), and XLogWrite().

#define PrevBufIdx (   idx  )     (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))

Definition at line 533 of file xlog.c.

Referenced by XLogInsert().

#define PROMOTE_SIGNAL_FILE   "promote"

Definition at line 68 of file xlog.c.

Referenced by CheckForStandbyTrigger(), and CheckPromoteSignal().

#define RECOVERY_COMMAND_DONE   "recovery.done"

Definition at line 67 of file xlog.c.

Referenced by exitArchiveRecovery().

#define RECOVERY_COMMAND_FILE   "recovery.conf"

Definition at line 66 of file xlog.c.

Referenced by exitArchiveRecovery(), and readRecoveryCommandFile().

#define RecoveryRequiresIntParameter (   param_name,
  currValue,
  minValue 
)
Value:
do { \
    if ((currValue) < (minValue)) \
        ereport(ERROR, \
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                 errmsg("hot standby is not possible because " \
                        "%s = %d is a lower setting than on the master server " \
                        "(its value was %d)", \
                        param_name, \
                        currValue, \
                        minValue))); \
} while(0)

Definition at line 4787 of file xlog.c.

Referenced by CheckRequiredParameterValues().

#define XLOGfileslop   (2*CheckPointSegments + 1)

Definition at line 102 of file xlog.c.


Typedef Documentation

typedef struct XLogCtlData XLogCtlData
typedef struct XLogCtlInsert XLogCtlInsert
typedef struct XLogCtlWrite XLogCtlWrite
typedef struct XLogwrtResult XLogwrtResult
typedef struct XLogwrtRqst XLogwrtRqst

Enumeration Type Documentation

enum XLogSource
Enumerator:
XLOG_FROM_ANY 
XLOG_FROM_ARCHIVE 
XLOG_FROM_PG_XLOG 
XLOG_FROM_STREAM 

Definition at line 549 of file xlog.c.

{
    /* Possible sources from which WAL data can be read. */
    XLOG_FROM_ANY = 0,      /* request to read WAL from any source */
    XLOG_FROM_ARCHIVE,      /* restored using restore_command */
    XLOG_FROM_PG_XLOG,      /* existing file in pg_xlog */
    XLOG_FROM_STREAM,       /* streamed from master */
} XLogSource;


Function Documentation

static bool AdvanceXLInsertBuffer ( bool  new_segment  )  [static]

Definition at line 1320 of file xlog.c.

References XLogCtlInsert::curridx, XLogCtlInsert::currpage, XLogCtlInsert::currpos, XLogwrtRqst::Flush, XLogCtlInsert::forcePageWrites, XLogCtlData::info_lck, XLogCtlData::Insert, Insert(), XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MemSet, NextBufIdx, XLogCtlData::pages, SpinLockAcquire, SpinLockRelease, ControlFileData::system_identifier, ThisTimeLineID, WALWriteLock, XLogwrtRqst::Write, XLogwrtResult::Write, XLogCtlData::xlblocks, XLogSegSize, XLogWrite(), XLogLongPageHeaderData::xlp_seg_size, XLogLongPageHeaderData::xlp_sysid, and XLogLongPageHeaderData::xlp_xlog_blcksz.

Referenced by CreateCheckPoint(), and XLogInsert().

{
    /*
     * Advance the WAL insert position to the next buffer slot
     * (NextBufIdx of Insert->curridx) and set that slot up as a fresh,
     * zeroed page with its page header filled in.  If the slot's previous
     * contents have not been written out yet (LogwrtResult.Write is behind
     * the slot's xlblocks[] value), they are written with XLogWrite() while
     * holding WALWriteLock before the slot is reused.
     *
     * new_segment: when true, the new page's start address is rounded up to
     * the next multiple of XLogSegSize.
     *
     * Returns true, except when this call already pushed the shared
     * LogwrtRqst.Write request forward under info_lck; in that case it
     * returns false.
     */
    XLogCtlInsert *Insert = &XLogCtl->Insert;
    int         nextidx = NextBufIdx(Insert->curridx);
    bool        update_needed = true;
    XLogRecPtr  OldPageRqstPtr;
    XLogwrtRqst WriteRqst;
    XLogRecPtr  NewPageEndPtr;
    XLogRecPtr  NewPageBeginPtr;
    XLogPageHeader NewPage;

    /*
     * Get ending-offset of the buffer page we need to replace (this may be
     * zero if the buffer hasn't been used yet).  Fall through if it's already
     * written out.
     */
    OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
    if (LogwrtResult.Write < OldPageRqstPtr)
    {
        /* nope, got work to do... */
        XLogRecPtr  FinishedPageRqstPtr;

        /* end address of the page we are about to leave */
        FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];

        /* Before waiting, get info_lck and update LogwrtResult */
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile XLogCtlData *xlogctl = XLogCtl;

            SpinLockAcquire(&xlogctl->info_lck);
            /* only ever move the shared write request forward */
            if (xlogctl->LogwrtRqst.Write < FinishedPageRqstPtr)
                xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
            /* refresh our local copy of the shared write/flush result */
            LogwrtResult = xlogctl->LogwrtResult;
            SpinLockRelease(&xlogctl->info_lck);
        }

        update_needed = false;  /* Did the shared-request update */

        /*
         * Now that we have an up-to-date LogwrtResult value, see if we still
         * need to write it or if someone else already did.
         */
        if (LogwrtResult.Write < OldPageRqstPtr)
        {
            /* Must acquire write lock */
            LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
            /* re-read the shared result now that we hold WALWriteLock */
            LogwrtResult = XLogCtl->LogwrtResult;
            if (LogwrtResult.Write >= OldPageRqstPtr)
            {
                /* OK, someone wrote it already */
                LWLockRelease(WALWriteLock);
            }
            else
            {
                /*
                 * Have to write buffers while holding insert lock. This is
                 * not good, so only write as much as we absolutely must.
                 */
                TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
                /* request a write up to the end of the old page; Flush is left at zero */
                WriteRqst.Write = OldPageRqstPtr;
                WriteRqst.Flush = 0;
                XLogWrite(WriteRqst, false, false);
                LWLockRelease(WALWriteLock);
                TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
            }
        }
    }

    /*
     * Now the next buffer slot is free and we can set it up to be the next
     * output page.
     */
    /* the new page begins where the current page ends */
    NewPageBeginPtr = XLogCtl->xlblocks[Insert->curridx];

    if (new_segment)
    {
        /* force it to a segment start point */
        if (NewPageBeginPtr % XLogSegSize != 0)
            NewPageBeginPtr += XLogSegSize - NewPageBeginPtr % XLogSegSize;
    }

    /* record the new page's end address for the slot and locate its memory */
    NewPageEndPtr = NewPageBeginPtr;
    NewPageEndPtr += XLOG_BLCKSZ;
    XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
    NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);

    Insert->curridx = nextidx;
    Insert->currpage = NewPage;

    /* insertion starts just past a short page header; adjusted below for a long one */
    Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;

    /*
     * Be sure to re-zero the buffer so that bytes beyond what we've written
     * will look like zeroes and not valid XLOG records...
     */
    MemSet((char *) NewPage, 0, XLOG_BLCKSZ);

    /*
     * Fill the new page's header
     */
    NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;

    /* NewPage->xlp_info = 0; */    /* done by memset */
    NewPage   ->xlp_tli = ThisTimeLineID;
    NewPage   ->xlp_pageaddr = NewPageBeginPtr;

    /*
     * If online backup is not in progress, mark the header to indicate that
     * WAL records beginning in this page have removable backup blocks.  This
     * allows the WAL archiver to know whether it is safe to compress archived
     * WAL data by transforming full-block records into the non-full-block
     * format.  It is sufficient to record this at the page level because we
     * force a page switch (in fact a segment switch) when starting a backup,
     * so the flag will be off before any records can be written during the
     * backup.  At the end of a backup, the last page will be marked as all
     * unsafe when perhaps only part is unsafe, but at worst the archiver
     * would miss the opportunity to compress a few records.
     */
    if (!Insert->forcePageWrites)
        NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;

    /*
     * If first page of an XLOG segment file, make it a long header.
     */
    if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
    {
        XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;

        NewLongPage->xlp_sysid = ControlFile->system_identifier;
        NewLongPage->xlp_seg_size = XLogSegSize;
        NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
        NewPage   ->xlp_info |= XLP_LONG_HEADER;

        /* a long header is in use, so insertion starts past it instead */
        Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
    }

    return update_needed;
}

void assign_xlog_sync_method ( int  new_sync_method,
void *  extra 
)

Definition at line 8291 of file xlog.c.

References ereport, errcode_for_file_access(), errmsg(), get_sync_bit(), openLogFile, openLogSegNo, PANIC, pg_fsync(), sync_method, ThisTimeLineID, XLogFileClose(), and XLogFileNameP().

{
    /*
     * Assign hook for the sync method setting.  Only acts when the method
     * actually changes and a log segment is currently open.
     */

    /* Same method as before: nothing to do. */
    if (sync_method == new_sync_method)
        return;

    /* No log segment is open, so there is nothing to sync or reopen. */
    if (openLogFile < 0)
        return;

    /*
     * Force an fsync on the open segment so that no blocks escape unsynced
     * across the method change.
     */
    if (pg_fsync(openLogFile) != 0)
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not fsync log segment %s: %m",
                        XLogFileNameP(ThisTimeLineID, openLogSegNo))));

    /*
     * When the open flag bit differs between the old and new methods, close
     * the file so that it is reopened with the new flag bit at next use.
     */
    if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
        XLogFileClose();
}

bool BackupInProgress ( void   ) 

Definition at line 9284 of file xlog.c.

References BACKUP_LABEL_FILE.

Referenced by pg_is_in_backup(), and PostmasterStateMachine().

{
    /*
     * Report whether the backup label file can be stat()ed; returns true
     * exactly when stat() on BACKUP_LABEL_FILE succeeds.
     */
    struct stat label_info;
    int         stat_rc;

    stat_rc = stat(BACKUP_LABEL_FILE, &label_info);
    if (stat_rc == 0)
        return true;

    return false;
}

void BootStrapXLOG ( void   ) 

Definition at line 3993 of file xlog.c.

References ALIGNOF_XLOG_BUFFER, bootstrap_data_checksum_version, BootStrapCLOG(), BootStrapMultiXact(), BootStrapSUBTRANS(), ControlFileData::checkPoint, ControlFileData::checkPointCopy, close, COMP_CRC32, ControlFileData::data_checksum_version, ereport, errcode_for_file_access(), errmsg(), FIN_CRC32, fullPageWrites, CheckPoint::fullPageWrites, gettimeofday(), INIT_CRC32, max_locks_per_xact, ControlFileData::max_locks_per_xact, max_prepared_xacts, ControlFileData::max_prepared_xacts, MaxConnections, ControlFileData::MaxConnections, MultiXactSetNextMXact(), CheckPoint::nextMulti, CheckPoint::nextMultiOffset, VariableCacheData::nextOid, CheckPoint::nextOid, VariableCacheData::nextXid, CheckPoint::nextXid, CheckPoint::nextXidEpoch, NULL, offsetof, VariableCacheData::oidCount, CheckPoint::oldestActiveXid, CheckPoint::oldestMulti, CheckPoint::oldestMultiDB, CheckPoint::oldestXid, CheckPoint::oldestXidDB, openLogFile, palloc(), PANIC, pfree(), pg_fsync(), CheckPoint::PrevTimeLineID, CheckPoint::redo, SetMultiXactIdLimit(), SetTransactionIdLimit(), ShmemVariableCache, SizeOfXLogRecord, ControlFileData::state, ControlFileData::system_identifier, CheckPoint::ThisTimeLineID, ThisTimeLineID, ControlFileData::time, CheckPoint::time, TYPEALIGN, ControlFileData::unloggedLSN, wal_level, ControlFileData::wal_level, write, WriteControlFile(), XLogRecord::xl_info, XLogRecord::xl_len, XLogRecord::xl_prev, XLogRecord::xl_rmid, XLogRecord::xl_tot_len, XLogRecord::xl_xid, XLogFileInit(), XLogRecGetData, XLogSegSize, XLogPageHeaderData::xlp_info, XLogPageHeaderData::xlp_magic, XLogPageHeaderData::xlp_pageaddr, XLogLongPageHeaderData::xlp_seg_size, XLogLongPageHeaderData::xlp_sysid, XLogPageHeaderData::xlp_tli, and XLogLongPageHeaderData::xlp_xlog_blcksz.

Referenced by AuxiliaryProcessMain().

{
    /*
     * Bootstrap the WAL and pg_control for a new installation:
     *  - choose a system identifier from the current time of day,
     *  - build the first WAL page (long header plus an initial shutdown
     *    checkpoint record) and write, fsync and close it as segment 1,
     *  - fill in ControlFile and write it out with WriteControlFile(),
     *  - bootstrap the commit log, subtrans and multixact.
     *
     * Any failure to write, fsync or close the bootstrap log file is
     * reported at PANIC level.
     */
    CheckPoint  checkPoint;
    char       *buffer;
    XLogPageHeader page;
    XLogLongPageHeader longpage;
    XLogRecord *record;
    bool        use_existent;
    uint64      sysidentifier;
    struct timeval tv;
    pg_crc32    crc;

    /*
     * Select a hopefully-unique system identifier code for this installation.
     * We use the result of gettimeofday(), including the fractional seconds
     * field, as being about as unique as we can easily get.  (Think not to
     * use random(), since it hasn't been seeded and there's no portable way
     * to seed it other than the system clock value...)  The upper half of the
     * uint64 value is just the tv_sec part, while the lower half is the XOR
     * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
     * unnecessarily if "uint64" is really only 32 bits wide.  A person
     * knowing this encoding can determine the initialization time of the
     * installation, which could perhaps be useful sometimes.
     *
     * Note: the low half must be combined with XOR, as described above.  A
     * bitwise OR would bias the bits toward 1, discarding entropy and making
     * tv_usec unrecoverable from the identifier.
     */
    gettimeofday(&tv, NULL);
    sysidentifier = ((uint64) tv.tv_sec) << 32;
    sysidentifier |= (uint32) (tv.tv_sec ^ tv.tv_usec);

    /* First timeline ID is always 1 */
    ThisTimeLineID = 1;

    /*
     * page buffer must be aligned suitably for O_DIRECT; over-allocate by
     * the alignment and use the aligned address inside the allocation
     */
    buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
    page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
    memset(page, 0, XLOG_BLCKSZ);

    /*
     * Set up information for the initial checkpoint record
     *
     * The initial checkpoint record is written to the beginning of the WAL
     * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
     * used, so that we can use 0/0 to mean "before any valid WAL segment".
     */
    checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
    checkPoint.ThisTimeLineID = ThisTimeLineID;
    checkPoint.PrevTimeLineID = ThisTimeLineID;
    checkPoint.fullPageWrites = fullPageWrites;
    checkPoint.nextXidEpoch = 0;
    checkPoint.nextXid = FirstNormalTransactionId;
    checkPoint.nextOid = FirstBootstrapObjectId;
    checkPoint.nextMulti = FirstMultiXactId;
    checkPoint.nextMultiOffset = 0;
    checkPoint.oldestXid = FirstNormalTransactionId;
    checkPoint.oldestXidDB = TemplateDbOid;
    checkPoint.oldestMulti = FirstMultiXactId;
    checkPoint.oldestMultiDB = TemplateDbOid;
    checkPoint.time = (pg_time_t) time(NULL);
    checkPoint.oldestActiveXid = InvalidTransactionId;

    /* Seed the shared counters and limits from the initial checkpoint values */
    ShmemVariableCache->nextXid = checkPoint.nextXid;
    ShmemVariableCache->nextOid = checkPoint.nextOid;
    ShmemVariableCache->oidCount = 0;
    MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);

    /* Set up the XLOG page header (long form: first page of a segment) */
    page->xlp_magic = XLOG_PAGE_MAGIC;
    page->xlp_info = XLP_LONG_HEADER;
    page->xlp_tli = ThisTimeLineID;
    page->xlp_pageaddr = XLogSegSize;
    longpage = (XLogLongPageHeader) page;
    longpage->xlp_sysid = sysidentifier;
    longpage->xlp_seg_size = XLogSegSize;
    longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;

    /* Insert the initial checkpoint record right after the long page header */
    record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
    record->xl_prev = 0;
    record->xl_xid = InvalidTransactionId;
    record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
    record->xl_len = sizeof(checkPoint);
    record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
    record->xl_rmid = RM_XLOG_ID;
    memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));

    /*
     * The CRC covers the checkpoint payload first, then the record header
     * up to (but not including) the xl_crc field itself.
     */
    INIT_CRC32(crc);
    COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
    COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
    FIN_CRC32(crc);
    record->xl_crc = crc;

    /* Create first XLOG segment file */
    use_existent = false;
    openLogFile = XLogFileInit(1, &use_existent, false);

    /* Write the first page with the initial record */
    errno = 0;
    if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    {
        /* if write didn't set errno, assume problem is no disk space */
        if (errno == 0)
            errno = ENOSPC;
        ereport(PANIC,
                (errcode_for_file_access(),
              errmsg("could not write bootstrap transaction log file: %m")));
    }

    if (pg_fsync(openLogFile) != 0)
        ereport(PANIC,
                (errcode_for_file_access(),
              errmsg("could not fsync bootstrap transaction log file: %m")));

    if (close(openLogFile))
        ereport(PANIC,
                (errcode_for_file_access(),
              errmsg("could not close bootstrap transaction log file: %m")));

    /* mark the log file as no longer open */
    openLogFile = -1;

    /* Now create pg_control */

    memset(ControlFile, 0, sizeof(ControlFileData));
    /* Initialize pg_control status fields */
    ControlFile->system_identifier = sysidentifier;
    ControlFile->state = DB_SHUTDOWNED;
    ControlFile->time = checkPoint.time;
    ControlFile->checkPoint = checkPoint.redo;
    ControlFile->checkPointCopy = checkPoint;
    ControlFile->unloggedLSN = 1;

    /* Set important parameter values for use when replaying WAL */
    ControlFile->MaxConnections = MaxConnections;
    ControlFile->max_prepared_xacts = max_prepared_xacts;
    ControlFile->max_locks_per_xact = max_locks_per_xact;
    ControlFile->wal_level = wal_level;
    ControlFile->data_checksum_version = bootstrap_data_checksum_version;

    /* some additional ControlFile fields are set in WriteControlFile() */

    WriteControlFile();

    /* Bootstrap the commit log, too */
    BootStrapCLOG();
    BootStrapSUBTRANS();
    BootStrapMultiXact();

    /* release the over-allocated page buffer (page points inside it) */
    pfree(buffer);
}

void CancelBackup ( void   ) 

Definition at line 9299 of file xlog.c.

References BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, ereport, errcode_for_file_access(), errdetail(), errmsg(), LOG, unlink(), and WARNING.

Referenced by PostmasterStateMachine().

{
    /*
     * Cancel online backup mode by renaming the backup label file to its
     * "old" name.  Success is logged at LOG level; a failed rename is
     * reported as a WARNING.  Does nothing if the label file cannot be
     * stat()ed.
     */
    struct stat label_info;

    /* No backup label file: nothing to cancel. */
    if (stat(BACKUP_LABEL_FILE, &label_info) < 0)
        return;

    /* Clear out any leftover file from a previously canceled backup. */
    unlink(BACKUP_LABEL_OLD);

    if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
    {
        /* The rename failed; backup mode stays in effect. */
        ereport(WARNING,
                (errcode_for_file_access(),
                 errmsg("online backup mode was not canceled"),
                 errdetail("Could not rename \"%s\" to \"%s\": %m.",
                           BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
        return;
    }

    ereport(LOG,
            (errmsg("online backup mode canceled"),
             errdetail("\"%s\" was renamed to \"%s\".",
                       BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
}

bool check_wal_buffers ( int *  newval,
void **  extra,
GucSource  source 
)

Definition at line 3856 of file xlog.c.

References XLOGbuffers, and XLOGChooseNumBuffers().

{
    /*
     * Check hook for the WAL buffers setting.  Always accepts the value
     * (returns true), possibly after adjusting *newval in place.
     */
    bool        autotune_requested = (*newval == -1);

    if (autotune_requested)
    {
        /*
         * While XLOGbuffers still holds the boot_val default of -1, leave
         * the request alone; it gets fixed when XLOGShmemSize is called.
         */
        if (XLOGbuffers == -1)
            return true;

        /* Otherwise replace the request with the auto-tuned value. */
        *newval = XLOGChooseNumBuffers();
    }

    /*
     * Values below 4 blocks are silently raised to 4.  Before PostgreSQL
     * 9.1 guc.c enforced that minimum itself; since it no longer does, such
     * values are treated as a request for the minimum rather than as an
     * error.
     */
    if (*newval < 4)
    {
        *newval = 4;
    }

    return true;
}

static bool CheckForStandbyTrigger ( void   )  [static]

Definition at line 9880 of file xlog.c.

References ereport, errmsg(), fast_promote, FAST_PROMOTE_SIGNAL_FILE, IsPromoteTriggered(), LOG, NULL, PROMOTE_SIGNAL_FILE, ResetPromoteTriggered(), TriggerFile, and unlink().

Referenced by ReadRecord(), and WaitForWALToBecomeAvailable().

{
    /*
     * Check whether promotion has been requested, either through the
     * promote signal files or through the configured trigger file.  Once a
     * trigger has been seen, every later call returns true immediately.
     * Also sets fast_promote according to the kind of request found.
     */
    static bool triggered = false;
    struct stat file_info;

    /* A trigger seen on an earlier call stays in effect. */
    if (triggered)
        return true;

    if (IsPromoteTriggered())
    {
        /*
         * In 9.1 and 9.2 the postmaster removed the promote file from
         * within its signal handler.  The file is now left in place for
         * the Startup process to unlink, which lets Startup tell a fast
         * promotion from a normal one.  Fast promotion takes precedence.
         */
        bool        fast_file_present;

        fast_file_present = (stat(FAST_PROMOTE_SIGNAL_FILE, &file_info) == 0);
        if (fast_file_present)
        {
            unlink(FAST_PROMOTE_SIGNAL_FILE);
            unlink(PROMOTE_SIGNAL_FILE);
            fast_promote = true;
        }
        else if (stat(PROMOTE_SIGNAL_FILE, &file_info) == 0)
        {
            unlink(PROMOTE_SIGNAL_FILE);
            fast_promote = false;
        }

        ereport(LOG, (errmsg("received promote request")));

        ResetPromoteTriggered();
        triggered = true;
        return true;
    }

    /* Fall back to the trigger file, if one is configured and present. */
    if (TriggerFile != NULL && stat(TriggerFile, &file_info) == 0)
    {
        ereport(LOG,
                (errmsg("trigger file found: %s", TriggerFile)));
        unlink(TriggerFile);
        triggered = true;
        fast_promote = true;
        return true;
    }

    return false;
}

static void CheckPointGuts ( XLogRecPtr  checkPointRedo,
int  flags 
) [static]

Definition at line 7237 of file xlog.c.

References CheckPointBuffers(), CheckPointCLOG(), CheckPointMultiXact(), CheckPointPredicate(), CheckPointRelationMap(), CheckPointSUBTRANS(), and CheckPointTwoPhase().

Referenced by CreateCheckPoint(), and CreateRestartPoint().

{
    /*
     * Run the per-subsystem checkpoint routines, one call each.  'flags' is
     * forwarded only to CheckPointBuffers() and 'checkPointRedo' only to
     * CheckPointTwoPhase(); the other routines take no arguments.
     *
     * The call order should not be changed casually: as the comments below
     * note, CheckPointBuffers() is the step that performs the fsyncs, and
     * the two-phase-commit step is intentionally kept last.
     */
    CheckPointCLOG();
    CheckPointSUBTRANS();
    CheckPointMultiXact();
    CheckPointPredicate();
    CheckPointRelationMap();
    CheckPointBuffers(flags);   /* performs all required fsyncs */
    /* We deliberately delay 2PC checkpointing as long as possible */
    CheckPointTwoPhase(checkPointRedo);
}

bool CheckPromoteSignal ( void   ) 

Definition at line 9936 of file xlog.c.

References FAST_PROMOTE_SIGNAL_FILE, and PROMOTE_SIGNAL_FILE.

Referenced by sigusr1_handler().

{
    struct stat st;
    bool        found;

    /*
     * Report whether either promote signal file currently exists.  The
     * files are only probed here, never removed.  The fast-promote file is
     * probed only when the plain promote file is absent.
     */
    found = (stat(PROMOTE_SIGNAL_FILE, &st) == 0);
    if (!found)
        found = (stat(FAST_PROMOTE_SIGNAL_FILE, &st) == 0);

    return found;
}

static void CheckRecoveryConsistency ( void   )  [static]

Definition at line 6117 of file xlog.c.

References ControlFileData::backupEndPoint, ControlFileData::backupEndRequired, ControlFileData::backupStartPoint, ControlFileLock, DEBUG1, elog, EndRecPtr, ereport, errmsg(), XLogCtlData::info_lck, IsUnderPostmaster, XLogCtlData::lastReplayedEndRecPtr, LocalHotStandbyActive, LOG, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), ControlFileData::minRecoveryPoint, minRecoveryPoint, PMSIGNAL_BEGIN_HOT_STANDBY, reachedConsistency, SendPostmasterSignal(), XLogCtlData::SharedHotStandbyActive, SpinLockAcquire, SpinLockRelease, STANDBY_SNAPSHOT_READY, standbyState, UpdateControlFile(), XLogCheckInvalidPages(), and XLogRecPtrIsInvalid.

Referenced by ReadRecord(), and StartupXLOG().

{
    /*
     * Decide whether replay has reached a consistent state and, if so,
     * whether hot standby can be switched on.  Three stages run in order:
     *
     * 1. Once replay (EndRecPtr) has reached ControlFile->backupEndPoint,
     *    clear the backup start/end fields in pg_control and make sure the
     *    stored minRecoveryPoint is not behind EndRecPtr.
     * 2. Set reachedConsistency once replay has passed minRecoveryPoint and
     *    no backup end is still awaited.
     * 3. When a standby snapshot is ready, set SharedHotStandbyActive in
     *    shared memory and send PMSIGNAL_BEGIN_HOT_STANDBY to the postmaster.
     *
     * Stages 2 and 3 are guarded by reachedConsistency and
     * LocalHotStandbyActive respectively, so within this function each fires
     * only while its flag is still false.
     */

    /*
     * During crash recovery, we don't reach a consistent state until we've
     * replayed all the WAL.
     */
    if (XLogRecPtrIsInvalid(minRecoveryPoint))
        return;

    /*
     * Have we reached the point where our base backup was completed?
     */
    if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
        ControlFile->backupEndPoint <= EndRecPtr)
    {
        /*
         * We have reached the end of base backup, as indicated by pg_control.
         * The data on disk is now consistent. Reset backupStartPoint and
         * backupEndPoint, and update minRecoveryPoint to make sure we don't
         * allow starting up at an earlier point even if recovery is stopped
         * and restarted soon after this.
         */
        elog(DEBUG1, "end of backup reached");

        /* All pg_control changes below happen under ControlFileLock. */
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

        /* Only ever move the stored minRecoveryPoint forward. */
        if (ControlFile->minRecoveryPoint < EndRecPtr)
            ControlFile->minRecoveryPoint = EndRecPtr;

        ControlFile->backupStartPoint = InvalidXLogRecPtr;
        ControlFile->backupEndPoint = InvalidXLogRecPtr;
        ControlFile->backupEndRequired = false;
        UpdateControlFile();

        LWLockRelease(ControlFileLock);
    }

    /*
     * Have we passed our safe starting point? Note that minRecoveryPoint
     * is known to be incorrectly set if ControlFile->backupEndRequired,
     * until the XLOG_BACKUP_RECORD arrives to advise us of the correct
     * minRecoveryPoint. All we know prior to that is that we're not
     * consistent yet.
     */
    if (!reachedConsistency && !ControlFile->backupEndRequired &&
        minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr &&
        XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
    {
        /*
         * Check to see if the XLOG sequence contained any unresolved
         * references to uninitialized pages.
         */
        XLogCheckInvalidPages();

        reachedConsistency = true;
        /* The LSN is logged as two 32-bit halves, high word first. */
        ereport(LOG,
                (errmsg("consistent recovery state reached at %X/%X",
                        (uint32) (XLogCtl->lastReplayedEndRecPtr >> 32),
                        (uint32) XLogCtl->lastReplayedEndRecPtr)));
    }

    /*
     * Have we got a valid starting snapshot that will allow queries to be
     * run? If so, we can tell postmaster that the database is consistent now,
     * enabling connections.
     */
    if (standbyState == STANDBY_SNAPSHOT_READY &&
        !LocalHotStandbyActive &&
        reachedConsistency &&
        IsUnderPostmaster)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        /* The shared flag is written under the info_lck spinlock. */
        SpinLockAcquire(&xlogctl->info_lck);
        xlogctl->SharedHotStandbyActive = true;
        SpinLockRelease(&xlogctl->info_lck);

        /* Process-local flag; it also keeps this branch from repeating. */
        LocalHotStandbyActive = true;

        SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
    }
}

static void CheckRequiredParameterValues ( void   )  [static]

Definition at line 4805 of file xlog.c.

References EnableHotStandby, ereport, errhint(), errmsg(), ERROR, InArchiveRecovery, ControlFileData::max_locks_per_xact, max_locks_per_xact, ControlFileData::max_prepared_xacts, max_prepared_xacts, ControlFileData::MaxConnections, MaxConnections, RecoveryRequiresIntParameter, ControlFileData::wal_level, WAL_LEVEL_HOT_STANDBY, WAL_LEVEL_MINIMAL, and WARNING.

Referenced by StartupXLOG(), and xlog_redo().

{
    /* Every check in this function applies to archive recovery only. */
    if (!InArchiveRecovery)
        return;

    /*
     * Archive recovery needs WAL generated with at least 'archive'
     * wal_level.  This is reported as a WARNING only, so processing
     * continues.
     */
    if (ControlFile->wal_level == WAL_LEVEL_MINIMAL)
        ereport(WARNING,
                (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
                 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));

    /* The remaining requirements matter only when Hot Standby is enabled. */
    if (!EnableHotStandby)
        return;

    /*
     * Hot Standby needs WAL generated with 'hot_standby' mode, and at least
     * as many backend slots here as on the primary.
     */
    if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
        ereport(ERROR,
                (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
                 errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));

    /* We ignore autovacuum_max_workers when we make this test. */
    RecoveryRequiresIntParameter("max_connections",
                                 MaxConnections,
                                 ControlFile->MaxConnections);
    RecoveryRequiresIntParameter("max_prepared_transactions",
                                 max_prepared_xacts,
                                 ControlFile->max_prepared_xacts);
    RecoveryRequiresIntParameter("max_locks_per_transaction",
                                 max_locks_per_xact,
                                 ControlFile->max_locks_per_xact);
}

static void checkTimeLineSwitch ( XLogRecPtr  lsn,
TimeLineID  newTLI,
TimeLineID  prevTLI 
) [static]

Definition at line 7843 of file xlog.c.

References ereport, errmsg(), minRecoveryPoint, minRecoveryPointTLI, PANIC, ThisTimeLineID, tliInHistory(), and XLogRecPtrIsInvalid.

Referenced by StartupXLOG().

{
    /*
     * Sanity-check a timeline switch announced by a checkpoint record at
     * 'lsn', going from 'prevTLI' to 'newTLI'.  Every failed check raises
     * PANIC; if all pass, the function simply returns.
     */

    /* The record must agree with us on what the current (old) timeline is. */
    if (prevTLI != ThisTimeLineID)
    {
        ereport(PANIC,
                (errmsg("unexpected prev timeline ID %u (current timeline ID %u) in checkpoint record",
                        prevTLI, ThisTimeLineID)));
    }

    /*
     * The new timeline must not be lower than the current one, and it must
     * appear in the timeline history we expect to follow.
     */
    if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
    {
        ereport(PANIC,
                (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
                        newTLI, ThisTimeLineID)));
    }

    /*
     * Switching to a timeline greater than the minimum recovery point's
     * timeline before that point has been reached is fatal: after the
     * switch we could never visit the min recovery point on the correct
     * timeline.  This can happen if there is a newer timeline in the
     * archive that branched before the timeline the min recovery point is
     * on, and PITR to the new timeline is attempted.
     */
    if (!XLogRecPtrIsInvalid(minRecoveryPoint))
    {
        if (lsn < minRecoveryPoint && newTLI > minRecoveryPointTLI)
        {
            ereport(PANIC,
                    (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
                            newTLI,
                            (uint32) (minRecoveryPoint >> 32),
                            (uint32) minRecoveryPoint,
                            minRecoveryPointTLI)));
        }
    }

    /* All checks passed. */
}

void CheckXLogRemoved ( XLogSegNo  segno,
TimeLineID  tli 
)

Definition at line 2840 of file xlog.c.

References ereport, errcode_for_file_access(), errmsg(), ERROR, filename, XLogCtlData::info_lck, XLogCtlData::lastRemovedSegNo, SpinLockAcquire, SpinLockRelease, and XLogFileName.

Referenced by perform_base_backup(), and XLogRead().

{
    /* use volatile pointer to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;
    XLogSegNo   newestRemoved;
    char        segname[MAXFNAMELEN];

    /* Read the shared "last removed segment" value under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    newestRemoved = shared->lastRemovedSegNo;
    SpinLockRelease(&shared->info_lck);

    /* Segments newer than the last removed one pass the check. */
    if (segno > newestRemoved)
        return;

    /* Otherwise raise an ERROR that names the segment file. */
    XLogFileName(segname, tli, segno);
    ereport(ERROR,
            (errcode_for_file_access(),
             errmsg("requested WAL segment %s has already been removed",
                    segname)));
}

static void CleanupBackupHistory ( void   )  [static]

Definition at line 3081 of file xlog.c.

References AllocateDir(), dirent::d_name, DEBUG2, ereport, errcode_for_file_access(), errmsg(), ERROR, FreeDir(), MAXPGPATH, NULL, ReadDir(), snprintf(), unlink(), XLogArchiveCheckDone(), XLogArchiveCleanup(), and XLOGDIR.

Referenced by do_pg_stop_backup().

{
    DIR        *dir;
    struct dirent *entry;
    char        histpath[MAXPGPATH];

    /*
     * Scan XLOGDIR and delete every backup history file for which
     * XLogArchiveCheckDone() returns true.
     */
    dir = AllocateDir(XLOGDIR);
    if (dir == NULL)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open transaction log directory \"%s\": %m",
                        XLOGDIR)));

    while ((entry = ReadDir(dir, XLOGDIR)) != NULL)
    {
        char       *fname = entry->d_name;
        size_t      fnamelen = strlen(fname);

        /*
         * A backup history file name is longer than 24 characters, starts
         * with exactly 24 upper-case hex digits, and ends in ".backup".
         */
        if (fnamelen <= 24)
            continue;
        if (strspn(fname, "0123456789ABCDEF") != 24)
            continue;
        if (strcmp(fname + fnamelen - strlen(".backup"), ".backup") != 0)
            continue;

        /* Skip files that XLogArchiveCheckDone() does not report as done. */
        if (!XLogArchiveCheckDone(fname))
            continue;

        ereport(DEBUG2,
        (errmsg("removing transaction log backup history file \"%s\"",
                fname)));
        snprintf(histpath, MAXPGPATH, XLOGDIR "/%s", fname);
        unlink(histpath);
        XLogArchiveCleanup(fname);
    }

    FreeDir(dir);
}

void CreateCheckPoint ( int  flags  ) 

Definition at line 6764 of file xlog.c.

References AdvanceXLInsertBuffer(), XLogRecData::buffer, ControlFileData::checkPoint, CHECKPOINT_END_OF_RECOVERY, CHECKPOINT_FORCE, CHECKPOINT_IS_SHUTDOWN, ControlFileData::checkPointCopy, CheckPointGuts(), CheckpointLock, CheckpointStatsData::ckpt_bufs_written, CheckpointStatsData::ckpt_segs_added, CheckpointStatsData::ckpt_segs_recycled, CheckpointStatsData::ckpt_segs_removed, CheckpointStatsData::ckpt_start_t, XLogCtlData::ckptXid, XLogCtlData::ckptXidEpoch, ControlFileLock, XLogCtlInsert::curridx, XLogRecData::data, elog, END_CRIT_SECTION, ereport, errmsg(), ERROR, XLogCtlInsert::fullPageWrites, CheckPoint::fullPageWrites, GetCurrentTimestamp(), GetOldestActiveTransactionId(), GetOldestXmin(), GetVirtualXIDsDelayingChkpt(), HaveVirtualXIDsDelayingChkpt(), XLogCtlData::info_lck, XLogCtlData::Insert, Insert(), INSERT_FREESPACE, INSERT_RECPTR, KeepLogSeg(), XLogRecData::len, LocalSetXLogInsertAllowed(), LocalXLogInsertAllowed, log_checkpoints, LogCheckpointEnd(), LogCheckpointStart(), LogStandbySnapshot(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), MAXALIGN, MemSet, ControlFileData::minRecoveryPoint, ControlFileData::minRecoveryPointTLI, MultiXactGetCheckptMulti(), NBuffers, XLogRecData::next, CheckPoint::nextMulti, CheckPoint::nextMultiOffset, VariableCacheData::nextOid, CheckPoint::nextOid, VariableCacheData::nextXid, CheckPoint::nextXid, CheckPoint::nextXidEpoch, NULL, VariableCacheData::oidCount, OidGenLock, CheckPoint::oldestActiveXid, CheckPoint::oldestMulti, CheckPoint::oldestMultiDB, VariableCacheData::oldestXid, CheckPoint::oldestXid, VariableCacheData::oldestXidDB, CheckPoint::oldestXidDB, PANIC, pfree(), pg_usleep(), PreallocXlogFiles(), ControlFileData::prevCheckPoint, XLogCtlData::PrevTimeLineID, CheckPoint::PrevTimeLineID, ProcLastRecPtr, RecoveryInProgress(), CheckPoint::redo, XLogCtlInsert::RedoRecPtr, RedoRecPtr, RemoveOldXlogFiles(), ShmemVariableCache, SizeOfXLogRecord, smgrpostckpt(), smgrpreckpt(), SpinLockAcquire, 
SpinLockRelease, START_CRIT_SECTION, ControlFileData::state, ThisTimeLineID, CheckPoint::ThisTimeLineID, CheckPoint::time, ControlFileData::time, TruncateSUBTRANS(), XLogCtlData::ulsn_lck, XLogCtlData::unloggedLSN, ControlFileData::unloggedLSN, UpdateControlFile(), WALInsertLock, XidGenLock, XLByteToSeg, XLOG_CHECKPOINT_ONLINE, XLOG_CHECKPOINT_SHUTDOWN, XLogFlush(), XLogInsert(), and XLogStandbyInfoActive.

Referenced by CheckpointerMain(), RequestCheckpoint(), ShutdownXLOG(), and StartupXLOG().

{
    /*
     * Perform a checkpoint: fix the REDO pointer, flush checkpointable state
     * through CheckPointGuts(), insert and flush the checkpoint WAL record,
     * update pg_control, then remove or recycle old WAL segments.
     *
     * 'flags' is a bitmask.  The bits examined here are
     * CHECKPOINT_IS_SHUTDOWN, CHECKPOINT_END_OF_RECOVERY and
     * CHECKPOINT_FORCE; the whole value is also passed on to
     * LogCheckpointStart() and CheckPointGuts().
     *
     * Raises ERROR if recovery is in progress and this is not an
     * end-of-recovery checkpoint.  Returns early, writing nothing, when none
     * of the three bits above is set and no WAL has been inserted since the
     * previous checkpoint.  CheckpointLock is held for the whole operation.
     */
    bool        shutdown;
    CheckPoint  checkPoint;
    XLogRecPtr  recptr;
    XLogCtlInsert *Insert = &XLogCtl->Insert;
    XLogRecData rdata;
    XLogSegNo   _logSegNo;
    VirtualTransactionId *vxids;
    int nvxids;

    /*
     * An end-of-recovery checkpoint is really a shutdown checkpoint, just
     * issued at a different time.
     */
    if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
        shutdown = true;
    else
        shutdown = false;

    /* sanity check */
    if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
        elog(ERROR, "can't create a checkpoint during recovery");

    /*
     * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
     * (This is just pro forma, since in the present system structure there is
     * only one process that is allowed to issue checkpoints at any given
     * time.)
     */
    LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

    /*
     * Prepare to accumulate statistics.
     *
     * Note: because it is possible for log_checkpoints to change while a
     * checkpoint proceeds, we always accumulate stats, even if
     * log_checkpoints is currently off.
     */
    MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

    /*
     * Use a critical section to force system panic if we have trouble.
     */
    START_CRIT_SECTION();

    if (shutdown)
    {
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->state = DB_SHUTDOWNING;
        ControlFile->time = (pg_time_t) time(NULL);
        UpdateControlFile();
        LWLockRelease(ControlFileLock);
    }

    /*
     * Let smgr prepare for checkpoint; this has to happen before we determine
     * the REDO pointer.  Note that smgr must not do anything that'd have to
     * be undone if we decide no checkpoint is needed.
     */
    smgrpreckpt();

    /* Begin filling in the checkpoint WAL record */
    MemSet(&checkPoint, 0, sizeof(checkPoint));
    checkPoint.time = (pg_time_t) time(NULL);

    /*
     * For Hot Standby, derive the oldestActiveXid before we fix the redo
     * pointer. This allows us to begin accumulating changes to assemble our
     * starting snapshot of locks and transactions.
     */
    if (!shutdown && XLogStandbyInfoActive())
        checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
    else
        checkPoint.oldestActiveXid = InvalidTransactionId;

    /*
     * We must hold WALInsertLock while examining insert state to determine
     * the checkpoint REDO pointer.
     */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

    /*
     * If this isn't a shutdown or forced checkpoint, and we have not inserted
     * any XLOG records since the start of the last checkpoint, skip the
     * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
     * when the system is idle. That wastes log space, and more importantly it
     * exposes us to possible loss of both current and previous checkpoint
     * records if the machine crashes just as we're writing the update.
     * (Perhaps it'd make even more sense to checkpoint only when the previous
     * checkpoint record is in a different xlog page?)
     *
     * We have to make two tests to determine that nothing has happened since
     * the start of the last checkpoint: current insertion point must match
     * the end of the last checkpoint record, and its redo pointer must point
     * to itself.
     */
    if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
                  CHECKPOINT_FORCE)) == 0)
    {
        XLogRecPtr  curInsert;

        INSERT_RECPTR(curInsert, Insert, Insert->curridx);
        if (curInsert == ControlFile->checkPoint +
            MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
            ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
        {
            LWLockRelease(WALInsertLock);
            LWLockRelease(CheckpointLock);
            END_CRIT_SECTION();
            return;
        }
    }

    /*
     * An end-of-recovery checkpoint is created before anyone is allowed to
     * write WAL. To allow us to write the checkpoint record, temporarily
     * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
     * initialized, which we need here and in AdvanceXLInsertBuffer.)
     */
    if (flags & CHECKPOINT_END_OF_RECOVERY)
        LocalSetXLogInsertAllowed();

    checkPoint.ThisTimeLineID = ThisTimeLineID;
    if (flags & CHECKPOINT_END_OF_RECOVERY)
        checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    else
        checkPoint.PrevTimeLineID = ThisTimeLineID;

    checkPoint.fullPageWrites = Insert->fullPageWrites;

    /*
     * Compute new REDO record ptr = location of next XLOG record.
     *
     * NB: this is NOT necessarily where the checkpoint record itself will be,
     * since other backends may insert more XLOG records while we're off doing
     * the buffer flush work.  Those XLOG records are logically after the
     * checkpoint, even though physically before it.  Got that?
     */
    if (INSERT_FREESPACE(Insert) == 0)
    {
        /* OK to ignore update return flag, since we will do flush anyway */
        (void) AdvanceXLInsertBuffer(false);
    }
    INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);

    /*
     * Here we update the shared RedoRecPtr for future XLogInsert calls; this
     * must be done while holding the insert lock AND the info_lck.
     *
     * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
     * pointing past where it really needs to point.  This is okay; the only
     * consequence is that XLogInsert might back up whole buffers that it
     * didn't really need to.  We can't postpone advancing RedoRecPtr because
     * XLogInserts that happen while we are dumping buffers must assume that
     * their buffer changes are not included in the checkpoint.
     */
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
        SpinLockRelease(&xlogctl->info_lck);
    }

    /*
     * Now we can release WAL insert lock, allowing other xacts to proceed
     * while we are flushing disk buffers.
     */
    LWLockRelease(WALInsertLock);

    /*
     * If enabled, log checkpoint start.  We postpone this until now so as not
     * to log anything if we decided to skip the checkpoint.
     */
    if (log_checkpoints)
        LogCheckpointStart(flags, false);

    TRACE_POSTGRESQL_CHECKPOINT_START(flags);

    /*
     * In some cases there are groups of actions that must all occur on
     * one side or the other of a checkpoint record. Before flushing the
     * checkpoint record we must explicitly wait for any backend currently
     * performing those groups of actions.
     *
     * One example is end of transaction, so we must wait for any transactions
     * that are currently in commit critical sections.  If an xact inserted
     * its commit record into XLOG just before the REDO point, then a crash
     * restart from the REDO point would not replay that record, which means
     * that our flushing had better include the xact's update of pg_clog.  So
     * we wait till he's out of his commit critical section before proceeding.
     * See notes in RecordTransactionCommit().
     *
     * Because we've already released WALInsertLock, this test is a bit fuzzy:
     * it is possible that we will wait for xacts we didn't really need to
     * wait for.  But the delay should be short and it seems better to make
     * checkpoint take a bit longer than to hold locks longer than necessary.
     * (In fact, the whole reason we have this issue is that xact.c does
     * commit record XLOG insertion and clog update as two separate steps
     * protected by different locks, but again that seems best on grounds of
     * minimizing lock contention.)
     *
     * A transaction that has not yet set delayChkpt when we look cannot be at
     * risk, since he's not inserted his commit record yet; and one that's
     * already cleared it is not at risk either, since he's done fixing clog
     * and we will correctly flush the update below.  So we cannot miss any
     * xacts we need to wait for.
     */
    vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
    if (nvxids > 0)
    {
        do
        {
            pg_usleep(10000L);  /* wait for 10 msec */
        } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
    }
    pfree(vxids);

    /*
     * Get the other info we need for the checkpoint record.
     */
    LWLockAcquire(XidGenLock, LW_SHARED);
    checkPoint.nextXid = ShmemVariableCache->nextXid;
    checkPoint.oldestXid = ShmemVariableCache->oldestXid;
    checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
    LWLockRelease(XidGenLock);

    /* Increase XID epoch if we've wrapped around since last checkpoint */
    checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
    if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
        checkPoint.nextXidEpoch++;

    LWLockAcquire(OidGenLock, LW_SHARED);
    checkPoint.nextOid = ShmemVariableCache->nextOid;
    if (!shutdown)
        checkPoint.nextOid += ShmemVariableCache->oidCount;
    LWLockRelease(OidGenLock);

    MultiXactGetCheckptMulti(shutdown,
                             &checkPoint.nextMulti,
                             &checkPoint.nextMultiOffset,
                             &checkPoint.oldestMulti,
                             &checkPoint.oldestMultiDB);

    /*
     * Having constructed the checkpoint record, ensure all shmem disk buffers
     * and commit-log buffers are flushed to disk.
     *
     * This I/O could fail for various reasons.  If so, we will fail to
     * complete the checkpoint, but there is no reason to force a system
     * panic. Accordingly, exit critical section while doing it.
     */
    END_CRIT_SECTION();

    CheckPointGuts(checkPoint.redo, flags);

    /*
     * Take a snapshot of running transactions and write this to WAL. This
     * allows us to reconstruct the state of running transactions during
     * archive recovery, if required. Skip, if this info disabled.
     *
     * If we are shutting down, or Startup process is completing crash
     * recovery we don't need to write running xact data.
     */
    if (!shutdown && XLogStandbyInfoActive())
        LogStandbySnapshot();

    START_CRIT_SECTION();

    /*
     * Now insert the checkpoint record into XLOG.
     */
    rdata.data = (char *) (&checkPoint);
    rdata.len = sizeof(checkPoint);
    rdata.buffer = InvalidBuffer;
    rdata.next = NULL;

    recptr = XLogInsert(RM_XLOG_ID,
                        shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
                        XLOG_CHECKPOINT_ONLINE,
                        &rdata);

    XLogFlush(recptr);

    /*
     * We mustn't write any new WAL after a shutdown checkpoint, or it will be
     * overwritten at next startup.  No-one should even try, this just allows
     * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
     * to just temporarily disable writing until the system has exited
     * recovery.
     */
    if (shutdown)
    {
        if (flags & CHECKPOINT_END_OF_RECOVERY)
            LocalXLogInsertAllowed = -1;        /* return to "check" state */
        else
            LocalXLogInsertAllowed = 0; /* never again write WAL */
    }

    /*
     * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
     * = end of actual checkpoint record.
     */
    if (shutdown && checkPoint.redo != ProcLastRecPtr)
        ereport(PANIC,
                (errmsg("concurrent transaction log activity while database system is shutting down")));

    /*
     * Select point at which we can truncate the log, which we base on the
     * prior checkpoint's earliest info.
     */
    XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);

    /*
     * Update the control file.
     */
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    if (shutdown)
        ControlFile->state = DB_SHUTDOWNED;
    ControlFile->prevCheckPoint = ControlFile->checkPoint;
    ControlFile->checkPoint = ProcLastRecPtr;
    ControlFile->checkPointCopy = checkPoint;
    ControlFile->time = (pg_time_t) time(NULL);
    /* crash recovery should always recover to the end of WAL */
    ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
    ControlFile->minRecoveryPointTLI = 0;

    /*
     * Persist unloggedLSN value. It's reset on crash recovery, so this goes
     * unused on non-shutdown checkpoints, but seems useful to store it always
     * for debugging purposes.
     */
    SpinLockAcquire(&XLogCtl->ulsn_lck);
    ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
    SpinLockRelease(&XLogCtl->ulsn_lck);

    UpdateControlFile();
    LWLockRelease(ControlFileLock);

    /* Update shared-memory copy of checkpoint XID/epoch */
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
        xlogctl->ckptXid = checkPoint.nextXid;
        SpinLockRelease(&xlogctl->info_lck);
    }

    /*
     * We are now done with critical updates; no need for system panic if we
     * have trouble while fooling with old log segments.
     */
    END_CRIT_SECTION();

    /*
     * Let smgr do post-checkpoint cleanup (eg, deleting old files).
     */
    smgrpostckpt();

    /*
     * Delete old log files (those no longer needed even for previous
     * checkpoint or the standbys in XLOG streaming).
     */
    if (_logSegNo)
    {
        KeepLogSeg(recptr, &_logSegNo);
        _logSegNo--;
        RemoveOldXlogFiles(_logSegNo, recptr);
    }

    /*
     * Make more log segments if needed.  (Do this after recycling old log
     * segments, since that may supply some of the needed files.)
     */
    if (!shutdown)
        PreallocXlogFiles(recptr);

    /*
     * Truncate pg_subtrans if possible.  We can throw away all data before
     * the oldest XMIN of any running transaction.  No future transaction will
     * attempt to reference any pg_subtrans entry older than that (see Asserts
     * in subtrans.c).  During recovery, though, we mustn't do this because
     * StartupSUBTRANS hasn't been called yet.
     */
    if (!RecoveryInProgress())
        TruncateSUBTRANS(GetOldestXmin(true, false));

    /* Real work is done, but log and update stats before releasing lock. */
    LogCheckpointEnd(false);

    TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                     NBuffers,
                                     CheckpointStats.ckpt_segs_added,
                                     CheckpointStats.ckpt_segs_removed,
                                     CheckpointStats.ckpt_segs_recycled);

    LWLockRelease(CheckpointLock);
}

void CreateEndOfRecoveryRecord ( void   )  [static]

Definition at line 7184 of file xlog.c.

References XLogRecData::buffer, ControlFileLock, XLogRecData::data, elog, END_CRIT_SECTION, xl_end_of_recovery::end_time, ERROR, XLogRecData::len, LocalSetXLogInsertAllowed(), LocalXLogInsertAllowed, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), ControlFileData::minRecoveryPoint, ControlFileData::minRecoveryPointTLI, XLogRecData::next, NULL, XLogCtlData::PrevTimeLineID, xl_end_of_recovery::PrevTimeLineID, RecoveryInProgress(), START_CRIT_SECTION, ThisTimeLineID, xl_end_of_recovery::ThisTimeLineID, ControlFileData::time, UpdateControlFile(), WALInsertLock, XLOG_END_OF_RECOVERY, XLogFlush(), and XLogInsert().

Referenced by StartupXLOG().

{
    xl_end_of_recovery  eorRec;
    XLogRecData         eorData;
    XLogRecPtr          endLsn;

    /* This record may only be written while recovery is still in progress. */
    if (!RecoveryInProgress())
        elog(ERROR, "can only be used to end recovery");

    eorRec.end_time = time(NULL);

    /* Pick up the current and previous timeline IDs under WALInsertLock. */
    LWLockAcquire(WALInsertLock, LW_SHARED);
    eorRec.ThisTimeLineID = ThisTimeLineID;
    eorRec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
    LWLockRelease(WALInsertLock);

    /* Describe the record payload: one chunk, no associated buffer. */
    eorData.data = (char *) &eorRec;
    eorData.len = sizeof(eorRec);
    eorData.buffer = InvalidBuffer;
    eorData.next = NULL;

    LocalSetXLogInsertAllowed();

    START_CRIT_SECTION();

    endLsn = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &eorData);

    XLogFlush(endLsn);

    /*
     * Record the new position in the control file so that crash recovery
     * can follow the timeline changes to this point.
     */
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    ControlFile->time = (pg_time_t) eorRec.end_time;
    ControlFile->minRecoveryPoint = endLsn;
    ControlFile->minRecoveryPointTLI = ThisTimeLineID;
    UpdateControlFile();
    LWLockRelease(ControlFileLock);

    END_CRIT_SECTION();

    LocalXLogInsertAllowed = -1;        /* return to "check" state */
}

bool CreateRestartPoint ( int  flags  ) 

Definition at line 7326 of file xlog.c.

References XLogCtlData::archiveCleanupCommand, ControlFileData::checkPoint, CHECKPOINT_IS_SHUTDOWN, ControlFileData::checkPointCopy, CheckPointGuts(), CheckpointLock, CheckpointStatsData::ckpt_start_t, ControlFileLock, DB_IN_ARCHIVE_RECOVERY, DEBUG2, EnableHotStandby, ereport, errdetail(), errmsg(), ExecuteRecoveryCommand(), GetCurrentTimestamp(), GetLatestXTime(), GetOldestXmin(), GetWalRcvWriteRecPtr(), GetXLogReplayRecPtr(), XLogCtlData::info_lck, XLogCtlData::Insert, InvalidXLogRecPtr, KeepLogSeg(), XLogCtlData::lastCheckPoint, XLogCtlData::lastCheckPointRecPtr, LOG, log_checkpoints, LogCheckpointEnd(), LogCheckpointStart(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MemSet, NULL, PreallocXlogFiles(), ControlFileData::prevCheckPoint, RecoveryInProgress(), CheckPoint::redo, XLogCtlInsert::RedoRecPtr, RemoveOldXlogFiles(), SpinLockAcquire, SpinLockRelease, ControlFileData::state, ThisTimeLineID, ControlFileData::time, timestamptz_to_str(), TruncateSUBTRANS(), UpdateControlFile(), UpdateMinRecoveryPoint(), WALInsertLock, XLByteToSeg, and XLogRecPtrIsInvalid.

Referenced by CheckpointerMain(), and ShutdownXLOG().

{
    /*
     * Try to establish a restartpoint from the last checkpoint record saved
     * in shared memory (XLogCtl->lastCheckPoint / lastCheckPointRecPtr).
     *
     * flags: CHECKPOINT_* bits.  They are passed on to LogCheckpointStart()
     * and CheckPointGuts(); CHECKPOINT_IS_SHUTDOWN additionally switches
     * pg_control's state to DB_SHUTDOWNED_IN_RECOVERY.
     *
     * Returns false if the restartpoint was skipped (recovery has already
     * ended, or there is no checkpoint newer than the one in pg_control);
     * returns true after running through the flush and cleanup steps below.
     *
     * CheckpointLock is held from the start until just before
     * archive_cleanup_command is executed.
     */
    XLogRecPtr  lastCheckPointRecPtr;
    CheckPoint  lastCheckPoint;
    XLogSegNo   _logSegNo;
    TimestampTz xtime;

    /* use volatile pointer to prevent code rearrangement */
    volatile XLogCtlData *xlogctl = XLogCtl;

    /*
     * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
     * happens at a time.
     */
    LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

    /* Get a local copy of the last safe checkpoint record. */
    SpinLockAcquire(&xlogctl->info_lck);
    lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
    lastCheckPoint = xlogctl->lastCheckPoint;
    SpinLockRelease(&xlogctl->info_lck);

    /*
     * Check that we're still in recovery mode. It's ok if we exit recovery
     * mode after this check, the restart point is valid anyway.
     */
    if (!RecoveryInProgress())
    {
        ereport(DEBUG2,
              (errmsg("skipping restartpoint, recovery has already ended")));
        LWLockRelease(CheckpointLock);
        return false;
    }

    /*
     * If the last checkpoint record we've replayed is already our last
     * restartpoint, we can't perform a new restart point. We still update
     * minRecoveryPoint in that case, so that if this is a shutdown restart
     * point, we won't start up earlier than before. That's not strictly
     * necessary, but when hot standby is enabled, it would be rather weird if
     * the database opened up for read-only connections at a point-in-time
     * before the last shutdown. Such time travel is still possible in case of
     * immediate shutdown, though.
     *
     * We don't explicitly advance minRecoveryPoint when we do create a
     * restartpoint. It's assumed that flushing the buffers will do that as a
     * side-effect.
     */
    if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
        lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
    {
        ereport(DEBUG2,
                (errmsg("skipping restartpoint, already performed at %X/%X",
                        (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo)));

        UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
        if (flags & CHECKPOINT_IS_SHUTDOWN)
        {
            /* Record the shutdown state even though no restartpoint is made. */
            LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
            ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
            ControlFile->time = (pg_time_t) time(NULL);
            UpdateControlFile();
            LWLockRelease(ControlFileLock);
        }
        LWLockRelease(CheckpointLock);
        return false;
    }

    /*
     * Update the shared RedoRecPtr so that the startup process can calculate
     * the number of segments replayed since last restartpoint, and request a
     * restartpoint if it exceeds checkpoint_segments.
     *
     * You need to hold WALInsertLock and info_lck to update it, although
     * during recovery acquiring WALInsertLock is just pro forma, because
     * there is no other processes updating Insert.RedoRecPtr.
     */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    SpinLockAcquire(&xlogctl->info_lck);
    xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
    SpinLockRelease(&xlogctl->info_lck);
    LWLockRelease(WALInsertLock);

    /*
     * Prepare to accumulate statistics.
     *
     * Note: because it is possible for log_checkpoints to change while a
     * checkpoint proceeds, we always accumulate stats, even if
     * log_checkpoints is currently off.
     */
    MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
    CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

    if (log_checkpoints)
        LogCheckpointStart(flags, true);

    CheckPointGuts(lastCheckPoint.redo, flags);

    /*
     * Select point at which we can truncate the xlog, which we base on the
     * prior checkpoint's earliest info.
     *
     * This has to be read before pg_control is updated below, because that
     * update overwrites checkPointCopy with the new checkpoint.
     */
    XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);

    /*
     * Update pg_control, using current time.  Check that it still shows
     * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
     * this is a quick hack to make sure nothing really bad happens if somehow
     * we get here after the end-of-recovery checkpoint.
     */
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
        ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
    {
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
        ControlFile->checkPoint = lastCheckPointRecPtr;
        ControlFile->checkPointCopy = lastCheckPoint;
        ControlFile->time = (pg_time_t) time(NULL);
        if (flags & CHECKPOINT_IS_SHUTDOWN)
            ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
        UpdateControlFile();
    }
    LWLockRelease(ControlFileLock);

    /*
     * Delete old log files (those no longer needed even for previous
     * checkpoint/restartpoint) to prevent the disk holding the xlog from
     * growing full.
     *
     * Nothing is removed when _logSegNo is 0; skipping that case also keeps
     * the decrement below from going under zero.
     */
    if (_logSegNo)
    {
        XLogRecPtr  receivePtr;
        XLogRecPtr  replayPtr;
        XLogRecPtr  endptr;

        /*
         * Get the current end of xlog replayed or received, whichever is later.
         */
        receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
        replayPtr = GetXLogReplayRecPtr(NULL);
        endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;

        /* KeepLogSeg() may adjust _logSegNo; we then step back one segment. */
        KeepLogSeg(endptr, &_logSegNo);
        _logSegNo--;

        /*
         * Update ThisTimeLineID to the timeline we're currently replaying,
         * so that we install any recycled segments on that timeline.
         *
         * There is no guarantee that the WAL segments will be useful on the
         * current timeline; if recovery proceeds to a new timeline right
         * after this, the pre-allocated WAL segments on this timeline will
         * not be used, and will go wasted until recycled on the next
         * restartpoint. We'll live with that.
         */
        (void) GetXLogReplayRecPtr(&ThisTimeLineID);

        RemoveOldXlogFiles(_logSegNo, endptr);

        /*
         * Make more log segments if needed.  (Do this after recycling old log
         * segments, since that may supply some of the needed files.)
         */
        PreallocXlogFiles(endptr);
    }

    /*
     * Truncate pg_subtrans if possible.  We can throw away all data before
     * the oldest XMIN of any running transaction.  No future transaction will
     * attempt to reference any pg_subtrans entry older than that (see Asserts
     * in subtrans.c).  When hot standby is disabled, though, we mustn't do
     * this because StartupSUBTRANS hasn't been called yet.
     */
    if (EnableHotStandby)
        TruncateSUBTRANS(GetOldestXmin(true, false));

    /* Real work is done, but log and update before releasing lock. */
    LogCheckpointEnd(true);

    /*
     * Report the restartpoint at LOG level when log_checkpoints is on, else
     * at DEBUG2.  The errdetail is attached only when xtime is non-zero.
     */
    xtime = GetLatestXTime();
    ereport((log_checkpoints ? LOG : DEBUG2),
            (errmsg("recovery restart point at %X/%X",
                    (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
           xtime ? errdetail("last completed transaction was at log time %s",
                             timestamptz_to_str(xtime)) : 0));

    LWLockRelease(CheckpointLock);

    /*
     * Finally, execute archive_cleanup_command, if any.  This runs after
     * CheckpointLock has been released.
     */
    if (XLogCtl->archiveCleanupCommand[0])
        ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
                               "archive_cleanup_command",
                               false);

    return true;
}

bool DataChecksumsEnabled ( void   ) 
void do_pg_abort_backup ( void   ) 
XLogRecPtr do_pg_start_backup ( const char *  backupidstr,
bool  fast,
TimeLineID *  starttli_p,
char **  labelfile 
)

Definition at line 8397 of file xlog.c.

References AllocateFile(), appendStringInfo(), BACKUP_LABEL_FILE, backup_started_in_recovery, BoolGetDatum, ControlFileData::checkPoint, CHECKPOINT_FORCE, CHECKPOINT_IMMEDIATE, CHECKPOINT_WAIT, ControlFileData::checkPointCopy, ControlFileLock, StringInfoData::data, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, XLogCtlInsert::exclusiveBackup, XLogCtlInsert::forcePageWrites, FreeFile(), CheckPoint::fullPageWrites, GetUserId(), has_rolreplication(), XLogCtlData::info_lck, initStringInfo(), XLogCtlData::Insert, XLogCtlInsert::lastBackupStart, XLogCtlData::lastFpwDisableRecPtr, StringInfoData::len, log_timezone, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), MAXPGPATH, XLogCtlInsert::nonExclusiveBackups, NULL, pfree(), PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, pg_fsync(), pg_localtime(), pg_start_backup_callback(), pg_strftime(), RecoveryInProgress(), CheckPoint::redo, RequestCheckpoint(), RequestXLogSwitch(), SpinLockAcquire, SpinLockRelease, superuser(), ThisTimeLineID, CheckPoint::ThisTimeLineID, WALInsertLock, XLByteToSeg, XLogFileName, and XLogIsNeeded.

Referenced by perform_base_backup(), and pg_start_backup().

{
    /*
     * Start an online backup: mark it active in shared memory, force a
     * checkpoint (or restartpoint, during recovery), and build the backup
     * label.
     *
     * backupidstr: user-supplied label, written to the "LABEL:" line; it is
     *      rejected if longer than MAXPGPATH bytes.
     * fast:        if true, the checkpoint is requested with
     *      CHECKPOINT_IMMEDIATE.
     * starttli_p:  if not NULL, receives the timeline ID of the starting
     *      checkpoint.
     * labelfile:   NULL selects an exclusive backup, whose label is written
     *      to BACKUP_LABEL_FILE.  Otherwise the backup is non-exclusive and
     *      *labelfile receives the label text (the StringInfo buffer, which
     *      is not freed here).
     *
     * Returns the starting WAL location (the checkpoint's REDO pointer).
     *
     * Raises ERROR on insufficient privilege, an exclusive backup attempted
     * during recovery, insufficient wal_level, an over-long label, a backup
     * already in progress, replayed WAL that lacks full-page writes, or a
     * file access failure.  On error after the backup has been marked
     * active, pg_start_backup_callback is invoked via
     * PG_ENSURE_ERROR_CLEANUP.
     */
    bool        exclusive = (labelfile == NULL);
    bool        backup_started_in_recovery = false;
    XLogRecPtr  checkpointloc;
    XLogRecPtr  startpoint;
    TimeLineID  starttli;
    pg_time_t   stamp_time;
    char        strfbuf[128];
    char        xlogfilename[MAXFNAMELEN];
    XLogSegNo   _logSegNo;
    struct stat stat_buf;
    FILE       *fp;
    StringInfoData labelfbuf;

    backup_started_in_recovery = RecoveryInProgress();

    if (!superuser() && !has_rolreplication(GetUserId()))
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
           errmsg("must be superuser or replication role to run a backup")));

    /*
     * Currently only non-exclusive backup can be taken during recovery.
     */
    if (backup_started_in_recovery && exclusive)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("recovery is in progress"),
                 errhint("WAL control functions cannot be executed during recovery.")));

    /*
     * During recovery, we don't need to check WAL level. Because, if WAL
     * level is not sufficient, it's impossible to get here during recovery.
     */
    if (!backup_started_in_recovery && !XLogIsNeeded())
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
              errmsg("WAL level not sufficient for making an online backup"),
                 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

    if (strlen(backupidstr) > MAXPGPATH)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("backup label too long (max %d bytes)",
                        MAXPGPATH)));

    /*
     * Mark backup active in shared memory.  We must do full-page WAL writes
     * during an on-line backup even if not doing so at other times, because
     * it's quite possible for the backup dump to obtain a "torn" (partially
     * written) copy of a database page if it reads the page concurrently with
     * our write to the same page.  This can be fixed as long as the first
     * write to the page in the WAL sequence is a full-page write. Hence, we
     * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
     * are no dirty pages in shared memory that might get dumped while the
     * backup is in progress without having a corresponding WAL record.  (Once
     * the backup is complete, we need not force full-page writes anymore,
     * since we expect that any pages not modified during the backup interval
     * must have been correctly captured by the backup.)
     *
     * Note that forcePageWrites has no effect during an online backup from
     * the standby.
     *
     * We must hold WALInsertLock to change the value of forcePageWrites, to
     * ensure adequate interlocking against XLogInsert().
     */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    if (exclusive)
    {
        if (XLogCtl->Insert.exclusiveBackup)
        {
            /* Drop the lock ourselves before reporting the error. */
            LWLockRelease(WALInsertLock);
            ereport(ERROR,
                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                     errmsg("a backup is already in progress"),
                     errhint("Run pg_stop_backup() and try again.")));
        }
        XLogCtl->Insert.exclusiveBackup = true;
    }
    else
        XLogCtl->Insert.nonExclusiveBackups++;
    XLogCtl->Insert.forcePageWrites = true;
    LWLockRelease(WALInsertLock);

    /* Ensure we release forcePageWrites if fail below */
    PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
    {
        bool        gotUniqueStartpoint = false;

        /*
         * Force an XLOG file switch before the checkpoint, to ensure that the
         * WAL segment the checkpoint is written to doesn't contain pages with
         * old timeline IDs.  That would otherwise happen if you called
         * pg_start_backup() right after restoring from a PITR archive: the
         * first WAL segment containing the startup checkpoint has pages in
         * the beginning with the old timeline ID.  That can cause trouble at
         * recovery: we won't have a history file covering the old timeline if
         * pg_xlog directory was not included in the base backup and the WAL
         * archive was cleared too before starting the backup.
         *
         * This also ensures that we have emitted a WAL page header that has
         * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
         * Therefore, if a WAL archiver (such as pglesslog) is trying to
         * compress out removable backup blocks, it won't remove any that
         * occur after this point.
         *
         * During recovery, we skip forcing XLOG file switch, which means that
         * the backup taken during recovery is not available for the special
         * recovery case described above.
         */
        if (!backup_started_in_recovery)
            RequestXLogSwitch();

        do
        {
            bool        checkpointfpw;

            /*
             * Force a CHECKPOINT.  Aside from being necessary to prevent torn
             * page problems, this guarantees that two successive backup runs
             * will have different checkpoint positions and hence different
             * history file names, even if nothing happened in between.
             *
             * During recovery, establish a restartpoint if possible. We use
             * the last restartpoint as the backup starting checkpoint. This
             * means that two successive backup runs can have same checkpoint
             * positions.
             *
             * Since the fact that we are executing do_pg_start_backup()
             * during recovery means that checkpointer is running, we can use
             * RequestCheckpoint() to establish a restartpoint.
             *
             * We use CHECKPOINT_IMMEDIATE only if requested by user (via
             * passing fast = true).  Otherwise this can take awhile.
             */
            RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
                              (fast ? CHECKPOINT_IMMEDIATE : 0));

            /*
             * Now we need to fetch the checkpoint record location, and also
             * its REDO pointer.  The oldest point in WAL that would be needed
             * to restore starting from the checkpoint is precisely the REDO
             * pointer.
             */
            LWLockAcquire(ControlFileLock, LW_SHARED);
            checkpointloc = ControlFile->checkPoint;
            startpoint = ControlFile->checkPointCopy.redo;
            starttli = ControlFile->checkPointCopy.ThisTimeLineID;
            checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
            LWLockRelease(ControlFileLock);

            if (backup_started_in_recovery)
            {
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;
                XLogRecPtr  recptr;

                /*
                 * Check to see if all WAL replayed during online backup
                 * (i.e., since last restartpoint used as backup starting
                 * checkpoint) contain full-page writes.
                 */
                SpinLockAcquire(&xlogctl->info_lck);
                recptr = xlogctl->lastFpwDisableRecPtr;
                SpinLockRelease(&xlogctl->info_lck);

                if (!checkpointfpw || startpoint <= recptr)
                    ereport(ERROR,
                          (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                           errmsg("WAL generated with full_page_writes=off was replayed "
                                  "since last restartpoint"),
                           errhint("This means that the backup being taken on the standby "
                                   "is corrupt and should not be used. "
                                   "Enable full_page_writes and run CHECKPOINT on the master, "
                                   "and then try an online backup again.")));

                /*
                 * During recovery, since we don't use the end-of-backup WAL
                 * record and don't write the backup history file, the
                 * starting WAL location doesn't need to be unique. This means
                 * that two base backups started at the same time might use
                 * the same checkpoint as starting locations.
                 */
                gotUniqueStartpoint = true;
            }

            /*
             * If two base backups are started at the same time (in WAL sender
             * processes), we need to make sure that they use different
             * checkpoints as starting locations, because we use the starting
             * WAL location as a unique identifier for the base backup in the
             * end-of-backup WAL record and when we write the backup history
             * file. Perhaps it would be better to generate a separate unique
             * ID for each backup instead of forcing another checkpoint, but
             * taking a checkpoint right after another is not that expensive
             * either because only few buffers have been dirtied yet.
             *
             * The compare-and-set of lastBackupStart must be done under an
             * exclusive lock: with only a shared lock, two backends could
             * both observe the old value, both store the same startpoint, and
             * both conclude that their starting location is unique.
             */
            LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
            if (XLogCtl->Insert.lastBackupStart < startpoint)
            {
                XLogCtl->Insert.lastBackupStart = startpoint;
                gotUniqueStartpoint = true;
            }
            LWLockRelease(WALInsertLock);
        } while (!gotUniqueStartpoint);

        /* Name of the WAL segment file that contains the start location. */
        XLByteToSeg(startpoint, _logSegNo);
        XLogFileName(xlogfilename, ThisTimeLineID, _logSegNo);

        /*
         * Construct backup label file
         */
        initStringInfo(&labelfbuf);

        /* Use the log timezone here, not the session timezone */
        stamp_time = (pg_time_t) time(NULL);
        pg_strftime(strfbuf, sizeof(strfbuf),
                    "%Y-%m-%d %H:%M:%S %Z",
                    pg_localtime(&stamp_time, log_timezone));
        appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n",
                         (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
        appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n",
                         (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
        appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n",
                         exclusive ? "pg_start_backup" : "streamed");
        appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n",
                         backup_started_in_recovery ? "standby" : "master");
        appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
        appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);

        /*
         * Okay, write the file, or return its contents to caller.
         */
        if (exclusive)
        {
            /*
             * Check for existing backup label --- implies a backup is already
             * running.  (XXX given that we checked exclusiveBackup above,
             * maybe it would be OK to just unlink any such label file?)
             */
            if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
            {
                /* ENOENT is the expected case: no label file exists yet. */
                if (errno != ENOENT)
                    ereport(ERROR,
                            (errcode_for_file_access(),
                             errmsg("could not stat file \"%s\": %m",
                                    BACKUP_LABEL_FILE)));
            }
            else
                ereport(ERROR,
                        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                         errmsg("a backup is already in progress"),
                         errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
                                 BACKUP_LABEL_FILE)));

            fp = AllocateFile(BACKUP_LABEL_FILE, "w");

            if (!fp)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not create file \"%s\": %m",
                                BACKUP_LABEL_FILE)));

            /* Write, flush, fsync and close; any failure is reported alike. */
            if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 ||
                fflush(fp) != 0 ||
                pg_fsync(fileno(fp)) != 0 ||
                ferror(fp) ||
                FreeFile(fp))
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not write file \"%s\": %m",
                                BACKUP_LABEL_FILE)));
            pfree(labelfbuf.data);
        }
        else
            *labelfile = labelfbuf.data;    /* buffer is handed to the caller */
    }
    PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));

    /*
     * We're done.  As a convenience, return the starting WAL location.
     */
    if (starttli_p)
        *starttli_p = starttli;
    return startpoint;
}

XLogRecPtr do_pg_stop_backup ( char *  labelfile,
bool  waitforarchive,
TimeLineID *  stoptli_p 
)

Definition at line 8723 of file xlog.c.

References AllocateFile(), Assert, BACKUP_LABEL_FILE, backup_started_in_recovery, BackupHistoryFileName, BackupHistoryFilePath, XLogRecData::buffer, CHECK_FOR_INTERRUPTS, CleanupBackupHistory(), ControlFileLock, XLogRecData::data, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, XLogCtlInsert::exclusiveBackup, XLogCtlInsert::forcePageWrites, FreeFile(), GetUserId(), has_rolreplication(), XLogCtlData::info_lck, XLogCtlData::Insert, XLogCtlData::lastFpwDisableRecPtr, XLogRecData::len, log_timezone, LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), ControlFileData::minRecoveryPoint, ControlFileData::minRecoveryPointTLI, XLogRecData::next, XLogCtlInsert::nonExclusiveBackups, NOTICE, NULL, palloc(), pg_localtime(), pg_strftime(), pg_usleep(), RecoveryInProgress(), remaining, RequestXLogSwitch(), SpinLockAcquire, SpinLockRelease, superuser(), ThisTimeLineID, unlink(), WALInsertLock, WARNING, XLByteToPrevSeg, XLByteToSeg, XLOG_BACKUP_END, XLogArchiveIsBusy(), XLogArchivingActive, XLogFileName, XLogInsert(), XLogIsNeeded, and XLogSegSize.

Referenced by perform_base_backup(), and pg_stop_backup().

{
    bool        exclusive = (labelfile == NULL);
    bool        backup_started_in_recovery = false;
    XLogRecPtr  startpoint;
    XLogRecPtr  stoppoint;
    TimeLineID  stoptli;
    XLogRecData rdata;
    pg_time_t   stamp_time;
    char        strfbuf[128];
    char        histfilepath[MAXPGPATH];
    char        startxlogfilename[MAXFNAMELEN];
    char        stopxlogfilename[MAXFNAMELEN];
    char        lastxlogfilename[MAXFNAMELEN];
    char        histfilename[MAXFNAMELEN];
    char        backupfrom[20];
    XLogSegNo   _logSegNo;
    FILE       *lfp;
    FILE       *fp;
    char        ch;
    int         seconds_before_warning;
    int         waits = 0;
    bool        reported_waiting = false;
    char       *remaining;
    char       *ptr;
    uint32      hi,
                lo;

    backup_started_in_recovery = RecoveryInProgress();

    if (!superuser() && !has_rolreplication(GetUserId()))
        ereport(ERROR,
                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
         (errmsg("must be superuser or replication role to run a backup"))));

    /*
     * Currently only non-exclusive backup can be taken during recovery.
     */
    if (backup_started_in_recovery && exclusive)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("recovery is in progress"),
                 errhint("WAL control functions cannot be executed during recovery.")));

    /*
     * During recovery, we don't need to check WAL level. Because, if WAL
     * level is not sufficient, it's impossible to get here during recovery.
     */
    if (!backup_started_in_recovery && !XLogIsNeeded())
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
              errmsg("WAL level not sufficient for making an online backup"),
                 errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));

    /*
     * OK to update backup counters and forcePageWrites
     */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
    if (exclusive)
        XLogCtl->Insert.exclusiveBackup = false;
    else
    {
        /*
         * The user-visible pg_start/stop_backup() functions that operate on
         * exclusive backups can be called at any time, but for non-exclusive
         * backups, it is expected that each do_pg_start_backup() call is
         * matched by exactly one do_pg_stop_backup() call.
         */
        Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
        XLogCtl->Insert.nonExclusiveBackups--;
    }

    if (!XLogCtl->Insert.exclusiveBackup &&
        XLogCtl->Insert.nonExclusiveBackups == 0)
    {
        XLogCtl->Insert.forcePageWrites = false;
    }
    LWLockRelease(WALInsertLock);

    if (exclusive)
    {
        /*
         * Read the existing label file into memory.
         */
        struct stat statbuf;
        int         r;

        if (stat(BACKUP_LABEL_FILE, &statbuf))
        {
            if (errno != ENOENT)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not stat file \"%s\": %m",
                                BACKUP_LABEL_FILE)));
            ereport(ERROR,
                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                     errmsg("a backup is not in progress")));
        }

        lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
        if (!lfp)
        {
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read file \"%s\": %m",
                            BACKUP_LABEL_FILE)));
        }
        labelfile = palloc(statbuf.st_size + 1);
        r = fread(labelfile, statbuf.st_size, 1, lfp);
        labelfile[statbuf.st_size] = '\0';

        /*
         * Close and remove the backup label file
         */
        if (r != 1 || ferror(lfp) || FreeFile(lfp))
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not read file \"%s\": %m",
                            BACKUP_LABEL_FILE)));
        if (unlink(BACKUP_LABEL_FILE) != 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not remove file \"%s\": %m",
                            BACKUP_LABEL_FILE)));
    }

    /*
     * Read and parse the START WAL LOCATION line (this code is pretty crude,
     * but we are not expecting any variability in the file format).
     */
    if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
               &hi, &lo, startxlogfilename,
               &ch) != 4 || ch != '\n')
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    startpoint = ((uint64) hi) << 32 | lo;
    remaining = strchr(labelfile, '\n') + 1;    /* %n is not portable enough */

    /*
     * Parse the BACKUP FROM line. If we are taking an online backup from the
     * standby, we confirm that the standby has not been promoted during the
     * backup.
     */
    ptr = strstr(remaining, "BACKUP FROM:");
    if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
        ereport(ERROR,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("the standby was promoted during online backup"),
                 errhint("This means that the backup being taken is corrupt "
                         "and should not be used. "
                         "Try taking another online backup.")));

    /*
     * During recovery, we don't write an end-of-backup record. We assume that
     * pg_control was backed up last and its minimum recovery point can be
     * available as the backup end location. Since we don't have an
     * end-of-backup record, we use the pg_control value to check whether
     * we've reached the end of backup when starting recovery from this
     * backup. We have no way of checking if pg_control wasn't backed up last
     * however.
     *
     * We don't force a switch to new WAL file and wait for all the required
     * files to be archived. This is okay if we use the backup to start the
     * standby. But, if it's for an archive recovery, to ensure all the
     * required files are available, a user should wait for them to be
     * archived, or include them into the backup.
     *
     * We return the current minimum recovery point as the backup end
     * location. Note that it can be greater than the exact backup end
     * location if the minimum recovery point is updated after the backup of
     * pg_control. This is harmless for current uses.
     *
     * XXX currently a backup history file is for informational and debug
     * purposes only. It's not essential for an online backup. Furthermore,
     * even if it's created, it will not be archived during recovery because
     * an archiver is not invoked. So it doesn't seem worthwhile to write a
     * backup history file during recovery.
     */
    if (backup_started_in_recovery)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;
        XLogRecPtr  recptr;

        /*
         * Check to see if all WAL replayed during online backup contain
         * full-page writes.
         */
        SpinLockAcquire(&xlogctl->info_lck);
        recptr = xlogctl->lastFpwDisableRecPtr;
        SpinLockRelease(&xlogctl->info_lck);

        if (startpoint <= recptr)
            ereport(ERROR,
                    (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
               errmsg("WAL generated with full_page_writes=off was replayed "
                      "during online backup"),
                 errhint("This means that the backup being taken on the standby "
                         "is corrupt and should not be used. "
                 "Enable full_page_writes and run CHECKPOINT on the master, "
                         "and then try an online backup again.")));


        LWLockAcquire(ControlFileLock, LW_SHARED);
        stoppoint = ControlFile->minRecoveryPoint;
        stoptli = ControlFile->minRecoveryPointTLI;
        LWLockRelease(ControlFileLock);

        if (stoptli_p)
            *stoptli_p = stoptli;
        return stoppoint;
    }

    /*
     * Write the backup-end xlog record
     */
    rdata.data = (char *) (&startpoint);
    rdata.len = sizeof(startpoint);
    rdata.buffer = InvalidBuffer;
    rdata.next = NULL;
    stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
    stoptli = ThisTimeLineID;

    /*
     * Force a switch to a new xlog segment file, so that the backup is valid
     * as soon as archiver moves out the current segment file.
     */
    RequestXLogSwitch();

    XLByteToPrevSeg(stoppoint, _logSegNo);
    XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);

    /* Use the log timezone here, not the session timezone */
    stamp_time = (pg_time_t) time(NULL);
    pg_strftime(strfbuf, sizeof(strfbuf),
                "%Y-%m-%d %H:%M:%S %Z",
                pg_localtime(&stamp_time, log_timezone));

    /*
     * Write the backup history file
     */
    XLByteToSeg(startpoint, _logSegNo);
    BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
                          (uint32) (startpoint % XLogSegSize));
    fp = AllocateFile(histfilepath, "w");
    if (!fp)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not create file \"%s\": %m",
                        histfilepath)));
    fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
            (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
    fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
            (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
    /* transfer remaining lines from label to history file */
    fprintf(fp, "%s", remaining);
    fprintf(fp, "STOP TIME: %s\n", strfbuf);
    if (fflush(fp) || ferror(fp) || FreeFile(fp))
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not write file \"%s\": %m",
                        histfilepath)));

    /*
     * Clean out any no-longer-needed history files.  As a side effect, this
     * will post a .ready file for the newly created history file, notifying
     * the archiver that history file may be archived immediately.
     */
    CleanupBackupHistory();

    /*
     * If archiving is enabled, wait for all the required WAL files to be
     * archived before returning. If archiving isn't enabled, the required WAL
     * needs to be transported via streaming replication (hopefully with
     * wal_keep_segments set high enough), or some more exotic mechanism like
     * polling and copying files from pg_xlog with script. We have no
     * knowledge of those mechanisms, so it's up to the user to ensure that he
     * gets all the required WAL.
     *
     * We wait until both the last WAL file filled during backup and the
     * history file have been archived, and assume that the alphabetic sorting
     * property of the WAL files ensures any earlier WAL files are safely
     * archived as well.
     *
     * We wait forever, since archive_command is supposed to work and we
     * assume the admin wanted his backup to work completely. If you don't
     * wish to wait, you can set statement_timeout.  Also, some notices are
     * issued to clue in anyone who might be doing this interactively.
     */
    if (waitforarchive && XLogArchivingActive())
    {
        XLByteToPrevSeg(stoppoint, _logSegNo);
        XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);

        XLByteToSeg(startpoint, _logSegNo);
        BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
                              (uint32) (startpoint % XLogSegSize));

        seconds_before_warning = 60;
        waits = 0;

        while (XLogArchiveIsBusy(lastxlogfilename) ||
               XLogArchiveIsBusy(histfilename))
        {
            CHECK_FOR_INTERRUPTS();

            if (!reported_waiting && waits > 5)
            {
                ereport(NOTICE,
                        (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
                reported_waiting = true;
            }

            pg_usleep(1000000L);

            if (++waits >= seconds_before_warning)
            {
                seconds_before_warning *= 2;    /* This wraps in >10 years... */
                ereport(WARNING,
                        (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
                                waits),
                         errhint("Check that your archive_command is executing properly.  "
                                 "pg_stop_backup can be canceled safely, "
                                 "but the database backup will not be usable without all the WAL segments.")));
            }
        }

        ereport(NOTICE,
                (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
    }
    else if (waitforarchive)
        ereport(NOTICE,
                (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));

    /*
     * We're done.  As a convenience, return the ending WAL location.
     */
    if (stoptli_p)
        *stoptli_p = stoptli;
    return stoppoint;
}

static int emode_for_corrupt_record ( int  emode,
XLogRecPtr  RecPtr 
) [static]

Definition at line 9861 of file xlog.c.

References LOG, readSource, and XLOG_FROM_PG_XLOG.

Referenced by ReadRecord(), and XLogPageRead().

{
    /* WAL location of the last LOG-level complaint passed through here. */
    static XLogRecPtr prevReported = 0;

    /* Only LOG-level reports about data read from pg_xlog are adjusted. */
    if (readSource != XLOG_FROM_PG_XLOG || emode != LOG)
        return emode;

    /* Same location as the previous complaint: demote to DEBUG1. */
    if (RecPtr == prevReported)
        return DEBUG1;

    /* New location: remember it and keep the requested level. */
    prevReported = RecPtr;
    return emode;
}

static void exitArchiveRecovery ( TimeLineID  endTLI,
XLogSegNo  endLogSegNo 
) [static]

Definition at line 4400 of file xlog.c.

References close, ereport, errcode_for_file_access(), errmsg(), FATAL, InArchiveRecovery, InvalidXLogRecPtr, LOG, MAXPGPATH, readFile, RECOVERY_COMMAND_DONE, RECOVERY_COMMAND_FILE, snprintf(), ThisTimeLineID, unlink(), UpdateMinRecoveryPoint(), XLogArchiveCleanup(), XLogArchiveNotify(), XLogArchivingActive, XLOGDIR, XLogFileCopy(), and XLogFileName.

Referenced by StartupXLOG().

{
    /*
     * Finish archive recovery: clear the InArchiveRecovery flag, push out a
     * final minimum recovery point, close the open read segment, seed the
     * new timeline's segment (when endTLI differs from ThisTimeLineID),
     * remove leftover temporary files and rename the recovery command file.
     *
     * endTLI and endLogSegNo identify the segment that is copied and, when
     * archiving is active, reported to the archiver.
     *
     * A failure to rename the recovery command file is reported at FATAL.
     */
    char        recoveryPath[MAXPGPATH];
    char        xlogpath[MAXPGPATH];

    /*
     * We are no longer in archive recovery state.
     */
    InArchiveRecovery = false;

    /*
     * Update min recovery point one last time.
     */
    UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

    /*
     * If the ending log segment is still open, close it (to avoid problems on
     * Windows with trying to rename or delete an open file).
     */
    if (readFile >= 0)
    {
        close(readFile);
        readFile = -1;          /* mark the descriptor as no longer open */
    }

    /*
     * If we are establishing a new timeline, we have to copy data from the
     * last WAL segment of the old timeline to create a starting WAL segment
     * for the new timeline.
     *
     * Notify the archiver that the last WAL segment of the old timeline is
     * ready to copy to archival storage. Otherwise, it is not archived for a
     * while.
     */
    if (endTLI != ThisTimeLineID)
    {
        XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);

        if (XLogArchivingActive())
        {
            /* xlogpath holds the old timeline's segment file name here */
            XLogFileName(xlogpath, endTLI, endLogSegNo);
            XLogArchiveNotify(xlogpath);
        }
    }

    /*
     * Let's just make real sure there are not .ready or .done flags posted
     * for the new segment.
     */
    XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
    XLogArchiveCleanup(xlogpath);

    /*
     * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
     * of it.
     */
    snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
    unlink(recoveryPath);       /* ignore any error */

    /* Get rid of any remaining recovered timeline-history file, too */
    snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
    unlink(recoveryPath);       /* ignore any error */

    /*
     * Rename the config file out of the way, so that we don't accidentally
     * re-enter archive recovery mode in a subsequent crash.
     *
     * Any existing RECOVERY_COMMAND_DONE file is removed first; the result of
     * that unlink() is not checked.  NOTE(review): presumably this is so the
     * rename cannot fail on platforms where rename() refuses to overwrite an
     * existing file -- confirm.
     */
    unlink(RECOVERY_COMMAND_DONE);
    if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
        ereport(FATAL,
                (errcode_for_file_access(),
                 errmsg("could not rename file \"%s\" to \"%s\": %m",
                        RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));

    ereport(LOG,
            (errmsg("archive recovery complete")));
}

static int get_sync_bit ( int  method  )  [static]

Definition at line 8235 of file xlog.c.

References AmWalReceiverProcess, elog, enableFsync, ERROR, SYNC_METHOD_FDATASYNC, SYNC_METHOD_FSYNC, SYNC_METHOD_FSYNC_WRITETHROUGH, SYNC_METHOD_OPEN, SYNC_METHOD_OPEN_DSYNC, and XLogIsNeeded.

Referenced by assign_xlog_sync_method(), XLogFileInit(), and XLogFileOpen().

{
    int         direct_io;

    /* With fsync turned off, files are never opened in a sync mode. */
    if (!enableFsync)
        return 0;

    /*
     * O_DIRECT lets writes bypass the kernel cache when combined with
     * O_SYNC/O_FSYNC or O_DSYNC.  That only pays off while archiving and
     * streaming are both disabled: otherwise the archive command or a
     * walsender reads the WAL back soon after it is written, and bypassing
     * the cache would guarantee a physical read.  (XLogFileClose() skips its
     * posix_fadvise(POSIX_FADV_DONTNEED) call for the same reason.)
     *
     * The walreceiver never gets O_DIRECT either.  What it writes is normally
     * read by the startup process soon after it's written, and it performs
     * unaligned writes, which do not work with O_DIRECT, so this is needed
     * for correctness as well.
     */
    if (XLogIsNeeded() || AmWalReceiverProcess())
        direct_io = 0;
    else
        direct_io = PG_O_DIRECT;

    /*
     * Every sync method has an enum value even on platforms that lack it.
     * Unsupported methods are left out of the enum option array, so they
     * should never reach this switch.
     */
    switch (method)
    {
#ifdef OPEN_SYNC_FLAG
        case SYNC_METHOD_OPEN:
            return OPEN_SYNC_FLAG | direct_io;
#endif
#ifdef OPEN_DATASYNC_FLAG
        case SYNC_METHOD_OPEN_DSYNC:
            return OPEN_DATASYNC_FLAG | direct_io;
#endif
        case SYNC_METHOD_FSYNC:
        case SYNC_METHOD_FSYNC_WRITETHROUGH:
        case SYNC_METHOD_FDATASYNC:
            /* these methods sync explicitly, so no open() flag is needed */
            return 0;
        default:
            /* can't happen (unless we are out of sync with option array) */
            elog(ERROR, "unrecognized wal_sync_method: %d", method);
            return 0;           /* silence warning */
    }
}

TimestampTz GetCurrentChunkReplayStartTime ( void   ) 

Definition at line 4753 of file xlog.c.

References XLogCtlData::currentChunkStartTime, XLogCtlData::info_lck, SpinLockAcquire, and SpinLockRelease.

Referenced by GetReplicationApplyDelay().

{
    TimestampTz chunkStart;

    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;

    /* Copy currentChunkStartTime out of shared memory under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    chunkStart = shared->currentChunkStartTime;
    SpinLockRelease(&shared->info_lck);

    return chunkStart;
}

XLogRecPtr GetFakeLSNForUnloggedRel ( void   ) 

Definition at line 3813 of file xlog.c.

References SpinLockAcquire, SpinLockRelease, XLogCtlData::ulsn_lck, and XLogCtlData::unloggedLSN.

Referenced by gistGetFakeLSN().

{
    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;
    XLogRecPtr  fakeLSN;

    /*
     * Hand out the current unloggedLSN value and advance the counter.  The
     * read and the increment both happen while holding ulsn_lck.
     */
    SpinLockAcquire(&shared->ulsn_lck);
    fakeLSN = shared->unloggedLSN++;
    SpinLockRelease(&shared->ulsn_lck);

    return fakeLSN;
}

XLogRecPtr GetFlushRecPtr ( void   ) 

Definition at line 6512 of file xlog.c.

References XLogwrtResult::Flush, XLogCtlData::info_lck, XLogCtlData::LogwrtResult, SpinLockAcquire, and SpinLockRelease.

Referenced by StartReplication(), and XLogSend().

{
    XLogRecPtr  flushed;

    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;

    /* Read LogwrtResult.Flush from shared memory under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    flushed = shared->LogwrtResult.Flush;
    SpinLockRelease(&shared->info_lck);

    return flushed;
}

XLogRecPtr GetInsertRecPtr ( void   ) 

Definition at line 6494 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::LogwrtRqst, SpinLockAcquire, SpinLockRelease, and XLogwrtRqst::Write.

Referenced by CheckpointerMain(), IdentifySystem(), and IsCheckpointOnSchedule().

{
    XLogRecPtr  requested;

    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;

    /* Read LogwrtRqst.Write from shared memory under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    requested = shared->LogwrtRqst.Write;
    SpinLockRelease(&shared->info_lck);

    return requested;
}

pg_time_t GetLastSegSwitchTime ( void   ) 

Definition at line 6529 of file xlog.c.

References XLogCtlWrite::lastSegSwitchTime, LW_SHARED, LWLockAcquire(), LWLockRelease(), WALWriteLock, and XLogCtlData::Write.

Referenced by CheckArchiveTimeout().

{
    pg_time_t   switchTime;

    /* WALWriteLock is required, but holding it in shared mode is enough */
    LWLockAcquire(WALWriteLock, LW_SHARED);
    switchTime = XLogCtl->Write.lastSegSwitchTime;
    LWLockRelease(WALWriteLock);

    return switchTime;
}

TimestampTz GetLatestXTime ( void   ) 

Definition at line 4718 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::recoveryLastXTime, SpinLockAcquire, and SpinLockRelease.

Referenced by CreateRestartPoint(), pg_last_xact_replay_timestamp(), and StartupXLOG().

{
    TimestampTz latest;

    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;

    /* Read recoveryLastXTime from shared memory under info_lck. */
    SpinLockAcquire(&shared->info_lck);
    latest = shared->recoveryLastXTime;
    SpinLockRelease(&shared->info_lck);

    return latest;
}

void GetNextXidAndEpoch ( TransactionId *  xid,
uint32 *  epoch 
)

Definition at line 6550 of file xlog.c.

References XLogCtlData::ckptXid, XLogCtlData::ckptXidEpoch, XLogCtlData::info_lck, ReadNewTransactionId(), SpinLockAcquire, and SpinLockRelease.

Referenced by load_xid_epoch(), ProcessStandbyHSFeedbackMessage(), and XLogWalRcvSendHSFeedback().

{
    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;
    uint32      epochAtCkpt;
    TransactionId xidAtCkpt;
    TransactionId currentXid;

    /*
     * The checkpoint's xid and epoch must be fetched before nextXid;
     * doing it the other way round would introduce a race condition.
     */
    SpinLockAcquire(&shared->info_lck);
    epochAtCkpt = shared->ckptXidEpoch;
    xidAtCkpt = shared->ckptXid;
    SpinLockRelease(&shared->info_lck);

    /* Only now read the current nextXid. */
    currentXid = ReadNewTransactionId();

    /*
     * nextXid is logically later than the checkpoint's xid, so if it is
     * numerically smaller it must have wrapped around into the next epoch.
     */
    *xid = currentXid;
    *epoch = (currentXid < xidAtCkpt) ? epochAtCkpt + 1 : epochAtCkpt;
}

void GetOldestRestartPoint ( XLogRecPtr oldrecptr,
TimeLineID oldtli 
)
XLogRecPtr GetRedoRecPtr ( void   ) 

Definition at line 6471 of file xlog.c.

References Assert, XLogCtlData::info_lck, XLogCtlData::Insert, Insert(), XLogCtlInsert::RedoRecPtr, RedoRecPtr, SpinLockAcquire, and SpinLockRelease.

Referenced by InitXLOGAccess(), nextval_internal(), XLogPageRead(), XLogSaveBufferForHint(), and XLogWrite().

{
    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;

    /*
     * Refresh the RedoRecPtr variable (not a local of this function) from
     * the shared Insert.RedoRecPtr while holding info_lck.  The assertion
     * checks that the existing copy is not ahead of the shared value.
     */
    SpinLockAcquire(&shared->info_lck);
    Assert(RedoRecPtr <= shared->Insert.RedoRecPtr);
    RedoRecPtr = shared->Insert.RedoRecPtr;
    SpinLockRelease(&shared->info_lck);

    return RedoRecPtr;
}

uint64 GetSystemIdentifier ( void   ) 

Definition at line 3787 of file xlog.c.

References Assert, NULL, and ControlFileData::system_identifier.

Referenced by IdentifySystem(), and libpqrcv_identify_system().

XLogRecPtr GetXLogInsertRecPtr ( void   ) 
void GetXLogReceiptTime ( TimestampTz rtime,
bool fromStream 
)

Definition at line 4771 of file xlog.c.

References Assert, InRecovery, XLogReceiptSource, and XLogReceiptTime.

Referenced by GetStandbyLimitTime().

{
    /*
     * The state read here is not exported to shared memory, so this may
     * only be called from the startup process.
     */
    Assert(InRecovery);

    /* Hand back the receipt time ... */
    *rtime = XLogReceiptTime;

    /* ... and whether the data was received from a stream. */
    if (XLogReceiptSource == XLOG_FROM_STREAM)
        *fromStream = true;
    else
        *fromStream = false;
}

XLogRecPtr GetXLogReplayRecPtr ( TimeLineID *  replayTLI  ) 

Definition at line 9103 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::lastReplayedEndRecPtr, XLogCtlData::lastReplayedTLI, SpinLockAcquire, and SpinLockRelease.

Referenced by CreateRestartPoint(), GetReplicationApplyDelay(), GetStandbyFlushRecPtr(), pg_last_xlog_replay_location(), WalReceiverMain(), and XLogWalRcvSendReply().

{
    XLogRecPtr  replayedUpTo;
    TimeLineID  replayedTLI;

    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;

    /* Fetch both values inside one info_lck critical section. */
    SpinLockAcquire(&shared->info_lck);
    replayedUpTo = shared->lastReplayedEndRecPtr;
    replayedTLI = shared->lastReplayedTLI;
    SpinLockRelease(&shared->info_lck);

    /* The timeline is an optional output; callers may pass NULL. */
    if (replayTLI != NULL)
        *replayTLI = replayedTLI;

    return replayedUpTo;
}

XLogRecPtr GetXLogWriteRecPtr ( void   ) 

Definition at line 9140 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::LogwrtResult, SpinLockAcquire, SpinLockRelease, and XLogwrtResult::Write.

Referenced by pg_current_xlog_location().

{
    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;

    /*
     * Refresh the LogwrtResult variable (not a local of this function) from
     * shared memory under info_lck, then report its Write position.
     */
    SpinLockAcquire(&shared->info_lck);
    LogwrtResult = shared->LogwrtResult;
    SpinLockRelease(&shared->info_lck);

    return LogwrtResult.Write;
}

bool HotStandbyActive ( void   ) 

Definition at line 6252 of file xlog.c.

References XLogCtlData::info_lck, LocalHotStandbyActive, XLogCtlData::SharedHotStandbyActive, SpinLockAcquire, and SpinLockRelease.

Referenced by XLogWalRcvSendHSFeedback().

{
    /* volatile-qualified pointer, to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;

    /*
     * Hot Standby cannot be de-activated, so once the shared flag has been
     * seen true there is no need to look at shared state again.
     */
    if (LocalHotStandbyActive)
        return true;

    /* spinlock is essential on machines with weak memory ordering! */
    SpinLockAcquire(&shared->info_lck);
    LocalHotStandbyActive = shared->SharedHotStandbyActive;
    SpinLockRelease(&shared->info_lck);

    return LocalHotStandbyActive;
}

void InitXLOGAccess ( void   ) 

Definition at line 6455 of file xlog.c.

References Assert, GetRedoRecPtr(), IsBootstrapProcessingMode, XLogCtlData::ThisTimeLineID, and ThisTimeLineID.

Referenced by AuxiliaryProcessMain(), LocalSetXLogInsertAllowed(), and RecoveryInProgress().

{
    /*
     * Set up this process's copies of WAL state: copy ThisTimeLineID from
     * shared memory and refresh RedoRecPtr through GetRedoRecPtr().
     */

    /* ThisTimeLineID doesn't change so we need no lock to copy it */
    ThisTimeLineID = XLogCtl->ThisTimeLineID;
    /* a zero timeline ID is accepted only in bootstrap processing mode */
    Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());

    /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
    (void) GetRedoRecPtr();     /* called for its side effect only */
}

static bool InstallXLogFileSegment ( XLogSegNo *  segno,
char *  tmppath,
bool  find_free,
int *  max_advance,
bool  use_lock 
) [static]

Definition at line 2519 of file xlog.c.

References ControlFileLock, ereport, errcode_for_file_access(), errmsg(), link(), LOG, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), ThisTimeLineID, unlink(), and XLogFilePath.

Referenced by RemoveOldXlogFiles(), XLogFileCopy(), and XLogFileInit().

{
    /*
     * Install the file at tmppath as the WAL segment file for *segno on
     * ThisTimeLineID.
     *
     * segno:       in/out; the target segment number.  With find_free it is
     *              advanced to the first segment whose file does not exist.
     * tmppath:     path of the file to install.
     * find_free:   if false, any existing file at the target is unlinked
     *              first; if true, search forward for an unused slot.
     * max_advance: in/out; how many more segments the find_free search may
     *              skip.  Only dereferenced when find_free is true.
     * use_lock:    if true, ControlFileLock is held exclusively for the
     *              duration of the operation.
     *
     * Returns true on success.  Returns false if no free slot was found
     * within max_advance, or if link()/rename() failed (logged at LOG).
     */
    char        path[MAXPGPATH];
    struct stat stat_buf;

    XLogFilePath(path, ThisTimeLineID, *segno);

    /*
     * We want to be sure that only one process does this at a time.
     */
    if (use_lock)
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

    if (!find_free)
    {
        /* Force installation: get rid of any pre-existing segment file */
        unlink(path);
    }
    else
    {
        /* Find a free slot to put it in */
        while (stat(path, &stat_buf) == 0)
        {
            if (*max_advance <= 0)
            {
                /* Failed to find a free slot within specified range */
                if (use_lock)
                    LWLockRelease(ControlFileLock);
                return false;
            }
            (*segno)++;
            (*max_advance)--;
            XLogFilePath(path, ThisTimeLineID, *segno);
        }
    }

    /*
     * Prefer link() to rename() here just to be really sure that we don't
     * overwrite an existing logfile.  However, there shouldn't be one, so
     * rename() is an acceptable substitute except for the truly paranoid.
     *
     * On failure, errno is saved before the lock is released and restored
     * afterwards, so that %m and errcode_for_file_access() describe the
     * failed link()/rename() and not whatever LWLockRelease() may have left
     * in errno.
     */
#if HAVE_WORKING_LINK
    if (link(tmppath, path) < 0)
    {
        int         save_errno = errno;

        if (use_lock)
            LWLockRelease(ControlFileLock);
        errno = save_errno;
        ereport(LOG,
                (errcode_for_file_access(),
                 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
                        tmppath, path)));
        return false;
    }
    unlink(tmppath);
#else
    if (rename(tmppath, path) < 0)
    {
        int         save_errno = errno;

        if (use_lock)
            LWLockRelease(ControlFileLock);
        errno = save_errno;
        ereport(LOG,
                (errcode_for_file_access(),
                 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
                        tmppath, path)));
        return false;
    }
#endif

    if (use_lock)
        LWLockRelease(ControlFileLock);

    return true;
}

void issue_xlog_fsync ( int  fd,
XLogSegNo  segno 
)

Definition at line 8322 of file xlog.c.

References elog, ereport, errcode_for_file_access(), errmsg(), PANIC, pg_fdatasync(), pg_fsync_no_writethrough(), pg_fsync_writethrough(), sync_method, SYNC_METHOD_FDATASYNC, SYNC_METHOD_FSYNC, SYNC_METHOD_FSYNC_WRITETHROUGH, SYNC_METHOD_OPEN, SYNC_METHOD_OPEN_DSYNC, ThisTimeLineID, and XLogFileNameP().

Referenced by XLogWalRcvFlush(), and XLogWrite().

{
    /*
     * Flush the log file open on fd to disk using the method selected by
     * sync_method.  segno is used only to build the file name shown in
     * error messages.  Any failure is reported at PANIC.
     */
    switch (sync_method)
    {
        case SYNC_METHOD_FSYNC:
            if (pg_fsync_no_writethrough(fd) != 0)
                ereport(PANIC,
                        (errcode_for_file_access(),
                         errmsg("could not fsync log file %s: %m",
                                XLogFileNameP(ThisTimeLineID, segno))));
            break;
            /* compiled in only when HAVE_FSYNC_WRITETHROUGH is defined */
#ifdef HAVE_FSYNC_WRITETHROUGH
        case SYNC_METHOD_FSYNC_WRITETHROUGH:
            if (pg_fsync_writethrough(fd) != 0)
                ereport(PANIC,
                        (errcode_for_file_access(),
                         errmsg("could not fsync write-through log file %s: %m",
                                XLogFileNameP(ThisTimeLineID, segno))));
            break;
#endif
            /* compiled in only when HAVE_FDATASYNC is defined */
#ifdef HAVE_FDATASYNC
        case SYNC_METHOD_FDATASYNC:
            if (pg_fdatasync(fd) != 0)
                ereport(PANIC,
                        (errcode_for_file_access(),
                         errmsg("could not fdatasync log file %s: %m",
                                XLogFileNameP(ThisTimeLineID, segno))));
            break;
#endif
        case SYNC_METHOD_OPEN:
        case SYNC_METHOD_OPEN_DSYNC:
            /* write synced it already */
            break;
        default:
            /*
             * Also reached for a method whose case was compiled out by the
             * #ifdefs above.
             */
            elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
            break;
    }
}

static void KeepLogSeg ( XLogRecPtr  recptr,
XLogSegNo *  logSegNo 
) [static]

Definition at line 7531 of file xlog.c.

References wal_keep_segments, and XLByteToSeg.

Referenced by CreateCheckPoint(), and CreateRestartPoint().

{
    XLogSegNo   keepFrom;

    /* Nothing to do when wal_keep_segments is zero. */
    if (wal_keep_segments == 0)
        return;

    XLByteToSeg(recptr, keepFrom);

    /*
     * Step back wal_keep_segments segments from recptr's segment, clamping
     * at 1 so the subtraction cannot underflow.
     */
    keepFrom = (keepFrom <= wal_keep_segments) ? 1 : keepFrom - wal_keep_segments;

    /* Only ever lower the caller's cutoff; never raise it. */
    if (keepFrom < *logSegNo)
        *logSegNo = keepFrom;
}

static void LocalSetXLogInsertAllowed ( void   )  [static]

Definition at line 6314 of file xlog.c.

References Assert, InitXLOGAccess(), and LocalXLogInsertAllowed.

Referenced by CreateCheckPoint(), CreateEndOfRecoveryRecord(), and StartupXLOG().

{
    /*
     * Set LocalXLogInsertAllowed to 1 for this process.  The assertion
     * requires that it still held the value -1 beforehand.
     */
    Assert(LocalXLogInsertAllowed == -1);
    LocalXLogInsertAllowed = 1;

    /* Initialize as RecoveryInProgress() would do when switching state */
    InitXLOGAccess();
}

static void LogCheckpointEnd ( bool  restartpoint  )  [static]

Definition at line 6644 of file xlog.c.

References BgWriterStats, CheckpointStatsData::ckpt_agg_sync_time, CheckpointStatsData::ckpt_bufs_written, CheckpointStatsData::ckpt_end_t, CheckpointStatsData::ckpt_longest_sync, CheckpointStatsData::ckpt_segs_added, CheckpointStatsData::ckpt_segs_recycled, CheckpointStatsData::ckpt_segs_removed, CheckpointStatsData::ckpt_start_t, CheckpointStatsData::ckpt_sync_end_t, CheckpointStatsData::ckpt_sync_rels, CheckpointStatsData::ckpt_sync_t, CheckpointStatsData::ckpt_write_t, elog, GetCurrentTimestamp(), LOG, log_checkpoints, PgStat_MsgBgWriter::m_checkpoint_sync_time, PgStat_MsgBgWriter::m_checkpoint_write_time, NBuffers, and TimestampDifference().

Referenced by CreateCheckPoint(), and CreateRestartPoint().

{
    /*
     * Record the end time of a checkpoint (or restartpoint), add its write
     * and sync durations to BgWriterStats, and, if log_checkpoints is set,
     * emit a LOG message summarizing it.  restartpoint only selects the
     * wording of that message.
     */
    long        write_secs,
                sync_secs,
                total_secs,
                longest_secs,
                average_secs;
    int         write_usecs,
                sync_usecs,
                total_usecs,
                longest_usecs,
                average_usecs;
    uint64      average_sync_time;

    CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

    /* write phase: from ckpt_write_t to ckpt_sync_t */
    TimestampDifference(CheckpointStats.ckpt_write_t,
                        CheckpointStats.ckpt_sync_t,
                        &write_secs, &write_usecs);

    /* sync phase: from ckpt_sync_t to ckpt_sync_end_t */
    TimestampDifference(CheckpointStats.ckpt_sync_t,
                        CheckpointStats.ckpt_sync_end_t,
                        &sync_secs, &sync_usecs);

    /* Accumulate checkpoint timing summary data, in milliseconds. */
    BgWriterStats.m_checkpoint_write_time +=
        write_secs * 1000 + write_usecs / 1000;
    BgWriterStats.m_checkpoint_sync_time +=
        sync_secs * 1000 + sync_usecs / 1000;

    /*
     * All of the published timing statistics are accounted for.  Only
     * continue if a log message is to be written.
     */
    if (!log_checkpoints)
        return;

    /* total duration: from ckpt_start_t to the end time recorded above */
    TimestampDifference(CheckpointStats.ckpt_start_t,
                        CheckpointStats.ckpt_end_t,
                        &total_secs, &total_usecs);

    /*
     * Timing values returned from CheckpointStats are in microseconds.
     * Convert to the second plus microsecond form that TimestampDifference
     * returns for homogeneous printing.
     */
    longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
    longest_usecs = CheckpointStats.ckpt_longest_sync -
        (uint64) longest_secs *1000000;

    /* average sync time per synced file; left at 0 when none were synced */
    average_sync_time = 0;
    if (CheckpointStats.ckpt_sync_rels > 0)
        average_sync_time = CheckpointStats.ckpt_agg_sync_time /
            CheckpointStats.ckpt_sync_rels;
    average_secs = (long) (average_sync_time / 1000000);
    average_usecs = average_sync_time - (uint64) average_secs *1000000;

    /*
     * The two messages differ only in their leading word.  Each *_usecs
     * value is divided by 1000 so that it prints as three-digit
     * milliseconds after the decimal point.
     */
    if (restartpoint)
        elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
             "%d transaction log file(s) added, %d removed, %d recycled; "
             "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
             "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
             CheckpointStats.ckpt_bufs_written,
             (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
             CheckpointStats.ckpt_segs_added,
             CheckpointStats.ckpt_segs_removed,
             CheckpointStats.ckpt_segs_recycled,
             write_secs, write_usecs / 1000,
             sync_secs, sync_usecs / 1000,
             total_secs, total_usecs / 1000,
             CheckpointStats.ckpt_sync_rels,
             longest_secs, longest_usecs / 1000,
             average_secs, average_usecs / 1000);
    else
        elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
             "%d transaction log file(s) added, %d removed, %d recycled; "
             "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
             "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
             CheckpointStats.ckpt_bufs_written,
             (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
             CheckpointStats.ckpt_segs_added,
             CheckpointStats.ckpt_segs_removed,
             CheckpointStats.ckpt_segs_recycled,
             write_secs, write_usecs / 1000,
             sync_secs, sync_usecs / 1000,
             total_secs, total_usecs / 1000,
             CheckpointStats.ckpt_sync_rels,
             longest_secs, longest_usecs / 1000,
             average_secs, average_usecs / 1000);
}

static void LogCheckpointStart ( int  flags,
bool  restartpoint 
) [static]

Definition at line 6617 of file xlog.c.

References CHECKPOINT_CAUSE_TIME, CHECKPOINT_CAUSE_XLOG, CHECKPOINT_END_OF_RECOVERY, CHECKPOINT_FORCE, CHECKPOINT_IMMEDIATE, CHECKPOINT_IS_SHUTDOWN, CHECKPOINT_WAIT, elog, and LOG.

Referenced by CreateCheckPoint(), and CreateRestartPoint().

{
    const char *fmt;

    /*
     * XXX: There is no sensible way to translate this.  gettext_noop could
     * cover the main message, but that leaves all the flag words.
     */
    fmt = restartpoint
        ? "restartpoint starting:%s%s%s%s%s%s%s"
        : "checkpoint starting:%s%s%s%s%s%s%s";

    /* Each flag that is set contributes one word; unset flags add nothing. */
    elog(LOG, fmt,
         (flags & CHECKPOINT_IS_SHUTDOWN) != 0 ? " shutdown" : "",
         (flags & CHECKPOINT_END_OF_RECOVERY) != 0 ? " end-of-recovery" : "",
         (flags & CHECKPOINT_IMMEDIATE) != 0 ? " immediate" : "",
         (flags & CHECKPOINT_FORCE) != 0 ? " force" : "",
         (flags & CHECKPOINT_WAIT) != 0 ? " wait" : "",
         (flags & CHECKPOINT_CAUSE_XLOG) != 0 ? " xlog" : "",
         (flags & CHECKPOINT_CAUSE_TIME) != 0 ? " time" : "");
}

static void pg_start_backup_callback ( int  code,
Datum  arg 
) [static]
static void PreallocXlogFiles ( XLogRecPtr  endptr  )  [static]

Definition at line 2814 of file xlog.c.

References CheckpointStatsData::ckpt_segs_added, close, XLByteToPrevSeg, XLogFileInit(), and XLogSegSize.

Referenced by CreateCheckPoint(), CreateRestartPoint(), and StartupXLOG().

{
    XLogSegNo   segno;
    int         fd;
    bool        reused;

    XLByteToPrevSeg(endptr, segno);

    /* Do nothing unless endptr is at least 75% of the way into its segment. */
    if ((endptr - 1) % XLogSegSize < (uint32) (0.75 * XLogSegSize))
        return;

    /* Make sure the following segment file exists, reusing one if present. */
    reused = true;
    fd = XLogFileInit(segno + 1, &reused, true);
    close(fd);

    /* XLogFileInit clears the flag when it had to create a new file. */
    if (!reused)
        CheckpointStats.ckpt_segs_added++;
}

static bool read_backup_label ( XLogRecPtr * checkPointLoc,
bool * backupEndRequired,
bool * backupFromStandby 
) [static]

Definition at line 9184 of file xlog.c.

References AllocateFile(), BACKUP_LABEL_FILE, ereport, errcode(), errcode_for_file_access(), errmsg(), FATAL, FreeFile(), and RedoStartLSN.

Referenced by StartupXLOG().

{
    char        segFileName[MAXFNAMELEN];
    char        methodWord[20];
    char        originWord[20];
    char        lineEnd;
    TimeLineID  labelTli;
    uint32      lsnHigh,
                lsnLow;
    FILE       *labelFile;

    /*
     * Parse the backup label file, if there is one.
     *
     * Returns false when the file does not exist, true once it has been
     * read.  On success RedoStartLSN and *checkPointLoc are set from the
     * file; the two boolean outputs start out false and are raised only
     * when the matching optional line says so.  Any malformed mandatory
     * line, or an I/O failure, is reported at FATAL.
     */
    *backupEndRequired = false;
    *backupFromStandby = false;

    labelFile = AllocateFile(BACKUP_LABEL_FILE, "r");
    if (labelFile == NULL)
    {
        /* A missing file is the normal case; anything else is fatal. */
        if (errno == ENOENT)
            return false;

        ereport(FATAL,
                (errcode_for_file_access(),
                 errmsg("could not read file \"%s\": %m",
                        BACKUP_LABEL_FILE)));
        return false;
    }

    /*
     * The START WAL LOCATION and CHECKPOINT LOCATION lines are mandatory.
     * The parsing is crude, but no variation in the file format is
     * expected.  Each line must end in a newline right after the last
     * field.
     */
    if (fscanf(labelFile, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
               &lsnHigh, &lsnLow, &labelTli, segFileName, &lineEnd) != 5 ||
        lineEnd != '\n')
    {
        ereport(FATAL,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    }
    RedoStartLSN = ((uint64) lsnHigh) << 32 | lsnLow;

    if (fscanf(labelFile, "CHECKPOINT LOCATION: %X/%X%c",
               &lsnHigh, &lsnLow, &lineEnd) != 3 ||
        lineEnd != '\n')
    {
        ereport(FATAL,
                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
                 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
    }
    *checkPointLoc = ((uint64) lsnHigh) << 32 | lsnLow;

    /*
     * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
     * from an older backup anyway, but since the information on it is not
     * strictly required, don't error out if it's missing for some reason.
     */
    if (fscanf(labelFile, "BACKUP METHOD: %19s\n", methodWord) == 1 &&
        strcmp(methodWord, "streamed") == 0)
        *backupEndRequired = true;

    if (fscanf(labelFile, "BACKUP FROM: %19s\n", originWord) == 1 &&
        strcmp(originWord, "standby") == 0)
        *backupFromStandby = true;

    /* Report a stream error, or a failure while closing the file. */
    if (ferror(labelFile) || FreeFile(labelFile))
    {
        ereport(FATAL,
                (errcode_for_file_access(),
                 errmsg("could not read file \"%s\": %m",
                        BACKUP_LABEL_FILE)));
    }

    return true;
}

static XLogRecord * ReadCheckpointRecord ( XLogReaderState * xlogreader,
XLogRecPtr  RecPtr,
int  whichChkpt,
bool  report 
) [static]

Definition at line 6330 of file xlog.c.

References ereport, errmsg(), LOG, NULL, ReadRecord(), SizeOfXLogRecord, XLogRecord::xl_info, XLogRecord::xl_len, XLogRecord::xl_rmid, XLogRecord::xl_tot_len, XLOG_CHECKPOINT_ONLINE, XLOG_CHECKPOINT_SHUTDOWN, and XRecOffIsValid.

Referenced by StartupXLOG().

{
    XLogRecord *ckpt;

    /*
     * Fetch the checkpoint record at RecPtr and sanity-check it.  Returns
     * the record, or NULL if any check fails.
     *
     * whichChkpt only selects the wording of the log message: 1 = primary,
     * 2 = secondary, anything else = the generic/backup_label wording.
     *
     * Note that "report" silences only the first two failures (bad pointer,
     * unreadable record); the later content checks always log.
     */

    /* The pointer itself must be at a valid record offset. */
    if (!XRecOffIsValid(RecPtr))
    {
        if (report)
        {
            if (whichChkpt == 1)
            {
                ereport(LOG,
                        (errmsg("invalid primary checkpoint link in control file")));
            }
            else if (whichChkpt == 2)
            {
                ereport(LOG,
                        (errmsg("invalid secondary checkpoint link in control file")));
            }
            else
            {
                ereport(LOG,
                        (errmsg("invalid checkpoint link in backup_label file")));
            }
        }
        return NULL;
    }

    ckpt = ReadRecord(xlogreader, RecPtr, LOG, true);

    /* The record must be readable at all. */
    if (ckpt == NULL)
    {
        if (report)
        {
            if (whichChkpt == 1)
            {
                ereport(LOG,
                        (errmsg("invalid primary checkpoint record")));
            }
            else if (whichChkpt == 2)
            {
                ereport(LOG,
                        (errmsg("invalid secondary checkpoint record")));
            }
            else
            {
                ereport(LOG,
                        (errmsg("invalid checkpoint record")));
            }
        }
        return NULL;
    }

    /* It must belong to the XLOG resource manager. */
    if (ckpt->xl_rmid != RM_XLOG_ID)
    {
        if (whichChkpt == 1)
        {
            ereport(LOG,
                    (errmsg("invalid resource manager ID in primary checkpoint record")));
        }
        else if (whichChkpt == 2)
        {
            ereport(LOG,
                    (errmsg("invalid resource manager ID in secondary checkpoint record")));
        }
        else
        {
            ereport(LOG,
                    (errmsg("invalid resource manager ID in checkpoint record")));
        }
        return NULL;
    }

    /* Its info byte must be exactly one of the two checkpoint kinds. */
    if (ckpt->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
        ckpt->xl_info != XLOG_CHECKPOINT_ONLINE)
    {
        if (whichChkpt == 1)
        {
            ereport(LOG,
                    (errmsg("invalid xl_info in primary checkpoint record")));
        }
        else if (whichChkpt == 2)
        {
            ereport(LOG,
                    (errmsg("invalid xl_info in secondary checkpoint record")));
        }
        else
        {
            ereport(LOG,
                    (errmsg("invalid xl_info in checkpoint record")));
        }
        return NULL;
    }

    /* Payload and total length must match a bare CheckPoint struct. */
    if (ckpt->xl_len != sizeof(CheckPoint) ||
        ckpt->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
    {
        if (whichChkpt == 1)
        {
            ereport(LOG,
                    (errmsg("invalid length of primary checkpoint record")));
        }
        else if (whichChkpt == 2)
        {
            ereport(LOG,
                    (errmsg("invalid length of secondary checkpoint record")));
        }
        else
        {
            ereport(LOG,
                    (errmsg("invalid length of checkpoint record")));
        }
        return NULL;
    }

    return ckpt;
}

static void ReadControlFile ( void   )  [static]

Definition at line 3559 of file xlog.c.

References BasicOpenFile(), CATALOG_VERSION_NO, close, COMP_CRC32, EQ_CRC32, ereport, errcode_for_file_access(), errdetail(), errhint(), errmsg(), FATAL, FIN_CRC32, FLOATFORMAT_VALUE, INDEX_MAX_KEYS, INIT_CRC32, NAMEDATALEN, offsetof, PANIC, PG_BINARY, PG_CONTROL_VERSION, ControlFileData::pg_control_version, read, TOAST_MAX_CHUNK_SIZE, and XLOG_CONTROL_FILE.

Referenced by StartupXLOG(), and XLOGShmemInit().

{
    pg_crc32    crc;
    int         fd;
    int         nread;

    /*
     * Load pg_control into *ControlFile and verify that it was written by a
     * compatible server build.  Failure to open or read the file is a PANIC;
     * every compatibility mismatch is FATAL.
     */

    /*
     * Read data...
     */
    fd = BasicOpenFile(XLOG_CONTROL_FILE,
                       O_RDWR | PG_BINARY,
                       S_IRUSR | S_IWUSR);
    if (fd < 0)
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not open control file \"%s\": %m",
                        XLOG_CONTROL_FILE)));

    /*
     * Distinguish a real I/O error from a short read.  read() sets errno
     * only when it returns a negative value; for a truncated file it
     * returns a smaller count and leaves errno alone, so %m and
     * errcode_for_file_access() would report an unrelated, stale error.
     */
    nread = (int) read(fd, ControlFile, sizeof(ControlFileData));
    if (nread != (int) sizeof(ControlFileData))
    {
        if (nread < 0)
            ereport(PANIC,
                    (errcode_for_file_access(),
                     errmsg("could not read from control file: %m")));
        else
            ereport(PANIC,
                    (errcode(ERRCODE_DATA_CORRUPTED),
                     errmsg("could not read from control file: read %d of %d bytes",
                            nread, (int) sizeof(ControlFileData))));
    }

    close(fd);

    /*
     * Check for expected pg_control format version.  If this is wrong, the
     * CRC check will likely fail because we'll be checking the wrong number
     * of bytes.  Complaining about wrong version will probably be more
     * enlightening than complaining about wrong CRC.
     */

    /*
     * A mismatching version whose low 16 bits are all zero and whose high
     * 16 bits are not is reported with a byte-ordering hint.
     */
    if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
                           " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
                           ControlFile->pg_control_version, ControlFile->pg_control_version,
                           PG_CONTROL_VERSION, PG_CONTROL_VERSION),
                 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

    if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
                           " but the server was compiled with PG_CONTROL_VERSION %d.",
                           ControlFile->pg_control_version, PG_CONTROL_VERSION),
                 errhint("It looks like you need to initdb.")));

    /* Now check the CRC, computed over everything before the crc field. */
    INIT_CRC32(crc);
    COMP_CRC32(crc,
               (char *) ControlFile,
               offsetof(ControlFileData, crc));
    FIN_CRC32(crc);

    if (!EQ_CRC32(crc, ControlFile->crc))
        ereport(FATAL,
                (errmsg("incorrect checksum in control file")));

    /*
     * Do compatibility checking immediately.  If the database isn't
     * compatible with the backend executable, we want to abort before we can
     * possibly do any damage.
     */
    if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
                           " but the server was compiled with CATALOG_VERSION_NO %d.",
                           ControlFile->catalog_version_no, CATALOG_VERSION_NO),
                 errhint("It looks like you need to initdb.")));
    if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with MAXALIGN %d,"
                           " but the server was compiled with MAXALIGN %d.",
                           ControlFile->maxAlign, MAXIMUM_ALIGNOF),
                 errhint("It looks like you need to initdb.")));
    if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
                 errhint("It looks like you need to initdb.")));
    if (ControlFile->blcksz != BLCKSZ)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with BLCKSZ %d,"
                           " but the server was compiled with BLCKSZ %d.",
                           ControlFile->blcksz, BLCKSZ),
                 errhint("It looks like you need to recompile or initdb.")));
    if (ControlFile->relseg_size != RELSEG_SIZE)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
                           " but the server was compiled with RELSEG_SIZE %d.",
                           ControlFile->relseg_size, RELSEG_SIZE),
                 errhint("It looks like you need to recompile or initdb.")));
    if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
                           " but the server was compiled with XLOG_BLCKSZ %d.",
                           ControlFile->xlog_blcksz, XLOG_BLCKSZ),
                 errhint("It looks like you need to recompile or initdb.")));
    if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
                           " but the server was compiled with XLOG_SEG_SIZE %d.",
                           ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
                 errhint("It looks like you need to recompile or initdb.")));
    if (ControlFile->nameDataLen != NAMEDATALEN)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
                           " but the server was compiled with NAMEDATALEN %d.",
                           ControlFile->nameDataLen, NAMEDATALEN),
                 errhint("It looks like you need to recompile or initdb.")));
    if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
                           " but the server was compiled with INDEX_MAX_KEYS %d.",
                           ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
                 errhint("It looks like you need to recompile or initdb.")));
    if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
                           " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
                 errhint("It looks like you need to recompile or initdb.")));

    /*
     * The remaining checks compare boolean build options; which direction is
     * tested depends on how this server was compiled.
     */
#ifdef HAVE_INT64_TIMESTAMP
    if (ControlFile->enableIntTimes != true)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
                           " but the server was compiled with HAVE_INT64_TIMESTAMP."),
                 errhint("It looks like you need to recompile or initdb.")));
#else
    if (ControlFile->enableIntTimes != false)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
                           " but the server was compiled without HAVE_INT64_TIMESTAMP."),
                 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT4_BYVAL
    if (ControlFile->float4ByVal != true)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
                           " but the server was compiled with USE_FLOAT4_BYVAL."),
                 errhint("It looks like you need to recompile or initdb.")));
#else
    if (ControlFile->float4ByVal != false)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
                           " but the server was compiled without USE_FLOAT4_BYVAL."),
                 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT8_BYVAL
    if (ControlFile->float8ByVal != true)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
                           " but the server was compiled with USE_FLOAT8_BYVAL."),
                 errhint("It looks like you need to recompile or initdb.")));
#else
    if (ControlFile->float8ByVal != false)
        ereport(FATAL,
                (errmsg("database files are incompatible with server"),
                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
                           " but the server was compiled without USE_FLOAT8_BYVAL."),
                 errhint("It looks like you need to recompile or initdb.")));
#endif
}

static XLogRecord * ReadRecord ( XLogReaderState * xlogreader,
XLogRecPtr  RecPtr,
int  emode,
bool  fetching_ckpt 
) [static]

Definition at line 3245 of file xlog.c.

References ArchiveRecoveryRequested, CheckForStandbyTrigger(), CheckRecoveryConsistency(), close, ControlFileLock, currentSource, DEBUG1, emode_for_corrupt_record(), XLogReaderState::EndRecPtr, EndRecPtr, ereport, errmsg(), errmsg_internal(), InArchiveRecovery, InvalidXLogRecPtr, lastSourceFailed, XLogReaderState::latestPagePtr, XLogReaderState::latestPageTLI, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), minRecoveryPoint, ControlFileData::minRecoveryPoint, minRecoveryPointTLI, ControlFileData::minRecoveryPointTLI, NULL, XLogReaderState::private_data, readFile, XLogReaderState::readPageTLI, XLogReaderState::ReadRecPtr, ReadRecPtr, StandbyMode, StandbyModeRequested, ControlFileData::state, ThisTimeLineID, tliInHistory(), UpdateControlFile(), XLByteToSeg, XLogFileName, and XLogReadRecord().

Referenced by ReadCheckpointRecord(), and StartupXLOG().

{
    XLogPageReadPrivate *readArgs = (XLogPageReadPrivate *) xlogreader->private_data;
    XLogRecord *result;

    /*
     * Read one WAL record through xlogreader, retrying as needed.  Returns
     * the record, or NULL once no source can supply a valid one.
     */

    /* Hand our arguments on to XLogPageRead through the private struct */
    readArgs->fetching_ckpt = fetching_ckpt;
    readArgs->emode = emode;
    readArgs->randAccess = (RecPtr != InvalidXLogRecPtr);

    /* This is the first attempt to read this page. */
    lastSourceFailed = false;

    for (;;)
    {
        char   *readError;

        result = XLogReadRecord(xlogreader, RecPtr, &readError);

        /* Mirror the reader's positions into the file-level variables. */
        ReadRecPtr = xlogreader->ReadRecPtr;
        EndRecPtr = xlogreader->EndRecPtr;

        if (result == NULL)
        {
            /* Drop whatever segment file is currently open. */
            if (readFile >= 0)
            {
                close(readFile);
                readFile = -1;
            }

            /*
             * We only end up here without a message when XLogPageRead() failed
             * - in that case we already logged something.
             * In StandbyMode that only happens if we have been triggered, so
             * we shouldn't loop anymore in that case.
             */
            if (readError)
            {
                ereport(emode_for_corrupt_record(emode,
                                                 RecPtr ? RecPtr : EndRecPtr),
                        (errmsg_internal("%s", readError) /* already translated */));
            }
        }
        else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
        {
            /*
             * The page's timeline is not one of the expected ones: report
             * where it was found and treat the record as invalid.
             */
            char        segName[MAXFNAMELEN];
            XLogSegNo   pageSegNo;
            int32       pageOffset;

            XLByteToSeg(xlogreader->latestPagePtr, pageSegNo);
            pageOffset = xlogreader->latestPagePtr % XLogSegSize;
            XLogFileName(segName, xlogreader->readPageTLI, pageSegNo);
            ereport(emode_for_corrupt_record(emode,
                                             RecPtr ? RecPtr : EndRecPtr),
                    (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
                            xlogreader->latestPageTLI,
                            segName,
                            pageOffset)));
            result = NULL;
        }

        /* Great, got a record */
        if (result != NULL)
            return result;

        /* No valid record available from this source */
        lastSourceFailed = true;

        /*
         * If archive recovery was requested, but we were still doing crash
         * recovery, switch to archive recovery and retry using the offline
         * archive. We have now replayed all the valid WAL in pg_xlog, so
         * we are presumably now consistent.
         *
         * We require that there's at least some valid WAL present in
         * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
         * from the archive, even if pg_xlog is completely empty, but we'd
         * have no idea how far we'd have to replay to reach consistency.
         * So err on the safe side and give up.
         */
        if (!InArchiveRecovery && ArchiveRecoveryRequested &&
            !fetching_ckpt)
        {
            ereport(DEBUG1,
                    (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
            InArchiveRecovery = true;
            if (StandbyModeRequested)
                StandbyMode = true;

            /* initialize minRecoveryPoint to this record */
            LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
            ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
            if (ControlFile->minRecoveryPoint < EndRecPtr)
            {
                ControlFile->minRecoveryPoint = EndRecPtr;
                ControlFile->minRecoveryPointTLI = ThisTimeLineID;
            }

            /* update local copy */
            minRecoveryPoint = ControlFile->minRecoveryPoint;
            minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

            UpdateControlFile();
            LWLockRelease(ControlFileLock);

            CheckRecoveryConsistency();

            /*
             * Before we retry, reset lastSourceFailed and currentSource
             * so that we will check the archive next.
             */
            lastSourceFailed = false;
            currentSource = 0;

            continue;
        }

        /*
         * Outside standby mode, or once the standby trigger has fired, give
         * up.  Otherwise fall off the end of the loop body and retry.
         */
        if (!StandbyMode || CheckForStandbyTrigger())
            return NULL;
    }
}

static void readRecoveryCommandFile ( void   )  [static]

Definition at line 4162 of file xlog.c.

References AllocateFile(), archiveCleanupCommand, ArchiveRecoveryRequested, CStringGetDatum, DatumGetTimestampTz, DEBUG2, DirectFunctionCall3, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), errmsg_internal(), ERROR, existsTimeLineHistory(), FATAL, findNewestTimeLine(), FreeConfigVariables(), FreeFile(), Int32GetDatum, InvalidOid, MAXFNAMELEN, ConfigVariable::name, ConfigVariable::next, NULL, ObjectIdGetDatum, parse_bool(), ParseConfigFp(), PrimaryConnInfo, pstrdup(), RECOVERY_COMMAND_FILE, RECOVERY_TARGET_NAME, RECOVERY_TARGET_XID, recoveryEndCommand, recoveryPauseAtTarget, recoveryRestoreCommand, recoveryTarget, recoveryTargetInclusive, recoveryTargetIsLatest, recoveryTargetName, recoveryTargetTime, recoveryTargetTLI, recoveryTargetXid, StandbyModeRequested, timestamptz_in(), timestamptz_to_str(), TriggerFile, ConfigVariable::value, and WARNING.

Referenced by StartupXLOG().

{
    FILE       *fd;
    TimeLineID  rtli = 0;
    bool        rtliGiven = false;
    ConfigVariable *item,
               *head = NULL,
               *tail = NULL;

    /*
     * Read the recovery command file, if present, and load its settings into
     * the recovery-related variables.  If the file does not exist, return
     * without requesting archive recovery.  Unknown parameters and invalid
     * numeric values are FATAL; invalid Boolean values raise ERROR.
     */
    fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
    if (fd == NULL)
    {
        if (errno == ENOENT)
            return;             /* not there, so no archive recovery */
        ereport(FATAL,
                (errcode_for_file_access(),
                 errmsg("could not open recovery command file \"%s\": %m",
                        RECOVERY_COMMAND_FILE)));
    }

    /*
     * Since we're asking ParseConfigFp() to report errors as FATAL, there's
     * no need to check the return value.
     */
    (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);

    FreeFile(fd);

    for (item = head; item; item = item->next)
    {
        if (strcmp(item->name, "restore_command") == 0)
        {
            recoveryRestoreCommand = pstrdup(item->value);
            ereport(DEBUG2,
                    (errmsg_internal("restore_command = '%s'",
                                     recoveryRestoreCommand)));
        }
        else if (strcmp(item->name, "recovery_end_command") == 0)
        {
            recoveryEndCommand = pstrdup(item->value);
            ereport(DEBUG2,
                    (errmsg_internal("recovery_end_command = '%s'",
                                     recoveryEndCommand)));
        }
        else if (strcmp(item->name, "archive_cleanup_command") == 0)
        {
            archiveCleanupCommand = pstrdup(item->value);
            ereport(DEBUG2,
                    (errmsg_internal("archive_cleanup_command = '%s'",
                                     archiveCleanupCommand)));
        }
        else if (strcmp(item->name, "pause_at_recovery_target") == 0)
        {
            if (!parse_bool(item->value, &recoveryPauseAtTarget))
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
            ereport(DEBUG2,
                    (errmsg_internal("pause_at_recovery_target = '%s'",
                                     item->value)));
        }
        else if (strcmp(item->name, "recovery_target_timeline") == 0)
        {
            rtliGiven = true;
            if (strcmp(item->value, "latest") == 0)
                rtli = 0;
            else
            {
                char       *endp;
                unsigned long parsed;

                /*
                 * Require the whole value to be a number that fits in a
                 * TimeLineID.  strtoul() does not reliably set errno for
                 * input with no digits, so without the end-pointer check a
                 * typo would parse as 0 and silently select "latest"; a
                 * value too wide for TimeLineID would be truncated by the
                 * cast.
                 */
                errno = 0;
                parsed = strtoul(item->value, &endp, 0);
                if (errno == EINVAL || errno == ERANGE ||
                    endp == item->value || *endp != '\0' ||
                    parsed != (unsigned long) (TimeLineID) parsed)
                    ereport(FATAL,
                            (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
                                    item->value)));
                rtli = (TimeLineID) parsed;
            }
            if (rtli)
                ereport(DEBUG2,
                        (errmsg_internal("recovery_target_timeline = %u", rtli)));
            else
                ereport(DEBUG2,
                        (errmsg_internal("recovery_target_timeline = latest")));
        }
        else if (strcmp(item->name, "recovery_target_xid") == 0)
        {
            char       *endp;
            unsigned long parsed;

            /* Same strictness as for recovery_target_timeline above. */
            errno = 0;
            parsed = strtoul(item->value, &endp, 0);
            if (errno == EINVAL || errno == ERANGE ||
                endp == item->value || *endp != '\0' ||
                parsed != (unsigned long) (TransactionId) parsed)
                ereport(FATAL,
                        (errmsg("recovery_target_xid is not a valid number: \"%s\"",
                                item->value)));
            recoveryTargetXid = (TransactionId) parsed;
            ereport(DEBUG2,
                    (errmsg_internal("recovery_target_xid = %u",
                                     recoveryTargetXid)));
            recoveryTarget = RECOVERY_TARGET_XID;
        }
        else if (strcmp(item->name, "recovery_target_time") == 0)
        {
            /*
             * if recovery_target_xid or recovery_target_name specified, then
             * this overrides recovery_target_time
             */
            if (recoveryTarget == RECOVERY_TARGET_XID ||
                recoveryTarget == RECOVERY_TARGET_NAME)
                continue;
            recoveryTarget = RECOVERY_TARGET_TIME;

            /*
             * Convert the time string given by the user to TimestampTz form.
             */
            recoveryTargetTime =
                DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
                                                        CStringGetDatum(item->value),
                                                        ObjectIdGetDatum(InvalidOid),
                                                        Int32GetDatum(-1)));
            ereport(DEBUG2,
                    (errmsg_internal("recovery_target_time = '%s'",
                                     timestamptz_to_str(recoveryTargetTime))));
        }
        else if (strcmp(item->name, "recovery_target_name") == 0)
        {
            /*
             * if recovery_target_xid specified, then this overrides
             * recovery_target_name
             */
            if (recoveryTarget == RECOVERY_TARGET_XID)
                continue;
            recoveryTarget = RECOVERY_TARGET_NAME;

            recoveryTargetName = pstrdup(item->value);
            if (strlen(recoveryTargetName) >= MAXFNAMELEN)
                ereport(FATAL,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("recovery_target_name is too long (maximum %d characters)",
                                MAXFNAMELEN - 1)));

            ereport(DEBUG2,
                    (errmsg_internal("recovery_target_name = '%s'",
                                     recoveryTargetName)));
        }
        else if (strcmp(item->name, "recovery_target_inclusive") == 0)
        {
            /*
             * does nothing if a recovery_target is not also set
             */
            if (!parse_bool(item->value, &recoveryTargetInclusive))
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("parameter \"%s\" requires a Boolean value",
                                "recovery_target_inclusive")));
            ereport(DEBUG2,
                    (errmsg_internal("recovery_target_inclusive = %s",
                                     item->value)));
        }
        else if (strcmp(item->name, "standby_mode") == 0)
        {
            if (!parse_bool(item->value, &StandbyModeRequested))
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("parameter \"%s\" requires a Boolean value",
                                "standby_mode")));
            ereport(DEBUG2,
                    (errmsg_internal("standby_mode = '%s'", item->value)));
        }
        else if (strcmp(item->name, "primary_conninfo") == 0)
        {
            PrimaryConnInfo = pstrdup(item->value);
            ereport(DEBUG2,
                    (errmsg_internal("primary_conninfo = '%s'",
                                     PrimaryConnInfo)));
        }
        else if (strcmp(item->name, "trigger_file") == 0)
        {
            TriggerFile = pstrdup(item->value);
            ereport(DEBUG2,
                    (errmsg_internal("trigger_file = '%s'",
                                     TriggerFile)));
        }
        else
            ereport(FATAL,
                    (errmsg("unrecognized recovery parameter \"%s\"",
                            item->name)));
    }

    /*
     * Check for compulsory parameters
     */
    if (StandbyModeRequested)
    {
        if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
            ereport(WARNING,
                    (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
                            RECOVERY_COMMAND_FILE),
                     errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
    }
    else
    {
        if (recoveryRestoreCommand == NULL)
            ereport(FATAL,
                    (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
                            RECOVERY_COMMAND_FILE)));
    }

    /* Enable fetching from archive recovery area */
    ArchiveRecoveryRequested = true;

    /*
     * If user specified recovery_target_timeline, validate it or compute the
     * "latest" value.  We can't do this until after we've gotten the restore
     * command and set ArchiveRecoveryRequested, because we need to fetch
     * timeline history files from the archive.
     */
    if (rtliGiven)
    {
        if (rtli)
        {
            /* Timeline 1 does not have a history file, all else should */
            if (rtli != 1 && !existsTimeLineHistory(rtli))
                ereport(FATAL,
                        (errmsg("recovery target timeline %u does not exist",
                                rtli)));
            recoveryTargetTLI = rtli;
            recoveryTargetIsLatest = false;
        }
        else
        {
            /* We start the "latest" search from pg_control's timeline */
            recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
            recoveryTargetIsLatest = true;
        }
    }

    FreeConfigVariables(head);
}

bool RecoveryInProgress ( void   ) 

Definition at line 6211 of file xlog.c.

References XLogCtlData::info_lck, InitXLOGAccess(), LocalRecoveryInProgress, XLogCtlData::SharedRecoveryInProgress, SpinLockAcquire, and SpinLockRelease.

Referenced by check_transaction_read_only(), check_XactIsoLevel(), CheckArchiveTimeout(), CheckpointerMain(), CreateCheckPoint(), CreateEndOfRecoveryRecord(), CreateRestartPoint(), do_pg_start_backup(), do_pg_stop_backup(), get_relation_info(), GetNewMultiXactId(), GetNewObjectId(), GetNewTransactionId(), GetOldestActiveTransactionId(), GetOldestXmin(), GetRunningTransactionData(), GetSerializableTransactionSnapshot(), GetSerializableTransactionSnapshotInt(), GetSnapshotData(), heap_page_prune_opt(), IdentifySystem(), InitPostgres(), InitTempTableNamespace(), InitWalSender(), IsCheckpointOnSchedule(), LockAcquireExtended(), MarkBufferDirtyHint(), OldSerXidSetActiveSerXmin(), perform_base_backup(), pg_create_restore_point(), pg_current_xlog_insert_location(), pg_current_xlog_location(), pg_is_in_recovery(), pg_is_xlog_replay_paused(), pg_switch_xlog(), pg_xlog_replay_pause(), pg_xlog_replay_resume(), pg_xlogfile_name(), pg_xlogfile_name_offset(), PreventCommandDuringRecovery(), ProcSendSignal(), ProcSleep(), sendDir(), ShutdownXLOG(), standard_ProcessUtility(), StartTransaction(), TransactionIdIsInProgress(), UpdateFullPageWrites(), WalReceiverMain(), XLogBackgroundFlush(), XLogInsertAllowed(), XLogNeedsFlush(), and XLogSend().

{
    /*
     * Report whether the system is still in recovery.
     *
     * Shared state is consulted only while this process still believes
     * recovery is running.  Recovery can never be re-entered, so once the
     * local flag has been seen false the answer is final and the shared
     * variable need not be read again.
     */
    if (LocalRecoveryInProgress)
    {
        /* volatile pointer keeps the compiler from rearranging the accesses */
        volatile XLogCtlData *shared = XLogCtl;

        /* the spinlock is required on machines with weak memory ordering */
        SpinLockAcquire(&shared->info_lck);
        LocalRecoveryInProgress = shared->SharedRecoveryInProgress;
        SpinLockRelease(&shared->info_lck);

        /*
         * The moment we learn that recovery has finished, set up
         * TimeLineID and RedoRecPtr.  InitPostgres() depends on this to get
         * InitXLOGAccess() called at backend startup.  (Anyone changing
         * this should also look at LocalSetXLogInsertAllowed.)
         */
        if (!LocalRecoveryInProgress)
            InitXLOGAccess();

        return LocalRecoveryInProgress;
    }

    return false;
}

bool RecoveryIsPaused ( void   ) 

Definition at line 4672 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::recoveryPause, SpinLockAcquire, and SpinLockRelease.

Referenced by pg_is_xlog_replay_paused(), and recoveryPausesHere().

{
    /*
     * Return the current value of the shared recoveryPause flag, read
     * under info_lck.
     */
    bool        paused;

    /* volatile pointer keeps the compiler from rearranging the accesses */
    volatile XLogCtlData *shared = XLogCtl;

    SpinLockAcquire(&shared->info_lck);
    paused = shared->recoveryPause;
    SpinLockRelease(&shared->info_lck);

    return paused;
}

static void recoveryPausesHere ( void   )  [static]

Definition at line 4654 of file xlog.c.

References ereport, errhint(), errmsg(), HandleStartupProcInterrupts(), LocalHotStandbyActive, LOG, pg_usleep(), and RecoveryIsPaused().

Referenced by StartupXLOG().

{
    /*
     * Block for as long as a recovery pause is requested.  Pausing only
     * makes sense when users are able to connect, so nothing happens unless
     * hot standby is active.
     */
    if (LocalHotStandbyActive)
    {
        ereport(LOG,
                (errmsg("recovery has paused"),
                 errhint("Execute pg_xlog_replay_resume() to continue.")));

        for (;;)
        {
            if (!RecoveryIsPaused())
                break;

            /* nap for 1000 ms, then service pending startup interrupts */
            pg_usleep(1000000L);
            HandleStartupProcInterrupts();
        }
    }
}

static void RecoveryRestartPoint ( const CheckPoint *  checkPoint  )  [static]

Definition at line 7260 of file xlog.c.

References DEBUG2, elog, XLogCtlData::info_lck, XLogCtlData::lastCheckPoint, XLogCtlData::lastCheckPointRecPtr, NULL, ReadRecPtr, CheckPoint::redo, RmgrTable, SpinLockAcquire, SpinLockRelease, trace_recovery(), and XLogHaveInvalidPages().

Referenced by xlog_redo().

{
    /*
     * Remember the given checkpoint record in shared memory as a candidate
     * restartpoint, provided it is currently safe to restart from it.
     */

    /* volatile pointer keeps the compiler from rearranging the accesses */
    volatile XLogCtlData *shared = XLogCtl;
    int         rm;

    /*
     * Every resource manager gets a veto: if any of them still holds
     * partial state that would prevent a correct restart from here, give up
     * on this checkpoint record and wait for the next one.
     */
    for (rm = 0; rm <= RM_MAX_ID; rm++)
    {
        if (RmgrTable[rm].rm_safe_restartpoint == NULL)
            continue;
        if (RmgrTable[rm].rm_safe_restartpoint())
            continue;

        elog(trace_recovery(DEBUG2),
             "RM %d not safe to record restart point at %X/%X",
             rm,
             (uint32) (checkPoint->redo >> 32),
             (uint32) checkPoint->redo);
        return;
    }

    /*
     * References to non-existent pages also rule out a restartpoint.
     * Recovery restarted from here would never see those references again,
     * and we would lose the cross-check that the pages belonged to a
     * relation that was dropped later.
     */
    if (XLogHaveInvalidPages())
    {
        elog(trace_recovery(DEBUG2),
             "could not record restart point at %X/%X because there "
             "are unresolved references to invalid pages",
             (uint32) (checkPoint->redo >> 32),
             (uint32) checkPoint->redo);
        return;
    }

    /*
     * Publish the checkpoint record so the checkpointer can decide when to
     * perform its next restartpoint.
     */
    SpinLockAcquire(&shared->info_lck);
    shared->lastCheckPointRecPtr = ReadRecPtr;
    shared->lastCheckPoint = *checkPoint;
    SpinLockRelease(&shared->info_lck);
}

static bool recoveryStopsHere ( XLogRecord *  record,
bool *  includeThis 
) [static]

Definition at line 4491 of file xlog.c.

References ereport, errmsg(), LOG, MAXFNAMELEN, RECOVERY_TARGET_NAME, RECOVERY_TARGET_UNSET, RECOVERY_TARGET_XID, recoveryStopAfter, recoveryStopName, recoveryStopTime, recoveryStopXid, recoveryTarget, recoveryTargetInclusive, recoveryTargetName, recoveryTargetTime, recoveryTargetXid, xl_restore_point::rp_name, xl_restore_point::rp_time, SetLatestXTime(), timestamptz_to_str(), xl_xact_abort::xact_time, xl_xact_commit::xact_time, xl_xact_commit_compact::xact_time, XLogRecord::xl_info, XLogRecord::xl_rmid, XLogRecord::xl_xid, XLOG_RESTORE_POINT, XLOG_XACT_ABORT, XLOG_XACT_COMMIT, XLOG_XACT_COMMIT_COMPACT, and XLogRecGetData.

Referenced by StartupXLOG().

{
    /*
     * Decide whether recovery should stop at the given WAL record.
     *
     * Only transaction commit/abort records and restore point records are
     * candidates; anything else returns false immediately.
     *
     * Returns true if recovery should stop here.  In that case
     * *includeThis says whether the record itself should still be applied,
     * and recoveryStopXid, recoveryStopTime, recoveryStopAfter (and
     * recoveryStopName for restore points) describe the stopping point.
     * The latest-transaction timestamp is updated via SetLatestXTime() for
     * transaction records that are going to be replayed.
     */
    bool        stopsHere;
    uint8       record_info;
    TimestampTz recordXtime;
    char        recordRPName[MAXFNAMELEN];
    bool        isRestorePoint = false;

    /* Keep the name buffer defined even for non-restore-point records */
    recordRPName[0] = '\0';

    /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
    if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
        return false;
    record_info = record->xl_info & ~XLR_INFO_MASK;
    if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
    {
        xl_xact_commit_compact *recordXactCommitData;

        recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
        recordXtime = recordXactCommitData->xact_time;
    }
    else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
    {
        xl_xact_commit *recordXactCommitData;

        recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
        recordXtime = recordXactCommitData->xact_time;
    }
    else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
    {
        xl_xact_abort *recordXactAbortData;

        recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
        recordXtime = recordXactAbortData->xact_time;
    }
    else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
    {
        xl_restore_point *recordRestorePointData;

        recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
        recordXtime = recordRestorePointData->rp_time;

        /* strlcpy, unlike strncpy, always leaves the copy NUL-terminated */
        strlcpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
        isRestorePoint = true;
    }
    else
        return false;

    /* Do we have a PITR target at all? */
    if (recoveryTarget == RECOVERY_TARGET_UNSET)
    {
        /*
         * Save timestamp of latest transaction commit/abort if this is a
         * transaction record
         */
        if (record->xl_rmid == RM_XACT_ID)
            SetLatestXTime(recordXtime);
        return false;
    }

    if (recoveryTarget == RECOVERY_TARGET_XID)
    {
        /*
         * There can be only one transaction end record with this exact
         * transactionid
         *
         * when testing for an xid, we MUST test for equality only, since
         * transactions are numbered in the order they start, not the order
         * they complete. A higher numbered xid will complete before you about
         * 50% of the time...
         */
        stopsHere = (record->xl_xid == recoveryTargetXid);
        if (stopsHere)
            *includeThis = recoveryTargetInclusive;
    }
    else if (recoveryTarget == RECOVERY_TARGET_NAME)
    {
        /*
         * There can be many restore points that share the same name, so we
         * stop at the first one.  Only restore point records carry a name;
         * commit and abort records can never match a name target.
         */
        stopsHere = isRestorePoint &&
            (strcmp(recordRPName, recoveryTargetName) == 0);

        /*
         * Ignore recoveryTargetInclusive because this is not a transaction
         * record
         */
        *includeThis = false;
    }
    else
    {
        /*
         * There can be many transactions that share the same commit time, so
         * we stop after the last one, if we are inclusive, or stop at the
         * first one if we are exclusive
         */
        if (recoveryTargetInclusive)
            stopsHere = (recordXtime > recoveryTargetTime);
        else
            stopsHere = (recordXtime >= recoveryTargetTime);
        if (stopsHere)
            *includeThis = false;
    }

    if (stopsHere)
    {
        recoveryStopXid = record->xl_xid;
        recoveryStopTime = recordXtime;
        recoveryStopAfter = *includeThis;

        if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
        {
            if (recoveryStopAfter)
                ereport(LOG,
                        (errmsg("recovery stopping after commit of transaction %u, time %s",
                                recoveryStopXid,
                                timestamptz_to_str(recoveryStopTime))));
            else
                ereport(LOG,
                        (errmsg("recovery stopping before commit of transaction %u, time %s",
                                recoveryStopXid,
                                timestamptz_to_str(recoveryStopTime))));
        }
        else if (record_info == XLOG_XACT_ABORT)
        {
            if (recoveryStopAfter)
                ereport(LOG,
                        (errmsg("recovery stopping after abort of transaction %u, time %s",
                                recoveryStopXid,
                                timestamptz_to_str(recoveryStopTime))));
            else
                ereport(LOG,
                        (errmsg("recovery stopping before abort of transaction %u, time %s",
                                recoveryStopXid,
                                timestamptz_to_str(recoveryStopTime))));
        }
        else
        {
            /* Only restore point records reach here, so the name is set */
            strlcpy(recoveryStopName, recordRPName, MAXFNAMELEN);

            ereport(LOG,
                (errmsg("recovery stopping at restore point \"%s\", time %s",
                        recoveryStopName,
                        timestamptz_to_str(recoveryStopTime))));
        }

        /*
         * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
         * restore point since they are timestamped, though the latest
         * transaction time is not updated.
         */
        if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
            SetLatestXTime(recordXtime);
    }
    else if (record->xl_rmid == RM_XACT_ID)
        SetLatestXTime(recordXtime);

    return stopsHere;
}

static void RemoveOldXlogFiles ( XLogSegNo  segno,
XLogRecPtr  endptr 
) [static]

Definition at line 2889 of file xlog.c.

References AllocateDir(), CheckpointStatsData::ckpt_segs_recycled, CheckpointStatsData::ckpt_segs_removed, dirent::d_name, DEBUG2, elog, ereport, errcode_for_file_access(), errmsg(), ERROR, FreeDir(), InstallXLogFileSegment(), LOG, lstat, MAXPGPATH, NULL, ReadDir(), snprintf(), unlink(), UpdateLastRemovedPtr(), XLByteToPrevSeg, XLogArchiveCheckDone(), XLogArchiveCleanup(), XLOGDIR, and XLogFileName.

Referenced by CreateCheckPoint(), and CreateRestartPoint().

{
    /*
     * Recycle or remove every WAL segment file in XLOGDIR that sorts at or
     * before segment 'segno' and that the archiver no longer needs.
     * 'endptr' is the current XLOG location; recycled files are installed
     * as future segments beyond it.
     */
    XLogSegNo   endlogSegNo;
    int         max_advance;
    DIR        *dir;
    struct dirent *entry;
    char        lastoff[MAXFNAMELEN];
    char        path[MAXPGPATH];

#ifdef WIN32
    char        newpath[MAXPGPATH];
#endif
    struct stat statbuf;

    /*
     * Work out where recycled segments may go: up to XLOGfileslop segments
     * beyond the current XLOG location.
     */
    XLByteToPrevSeg(endptr, endlogSegNo);
    max_advance = XLOGfileslop;

    dir = AllocateDir(XLOGDIR);
    if (dir == NULL)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open transaction log directory \"%s\": %m",
                        XLOGDIR)));

    /*
     * Build the name of the last segment to keep.  The timeline ID is
     * irrelevant because the comparison below skips it.  (ThisTimeLineID is
     * not set during recovery, so it could not be used anyway.)
     */
    XLogFileName(lastoff, 0, segno);

    elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
         lastoff);

    while ((entry = ReadDir(dir, XLOGDIR)) != NULL)
    {
        char       *segname = entry->d_name;
        int         rc;

        /*
         * Only 24-character upper-case hex names are WAL segments.  The
         * leading 8 characters (the timeline) are skipped when deciding
         * whether a segment is still needed, so that a parent timeline's
         * segment is never removed prematurely.  Being more aggressive
         * about non-parent timelines would be a lot more complicated.
         *
         * The file names sort alphanumerically, which is what tells us
         * whether a segment is earlier than lastoff.
         */
        if (strlen(segname) != 24 ||
            strspn(segname, "0123456789ABCDEF") != 24 ||
            strcmp(segname + 8, lastoff + 8) > 0)
            continue;

        /* Leave the file alone until archiving is done with it */
        if (!XLogArchiveCheckDone(segname))
            continue;

        snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);

        /* Update the last removed location in shared memory first */
        UpdateLastRemovedPtr(segname);

        /*
         * Try to reuse the file as a future log segment before resorting
         * to deletion.  Only regular files are recycled: pg_standby, for
         * example, can create symbolic links pointing to a separate archive
         * directory.
         */
        if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
            InstallXLogFileSegment(&endlogSegNo, path,
                                   true, &max_advance, true))
        {
            ereport(DEBUG2,
                    (errmsg("recycled transaction log file \"%s\"",
                            segname)));
            CheckpointStats.ckpt_segs_recycled++;

            /* Needn't recheck that slot on future iterations */
            if (max_advance > 0)
            {
                endlogSegNo++;
                max_advance--;
            }
        }
        else
        {
            /* No need for any more future segments... */
            ereport(DEBUG2,
                    (errmsg("removing transaction log file \"%s\"",
                            segname)));

#ifdef WIN32

            /*
             * On Windows, if another process (e.g. another backend) holds
             * the file open in FILE_SHARE_DELETE mode, unlink will succeed
             * but the file keeps showing up in directory listings until the
             * last handle is closed.  Rename it before deleting so the
             * lingering deleted file is not mistaken for a live WAL file
             * that needs to be archived.
             *
             * If another process holds the file open without the
             * FILE_SHARE_DELETE flag, the rename fails; we'll try again at
             * the next checkpoint.
             */
            snprintf(newpath, MAXPGPATH, "%s.deleted", path);
            if (rename(path, newpath) != 0)
            {
                ereport(LOG,
                        (errcode_for_file_access(),
                         errmsg("could not rename old transaction log file \"%s\": %m",
                                path)));
                continue;
            }
            rc = unlink(newpath);
#else
            rc = unlink(path);
#endif
            if (rc != 0)
            {
                ereport(LOG,
                        (errcode_for_file_access(),
                         errmsg("could not remove old transaction log file \"%s\": %m",
                                path)));
                continue;
            }
            CheckpointStats.ckpt_segs_removed++;
        }

        XLogArchiveCleanup(segname);
    }

    FreeDir(dir);
}

XLogRecPtr RequestXLogSwitch ( void   ) 

Definition at line 7596 of file xlog.c.

References XLogRecData::buffer, XLogRecData::data, XLogRecData::len, XLogRecData::next, XLOG_SWITCH, and XLogInsert().

Referenced by CheckArchiveTimeout(), do_pg_start_backup(), do_pg_stop_backup(), pg_switch_xlog(), and ShutdownXLOG().

{
    /*
     * Insert an XLOG_SWITCH record and return the location XLogInsert()
     * reports for it.
     */
    XLogRecData switchData;

    /* XLOG SWITCH is the one record type that carries no data at all */
    switchData.next = NULL;
    switchData.buffer = InvalidBuffer;
    switchData.data = NULL;
    switchData.len = 0;

    return XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &switchData);
}

static bool rescanLatestTimeLine ( void   )  [static]

Definition at line 3381 of file xlog.c.

References TimeLineHistoryEntry::end, EndRecPtr, ereport, errmsg(), findNewestTimeLine(), lfirst, list_free_deep(), LOG, readTimeLineHistory(), recoveryTargetTLI, restoreTimeLineHistoryFiles(), ThisTimeLineID, and TimeLineHistoryEntry::tli.

Referenced by WaitForWALToBecomeAvailable().

{
    /*
     * Look for a timeline newer than the current recovery target and, if
     * one exists that we can legitimately follow, switch the recovery
     * target to it.
     *
     * Returns true if recoveryTargetTLI and expectedTLEs were switched to a
     * new timeline; false if no newer timeline was found or the newer one
     * cannot be followed from the current recovery position.
     *
     * The freshly read history list is released on every path where it is
     * rejected, since it is not stored anywhere in that case.
     */
    List       *newExpectedTLEs;
    bool        found;
    ListCell   *cell;
    TimeLineID  newtarget;
    TimeLineID  oldtarget = recoveryTargetTLI;
    TimeLineHistoryEntry *currentTle = NULL;

    newtarget = findNewestTimeLine(recoveryTargetTLI);
    if (newtarget == recoveryTargetTLI)
    {
        /* No new timelines found */
        return false;
    }

    /*
     * Determine the list of expected TLIs for the new TLI
     */

    newExpectedTLEs = readTimeLineHistory(newtarget);

    /*
     * If the current timeline is not part of the history of the new
     * timeline, we cannot proceed to it.
     */
    found = false;
    foreach (cell, newExpectedTLEs)
    {
        currentTle = (TimeLineHistoryEntry *) lfirst(cell);

        if (currentTle->tli == recoveryTargetTLI)
        {
            found = true;
            break;
        }
    }
    if (!found)
    {
        ereport(LOG,
                (errmsg("new timeline %u is not a child of database system timeline %u",
                        newtarget,
                        ThisTimeLineID)));
        /* Rejected: the new history list is not kept, so release it */
        list_free_deep(newExpectedTLEs);
        return false;
    }

    /*
     * The current timeline was found in the history file, but check that the
     * next timeline was forked off from it *after* the current recovery
     * location.
     */
    if (currentTle->end < EndRecPtr)
    {
        ereport(LOG,
                (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
                        newtarget,
                        ThisTimeLineID,
                        (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
        /* Rejected: currentTle is no longer needed, release the whole list */
        list_free_deep(newExpectedTLEs);
        return false;
    }

    /* The new timeline history seems valid. Switch target */
    recoveryTargetTLI = newtarget;
    list_free_deep(expectedTLEs);
    expectedTLEs = newExpectedTLEs;

    /*
     * As in StartupXLOG(), try to ensure we have all the history files
     * between the old target and new target in pg_xlog.
     */
    restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);

    ereport(LOG,
            (errmsg("new target timeline is %u",
                    recoveryTargetTLI)));

    return true;
}

Buffer RestoreBackupBlock ( XLogRecPtr  lsn,
XLogRecord *  record,
int  block_index,
bool  get_cleanup_lock,
bool  keep_buffer 
)

Definition at line 3149 of file xlog.c.

References elog, ERROR, BkpBlock::hole_length, i, RestoreBackupBlockContents(), XLogRecord::xl_info, XLogRecord::xl_len, XLogRecGetData, and XLR_BKP_BLOCK.

Referenced by btree_xlog_delete(), btree_xlog_delete_page(), btree_xlog_insert(), btree_xlog_split(), btree_xlog_vacuum(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumPage(), gistRedoClearFollowRight(), gistRedoPageUpdateRecord(), heap_xlog_clean(), heap_xlog_delete(), heap_xlog_freeze(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), and spgRedoVacuumRoot().

{
    /*
     * Restore the backup block with index 'block_index' from 'record' and
     * return the buffer produced by RestoreBackupBlockContents().  Raises
     * an ERROR if the record has no backup block with that index.
     */
    char       *cursor;
    int         slot;

    /* Backup blocks follow the record's main data */
    cursor = (char *) XLogRecGetData(record) + record->xl_len;

    for (slot = 0; slot < XLR_MAX_BKP_BLOCKS; slot++)
    {
        BkpBlock    hdr;

        /* Slots whose flag bit is clear occupy no space in the record */
        if ((record->xl_info & XLR_BKP_BLOCK(slot)) == 0)
            continue;

        memcpy(&hdr, cursor, sizeof(BkpBlock));
        cursor += sizeof(BkpBlock);

        /* Found the requested block: apply it */
        if (slot == block_index)
            return RestoreBackupBlockContents(lsn, hdr, cursor,
                                              get_cleanup_lock,
                                              keep_buffer);

        /* Step over this block's page image, which omits the hole */
        cursor += BLCKSZ - hdr.hole_length;
    }

    /* Caller specified a bogus block_index */
    elog(ERROR, "failed to restore block_index %d", block_index);
    return InvalidBuffer;       /* keep compiler quiet */
}

static Buffer RestoreBackupBlockContents ( XLogRecPtr  lsn,
BkpBlock  bkpb,
char *  blk,
bool  get_cleanup_lock,
bool  keep_buffer 
) [static]

Definition at line 3187 of file xlog.c.

References Assert, BkpBlock::block, BUFFER_LOCK_EXCLUSIVE, BufferGetPage, BufferIsValid, BkpBlock::fork, BkpBlock::hole_length, BkpBlock::hole_offset, LockBuffer(), LockBufferForCleanup(), MarkBufferDirty(), MemSet, BkpBlock::node, PageSetLSN, RBM_ZERO, UnlockReleaseBuffer(), and XLogReadBufferExtended().

Referenced by RestoreBackupBlock(), and xlog_redo().

{
    /*
     * Write the page image at 'blk', described by 'bkpb', into its buffer,
     * stamp the page with 'lsn' and mark the buffer dirty.  The buffer is
     * returned; it is unlocked and released first unless 'keep_buffer' is
     * set.
     */
    Buffer      target;
    Page        pg;

    target = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
                                    RBM_ZERO);
    Assert(BufferIsValid(target));

    if (!get_cleanup_lock)
        LockBuffer(target, BUFFER_LOCK_EXCLUSIVE);
    else
        LockBufferForCleanup(target);

    pg = (Page) BufferGetPage(target);

    if (bkpb.hole_length != 0)
    {
        /* The stored image omits the hole: copy around it */
        int         holeEnd = bkpb.hole_offset + bkpb.hole_length;

        memcpy((char *) pg, blk, bkpb.hole_offset);
        /* must zero-fill the hole */
        MemSet((char *) pg + bkpb.hole_offset, 0, bkpb.hole_length);
        memcpy((char *) pg + holeEnd,
               blk + bkpb.hole_offset,
               BLCKSZ - holeEnd);
    }
    else
    {
        /* No hole: the image is a complete page */
        memcpy((char *) pg, blk, BLCKSZ);
    }

    /*
     * The page's checksum is stale at this point.  That is fine: it gets
     * set before the page is written out, so no reset is needed here.
     */

    PageSetLSN(pg, lsn);
    MarkBufferDirty(target);

    if (!keep_buffer)
        UnlockReleaseBuffer(target);

    return target;
}

static void rm_redo_error_callback ( void *  arg  )  [static]

Definition at line 9261 of file xlog.c.

References StringInfoData::data, errcontext, initStringInfo(), StringInfoData::len, pfree(), RmgrData::rm_desc, RmgrTable, XLogRecord::xl_info, XLogRecord::xl_rmid, and XLogRecGetData.

{
    /*
     * Error context callback: adds the resource manager's description of
     * the record passed in 'arg' to the error context.
     */
    StringInfoData desc;
    XLogRecord *rec = (XLogRecord *) arg;

    initStringInfo(&desc);
    RmgrTable[rec->xl_rmid].rm_desc(&desc,
                                    rec->xl_info,
                                    XLogRecGetData(rec));

    /* an empty description is not worth emitting */
    if (desc.len > 0)
        errcontext("xlog redo %s", desc.data);

    pfree(desc.data);
}

static void SetCurrentChunkStartTime ( TimestampTz  xtime  )  [static]

Definition at line 4738 of file xlog.c.

References XLogCtlData::currentChunkStartTime, XLogCtlData::info_lck, SpinLockAcquire, and SpinLockRelease.

Referenced by WaitForWALToBecomeAvailable().

{
    /* Store 'xtime' into shared currentChunkStartTime under info_lck. */

    /* volatile pointer keeps the compiler from rearranging the accesses */
    volatile XLogCtlData *shared = XLogCtl;

    SpinLockAcquire(&shared->info_lck);
    shared->currentChunkStartTime = xtime;
    SpinLockRelease(&shared->info_lck);
}

static void SetLatestXTime ( TimestampTz  xtime  )  [static]

Definition at line 4704 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::recoveryLastXTime, SpinLockAcquire, and SpinLockRelease.

Referenced by recoveryStopsHere().

{
    /* Store 'xtime' into shared recoveryLastXTime under info_lck. */

    /* volatile pointer keeps the compiler from rearranging the accesses */
    volatile XLogCtlData *shared = XLogCtl;

    SpinLockAcquire(&shared->info_lck);
    shared->recoveryLastXTime = xtime;
    SpinLockRelease(&shared->info_lck);
}

void SetRecoveryPause ( bool  recoveryPause  ) 

Definition at line 4686 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::recoveryPause, SpinLockAcquire, and SpinLockRelease.

Referenced by pg_xlog_replay_pause(), pg_xlog_replay_resume(), and StartupXLOG().

{
    /* Set the shared recoveryPause flag to the given value under info_lck. */

    /* volatile pointer keeps the compiler from rearranging the accesses */
    volatile XLogCtlData *shared = XLogCtl;

    SpinLockAcquire(&shared->info_lck);
    shared->recoveryPause = recoveryPause;
    SpinLockRelease(&shared->info_lck);
}

void SetWalWriterSleeping ( bool  sleeping  ) 

Definition at line 9961 of file xlog.c.

References XLogCtlData::info_lck, SpinLockAcquire, SpinLockRelease, and XLogCtlData::WalWriterSleeping.

Referenced by WalWriterMain().

{
    /* Record in shared memory, under info_lck, the given sleeping state. */

    /* volatile pointer keeps the compiler from rearranging the accesses */
    volatile XLogCtlData *shared = XLogCtl;

    SpinLockAcquire(&shared->info_lck);
    shared->WalWriterSleeping = sleeping;
    SpinLockRelease(&shared->info_lck);
}

void ShutdownXLOG ( int  code,
Datum  arg 
)

Definition at line 6585 of file xlog.c.

References CHECKPOINT_IMMEDIATE, CHECKPOINT_IS_SHUTDOWN, CreateCheckPoint(), CreateRestartPoint(), ereport, errmsg(), LOG, RecoveryInProgress(), RequestXLogSwitch(), ShutdownCLOG(), ShutdownMultiXact(), ShutdownSUBTRANS(), XLogArchiveCommandSet, and XLogArchivingActive.

Referenced by CheckpointerMain(), and InitPostgres().

{
    /*
     * Shutdown-time processing: write a final shutdown checkpoint (or a
     * restartpoint if still in recovery), then shut down CLOG, SUBTRANS
     * and MultiXact.
     */
    ereport(LOG,
            (errmsg("shutting down")));

    if (!RecoveryInProgress())
    {
        /*
         * With archiving enabled, switch away from the last XLOG file so
         * that all remaining records get archived (the postmaster wakes the
         * archiver process one more time at the end of shutdown).  The
         * checkpoint record then lands in the next XLOG file and is not
         * archived (yet).
         */
        if (XLogArchivingActive() && XLogArchiveCommandSet())
            RequestXLogSwitch();

        CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
    }
    else
        CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);

    ShutdownCLOG();
    ShutdownSUBTRANS();
    ShutdownMultiXact();

    ereport(LOG,
            (errmsg("database system is shut down")));
}

void StartupXLOG ( void   ) 

Definition at line 4846 of file xlog.c.

References AllowCascadeReplication, appendStringInfo(), archiveCleanupCommand, XLogCtlData::archiveCleanupCommand, ArchiveRecoveryRequested, ErrorContextCallback::arg, Assert, BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, ControlFileData::backupEndPoint, ControlFileData::backupEndRequired, ControlFileData::backupStartPoint, bgwriterLaunched, buf, ErrorContextCallback::callback, ControlFileData::checkPoint, CHECKPOINT_END_OF_RECOVERY, CHECKPOINT_IMMEDIATE, CHECKPOINT_WAIT, ControlFileData::checkPointCopy, CheckRecoveryConsistency(), CheckRequiredParameterValues(), checkTimeLineSwitch(), XLogCtlData::ckptXid, XLogCtlData::ckptXidEpoch, close, ControlFileLock, CreateCheckPoint(), CreateEndOfRecoveryRecord(), XLogCtlData::currentChunkStartTime, XLogCtlWrite::curridx, XLogCtlInsert::currpage, XLogCtlInsert::currpos, StringInfoData::data, DataDir, DB_IN_ARCHIVE_RECOVERY, DB_IN_CRASH_RECOVERY, DB_IN_PRODUCTION, DB_SHUTDOWNED, DB_SHUTDOWNED_IN_RECOVERY, DB_SHUTDOWNING, DEBUG1, DEBUG2, DEBUG3, DeleteAllExportedSnapshotFiles(), DisownLatch(), elog, EnableHotStandby, EndRecPtr, ereport, errcode(), errcode_for_file_access(), errdetail(), errhint(), errmsg(), ERROR, error_context_stack, ExecuteRecoveryCommand(), exitArchiveRecovery(), fast_promote, FATAL, findNewestTimeLine(), XLogwrtRqst::Flush, XLogwrtResult::Flush, XLogCtlInsert::fullPageWrites, CheckPoint::fullPageWrites, GetCurrentTimestamp(), GetLatestXTime(), HandleStartupProcInterrupts(), InArchiveRecovery, XLogCtlData::info_lck, InitRecoveryTransactionEnvironment(), initStringInfo(), InRecovery, InRedo, XLogCtlData::Insert, INSERT_FREESPACE, InvalidXLogRecPtr, IsUnderPostmaster, lastFullPageWrites, LastRec, XLogCtlData::lastReplayedEndRecPtr, XLogCtlData::lastReplayedTLI, XLogCtlWrite::lastSegSwitchTime, VariableCacheData::latestCompletedXid, RunningTransactionsData::latestCompletedXid, LocalSetXLogInsertAllowed(), LocalXLogInsertAllowed, LOG, XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, LW_EXCLUSIVE, LWLockAcquire(), 
LWLockRelease(), MemSet, minRecoveryPoint, ControlFileData::minRecoveryPoint, minRecoveryPointTLI, ControlFileData::minRecoveryPointTLI, MultiXactSetNextMXact(), NextBufIdx, CheckPoint::nextMulti, CheckPoint::nextMultiOffset, VariableCacheData::nextOid, CheckPoint::nextOid, RunningTransactionsData::nextXid, VariableCacheData::nextXid, CheckPoint::nextXid, CheckPoint::nextXidEpoch, NULL, VariableCacheData::oidCount, CheckPoint::oldestActiveXid, CheckPoint::oldestMulti, CheckPoint::oldestMultiDB, RunningTransactionsData::oldestRunningXid, CheckPoint::oldestXid, CheckPoint::oldestXidDB, openLogFile, openLogOff, openLogSegNo, OwnLatch(), PANIC, pfree(), pg_usleep(), pgstat_reset_all(), PMSIGNAL_RECOVERY_STARTED, PreallocXlogFiles(), PrescanPreparedTransactions(), ControlFileData::prevCheckPoint, ErrorContextCallback::previous, XLogCtlInsert::PrevRecord, XLogCtlData::PrevTimeLineID, xl_end_of_recovery::PrevTimeLineID, CheckPoint::PrevTimeLineID, ProcArrayApplyRecoveryInfo(), ProcArrayLock, PublishStartupProcessInformation(), read_backup_label(), XLogReaderState::readBuf, ReadCheckpointRecord(), ReadControlFile(), readFile, readOff, XLogReaderState::readPageTLI, ReadRecord(), readRecoveryCommandFile(), ReadRecPtr, RecordKnownAssignedTransactionIds(), RecoverPreparedTransactions(), RECOVERY_TARGET_NAME, RECOVERY_TARGET_TIME, RECOVERY_TARGET_XID, recoveryEndCommand, XLogCtlData::recoveryLastXTime, XLogCtlData::recoveryPause, recoveryPauseAtTarget, recoveryPausesHere(), recoveryStopAfter, recoveryStopName, recoveryStopsHere(), recoveryStopTime, recoveryStopXid, recoveryTarget, recoveryTargetIsLatest, recoveryTargetName, recoveryTargetTime, recoveryTargetTLI, recoveryTargetXid, XLogCtlData::recoveryWakeupLatch, CheckPoint::redo, XLogCtlInsert::RedoRecPtr, RedoRecPtr, RedoStartLSN, RelationCacheInitFileRemove(), XLogCtlData::replayEndRecPtr, XLogCtlData::replayEndTLI, RequestCheckpoint(), ResetUnloggedRelations(), restoreTimeLineHistoryFiles(), RmgrData::rm_cleanup, 
RmgrData::rm_desc, RmgrData::rm_redo, RmgrData::rm_startup, RmgrTable, SendPostmasterSignal(), SetForwardFsyncRequests(), SetMultiXactIdLimit(), SetRecoveryPause(), SetTransactionIdLimit(), XLogCtlData::SharedRecoveryInProgress, ShmemVariableCache, ShutdownRecoveryTransactionEnvironment(), ShutdownWalRcv(), snprintf(), SpinLockAcquire, SpinLockRelease, STANDBY_DISABLED, STANDBY_INITIALIZED, StandbyMode, StandbyModeRequested, StandbyRecoverPreparedTransactions(), standbyState, StartupCLOG(), StartupMultiXact(), StartupSUBTRANS(), ControlFileData::state, str_time(), RunningTransactionsData::subxcnt, RunningTransactionsData::subxid_overflow, ControlFileData::system_identifier, XLogReaderState::system_identifier, XLogCtlData::ThisTimeLineID, xl_end_of_recovery::ThisTimeLineID, ThisTimeLineID, CheckPoint::ThisTimeLineID, CheckPoint::time, ControlFileData::time, timestamptz_to_str(), tliOfPointInHistory(), tliSwitchPoint(), trace_recovery_messages, TransactionIdAdvance, TransactionIdFollowsOrEquals(), TransactionIdIsNormal, TransactionIdIsValid, TransactionIdRetreat, TrimCLOG(), unlink(), UNLOGGED_RELATION_CLEANUP, UNLOGGED_RELATION_INIT, ControlFileData::unloggedLSN, XLogCtlData::unloggedLSN, UpdateControlFile(), UpdateFullPageWrites(), ValidateXLOGDirectoryStructure(), WalSndWakeup(), XLogCtlData::Write, XLogwrtRqst::Write, XLogwrtResult::Write, writeTimeLineHistory(), RunningTransactionsData::xcnt, XidGenLock, RunningTransactionsData::xids, XLogRecord::xl_info, XLogRecord::xl_rmid, XLogRecord::xl_xid, XLogCtlData::xlblocks, XLByteToPrevSeg, XLOG_CHECKPOINT_SHUTDOWN, XLOG_END_OF_RECOVERY, XLogFileOpen(), XLogPageRead(), XLogReaderAllocate(), XLogReaderFree(), XLogReceiptTime, XLogRecGetData, XLogRecPtrIsInvalid, XLogReportParameters(), XLogSegSize, and XRecOffIsValid.

Referenced by InitPostgres(), and StartupProcessMain().

{
    XLogCtlInsert *Insert;
    CheckPoint  checkPoint;
    bool        wasShutdown;
    bool        reachedStopPoint = false;
    bool        haveBackupLabel = false;
    XLogRecPtr  RecPtr,
                checkPointLoc,
                EndOfLog;
    XLogSegNo   endLogSegNo;
    TimeLineID  PrevTimeLineID;
    XLogRecord *record;
    uint32      freespace;
    TransactionId oldestActiveXID;
    bool        backupEndRequired = false;
    bool        backupFromStandby = false;
    DBState     dbstate_at_startup;
    XLogReaderState *xlogreader;
    XLogPageReadPrivate private;
    bool        fast_promoted = false;

    /*
     * Read control file and check XLOG status looks valid.
     *
     * Note: in most control paths, *ControlFile is already valid and we need
     * not do ReadControlFile() here, but might as well do it to be sure.
     */
    ReadControlFile();

    if (ControlFile->state < DB_SHUTDOWNED ||
        ControlFile->state > DB_IN_PRODUCTION ||
        !XRecOffIsValid(ControlFile->checkPoint))
        ereport(FATAL,
                (errmsg("control file contains invalid data")));

    if (ControlFile->state == DB_SHUTDOWNED)
        ereport(LOG,
                (errmsg("database system was shut down at %s",
                        str_time(ControlFile->time))));
    else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
        ereport(LOG,
                (errmsg("database system was shut down in recovery at %s",
                        str_time(ControlFile->time))));
    else if (ControlFile->state == DB_SHUTDOWNING)
        ereport(LOG,
                (errmsg("database system shutdown was interrupted; last known up at %s",
                        str_time(ControlFile->time))));
    else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
        ereport(LOG,
           (errmsg("database system was interrupted while in recovery at %s",
                   str_time(ControlFile->time)),
            errhint("This probably means that some data is corrupted and"
                    " you will have to use the last backup for recovery.")));
    else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
        ereport(LOG,
                (errmsg("database system was interrupted while in recovery at log time %s",
                        str_time(ControlFile->checkPointCopy.time)),
                 errhint("If this has occurred more than once some data might be corrupted"
              " and you might need to choose an earlier recovery target.")));
    else if (ControlFile->state == DB_IN_PRODUCTION)
        ereport(LOG,
              (errmsg("database system was interrupted; last known up at %s",
                      str_time(ControlFile->time))));

    /* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
    if (ControlFile->state != DB_SHUTDOWNED)
        pg_usleep(60000000L);
#endif

    /*
     * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
     * someone has performed a copy for PITR, these directories may have been
     * excluded and need to be re-created.
     */
    ValidateXLOGDirectoryStructure();

    /*
     * Clear out any old relcache cache files.  This is *necessary* if we do
     * any WAL replay, since that would probably result in the cache files
     * being out of sync with database reality.  In theory we could leave them
     * in place if the database had been cleanly shut down, but it seems
     * safest to just remove them always and let them be rebuilt during the
     * first backend startup.
     */
    RelationCacheInitFileRemove();

    /*
     * Initialize on the assumption we want to recover to the same timeline
     * that's active according to pg_control.
     */
    recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;

    /*
     * Check for recovery control file, and if so set up state for offline
     * recovery
     */
    readRecoveryCommandFile();

    /*
     * Save archive_cleanup_command in shared memory so that other processes
     * can see it.
     */
    strncpy(XLogCtl->archiveCleanupCommand,
            archiveCleanupCommand ? archiveCleanupCommand : "",
            sizeof(XLogCtl->archiveCleanupCommand));

    if (ArchiveRecoveryRequested)
    {
        if (StandbyModeRequested)
            ereport(LOG,
                    (errmsg("entering standby mode")));
        else if (recoveryTarget == RECOVERY_TARGET_XID)
            ereport(LOG,
                    (errmsg("starting point-in-time recovery to XID %u",
                            recoveryTargetXid)));
        else if (recoveryTarget == RECOVERY_TARGET_TIME)
            ereport(LOG,
                    (errmsg("starting point-in-time recovery to %s",
                            timestamptz_to_str(recoveryTargetTime))));
        else if (recoveryTarget == RECOVERY_TARGET_NAME)
            ereport(LOG,
                    (errmsg("starting point-in-time recovery to \"%s\"",
                            recoveryTargetName)));
        else
            ereport(LOG,
                    (errmsg("starting archive recovery")));
    }
    else if (ControlFile->minRecoveryPointTLI > 0)
    {
        /*
         * If the minRecoveryPointTLI is set when not in Archive Recovery
         * it means that we have crashed after ending recovery and
         * yet before we wrote a new checkpoint on the new timeline.
         * That means we are doing a crash recovery that needs to cross
         * timelines to get to our newly assigned timeline again.
         * The timeline we are headed for is exact and not 'latest'.
         * As soon as we hit a checkpoint, the minRecoveryPointTLI is
         * reset, so we will not enter crash recovery again.
         */
        Assert(ControlFile->minRecoveryPointTLI != 1);
        recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
        recoveryTargetIsLatest = false;
    }

    /*
     * Take ownership of the wakeup latch if we're going to sleep during
     * recovery.
     */
    if (StandbyModeRequested)
        OwnLatch(&XLogCtl->recoveryWakeupLatch);

    /* Set up XLOG reader facility */
    MemSet(&private, 0, sizeof(XLogPageReadPrivate));
    xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
    if (!xlogreader)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of memory"),
                 errdetail("Failed while allocating an XLog reading processor")));
    xlogreader->system_identifier = ControlFile->system_identifier;

    if (read_backup_label(&checkPointLoc, &backupEndRequired,
                          &backupFromStandby))
    {
        /*
         * Archive recovery was requested, and thanks to the backup label file,
         * we know how far we need to replay to reach consistency. Enter
         * archive recovery directly.
         */
        InArchiveRecovery = true;
        if (StandbyModeRequested)
            StandbyMode = true;

        /*
         * When a backup_label file is present, we want to roll forward from
         * the checkpoint it identifies, rather than using pg_control.
         */
        record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
        if (record != NULL)
        {
            memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
            wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
            ereport(DEBUG1,
                    (errmsg("checkpoint record is at %X/%X",
                            (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
            InRecovery = true;  /* force recovery even if SHUTDOWNED */

            /*
             * Make sure that REDO location exists. This may not be the case
             * if there was a crash during an online backup, which left a
             * backup_label around that references a WAL segment that's
             * already been archived.
             */
            if (checkPoint.redo < checkPointLoc)
            {
                if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
                    ereport(FATAL,
                            (errmsg("could not find redo location referenced by checkpoint record"),
                             errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
            }
        }
        else
        {
            ereport(FATAL,
                    (errmsg("could not locate required checkpoint record"),
                     errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
            wasShutdown = false;    /* keep compiler quiet */
        }
        /* set flag to delete it later */
        haveBackupLabel = true;
    }
    else
    {
        /*
         * It's possible that archive recovery was requested, but we don't
         * know how far we need to replay the WAL before we reach consistency.
         * This can happen for example if a base backup is taken from a running
         * server using an atomic filesystem snapshot, without calling
         * pg_start/stop_backup. Or if you just kill a running master server
         * and put it into archive recovery by creating a recovery.conf file.
         *
         * Our strategy in that case is to perform crash recovery first,
         * replaying all the WAL present in pg_xlog, and only enter archive
         * recovery after that.
         *
         * But usually we already know how far we need to replay the WAL (up to
         * minRecoveryPoint, up to backupEndPoint, or until we see an
         * end-of-backup record), and we can enter archive recovery directly.
         */
        if (ArchiveRecoveryRequested &&
            (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
             ControlFile->backupEndRequired ||
             ControlFile->backupEndPoint != InvalidXLogRecPtr ||
             ControlFile->state == DB_SHUTDOWNED))
        {
            InArchiveRecovery = true;
            if (StandbyModeRequested)
                StandbyMode = true;
        }

        /*
         * Get the last valid checkpoint record.  If the latest one according
         * to pg_control is broken, try the next-to-last one.
         */
        checkPointLoc = ControlFile->checkPoint;
        RedoStartLSN = ControlFile->checkPointCopy.redo;
        record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
        if (record != NULL)
        {
            ereport(DEBUG1,
                    (errmsg("checkpoint record is at %X/%X",
                            (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
        }
        else if (StandbyMode)
        {
            /*
             * The last valid checkpoint record required for a streaming
             * recovery exists in neither standby nor the primary.
             */
            ereport(PANIC,
                    (errmsg("could not locate a valid checkpoint record")));
        }
        else
        {
            checkPointLoc = ControlFile->prevCheckPoint;
            record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
            if (record != NULL)
            {
                ereport(LOG,
                        (errmsg("using previous checkpoint record at %X/%X",
                                (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
                InRecovery = true;      /* force recovery even if SHUTDOWNED */
            }
            else
                ereport(PANIC,
                     (errmsg("could not locate a valid checkpoint record")));
        }
        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
    }

    /*
     * If the location of the checkpoint record is not on the expected
     * timeline in the history of the requested timeline, we cannot proceed:
     * the backup is not part of the history of the requested timeline.
     */
    Assert(expectedTLEs); /* was initialized by reading checkpoint record */
    if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
            checkPoint.ThisTimeLineID)
    {
        XLogRecPtr switchpoint;

        /*
         * tliSwitchPoint will throw an error if the checkpoint's timeline
         * is not in expectedTLEs at all.
         */
        switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
        ereport(FATAL,
                (errmsg("requested timeline %u is not a child of this server's history",
                        recoveryTargetTLI),
                 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X",
                           (uint32) (ControlFile->checkPoint >> 32),
                           (uint32) ControlFile->checkPoint,
                           ControlFile->checkPointCopy.ThisTimeLineID,
                           (uint32) (switchpoint >> 32),
                           (uint32) switchpoint)));
    }

    /*
     * The min recovery point should be part of the requested timeline's
     * history, too.
     */
    if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
        tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
            ControlFile->minRecoveryPointTLI)
        ereport(FATAL,
                (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
                        recoveryTargetTLI,
                        (uint32) (ControlFile->minRecoveryPoint >> 32),
                        (uint32) ControlFile->minRecoveryPoint,
                        ControlFile->minRecoveryPointTLI)));

    LastRec = RecPtr = checkPointLoc;

    ereport(DEBUG1,
            (errmsg("redo record is at %X/%X; shutdown %s",
                    (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
                    wasShutdown ? "TRUE" : "FALSE")));
    ereport(DEBUG1,
            (errmsg("next transaction ID: %u/%u; next OID: %u",
                    checkPoint.nextXidEpoch, checkPoint.nextXid,
                    checkPoint.nextOid)));
    ereport(DEBUG1,
            (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
                    checkPoint.nextMulti, checkPoint.nextMultiOffset)));
    ereport(DEBUG1,
            (errmsg("oldest unfrozen transaction ID: %u, in database %u",
                    checkPoint.oldestXid, checkPoint.oldestXidDB)));
    ereport(DEBUG1,
            (errmsg("oldest MultiXactId: %u, in database %u",
                    checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
    if (!TransactionIdIsNormal(checkPoint.nextXid))
        ereport(PANIC,
                (errmsg("invalid next transaction ID")));

    /* initialize shared memory variables from the checkpoint record */
    ShmemVariableCache->nextXid = checkPoint.nextXid;
    ShmemVariableCache->nextOid = checkPoint.nextOid;
    ShmemVariableCache->oidCount = 0;
    MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
    SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
    SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
    XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
    XLogCtl->ckptXid = checkPoint.nextXid;

    /*
     * Initialize unlogged LSN. On a clean shutdown, it's restored from the
     * control file. On recovery, all unlogged relations are blown away, so
     * the unlogged LSN counter can be reset too.
     */
    if (ControlFile->state == DB_SHUTDOWNED)
        XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
    else
        XLogCtl->unloggedLSN = 1;

    /*
     * We must replay WAL entries using the same TimeLineID they were created
     * under, so temporarily adopt the TLI indicated by the checkpoint (see
     * also xlog_redo()).
     */
    ThisTimeLineID = checkPoint.ThisTimeLineID;

    /*
     * Copy any missing timeline history files between 'now' and the
     * recovery target timeline from archive to pg_xlog. While we don't need
     * those files ourselves - the history file of the recovery target
     * timeline covers all the previous timelines in the history too - a
     * cascading standby server might be interested in them. Or, if you
     * archive the WAL from this server to a different archive than the
     * master, it'd be good for all the history files to get archived there
     * after failover, so that you can use one of the old timelines as a
     * PITR target. Timeline history files are small, so it's better to copy
     * them unnecessarily than not copy them and regret later.
     */
    restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);

    lastFullPageWrites = checkPoint.fullPageWrites;

    RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;

    if (RecPtr < checkPoint.redo)
        ereport(PANIC,
                (errmsg("invalid redo in checkpoint record")));

    /*
     * Check whether we need to force recovery from WAL.  If it appears to
     * have been a clean shutdown and we did not have a recovery.conf file,
     * then assume no recovery needed.
     */
    if (checkPoint.redo < RecPtr)
    {
        if (wasShutdown)
            ereport(PANIC,
                    (errmsg("invalid redo record in shutdown checkpoint")));
        InRecovery = true;
    }
    else if (ControlFile->state != DB_SHUTDOWNED)
        InRecovery = true;
    else if (ArchiveRecoveryRequested)
    {
        /* force recovery due to presence of recovery.conf */
        InRecovery = true;
    }

    /* REDO */
    if (InRecovery)
    {
        int         rmid;

        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        /*
         * Update pg_control to show that we are recovering and to show the
         * selected checkpoint as the place we are starting from. We also mark
         * pg_control with any minimum recovery stop point obtained from a
         * backup history file.
         */
        dbstate_at_startup = ControlFile->state;
        if (InArchiveRecovery)
            ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
        else
        {
            ereport(LOG,
                    (errmsg("database system was not properly shut down; "
                            "automatic recovery in progress")));
            if (recoveryTargetTLI > 0)
                ereport(LOG,
                    (errmsg("crash recovery starts in timeline %u "
                            "and has target timeline %u",
                            ControlFile->checkPointCopy.ThisTimeLineID,
                            recoveryTargetTLI)));
            ControlFile->state = DB_IN_CRASH_RECOVERY;
        }
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
        ControlFile->checkPoint = checkPointLoc;
        ControlFile->checkPointCopy = checkPoint;
        if (InArchiveRecovery)
        {
            /* initialize minRecoveryPoint if not set yet */
            if (ControlFile->minRecoveryPoint < checkPoint.redo)
            {
                ControlFile->minRecoveryPoint = checkPoint.redo;
                ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
            }
        }

        /*
         * Set backupStartPoint if we're starting recovery from a base backup.
         *
         * Set backupEndPoint and use minRecoveryPoint as the backup end
         * location if we're starting recovery from a base backup which was
         * taken from the standby. In this case, the database system status in
         * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If it does not,
         * the backup is corrupted, so we cancel recovery.
         */
        if (haveBackupLabel)
        {
            ControlFile->backupStartPoint = checkPoint.redo;
            ControlFile->backupEndRequired = backupEndRequired;

            if (backupFromStandby)
            {
                if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
                    ereport(FATAL,
                            (errmsg("backup_label contains data inconsistent with control file"),
                             errhint("This means that the backup is corrupted and you will "
                               "have to use another backup for recovery.")));
                ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
            }
        }
        ControlFile->time = (pg_time_t) time(NULL);
        /* No need to hold ControlFileLock yet, we aren't up far enough */
        UpdateControlFile();

        /* initialize our local copy of minRecoveryPoint */
        minRecoveryPoint = ControlFile->minRecoveryPoint;
        minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

        /*
         * Reset pgstat data, because it may be invalid after recovery.
         */
        pgstat_reset_all();

        /*
         * If there was a backup label file, it's done its job and the info
         * has now been propagated into pg_control.  We must get rid of the
         * label file so that if we crash during recovery, we'll pick up at
         * the latest recovery restartpoint instead of going all the way back
         * to the backup start point.  It seems prudent though to just rename
         * the file out of the way rather than delete it completely.
         */
        if (haveBackupLabel)
        {
            unlink(BACKUP_LABEL_OLD);
            if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
                ereport(FATAL,
                        (errcode_for_file_access(),
                         errmsg("could not rename file \"%s\" to \"%s\": %m",
                                BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
        }

        /* Check that the GUCs used to generate the WAL allow recovery */
        CheckRequiredParameterValues();

        /*
         * We're in recovery, so unlogged relations may be trashed and must be
         * reset.  This should be done BEFORE allowing Hot Standby
         * connections, so that read-only backends don't try to read whatever
         * garbage is left over from before.
         */
        ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);

        /*
         * Likewise, delete any saved transaction snapshot files that got left
         * behind by crashed backends.
         */
        DeleteAllExportedSnapshotFiles();

        /*
         * Initialize for Hot Standby, if enabled. We won't let backends in
         * yet, not until we've reached the min recovery point specified in
         * control file and we've established a recovery snapshot from a
         * running-xacts WAL record.
         */
        if (ArchiveRecoveryRequested && EnableHotStandby)
        {
            TransactionId *xids;
            int         nxids;

            ereport(DEBUG1,
                    (errmsg("initializing for hot standby")));

            InitRecoveryTransactionEnvironment();

            if (wasShutdown)
                oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
            else
                oldestActiveXID = checkPoint.oldestActiveXid;
            Assert(TransactionIdIsValid(oldestActiveXID));

            /*
             * Startup commit log and subtrans only. Other SLRUs are not
             * maintained during recovery and need not be started yet.
             */
            StartupCLOG();
            StartupSUBTRANS(oldestActiveXID);

            /*
             * If we're beginning at a shutdown checkpoint, we know that
             * nothing was running on the master at this point. So fake-up an
             * empty running-xacts record and use that here and now. Recover
             * additional standby state for prepared transactions.
             */
            if (wasShutdown)
            {
                RunningTransactionsData running;
                TransactionId latestCompletedXid;

                /*
                 * Construct a RunningTransactions snapshot representing a
                 * shut down server, with only prepared transactions still
                 * alive. We're never overflowed at this point because all
                 * subxids are listed with their parent prepared transactions.
                 */
                running.xcnt = nxids;
                running.subxcnt = 0;
                running.subxid_overflow = false;
                running.nextXid = checkPoint.nextXid;
                running.oldestRunningXid = oldestActiveXID;
                latestCompletedXid = checkPoint.nextXid;
                TransactionIdRetreat(latestCompletedXid);
                Assert(TransactionIdIsNormal(latestCompletedXid));
                running.latestCompletedXid = latestCompletedXid;
                running.xids = xids;

                ProcArrayApplyRecoveryInfo(&running);

                StandbyRecoverPreparedTransactions(false);
            }
        }

        /* Initialize resource managers */
        for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
        {
            if (RmgrTable[rmid].rm_startup != NULL)
                RmgrTable[rmid].rm_startup();
        }

        /*
         * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
         * recoveryLastXTime.
         *
         * This is slightly confusing if we're starting from an online
         * checkpoint; we've just read and replayed the checkpoint record, but
         * we're going to start replay from its redo pointer, which precedes
         * the location of the checkpoint record itself. So even though the
         * last record we've replayed is indeed ReadRecPtr, we haven't
         * replayed all the preceding records yet. That's OK for the current
         * use of these variables.
         */
        SpinLockAcquire(&xlogctl->info_lck);
        xlogctl->replayEndRecPtr = ReadRecPtr;
        xlogctl->replayEndTLI = ThisTimeLineID;
        xlogctl->lastReplayedEndRecPtr = EndRecPtr;
        xlogctl->lastReplayedTLI = ThisTimeLineID;
        xlogctl->recoveryLastXTime = 0;
        xlogctl->currentChunkStartTime = 0;
        xlogctl->recoveryPause = false;
        SpinLockRelease(&xlogctl->info_lck);

        /* Also ensure XLogReceiptTime has a sane value */
        XLogReceiptTime = GetCurrentTimestamp();

        /*
         * Let postmaster know we've started redo now, so that it can launch
         * checkpointer to perform restartpoints.  We don't bother during
         * crash recovery as restartpoints can only be performed during
         * archive recovery.  And we'd like to keep crash recovery simple, to
         * avoid introducing bugs that could affect you when recovering after
         * crash.
         *
         * After this point, we can no longer assume that we're the only
         * process in addition to postmaster!  Also, fsync requests are
         * subsequently to be handled by the checkpointer, not locally.
         */
        if (ArchiveRecoveryRequested && IsUnderPostmaster)
        {
            PublishStartupProcessInformation();
            SetForwardFsyncRequests();
            SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
            bgwriterLaunched = true;
        }

        /*
         * Allow read-only connections immediately if we're consistent
         * already.
         */
        CheckRecoveryConsistency();

        /*
         * Find the first record that logically follows the checkpoint --- it
         * might physically precede it, though.
         */
        if (checkPoint.redo < RecPtr)
        {
            /* back up to find the record */
            record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
        }
        else
        {
            /* just have to read next record after CheckPoint */
            record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
        }

        if (record != NULL)
        {
            bool        recoveryContinue = true;
            bool        recoveryApply = true;
            ErrorContextCallback errcallback;
            TimestampTz xtime;

            InRedo = true;

            ereport(LOG,
                    (errmsg("redo starts at %X/%X",
                            (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));

            /*
             * main redo apply loop
             */
            do
            {
                bool switchedTLI = false;
#ifdef WAL_DEBUG
                if (XLOG_DEBUG ||
                 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
                    (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
                {
                    StringInfoData buf;

                    initStringInfo(&buf);
                    appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
                                     (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
                                     (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
                    xlog_outrec(&buf, record);
                    appendStringInfo(&buf, " - ");
                    RmgrTable[record->xl_rmid].rm_desc(&buf,
                                                       record->xl_info,
                                                     XLogRecGetData(record));
                    elog(LOG, "%s", buf.data);
                    pfree(buf.data);
                }
#endif

                /* Handle interrupt signals of startup process */
                HandleStartupProcInterrupts();

                /*
                 * Pause WAL replay, if requested by a hot-standby session via
                 * SetRecoveryPause().
                 *
                 * Note that we intentionally don't take the info_lck spinlock
                 * here.  We might therefore read a slightly stale value of
                 * the recoveryPause flag, but it can't be very stale (no
                 * worse than the last spinlock we did acquire).  Since a
                 * pause request is a pretty asynchronous thing anyway,
                 * possibly responding to it one WAL record later than we
                 * otherwise would is a minor issue, so it doesn't seem worth
                 * adding another spinlock cycle to prevent that.
                 */
                if (xlogctl->recoveryPause)
                    recoveryPausesHere();

                /*
                 * Have we reached our recovery target?
                 */
                if (recoveryStopsHere(record, &recoveryApply))
                {
                    if (recoveryPauseAtTarget)
                    {
                        SetRecoveryPause(true);
                        recoveryPausesHere();
                    }
                    reachedStopPoint = true;    /* see below */
                    recoveryContinue = false;

                    /* Exit loop if we reached non-inclusive recovery target */
                    if (!recoveryApply)
                        break;
                }

                /* Setup error traceback support for ereport() */
                errcallback.callback = rm_redo_error_callback;
                errcallback.arg = (void *) record;
                errcallback.previous = error_context_stack;
                error_context_stack = &errcallback;

                /*
                 * ShmemVariableCache->nextXid must be beyond record's xid.
                 *
                 * We don't expect anyone else to modify nextXid, hence we
                 * don't need to hold a lock while examining it.  We still
                 * acquire the lock to modify it, though.
                 */
                if (TransactionIdFollowsOrEquals(record->xl_xid,
                                                 ShmemVariableCache->nextXid))
                {
                    LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
                    ShmemVariableCache->nextXid = record->xl_xid;
                    TransactionIdAdvance(ShmemVariableCache->nextXid);
                    LWLockRelease(XidGenLock);
                }

                /*
                 * Before replaying this record, check if this record
                 * causes the current timeline to change. The record is
                 * already considered to be part of the new timeline,
                 * so we update ThisTimeLineID before replaying it.
                 * That's important so that replayEndTLI, which is
                 * recorded as the minimum recovery point's TLI if
                 * recovery stops after this record, is set correctly.
                 */
                if (record->xl_rmid == RM_XLOG_ID)
                {
                    TimeLineID  newTLI = ThisTimeLineID;
                    TimeLineID  prevTLI = ThisTimeLineID;
                    uint8       info = record->xl_info & ~XLR_INFO_MASK;

                    if (info == XLOG_CHECKPOINT_SHUTDOWN)
                    {
                        CheckPoint  checkPoint;

                        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
                        newTLI = checkPoint.ThisTimeLineID;
                        prevTLI = checkPoint.PrevTimeLineID;
                    }
                    else if (info == XLOG_END_OF_RECOVERY)
                    {
                        xl_end_of_recovery  xlrec;

                        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
                        newTLI = xlrec.ThisTimeLineID;
                        prevTLI = xlrec.PrevTimeLineID;
                    }

                    if (newTLI != ThisTimeLineID)
                    {
                        /* Check that it's OK to switch to this TLI */
                        checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);

                        /* Following WAL records should be run with new TLI */
                        ThisTimeLineID = newTLI;
                        switchedTLI = true;
                    }
                }

                /*
                 * Update shared replayEndRecPtr before replaying this record,
                 * so that XLogFlush will update minRecoveryPoint correctly.
                 */
                SpinLockAcquire(&xlogctl->info_lck);
                xlogctl->replayEndRecPtr = EndRecPtr;
                xlogctl->replayEndTLI = ThisTimeLineID;
                SpinLockRelease(&xlogctl->info_lck);

                /*
                 * If we are attempting to enter Hot Standby mode, process
                 * XIDs we see
                 */
                if (standbyState >= STANDBY_INITIALIZED &&
                    TransactionIdIsValid(record->xl_xid))
                    RecordKnownAssignedTransactionIds(record->xl_xid);

                /* Now apply the WAL record itself */
                RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);

                /* Pop the error context stack */
                error_context_stack = errcallback.previous;

                /*
                 * Update lastReplayedEndRecPtr after this record has been
                 * successfully replayed.
                 */
                SpinLockAcquire(&xlogctl->info_lck);
                xlogctl->lastReplayedEndRecPtr = EndRecPtr;
                xlogctl->lastReplayedTLI = ThisTimeLineID;
                SpinLockRelease(&xlogctl->info_lck);

                /* Remember this record as the last-applied one */
                LastRec = ReadRecPtr;

                /* Allow read-only connections if we're consistent now */
                CheckRecoveryConsistency();

                /*
                 * If this record was a timeline switch, wake up any
                 * walsenders to notice that we are on a new timeline.
                 */
                if (switchedTLI && AllowCascadeReplication())
                    WalSndWakeup();

                /* Exit loop if we reached inclusive recovery target */
                if (!recoveryContinue)
                    break;

                /* Else, try to fetch the next WAL record */
                record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
            } while (record != NULL);

            /*
             * end of main redo apply loop
             */

            ereport(LOG,
                    (errmsg("redo done at %X/%X",
                            (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
            xtime = GetLatestXTime();
            if (xtime)
                ereport(LOG,
                     (errmsg("last completed transaction was at log time %s",
                             timestamptz_to_str(xtime))));
            InRedo = false;
        }
        else
        {
            /* there are no WAL records following the checkpoint */
            ereport(LOG,
                    (errmsg("redo is not required")));
        }
    }

    /*
     * Kill WAL receiver, if it's still running, before we continue to write
     * the startup checkpoint record. It will trump over the checkpoint and
     * subsequent records if it's still alive when we start writing WAL.
     */
    ShutdownWalRcv();

    /*
     * We don't need the latch anymore. It's not strictly necessary to disown
     * it, but let's do it for the sake of tidiness.
     */
    if (StandbyModeRequested)
        DisownLatch(&XLogCtl->recoveryWakeupLatch);

    /*
     * We are now done reading the xlog from stream. Turn off streaming
     * recovery to force fetching the files (which would be required at end of
     * recovery, e.g., timeline history file) from archive or pg_xlog.
     */
    StandbyMode = false;

    /*
     * Re-fetch the last valid or last applied record, so we can identify the
     * exact endpoint of what we consider the valid portion of WAL.
     */
    record = ReadRecord(xlogreader, LastRec, PANIC, false);
    EndOfLog = EndRecPtr;
    XLByteToPrevSeg(EndOfLog, endLogSegNo);

    /*
     * Complain if we did not roll forward far enough to render the backup
     * dump consistent.  Note: it is indeed okay to look at the local variable
     * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
     * be further ahead --- ControlFile->minRecoveryPoint cannot have been
     * advanced beyond the WAL we processed.
     */
    if (InRecovery &&
        (EndOfLog < minRecoveryPoint ||
         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
    {
        if (reachedStopPoint)
        {
            /* stopped because of stop request */
            ereport(FATAL,
                    (errmsg("requested recovery stop point is before consistent recovery point")));
        }

        /*
         * Ran off end of WAL before reaching end-of-backup WAL record, or
         * minRecoveryPoint. That's usually a bad sign, indicating that you
         * tried to recover from an online backup but never called
         * pg_stop_backup(), or you didn't archive all the WAL up to that
         * point. However, this also happens in crash recovery, if the system
         * crashes while an online backup is in progress. We must not treat
         * that as an error, or the database will refuse to start up.
         */
        if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
        {
            if (ControlFile->backupEndRequired)
                ereport(FATAL,
                        (errmsg("WAL ends before end of online backup"),
                         errhint("All WAL generated while online backup was taken must be available at recovery.")));
            else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
                ereport(FATAL,
                        (errmsg("WAL ends before end of online backup"),
                         errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
            else
                ereport(FATAL,
                      (errmsg("WAL ends before consistent recovery point")));
        }
    }

    /*
     * Consider whether we need to assign a new timeline ID.
     *
     * If we are doing an archive recovery, we always assign a new ID.  This
     * handles a couple of issues.  If we stopped short of the end of WAL
     * during recovery, then we are clearly generating a new timeline and must
     * assign it a unique new ID.  Even if we ran to the end, modifying the
     * current last segment is problematic because it may result in trying to
     * overwrite an already-archived copy of that segment, and we encourage
     * DBAs to make their archive_commands reject that.  We can dodge the
     * problem by making the new active segment have a new timeline ID.
     *
     * In a normal crash recovery, we can just extend the timeline we were in.
     */
    PrevTimeLineID = ThisTimeLineID;
    if (ArchiveRecoveryRequested)
    {
        char    reason[200];

        Assert(InArchiveRecovery);

        ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
        ereport(LOG,
                (errmsg("selected new timeline ID: %u", ThisTimeLineID)));

        /*
         * Create a comment for the history file to explain why and where
         * timeline changed.
         */
        if (recoveryTarget == RECOVERY_TARGET_XID)
            snprintf(reason, sizeof(reason),
                     "%s transaction %u",
                     recoveryStopAfter ? "after" : "before",
                     recoveryStopXid);
        else if (recoveryTarget == RECOVERY_TARGET_TIME)
            snprintf(reason, sizeof(reason),
                     "%s %s\n",
                     recoveryStopAfter ? "after" : "before",
                     timestamptz_to_str(recoveryStopTime));
        else if (recoveryTarget == RECOVERY_TARGET_NAME)
            snprintf(reason, sizeof(reason),
                     "at restore point \"%s\"",
                     recoveryStopName);
        else
            snprintf(reason, sizeof(reason), "no recovery target specified");

        writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
                             EndRecPtr, reason);
    }

    /* Save the selected TimeLineID in shared memory, too */
    XLogCtl->ThisTimeLineID = ThisTimeLineID;
    XLogCtl->PrevTimeLineID = PrevTimeLineID;

    /*
     * We are now done reading the old WAL.  Turn off archive fetching if it
     * was active, and make a writable copy of the last WAL segment. (Note
     * that we also have a copy of the last block of the old WAL in readBuf;
     * we will use that below.)
     */
    if (ArchiveRecoveryRequested)
        exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);

    /*
     * Prepare to write WAL starting at EndOfLog position, and init xlog
     * buffer cache using the block containing the last record from the
     * previous incarnation.
     */
    openLogSegNo = endLogSegNo;
    openLogFile = XLogFileOpen(openLogSegNo);
    openLogOff = 0;
    Insert = &XLogCtl->Insert;
    Insert->PrevRecord = LastRec;
    XLogCtl->xlblocks[0] = ((EndOfLog - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;

    /*
     * Tricky point here: readBuf contains the *last* block that the LastRec
     * record spans, not the one it starts in.  The last block is indeed the
     * one we want to use.
     */
    if (EndOfLog % XLOG_BLCKSZ == 0)
    {
        memset(Insert->currpage, 0, XLOG_BLCKSZ);
    }
    else
    {
        Assert(readOff == (XLogCtl->xlblocks[0] - XLOG_BLCKSZ) % XLogSegSize);
        memcpy((char *) Insert->currpage, xlogreader->readBuf, XLOG_BLCKSZ);
    }
    Insert->currpos = (char *) Insert->currpage +
        (EndOfLog + XLOG_BLCKSZ - XLogCtl->xlblocks[0]);

    LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;

    XLogCtl->LogwrtResult = LogwrtResult;

    XLogCtl->LogwrtRqst.Write = EndOfLog;
    XLogCtl->LogwrtRqst.Flush = EndOfLog;

    freespace = INSERT_FREESPACE(Insert);
    if (freespace > 0)
    {
        /* Make sure rest of page is zero */
        MemSet(Insert->currpos, 0, freespace);
        XLogCtl->Write.curridx = 0;
    }
    else
    {
        /*
         * Whenever LogwrtResult points to exactly the end of a page,
         * Write.curridx must point to the *next* page (see XLogWrite()).
         *
         * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
         * this is sufficient.  The first actual attempt to insert a log
         * record will advance the insert state.
         */
        XLogCtl->Write.curridx = NextBufIdx(0);
    }

    /* Pre-scan prepared transactions to find out the range of XIDs present */
    oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);

    /*
     * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
     * record before resource manager writes cleanup WAL records or checkpoint
     * record is written.
     */
    Insert->fullPageWrites = lastFullPageWrites;
    LocalSetXLogInsertAllowed();
    UpdateFullPageWrites();
    LocalXLogInsertAllowed = -1;

    if (InRecovery)
    {
        int         rmid;

        /*
         * Resource managers might need to write WAL records, eg, to record
         * index cleanup actions.  So temporarily enable XLogInsertAllowed in
         * this process only.
         */
        LocalSetXLogInsertAllowed();

        /*
         * Allow resource managers to do any required cleanup.
         */
        for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
        {
            if (RmgrTable[rmid].rm_cleanup != NULL)
                RmgrTable[rmid].rm_cleanup();
        }

        /* Disallow XLogInsert again */
        LocalXLogInsertAllowed = -1;

        /*
         * Perform a checkpoint to update all our recovery activity to disk.
         *
         * Note that we write a shutdown checkpoint rather than an on-line
         * one. This is not particularly critical, but since we may be
         * assigning a new TLI, using a shutdown checkpoint allows us to have
         * the rule that TLI only changes in shutdown checkpoints, which
         * allows some extra error checking in xlog_redo.
         *
         * In fast promotion, only create a lightweight end-of-recovery record
         * instead of a full checkpoint. A checkpoint is requested later, after
         * we're fully out of recovery mode and already accepting queries.
         */
        if (bgwriterLaunched)
        {
            if (fast_promote)
            {
                checkPointLoc = ControlFile->prevCheckPoint;

                /*
                 * Confirm the last checkpoint is available for us to recover
                 * from if we fail. Note that we don't check for the secondary
                 * checkpoint since that isn't available in most base backups.
                 */
                record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
                if (record != NULL)
                {
                    fast_promoted = true;
                    CreateEndOfRecoveryRecord();
                }
            }

            if (!fast_promoted)
                RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
                                    CHECKPOINT_IMMEDIATE |
                                    CHECKPOINT_WAIT);
        }
        else
            CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);

        /*
         * And finally, execute the recovery_end_command, if any.
         */
        if (recoveryEndCommand)
            ExecuteRecoveryCommand(recoveryEndCommand,
                                   "recovery_end_command",
                                   true);
    }

    /*
     * Preallocate additional log files, if wanted.
     */
    PreallocXlogFiles(EndOfLog);

    /*
     * Reset initial contents of unlogged relations.  This has to be done
     * AFTER recovery is complete so that any unlogged relations created
     * during recovery also get picked up.
     */
    if (InRecovery)
        ResetUnloggedRelations(UNLOGGED_RELATION_INIT);

    /*
     * Okay, we're officially UP.
     */
    InRecovery = false;

    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    ControlFile->state = DB_IN_PRODUCTION;
    ControlFile->time = (pg_time_t) time(NULL);
    UpdateControlFile();
    LWLockRelease(ControlFileLock);

    /* start the archive_timeout timer running */
    XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);

    /* also initialize latestCompletedXid, to nextXid - 1 */
    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
    TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
    LWLockRelease(ProcArrayLock);

    /*
     * Start up the commit log and subtrans, if not already done for hot
     * standby.
     */
    if (standbyState == STANDBY_DISABLED)
    {
        StartupCLOG();
        StartupSUBTRANS(oldestActiveXID);
    }

    /*
     * Perform end of recovery actions for any SLRUs that need it.
     */
    StartupMultiXact();
    TrimCLOG();

    /* Reload shared-memory state for prepared transactions */
    RecoverPreparedTransactions();

    /*
     * Shutdown the recovery environment. This must occur after
     * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
     */
    if (standbyState != STANDBY_DISABLED)
        ShutdownRecoveryTransactionEnvironment();

    /* Shut down xlogreader */
    if (readFile >= 0)
    {
        close(readFile);
        readFile = -1;
    }
    XLogReaderFree(xlogreader);

    /*
     * If any of the critical GUCs have changed, log them before we allow
     * backends to write WAL.
     */
    LocalSetXLogInsertAllowed();
    XLogReportParameters();

    /*
     * All done.  Allow backends to write WAL.  (Although the bool flag is
     * probably atomic in itself, we use the info_lck here to ensure that
     * there are no race conditions concerning visibility of other recent
     * updates to shared memory.)
     */
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        xlogctl->SharedRecoveryInProgress = false;
        SpinLockRelease(&xlogctl->info_lck);
    }

    /*
     * If there were cascading standby servers connected to us, nudge any
     * wal sender processes to notice that we've been promoted.
     */
    WalSndWakeup();

    /*
     * If this was a fast promotion, request an (online) checkpoint now. This
     * isn't required for consistency, but the last restartpoint might be far
     * back, and in case of a crash, recovering from it might take longer
     * than is appropriate now that we're not in standby mode anymore.
     */
    if (fast_promoted)
        RequestCheckpoint(0);
}

static char * str_time ( pg_time_t  tnow  )  [static]

Definition at line 4144 of file xlog.c.

References buf, log_timezone, pg_localtime(), and pg_strftime().

Referenced by StartupXLOG().

{
    /*
     * The result lives in static storage: the returned pointer remains
     * usable after we return, but every call overwrites the previous text.
     */
    static char buf[128];
    struct pg_tm *local_tm = pg_localtime(&tnow, log_timezone);

    /* Render tnow in the log_timezone zone as "YYYY-MM-DD HH:MM:SS ZONE" */
    pg_strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S %Z", local_tm);

    return buf;
}

void UpdateControlFile ( void   ) 

Definition at line 3742 of file xlog.c.

References BasicOpenFile(), close, COMP_CRC32, ControlFileData::crc, ereport, errcode_for_file_access(), errmsg(), FIN_CRC32, INIT_CRC32, offsetof, PANIC, PG_BINARY, pg_fsync(), write, and XLOG_CONTROL_FILE.

Referenced by CheckRecoveryConsistency(), CreateCheckPoint(), CreateEndOfRecoveryRecord(), CreateRestartPoint(), ReadRecord(), StartupXLOG(), UpdateMinRecoveryPoint(), xlog_redo(), and XLogReportParameters().

{
    const size_t ctl_size = sizeof(ControlFileData);
    int         ctl_fd;

    /*
     * Recompute the checksum stored in the control data.  It covers every
     * byte that precedes the crc field itself.
     */
    INIT_CRC32(ControlFile->crc);
    COMP_CRC32(ControlFile->crc, (char *) ControlFile, offsetof(ControlFileData, crc));
    FIN_CRC32(ControlFile->crc);

    /* Any failure from here on is reported at PANIC level. */
    ctl_fd = BasicOpenFile(XLOG_CONTROL_FILE, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
    if (ctl_fd < 0)
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not open control file \"%s\": %m",
                        XLOG_CONTROL_FILE)));

    /*
     * Clear errno first, so that a short write which leaves errno untouched
     * can be told apart from one that reported a specific error.
     */
    errno = 0;
    if (write(ctl_fd, ControlFile, ctl_size) != ctl_size)
    {
        /* write() gave no reason: assume we ran out of disk space */
        if (errno == 0)
            errno = ENOSPC;
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not write to control file: %m")));
    }

    /* Sync the file before closing it */
    if (pg_fsync(ctl_fd) != 0)
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not fsync control file: %m")));

    if (close(ctl_fd) != 0)
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not close control file: %m")));
}

void UpdateFullPageWrites ( void   ) 

Definition at line 7781 of file xlog.c.

References XLogRecData::buffer, XLogRecData::data, END_CRIT_SECTION, XLogCtlInsert::fullPageWrites, fullPageWrites, XLogCtlData::Insert, Insert(), XLogRecData::len, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), XLogRecData::next, RecoveryInProgress(), START_CRIT_SECTION, WALInsertLock, XLOG_FPW_CHANGE, XLogInsert(), and XLogStandbyInfoActive.

Referenced by StartupXLOG(), and UpdateSharedMemoryConfig().

{
    /*
     * Bring the shared-memory copy of full_page_writes
     * (XLogCtl->Insert.fullPageWrites) in line with this process's
     * fullPageWrites variable (presumably the current GUC setting -- confirm
     * against its definition), and write an XLOG_FPW_CHANGE record describing
     * the new value when XLogStandbyInfoActive() is true and we are not in
     * recovery.
     *
     * The whole update runs inside a critical section.
     */
    XLogCtlInsert *Insert = &XLogCtl->Insert;

    /*
     * Do nothing if full_page_writes has not been changed.
     *
     * It's safe to check the shared full_page_writes without the lock,
     * because we assume that there is no concurrently running process which
     * can update it.
     */
    if (fullPageWrites == Insert->fullPageWrites)
        return;

    START_CRIT_SECTION();

    /*
     * It's always safe to take full page images, even when not strictly
     * required, but not the other way round. So if we're setting
     * full_page_writes to true, first set it true and then write the WAL
     * record. If we're setting it to false, first write the WAL record and
     * then set the global flag.
     */
    if (fullPageWrites)
    {
        /* Turning it on: publish the shared flag before the WAL record */
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
        Insert->fullPageWrites = true;
        LWLockRelease(WALInsertLock);
    }

    /*
     * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
     * full_page_writes during archive recovery, if required.
     */
    if (XLogStandbyInfoActive() && !RecoveryInProgress())
    {
        XLogRecData rdata;

        /* The record payload is just the new bool value; no buffer attached */
        rdata.data = (char *) (&fullPageWrites);
        rdata.len = sizeof(bool);
        rdata.buffer = InvalidBuffer;
        rdata.next = NULL;

        XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
    }

    if (!fullPageWrites)
    {
        /* Turning it off: clear the shared flag only after the WAL record */
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
        Insert->fullPageWrites = false;
        LWLockRelease(WALInsertLock);
    }
    END_CRIT_SECTION();
}

static void UpdateLastRemovedPtr ( char *  filename  )  [static]

Definition at line 2867 of file xlog.c.

References XLogCtlData::info_lck, XLogCtlData::lastRemovedSegNo, SpinLockAcquire, SpinLockRelease, and XLogFromFileName.

Referenced by RemoveOldXlogFiles().

{
    /* use volatile pointer to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;
    XLogSegNo   removedSegNo;
    uint32      fileTLI;

    /*
     * Decode the file name into a timeline ID and a segment number.  Only
     * the segment number is used below.
     */
    XLogFromFileName(filename, &fileTLI, &removedSegNo);

    /* lastRemovedSegNo only ever moves forward; update it under info_lck */
    SpinLockAcquire(&shared->info_lck);
    if (shared->lastRemovedSegNo < removedSegNo)
        shared->lastRemovedSegNo = removedSegNo;
    SpinLockRelease(&shared->info_lck);
}

static void UpdateMinRecoveryPoint ( XLogRecPtr  lsn,
bool  force 
) [static]

Definition at line 1814 of file xlog.c.

References ControlFileLock, DEBUG2, elog, ereport, errmsg(), XLogCtlData::info_lck, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), ControlFileData::minRecoveryPoint, minRecoveryPoint, ControlFileData::minRecoveryPointTLI, minRecoveryPointTLI, XLogCtlData::replayEndRecPtr, XLogCtlData::replayEndTLI, SpinLockAcquire, SpinLockRelease, UpdateControlFile(), updateMinRecoveryPoint, and WARNING.

Referenced by CreateRestartPoint(), exitArchiveRecovery(), and XLogFlush().

{
    /*
     * Cheap exits based on our local copies of the state, taken without any
     * lock: either updates have been switched off, or (unless forced) the
     * requested LSN is already covered.
     */
    if (!updateMinRecoveryPoint)
        return;
    if (!force && lsn <= minRecoveryPoint)
        return;

    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

    /* refresh our local copy from the control file */
    minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
    minRecoveryPoint = ControlFile->minRecoveryPoint;

    if (minRecoveryPoint == 0)
    {
        /*
         * An invalid minRecoveryPoint means that we need to recover all the
         * WAL, i.e., we're doing crash recovery.  The control file's value
         * is never modified in that case, so remember to skip all future
         * checks as well.
         */
        updateMinRecoveryPoint = false;
    }
    else if (force || lsn > minRecoveryPoint)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;
        XLogRecPtr  replayEnd;
        TimeLineID  replayTLI;

        /*
         * Rather than advancing only as far as 'lsn', go all the way to the
         * end of the record currently being replayed.  That keeps control
         * file updates infrequent, and it means the 'force' case does not
         * need a valid 'lsn' at all.
         *
         * It also protects us from a bogus 'lsn', i.e., one past the end of
         * available WAL because the caller got it from a corrupted heap
         * page.  Accepting such a value as the min recovery point would
         * prevent us from coming up at all, so we merely log a warning and
         * carry on.  (See also the comments about corrupt LSNs in
         * XLogFlush.)
         */
        SpinLockAcquire(&xlogctl->info_lck);
        replayEnd = xlogctl->replayEndRecPtr;
        replayTLI = xlogctl->replayEndTLI;
        SpinLockRelease(&xlogctl->info_lck);

        if (!force && replayEnd < lsn)
            elog(WARNING,
                 "xlog min recovery request %X/%X is past current point %X/%X",
                 (uint32) (lsn >> 32), (uint32) lsn,
                 (uint32) (replayEnd >> 32), (uint32) replayEnd);

        /* write the control file only if the point really moves forward */
        if (ControlFile->minRecoveryPoint < replayEnd)
        {
            ControlFile->minRecoveryPoint = replayEnd;
            ControlFile->minRecoveryPointTLI = replayTLI;
            UpdateControlFile();

            /* keep the local copies in step with what was just written */
            minRecoveryPoint = replayEnd;
            minRecoveryPointTLI = replayTLI;

            ereport(DEBUG2,
                    (errmsg("updated min recovery point to %X/%X on timeline %u",
                            (uint32) (replayEnd >> 32),
                            (uint32) replayEnd,
                            replayTLI)));
        }
    }

    LWLockRelease(ControlFileLock);
}

static void ValidateXLOGDirectoryStructure ( void   )  [static]

Definition at line 3042 of file xlog.c.

References ereport, errmsg(), FATAL, LOG, MAXPGPATH, mkdir, snprintf(), and XLOGDIR.

Referenced by StartupXLOG().

{
    struct stat st;
    char        archiveStatusDir[MAXPGPATH];

    /*
     * The WAL directory (XLOGDIR) itself must already be present and be a
     * directory; we never try to create it.
     */
    if (!(stat(XLOGDIR, &st) == 0 && S_ISDIR(st.st_mode)))
        ereport(FATAL,
                (errmsg("required WAL directory \"%s\" does not exist",
                        XLOGDIR)));

    /* The archive_status subdirectory is created on demand if missing */
    snprintf(archiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status");

    if (stat(archiveStatusDir, &st) != 0)
    {
        /* stat() failed: treat the directory as missing and create it */
        ereport(LOG,
                (errmsg("creating missing WAL directory \"%s\"",
                        archiveStatusDir)));
        if (mkdir(archiveStatusDir, S_IRWXU) < 0)
            ereport(FATAL,
                    (errmsg("could not create missing directory \"%s\": %m",
                            archiveStatusDir)));
    }
    else if (!S_ISDIR(st.st_mode))
    {
        /* Weird case: something by that name exists but isn't a directory */
        ereport(FATAL,
                (errmsg("required WAL directory \"%s\" does not exist",
                        archiveStatusDir)));
    }
}

static bool WaitForWALToBecomeAvailable ( XLogRecPtr  RecPtr,
bool  randAccess,
bool  fetching_ckpt,
XLogRecPtr  tliRecPtr 
) [static]

Definition at line 9509 of file xlog.c.

References Assert, CheckForStandbyTrigger(), ControlFileData::checkPointCopy, close, curFileTLI, currentSource, DEBUG2, elog, ERROR, GetCurrentTimestamp(), GetWalRcvWriteRecPtr(), HandleStartupProcInterrupts(), InArchiveRecovery, lastSourceFailed, NULL, PANIC, pg_usleep(), PrimaryConnInfo, readFile, readSegNo, readSource, readTimeLineHistory(), receivedUpto, receiveTLI, recoveryTargetIsLatest, XLogCtlData::recoveryWakeupLatch, RedoStartLSN, RequestXLogStreaming(), rescanLatestTimeLine(), ResetLatch(), SetCurrentChunkStartTime(), ShutdownWalRcv(), StandbyMode, CheckPoint::ThisTimeLineID, tliOfPointInHistory(), WaitLatch(), WalRcvStreaming(), WL_LATCH_SET, WL_TIMEOUT, XLOG_FROM_ARCHIVE, XLOG_FROM_PG_XLOG, XLOG_FROM_STREAM, XLogFileRead(), XLogFileReadAnyTLI(), XLogReceiptSource, XLogReceiptTime, and xlogSourceNames.

Referenced by XLogPageRead().

{
    /*
     * Obtain the WAL needed to read at RecPtr, cycling through the available
     * sources until one of them delivers it.
     *
     * RecPtr        - WAL position we need; compared against receivedUpto
     *                 when streaming, and used as the streaming start point.
     * randAccess    - if true, curFileTLI is reset before reading a file
     *                 from the archive or pg_xlog.
     * fetching_ckpt - if true, streaming is requested from RedoStartLSN
     *                 rather than RecPtr (see comment below).
     * tliRecPtr     - position used to look up the timeline to stream from
     *                 in expectedTLEs.
     *
     * Returns true once the requested WAL is available with readFile open.
     * Returns false if the archive and pg_xlog have both failed and we are
     * either not in standby mode or the standby trigger has been found.
     */
    static pg_time_t last_fail_time = 0;
    pg_time_t now;

    /*-------
     * Standby mode is implemented by a state machine:
     *
     * 1. Read from archive (XLOG_FROM_ARCHIVE)
     * 2. Read from pg_xlog (XLOG_FROM_PG_XLOG)
     * 3. Check trigger file
     * 4. Read from primary server via walreceiver (XLOG_FROM_STREAM)
     * 5. Rescan timelines
     * 6. Sleep 5 seconds, and loop back to 1.
     *
     * Failure to read from the current source advances the state machine to
     * the next state. In addition, successfully reading a file from pg_xlog
     * moves the state machine from state 2 back to state 1 (we always prefer
     * files in the archive over files in pg_xlog).
     *
     * 'currentSource' indicates the current state. There are no currentSource
     * values for "check trigger", "rescan timelines", and "sleep" states,
     * those actions are taken when reading from the previous source fails, as
     * part of advancing to the next state.
     *-------
     */
    if (!InArchiveRecovery)
        currentSource = XLOG_FROM_PG_XLOG;
    else if (currentSource == 0)
        currentSource = XLOG_FROM_ARCHIVE;

    /*
     * Note: every "break" below leaves a switch statement, never this loop.
     * The loop is left only through the "return" statements inside it.
     */
    for (;;)
    {
        int     oldSource = currentSource;

        /*
         * First check if we failed to read from the current source, and
         * advance the state machine if so. The failure to read might've
         * happened outside this function, e.g when a CRC check fails on a
         * record, or within this loop.
         */
        if (lastSourceFailed)
        {
            switch (currentSource)
            {
                case XLOG_FROM_ARCHIVE:
                    currentSource = XLOG_FROM_PG_XLOG;
                    break;

                case XLOG_FROM_PG_XLOG:
                    /*
                     * Check to see if the trigger file exists. Note that we do
                     * this only after failure, so when you create the trigger
                     * file, we still finish replaying as much as we can from
                     * archive and pg_xlog before failover.
                     */
                    if (StandbyMode && CheckForStandbyTrigger())
                    {
                        ShutdownWalRcv();
                        return false;
                    }

                    /*
                     * Not in standby mode, and we've now tried the archive and
                     * pg_xlog.
                     */
                    if (!StandbyMode)
                        return false;

                    /*
                     * If primary_conninfo is set, launch walreceiver to try to
                     * stream the missing WAL.
                     *
                     * If fetching_ckpt is TRUE, RecPtr points to the initial
                     * checkpoint location. In that case, we use RedoStartLSN
                     * as the streaming start position instead of RecPtr, so
                     * that when we later jump backwards to start redo at
                     * RedoStartLSN, we will have the logs streamed already.
                     */
                    if (PrimaryConnInfo)
                    {
                        XLogRecPtr ptr;
                        TimeLineID tli;

                        if (fetching_ckpt)
                        {
                            ptr = RedoStartLSN;
                            tli = ControlFile->checkPointCopy.ThisTimeLineID;
                        }
                        else
                        {
                            ptr = RecPtr;
                            tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);

                            if (curFileTLI > 0 && tli < curFileTLI)
                                elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
                                     (uint32) (ptr >> 32), (uint32) ptr,
                                     tli, curFileTLI);
                        }
                        curFileTLI = tli;
                        RequestXLogStreaming(curFileTLI, ptr, PrimaryConnInfo);
                    }
                    /*
                     * Move to XLOG_FROM_STREAM state in either case. We'll get
                     * immediate failure if we didn't launch walreceiver, and
                     * move on to the next state.
                     */
                    currentSource = XLOG_FROM_STREAM;
                    break;

                case XLOG_FROM_STREAM:
                    /*
                     * Failure while streaming. Most likely, we got here because
                     * streaming replication was terminated, or promotion was
                     * triggered. But we also get here if we find an invalid
                     * record in the WAL streamed from master, in which case
                     * something is seriously wrong. There's little chance that
                     * the problem will just go away, but PANIC is not good for
                     * availability either, especially in hot standby mode. So,
                     * we treat that the same as disconnection, and retry from
                     * archive/pg_xlog again. The WAL in the archive should be
                     * identical to what was streamed, so it's unlikely that it
                     * helps, but one can hope...
                     */
                    /*
                     * Before we leave XLOG_FROM_STREAM state, make sure that
                     * walreceiver is not active, so that it won't overwrite
                     * WAL that we restore from archive.
                     */
                    if (WalRcvStreaming())
                        ShutdownWalRcv();

                    /*
                     * Before we sleep, re-scan for possible new timelines if
                     * we were requested to recover to the latest timeline.
                     */
                    if (recoveryTargetIsLatest)
                    {
                        if (rescanLatestTimeLine())
                        {
                            currentSource = XLOG_FROM_ARCHIVE;
                            break;
                        }
                    }

                    /*
                     * XLOG_FROM_STREAM is the last state in our state machine,
                     * so we've exhausted all the options for obtaining the
                     * requested WAL. We're going to loop back and retry from
                     * the archive, but if it hasn't been long since last
                     * attempt, sleep 5 seconds to avoid busy-waiting.
                     */
                    now = (pg_time_t) time(NULL);
                    if ((now - last_fail_time) < 5)
                    {
                        pg_usleep(1000000L * (5 - (now - last_fail_time)));
                        now = (pg_time_t) time(NULL);
                    }
                    last_fail_time = now;
                    currentSource = XLOG_FROM_ARCHIVE;
                    break;

                default:
                    elog(ERROR, "unexpected WAL source %d", currentSource);
            }
        }
        else if (currentSource == XLOG_FROM_PG_XLOG)
        {
            /*
             * We just successfully read a file in pg_xlog. We prefer files
             * in the archive over ones in pg_xlog, so try the next file
             * again from the archive first.
             */
            if (InArchiveRecovery)
                currentSource = XLOG_FROM_ARCHIVE;
        }

        if (currentSource != oldSource)
            elog(DEBUG2, "switched WAL source from %s to %s after %s",
                 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
                 lastSourceFailed ? "failure" : "success");

        /*
         * We've now handled possible failure. Try to read from the chosen
         * source.
         */
        lastSourceFailed = false;

        switch (currentSource)
        {
            case XLOG_FROM_ARCHIVE:
            case XLOG_FROM_PG_XLOG:
                /* Close any old file we might have open. */
                if (readFile >= 0)
                {
                    close(readFile);
                    readFile = -1;
                }
                /* Reset curFileTLI if random fetch. */
                if (randAccess)
                    curFileTLI = 0;

                /*
                 * Try to restore the file from archive, or read an existing
                 * file from pg_xlog.
                 */
                readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2, currentSource);
                if (readFile >= 0)
                    return true;    /* success! */

                /*
                 * Nope, not found in archive or pg_xlog.
                 */
                lastSourceFailed = true;
                break;

            case XLOG_FROM_STREAM:
            {
                bool        havedata;

                /*
                 * Check if WAL receiver is still active.
                 */
                if (!WalRcvStreaming())
                {
                    lastSourceFailed = true;
                    break;
                }

                /*
                 * Walreceiver is active, so see if new data has arrived.
                 *
                 * We only advance XLogReceiptTime when we obtain fresh WAL
                 * from walreceiver and observe that we had already processed
                 * everything before the most recent "chunk" that it flushed to
                 * disk.  In steady state where we are keeping up with the
                 * incoming data, XLogReceiptTime will be updated on each cycle.
                 * When we are behind, XLogReceiptTime will not advance, so the
                 * grace time allotted to conflicting queries will decrease.
                 */
                if (RecPtr < receivedUpto)
                    havedata = true;
                else
                {
                    XLogRecPtr  latestChunkStart;

                    receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
                    if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
                    {
                        havedata = true;
                        if (latestChunkStart <= RecPtr)
                        {
                            XLogReceiptTime = GetCurrentTimestamp();
                            SetCurrentChunkStartTime(XLogReceiptTime);
                        }
                    }
                    else
                        havedata = false;
                }
                if (havedata)
                {
                    /*
                     * Great, streamed far enough.  Open the file if it's not
                     * open already.  Also read the timeline history file if
                     * we haven't initialized timeline history yet; it should
                     * be streamed over and present in pg_xlog by now.  Use
                     * XLOG_FROM_STREAM so that source info is set correctly
                     * and XLogReceiptTime isn't changed.
                     *
                     * After opening the file we go around the loop once
                     * more; with readFile now valid, the next pass takes the
                     * "else" branch below and returns true.
                     */
                    if (readFile < 0)
                    {
                        if (!expectedTLEs)
                            expectedTLEs = readTimeLineHistory(receiveTLI);
                        readFile = XLogFileRead(readSegNo, PANIC,
                                                receiveTLI,
                                                XLOG_FROM_STREAM, false);
                        Assert(readFile >= 0);
                    }
                    else
                    {
                        /* just make sure source info is correct... */
                        readSource = XLOG_FROM_STREAM;
                        XLogReceiptSource = XLOG_FROM_STREAM;
                        return true;
                    }
                    break;
                }

                /*
                 * Data not here yet. Check for trigger, then wait for
                 * walreceiver to wake us up when new WAL arrives.
                 */
                if (CheckForStandbyTrigger())
                {
                    /*
                     * Note that we don't "return false" immediately here.
                     * After being triggered, we still want to replay all the
                     * WAL that was already streamed. It's in pg_xlog now, so
                     * we just treat this as a failure, and the state machine
                     * will move on to replay the streamed WAL from pg_xlog,
                     * and then recheck the trigger and exit replay.
                     */
                    lastSourceFailed = true;
                    break;
                }

                /*
                 * Wait for more WAL to arrive. Time out after 5 seconds, like
                 * when polling the archive, to react to a trigger file
                 * promptly.
                 */
                WaitLatch(&XLogCtl->recoveryWakeupLatch,
                          WL_LATCH_SET | WL_TIMEOUT,
                          5000L);
                ResetLatch(&XLogCtl->recoveryWakeupLatch);
                break;
            }

            default:
                elog(ERROR, "unexpected WAL source %d", currentSource);
        }

        /*
         * This possibly-long loop needs to handle interrupts of startup
         * process.
         */
        HandleStartupProcInterrupts();
    }

    return false;               /* not reached */
}

void WakeupRecovery ( void   ) 
static void WriteControlFile ( void   )  [static]

Definition at line 3475 of file xlog.c.

References BasicOpenFile(), ControlFileData::blcksz, ControlFileData::catalog_version_no, close, COMP_CRC32, ControlFileData::crc, elog, ControlFileData::enableIntTimes, ereport, errcode_for_file_access(), errmsg(), FIN_CRC32, ControlFileData::float4ByVal, ControlFileData::float8ByVal, ControlFileData::floatFormat, ControlFileData::indexMaxKeys, INIT_CRC32, ControlFileData::maxAlign, ControlFileData::nameDataLen, offsetof, PANIC, PG_BINARY, PG_CONTROL_SIZE, ControlFileData::pg_control_version, pg_fsync(), ControlFileData::relseg_size, ControlFileData::toast_max_chunk_size, write, ControlFileData::xlog_blcksz, XLOG_CONTROL_FILE, and ControlFileData::xlog_seg_size.

Referenced by BootStrapXLOG().

{
    char        page[PG_CONTROL_SIZE];      /* alignment does not matter */
    int         ctl_fd;

    /*
     * Stamp the version and build-configuration fields that are used for
     * compatibility checking.  These stores are independent of one another.
     */
    ControlFile->pg_control_version = PG_CONTROL_VERSION;
    ControlFile->catalog_version_no = CATALOG_VERSION_NO;

    ControlFile->blcksz = BLCKSZ;
    ControlFile->relseg_size = RELSEG_SIZE;
    ControlFile->xlog_blcksz = XLOG_BLCKSZ;
    ControlFile->xlog_seg_size = XLOG_SEG_SIZE;

    ControlFile->maxAlign = MAXIMUM_ALIGNOF;
    ControlFile->floatFormat = FLOATFORMAT_VALUE;
    ControlFile->float4ByVal = FLOAT4PASSBYVAL;
    ControlFile->float8ByVal = FLOAT8PASSBYVAL;

    ControlFile->nameDataLen = NAMEDATALEN;
    ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
    ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;

#ifndef HAVE_INT64_TIMESTAMP
    ControlFile->enableIntTimes = false;
#else
    ControlFile->enableIntTimes = true;
#endif

    /* The CRC covers every byte of the struct that precedes the crc field */
    INIT_CRC32(ControlFile->crc);
    COMP_CRC32(ControlFile->crc,
               (char *) ControlFile,
               offsetof(ControlFileData, crc));
    FIN_CRC32(ControlFile->crc);

    /*
     * The on-disk image is always PG_CONTROL_SIZE bytes: the struct followed
     * by zero padding.  Padding the file out reduces the odds of
     * premature-EOF errors when pg_control is read back; a damaged file
     * will still be rejected when its contents are checked, but hopefully
     * with a more specific error than "couldn't read pg_control".
     */
    if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
        elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");

    /* Struct first, then zero whatever is left of the page. */
    memcpy(page, ControlFile, sizeof(ControlFileData));
    memset(page + sizeof(ControlFileData), 0,
           PG_CONTROL_SIZE - sizeof(ControlFileData));

    /* O_EXCL: the file is created here and must not already exist. */
    ctl_fd = BasicOpenFile(XLOG_CONTROL_FILE,
                           O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
                           S_IRUSR | S_IWUSR);
    if (ctl_fd < 0)
    {
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not create control file \"%s\": %m",
                        XLOG_CONTROL_FILE)));
    }

    /* Clear errno so a short write that leaves it untouched is detectable. */
    errno = 0;
    if (write(ctl_fd, page, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
    {
        /* write() gave no reason: assume we ran out of disk space */
        if (errno == 0)
            errno = ENOSPC;
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not write to control file: %m")));
    }

    if (pg_fsync(ctl_fd) != 0)
    {
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not fsync control file: %m")));
    }

    if (close(ctl_fd) != 0)
    {
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not close control file: %m")));
    }
}

void xlog_redo ( XLogRecPtr  lsn,
XLogRecord record 
)

Definition at line 7890 of file xlog.c.

References ArchiveRecoveryRequested, Assert, ControlFileData::backupEndPoint, ControlFileData::backupEndRequired, ControlFileData::backupStartPoint, ControlFileData::checkPointCopy, CheckRequiredParameterValues(), XLogCtlData::ckptXid, XLogCtlData::ckptXidEpoch, ControlFileLock, DEBUG1, elog, ereport, errmsg(), XLogCtlData::info_lck, XLogCtlData::lastFpwDisableRecPtr, lastFullPageWrites, RunningTransactionsData::latestCompletedXid, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), xl_parameter_change::max_locks_per_xact, ControlFileData::max_locks_per_xact, xl_parameter_change::max_prepared_xacts, ControlFileData::max_prepared_xacts, xl_parameter_change::MaxConnections, ControlFileData::MaxConnections, minRecoveryPoint, ControlFileData::minRecoveryPoint, minRecoveryPointTLI, ControlFileData::minRecoveryPointTLI, MultiXactAdvanceNextMXact(), MultiXactAdvanceOldest(), MultiXactSetNextMXact(), CheckPoint::nextMulti, CheckPoint::nextMultiOffset, CheckPoint::nextOid, VariableCacheData::nextOid, RunningTransactionsData::nextXid, CheckPoint::nextXid, VariableCacheData::nextXid, CheckPoint::nextXidEpoch, VariableCacheData::oidCount, OidGenLock, CheckPoint::oldestMulti, CheckPoint::oldestMultiDB, RunningTransactionsData::oldestRunningXid, VariableCacheData::oldestXid, CheckPoint::oldestXid, CheckPoint::oldestXidDB, PANIC, PrescanPreparedTransactions(), ProcArrayApplyRecoveryInfo(), ReadRecPtr, RecoveryRestartPoint(), RestoreBackupBlockContents(), SetMultiXactIdLimit(), SetTransactionIdLimit(), ShmemVariableCache, SpinLockAcquire, SpinLockRelease, STANDBY_INITIALIZED, StandbyRecoverPreparedTransactions(), standbyState, RunningTransactionsData::subxcnt, RunningTransactionsData::subxid_overflow, xl_end_of_recovery::ThisTimeLineID, ThisTimeLineID, CheckPoint::ThisTimeLineID, TransactionIdIsNormal, TransactionIdPrecedes(), TransactionIdRetreat, UpdateControlFile(), xl_parameter_change::wal_level, ControlFileData::wal_level, RunningTransactionsData::xcnt, XidGenLock, 
RunningTransactionsData::xids, XLogRecord::xl_info, XLOG_BACKUP_END, XLOG_CHECKPOINT_ONLINE, XLOG_CHECKPOINT_SHUTDOWN, XLOG_END_OF_RECOVERY, XLOG_FPW_CHANGE, XLOG_HINT, XLOG_NEXTOID, XLOG_NOOP, XLOG_PARAMETER_CHANGE, XLOG_RESTORE_POINT, XLOG_SWITCH, XLogRecGetData, XLogRecPtrIsInvalid, and XLR_BKP_BLOCK_MASK.

{
    /*
     * Replay one record belonging to the XLOG resource manager.
     *
     * lsn    - passed to RestoreBackupBlockContents for XLOG_HINT records,
     *          and used as the new floor for minRecoveryPoint when
     *          replaying XLOG_BACKUP_END and XLOG_PARAMETER_CHANGE records.
     * record - the record to replay; its xl_info bits (with XLR_INFO_MASK
     *          removed) select the action, and its data area carries the
     *          payload.
     *
     * An info value that matches none of the branches below is silently
     * ignored.
     */
    uint8       info = record->xl_info & ~XLR_INFO_MASK;

    /* Backup blocks are not used by XLOG rmgr */
    Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));

    if (info == XLOG_NEXTOID)
    {
        Oid         nextOid;

        /*
         * We used to try to take the maximum of ShmemVariableCache->nextOid
         * and the recorded nextOid, but that fails if the OID counter wraps
         * around.  Since no OID allocation should be happening during replay
         * anyway, better to just believe the record exactly.  We still take
         * OidGenLock while setting the variable, just in case.
         */
        memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
        LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
        ShmemVariableCache->nextOid = nextOid;
        ShmemVariableCache->oidCount = 0;
        LWLockRelease(OidGenLock);
    }
    else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    {
        CheckPoint  checkPoint;

        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        /* In a SHUTDOWN checkpoint, believe the counters exactly */
        LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
        ShmemVariableCache->nextXid = checkPoint.nextXid;
        LWLockRelease(XidGenLock);
        LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
        ShmemVariableCache->nextOid = checkPoint.nextOid;
        ShmemVariableCache->oidCount = 0;
        LWLockRelease(OidGenLock);
        MultiXactSetNextMXact(checkPoint.nextMulti,
                              checkPoint.nextMultiOffset);
        SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
        SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);

        /*
         * If we see a shutdown checkpoint while waiting for an end-of-backup
         * record, the backup was canceled and the end-of-backup record will
         * never arrive.
         */
        if (ArchiveRecoveryRequested &&
            !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
            XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
            ereport(PANIC,
                    (errmsg("online backup was canceled, recovery cannot continue")));

        /*
         * If we see a shutdown checkpoint, we know that nothing was running
         * on the master at this point. So fake-up an empty running-xacts
         * record and use that here and now. Recover additional standby state
         * for prepared transactions.
         */
        if (standbyState >= STANDBY_INITIALIZED)
        {
            TransactionId *xids;
            int         nxids;
            TransactionId oldestActiveXID;
            TransactionId latestCompletedXid;
            RunningTransactionsData running;

            oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);

            /*
             * Construct a RunningTransactions snapshot representing a shut
             * down server, with only prepared transactions still alive. We're
             * never overflowed at this point because all subxids are listed
             * with their parent prepared transactions.
             */
            running.xcnt = nxids;
            running.subxcnt = 0;
            running.subxid_overflow = false;
            running.nextXid = checkPoint.nextXid;
            running.oldestRunningXid = oldestActiveXID;
            /* latestCompletedXid is the XID just before the checkpoint's nextXid */
            latestCompletedXid = checkPoint.nextXid;
            TransactionIdRetreat(latestCompletedXid);
            Assert(TransactionIdIsNormal(latestCompletedXid));
            running.latestCompletedXid = latestCompletedXid;
            running.xids = xids;

            ProcArrayApplyRecoveryInfo(&running);

            StandbyRecoverPreparedTransactions(true);
        }

        /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
        ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
        ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

        /* Update shared-memory copy of checkpoint XID/epoch */
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile XLogCtlData *xlogctl = XLogCtl;

            SpinLockAcquire(&xlogctl->info_lck);
            xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
            xlogctl->ckptXid = checkPoint.nextXid;
            SpinLockRelease(&xlogctl->info_lck);
        }

        /*
         * We should've already switched to the new TLI before replaying this
         * record.
         */
        if (checkPoint.ThisTimeLineID != ThisTimeLineID)
            ereport(PANIC,
                    (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
                            checkPoint.ThisTimeLineID, ThisTimeLineID)));

        RecoveryRestartPoint(&checkPoint);
    }
    else if (info == XLOG_CHECKPOINT_ONLINE)
    {
        CheckPoint  checkPoint;

        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        /* In an ONLINE checkpoint, treat the XID counter as a minimum */
        LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
        if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
                                  checkPoint.nextXid))
            ShmemVariableCache->nextXid = checkPoint.nextXid;
        LWLockRelease(XidGenLock);
        /* ... but still treat OID counter as exact */
        LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
        ShmemVariableCache->nextOid = checkPoint.nextOid;
        ShmemVariableCache->oidCount = 0;
        LWLockRelease(OidGenLock);
        MultiXactAdvanceNextMXact(checkPoint.nextMulti,
                                  checkPoint.nextMultiOffset);
        /* Only move the oldest-XID limit forward, never backward */
        if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
                                  checkPoint.oldestXid))
            SetTransactionIdLimit(checkPoint.oldestXid,
                                  checkPoint.oldestXidDB);
        MultiXactAdvanceOldest(checkPoint.oldestMulti,
                               checkPoint.oldestMultiDB);

        /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
        ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
        ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;

        /* Update shared-memory copy of checkpoint XID/epoch */
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile XLogCtlData *xlogctl = XLogCtl;

            SpinLockAcquire(&xlogctl->info_lck);
            xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
            xlogctl->ckptXid = checkPoint.nextXid;
            SpinLockRelease(&xlogctl->info_lck);
        }

        /* TLI should not change in an on-line checkpoint */
        if (checkPoint.ThisTimeLineID != ThisTimeLineID)
            ereport(PANIC,
                    (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
                            checkPoint.ThisTimeLineID, ThisTimeLineID)));

        RecoveryRestartPoint(&checkPoint);
    }
    else if (info == XLOG_END_OF_RECOVERY)
    {
        xl_end_of_recovery xlrec;

        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));

        /*
         * For Hot Standby, we could treat this like a Shutdown Checkpoint,
         * but this case is rarer and harder to test, so the benefit doesn't
         * outweigh the potential extra cost of maintenance.
         */

        /*
         * We should've already switched to the new TLI before replaying this
         * record.  The message names the end-of-recovery record explicitly
         * so that it cannot be mistaken for the checkpoint-record failures
         * reported by the branches above.
         */
        if (xlrec.ThisTimeLineID != ThisTimeLineID)
            ereport(PANIC,
                    (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
                            xlrec.ThisTimeLineID, ThisTimeLineID)));
    }
    else if (info == XLOG_NOOP)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_SWITCH)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_RESTORE_POINT)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_HINT)
    {
        char *data;
        BkpBlock bkpb;

        /*
         * Hint bit records contain a backup block stored "inline" in the normal
         * data since the locking when writing hint records isn't sufficient to
         * use the normal backup block mechanism, which assumes exclusive lock
         * on the buffer supplied.
         *
         * Since the only change in these backup blocks is hint bits, there are
         * no recovery conflicts generated.
         *
         * This also means there is no corresponding API call for this,
         * so an smgr implementation has no need to implement anything.
         * Which means nothing is needed in md.c etc
         */
        data = XLogRecGetData(record);
        /* payload layout: BkpBlock header immediately followed by block data */
        memcpy(&bkpb, data, sizeof(BkpBlock));
        data += sizeof(BkpBlock);

        RestoreBackupBlockContents(lsn, bkpb, data, false, false);
    }
    else if (info == XLOG_BACKUP_END)
    {
        XLogRecPtr  startpoint;

        memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

        /* Act only if this record ends the backup we are recovering from */
        if (ControlFile->backupStartPoint == startpoint)
        {
            /*
             * We have reached the end of base backup, the point where
             * pg_stop_backup() was done. The data on disk is now consistent.
             * Reset backupStartPoint, and update minRecoveryPoint to make
             * sure we don't allow starting up at an earlier point even if
             * recovery is stopped and restarted soon after this.
             */
            elog(DEBUG1, "end of backup reached");

            LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

            if (ControlFile->minRecoveryPoint < lsn)
            {
                ControlFile->minRecoveryPoint = lsn;
                ControlFile->minRecoveryPointTLI = ThisTimeLineID;
            }
            ControlFile->backupStartPoint = InvalidXLogRecPtr;
            ControlFile->backupEndRequired = false;
            UpdateControlFile();

            LWLockRelease(ControlFileLock);
        }
    }
    else if (info == XLOG_PARAMETER_CHANGE)
    {
        xl_parameter_change xlrec;

        /* Update our copy of the parameters in pg_control */
        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));

        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->MaxConnections = xlrec.MaxConnections;
        ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
        ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
        ControlFile->wal_level = xlrec.wal_level;

        /*
         * Update minRecoveryPoint to ensure that if recovery is aborted, we
         * recover back up to this point before allowing hot standby again.
         * This is particularly important if wal_level was set to 'archive'
         * before, and is now 'hot_standby', to ensure you don't run queries
         * against the WAL preceding the wal_level change. Same applies to
         * decreasing max_* settings.
         *
         * The local copies are refreshed from the control file first; a
         * zero minRecoveryPoint is left untouched.
         */
        minRecoveryPoint = ControlFile->minRecoveryPoint;
        minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
        if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
        {
            ControlFile->minRecoveryPoint = lsn;
            ControlFile->minRecoveryPointTLI = ThisTimeLineID;
        }

        UpdateControlFile();
        LWLockRelease(ControlFileLock);

        /* Check to see if any changes to max_connections give problems */
        CheckRequiredParameterValues();
    }
    else if (info == XLOG_FPW_CHANGE)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;
        bool        fpw;

        memcpy(&fpw, XLogRecGetData(record), sizeof(bool));

        /*
         * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
         * do_pg_start_backup() and do_pg_stop_backup() can check whether
         * full_page_writes has been disabled during online backup.
         */
        if (!fpw)
        {
            SpinLockAcquire(&xlogctl->info_lck);
            if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
                xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
            SpinLockRelease(&xlogctl->info_lck);
        }

        /* Keep track of full_page_writes */
        lastFullPageWrites = fpw;
    }
}

bool XLogBackgroundFlush ( void   ) 

Definition at line 2072 of file xlog.c.

References XLogCtlData::asyncXactLSN, elog, END_CRIT_SECTION, XLogwrtRqst::Flush, XLogwrtResult::Flush, XLogCtlData::info_lck, LOG, XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), openLogFile, openLogSegNo, RecoveryInProgress(), SpinLockAcquire, SpinLockRelease, START_CRIT_SECTION, WalSndWakeupProcessRequests, WALWriteLock, XLogwrtResult::Write, XLogwrtRqst::Write, XLByteInPrevSeg, XLogFileClose(), and XLogWrite().

Referenced by WalWriterMain().

{
    /*
     * Write and flush WAL on behalf of a background caller.
     *
     * The target is the current write request rounded down to a page
     * boundary; if that much is already flushed, the target falls back to
     * asyncXactLSN.  Returns true only when XLogWrite() was actually called,
     * and false otherwise (including whenever recovery is in progress).
     */
    XLogRecPtr  WriteRqstPtr;
    bool        flexible = true;    /* passed through to XLogWrite() */
    bool        wrote_something = false;

    /* XLOG doesn't need flushing during recovery */
    if (RecoveryInProgress())
        return false;

    /* read LogwrtResult and update local state */
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        LogwrtResult = xlogctl->LogwrtResult;
        WriteRqstPtr = xlogctl->LogwrtRqst.Write;
        SpinLockRelease(&xlogctl->info_lck);
    }

    /* back off to last completed page boundary */
    WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;

    /* if we have already flushed that far, consider async commit records */
    if (WriteRqstPtr <= LogwrtResult.Flush)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        WriteRqstPtr = xlogctl->asyncXactLSN;
        SpinLockRelease(&xlogctl->info_lck);
        flexible = false;       /* ensure it all gets written */
    }

    /*
     * If already known flushed, we're done. Just need to check if we are
     * holding an open file handle to a logfile that's no longer in use,
     * preventing the file from being deleted.
     */
    if (WriteRqstPtr <= LogwrtResult.Flush)
    {
        if (openLogFile >= 0)
        {
            /* close the cached segment if the write position has left it */
            if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
            {
                XLogFileClose();
            }
        }
        return false;
    }

#ifdef WAL_DEBUG
    if (XLOG_DEBUG)
        elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
             (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
             (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
             (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

    START_CRIT_SECTION();

    /* now wait for the write lock */
    LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
    /*
     * Re-read the shared result now that the lock is held, and only write if
     * the target is still ahead of the flushed position.
     */
    LogwrtResult = XLogCtl->LogwrtResult;
    if (WriteRqstPtr > LogwrtResult.Flush)
    {
        XLogwrtRqst WriteRqst;

        /* request both write and flush up to the same target */
        WriteRqst.Write = WriteRqstPtr;
        WriteRqst.Flush = WriteRqstPtr;
        XLogWrite(WriteRqst, flexible, false);
        wrote_something = true;
    }
    LWLockRelease(WALWriteLock);

    END_CRIT_SECTION();

    /* wake up walsenders now that we've released heavily contended locks */
    WalSndWakeupProcessRequests();

    return wrote_something;
}

static bool XLogCheckBuffer ( XLogRecData *  rdata,
bool  holdsExclusiveLock,
XLogRecPtr *  lsn,
BkpBlock *  bkpb 
) [static]

Definition at line 1246 of file xlog.c.

References BkpBlock::block, XLogRecData::buffer, XLogRecData::buffer_std, BufferGetLSNAtomic(), BufferGetPage, BufferGetTag(), BkpBlock::fork, BkpBlock::hole_length, BkpBlock::hole_offset, lower(), BkpBlock::node, PageGetLSN, RedoRecPtr, SizeOfPageHeaderData, and upper().

Referenced by XLogInsert(), and XLogSaveBufferForHint().

{
    Page        bufpage = BufferGetPage(rdata->buffer);
    uint16      hole_start = 0; /* default: no "hole" to compress out */
    uint16      hole_size = 0;

    /*
     * The page LSN is assumed to be the first data on *every* page that can
     * reach XLogInsert, standard layout or not.  When the caller holds an
     * exclusive lock on the page and/or the relation, the LSN can be read
     * directly; otherwise go through BufferGetLSNAtomic().
     */
    if (!holdsExclusiveLock)
        *lsn = BufferGetLSNAtomic(rdata->buffer);
    else
        *lsn = PageGetLSN(bufpage);

    /* Page LSN is past the redo pointer: buffer does not need backup */
    if (*lsn > RedoRecPtr)
        return false;

    /*
     * The page needs to be backed up, so fill in *bkpb.
     */
    BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);

    /*
     * Only a standard page header lets us try to omit the data between
     * pd_lower and pd_upper, and only when those bounds look sane.
     */
    if (rdata->buffer_std)
    {
        uint16      pd_lo = ((PageHeader) bufpage)->pd_lower;
        uint16      pd_hi = ((PageHeader) bufpage)->pd_upper;

        if (pd_lo >= SizeOfPageHeaderData &&
            pd_hi > pd_lo &&
            pd_hi <= BLCKSZ)
        {
            hole_start = pd_lo;
            hole_size = pd_hi - pd_lo;
        }
    }

    bkpb->hole_offset = hole_start;
    bkpb->hole_length = hole_size;

    return true;                /* buffer requires backup */
}

static bool XLogCheckpointNeeded ( XLogSegNo  new_segno  )  [static]

Definition at line 1469 of file xlog.c.

References CheckPointSegments, RedoRecPtr, and XLByteToSeg.

Referenced by XLogPageRead(), and XLogWrite().

{
    XLogSegNo   redo_segno;

    /* Segment number that contains the current redo pointer */
    XLByteToSeg(RedoRecPtr, redo_segno);

    /*
     * True once new_segno is at least (CheckPointSegments - 1) segments
     * past the redo pointer's segment.
     */
    return new_segno >= redo_segno + (uint64) (CheckPointSegments - 1);
}

static int XLOGChooseNumBuffers ( void   )  [static]

Definition at line 3840 of file xlog.c.

References NBuffers.

Referenced by check_wal_buffers(), and XLOGShmemSize().

{
    /* upper bound: one WAL segment's worth of pages */
    const int   max_buffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
    int         nbuffers = NBuffers / 32;

    /* apply the ceiling first, then the floor of 8 */
    if (nbuffers > max_buffers)
        nbuffers = max_buffers;
    if (nbuffers < 8)
        nbuffers = 8;

    return nbuffers;
}

static void XLogFileClose ( void   )  [static]

Definition at line 2780 of file xlog.c.

References Assert, close, ereport, errcode_for_file_access(), errmsg(), openLogFile, openLogSegNo, PANIC, ThisTimeLineID, XLogFileNameP(), and XLogIsNeeded.

Referenced by assign_xlog_sync_method(), XLogBackgroundFlush(), and XLogWrite().

{
    /* Caller must have a log file open */
    Assert(openLogFile >= 0);

    /*
     * In normal operation WAL segments are not read back, so hint to the OS
     * that their cached pages can be dropped.  Skip the hint when WAL
     * archiving or streaming is active, since the archiver and walsender
     * processes could make use of the cache when reading the segment.
     */
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
    if (!XLogIsNeeded())
        (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif

    /* A failed close is reported at PANIC level */
    if (close(openLogFile) != 0)
    {
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not close log file %s: %m",
                        XLogFileNameP(ThisTimeLineID, openLogSegNo))));
    }

    /* Mark that no log file is open any more */
    openLogFile = -1;
}

static void XLogFileCopy ( XLogSegNo  destsegno,
TimeLineID  srcTLI,
XLogSegNo  srcsegno 
) [static]

Definition at line 2402 of file xlog.c.

References CloseTransientFile(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, InstallXLogFileSegment(), MAXPGPATH, NULL, OpenTransientFile(), PG_BINARY, pg_fsync(), read, snprintf(), unlink(), write, XLOGDIR, and XLogFilePath.

Referenced by exitArchiveRecovery().

{
    /*
     * Copy the WAL segment identified by (srcTLI, srcsegno) into a
     * process-specific temporary file, one XLOG_BLCKSZ-sized buffer at a
     * time, fsync it, and then install it as segment destsegno through
     * InstallXLogFileSegment().  Any failure is reported with ERROR.
     */
    char        path[MAXPGPATH];
    char        tmppath[MAXPGPATH];
    char        buffer[XLOG_BLCKSZ];
    int         srcfd;
    int         fd;
    int         nbytes;

    /*
     * Open the source file
     */
    XLogFilePath(path, srcTLI, srcsegno);
    srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
    if (srcfd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));

    /*
     * Copy into a temp file name.
     */
    snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

    /* remove any leftover temp file; the O_EXCL open below needs it gone */
    unlink(tmppath);

    /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
                           S_IRUSR | S_IWUSR);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not create file \"%s\": %m", tmppath)));

    /*
     * Do the data copying.
     */
    for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
    {
        /* clear errno so a short read can be told apart from a read error */
        errno = 0;
        if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
        {
            if (errno != 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not read file \"%s\": %m", path)));
            else
                ereport(ERROR,
                        (errmsg("not enough data in file \"%s\"", path)));
        }
        errno = 0;
        if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
        {
            /* remember write()'s errno before unlink() can overwrite it */
            int         save_errno = errno;

            /*
             * If we fail to make the file, delete it to release disk space
             */
            unlink(tmppath);
            /* if write didn't set errno, assume problem is no disk space */
            errno = save_errno ? save_errno : ENOSPC;

            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not write to file \"%s\": %m", tmppath)));
        }
    }

    if (pg_fsync(fd) != 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not fsync file \"%s\": %m", tmppath)));

    if (CloseTransientFile(fd))
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not close file \"%s\": %m", tmppath)));

    /* the result of closing the read-only source is not checked */
    CloseTransientFile(srcfd);

    /*
     * Now move the segment into place with its final name.
     */
    if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
        elog(ERROR, "InstallXLogFileSegment should not have failed");
}

int XLogFileInit ( XLogSegNo  logsegno,
bool *  use_existent,
bool  use_lock 
)

Definition at line 2245 of file xlog.c.

References BasicOpenFile(), close, DEBUG2, elog, ereport, errcode_for_file_access(), errmsg(), ERROR, get_sync_bit(), InstallXLogFileSegment(), MAXPGPATH, palloc0(), pfree(), PG_BINARY, pg_fsync(), snprintf(), sync_method, ThisTimeLineID, unlink(), write, XLOGDIR, and XLogFilePath.

Referenced by BootStrapXLOG(), PreallocXlogFiles(), XLogWalRcvWrite(), and XLogWrite().

{
    /*
     * Open log segment logsegno on ThisTimeLineID, creating and zero-filling
     * it first when needed, and return the open file descriptor.
     *
     * *use_existent: on entry, true means an already-existing file may be
     * opened and returned as-is (the flag is then left unchanged).  When a
     * new file had to be created, the flag is set to false before returning.
     *
     * use_lock is handed through to InstallXLogFileSegment() unchanged.
     *
     * Failures are reported with ERROR.
     */
    char        path[MAXPGPATH];
    char        tmppath[MAXPGPATH];
    char       *zbuffer;
    XLogSegNo   installed_segno;
    int         max_advance;
    int         fd;
    int         nbytes;

    XLogFilePath(path, ThisTimeLineID, logsegno);

    /*
     * Try to use existent file (checkpoint maker may have created it already)
     */
    if (*use_existent)
    {
        fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
                           S_IRUSR | S_IWUSR);
        if (fd < 0)
        {
            /* a missing file just means we must create it below */
            if (errno != ENOENT)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not open file \"%s\": %m", path)));
        }
        else
            return fd;
    }

    /*
     * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
     * another process is doing the same thing.  If so, we will end up
     * pre-creating an extra log segment.  That seems OK, and better than
     * holding the lock throughout this lengthy process.
     */
    elog(DEBUG2, "creating and filling new WAL file");

    snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

    /* remove any leftover temp file; the O_EXCL open below needs it gone */
    unlink(tmppath);

    /*
     * Allocate a buffer full of zeros. This is done before opening the file
     * so that we don't leak the file descriptor if palloc fails.
     *
     * Note: palloc zbuffer, instead of just using a local char array, to
     * ensure it is reasonably well-aligned; this may save a few cycles
     * transferring data to the kernel.
     */
    zbuffer = (char *) palloc0(XLOG_BLCKSZ);

    /* do not use get_sync_bit() here --- want to fsync only at end of fill */
    fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
                       S_IRUSR | S_IWUSR);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not create file \"%s\": %m", tmppath)));

    /*
     * Zero-fill the file.  We have to do this the hard way to ensure that all
     * the file space has really been allocated --- on platforms that allow
     * "holes" in files, just seeking to the end doesn't allocate intermediate
     * space.  This way, we know that we have all the space and (after the
     * fsync below) that all the indirect blocks are down on disk.  Therefore,
     * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
     * log file.
     */
    for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
    {
        errno = 0;
        if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
        {
            /* remember write()'s errno before unlink()/close() overwrite it */
            int         save_errno = errno;

            /*
             * If we fail to make the file, delete it to release disk space
             */
            unlink(tmppath);

            close(fd);

            /* if write didn't set errno, assume problem is no disk space */
            errno = save_errno ? save_errno : ENOSPC;

            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not write to file \"%s\": %m", tmppath)));
        }
    }
    pfree(zbuffer);

    if (pg_fsync(fd) != 0)
    {
        close(fd);
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not fsync file \"%s\": %m", tmppath)));
    }

    if (close(fd))
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not close file \"%s\": %m", tmppath)));

    /*
     * Now move the segment into place with its final name.
     *
     * If caller didn't want to use a pre-existing file, get rid of any
     * pre-existing file.  Otherwise, cope with possibility that someone else
     * has created the file while we were filling ours: if so, use ours to
     * pre-create a future log segment.
     */
    installed_segno = logsegno;
    max_advance = XLOGfileslop;
    if (!InstallXLogFileSegment(&installed_segno, tmppath,
                                *use_existent, &max_advance,
                                use_lock))
    {
        /*
         * No need for any more future segments, or InstallXLogFileSegment()
         * failed to rename the file into place. If the rename failed, opening
         * the file below will fail.
         */
        unlink(tmppath);
    }

    /* Set flag to tell caller there was no existent file */
    *use_existent = false;

    /* Now open original target segment (might not be file I just made) */
    fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
                       S_IRUSR | S_IWUSR);
    if (fd < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
           errmsg("could not open file \"%s\": %m", path)));

    elog(DEBUG2, "done creating and filling new WAL file");

    return fd;
}

char* XLogFileNameP ( TimeLineID  tli,
XLogSegNo  segno 
)

Definition at line 8365 of file xlog.c.

References MAXFNAMELEN, palloc(), and XLogFileName.

Referenced by assign_xlog_sync_method(), issue_xlog_fsync(), WalReceiverMain(), XLogFileClose(), XLogRead(), XLogWalRcvWrite(), and XLogWrite().

{
    char       *fname;

    /*
     * Build the WAL file name for (tli, segno) in a freshly palloc'd buffer
     * of MAXFNAMELEN bytes; the buffer is not freed here.
     */
    fname = palloc(MAXFNAMELEN);
    XLogFileName(fname, tli, segno);

    return fname;
}

int XLogFileOpen ( XLogSegNo  segno  ) 

Definition at line 2597 of file xlog.c.

References BasicOpenFile(), ereport, errcode_for_file_access(), errmsg(), get_sync_bit(), PANIC, PG_BINARY, sync_method, ThisTimeLineID, and XLogFilePath.

Referenced by StartupXLOG(), and XLogWrite().

{
    char        seg_path[MAXPGPATH];
    int         open_flags;
    int         log_fd;

    /* Path of the requested segment on the current timeline */
    XLogFilePath(seg_path, ThisTimeLineID, segno);

    /* Read/write, binary, plus whatever the configured sync method needs */
    open_flags = O_RDWR | PG_BINARY | get_sync_bit(sync_method);

    log_fd = BasicOpenFile(seg_path, open_flags, S_IRUSR | S_IWUSR);

    /* Failure to open an existing segment is reported at PANIC level */
    if (log_fd < 0)
    {
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not open xlog file \"%s\": %m", seg_path)));
    }

    return log_fd;
}

static int XLogFileRead ( XLogSegNo  segno,
int  emode,
TimeLineID  tli,
int  source,
bool  notfoundOk 
) [static]

Definition at line 2621 of file xlog.c.

References BasicOpenFile(), curFileTLI, elog, ereport, errcode_for_file_access(), errmsg(), ERROR, GetCurrentTimestamp(), InRedo, KeepFileRestoredFromArchive(), MAXFNAMELEN, MAXPGPATH, PANIC, PG_BINARY, readSource, RestoreArchivedFile(), restoredFromArchive, set_ps_display(), snprintf(), XLOG_FROM_ARCHIVE, XLOG_FROM_PG_XLOG, XLOG_FROM_STREAM, XLOGDIR, XLogFileName, XLogFilePath, XLogReceiptSource, XLogReceiptTime, and XLogSegSize.

Referenced by WaitForWALToBecomeAvailable(), and XLogFileReadAnyTLI().

{
    /*
     * Open WAL segment segno of timeline tli for reading from the given
     * source and return its file descriptor.
     *
     * Returns -1 when restoring from the archive fails, or when the open
     * fails with ENOENT and notfoundOk is true.  Any other open failure is
     * reported with PANIC.  An unrecognized source is reported with ERROR.
     *
     * On success this also updates curFileTLI, readSource, XLogReceiptSource
     * and (except for XLOG_FROM_STREAM) XLogReceiptTime, and sets the PS
     * display.  Note that emode is not referenced in this body.
     */
    char        xlogfname[MAXFNAMELEN];
    char        activitymsg[MAXFNAMELEN + 16];
    char        path[MAXPGPATH];
    int         fd;

    XLogFileName(xlogfname, tli, segno);

    switch (source)
    {
        case XLOG_FROM_ARCHIVE:
            /* Report recovery progress in PS display */
            snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
                     xlogfname);
            set_ps_display(activitymsg, false);

            restoredFromArchive = RestoreArchivedFile(path, xlogfname,
                                                      "RECOVERYXLOG",
                                                      XLogSegSize,
                                                      InRedo);
            if (!restoredFromArchive)
                return -1;
            break;

        case XLOG_FROM_PG_XLOG:
        case XLOG_FROM_STREAM:
            /* both of these sources read the segment by its regular path */
            XLogFilePath(path, tli, segno);
            restoredFromArchive = false;
            break;

        default:
            elog(ERROR, "invalid XLogFileRead source %d", source);
    }

    /*
     * If the segment was fetched from archival storage, replace the existing
     * xlog segment (if any) with the archival version.
     */
    if (source == XLOG_FROM_ARCHIVE)
    {
        KeepFileRestoredFromArchive(path, xlogfname);

        /*
         * Set path to point at the new file in pg_xlog.
         */
        snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
    }

    fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
    if (fd >= 0)
    {
        /* Success! */
        curFileTLI = tli;

        /* Report recovery progress in PS display */
        snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
                 xlogfname);
        set_ps_display(activitymsg, false);

        /* Track source of data in assorted state variables */
        readSource = source;
        XLogReceiptSource = source;
        /* In FROM_STREAM case, caller tracks receipt time, not me */
        if (source != XLOG_FROM_STREAM)
            XLogReceiptTime = GetCurrentTimestamp();

        return fd;
    }
    /* a missing file is tolerated only when the caller said so */
    if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));
    return -1;
}

static int XLogFileReadAnyTLI ( XLogSegNo  segno,
int  emode,
int  source 
) [static]

Definition at line 2704 of file xlog.c.

References curFileTLI, DEBUG1, elog, ereport, errcode_for_file_access(), errmsg(), lfirst, readTimeLineHistory(), recoveryTargetTLI, XLOG_FROM_ANY, XLOG_FROM_ARCHIVE, XLOG_FROM_PG_XLOG, XLogFilePath, and XLogFileRead().

Referenced by WaitForWALToBecomeAvailable().

{
    /*
     * Try to open WAL segment segno on each candidate timeline in turn,
     * using the archive and/or pg_xlog as selected by source, and return the
     * first file descriptor obtained.
     *
     * If nothing can be opened, report "could not open file" for the
     * recoveryTargetTLI path at level emode (with errno forced to ENOENT)
     * and return -1.
     */
    char        path[MAXPGPATH];
    ListCell   *cell;
    int         fd;
    List       *tles;

    /*
     * Loop looking for a suitable timeline ID: we might need to read any of
     * the timelines listed in expectedTLEs.
     *
     * We expect curFileTLI on entry to be the TLI of the preceding file in
     * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
     * to go backwards; this prevents us from picking up the wrong file when a
     * parent timeline extends to higher segment numbers than the child we
     * want to read.
     *
     * If we haven't read the timeline history file yet, read it now, so that
     * we know which TLIs to scan.  We don't save the list in expectedTLEs,
     * however, unless we actually find a valid segment.  That way if there is
     * neither a timeline history file nor a WAL segment in the archive, and
     * streaming replication is set up, we'll read the timeline history file
     * streamed from the master when we start streaming, instead of recovering
     * with a dummy history generated here.
     */
    if (expectedTLEs)
        tles = expectedTLEs;
    else
        tles = readTimeLineHistory(recoveryTargetTLI);

    foreach(cell, tles)
    {
        TimeLineID  tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;

        if (tli < curFileTLI)
            break;              /* don't bother looking at too-old TLIs */

        /* for each timeline, the archive is tried before pg_xlog */
        if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
        {
            fd = XLogFileRead(segno, emode, tli,
                              XLOG_FROM_ARCHIVE, true);
            if (fd != -1)
            {
                elog(DEBUG1, "got WAL segment from archive");
                /* a segment was found, so adopt the history list we used */
                if (!expectedTLEs)
                    expectedTLEs = tles;
                return fd;
            }
        }

        if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
        {
            fd = XLogFileRead(segno, emode, tli,
                              XLOG_FROM_PG_XLOG, true);
            if (fd != -1)
            {
                /* a segment was found, so adopt the history list we used */
                if (!expectedTLEs)
                    expectedTLEs = tles;
                return fd;
            }
        }
    }

    /* Couldn't find it.  For simplicity, complain about front timeline */
    XLogFilePath(path, recoveryTargetTLI, segno);
    errno = ENOENT;
    ereport(emode,
            (errcode_for_file_access(),
             errmsg("could not open file \"%s\": %m", path)));
    return -1;
}

void XLogFlush ( XLogRecPtr  record  ) 

Definition at line 1891 of file xlog.c.

References CommitDelay, CommitSiblings, XLogCtlInsert::curridx, elog, enableFsync, END_CRIT_SECTION, ERROR, XLogwrtRqst::Flush, XLogwrtResult::Flush, XLogCtlData::info_lck, XLogCtlData::Insert, Insert(), INSERT_FREESPACE, LOG, XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, LW_EXCLUSIVE, LWLockAcquireOrWait(), LWLockConditionalAcquire(), LWLockRelease(), MinimumActiveBackends(), pg_usleep(), SpinLockAcquire, SpinLockRelease, START_CRIT_SECTION, UpdateMinRecoveryPoint(), WALInsertLock, WalSndWakeupProcessRequests, WALWriteLock, XLogwrtRqst::Write, XLogwrtResult::Write, XLogCtlData::xlblocks, XLogInsertAllowed(), and XLogWrite().

Referenced by CreateCheckPoint(), CreateEndOfRecoveryRecord(), EndPrepare(), FlushBuffer(), RecordTransactionAbortPrepared(), RecordTransactionCommit(), RecordTransactionCommitPrepared(), RelationTruncate(), SlruPhysicalWritePage(), smgr_redo(), write_relmap_file(), WriteTruncateXlogRec(), and xact_redo_commit_internal().

{
    /*
     * Make sure WAL is written and flushed at least as far as 'record'.
     *
     * When WAL insertion is not allowed (see the XLogInsertAllowed() test
     * below), nothing is flushed; minRecoveryPoint is updated instead and the
     * function returns.
     *
     * Otherwise the function loops until either another process is seen to
     * have flushed far enough, or this process obtains WALWriteLock and calls
     * XLogWrite() itself.  If the flush pointer still has not reached
     * 'record' afterwards, an ERROR is raised.
     */
    XLogRecPtr  WriteRqstPtr;
    XLogwrtRqst WriteRqst;

    /*
     * During REDO, we are reading not writing WAL.  Therefore, instead of
     * trying to flush the WAL, we should update minRecoveryPoint instead. We
     * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
     * to act this way too, and because when it tries to write the
     * end-of-recovery checkpoint, it should indeed flush.
     */
    if (!XLogInsertAllowed())
    {
        UpdateMinRecoveryPoint(record, false);
        return;
    }

    /* Quick exit if already known flushed */
    /* (this consults only the process-local LogwrtResult copy, no locking) */
    if (record <= LogwrtResult.Flush)
        return;

#ifdef WAL_DEBUG
    if (XLOG_DEBUG)
        elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
             (uint32) (record >> 32), (uint32) record,
             (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
             (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

    START_CRIT_SECTION();

    /*
     * Since fsync is usually a horribly expensive operation, we try to
     * piggyback as much data as we can on each fsync: if we see any more data
     * entered into the xlog buffer, we'll write and fsync that too, so that
     * the final value of LogwrtResult.Flush is as large as possible. This
     * gives us some chance of avoiding another fsync immediately after.
     */

    /* initialize to given target; may increase below */
    WriteRqstPtr = record;

    /*
     * Now wait until we get the write lock, or someone else does the flush
     * for us.
     */
    for (;;)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        /* read LogwrtResult and update local state */
        SpinLockAcquire(&xlogctl->info_lck);
        /* raise our write target to the shared write request, never lower it */
        if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
            WriteRqstPtr = xlogctl->LogwrtRqst.Write;
        LogwrtResult = xlogctl->LogwrtResult;
        SpinLockRelease(&xlogctl->info_lck);

        /* done already? */
        if (record <= LogwrtResult.Flush)
            break;

        /*
         * Try to get the write lock. If we can't get it immediately, wait
         * until it's released, and recheck if we still need to do the flush
         * or if the backend that held the lock did it for us already. This
         * helps to maintain a good rate of group committing when the system
         * is bottlenecked by the speed of fsyncing.
         */
        if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
        {
            /*
             * The lock is now free, but we didn't acquire it yet. Before we
             * do, loop back to check if someone else flushed the record for
             * us already.
             */
            continue;
        }

        /* Got the lock; recheck whether request is satisfied */
        /*
         * NOTE(review): this read does not take info_lck; presumably holding
         * WALWriteLock is what makes it safe -- confirm.
         */
        LogwrtResult = XLogCtl->LogwrtResult;
        if (record <= LogwrtResult.Flush)
        {
            LWLockRelease(WALWriteLock);
            break;
        }

        /*
         * Sleep before flush! By adding a delay here, we may give further
         * backends the opportunity to join the backlog of group commit
         * followers; this can significantly improve transaction throughput, at
         * the risk of increasing transaction latency.
         *
         * We do not sleep if enableFsync is not turned on, nor if there are
         * fewer than CommitSiblings other backends with active transactions.
         */
        if (CommitDelay > 0 && enableFsync &&
            MinimumActiveBackends(CommitSiblings))
            pg_usleep(CommitDelay);

        /* try to write/flush later additions to XLOG as well */
        if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
        {
            XLogCtlInsert *Insert = &XLogCtl->Insert;
            uint32      freespace = INSERT_FREESPACE(Insert);

            /*
             * Compute the current insert position: end of the current page
             * minus the unused space on it.  Both branches yield the same
             * value when freespace is zero, since the subtraction is then a
             * no-op.
             */
            if (freespace == 0)     /* buffer is full */
                WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
            else
            {
                WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
                WriteRqstPtr -= freespace;
            }
            LWLockRelease(WALInsertLock);
            /* got the exact insert position: write and flush all of it */
            WriteRqst.Write = WriteRqstPtr;
            WriteRqst.Flush = WriteRqstPtr;
        }
        else
        {
            /*
             * Insert lock was busy: write up to the request pointer gathered
             * above, but only demand a flush through the caller's record.
             */
            WriteRqst.Write = WriteRqstPtr;
            WriteRqst.Flush = record;
        }
        XLogWrite(WriteRqst, false, false);

        LWLockRelease(WALWriteLock);
        /* done */
        break;
    }

    END_CRIT_SECTION();

    /* wake up walsenders now that we've released heavily contended locks */
    WalSndWakeupProcessRequests();

    /*
     * If we still haven't flushed to the request point then we have a
     * problem; most likely, the requested flush point is past end of XLOG.
     * This has been seen to occur when a disk page has a corrupted LSN.
     *
     * Formerly we treated this as a PANIC condition, but that hurts the
     * system's robustness rather than helping it: we do not want to take down
     * the whole system due to corruption on one data page.  In particular, if
     * the bad page is encountered again during recovery then we would be
     * unable to restart the database at all!  (This scenario actually
     * happened in the field several times with 7.1 releases.)  As of 8.4, bad
     * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
     * the only time we can reach here during recovery is while flushing the
     * end-of-recovery checkpoint record, and we don't expect that to have a
     * bad LSN.
     *
     * Note that for calls from xact.c, the ERROR will be promoted to PANIC
     * since xact.c calls this routine inside a critical section.  However,
     * calls from bufmgr.c are not within critical sections and so we will not
     * force a restart for a bad LSN on a data page.
     */
    if (LogwrtResult.Flush < record)
        elog(ERROR,
        "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
             (uint32) (record >> 32), (uint32) record,
             (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
}

XLogRecPtr XLogInsert ( RmgrId  rmid,
uint8  info,
XLogRecData *  rdata 
)

Definition at line 712 of file xlog.c.

References AdvanceXLInsertBuffer(), appendStringInfo(), Assert, buf, XLogRecData::buffer, BufferGetBlock, COMP_CRC32, XLogCtlWrite::curridx, XLogCtlInsert::curridx, XLogCtlInsert::currpage, XLogCtlInsert::currpos, StringInfoData::data, XLogRecData::data, elog, END_CRIT_SECTION, ERROR, FIN_CRC32, XLogwrtRqst::Flush, XLogwrtResult::Flush, XLogCtlInsert::forcePageWrites, XLogCtlInsert::fullPageWrites, GetCurrentTransactionIdIfAny(), BkpBlock::hole_length, BkpBlock::hole_offset, i, XLogCtlData::info_lck, INIT_CRC32, initStringInfo(), XLogCtlData::Insert, Insert(), INSERT_FREESPACE, INSERT_RECPTR, InvalidBuffer, IsBootstrapProcessingMode, XLogRecData::len, LOG, XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), malloc, MAXALIGN, MemSet, XLogRecData::next, NULL, offsetof, PANIC, pfree(), PrevBufIdx, XLogCtlInsert::PrevRecord, ProcLastRecPtr, XLogCtlInsert::RedoRecPtr, RedoRecPtr, RmgrData::rm_desc, RmgrTable, SizeOfXLogLongPHD, SizeOfXLogRecord, SpinLockAcquire, SpinLockRelease, START_CRIT_SECTION, WALInsertLock, WalSndWakeupProcessRequests, WALWriteLock, XLogwrtResult::Write, XLogCtlData::Write, XLogwrtRqst::Write, XactLastRecEnd, XLogRecord::xl_info, XLogRecord::xl_len, XLogRecord::xl_prev, XLogRecord::xl_rmid, XLogRecord::xl_tot_len, XLogRecord::xl_xid, XLogCtlData::xlblocks, XLogCheckBuffer(), XLogInsertAllowed(), XLogSegSize, XLogWrite(), XLogPageHeaderData::xlp_info, XLogPageHeaderData::xlp_rem_len, XLR_BKP_BLOCK, and XLR_INFO_MASK.

Referenced by _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_log_reuse_page(), _bt_newroot(), _bt_pagedel(), _bt_split(), addLeafTuple(), AlterSequence(), AssignTransactionId(), CreateCheckPoint(), createdb(), CreateEndOfRecoveryRecord(), CreateMultiXactId(), createPostingTree(), CreateTableSpace(), do_pg_stop_backup(), do_setval(), doPickSplit(), DropTableSpace(), EndPrepare(), fill_seq_with_data(), ginbuild(), ginDeletePage(), ginHeapTupleFastInsert(), ginInsertValue(), ginUpdateStats(), gistbuild(), gistXLogSplit(), gistXLogUpdate(), heap_delete(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), log_heap_clean(), log_heap_cleanup_info(), log_heap_freeze(), log_heap_update(), log_heap_visible(), log_newpage(), log_newpage_buffer(), log_smgrcreate(), LogAccessExclusiveLocks(), LogCurrentRunningXacts(), movedb(), moveLeafs(), nextval_internal(), RecordTransactionAbort(), RecordTransactionAbortPrepared(), RecordTransactionCommit(), RecordTransactionCommitPrepared(), RelationTruncate(), remove_dbtablespaces(), RequestXLogSwitch(), shiftList(), spgAddNodeAction(), spgbuild(), spgSplitNodeAction(), UpdateFullPageWrites(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), write_relmap_file(), writeListPage(), WriteMZeroPageXlogRec(), WriteTruncateXlogRec(), WriteZeroPageXlogRec(), XLogPutNextOid(), XLogReportParameters(), XLogRestorePoint(), XLogSaveBufferForHint(), and xlogVacuumPage().

{
    /*
     * Insert one WAL record built from the 'rdata' chain.
     *
     * rmid:  resource manager id stored in the record header.
     * info:  record flag bits; bits covered by XLR_INFO_MASK must be clear on
     *        entry (they are set here to mark backed-up blocks).
     * rdata: head of a linked list of data chunks.  The chain is modified in
     *        place: entries whose buffer gets backed up have data/len
     *        cleared, backup-block entries are appended after the last
     *        caller-supplied entry, and the copy loop advances data/len of
     *        entries it splits across pages.
     *
     * Returns the position just past the inserted record (see the comment
     * near the final INSERT_RECPTR).  Two special cases return early: in
     * bootstrap mode for non-XLOG resource managers a phony pointer is
     * returned, and an XLOG_SWITCH request made exactly at the start of a
     * segment returns the end address of the prior segment without inserting
     * anything.
     */
    XLogCtlInsert *Insert = &XLogCtl->Insert;
    XLogRecPtr  RecPtr;
    XLogRecPtr  WriteRqst;
    uint32      freespace;
    int         curridx;
    XLogRecData *rdt;
    XLogRecData *rdt_lastnormal;
    /* per-backup-block working arrays, indexed by slot number */
    Buffer      dtbuf[XLR_MAX_BKP_BLOCKS];
    bool        dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
    BkpBlock    dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
    XLogRecPtr  dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
    XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
    XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
    XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
    XLogRecData hdr_rdt;
    pg_crc32    rdata_crc;
    uint32      len,
                write_len;
    unsigned    i;
    bool        updrqst;
    bool        doPageWrites;
    bool        isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
    /* remember caller's info bits so a retry can drop the backup-block bits */
    uint8       info_orig = info;
    /* header scratch space: allocated on first call and reused afterwards */
    static XLogRecord *rechdr;

    if (rechdr == NULL)
    {
        rechdr = malloc(SizeOfXLogRecord);
        if (rechdr == NULL)
            elog(ERROR, "out of memory");
        MemSet(rechdr, 0, SizeOfXLogRecord);
    }

    /* cross-check on whether we should be here or not */
    if (!XLogInsertAllowed())
        elog(ERROR, "cannot make new WAL entries during recovery");

    /* info's high bits are reserved for use by me */
    if (info & XLR_INFO_MASK)
        elog(PANIC, "invalid xlog info mask %02X", info);

    TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);

    /*
     * In bootstrap mode, we don't actually log anything but XLOG resources;
     * return a phony record pointer.
     */
    if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
    {
        RecPtr = SizeOfXLogLongPHD;     /* start of 1st chkpt record */
        return RecPtr;
    }

    /*
     * Here we scan the rdata chain, to determine which buffers must be backed
     * up.
     *
     * We may have to loop back to here if a race condition is detected below.
     * We could prevent the race by doing all this work while holding the
     * insert lock, but it seems better to avoid doing CRC calculations while
     * holding the lock.
     *
     * We add entries for backup blocks to the chain, so that they don't need
     * any special treatment in the critical section where the chunks are
     * copied into the WAL buffers. Those entries have to be unlinked from the
     * chain if we have to loop back here.
     */
begin:;
    /* reset all slots; a retry starts the buffer bookkeeping from scratch */
    for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
    {
        dtbuf[i] = InvalidBuffer;
        dtbuf_bkp[i] = false;
    }

    /*
     * Decide if we need to do full-page writes in this XLOG record: true if
     * full_page_writes is on or we have a PITR request for it.  Since we
     * don't yet have the insert lock, fullPageWrites and forcePageWrites
     * could change under us, but we'll recheck them once we have the lock.
     */
    doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;

    /*
     * Walk the chain, summing the length of data that will be logged as
     * ordinary record data.  The loop has no condition of its own; it exits
     * via the break at the bottom, leaving rdt on the last chain item.
     */
    len = 0;
    for (rdt = rdata;;)
    {
        if (rdt->buffer == InvalidBuffer)
        {
            /* Simple data, just include it */
            len += rdt->len;
        }
        else
        {
            /* Find info for buffer */
            for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
            {
                if (rdt->buffer == dtbuf[i])
                {
                    /* Buffer already referenced by earlier chain item */
                    if (dtbuf_bkp[i])
                    {
                        /* whole page is being backed up; drop this chunk */
                        rdt->data = NULL;
                        rdt->len = 0;
                    }
                    else if (rdt->data)
                        len += rdt->len;
                    break;
                }
                if (dtbuf[i] == InvalidBuffer)
                {
                    /* OK, put it in this slot */
                    dtbuf[i] = rdt->buffer;
                    if (doPageWrites && XLogCheckBuffer(rdt, true,
                                        &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
                    {
                        dtbuf_bkp[i] = true;
                        rdt->data = NULL;
                        rdt->len = 0;
                    }
                    else if (rdt->data)
                        len += rdt->len;
                    break;
                }
            }
            /* fell off the end: no matching slot and no free slot */
            if (i >= XLR_MAX_BKP_BLOCKS)
                elog(PANIC, "can backup at most %d blocks per xlog record",
                     XLR_MAX_BKP_BLOCKS);
        }
        /* Break out of loop when rdt points to last chain item */
        if (rdt->next == NULL)
            break;
        rdt = rdt->next;
    }

    /*
     * NOTE: We disallow len == 0 because it provides a useful bit of extra
     * error checking in ReadRecord.  This means that all callers of
     * XLogInsert must supply at least some not-in-a-buffer data.  However, we
     * make an exception for XLOG SWITCH records because we don't want them to
     * ever cross a segment boundary.
     */
    if (len == 0 && !isLogSwitch)
        elog(PANIC, "invalid xlog record length %u", len);

    /*
     * Make additional rdata chain entries for the backup blocks, so that we
     * don't need to special-case them in the write loop.  This modifies the
     * original rdata chain, but we keep a pointer to the last regular entry,
     * rdt_lastnormal, so that we can undo this if we have to loop back to the
     * beginning.
     *
     * At the exit of this loop, write_len includes the backup block data.
     *
     * Also set the appropriate info bits to show which buffers were backed
     * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
     * value (ignoring InvalidBuffer) appearing in the rdata chain.
     */
    rdt_lastnormal = rdt;
    write_len = len;
    for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
    {
        BkpBlock   *bkpb;
        char       *page;

        if (!dtbuf_bkp[i])
            continue;

        info |= XLR_BKP_BLOCK(i);

        bkpb = &(dtbuf_xlg[i]);
        page = (char *) BufferGetBlock(dtbuf[i]);

        /* first entry: the BkpBlock header describing this page image */
        rdt->next = &(dtbuf_rdt1[i]);
        rdt = rdt->next;

        rdt->data = (char *) bkpb;
        rdt->len = sizeof(BkpBlock);
        write_len += sizeof(BkpBlock);

        /* second entry: the page itself, or the part before the hole */
        rdt->next = &(dtbuf_rdt2[i]);
        rdt = rdt->next;

        if (bkpb->hole_length == 0)
        {
            rdt->data = page;
            rdt->len = BLCKSZ;
            write_len += BLCKSZ;
            rdt->next = NULL;
        }
        else
        {
            /* must skip the hole */
            rdt->data = page;
            rdt->len = bkpb->hole_offset;
            write_len += bkpb->hole_offset;

            /* third entry: the part of the page after the hole */
            rdt->next = &(dtbuf_rdt3[i]);
            rdt = rdt->next;

            rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
            rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
            write_len += rdt->len;
            rdt->next = NULL;
        }
    }

    /*
     * Calculate CRC of the data, including all the backup blocks
     *
     * Note that the record header isn't added into the CRC initially since we
     * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
     * the whole record in the order: rdata, then backup blocks, then record
     * header.
     */
    INIT_CRC32(rdata_crc);
    for (rdt = rdata; rdt != NULL; rdt = rdt->next)
        COMP_CRC32(rdata_crc, rdt->data, rdt->len);

    /*
     * Construct record header (prev-link and CRC are filled in later), and
     * make that the first chunk in the chain.
     */
    rechdr->xl_xid = GetCurrentTransactionIdIfAny();
    rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
    rechdr->xl_len = len;       /* doesn't include backup blocks */
    rechdr->xl_info = info;
    rechdr->xl_rmid = rmid;

    hdr_rdt.next = rdata;
    hdr_rdt.data = (char *) rechdr;
    hdr_rdt.len = SizeOfXLogRecord;

    /* from here on write_len counts the header too: total bytes to copy */
    write_len += SizeOfXLogRecord;

    START_CRIT_SECTION();

    /* Now wait to get insert lock */
    LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

    /*
     * Check to see if my RedoRecPtr is out of date.  If so, may have to go
     * back and recompute everything.  This can only happen just after a
     * checkpoint, so it's better to be slow in this case and fast otherwise.
     *
     * If we aren't doing full-page writes then RedoRecPtr doesn't actually
     * affect the contents of the XLOG record, so we'll update our local copy
     * but not force a recomputation.
     */
    if (RedoRecPtr != Insert->RedoRecPtr)
    {
        Assert(RedoRecPtr < Insert->RedoRecPtr);
        RedoRecPtr = Insert->RedoRecPtr;

        if (doPageWrites)
        {
            for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
            {
                if (dtbuf[i] == InvalidBuffer)
                    continue;
                if (dtbuf_bkp[i] == false &&
                    dtbuf_lsn[i] <= RedoRecPtr)
                {
                    /*
                     * Oops, this buffer now needs to be backed up, but we
                     * didn't think so above.  Start over.
                     */
                    LWLockRelease(WALInsertLock);
                    END_CRIT_SECTION();
                    /* unlink appended backup entries and restore info bits */
                    rdt_lastnormal->next = NULL;
                    info = info_orig;
                    goto begin;
                }
            }
        }
    }

    /*
     * Also check to see if fullPageWrites or forcePageWrites was just turned
     * on; if we weren't already doing full-page writes then go back and
     * recompute. (If it was just turned off, we could recompute the record
     * without full pages, but we choose not to bother.)
     */
    if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
    {
        /* Oops, must redo it with full-page data. */
        LWLockRelease(WALInsertLock);
        END_CRIT_SECTION();
        rdt_lastnormal->next = NULL;
        info = info_orig;
        goto begin;
    }

    /*
     * If the current page is completely full, the record goes to the next
     * page, right after the page header.
     */
    updrqst = false;
    freespace = INSERT_FREESPACE(Insert);
    if (freespace == 0)
    {
        updrqst = AdvanceXLInsertBuffer(false);
        freespace = INSERT_FREESPACE(Insert);
    }

    /* Compute record's XLOG location */
    curridx = Insert->curridx;
    INSERT_RECPTR(RecPtr, Insert, curridx);

    /*
     * If the record is an XLOG_SWITCH, and we are exactly at the start of a
     * segment, we need not insert it (and don't want to because we'd like
     * consecutive switch requests to be no-ops).  Instead, make sure
     * everything is written and flushed through the end of the prior segment,
     * and return the prior segment's end address.
     */
    if (isLogSwitch && (RecPtr % XLogSegSize) == SizeOfXLogLongPHD)
    {
        /* We can release insert lock immediately */
        LWLockRelease(WALInsertLock);

        /* step back over the long page header to the segment boundary */
        RecPtr -= SizeOfXLogLongPHD;

        LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
        LogwrtResult = XLogCtl->LogwrtResult;
        if (LogwrtResult.Flush < RecPtr)
        {
            XLogwrtRqst FlushRqst;

            FlushRqst.Write = RecPtr;
            FlushRqst.Flush = RecPtr;
            XLogWrite(FlushRqst, false, false);
        }
        LWLockRelease(WALWriteLock);

        END_CRIT_SECTION();

        /* wake up walsenders now that we've released heavily contended locks */
        WalSndWakeupProcessRequests();
        return RecPtr;
    }

    /* Finish the record header */
    rechdr->xl_prev = Insert->PrevRecord;

    /* Now we can finish computing the record's CRC */
    COMP_CRC32(rdata_crc, (char *) rechdr, offsetof(XLogRecord, xl_crc));
    FIN_CRC32(rdata_crc);
    rechdr->xl_crc = rdata_crc;

#ifdef WAL_DEBUG
    if (XLOG_DEBUG)
    {
        StringInfoData buf;

        initStringInfo(&buf);
        appendStringInfo(&buf, "INSERT @ %X/%X: ",
                         (uint32) (RecPtr >> 32), (uint32) RecPtr);
        xlog_outrec(&buf, rechdr);
        if (rdata->data != NULL)
        {
            appendStringInfo(&buf, " - ");
            RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
        }
        elog(LOG, "%s", buf.data);
        pfree(buf.data);
    }
#endif

    /* Record begin of record in appropriate places */
    ProcLastRecPtr = RecPtr;
    Insert->PrevRecord = RecPtr;

    /*
     * Append the data, including backup blocks if any
     */
    rdata = &hdr_rdt;
    while (write_len)
    {
        /* skip chain entries that were emptied above (data set to NULL) */
        while (rdata->data == NULL)
            rdata = rdata->next;

        if (freespace > 0)
        {
            if (rdata->len > freespace)
            {
                /*
                 * Chunk does not fit: copy what fits, remember the rest in
                 * the chain entry itself, then fall through to get a new
                 * page.  currpos is not advanced here; presumably
                 * AdvanceXLInsertBuffer repositions it -- confirm.
                 */
                memcpy(Insert->currpos, rdata->data, freespace);
                rdata->data += freespace;
                rdata->len -= freespace;
                write_len -= freespace;
            }
            else
            {
                /* chunk fits entirely on the current page */
                memcpy(Insert->currpos, rdata->data, rdata->len);
                freespace -= rdata->len;
                write_len -= rdata->len;
                Insert->currpos += rdata->len;
                rdata = rdata->next;
                continue;
            }
        }

        /* Use next buffer */
        updrqst = AdvanceXLInsertBuffer(false);
        curridx = Insert->curridx;
        /* Mark page header to indicate this record continues on the page */
        Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
        Insert->currpage->xlp_rem_len = write_len;
        freespace = INSERT_FREESPACE(Insert);
    }

    /* Ensure next record will be properly aligned */
    Insert->currpos = (char *) Insert->currpage +
        MAXALIGN(Insert->currpos - (char *) Insert->currpage);
    freespace = INSERT_FREESPACE(Insert);

    /*
     * The recptr I return is the beginning of the *next* record. This will be
     * stored as LSN for changed data pages...
     */
    INSERT_RECPTR(RecPtr, Insert, curridx);

    /*
     * If the record is an XLOG_SWITCH, we must now write and flush all the
     * existing data, and then forcibly advance to the start of the next
     * segment.  It's not good to do this I/O while holding the insert lock,
     * but there seems too much risk of confusion if we try to release the
     * lock sooner.  Fortunately xlog switch needn't be a high-performance
     * operation anyway...
     */
    if (isLogSwitch)
    {
        XLogwrtRqst FlushRqst;
        XLogRecPtr  OldSegEnd;

        TRACE_POSTGRESQL_XLOG_SWITCH();

        LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

        /*
         * Flush through the end of the page containing XLOG_SWITCH, and
         * perform end-of-segment actions (eg, notifying archiver).
         */
        WriteRqst = XLogCtl->xlblocks[curridx];
        FlushRqst.Write = WriteRqst;
        FlushRqst.Flush = WriteRqst;
        XLogWrite(FlushRqst, false, true);

        /* Set up the next buffer as first page of next segment */
        /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
        (void) AdvanceXLInsertBuffer(true);

        /* There should be no unwritten data */
        curridx = Insert->curridx;
        Assert(curridx == XLogCtl->Write.curridx);

        /* Compute end address of old segment */
        /* xlblocks[] holds each page's end address, so back up one page */
        OldSegEnd = XLogCtl->xlblocks[curridx];
        OldSegEnd -= XLOG_BLCKSZ;

        /* Make it look like we've written and synced all of old segment */
        LogwrtResult.Write = OldSegEnd;
        LogwrtResult.Flush = OldSegEnd;

        /*
         * Update shared-memory status --- this code should match XLogWrite
         */
        {
            /* use volatile pointer to prevent code rearrangement */
            volatile XLogCtlData *xlogctl = XLogCtl;

            SpinLockAcquire(&xlogctl->info_lck);
            xlogctl->LogwrtResult = LogwrtResult;
            if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
                xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
            if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
                xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
            SpinLockRelease(&xlogctl->info_lck);
        }

        LWLockRelease(WALWriteLock);

        updrqst = false;        /* done already */
    }
    else
    {
        /* normal case, ie not xlog switch */

        /* Need to update shared LogwrtRqst if some block was filled up */
        if (freespace == 0)
        {
            /* curridx is filled and available for writing out */
            updrqst = true;
        }
        else
        {
            /* if updrqst already set, write through end of previous buf */
            curridx = PrevBufIdx(curridx);
        }
        WriteRqst = XLogCtl->xlblocks[curridx];
    }

    LWLockRelease(WALInsertLock);

    if (updrqst)
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        /* advance global request to include new block(s) */
        if (xlogctl->LogwrtRqst.Write < WriteRqst)
            xlogctl->LogwrtRqst.Write = WriteRqst;
        /* update local result copy while I have the chance */
        LogwrtResult = xlogctl->LogwrtResult;
        SpinLockRelease(&xlogctl->info_lck);
    }

    /* remember the end of the record just inserted */
    XactLastRecEnd = RecPtr;

    END_CRIT_SECTION();

    /* wake up walsenders now that we've released heavily contended locks */
    WalSndWakeupProcessRequests();

    return RecPtr;
}

bool XLogInsertAllowed ( void   ) 

Definition at line 6283 of file xlog.c.

References LocalXLogInsertAllowed, and RecoveryInProgress().

Referenced by XLogFlush(), and XLogInsert().

{
    /*
     * Report whether this process may currently insert WAL records.
     *
     * A non-negative LocalXLogInsertAllowed is an already-decided answer
     * ("unconditionally true" or "unconditionally false") and is returned
     * as-is.  Only a negative value requires asking whether recovery is
     * still running.
     */
    if (LocalXLogInsertAllowed < 0)
    {
        /* Still in recovery: not allowed, and the answer is not cached. */
        if (RecoveryInProgress())
            return false;

        /*
         * Recovery has ended.  Latch the answer to "unconditionally true"
         * so later calls take the fast path below.
         */
        LocalXLogInsertAllowed = 1;
        return true;
    }

    return (bool) LocalXLogInsertAllowed;
}

bool XLogNeedsFlush ( XLogRecPtr  record  ) 

Definition at line 2164 of file xlog.c.

References ControlFileLock, XLogwrtResult::Flush, XLogCtlData::info_lck, XLogCtlData::LogwrtResult, LW_SHARED, LWLockConditionalAcquire(), LWLockRelease(), ControlFileData::minRecoveryPoint, minRecoveryPoint, ControlFileData::minRecoveryPointTLI, minRecoveryPointTLI, RecoveryInProgress(), SpinLockAcquire, SpinLockRelease, and updateMinRecoveryPoint.

Referenced by BufferAlloc(), and SetHintBits().

{
    /*
     * Report whether WAL up to 'record' still needs to be flushed.
     *
     * While recovery is in progress WAL is not flushed; minRecoveryPoint is
     * advanced instead, so the question becomes whether minRecoveryPoint
     * would have to be moved forward to cover 'record'.
     */
    if (RecoveryInProgress())
    {
        /* Fast path using the locally cached state */
        if (!updateMinRecoveryPoint || record <= minRecoveryPoint)
            return false;

        /*
         * Refresh the cached minRecoveryPoint from the control file.  If
         * the lock cannot be had right away, answer conservatively rather
         * than wait.
         */
        if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
            return true;
        minRecoveryPoint = ControlFile->minRecoveryPoint;
        minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
        LWLockRelease(ControlFileLock);

        /*
         * A zero (invalid) minRecoveryPoint means all WAL has to be
         * replayed, i.e. crash recovery.  The control file value is never
         * changed in that case, so stop checking from now on.
         */
        if (minRecoveryPoint == 0)
            updateMinRecoveryPoint = false;

        /* Re-evaluate with the refreshed values */
        return (updateMinRecoveryPoint && record > minRecoveryPoint);
    }

    /* Not in recovery: compare against the flush pointer */
    if (record > LogwrtResult.Flush)
    {
        /* volatile pointer keeps the accesses inside the spinlock section */
        volatile XLogCtlData *shared = XLogCtl;

        /* Local copy says "not flushed"; refresh it from shared memory */
        SpinLockAcquire(&shared->info_lck);
        LogwrtResult = shared->LogwrtResult;
        SpinLockRelease(&shared->info_lck);

        /* Decide again with the up-to-date flush pointer */
        return (record > LogwrtResult.Flush);
    }

    /* Already known flushed */
    return false;
}

static int XLogPageRead ( XLogReaderState *  xlogreader,
XLogRecPtr  targetPagePtr,
int  reqLen,
XLogRecPtr  targetRecPtr,
char *  readBuf,
TimeLineID *  readTLI 
) [static]

Definition at line 9351 of file xlog.c.

References Assert, bgwriterLaunched, CHECKPOINT_CAUSE_XLOG, close, curFileTLI, emode_for_corrupt_record(), ereport, errcode_for_file_access(), errmsg(), GetRedoRecPtr(), lastSourceFailed, PG_USED_FOR_ASSERTS_ONLY, XLogReaderState::private_data, read, readFile, readLen, readOff, readSegNo, readSource, receivedUpto, RequestCheckpoint(), StandbyMode, StandbyModeRequested, WaitForWALToBecomeAvailable(), XLByteInSeg, XLByteToSeg, XLOG_FROM_STREAM, XLogCheckpointNeeded(), XLogFileName, and XLogSegSize.

Referenced by StartupXLOG().

{
    /*
     * Read the XLOG page starting at targetPagePtr into readBuf.
     *
     * xlogreader->private_data supplies an XLogPageReadPrivate with the
     * error level (emode) and the randAccess / fetching_ckpt flags that are
     * passed on to WaitForWALToBecomeAvailable().  reqLen is the number of
     * bytes of the page the caller needs; targetRecPtr is passed through to
     * WaitForWALToBecomeAvailable().
     *
     * On success returns readLen (number of valid bytes on the page) and
     * stores the timeline of the file that was read in *readTLI.  Returns -1
     * if WAL could not be made available, or if seeking/reading failed while
     * not in standby mode; in standby mode a seek/read failure loops back to
     * 'retry' instead.
     *
     * Works on file-level state: readFile, readSegNo, readOff, readLen,
     * readSource, curFileTLI, lastSourceFailed.
     */
    XLogPageReadPrivate *private =
        (XLogPageReadPrivate *) xlogreader->private_data;
    int         emode = private->emode;
    uint32      targetPageOff;
    XLogSegNo   targetSegNo PG_USED_FOR_ASSERTS_ONLY;

    XLByteToSeg(targetPagePtr, targetSegNo);
    targetPageOff = targetPagePtr % XLogSegSize;

    /*
     * See if we need to switch to a new segment because the requested record
     * is not in the currently open one.
     */
    if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
    {
        /*
         * Request a restartpoint if we've replayed too much xlog since the
         * last one.
         */
        if (StandbyModeRequested && bgwriterLaunched)
        {
            if (XLogCheckpointNeeded(readSegNo))
            {
                /*
                 * NOTE(review): result is discarded; presumably called to
                 * refresh a cached redo pointer before rechecking -- confirm.
                 */
                (void) GetRedoRecPtr();
                if (XLogCheckpointNeeded(readSegNo))
                    RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
            }
        }

        close(readFile);
        readFile = -1;
        readSource = 0;
    }

    /* from here on readSegNo names the segment containing the target page */
    XLByteToSeg(targetPagePtr, readSegNo);

retry:
    /* See if we need to retrieve more data */
    if (readFile < 0 ||
        (readSource == XLOG_FROM_STREAM &&
         receivedUpto < targetPagePtr + reqLen))
    {
        if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
                                         private->randAccess,
                                         private->fetching_ckpt,
                                         targetRecPtr))
        {
            /* could not obtain the WAL: reset read state and report failure */
            if (readFile >= 0)
                close(readFile);
            readFile = -1;
            readLen = 0;
            readSource = 0;

            return -1;
        }
    }

    /*
     * At this point, we have the right segment open and if we're streaming we
     * know the requested record is in it.
     */
    Assert(readFile != -1);

    /*
     * If the current segment is being streamed from master, calculate how
     * much of the current page we have received already. We know the
     * requested record has been received, but this is for the benefit of
     * future calls, to allow quick exit at the top of this function.
     */
    if (readSource == XLOG_FROM_STREAM)
    {
        /* receivedUpto lies on a different page: this page counts as full */
        if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
            readLen = XLOG_BLCKSZ;
        else
            readLen = receivedUpto % XLogSegSize - targetPageOff;
    }
    else
        readLen = XLOG_BLCKSZ;

    /* Read the requested page */
    readOff = targetPageOff;
    if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
    {
        char fname[MAXFNAMELEN];

        XLogFileName(fname, curFileTLI, readSegNo);
        ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
                (errcode_for_file_access(),
         errmsg("could not seek in log segment %s to offset %u: %m",
                        fname, readOff)));
        goto next_record_is_invalid;
    }

    /* a full page is always read from the file, regardless of readLen */
    if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
    {
        char fname[MAXFNAMELEN];

        XLogFileName(fname, curFileTLI, readSegNo);
        ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
                (errcode_for_file_access(),
         errmsg("could not read from log segment %s, offset %u: %m",
                        fname, readOff)));
        goto next_record_is_invalid;
    }

    Assert(targetSegNo == readSegNo);
    Assert(targetPageOff == readOff);
    Assert(reqLen <= readLen);

    *readTLI = curFileTLI;
    return readLen;

next_record_is_invalid:
    /* flag the failure of the current source, then drop the open file */
    lastSourceFailed = true;

    if (readFile >= 0)
        close(readFile);
    readFile = -1;
    readLen = 0;
    readSource = 0;

    /* In standby-mode, keep trying */
    if (StandbyMode)
        goto retry;
    else
        return -1;
}

void XLogPutNextOid ( Oid  nextOid  ) 

Definition at line 7555 of file xlog.c.

References XLogRecData::buffer, XLogRecData::data, XLogRecData::len, XLogRecData::next, XLOG_NEXTOID, and XLogInsert().

Referenced by GetNewObjectId().

{
    XLogRecData oidRecord;

    /*
     * Build a one-element record chain whose payload is the raw OID value.
     * No buffer is attached to this record.
     */
    oidRecord.next = NULL;
    oidRecord.buffer = InvalidBuffer;
    oidRecord.len = sizeof(Oid);
    oidRecord.data = (char *) (&nextOid);

    /* The record pointer returned by XLogInsert is deliberately ignored. */
    (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &oidRecord);

    /*
     * The NEXTOID record does not have to be flushed right away.  A freshly
     * allocated OID can only get to disk inside a tuple insert or update,
     * and that operation writes its own XLOG record, which necessarily comes
     * after the NEXTOID record.  The usual buffer LSN interlock on those
     * records therefore guarantees that the NEXTOID record reaches disk
     * before any such OID does.
     *
     * That argument only covers state "within" the database.  Using a
     * generated OID as a file or directory name bends the basic WAL rule,
     * since the filesystem change may hit disk ahead of the NEXTOID WAL
     * record.  If a crash happens right afterwards, the same OID may be
     * generated again after restart and collide with the leftover file or
     * directory.  Because callers always loop until they find a
     * nonconflicting filename, this is harmless in practice.  See
     * pgsql-hackers discussion 27-Sep-2006.
     */
}

static void XLogReportParameters ( void   )  [static]

Definition at line 7733 of file xlog.c.

References XLogRecData::buffer, XLogRecData::data, XLogRecData::len, xl_parameter_change::max_locks_per_xact, ControlFileData::max_locks_per_xact, max_locks_per_xact, xl_parameter_change::max_prepared_xacts, ControlFileData::max_prepared_xacts, max_prepared_xacts, xl_parameter_change::MaxConnections, ControlFileData::MaxConnections, MaxConnections, XLogRecData::next, UpdateControlFile(), xl_parameter_change::wal_level, ControlFileData::wal_level, wal_level, XLOG_PARAMETER_CHANGE, XLogInsert(), and XLogIsNeeded.

Referenced by StartupXLOG().

{
    bool        walLevelChanged = (wal_level != ControlFile->wal_level);

    /* Nothing to do when every tracked setting still matches pg_control. */
    if (!walLevelChanged &&
        MaxConnections == ControlFile->MaxConnections &&
        max_prepared_xacts == ControlFile->max_prepared_xacts &&
        max_locks_per_xact == ControlFile->max_locks_per_xact)
        return;

    /*
     * A change in the number of backend slots only needs a WAL record when
     * archiving is enabled, because archive recovery cannot be started with
     * wal_level=minimal anyway.  With wal_level=minimal the values stored in
     * pg_control do not really matter either, but they are refreshed below
     * regardless to avoid confusion.
     */
    if (walLevelChanged || XLogIsNeeded())
    {
        xl_parameter_change params;
        XLogRecData payload;

        /* Snapshot the current settings into the record body. */
        params.wal_level = wal_level;
        params.max_locks_per_xact = max_locks_per_xact;
        params.max_prepared_xacts = max_prepared_xacts;
        params.MaxConnections = MaxConnections;

        /* Single-element chain, no buffer attached. */
        payload.next = NULL;
        payload.len = sizeof(params);
        payload.data = (char *) &params;
        payload.buffer = InvalidBuffer;

        XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &payload);
    }

    /* Bring the in-memory control file up to date, then write it out. */
    ControlFile->wal_level = wal_level;
    ControlFile->max_locks_per_xact = max_locks_per_xact;
    ControlFile->max_prepared_xacts = max_prepared_xacts;
    ControlFile->MaxConnections = MaxConnections;
    UpdateControlFile();
}

XLogRecPtr XLogRestorePoint ( const char *  rpName  ) 

Definition at line 7616 of file xlog.c.

References XLogRecData::buffer, XLogRecData::data, ereport, errmsg(), GetCurrentTimestamp(), XLogRecData::len, LOG, MAXFNAMELEN, XLogRecData::next, xl_restore_point::rp_name, xl_restore_point::rp_time, XLOG_RESTORE_POINT, and XLogInsert().

Referenced by pg_create_restore_point().

{
    /*
     * Insert an XLOG_RESTORE_POINT record carrying the current timestamp and
     * the given name, report it in the server log, and return the record
     * pointer that XLogInsert handed back.
     *
     * rpName: NUL-terminated restore point name.  At most MAXFNAMELEN - 1
     * bytes of it are stored in the record; the log message prints the name
     * exactly as passed in.
     */
    XLogRecPtr  RecPtr;
    XLogRecData rdata;
    xl_restore_point xlrec;

    xlrec.rp_time = GetCurrentTimestamp();

    /*
     * strncpy() zero-pads the field for short names, which keeps the record
     * contents deterministic, but it does NOT write a terminator when the
     * source is MAXFNAMELEN bytes or longer.  Force one, so that nothing
     * reading rp_name back as a C string can run past the end of the field.
     */
    strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
    xlrec.rp_name[MAXFNAMELEN - 1] = '\0';

    /* Single-element chain covering the whole record struct, no buffer. */
    rdata.buffer = InvalidBuffer;
    rdata.data = (char *) &xlrec;
    rdata.len = sizeof(xl_restore_point);
    rdata.next = NULL;

    RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);

    /* Print the location as the high and low 32-bit halves of RecPtr. */
    ereport(LOG,
            (errmsg("restore point \"%s\" created at %X/%X",
                    rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));

    return RecPtr;
}

XLogRecPtr XLogSaveBufferForHint ( Buffer  buffer  ) 

Definition at line 7664 of file xlog.c.

References Assert, XLogRecData::buffer, XLogRecData::buffer_std, BufferGetBlock, XLogRecData::data, PGXACT::delayChkpt, GetRedoRecPtr(), BkpBlock::hole_length, BkpBlock::hole_offset, XLogRecData::len, MyPgXact, XLogRecData::next, XLOG_HINT, XLogCheckBuffer(), and XLogInsert().

Referenced by MarkBufferDirtyHint().

{
    XLogRecData chain[2];
    BkpBlock    blkHdr;
    XLogRecPtr  pageLsn;
    char        pageCopy[BLCKSZ];
    char       *livePage;

    /*
     * The caller must already have set delayChkpt, so that no checkpoint can
     * change our view of RedoRecPtr while we work.
     */
    Assert(MyPgXact->delayChkpt);

    /* Refresh RedoRecPtr so XLogCheckBuffer can make the right decision. */
    GetRedoRecPtr();

    /*
     * Fill in a phony chain element that exists purely for XLogCheckBuffer.
     * The same element is reset and reused below if a record is actually
     * inserted.
     */
    chain[0].buffer = buffer;
    chain[0].buffer_std = true;

    /*
     * Check the buffer while not holding an exclusive lock.  If the check
     * says no backup is needed, there is nothing to log.
     */
    if (!XLogCheckBuffer(chain, false, &pageLsn, &blkHdr))
        return InvalidXLogRecPtr;

    livePage = (char *) BufferGetBlock(buffer);

    /*
     * Work from a private copy of the page so that concurrent hint-bit or
     * LSN updates cannot affect what gets logged.  pd_lower/upper are
     * assumed not to change without an exclusive lock, so the hole bounds in
     * blkHdr are not racy.  The copy omits the hole: first the bytes before
     * it, then the bytes after it.
     */
    memcpy(pageCopy, livePage, blkHdr.hole_offset);
    memcpy(pageCopy + blkHdr.hole_offset,
           livePage + blkHdr.hole_offset + blkHdr.hole_length,
           BLCKSZ - blkHdr.hole_offset - blkHdr.hole_length);

    /* Second element: the hole-less page image. */
    chain[1].next = NULL;
    chain[1].buffer = InvalidBuffer;
    chain[1].len = BLCKSZ - blkHdr.hole_length;
    chain[1].data = pageCopy;

    /* First element: the backup block header, linked to the image. */
    chain[0].next = &(chain[1]);
    chain[0].buffer = InvalidBuffer;
    chain[0].len = sizeof(BkpBlock);
    chain[0].data = (char *) &blkHdr;

    return XLogInsert(RM_XLOG_ID, XLOG_HINT, chain);
}

void XLogSetAsyncXactLSN ( XLogRecPtr  asyncXactLSN  ) 

Definition at line 1765 of file xlog.c.

References XLogCtlData::asyncXactLSN, XLogwrtResult::Flush, XLogCtlData::info_lck, XLogCtlData::LogwrtResult, ProcGlobal, SetLatch(), SpinLockAcquire, SpinLockRelease, PROC_HDR::walwriterLatch, and XLogCtlData::WalWriterSleeping.

Referenced by RecordTransactionAbort(), and RecordTransactionCommit().

{
    /* go through a volatile pointer to prevent code rearrangement */
    volatile XLogCtlData *shared = XLogCtl;
    bool        walwriterAsleep;

    /*
     * Under info_lck: refresh the local LogwrtResult, note whether the
     * WALWriter is sleeping, and advance the shared asyncXactLSN if ours is
     * later.
     */
    SpinLockAcquire(&shared->info_lck);
    LogwrtResult = shared->LogwrtResult;
    walwriterAsleep = shared->WalWriterSleeping;
    if (shared->asyncXactLSN < asyncXactLSN)
        shared->asyncXactLSN = asyncXactLSN;
    SpinLockRelease(&shared->info_lck);

    /*
     * A sleeping WALWriter always gets kicked, to bring it out of low-power
     * mode.  An awake one is only nudged when there is a full page of WAL
     * that has not been flushed yet.
     */
    if (!walwriterAsleep)
    {
        /* back off to the last completed page boundary */
        XLogRecPtr  pageStart = asyncXactLSN - (asyncXactLSN % XLOG_BLCKSZ);

        /* already flushed that far: nothing more to do */
        if (pageStart <= LogwrtResult.Flush)
            return;
    }

    /*
     * Nudge the WALWriter, either because a full page of WAL is waiting or
     * because it must leave low-power mode for this async commit to reach
     * disk within the expected amount of time.
     */
    if (ProcGlobal->walwriterLatch)
        SetLatch(ProcGlobal->walwriterLatch);
}

void XLOGShmemInit ( void   ) 

Definition at line 3929 of file xlog.c.

References ALIGNOF_XLOG_BUFFER, Assert, XLogCtlInsert::currpage, XLogCtlData::info_lck, InitSharedLatch(), XLogCtlData::Insert, IsBootstrapProcessingMode, XLogCtlData::pages, ReadControlFile(), XLogCtlData::recoveryWakeupLatch, XLogCtlData::SharedHotStandbyActive, XLogCtlData::SharedRecoveryInProgress, ShmemInitStruct(), SpinLockInit, TYPEALIGN, XLogCtlData::ulsn_lck, XLogCtlData::WalWriterSleeping, XLogCtlData::xlblocks, XLOGbuffers, XLogCtlData::XLogCacheBlck, and XLOGShmemSize().

Referenced by CreateSharedMemoryAndSemaphores().

{
    bool        controlFileExisted;
    bool        xlogCtlExisted;
    char       *cursor;
    Size        xlblocksBytes;

    /* Look up (or create) the two shared structures, in this order. */
    ControlFile = (ControlFileData *)
        ShmemInitStruct("Control File", sizeof(ControlFileData),
                        &controlFileExisted);
    XLogCtl = (XLogCtlData *)
        ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &xlogCtlExisted);

    if (controlFileExisted || xlogCtlExisted)
    {
        /* either both structures were found or neither was */
        Assert(controlFileExisted && xlogCtlExisted);
        return;
    }

    /* Fresh allocation: start from an all-zero control struct. */
    memset(XLogCtl, 0, sizeof(XLogCtlData));

    /*
     * The xlblocks array sits directly after XLogCtlData.  XLogCtlData
     * contains XLogRecPtr fields, so its sizeof should already be a multiple
     * of their alignment and no padding is inserted here.
     */
    xlblocksBytes = sizeof(XLogRecPtr) * XLOGbuffers;
    cursor = ((char *) XLogCtl) + sizeof(XLogCtlData);
    XLogCtl->xlblocks = (XLogRecPtr *) cursor;
    memset(XLogCtl->xlblocks, 0, xlblocksBytes);
    cursor += xlblocksBytes;

    /*
     * The page buffers follow, with their start rounded up to an
     * ALIGNOF_XLOG_BUFFER boundary.
     */
    cursor = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, cursor);
    XLogCtl->pages = cursor;
    memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

    /*
     * Basic initialization of the XLogCtl shared data.  (StartupXLOG fills
     * in the rest.)
     */
    XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
    XLogCtl->SharedRecoveryInProgress = true;
    XLogCtl->SharedHotStandbyActive = false;
    XLogCtl->WalWriterSleeping = false;
    XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
    SpinLockInit(&XLogCtl->info_lck);
    SpinLockInit(&XLogCtl->ulsn_lck);
    InitSharedLatch(&XLogCtl->recoveryWakeupLatch);

    /*
     * Outside bootstrap mode pg_control should already exist, so read and
     * validate it right away (the comments in ReadControlFile() explain
     * why).
     */
    if (!IsBootstrapProcessingMode())
        ReadControlFile();
}

Size XLOGShmemSize ( void   ) 

Definition at line 3891 of file xlog.c.

References add_size(), ALIGNOF_XLOG_BUFFER, Assert, buf, mul_size(), PGC_POSTMASTER, PGC_S_OVERRIDE, SetConfigOption(), snprintf(), XLOGbuffers, and XLOGChooseNumBuffers().

Referenced by CreateSharedMemoryAndSemaphores(), and XLOGShmemInit().

{
    Size        total;

    /*
     * A wal_buffers value of -1 means "auto-tune": substitute the preferred
     * value now.  This is not a very clean place for it, but it has to wait
     * until NBuffers has its final value, and it has to happen before
     * XLOGbuffers is used for anything important.
     */
    if (XLOGbuffers == -1)
    {
        char        valueText[32];

        snprintf(valueText, sizeof(valueText), "%d", XLOGChooseNumBuffers());
        SetConfigOption("wal_buffers", valueText, PGC_POSTMASTER, PGC_S_OVERRIDE);
    }
    Assert(XLOGbuffers > 0);

    /* XLogCtl itself plus the xlblocks array that follows it */
    total = add_size(sizeof(XLogCtlData),
                     mul_size(sizeof(XLogRecPtr), XLOGbuffers));
    /* room for aligning the start of the XLOG I/O buffers */
    total = add_size(total, ALIGNOF_XLOG_BUFFER);
    /* the page buffers themselves */
    total = add_size(total, mul_size(XLOG_BLCKSZ, XLOGbuffers));

    /*
     * ControlFileData is intentionally not counted; it comes out of the
     * "slop factor" added by CreateSharedMemoryAndSemaphores.  Leaving it
     * out means this same routine can also be used to compute the actual
     * allocation size of the XLogCtl area.
     */

    return total;
}

static void XLogWrite ( XLogwrtRqst  WriteRqst,
bool  flexible,
bool  xlog_switch 
) [static]

Definition at line 1497 of file xlog.c.

References Assert, CHECKPOINT_CAUSE_XLOG, CritSectionCount, XLogCtlWrite::curridx, elog, ereport, errcode_for_file_access(), errmsg(), XLogwrtRqst::Flush, XLogwrtResult::Flush, GetRedoRecPtr(), XLogCtlData::info_lck, issue_xlog_fsync(), IsUnderPostmaster, XLogCtlWrite::lastSegSwitchTime, XLogCtlData::LogwrtResult, XLogCtlData::LogwrtRqst, NextBufIdx, NULL, openLogFile, openLogOff, openLogSegNo, XLogCtlData::pages, PANIC, RequestCheckpoint(), SpinLockAcquire, SpinLockRelease, sync_method, SYNC_METHOD_OPEN, SYNC_METHOD_OPEN_DSYNC, ThisTimeLineID, WalSndWakeupRequest, write, XLogwrtRqst::Write, XLogwrtResult::Write, XLogCtlData::Write, Write, XLogCtlData::xlblocks, XLByteInPrevSeg, XLByteToPrevSeg, XLogArchiveNotifySeg(), XLogArchivingActive, XLogCtlData::XLogCacheBlck, XLogCheckpointNeeded(), XLogFileClose(), XLogFileInit(), XLogFileNameP(), XLogFileOpen(), and XLogSegSize.

Referenced by AdvanceXLInsertBuffer(), XLogBackgroundFlush(), XLogFlush(), and XLogInsert().

{
    /*
     * Write WAL buffer pages from XLogCtl->pages to the log segment file(s)
     * until LogwrtResult.Write has reached WriteRqst.Write, then fsync if
     * WriteRqst.Flush is still ahead of what has been flushed, and finally
     * publish the updated LogwrtResult in shared memory.
     *
     * WriteRqst:   target positions; .Write is how far to write, .Flush is
     *              how far to flush.
     * flexible:    if true, stop after the first physical write() even if
     *              WriteRqst.Write has not been reached yet.
     * xlog_switch: if true, the segment is fsync'd (and treated as finished)
     *              after the last page written, even when that page is not
     *              the last page of the segment.
     *
     * Must be called inside a critical section (asserted below).  Seek and
     * write failures are reported at PANIC level.
     */
    XLogCtlWrite *Write = &XLogCtl->Write;
    bool        ispartialpage;  /* request ends inside the current page */
    bool        last_iteration; /* current page reaches the requested point */
    bool        finishing_seg;  /* pending pages run to the end of the segment */
    bool        use_existent;   /* in/out flag passed to XLogFileInit */
    int         curridx;        /* cache block index being considered */
    int         npages;         /* pages gathered for the next write() */
    int         startidx;       /* cache block index of first gathered page */
    uint32      startoffset;    /* file offset of first gathered page */

    /* We should always be inside a critical section here */
    Assert(CritSectionCount > 0);

    /*
     * Update local LogwrtResult (caller probably did this already, but...)
     */
    LogwrtResult = XLogCtl->LogwrtResult;

    /*
     * Since successive pages in the xlog cache are consecutively allocated,
     * we can usually gather multiple pages together and issue just one
     * write() call.  npages is the number of pages we have determined can be
     * written together; startidx is the cache block index of the first one,
     * and startoffset is the file offset at which it should go. The latter
     * two variables are only valid when npages > 0, but we must initialize
     * all of them to keep the compiler quiet.
     */
    npages = 0;
    startidx = 0;
    startoffset = 0;

    /*
     * Within the loop, curridx is the cache block index of the page to
     * consider writing.  We advance Write->curridx only after successfully
     * writing pages.  (Right now, this refinement is useless since we are
     * going to PANIC if any error occurs anyway; but someday it may come in
     * useful.)
     */
    curridx = Write->curridx;

    while (LogwrtResult.Write < WriteRqst.Write)
    {
        /*
         * Make sure we're not ahead of the insert process.  This could happen
         * if we're passed a bogus WriteRqst.Write that is past the end of the
         * last page that's been initialized by AdvanceXLInsertBuffer.
         */
        if (LogwrtResult.Write >= XLogCtl->xlblocks[curridx])
            elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
                 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
                 (uint32) (XLogCtl->xlblocks[curridx] >> 32),
                 (uint32) XLogCtl->xlblocks[curridx]);

        /* Advance LogwrtResult.Write to end of current buffer page */
        LogwrtResult.Write = XLogCtl->xlblocks[curridx];
        ispartialpage = WriteRqst.Write < LogwrtResult.Write;

        if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
        {
            /*
             * Switch to new logfile segment.  We cannot have any pending
             * pages here (since we dump what we have at segment end).
             */
            Assert(npages == 0);
            if (openLogFile >= 0)
                XLogFileClose();
            XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);

            /* create/use new log file */
            use_existent = true;
            openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
            openLogOff = 0;
        }

        /* Make sure we have the current logfile open */
        if (openLogFile < 0)
        {
            XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
            openLogFile = XLogFileOpen(openLogSegNo);
            openLogOff = 0;
        }

        /* Add current page to the set of pending pages-to-dump */
        if (npages == 0)
        {
            /* first of group */
            startidx = curridx;
            /* LogwrtResult.Write is the page's end, so step back one page */
            startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
        }
        npages++;

        /*
         * Dump the set if this will be the last loop iteration, or if we are
         * at the last page of the cache area (since the next page won't be
         * contiguous in memory), or if we are at the end of the logfile
         * segment.
         */
        last_iteration = WriteRqst.Write <= LogwrtResult.Write;

        /*
         * A partial page never completes a segment; otherwise the segment is
         * finished once the pending run reaches XLogSegSize.
         */
        finishing_seg = !ispartialpage &&
            (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;

        if (last_iteration ||
            curridx == XLogCtl->XLogCacheBlck ||
            finishing_seg)
        {
            char       *from;
            Size        nbytes;

            /* Need to seek in the file? */
            if (openLogOff != startoffset)
            {
                if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
                    ereport(PANIC,
                            (errcode_for_file_access(),
                             errmsg("could not seek in log file %s to offset %u: %m",
                                    XLogFileNameP(ThisTimeLineID, openLogSegNo),
                                    startoffset)));
                openLogOff = startoffset;
            }

            /* OK to write the page(s) */
            from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
            nbytes = npages * (Size) XLOG_BLCKSZ;
            /* clear errno so a failure that leaves it unset can be detected */
            errno = 0;
            /* anything other than a complete write is treated as an error */
            if (write(openLogFile, from, nbytes) != nbytes)
            {
                /* if write didn't set errno, assume no disk space */
                if (errno == 0)
                    errno = ENOSPC;
                ereport(PANIC,
                        (errcode_for_file_access(),
                         errmsg("could not write to log file %s "
                                "at offset %u, length %lu: %m",
                                XLogFileNameP(ThisTimeLineID, openLogSegNo),
                                openLogOff, (unsigned long) nbytes)));
            }

            /* Update state for write */
            openLogOff += nbytes;
            /* a partially filled page stays current so it is written again */
            Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
            npages = 0;

            /*
             * If we just wrote the whole last page of a logfile segment,
             * fsync the segment immediately.  This avoids having to go back
             * and re-open prior segments when an fsync request comes along
             * later. Doing it here ensures that one and only one backend will
             * perform this fsync.
             *
             * We also do this if this is the last page written for an xlog
             * switch.
             *
             * This is also the right place to notify the Archiver that the
             * segment is ready to copy to archival storage, and to update the
             * timer for archive_timeout, and to signal for a checkpoint if
             * too many logfile segments have been used since the last
             * checkpoint.
             */
            if (finishing_seg || (xlog_switch && last_iteration))
            {
                issue_xlog_fsync(openLogFile, openLogSegNo);

                /* signal that we need to wakeup walsenders later */
                WalSndWakeupRequest();

                LogwrtResult.Flush = LogwrtResult.Write;        /* end of page */

                if (XLogArchivingActive())
                    XLogArchiveNotifySeg(openLogSegNo);

                Write->lastSegSwitchTime = (pg_time_t) time(NULL);

                /*
                 * Request a checkpoint if we've consumed too much xlog since
                 * the last one.  For speed, we first check using the local
                 * copy of RedoRecPtr, which might be out of date; if it looks
                 * like a checkpoint is needed, forcibly update RedoRecPtr and
                 * recheck.
                 */
                if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
                {
                    (void) GetRedoRecPtr();
                    if (XLogCheckpointNeeded(openLogSegNo))
                        RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
                }
            }
        }

        if (ispartialpage)
        {
            /* Only asked to write a partial page */
            LogwrtResult.Write = WriteRqst.Write;
            break;
        }
        curridx = NextBufIdx(curridx);

        /* If flexible, break out of loop as soon as we wrote something */
        if (flexible && npages == 0)
            break;
    }

    /* Every gathered page must have been dumped by the time we get here. */
    Assert(npages == 0);
    Assert(curridx == Write->curridx);

    /*
     * If asked to flush, do so
     */
    if (LogwrtResult.Flush < WriteRqst.Flush &&
        LogwrtResult.Flush < LogwrtResult.Write)

    {
        /*
         * Could get here without iterating above loop, in which case we might
         * have no open file or the wrong one.  However, we do not need to
         * fsync more than one file.
         */
        /*
         * NOTE(review): no explicit fsync is issued for the two open-sync
         * methods, presumably because the file is opened in a
         * synchronous-write mode -- confirm against issue_xlog_fsync.
         */
        if (sync_method != SYNC_METHOD_OPEN &&
            sync_method != SYNC_METHOD_OPEN_DSYNC)
        {
            if (openLogFile >= 0 &&
                !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
                XLogFileClose();
            if (openLogFile < 0)
            {
                XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
                openLogFile = XLogFileOpen(openLogSegNo);
                openLogOff = 0;
            }

            issue_xlog_fsync(openLogFile, openLogSegNo);
        }

        /* signal that we need to wakeup walsenders later */
        WalSndWakeupRequest();

        LogwrtResult.Flush = LogwrtResult.Write;
    }

    /*
     * Update shared-memory status
     *
     * We make sure that the shared 'request' values do not fall behind the
     * 'result' values.  This is not absolutely essential, but it saves some
     * code in a couple of places.
     */
    {
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;

        SpinLockAcquire(&xlogctl->info_lck);
        xlogctl->LogwrtResult = LogwrtResult;
        if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
            xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
        if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
            xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
        SpinLockRelease(&xlogctl->info_lck);
    }
}


Variable Documentation

char* archiveCleanupCommand = NULL [static]

Definition at line 212 of file xlog.c.

Referenced by readRecoveryCommandFile(), and StartupXLOG().

bool bgwriterLaunched = false [static]

Definition at line 631 of file xlog.c.

Referenced by StartupXLOG(), and XLogPageRead().

Definition at line 52 of file bootstrap.c.

Referenced by AuxiliaryProcessMain(), and BootStrapXLOG().

Definition at line 73 of file xlog.c.

Referenced by IsCheckpointOnSchedule(), and XLogCheckpointNeeded().

Definition at line 130 of file xlog.c.

Referenced by BufferSync(), CheckPointBuffers(), and mdsync().

int CommitDelay = 0

Definition at line 84 of file xlog.c.

Referenced by XLogFlush().

int CommitSiblings = 5

Definition at line 85 of file xlog.c.

Referenced by XLogFlush().

ControlFileData* ControlFile = NULL [static]

Definition at line 517 of file xlog.c.

Referenced by main().

XLogSource currentSource = 0 [static]

Definition at line 591 of file xlog.c.

Referenced by ReadRecord(), and WaitForWALToBecomeAvailable().

Definition at line 614 of file xlog.c.

Referenced by CheckRecoveryConsistency(), ReadRecord(), rescanLatestTimeLine(), and StartupXLOG().

List* expectedTLEs [static]

Definition at line 263 of file xlog.c.

bool fast_promote = false [static]

Definition at line 229 of file xlog.c.

Referenced by CheckForStandbyTrigger(), and StartupXLOG().

Definition at line 80 of file xlog.c.

Referenced by BootStrapXLOG(), and UpdateFullPageWrites().

bool InRecovery = false
bool InRedo = false [static]

Definition at line 628 of file xlog.c.

Referenced by StartupXLOG(), and XLogFileRead().

Definition at line 166 of file xlog.c.

Referenced by StartupXLOG(), and xlog_redo().

XLogRecPtr LastRec [static]

Definition at line 154 of file xlog.c.

Referenced by StartupXLOG().

bool lastSourceFailed = false [static]

Definition at line 592 of file xlog.c.

Referenced by ReadRecord(), WaitForWALToBecomeAvailable(), and XLogPageRead().

bool LocalHotStandbyActive = false [static]

Definition at line 178 of file xlog.c.

Referenced by CheckRecoveryConsistency(), HotStandbyActive(), and recoveryPausesHere().

bool LocalRecoveryInProgress = true [static]

Definition at line 172 of file xlog.c.

Referenced by RecoveryInProgress().

int LocalXLogInsertAllowed = -1 [static]

Definition at line 81 of file xlog.c.

Referenced by CreateCheckPoint(), CreateRestartPoint(), LogCheckpointEnd(), and mdsync().

XLogwrtResult LogwrtResult = {0, 0} [static]

Definition at line 543 of file xlog.c.

int openLogFile = -1 [static]
uint32 openLogOff = 0 [static]

Definition at line 568 of file xlog.c.

Referenced by StartupXLOG(), and XLogWrite().

XLogSegNo openLogSegNo = 0 [static]
char* PrimaryConnInfo = NULL [static]

Definition at line 222 of file xlog.c.

Referenced by readRecoveryCommandFile(), and WaitForWALToBecomeAvailable().

XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr [static]

Definition at line 273 of file xlog.c.

Referenced by CreateCheckPoint(), and XLogInsert().

int readFile = -1 [static]
uint32 readLen = 0 [static]

Definition at line 581 of file xlog.c.

Referenced by ReadPageInternal(), and XLogPageRead().

uint32 readOff = 0 [static]

Definition at line 580 of file xlog.c.

Referenced by StartupXLOG(), XLogPageRead(), and XLogReadRecord().

Definition at line 613 of file xlog.c.

Referenced by ReadRecord(), RecoveryRestartPoint(), StartupXLOG(), and xlog_redo().

XLogSegNo readSegNo = 0 [static]

Definition at line 579 of file xlog.c.

Referenced by WaitForWALToBecomeAvailable(), and XLogPageRead().

XLogSource readSource = 0 [static]
XLogRecPtr receivedUpto = 0 [static]

Definition at line 157 of file xlog.c.

Referenced by WaitForWALToBecomeAvailable(), and XLogPageRead().

TimeLineID receiveTLI = 0 [static]

Definition at line 158 of file xlog.c.

Referenced by GetStandbyFlushRecPtr(), and WaitForWALToBecomeAvailable().

char* recoveryEndCommand = NULL [static]

Definition at line 211 of file xlog.c.

Referenced by readRecoveryCommandFile(), and StartupXLOG().

bool recoveryPauseAtTarget = true [static]

Definition at line 215 of file xlog.c.

Referenced by readRecoveryCommandFile(), and StartupXLOG().

char* recoveryRestoreCommand = NULL

Definition at line 210 of file xlog.c.

Referenced by readRecoveryCommandFile(), and RestoreArchivedFile().

Definition at line 235 of file xlog.c.

Referenced by recoveryStopsHere(), and StartupXLOG().

char recoveryStopName[MAXFNAMELEN] [static]

Definition at line 234 of file xlog.c.

Referenced by recoveryStopsHere(), and StartupXLOG().

Definition at line 233 of file xlog.c.

Referenced by recoveryStopsHere(), and StartupXLOG().

Definition at line 232 of file xlog.c.

Referenced by recoveryStopsHere(), and StartupXLOG().

RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET [static]

Definition at line 213 of file xlog.c.

Referenced by readRecoveryCommandFile(), recoveryStopsHere(), and StartupXLOG().

bool recoveryTargetInclusive = true [static]

Definition at line 214 of file xlog.c.

Referenced by readRecoveryCommandFile(), and recoveryStopsHere().

bool recoveryTargetIsLatest = false [static]

Definition at line 262 of file xlog.c.

Referenced by readRecoveryCommandFile(), StartupXLOG(), and WaitForWALToBecomeAvailable().

char* recoveryTargetName [static]

Definition at line 218 of file xlog.c.

Referenced by readRecoveryCommandFile(), recoveryStopsHere(), and StartupXLOG().

Definition at line 217 of file xlog.c.

Referenced by readRecoveryCommandFile(), recoveryStopsHere(), and StartupXLOG().

Definition at line 216 of file xlog.c.

Referenced by readRecoveryCommandFile(), recoveryStopsHere(), and StartupXLOG().

XLogRecPtr RedoStartLSN = InvalidXLogRecPtr [static]

Definition at line 299 of file xlog.c.

Referenced by read_backup_label(), StartupXLOG(), and WaitForWALToBecomeAvailable().

bool restoredFromArchive = false [static]

Definition at line 207 of file xlog.c.

Referenced by XLogFileRead().

bool StandbyMode = false
bool StandbyModeRequested = false [static]

Definition at line 221 of file xlog.c.

Referenced by ReadRecord(), readRecoveryCommandFile(), StartupXLOG(), and XLogPageRead().

HotStandbyState standbyState = STANDBY_DISABLED
int sync_method = DEFAULT_SYNC_METHOD
Initial value:
 {
    {"fsync", SYNC_METHOD_FSYNC, false},












    {NULL, 0, false}
}

Definition at line 108 of file xlog.c.

char* TriggerFile = NULL [static]

Definition at line 223 of file xlog.c.

Referenced by CheckForStandbyTrigger(), and readRecoveryCommandFile().

bool updateMinRecoveryPoint = true [static]

Definition at line 619 of file xlog.c.

Referenced by UpdateMinRecoveryPoint(), and XLogNeedsFlush().

Definition at line 74 of file xlog.c.

Referenced by KeepLogSeg().

int wal_level = WAL_LEVEL_MINIMAL

Definition at line 83 of file xlog.c.

Referenced by BootStrapXLOG(), PostmasterMain(), and XLogReportParameters().

XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr
char* XLogArchiveCommand = NULL

Definition at line 78 of file xlog.c.

Referenced by pgarch_archiveXlog(), and show_archive_command().

Definition at line 77 of file xlog.c.

Referenced by PostmasterMain().

Definition at line 76 of file xlog.c.

Referenced by CheckArchiveTimeout(), and CheckpointerMain().

int XLOGbuffers = -1

Definition at line 75 of file xlog.c.

Referenced by check_wal_buffers(), XLOGShmemInit(), and XLOGShmemSize().

XLogCtlData* XLogCtl = NULL [static]

Definition at line 512 of file xlog.c.

Definition at line 610 of file xlog.c.

Referenced by GetXLogReceiptTime(), WaitForWALToBecomeAvailable(), and XLogFileRead().

Definition at line 609 of file xlog.c.

Referenced by GetXLogReceiptTime(), StartupXLOG(), WaitForWALToBecomeAvailable(), and XLogFileRead().

const char* xlogSourceNames[] = { "any", "archive", "pg_xlog", "stream" } [static]

Definition at line 558 of file xlog.c.

Referenced by WaitForWALToBecomeAvailable().