Header And Logo

PostgreSQL
| The world's most advanced open source database.

xlog.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * xlog.c
00004  *      PostgreSQL transaction log manager
00005  *
00006  *
00007  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00008  * Portions Copyright (c) 1994, Regents of the University of California
00009  *
00010  * src/backend/access/transam/xlog.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 
00015 #include "postgres.h"
00016 
00017 #include <ctype.h>
00018 #include <time.h>
00019 #include <fcntl.h>
00020 #include <sys/stat.h>
00021 #include <sys/time.h>
00022 #include <unistd.h>
00023 
00024 #include "access/clog.h"
00025 #include "access/multixact.h"
00026 #include "access/subtrans.h"
00027 #include "access/timeline.h"
00028 #include "access/transam.h"
00029 #include "access/tuptoaster.h"
00030 #include "access/twophase.h"
00031 #include "access/xact.h"
00032 #include "access/xlog_internal.h"
00033 #include "access/xlogreader.h"
00034 #include "access/xlogutils.h"
00035 #include "catalog/catversion.h"
00036 #include "catalog/pg_control.h"
00037 #include "catalog/pg_database.h"
00038 #include "miscadmin.h"
00039 #include "pgstat.h"
00040 #include "postmaster/bgwriter.h"
00041 #include "postmaster/startup.h"
00042 #include "replication/walreceiver.h"
00043 #include "replication/walsender.h"
00044 #include "storage/bufmgr.h"
00045 #include "storage/fd.h"
00046 #include "storage/ipc.h"
00047 #include "storage/latch.h"
00048 #include "storage/pmsignal.h"
00049 #include "storage/predicate.h"
00050 #include "storage/proc.h"
00051 #include "storage/procarray.h"
00052 #include "storage/reinit.h"
00053 #include "storage/smgr.h"
00054 #include "storage/spin.h"
00055 #include "utils/builtins.h"
00056 #include "utils/guc.h"
00057 #include "utils/ps_status.h"
00058 #include "utils/relmapper.h"
00059 #include "utils/snapmgr.h"
00060 #include "utils/timestamp.h"
00061 #include "pg_trace.h"
00062 
00063 extern uint32 bootstrap_data_checksum_version;
00064 
00065 /* File path names (all relative to $PGDATA) */
00066 #define RECOVERY_COMMAND_FILE   "recovery.conf"
00067 #define RECOVERY_COMMAND_DONE   "recovery.done"
00068 #define PROMOTE_SIGNAL_FILE "promote"
00069 #define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
00070 
00071 
00072 /* User-settable parameters */
00073 int         CheckPointSegments = 3;
00074 int         wal_keep_segments = 0;
00075 int         XLOGbuffers = -1;
00076 int         XLogArchiveTimeout = 0;
00077 bool        XLogArchiveMode = false;
00078 char       *XLogArchiveCommand = NULL;
00079 bool        EnableHotStandby = false;
00080 bool        fullPageWrites = true;
00081 bool        log_checkpoints = false;
00082 int         sync_method = DEFAULT_SYNC_METHOD;
00083 int         wal_level = WAL_LEVEL_MINIMAL;
00084 int         CommitDelay = 0;    /* precommit delay in microseconds */
00085 int         CommitSiblings = 5; /* # concurrent xacts needed to sleep */
00086 
00087 #ifdef WAL_DEBUG
00088 bool        XLOG_DEBUG = false;
00089 #endif
00090 
00091 /*
00092  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
00093  * When we are done with an old XLOG segment file, we will recycle it as a
00094  * future XLOG segment as long as there aren't already XLOGfileslop future
00095  * segments; else we'll delete it.  This could be made a separate GUC
00096  * variable, but at present I think it's sufficient to hardwire it as
00097  * 2*CheckPointSegments+1.  Under normal conditions, a checkpoint will free
00098  * no more than 2*CheckPointSegments log segments, and we want to recycle all
00099  * of them; the +1 allows boundary cases to happen without wasting a
00100  * delete/create-segment cycle.
00101  */
00102 #define XLOGfileslop    (2*CheckPointSegments + 1)
00103 
00104 
00105 /*
00106  * GUC support: maps sync_method option names to SYNC_METHOD_* codes.
00107  */
00108 const struct config_enum_entry sync_method_options[] = {
00109     {"fsync", SYNC_METHOD_FSYNC, false},    /* the only unconditional entry */
00110 #ifdef HAVE_FSYNC_WRITETHROUGH
00111     {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
00112 #endif
00113 #ifdef HAVE_FDATASYNC
00114     {"fdatasync", SYNC_METHOD_FDATASYNC, false},
00115 #endif
00116 #ifdef OPEN_SYNC_FLAG
00117     {"open_sync", SYNC_METHOD_OPEN, false},
00118 #endif
00119 #ifdef OPEN_DATASYNC_FLAG
00120     {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
00121 #endif
00122     {NULL, 0, false}            /* last entry: NULL name, zero value */
00123 };
00124 
00125 /*
00126  * Statistics for current checkpoint are collected in this global struct.
00127  * Because only the checkpointer or a stand-alone backend can perform
00128  * checkpoints, this will be unused in normal backends.
00129  */
00130 CheckpointStatsData CheckpointStats;
00131 
00132 /*
00133  * ThisTimeLineID will be the same in all backends --- it identifies the
00134  * current WAL timeline for the database system.
00135  */
00136 TimeLineID  ThisTimeLineID = 0;
00137 
00138 /*
00139  * Are we doing recovery from XLOG?
00140  *
00141  * This is only ever true in the startup process; it should be read as meaning
00142  * "this process is replaying WAL records", rather than "the system is in
00143  * recovery mode".  It should be examined primarily by functions that need
00144  * to act differently when called from a WAL redo function (e.g., to skip WAL
00145  * logging).  To check whether the system is in recovery regardless of which
00146  * process you're running in, use RecoveryInProgress() but only after shared
00147  * memory startup and lock initialization.
00148  */
00149 bool        InRecovery = false;
00150 
00151 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
00152 HotStandbyState standbyState = STANDBY_DISABLED;
00153 
00154 static XLogRecPtr LastRec;
00155 
00156 /* Local copy of WalRcv->receivedUpto */
00157 static XLogRecPtr receivedUpto = 0;
00158 static TimeLineID receiveTLI = 0;
00159 
00160 /*
00161  * During recovery, lastFullPageWrites keeps track of full_page_writes that
00162  * the replayed WAL records indicate. It's initialized with full_page_writes
00163  * that the recovery starting checkpoint record indicates, and then updated
00164  * each time XLOG_FPW_CHANGE record is replayed.
00165  */
00166 static bool lastFullPageWrites;
00167 
00168 /*
00169  * Local copy of SharedRecoveryInProgress variable. True actually means "not
00170  * known, need to check the shared state".
00171  */
00172 static bool LocalRecoveryInProgress = true;
00173 
00174 /*
00175  * Local copy of SharedHotStandbyActive variable. False actually means "not
00176  * known, need to check the shared state".
00177  */
00178 static bool LocalHotStandbyActive = false;
00179 
00180 /*
00181  * Local state for XLogInsertAllowed():
00182  *      1: unconditionally allowed to insert XLOG
00183  *      0: unconditionally not allowed to insert XLOG
00184  *      -1: must check RecoveryInProgress(); disallow until it is false
00185  * Most processes start with -1 and transition to 1 after seeing that recovery
00186  * is not in progress.  But we can also force the value for special cases.
00187  * The coding in XLogInsertAllowed() depends on the first two of these states
00188  * being numerically the same as bool true and false.
00189  */
00190 static int  LocalXLogInsertAllowed = -1;
00191 
00192 /*
00193  * When ArchiveRecoveryRequested is set, archive recovery was requested,
00194  * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
00195  * currently recovering using offline XLOG archives. These variables are only
00196  * valid in the startup process.
00197  *
00198  * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
00199  * currently performing crash recovery using only XLOG files in pg_xlog, but
00200  * will switch to using offline XLOG archives as soon as we reach the end of
00201  * WAL in pg_xlog.
00202 */
00203 bool ArchiveRecoveryRequested = false;
00204 bool InArchiveRecovery = false;
00205 
00206 /* Was the last xlog file restored from archive, or local? */
00207 static bool restoredFromArchive = false;
00208 
00209 /* options taken from recovery.conf for archive recovery */
00210 char *recoveryRestoreCommand = NULL;
00211 static char *recoveryEndCommand = NULL;
00212 static char *archiveCleanupCommand = NULL;
00213 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
00214 static bool recoveryTargetInclusive = true;
00215 static bool recoveryPauseAtTarget = true;
00216 static TransactionId recoveryTargetXid;
00217 static TimestampTz recoveryTargetTime;
00218 static char *recoveryTargetName;
00219 
00220 /* options taken from recovery.conf for XLOG streaming */
00221 static bool StandbyModeRequested = false;
00222 static char *PrimaryConnInfo = NULL;
00223 static char *TriggerFile = NULL;
00224 
00225 /* are we currently in standby mode? */
00226 bool StandbyMode = false;
00227 
00228 /* whether request for fast promotion has been made yet */
00229 static bool fast_promote = false;
00230 
00231 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
00232 static TransactionId recoveryStopXid;
00233 static TimestampTz recoveryStopTime;
00234 static char recoveryStopName[MAXFNAMELEN];
00235 static bool recoveryStopAfter;
00236 
00237 /*
00238  * During normal operation, the only timeline we care about is ThisTimeLineID.
00239  * During recovery, however, things are more complicated.  To simplify life
00240  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
00241  * scan through the WAL history (that is, it is the line that was active when
00242  * the currently-scanned WAL record was generated).  We also need these
00243  * timeline values:
00244  *
00245  * recoveryTargetTLI: the desired timeline that we want to end in.
00246  *
00247  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
00248  *
00249  * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
00250  * its known parents, newest first (so recoveryTargetTLI is always the
00251  * first list member).  Only these TLIs are expected to be seen in the WAL
00252  * segments we read, and indeed only these TLIs will be considered as
00253  * candidate WAL files to open at all.
00254  *
00255  * curFileTLI: the TLI appearing in the name of the current input WAL file.
00256  * (This is not necessarily the same as ThisTimeLineID, because we could
00257  * be scanning data that was copied from an ancestor timeline when the current
00258  * file was created.)  During a sequential scan we do not allow this value
00259  * to decrease.
00260  */
00261 static TimeLineID recoveryTargetTLI;
00262 static bool recoveryTargetIsLatest = false;
00263 static List *expectedTLEs;
00264 static TimeLineID curFileTLI;
00265 
00266 /*
00267  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
00268  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
00269  * end+1 of the last record, and is reset when we end a top-level transaction,
00270  * or start a new one; so it can be used to tell if the current transaction has
00271  * created any XLOG records.
00272  */
00273 static XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
00274 
00275 XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
00276 
00277 /*
00278  * RedoRecPtr is this backend's local copy of the REDO record pointer
00279  * (which is almost but not quite the same as a pointer to the most recent
00280  * CHECKPOINT record).  We update this from the shared-memory copy,
00281  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
00282  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
00283  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
00284  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
00285  * InitXLOGAccess.
00286  */
00287 static XLogRecPtr RedoRecPtr;
00288 
00289 /*
00290  * RedoStartLSN points to the checkpoint's REDO location which is specified
00291  * in a backup label file, backup history file or control file. In standby
00292  * mode, XLOG streaming usually starts from the position where an invalid
00293  * record was found. But if we fail to read even the initial checkpoint
00294  * record, we use the REDO location instead of the checkpoint location as
00295  * the start position of XLOG streaming. Otherwise we would have to jump
00296  * backwards to the REDO location after reading the checkpoint record,
00297  * because the REDO record can precede the checkpoint record.
00298  */
00299 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
00300 
00301 /*----------
00302  * Shared-memory data structures for XLOG control
00303  *
00304  * LogwrtRqst indicates a byte position that we need to write and/or fsync
00305  * the log up to (all records before that point must be written or fsynced).
00306  * LogwrtResult indicates the byte positions we have already written/fsynced.
00307  * These structs are identical but are declared separately to indicate their
00308  * slightly different functions.
00309  *
00310  * To read XLogCtl->LogwrtResult, you must hold either info_lck or
00311  * WALWriteLock.  To update it, you need to hold both locks.  The point of
00312  * this arrangement is that the value can be examined by code that already
00313  * holds WALWriteLock without needing to grab info_lck as well.  In addition
00314  * to the shared variable, each backend has a private copy of LogwrtResult,
00315  * which is updated when convenient.
00316  *
00317  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
00318  * (protected by info_lck), but we don't need to cache any copies of it.
00319  *
00320  * info_lck is only held long enough to read/update the protected variables,
00321  * so it's a plain spinlock.  The other locks are held longer (potentially
00322  * over I/O operations), so we use LWLocks for them.  These locks are:
00323  *
00324  * WALInsertLock: must be held to insert a record into the WAL buffers.
00325  *
00326  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
00327  * XLogFlush).
00328  *
00329  * ControlFileLock: must be held to read/update control file or create
00330  * new log file.
00331  *
00332  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
00333  * only one checkpointer at a time; currently, with all checkpoints done by
00334  * the checkpointer, this is just pro forma).
00335  *
00336  *----------
00337  */
00338 
00339 typedef struct XLogwrtRqst      /* how far the log needs to be written/fsynced */
00340 {
00341     XLogRecPtr  Write;          /* last byte + 1 to write out */
00342     XLogRecPtr  Flush;          /* last byte + 1 to flush */
00343 } XLogwrtRqst;
00344 
00345 typedef struct XLogwrtResult    /* how far the log has already been written/fsynced */
00346 {
00347     XLogRecPtr  Write;          /* last byte + 1 written out */
00348     XLogRecPtr  Flush;          /* last byte + 1 flushed */
00349 } XLogwrtResult;
00350 
00351 /*
00352  * Shared state data for XLogInsert (XLogCtlData.Insert; see WALInsertLock).
00353  */
00354 typedef struct XLogCtlInsert
00355 {
00356     XLogRecPtr  PrevRecord;     /* start of previously-inserted record */
00357     int         curridx;        /* current block index in cache */
00358     XLogPageHeader currpage;    /* points to header of block in cache */
00359     char       *currpos;        /* current insertion point in cache */
00360     XLogRecPtr  RedoRecPtr;     /* current redo point for insertions */
00361     bool        forcePageWrites;    /* forcing full-page writes for PITR? */
00362 
00363     /*
00364      * fullPageWrites is the master copy used by all backends to determine
00365      * whether to write full-page to WAL, instead of using process-local one.
00366      * This is required because, when full_page_writes is changed by SIGHUP,
00367      * we must WAL-log it before it actually affects WAL-logging by backends.
00368      * The checkpointer sets it at startup or after SIGHUP.
00369      */
00370     bool        fullPageWrites;
00371 
00372     /*
00373      * exclusiveBackup is true if a backup started with pg_start_backup() is
00374      * in progress, and nonExclusiveBackups is a counter indicating the number
00375      * of streaming base backups currently in progress. forcePageWrites is set
00376      * to true when either of these is non-zero. lastBackupStart is the latest
00377      * checkpoint redo location used as a starting point for an online backup.
00378      */
00379     bool        exclusiveBackup;
00380     int         nonExclusiveBackups;
00381     XLogRecPtr  lastBackupStart;
00382 } XLogCtlInsert;
00383 
00384 /*
00385  * Shared state data for XLogWrite/XLogFlush.
00386  */
00387 typedef struct XLogCtlWrite
00388 {
00389     int         curridx;        /* cache index of next block to write */
00390     pg_time_t   lastSegSwitchTime;      /* time of last xlog segment switch */
00391 } XLogCtlWrite;
00392 
00393 /*
00394  * Total shared-memory state for XLOG.
00395  */
00396 typedef struct XLogCtlData
00397 {
00398     /* Protected by WALInsertLock: */
00399     XLogCtlInsert Insert;
00400 
00401     /* Protected by info_lck: */
00402     XLogwrtRqst LogwrtRqst;
00403     uint32      ckptXidEpoch;   /* nextXID & epoch of latest checkpoint */
00404     TransactionId ckptXid;
00405     XLogRecPtr  asyncXactLSN;   /* LSN of newest async commit/abort */
00406     XLogSegNo   lastRemovedSegNo; /* latest removed/recycled XLOG segment */
00407 
00408     /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck */
00409     XLogRecPtr  unloggedLSN;
00410     slock_t     ulsn_lck;
00411 
00412     /* Protected by WALWriteLock: */
00413     XLogCtlWrite Write;
00414 
00415     /*
00416      * Protected by info_lck and WALWriteLock (you must hold either lock to
00417      * read it, but both to update)
00418      */
00419     XLogwrtResult LogwrtResult;
00420 
00421     /*
00422      * These values do not change after startup, although the pointed-to pages
00423      * and xlblocks values certainly do.  Permission to read/write the pages
00424      * and xlblocks values depends on WALInsertLock and WALWriteLock.
00425      */
00426     char       *pages;          /* buffers for unwritten XLOG pages */
00427     XLogRecPtr *xlblocks;       /* 1st byte ptr-s + XLOG_BLCKSZ */
00428     int         XLogCacheBlck;  /* highest allocated xlog buffer index */
00429 
00430     /*
00431      * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
00432      * If we created a new timeline when the system was started up,
00433      * PrevTimeLineID is the old timeline's ID that we forked off from.
00434      * Otherwise it's equal to ThisTimeLineID.
00435      */
00436     TimeLineID  ThisTimeLineID;
00437     TimeLineID  PrevTimeLineID;
00438 
00439     /*
00440      * archiveCleanupCommand is read from recovery.conf but needs to be in
00441      * shared memory so that the checkpointer process can access it.
00442      */
00443     char        archiveCleanupCommand[MAXPGPATH];
00444 
00445     /*
00446      * SharedRecoveryInProgress indicates if we're still in crash or archive
00447      * recovery.  Protected by info_lck.
00448      */
00449     bool        SharedRecoveryInProgress;
00450 
00451     /*
00452      * SharedHotStandbyActive indicates if hot standby is currently active
00453      * (see LocalHotStandbyActive).  Protected by info_lck.
00454      */
00455     bool        SharedHotStandbyActive;
00456 
00457     /*
00458      * WalWriterSleeping indicates whether the WAL writer is currently in
00459      * low-power mode (and hence should be nudged if an async commit occurs).
00460      * Protected by info_lck.
00461      */
00462     bool        WalWriterSleeping;
00463 
00464     /*
00465      * recoveryWakeupLatch is used to wake up the startup process to continue
00466      * WAL replay, if it is waiting for WAL to arrive or failover trigger file
00467      * to appear.
00468      */
00469     Latch       recoveryWakeupLatch;
00470 
00471     /*
00472      * During recovery, we keep a copy of the latest checkpoint record here.
00473      * Used by the checkpointer when it wants to create a restartpoint.
00474      *
00475      * Protected by info_lck.
00476      */
00477     XLogRecPtr  lastCheckPointRecPtr;
00478     CheckPoint  lastCheckPoint;
00479 
00480     /*
00481      * lastReplayedEndRecPtr points to end+1 of the last record successfully
00482      * replayed. When we're currently replaying a record, ie. in a redo
00483      * function, replayEndRecPtr points to the end+1 of the record being
00484      * replayed, otherwise it's equal to lastReplayedEndRecPtr.
00485      */
00486     XLogRecPtr  lastReplayedEndRecPtr;
00487     TimeLineID  lastReplayedTLI;
00488     XLogRecPtr  replayEndRecPtr;
00489     TimeLineID  replayEndTLI;
00490     /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
00491     TimestampTz recoveryLastXTime;
00492     /* current effective recovery target timeline */
00493     TimeLineID  RecoveryTargetTLI;
00494 
00495     /*
00496      * timestamp of when we started replaying the current chunk of WAL data,
00497      * only relevant for replication or archive recovery
00498      */
00499     TimestampTz currentChunkStartTime;
00500     /* Are we requested to pause recovery? */
00501     bool        recoveryPause;
00502 
00503     /*
00504      * lastFpwDisableRecPtr points to the start of the last replayed
00505      * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
00506      */
00507     XLogRecPtr  lastFpwDisableRecPtr;
00508 
00509     slock_t     info_lck;       /* locks shared variables shown above */
00510 } XLogCtlData;
00511 
00512 static XLogCtlData *XLogCtl = NULL;
00513 
00514 /*
00515  * We maintain an image of pg_control in shared memory.
00516  */
00517 static ControlFileData *ControlFile = NULL;
00518 
00519 /*
00520  * Macros for managing XLogInsert state.  In most cases, the calling routine
00521  * has local copies of XLogCtl->Insert and/or XLogCtl->Insert.curridx,
00522  * so these are passed as parameters instead of being fetched via XLogCtl.
00523  */
00524 
00525 /* Bytes of free space remaining in the current xlog page buffer */
00526 #define INSERT_FREESPACE(Insert)  \
00527     (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
00528 
00529 /* Assign the XLogRecPtr of the current insertion point to recptr */
00530 #define INSERT_RECPTR(recptr,Insert,curridx)  \
00531         (recptr) = XLogCtl->xlblocks[curridx] - INSERT_FREESPACE(Insert)
00532 /* Buffer index before idx; index 0 wraps around to XLogCacheBlck */
00533 #define PrevBufIdx(idx)     \
00534         (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))
00535 /* Buffer index after idx; index XLogCacheBlck wraps around to 0 */
00536 #define NextBufIdx(idx)     \
00537         (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
00538 
00539 /*
00540  * Private, possibly out-of-date copy of shared LogwrtResult.
00541  * See discussion above.
00542  */
00543 static XLogwrtResult LogwrtResult = {0, 0};
00544 
00545 /*
00546  * Codes indicating where we got a WAL file from during recovery, or where
00547  * to attempt to get one.
00548  */
00549 typedef enum
00550 {
00551     XLOG_FROM_ANY = 0,      /* request to read WAL from any source */
00552     XLOG_FROM_ARCHIVE,      /* restored using restore_command */
00553     XLOG_FROM_PG_XLOG,      /* existing file in pg_xlog */
00554     XLOG_FROM_STREAM,       /* streamed from master */
00555 } XLogSource;
00556 
00557 /* human-readable names for XLogSources, in enum order, for debugging output */
00558 static const char *xlogSourceNames[] = { "any", "archive", "pg_xlog", "stream" };
00559 
00560 /*
00561  * openLogFile is -1 or a kernel FD for an open log file segment.
00562  * When it's open, openLogOff is the current seek offset in the file.
00563  * openLogSegNo identifies the segment.  These variables are only
00564  * used to write the XLOG, and so will normally refer to the active segment.
00565  */
00566 static int  openLogFile = -1;
00567 static XLogSegNo openLogSegNo = 0;
00568 static uint32 openLogOff = 0;
00569 
00570 /*
00571  * These variables are used similarly to the ones above, but for reading
00572  * the XLOG.  Note, however, that readOff generally represents the offset
00573  * of the page just read, not the seek position of the FD itself, which
00574  * will be just past that page. readLen indicates how much of the current
00575  * page has been read into readBuf, and readSource indicates where we got
00576  * the currently open file from.
00577  */
00578 static int  readFile = -1;
00579 static XLogSegNo readSegNo = 0;
00580 static uint32 readOff = 0;
00581 static uint32 readLen = 0;
00582 static XLogSource readSource = 0;       /* XLOG_FROM_* code */
00583 
00584 /*
00585  * Keeps track of which source we're currently reading from. This is
00586  * different from readSource in that this is always set, even when we don't
00587  * currently have a WAL file open. If lastSourceFailed is set, our last
00588  * attempt to read from currentSource failed, and we should try another source
00589  * next.
00590  */
00591 static XLogSource currentSource = 0;    /* XLOG_FROM_* code */
00592 static bool lastSourceFailed = false;
00593 
00594 typedef struct XLogPageReadPrivate  /* presumably private state for XLogPageRead() -- confirm */
00595 {
00596     int         emode;          /* presumably the elog level for read failures -- confirm */
00597     bool        fetching_ckpt;  /* are we fetching a checkpoint record? */
00598     bool        randAccess;     /* NOTE(review): cf. WaitForWALToBecomeAvailable()'s randAccess */
00599 } XLogPageReadPrivate;
00600 
00601 /*
00602  * These variables track when we last obtained some WAL data to process,
00603  * and where we got it from.  (XLogReceiptSource is initially the same as
00604  * readSource, but readSource gets reset to zero when we don't have data
00605  * to process right now.  It is also different from currentSource, which
00606  * also changes when we try to read from a source and fail, while
00607  * XLogReceiptSource tracks where we last successfully read some WAL.)
00608  */
00609 static TimestampTz XLogReceiptTime = 0;
00610 static XLogSource XLogReceiptSource = 0;    /* XLOG_FROM_* code */
00611 
00612 /* State information for XLOG reading */
00613 static XLogRecPtr ReadRecPtr;   /* start of last record read */
00614 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
00615 
00616 static XLogRecPtr minRecoveryPoint;     /* local copy of
00617                                          * ControlFile->minRecoveryPoint */
00618 static TimeLineID minRecoveryPointTLI;
00619 static bool updateMinRecoveryPoint = true;
00620 
00621 /*
00622  * Have we reached a consistent database state? In crash recovery, we have
00623  * to replay all the WAL, so reachedConsistency is never set. During archive
00624  * recovery, the database is consistent once minRecoveryPoint is reached.
00625  */
00626 bool        reachedConsistency = false;
00627 
00628 static bool InRedo = false;
00629 
00630 /* Have we launched bgwriter during recovery? */
00631 static bool bgwriterLaunched = false;
00632 
00633 
00634 static void readRecoveryCommandFile(void);
00635 static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo);
00636 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
00637 static void recoveryPausesHere(void);
00638 static void SetLatestXTime(TimestampTz xtime);
00639 static void SetCurrentChunkStartTime(TimestampTz xtime);
00640 static void CheckRequiredParameterValues(void);
00641 static void XLogReportParameters(void);
00642 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
00643                     TimeLineID prevTLI);
00644 static void LocalSetXLogInsertAllowed(void);
00645 static void CreateEndOfRecoveryRecord(void);
00646 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
00647 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
00648 
00649 static bool XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
00650                 XLogRecPtr *lsn, BkpBlock *bkpb);
00651 static Buffer RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb,
00652                 char *blk, bool get_cleanup_lock, bool keep_buffer);
00653 static bool AdvanceXLInsertBuffer(bool new_segment);
00654 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
00655 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
00656 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
00657                        bool find_free, int *max_advance,
00658                        bool use_lock);
00659 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
00660              int source, bool notexistOk);
00661 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
00662 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
00663              int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
00664              TimeLineID *readTLI);
00665 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
00666                             bool fetching_ckpt, XLogRecPtr tliRecPtr);
00667 static int  emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
00668 static void XLogFileClose(void);
00669 static void PreallocXlogFiles(XLogRecPtr endptr);
00670 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
00671 static void UpdateLastRemovedPtr(char *filename);
00672 static void ValidateXLOGDirectoryStructure(void);
00673 static void CleanupBackupHistory(void);
00674 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
00675 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
00676            int emode, bool fetching_ckpt);
00677 static void CheckRecoveryConsistency(void);
00678 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
00679                      XLogRecPtr RecPtr, int whichChkpti, bool report);
00680 static bool rescanLatestTimeLine(void);
00681 static void WriteControlFile(void);
00682 static void ReadControlFile(void);
00683 static char *str_time(pg_time_t tnow);
00684 static bool CheckForStandbyTrigger(void);
00685 
00686 #ifdef WAL_DEBUG
00687 static void xlog_outrec(StringInfo buf, XLogRecord *record);
00688 #endif
00689 static void pg_start_backup_callback(int code, Datum arg);
00690 static bool read_backup_label(XLogRecPtr *checkPointLoc,
00691                   bool *backupEndRequired, bool *backupFromStandby);
00692 static void rm_redo_error_callback(void *arg);
00693 static int  get_sync_bit(int method);
00694 
00695 
00696 /*
00697  * Insert an XLOG record having the specified RMID and info bytes,
00698  * with the body of the record being the data chunk(s) described by
00699  * the rdata chain (see xlog.h for notes about rdata).
00700  *
       * Parameters:
       *   rmid   resource manager ID; stored in the record header (xl_rmid).
       *   info   record info byte; must not have any XLR_INFO_MASK bits set
       *          (those bits are set here to flag backed-up blocks).
       *   rdata  head of the chain of data chunks making up the record body.
       *
00701  * Returns XLOG pointer to end of record (beginning of next record).
00702  * This can be used as LSN for data pages affected by the logged action.
00703  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
00704  * before the data page can be written out.  This implements the basic
00705  * WAL rule "write the log before the data".)
00706  *
       * Special cases visible below:
       *   - ERROR is raised if XLogInsertAllowed() says we may not insert.
       *   - In bootstrap mode, records of any rmgr other than RM_XLOG_ID are
       *     not logged at all; a phony pointer (SizeOfXLogLongPHD) is returned.
       *   - An XLOG_SWITCH requested exactly at the start of a segment is not
       *     inserted; the end address of the prior segment is returned instead.
       *
00707  * NB: this routine feels free to scribble on the XLogRecData structs,
00708  * though not on the data they reference.  This is OK since the XLogRecData
00709  * structs are always just temporaries in the calling code.
00710  */
00711 XLogRecPtr
00712 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
00713 {
00714     XLogCtlInsert *Insert = &XLogCtl->Insert;
00715     XLogRecPtr  RecPtr;
00716     XLogRecPtr  WriteRqst;
00717     uint32      freespace;
00718     int         curridx;
00719     XLogRecData *rdt;
00720     XLogRecData *rdt_lastnormal;
          /* Per-slot bookkeeping for each distinct buffer found in the chain */
00721     Buffer      dtbuf[XLR_MAX_BKP_BLOCKS];
00722     bool        dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
00723     BkpBlock    dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
00724     XLogRecPtr  dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
          /* Up to three extra chain entries per backed-up buffer (see below) */
00725     XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
00726     XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
00727     XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
00728     XLogRecData hdr_rdt;
00729     pg_crc32    rdata_crc;
00730     uint32      len,
00731                 write_len;
00732     unsigned    i;
00733     bool        updrqst;
00734     bool        doPageWrites;
00735     bool        isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
          /* caller's info bits, restored whenever we have to start over */
00736     uint8       info_orig = info;
          /*
           * Record header workspace.  Being static, it is allocated on the first
           * call and reused by every later call; it is zeroed only at allocation.
           */
00737     static XLogRecord *rechdr;
00738 
00739     if (rechdr == NULL)
00740     {
00741         rechdr = malloc(SizeOfXLogRecord);
00742         if (rechdr == NULL)
00743             elog(ERROR, "out of memory");
00744         MemSet(rechdr, 0, SizeOfXLogRecord);
00745     }
00746 
00747     /* cross-check on whether we should be here or not */
00748     if (!XLogInsertAllowed())
00749         elog(ERROR, "cannot make new WAL entries during recovery");
00750 
00751     /* info's high bits are reserved for use by me */
00752     if (info & XLR_INFO_MASK)
00753         elog(PANIC, "invalid xlog info mask %02X", info);
00754 
00755     TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
00756 
00757     /*
00758      * In bootstrap mode, we don't actually log anything but XLOG resources;
00759      * return a phony record pointer.
00760      */
00761     if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
00762     {
00763         RecPtr = SizeOfXLogLongPHD;     /* start of 1st chkpt record */
00764         return RecPtr;
00765     }
00766 
00767     /*
00768      * Here we scan the rdata chain, to determine which buffers must be backed
00769      * up.
00770      *
00771      * We may have to loop back to here if a race condition is detected below.
00772      * We could prevent the race by doing all this work while holding the
00773      * insert lock, but it seems better to avoid doing CRC calculations while
00774      * holding the lock.
00775      *
00776      * We add entries for backup blocks to the chain, so that they don't need
00777      * any special treatment in the critical section where the chunks are
00778      * copied into the WAL buffers. Those entries have to be unlinked from the
00779      * chain if we have to loop back here.
00780      */
00781 begin:;
00782     for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
00783     {
00784         dtbuf[i] = InvalidBuffer;
00785         dtbuf_bkp[i] = false;
00786     }
00787 
00788     /*
00789      * Decide if we need to do full-page writes in this XLOG record: true if
00790      * full_page_writes is on or we have a PITR request for it.  Since we
00791      * don't yet have the insert lock, fullPageWrites and forcePageWrites
00792      * could change under us, but we'll recheck them once we have the lock.
00793      */
00794     doPageWrites = Insert->fullPageWrites || Insert->forcePageWrites;
00795 
          /*
           * Walk the chain, summing the length of the data that will be logged as
           * ordinary record data.  The loop has no termination test of its own:
           * it exits via the break at the bottom so that rdt is left pointing at
           * the last chain item, which is needed when appending entries below.
           */
00796     len = 0;
00797     for (rdt = rdata;;)
00798     {
00799         if (rdt->buffer == InvalidBuffer)
00800         {
00801             /* Simple data, just include it */
00802             len += rdt->len;
00803         }
00804         else
00805         {
00806             /* Find info for buffer */
00807             for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
00808             {
00809                 if (rdt->buffer == dtbuf[i])
00810                 {
00811                     /* Buffer already referenced by earlier chain item */
00812                     if (dtbuf_bkp[i])
00813                     {
                              /* whole page is being backed up; drop this chunk */
00814                         rdt->data = NULL;
00815                         rdt->len = 0;
00816                     }
00817                     else if (rdt->data)
00818                         len += rdt->len;
00819                     break;
00820                 }
00821                 if (dtbuf[i] == InvalidBuffer)
00822                 {
00823                     /* OK, put it in this slot */
00824                     dtbuf[i] = rdt->buffer;
00825                     if (doPageWrites && XLogCheckBuffer(rdt, true,
00826                                         &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
00827                     {
                              /* page will be backed up in full; drop this chunk */
00828                         dtbuf_bkp[i] = true;
00829                         rdt->data = NULL;
00830                         rdt->len = 0;
00831                     }
00832                     else if (rdt->data)
00833                         len += rdt->len;
00834                     break;
00835                 }
00836             }
                  /* Slot search ran off the end: too many distinct buffers */
00837             if (i >= XLR_MAX_BKP_BLOCKS)
00838                 elog(PANIC, "can backup at most %d blocks per xlog record",
00839                      XLR_MAX_BKP_BLOCKS);
00840         }
00841         /* Break out of loop when rdt points to last chain item */
00842         if (rdt->next == NULL)
00843             break;
00844         rdt = rdt->next;
00845     }
00846 
00847     /*
00848      * NOTE: We disallow len == 0 because it provides a useful bit of extra
00849      * error checking in ReadRecord.  This means that all callers of
00850      * XLogInsert must supply at least some not-in-a-buffer data.  However, we
00851      * make an exception for XLOG SWITCH records because we don't want them to
00852      * ever cross a segment boundary.
00853      */
00854     if (len == 0 && !isLogSwitch)
00855         elog(PANIC, "invalid xlog record length %u", len);
00856 
00857     /*
00858      * Make additional rdata chain entries for the backup blocks, so that we
00859      * don't need to special-case them in the write loop.  This modifies the
00860      * original rdata chain, but we keep a pointer to the last regular entry,
00861      * rdt_lastnormal, so that we can undo this if we have to loop back to the
00862      * beginning.
00863      *
00864      * At the exit of this loop, write_len includes the backup block data.
00865      *
00866      * Also set the appropriate info bits to show which buffers were backed
00867      * up. The XLR_BKP_BLOCK(N) bit corresponds to the N'th distinct buffer
00868      * value (ignoring InvalidBuffer) appearing in the rdata chain.
00869      */
00870     rdt_lastnormal = rdt;
00871     write_len = len;
00872     for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
00873     {
00874         BkpBlock   *bkpb;
00875         char       *page;
00876 
00877         if (!dtbuf_bkp[i])
00878             continue;
00879 
00880         info |= XLR_BKP_BLOCK(i);
00881 
00882         bkpb = &(dtbuf_xlg[i]);
00883         page = (char *) BufferGetBlock(dtbuf[i]);
00884 
              /* first extra entry: the BkpBlock descriptor itself */
00885         rdt->next = &(dtbuf_rdt1[i]);
00886         rdt = rdt->next;
00887 
00888         rdt->data = (char *) bkpb;
00889         rdt->len = sizeof(BkpBlock);
00890         write_len += sizeof(BkpBlock);
00891 
              /* second extra entry: the page image (or its part before the hole) */
00892         rdt->next = &(dtbuf_rdt2[i]);
00893         rdt = rdt->next;
00894 
00895         if (bkpb->hole_length == 0)
00896         {
00897             rdt->data = page;
00898             rdt->len = BLCKSZ;
00899             write_len += BLCKSZ;
00900             rdt->next = NULL;
00901         }
00902         else
00903         {
00904             /* must skip the hole */
00905             rdt->data = page;
00906             rdt->len = bkpb->hole_offset;
00907             write_len += bkpb->hole_offset;
00908 
                  /* third extra entry: the part of the page after the hole */
00909             rdt->next = &(dtbuf_rdt3[i]);
00910             rdt = rdt->next;
00911 
00912             rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
00913             rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
00914             write_len += rdt->len;
00915             rdt->next = NULL;
00916         }
00917     }
00918 
00919     /*
00920      * Calculate CRC of the data, including all the backup blocks
00921      *
00922      * Note that the record header isn't added into the CRC initially since we
00923      * don't know the prev-link yet.  Thus, the CRC will represent the CRC of
00924      * the whole record in the order: rdata, then backup blocks, then record
00925      * header.
00926      */
00927     INIT_CRC32(rdata_crc);
00928     for (rdt = rdata; rdt != NULL; rdt = rdt->next)
00929         COMP_CRC32(rdata_crc, rdt->data, rdt->len);
00930 
00931     /*
00932      * Construct record header (prev-link and CRC are filled in later), and
00933      * make that the first chunk in the chain.
00934      */
00935     rechdr->xl_xid = GetCurrentTransactionIdIfAny();
00936     rechdr->xl_tot_len = SizeOfXLogRecord + write_len;
00937     rechdr->xl_len = len;       /* doesn't include backup blocks */
00938     rechdr->xl_info = info;
00939     rechdr->xl_rmid = rmid;
00940 
00941     hdr_rdt.next = rdata;
00942     hdr_rdt.data = (char *) rechdr;
00943     hdr_rdt.len = SizeOfXLogRecord;
00944 
          /* from here on write_len counts the header too: total bytes to copy */
00945     write_len += SizeOfXLogRecord;
00946 
00947     START_CRIT_SECTION();
00948 
00949     /* Now wait to get insert lock */
00950     LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
00951 
00952     /*
00953      * Check to see if my RedoRecPtr is out of date.  If so, may have to go
00954      * back and recompute everything.  This can only happen just after a
00955      * checkpoint, so it's better to be slow in this case and fast otherwise.
00956      *
00957      * If we aren't doing full-page writes then RedoRecPtr doesn't actually
00958      * affect the contents of the XLOG record, so we'll update our local copy
00959      * but not force a recomputation.
00960      */
00961     if (RedoRecPtr != Insert->RedoRecPtr)
00962     {
00963         Assert(RedoRecPtr < Insert->RedoRecPtr);
00964         RedoRecPtr = Insert->RedoRecPtr;
00965 
00966         if (doPageWrites)
00967         {
00968             for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
00969             {
00970                 if (dtbuf[i] == InvalidBuffer)
00971                     continue;
00972                 if (dtbuf_bkp[i] == false &&
00973                     dtbuf_lsn[i] <= RedoRecPtr)
00974                 {
00975                     /*
00976                      * Oops, this buffer now needs to be backed up, but we
00977                      * didn't think so above.  Start over.
                           *
                           * Before retrying, unlink the backup-block entries we
                           * appended to the caller's chain and restore the
                           * caller's original info bits.
00978                      */
00979                     LWLockRelease(WALInsertLock);
00980                     END_CRIT_SECTION();
00981                     rdt_lastnormal->next = NULL;
00982                     info = info_orig;
00983                     goto begin;
00984                 }
00985             }
00986         }
00987     }
00988 
00989     /*
00990      * Also check to see if fullPageWrites or forcePageWrites was just turned
00991      * on; if we weren't already doing full-page writes then go back and
00992      * recompute. (If it was just turned off, we could recompute the record
00993      * without full pages, but we choose not to bother.)
00994      */
00995     if ((Insert->fullPageWrites || Insert->forcePageWrites) && !doPageWrites)
00996     {
00997         /* Oops, must redo it with full-page data. */
00998         LWLockRelease(WALInsertLock);
00999         END_CRIT_SECTION();
              /* same cleanup as the retry path above: unlink and reset info */
01000         rdt_lastnormal->next = NULL;
01001         info = info_orig;
01002         goto begin;
01003     }
01004 
01005     /*
01006      * If the current page is completely full, the record goes to the next
01007      * page, right after the page header.
01008      */
01009     updrqst = false;
01010     freespace = INSERT_FREESPACE(Insert);
01011     if (freespace == 0)
01012     {
01013         updrqst = AdvanceXLInsertBuffer(false);
01014         freespace = INSERT_FREESPACE(Insert);
01015     }
01016 
01017     /* Compute record's XLOG location */
01018     curridx = Insert->curridx;
01019     INSERT_RECPTR(RecPtr, Insert, curridx);
01020 
01021     /*
01022      * If the record is an XLOG_SWITCH, and we are exactly at the start of a
01023      * segment, we need not insert it (and don't want to because we'd like
01024      * consecutive switch requests to be no-ops).  Instead, make sure
01025      * everything is written and flushed through the end of the prior segment,
01026      * and return the prior segment's end address.
01027      */
01028     if (isLogSwitch && (RecPtr % XLogSegSize) == SizeOfXLogLongPHD)
01029     {
01030         /* We can release insert lock immediately */
01031         LWLockRelease(WALInsertLock);
01032 
              /* back up over the long page header to the segment boundary */
01033         RecPtr -= SizeOfXLogLongPHD;
01034 
01035         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
01036         LogwrtResult = XLogCtl->LogwrtResult;
01037         if (LogwrtResult.Flush < RecPtr)
01038         {
01039             XLogwrtRqst FlushRqst;
01040 
01041             FlushRqst.Write = RecPtr;
01042             FlushRqst.Flush = RecPtr;
01043             XLogWrite(FlushRqst, false, false);
01044         }
01045         LWLockRelease(WALWriteLock);
01046 
01047         END_CRIT_SECTION();
01048 
01049         /* wake up walsenders now that we've released heavily contended locks */
01050         WalSndWakeupProcessRequests();
01051         return RecPtr;
01052     }
01053 
01054     /* Finish the record header */
01055     rechdr->xl_prev = Insert->PrevRecord;
01056 
01057     /* Now we can finish computing the record's CRC */
01058     COMP_CRC32(rdata_crc, (char *) rechdr, offsetof(XLogRecord, xl_crc));
01059     FIN_CRC32(rdata_crc);
01060     rechdr->xl_crc = rdata_crc;
01061 
01062 #ifdef WAL_DEBUG
01063     if (XLOG_DEBUG)
01064     {
01065         StringInfoData buf;
01066 
01067         initStringInfo(&buf);
01068         appendStringInfo(&buf, "INSERT @ %X/%X: ",
01069                          (uint32) (RecPtr >> 32), (uint32) RecPtr);
01070         xlog_outrec(&buf, rechdr);
01071         if (rdata->data != NULL)
01072         {
01073             appendStringInfo(&buf, " - ");
01074             RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
01075         }
01076         elog(LOG, "%s", buf.data);
01077         pfree(buf.data);
01078     }
01079 #endif
01080 
01081     /* Record begin of record in appropriate places */
01082     ProcLastRecPtr = RecPtr;
01083     Insert->PrevRecord = RecPtr;
01084 
01085     /*
01086      * Append the data, including backup blocks if any
           *
           * write_len is the number of bytes still to be copied; the loop ends
           * when it reaches zero.  Note that rdata is repointed at the header
           * entry, and chain entries are modified (data/len advanced) whenever a
           * chunk has to be split across a page boundary.
01087      */
01088     rdata = &hdr_rdt;
01089     while (write_len)
01090     {
              /* skip entries with no data (e.g. ones replaced by a page backup) */
01091         while (rdata->data == NULL)
01092             rdata = rdata->next;
01093 
01094         if (freespace > 0)
01095         {
01096             if (rdata->len > freespace)
01097             {
                      /* chunk doesn't fit: copy what fits, then go to next page */
01098                 memcpy(Insert->currpos, rdata->data, freespace);
01099                 rdata->data += freespace;
01100                 rdata->len -= freespace;
01101                 write_len -= freespace;
01102             }
01103             else
01104             {
                      /* whole chunk fits on this page; move on to next chunk */
01105                 memcpy(Insert->currpos, rdata->data, rdata->len);
01106                 freespace -= rdata->len;
01107                 write_len -= rdata->len;
01108                 Insert->currpos += rdata->len;
01109                 rdata = rdata->next;
01110                 continue;
01111             }
01112         }
01113 
01114         /* Use next buffer */
01115         updrqst = AdvanceXLInsertBuffer(false);
01116         curridx = Insert->curridx;
01117         /* Mark page header to indicate this record continues on the page */
01118         Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
01119         Insert->currpage->xlp_rem_len = write_len;
01120         freespace = INSERT_FREESPACE(Insert);
01121     }
01122 
01123     /* Ensure next record will be properly aligned */
01124     Insert->currpos = (char *) Insert->currpage +
01125         MAXALIGN(Insert->currpos - (char *) Insert->currpage);
01126     freespace = INSERT_FREESPACE(Insert);
01127 
01128     /*
01129      * The recptr I return is the beginning of the *next* record. This will be
01130      * stored as LSN for changed data pages...
01131      */
01132     INSERT_RECPTR(RecPtr, Insert, curridx);
01133 
01134     /*
01135      * If the record is an XLOG_SWITCH, we must now write and flush all the
01136      * existing data, and then forcibly advance to the start of the next
01137      * segment.  It's not good to do this I/O while holding the insert lock,
01138      * but there seems too much risk of confusion if we try to release the
01139      * lock sooner.  Fortunately xlog switch needn't be a high-performance
01140      * operation anyway...
01141      */
01142     if (isLogSwitch)
01143     {
01144         XLogwrtRqst FlushRqst;
01145         XLogRecPtr  OldSegEnd;
01146 
01147         TRACE_POSTGRESQL_XLOG_SWITCH();
01148 
01149         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
01150 
01151         /*
01152          * Flush through the end of the page containing XLOG_SWITCH, and
01153          * perform end-of-segment actions (eg, notifying archiver).
01154          */
01155         WriteRqst = XLogCtl->xlblocks[curridx];
01156         FlushRqst.Write = WriteRqst;
01157         FlushRqst.Flush = WriteRqst;
01158         XLogWrite(FlushRqst, false, true);
01159 
01160         /* Set up the next buffer as first page of next segment */
01161         /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
01162         (void) AdvanceXLInsertBuffer(true);
01163 
01164         /* There should be no unwritten data */
01165         curridx = Insert->curridx;
01166         Assert(curridx == XLogCtl->Write.curridx);
01167 
01168         /* Compute end address of old segment */
01169         OldSegEnd = XLogCtl->xlblocks[curridx];
01170         OldSegEnd -= XLOG_BLCKSZ;
01171 
01172         /* Make it look like we've written and synced all of old segment */
01173         LogwrtResult.Write = OldSegEnd;
01174         LogwrtResult.Flush = OldSegEnd;
01175 
01176         /*
01177          * Update shared-memory status --- this code should match XLogWrite
01178          */
01179         {
01180             /* use volatile pointer to prevent code rearrangement */
01181             volatile XLogCtlData *xlogctl = XLogCtl;
01182 
01183             SpinLockAcquire(&xlogctl->info_lck);
01184             xlogctl->LogwrtResult = LogwrtResult;
01185             if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
01186                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
01187             if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
01188                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
01189             SpinLockRelease(&xlogctl->info_lck);
01190         }
01191 
01192         LWLockRelease(WALWriteLock);
01193 
01194         updrqst = false;        /* done already */
01195     }
01196     else
01197     {
01198         /* normal case, ie not xlog switch */
01199 
01200         /* Need to update shared LogwrtRqst if some block was filled up */
01201         if (freespace == 0)
01202         {
01203             /* curridx is filled and available for writing out */
01204             updrqst = true;
01205         }
01206         else
01207         {
01208             /* if updrqst already set, write through end of previous buf */
01209             curridx = PrevBufIdx(curridx);
01210         }
01211         WriteRqst = XLogCtl->xlblocks[curridx];
01212     }
01213 
01214     LWLockRelease(WALInsertLock);
01215 
          /* The shared request is updated only after dropping the insert lock */
01216     if (updrqst)
01217     {
01218         /* use volatile pointer to prevent code rearrangement */
01219         volatile XLogCtlData *xlogctl = XLogCtl;
01220 
01221         SpinLockAcquire(&xlogctl->info_lck);
01222         /* advance global request to include new block(s) */
01223         if (xlogctl->LogwrtRqst.Write < WriteRqst)
01224             xlogctl->LogwrtRqst.Write = WriteRqst;
01225         /* update local result copy while I have the chance */
01226         LogwrtResult = xlogctl->LogwrtResult;
01227         SpinLockRelease(&xlogctl->info_lck);
01228     }
01229 
01230     XactLastRecEnd = RecPtr;
01231 
01232     END_CRIT_SECTION();
01233 
01234     /* wake up walsenders now that we've released heavily contended locks */
01235     WalSndWakeupProcessRequests();
01236 
01237     return RecPtr;
01238 }
01239 
01240 /*
01241  * Determine whether the buffer referenced by an XLogRecData item has to
01242  * be backed up, and if so fill a BkpBlock struct for it.  In any case
01243  * save the buffer's LSN at *lsn.
01244  */
01245 static bool
01246 XLogCheckBuffer(XLogRecData *rdata, bool holdsExclusiveLock,
01247                 XLogRecPtr *lsn, BkpBlock *bkpb)
01248 {
01249     Page        page;
01250 
01251     page = BufferGetPage(rdata->buffer);
01252 
01253     /*
01254      * We assume page LSN is first data on *every* page that can be passed
01255      * to XLogInsert, whether it has the standard page layout or not. We
01256      * don't need to take the buffer header lock for PageGetLSN if we hold
01257      * an exclusive lock on the page and/or the relation.
01258      */
01259     if (holdsExclusiveLock)
01260         *lsn = PageGetLSN(page);
01261     else
01262         *lsn = BufferGetLSNAtomic(rdata->buffer);
01263 
01264     if (*lsn <= RedoRecPtr)
01265     {
01266         /*
01267          * The page needs to be backed up, so set up *bkpb
01268          */
01269         BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
01270 
01271         if (rdata->buffer_std)
01272         {
01273             /* Assume we can omit data between pd_lower and pd_upper */
01274             uint16      lower = ((PageHeader) page)->pd_lower;
01275             uint16      upper = ((PageHeader) page)->pd_upper;
01276 
01277             if (lower >= SizeOfPageHeaderData &&
01278                 upper > lower &&
01279                 upper <= BLCKSZ)
01280             {
01281                 bkpb->hole_offset = lower;
01282                 bkpb->hole_length = upper - lower;
01283             }
01284             else
01285             {
01286                 /* No "hole" to compress out */
01287                 bkpb->hole_offset = 0;
01288                 bkpb->hole_length = 0;
01289             }
01290         }
01291         else
01292         {
01293             /* Not a standard page header, don't try to eliminate "hole" */
01294             bkpb->hole_offset = 0;
01295             bkpb->hole_length = 0;
01296         }
01297 
01298         return true;            /* buffer requires backup */
01299     }
01300 
01301     return false;               /* buffer does not need to be backed up */
01302 }
01303 
01304 /*
01305  * Advance the Insert state to the next buffer page, writing out the next
01306  * buffer if it still contains unwritten data.
01307  *
01308  * If new_segment is TRUE then we set up the next buffer page as the first
01309  * page of the next xlog segment file, possibly but not usually the next
01310  * consecutive file page.
01311  *
01312  * The global LogwrtRqst.Write pointer needs to be advanced to include the
01313  * just-filled page.  If we can do this for free (without an extra lock),
01314  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
01315  * request update still needs to be done, FALSE if we did it internally.
01316  *
       * On return, Insert->curridx, Insert->currpage and Insert->currpos refer
       * to the new page, which has been zeroed and given a fresh page header;
       * currpos points just past that header.
       *
01317  * Must be called with WALInsertLock held.
01318  */
01319 static bool
01320 AdvanceXLInsertBuffer(bool new_segment)
01321 {
01322     XLogCtlInsert *Insert = &XLogCtl->Insert;
01323     int         nextidx = NextBufIdx(Insert->curridx);
01324     bool        update_needed = true;
01325     XLogRecPtr  OldPageRqstPtr;
01326     XLogwrtRqst WriteRqst;
01327     XLogRecPtr  NewPageEndPtr;
01328     XLogRecPtr  NewPageBeginPtr;
01329     XLogPageHeader NewPage;
01330 
01331     /*
01332      * Get ending-offset of the buffer page we need to replace (this may be
01333      * zero if the buffer hasn't been used yet).  Fall through if it's already
01334      * written out.
01335      */
01336     OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
01337     if (LogwrtResult.Write < OldPageRqstPtr)
01338     {
01339         /* nope, got work to do... */
01340         XLogRecPtr  FinishedPageRqstPtr;
01341 
              /* end address of the page we have just finished filling */
01342         FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
01343 
01344         /* Before waiting, get info_lck and update LogwrtResult */
01345         {
01346             /* use volatile pointer to prevent code rearrangement */
01347             volatile XLogCtlData *xlogctl = XLogCtl;
01348 
01349             SpinLockAcquire(&xlogctl->info_lck);
01350             if (xlogctl->LogwrtRqst.Write < FinishedPageRqstPtr)
01351                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
01352             LogwrtResult = xlogctl->LogwrtResult;
01353             SpinLockRelease(&xlogctl->info_lck);
01354         }
01355 
01356         update_needed = false;  /* Did the shared-request update */
01357 
01358         /*
01359          * Now that we have an up-to-date LogwrtResult value, see if we still
01360          * need to write it or if someone else already did.
01361          */
01362         if (LogwrtResult.Write < OldPageRqstPtr)
01363         {
01364             /* Must acquire write lock */
01365             LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
                  /* recheck under the lock; another process may have written it */
01366             LogwrtResult = XLogCtl->LogwrtResult;
01367             if (LogwrtResult.Write >= OldPageRqstPtr)
01368             {
01369                 /* OK, someone wrote it already */
01370                 LWLockRelease(WALWriteLock);
01371             }
01372             else
01373             {
01374                 /*
01375                  * Have to write buffers while holding insert lock. This is
01376                  * not good, so only write as much as we absolutely must.
                       *
                       * NOTE(review): Flush = 0 presumably means "write only, no
                       * fsync requested" -- confirm against XLogWrite.
01377                  */
01378                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
01379                 WriteRqst.Write = OldPageRqstPtr;
01380                 WriteRqst.Flush = 0;
01381                 XLogWrite(WriteRqst, false, false);
01382                 LWLockRelease(WALWriteLock);
01383                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
01384             }
01385         }
01386     }
01387 
01388     /*
01389      * Now the next buffer slot is free and we can set it up to be the next
01390      * output page.
           *
           * xlblocks[] holds the END address of each buffer page (see the store
           * into xlblocks[nextidx] below), so the end of the current page is
           * where the new page begins.
01391      */
01392     NewPageBeginPtr = XLogCtl->xlblocks[Insert->curridx];
01393 
01394     if (new_segment)
01395     {
01396         /* force it to a segment start point */
01397         if (NewPageBeginPtr % XLogSegSize != 0)
01398             NewPageBeginPtr += XLogSegSize - NewPageBeginPtr % XLogSegSize;
01399     }
01400 
01401     NewPageEndPtr = NewPageBeginPtr;
01402     NewPageEndPtr += XLOG_BLCKSZ;
01403     XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
01404     NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
01405 
01406     Insert->curridx = nextidx;
01407     Insert->currpage = NewPage;
01408 
          /* assume a short page header; adjusted below if a long one is needed */
01409     Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
01410 
01411     /*
01412      * Be sure to re-zero the buffer so that bytes beyond what we've written
01413      * will look like zeroes and not valid XLOG records...
01414      */
01415     MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
01416 
01417     /*
01418      * Fill the new page's header
01419      */
01420     NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
01421 
01422     /* NewPage->xlp_info = 0; */    /* done by memset */
01423     NewPage   ->xlp_tli = ThisTimeLineID;
01424     NewPage   ->xlp_pageaddr = NewPageBeginPtr;
01425 
01426     /*
01427      * If online backup is not in progress, mark the header to indicate that
01428      * WAL records beginning in this page have removable backup blocks.  This
01429      * allows the WAL archiver to know whether it is safe to compress archived
01430      * WAL data by transforming full-block records into the non-full-block
01431      * format.  It is sufficient to record this at the page level because we
01432      * force a page switch (in fact a segment switch) when starting a backup,
01433      * so the flag will be off before any records can be written during the
01434      * backup.  At the end of a backup, the last page will be marked as all
01435      * unsafe when perhaps only part is unsafe, but at worst the archiver
01436      * would miss the opportunity to compress a few records.
01437      */
01438     if (!Insert->forcePageWrites)
01439         NewPage   ->xlp_info |= XLP_BKP_REMOVABLE;
01440 
01441     /*
01442      * If first page of an XLOG segment file, make it a long header.
01443      */
01444     if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
01445     {
01446         XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
01447 
01448         NewLongPage->xlp_sysid = ControlFile->system_identifier;
01449         NewLongPage->xlp_seg_size = XLogSegSize;
01450         NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
01451         NewPage   ->xlp_info |= XLP_LONG_HEADER;
01452 
              /* insertion starts after the long header instead of the short one */
01453         Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
01454     }
01455 
01456     return update_needed;
01457 }
01458 
01459 /*
01460  * Check whether we've consumed enough xlog space that a checkpoint is needed.
01461  *
01462  * new_segno indicates a log file that has just been filled up (or read
01463  * during recovery). We measure the distance from RedoRecPtr to new_segno
01464  * and see if that exceeds CheckPointSegments.
01465  *
01466  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
01467  */
01468 static bool
01469 XLogCheckpointNeeded(XLogSegNo new_segno)
01470 {
01471     XLogSegNo   old_segno;
01472 
01473     XLByteToSeg(RedoRecPtr, old_segno);
01474 
01475     if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
01476         return true;
01477     return false;
01478 }
01479 
01480 /*
01481  * Write and/or fsync the log at least as far as WriteRqst indicates.
01482  *
01483  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
01484  * may stop at any convenient boundary (such as a cache or logfile boundary).
01485  * This option allows us to avoid uselessly issuing multiple writes when a
01486  * single one would do.
01487  *
01488  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
01489  * perform end-of-segment actions after writing the last page, even if
01490  * it's not physically the end of its segment.  (NB: this will work properly
01491  * only if caller specifies WriteRqst == page-end and flexible == false,
01492  * and there is some data to write.)
01493  *
01494  * Must be called with WALWriteLock held.
01495  */
01496 static void
01497 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
01498 {
          /*
           * How the arguments are used below:
           *  WriteRqst.Write: the write loop runs while LogwrtResult.Write is
           *      short of this position.
           *  WriteRqst.Flush: after the loop, an fsync is issued if the flushed
           *      position is short of this and of what has been written.
           *  flexible: if true, leave the loop after the first physical write.
           *  xlog_switch: if true, the last page written also receives the
           *      end-of-segment actions (fsync, archiver notification, etc).
           */
01499     XLogCtlWrite *Write = &XLogCtl->Write;
01500     bool        ispartialpage;
01501     bool        last_iteration;
01502     bool        finishing_seg;
01503     bool        use_existent;
01504     int         curridx;
01505     int         npages;
01506     int         startidx;
01507     uint32      startoffset;
01508 
01509     /* We should always be inside a critical section here */
01510     Assert(CritSectionCount > 0);
01511 
01512     /*
01513      * Update local LogwrtResult (caller probably did this already, but...)
01514      */
01515     LogwrtResult = XLogCtl->LogwrtResult;
01516 
01517     /*
01518      * Since successive pages in the xlog cache are consecutively allocated,
01519      * we can usually gather multiple pages together and issue just one
01520      * write() call.  npages is the number of pages we have determined can be
01521      * written together; startidx is the cache block index of the first one,
01522      * and startoffset is the file offset at which it should go. The latter
01523      * two variables are only valid when npages > 0, but we must initialize
01524      * all of them to keep the compiler quiet.
01525      */
01526     npages = 0;
01527     startidx = 0;
01528     startoffset = 0;
01529 
01530     /*
01531      * Within the loop, curridx is the cache block index of the page to
01532      * consider writing.  We advance Write->curridx only after successfully
01533      * writing pages.  (Right now, this refinement is useless since we are
01534      * going to PANIC if any error occurs anyway; but someday it may come in
01535      * useful.)
01536      */
01537     curridx = Write->curridx;
01538 
01539     while (LogwrtResult.Write < WriteRqst.Write)
01540     {
01541         /*
01542          * Make sure we're not ahead of the insert process.  This could happen
01543          * if we're passed a bogus WriteRqst.Write that is past the end of the
01544          * last page that's been initialized by AdvanceXLInsertBuffer.
01545          */
01546         if (LogwrtResult.Write >= XLogCtl->xlblocks[curridx])
01547             elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
01548                  (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
01549                  (uint32) (XLogCtl->xlblocks[curridx] >> 32),
01550                  (uint32) XLogCtl->xlblocks[curridx]);
01551 
01552         /* Advance LogwrtResult.Write to end of current buffer page */
01553         LogwrtResult.Write = XLogCtl->xlblocks[curridx];
              /* the request covers only part of this page if it stops short of its end */
01554         ispartialpage = WriteRqst.Write < LogwrtResult.Write;
01555 
01556         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
01557         {
01558             /*
01559              * Switch to new logfile segment.  We cannot have any pending
01560              * pages here (since we dump what we have at segment end).
01561              */
01562             Assert(npages == 0);
01563             if (openLogFile >= 0)
01564                 XLogFileClose();
01565             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
01566 
01567             /* create/use new log file */
01568             use_existent = true;
01569             openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
01570             openLogOff = 0;
01571         }
01572 
01573         /* Make sure we have the current logfile open */
01574         if (openLogFile < 0)
01575         {
01576             XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
01577             openLogFile = XLogFileOpen(openLogSegNo);
01578             openLogOff = 0;
01579         }
01580 
01581         /* Add current page to the set of pending pages-to-dump */
01582         if (npages == 0)
01583         {
01584             /* first of group */
01585             startidx = curridx;
                  /* LogwrtResult.Write is the page's end, so step back one page */
01586             startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
01587         }
01588         npages++;
01589 
01590         /*
01591          * Dump the set if this will be the last loop iteration, or if we are
01592          * at the last page of the cache area (since the next page won't be
01593          * contiguous in memory), or if we are at the end of the logfile
01594          * segment.
01595          */
01596         last_iteration = WriteRqst.Write <= LogwrtResult.Write;
01597 
01598         finishing_seg = !ispartialpage &&
01599             (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
01600 
01601         if (last_iteration ||
01602             curridx == XLogCtl->XLogCacheBlck ||
01603             finishing_seg)
01604         {
01605             char       *from;
01606             Size        nbytes;
01607 
01608             /* Need to seek in the file? */
01609             if (openLogOff != startoffset)
01610             {
01611                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
01612                     ereport(PANIC,
01613                             (errcode_for_file_access(),
01614                              errmsg("could not seek in log file %s to offset %u: %m",
01615                                     XLogFileNameP(ThisTimeLineID, openLogSegNo),
01616                                     startoffset)));
01617                 openLogOff = startoffset;
01618             }
01619 
01620             /* OK to write the page(s) */
01621             from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
01622             nbytes = npages * (Size) XLOG_BLCKSZ;
                  /*
                   * NOTE(review): a short write is not retried; any result other
                   * than nbytes goes straight to the PANIC below.
                   */
01623             errno = 0;
01624             if (write(openLogFile, from, nbytes) != nbytes)
01625             {
01626                 /* if write didn't set errno, assume no disk space */
01627                 if (errno == 0)
01628                     errno = ENOSPC;
01629                 ereport(PANIC,
01630                         (errcode_for_file_access(),
01631                          errmsg("could not write to log file %s "
01632                                 "at offset %u, length %lu: %m",
01633                                 XLogFileNameP(ThisTimeLineID, openLogSegNo),
01634                                 openLogOff, (unsigned long) nbytes)));
01635             }
01636 
01637             /* Update state for write */
01638             openLogOff += nbytes;
                  /* stay on a partially written page; step past a complete one */
01639             Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
01640             npages = 0;
01641 
01642             /*
01643              * If we just wrote the whole last page of a logfile segment,
01644              * fsync the segment immediately.  This avoids having to go back
01645              * and re-open prior segments when an fsync request comes along
01646              * later. Doing it here ensures that one and only one backend will
01647              * perform this fsync.
01648              *
01649              * We also do this if this is the last page written for an xlog
01650              * switch.
01651              *
01652              * This is also the right place to notify the Archiver that the
01653              * segment is ready to copy to archival storage, and to update the
01654              * timer for archive_timeout, and to signal for a checkpoint if
01655              * too many logfile segments have been used since the last
01656              * checkpoint.
01657              */
01658             if (finishing_seg || (xlog_switch && last_iteration))
01659             {
01660                 issue_xlog_fsync(openLogFile, openLogSegNo);
01661 
01662                 /* signal that we need to wakeup walsenders later */
01663                 WalSndWakeupRequest();
01664 
01665                 LogwrtResult.Flush = LogwrtResult.Write;        /* end of page */
01666 
01667                 if (XLogArchivingActive())
01668                     XLogArchiveNotifySeg(openLogSegNo);
01669 
01670                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
01671 
01672                 /*
01673                  * Request a checkpoint if we've consumed too much xlog since
01674                  * the last one.  For speed, we first check using the local
01675                  * copy of RedoRecPtr, which might be out of date; if it looks
01676                  * like a checkpoint is needed, forcibly update RedoRecPtr and
01677                  * recheck.
01678                  */
01679                 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
01680                 {
01681                     (void) GetRedoRecPtr();
01682                     if (XLogCheckpointNeeded(openLogSegNo))
01683                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
01684                 }
01685             }
01686         }
01687 
01688         if (ispartialpage)
01689         {
01690             /* Only asked to write a partial page */
01691             LogwrtResult.Write = WriteRqst.Write;
01692             break;
01693         }
01694         curridx = NextBufIdx(curridx);
01695 
01696         /* If flexible, break out of loop as soon as we wrote something */
              /* (npages is reset to 0 only by the dump above, so 0 means we wrote) */
01697         if (flexible && npages == 0)
01698             break;
01699     }
01700 
          /* every exit from the loop happens right after a dump, or before any page was queued */
01701     Assert(npages == 0);
01702     Assert(curridx == Write->curridx);
01703 
01704     /*
01705      * If asked to flush, do so
           *
           * The fsync is skipped when the flushed position already reaches the
           * request, or when everything written so far is already flushed.
01706      */
01707     if (LogwrtResult.Flush < WriteRqst.Flush &&
01708         LogwrtResult.Flush < LogwrtResult.Write)
01709 
01710     {
01711         /*
01712          * Could get here without iterating above loop, in which case we might
01713          * have no open file or the wrong one.  However, we do not need to
01714          * fsync more than one file.
               *
               * NOTE(review): no explicit fsync is issued for the two "open" sync
               * methods, presumably because such files are opened with a sync
               * flag (see get_sync_bit) -- confirm.
01715          */
01716         if (sync_method != SYNC_METHOD_OPEN &&
01717             sync_method != SYNC_METHOD_OPEN_DSYNC)
01718         {
01719             if (openLogFile >= 0 &&
01720                 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
01721                 XLogFileClose();
01722             if (openLogFile < 0)
01723             {
01724                 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
01725                 openLogFile = XLogFileOpen(openLogSegNo);
01726                 openLogOff = 0;
01727             }
01728 
01729             issue_xlog_fsync(openLogFile, openLogSegNo);
01730         }
01731 
01732         /* signal that we need to wakeup walsenders later */
01733         WalSndWakeupRequest();
01734 
01735         LogwrtResult.Flush = LogwrtResult.Write;
01736     }
01737 
01738     /*
01739      * Update shared-memory status
01740      *
01741      * We make sure that the shared 'request' values do not fall behind the
01742      * 'result' values.  This is not absolutely essential, but it saves some
01743      * code in a couple of places.
01744      */
01745     {
01746         /* use volatile pointer to prevent code rearrangement */
01747         volatile XLogCtlData *xlogctl = XLogCtl;
01748 
01749         SpinLockAcquire(&xlogctl->info_lck);
01750         xlogctl->LogwrtResult = LogwrtResult;
01751         if (xlogctl->LogwrtRqst.Write < LogwrtResult.Write)
01752             xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
01753         if (xlogctl->LogwrtRqst.Flush < LogwrtResult.Flush)
01754             xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
01755         SpinLockRelease(&xlogctl->info_lck);
01756     }
01757 }
01758 
01759 /*
01760  * Record the LSN for an asynchronous transaction commit/abort
01761  * and nudge the WALWriter if there is work for it to do.
01762  * (This should not be called for synchronous commits.)
       *
       * asyncXactLSN: LSN to record.  The shared asyncXactLSN is only ever
       * moved forward; a value at or behind the stored one leaves it unchanged.
       *
       * As a side effect, the local copy of LogwrtResult is refreshed.
01763  */
01764 void
01765 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
01766 {
01767     XLogRecPtr  WriteRqstPtr = asyncXactLSN;
01768     bool        sleeping;
01769 
01770     /* use volatile pointer to prevent code rearrangement */
01771     volatile XLogCtlData *xlogctl = XLogCtl;
01772 
          /* read shared state and advance the shared LSN under one spinlock hold */
01773     SpinLockAcquire(&xlogctl->info_lck);
01774     LogwrtResult = xlogctl->LogwrtResult;
01775     sleeping = xlogctl->WalWriterSleeping;
01776     if (xlogctl->asyncXactLSN < asyncXactLSN)
01777         xlogctl->asyncXactLSN = asyncXactLSN;
01778     SpinLockRelease(&xlogctl->info_lck);
01779 
01780     /*
01781      * If the WALWriter is sleeping, we should kick it to make it come out of
01782      * low-power mode.  Otherwise, determine whether there's a full page of
01783      * WAL available to write.
01784      */
01785     if (!sleeping)
01786     {
01787         /* back off to last completed page boundary */
01788         WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
01789 
01790         /* if we have already flushed that far, we're done */
01791         if (WriteRqstPtr <= LogwrtResult.Flush)
01792             return;
01793     }
01794 
01795     /*
01796      * Nudge the WALWriter: it has a full page of WAL to write, or we want it
01797      * to come out of low-power mode so that this async commit will reach disk
01798      * within the expected amount of time.
           *
           * Nothing is done if no walwriter latch is registered.
01799      */
01800     if (ProcGlobal->walwriterLatch)
01801         SetLatch(ProcGlobal->walwriterLatch);
01802 }
01803 
01804 /*
01805  * Advance minRecoveryPoint in control file.
01806  *
01807  * If we crash during recovery, we must reach this point again before the
01808  * database is consistent.
01809  *
01810  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
01811  * is only updated if it's not already greater than or equal to 'lsn'.
       *
       * Returns without taking any lock when the local updateMinRecoveryPoint
       * flag is false, or when not forced and the local minRecoveryPoint already
       * covers 'lsn'.  Otherwise ControlFileLock is held exclusively for the
       * duration, and the local minRecoveryPoint/minRecoveryPointTLI copies are
       * refreshed from the control file.
01812  */
01813 static void
01814 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
01815 {
01816     /* Quick check using our local copy of the variable */
01817     if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
01818         return;
01819 
01820     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
01821 
01822     /* update local copy */
01823     minRecoveryPoint = ControlFile->minRecoveryPoint;
01824     minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
01825 
01826     /*
01827      * An invalid minRecoveryPoint means that we need to recover all the WAL,
01828      * i.e., we're doing crash recovery.  We never modify the control file's
01829      * value in that case, so we can short-circuit future checks here too.
01830      */
01831     if (minRecoveryPoint == 0)
01832         updateMinRecoveryPoint = false;
01833     else if (force || minRecoveryPoint < lsn)
01834     {
01835         /* use volatile pointer to prevent code rearrangement */
01836         volatile XLogCtlData *xlogctl = XLogCtl;
01837         XLogRecPtr  newMinRecoveryPoint;
01838         TimeLineID  newMinRecoveryPointTLI;
01839 
01840         /*
01841          * To avoid having to update the control file too often, we update it
01842          * all the way to the last record being replayed, even though 'lsn'
01843          * would suffice for correctness.  This also allows the 'force' case
01844          * to not need a valid 'lsn' value.
01845          *
01846          * Another important reason for doing it this way is that the passed
01847          * 'lsn' value could be bogus, i.e., past the end of available WAL, if
01848          * the caller got it from a corrupted heap page.  Accepting such a
01849          * value as the min recovery point would prevent us from coming up at
01850          * all.  Instead, we just log a warning and continue with recovery.
01851          * (See also the comments about corrupt LSNs in XLogFlush.)
01852          */
01853         SpinLockAcquire(&xlogctl->info_lck);
01854         newMinRecoveryPoint = xlogctl->replayEndRecPtr;
01855         newMinRecoveryPointTLI = xlogctl->replayEndTLI;
01856         SpinLockRelease(&xlogctl->info_lck);
01857 
01858         if (!force && newMinRecoveryPoint < lsn)
01859             elog(WARNING,
01860                "xlog min recovery request %X/%X is past current point %X/%X",
01861                  (uint32) (lsn >> 32) , (uint32) lsn,
01862                  (uint32) (newMinRecoveryPoint >> 32),
01863                  (uint32) newMinRecoveryPoint);
01864 
01865         /* update control file */
              /* (only if that moves the stored value forward; it is never moved back) */
01866         if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
01867         {
01868             ControlFile->minRecoveryPoint = newMinRecoveryPoint;
01869             ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
01870             UpdateControlFile();
                  /* keep the local copies in step with what was just stored */
01871             minRecoveryPoint = newMinRecoveryPoint;
01872             minRecoveryPointTLI = newMinRecoveryPointTLI;
01873 
01874             ereport(DEBUG2,
01875                     (errmsg("updated min recovery point to %X/%X on timeline %u",
01876                             (uint32) (minRecoveryPoint >> 32),
01877                             (uint32) minRecoveryPoint,
01878                             newMinRecoveryPointTLI)));
01879         }
01880     }
01881     LWLockRelease(ControlFileLock);
01882 }
01883 
01884 /*
01885  * Ensure that all XLOG data through the given position is flushed to disk.
01886  *
01887  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
01888  * already held, and we try to avoid acquiring it if possible.
       *
       * record: position through which WAL must be flushed.
       *
       * When XLogInsertAllowed() is false, no flush is attempted; instead
       * UpdateMinRecoveryPoint(record, false) is called and we return.  If the
       * flushed position is still short of 'record' on exit, an ERROR is
       * reported.
01889  */
01890 void
01891 XLogFlush(XLogRecPtr record)
01892 {
01893     XLogRecPtr  WriteRqstPtr;
01894     XLogwrtRqst WriteRqst;
01895 
01896     /*
01897      * During REDO, we are reading not writing WAL.  Therefore, instead of
01898      * trying to flush the WAL, we should update minRecoveryPoint instead. We
01899      * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
01900      * to act this way too, and because when it tries to write the
01901      * end-of-recovery checkpoint, it should indeed flush.
01902      */
01903     if (!XLogInsertAllowed())
01904     {
01905         UpdateMinRecoveryPoint(record, false);
01906         return;
01907     }
01908 
01909     /* Quick exit if already known flushed */
          /* (uses the local LogwrtResult copy, without taking any lock) */
01910     if (record <= LogwrtResult.Flush)
01911         return;
01912 
01913 #ifdef WAL_DEBUG
01914     if (XLOG_DEBUG)
01915         elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
01916              (uint32) (record >> 32), (uint32) record,
01917              (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
01918              (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
01919 #endif
01920 
01921     START_CRIT_SECTION();
01922 
01923     /*
01924      * Since fsync is usually a horribly expensive operation, we try to
01925      * piggyback as much data as we can on each fsync: if we see any more data
01926      * entered into the xlog buffer, we'll write and fsync that too, so that
01927      * the final value of LogwrtResult.Flush is as large as possible. This
01928      * gives us some chance of avoiding another fsync immediately after.
01929      */
01930 
01931     /* initialize to given target; may increase below */
01932     WriteRqstPtr = record;
01933 
01934     /*
01935      * Now wait until we get the write lock, or someone else does the flush
01936      * for us.
01937      */
01938     for (;;)
01939     {
01940         /* use volatile pointer to prevent code rearrangement */
01941         volatile XLogCtlData *xlogctl = XLogCtl;
01942 
01943         /* read LogwrtResult and update local state */
01944         SpinLockAcquire(&xlogctl->info_lck);
01945         if (WriteRqstPtr < xlogctl->LogwrtRqst.Write)
01946             WriteRqstPtr = xlogctl->LogwrtRqst.Write;
01947         LogwrtResult = xlogctl->LogwrtResult;
01948         SpinLockRelease(&xlogctl->info_lck);
01949 
01950         /* done already? */
01951         if (record <= LogwrtResult.Flush)
01952             break;
01953 
01954         /*
01955          * Try to get the write lock. If we can't get it immediately, wait
01956          * until it's released, and recheck if we still need to do the flush
01957          * or if the backend that held the lock did it for us already. This
01958          * helps to maintain a good rate of group committing when the system
01959          * is bottlenecked by the speed of fsyncing.
01960          */
01961         if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
01962         {
01963             /*
01964              * The lock is now free, but we didn't acquire it yet. Before we
01965              * do, loop back to check if someone else flushed the record for
01966              * us already.
01967              */
01968             continue;
01969         }
01970 
01971         /* Got the lock; recheck whether request is satisfied */
01972         LogwrtResult = XLogCtl->LogwrtResult;
01973         if (record <= LogwrtResult.Flush)
01974         {
01975             LWLockRelease(WALWriteLock);
01976             break;
01977         }
01978 
01979         /*
01980          * Sleep before flush! By adding a delay here, we may give further
01981          * backends the opportunity to join the backlog of group commit
01982          * followers; this can significantly improve transaction throughput, at
01983          * the risk of increasing transaction latency.
01984          *
01985          * We do not sleep if enableFsync is not turned on, nor if there are
01986          * fewer than CommitSiblings other backends with active transactions.
01987          */
01988         if (CommitDelay > 0 && enableFsync &&
01989             MinimumActiveBackends(CommitSiblings))
01990             pg_usleep(CommitDelay);
01991 
01992         /* try to write/flush later additions to XLOG as well */
              /*
               * If the insert lock is free, write and flush up to the current
               * insert position.  Otherwise write up to the largest request seen
               * so far, but flush only as far as the caller asked.
               */
01993         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
01994         {
01995             XLogCtlInsert *Insert = &XLogCtl->Insert;
01996             uint32      freespace = INSERT_FREESPACE(Insert);
01997 
01998             if (freespace == 0)     /* buffer is full */
01999                 WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
02000             else
02001             {
02002                 WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
02003                 WriteRqstPtr -= freespace;
02004             }
02005             LWLockRelease(WALInsertLock);
02006             WriteRqst.Write = WriteRqstPtr;
02007             WriteRqst.Flush = WriteRqstPtr;
02008         }
02009         else
02010         {
02011             WriteRqst.Write = WriteRqstPtr;
02012             WriteRqst.Flush = record;
02013         }
              /* not flexible and not a segment switch */
02014         XLogWrite(WriteRqst, false, false);
02015 
02016         LWLockRelease(WALWriteLock);
02017         /* done */
02018         break;
02019     }
02020 
02021     END_CRIT_SECTION();
02022 
02023     /* wake up walsenders now that we've released heavily contended locks */
02024     WalSndWakeupProcessRequests();
02025 
02026     /*
02027      * If we still haven't flushed to the request point then we have a
02028      * problem; most likely, the requested flush point is past end of XLOG.
02029      * This has been seen to occur when a disk page has a corrupted LSN.
02030      *
02031      * Formerly we treated this as a PANIC condition, but that hurts the
02032      * system's robustness rather than helping it: we do not want to take down
02033      * the whole system due to corruption on one data page.  In particular, if
02034      * the bad page is encountered again during recovery then we would be
02035      * unable to restart the database at all!  (This scenario actually
02036      * happened in the field several times with 7.1 releases.)  As of 8.4, bad
02037      * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
02038      * the only time we can reach here during recovery is while flushing the
02039      * end-of-recovery checkpoint record, and we don't expect that to have a
02040      * bad LSN.
02041      *
02042      * Note that for calls from xact.c, the ERROR will be promoted to PANIC
02043      * since xact.c calls this routine inside a critical section.  However,
02044      * calls from bufmgr.c are not within critical sections and so we will not
02045      * force a restart for a bad LSN on a data page.
02046      */
02047     if (LogwrtResult.Flush < record)
02048         elog(ERROR,
02049         "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
02050              (uint32) (record >> 32), (uint32) record,
02051              (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
02052 }
02053 
02054 /*
02055  * Flush xlog, but without specifying exactly where to flush to.
02056  *
02057  * We normally flush only completed blocks; but if there is nothing to do on
02058  * that basis, we check for unflushed async commits in the current incomplete
02059  * block, and flush through the latest one of those.  Thus, if async commits
02060  * are not being used, we will flush complete blocks only.  We can guarantee
02061  * that async commits reach disk after at most three cycles; normally only
02062  * one or two.  (When flushing complete blocks, we allow XLogWrite to write
02063  * "flexibly", meaning it can stop at the end of the buffer ring; this makes a
02064  * difference only with very high load or long wal_writer_delay, but imposes
02065  * one extra cycle for the worst case for async commits.)
02066  *
02067  * This routine is invoked periodically by the background walwriter process.
02068  *
02069  * Returns TRUE if we flushed anything.
       * (More precisely: TRUE if XLogWrite was called.  FALSE is returned during
       * recovery, and when the target is already flushed; in the latter case an
       * open file handle on a segment no longer in use is closed first.)
02070  */
02071 bool
02072 XLogBackgroundFlush(void)
02073 {
02074     XLogRecPtr  WriteRqstPtr;
02075     bool        flexible = true;
02076     bool        wrote_something = false;
02077 
02078     /* XLOG doesn't need flushing during recovery */
02079     if (RecoveryInProgress())
02080         return false;
02081 
02082     /* read LogwrtResult and update local state */
02083     {
02084         /* use volatile pointer to prevent code rearrangement */
02085         volatile XLogCtlData *xlogctl = XLogCtl;
02086 
02087         SpinLockAcquire(&xlogctl->info_lck);
02088         LogwrtResult = xlogctl->LogwrtResult;
02089         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
02090         SpinLockRelease(&xlogctl->info_lck);
02091     }
02092 
02093     /* back off to last completed page boundary */
02094     WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
02095 
02096     /* if we have already flushed that far, consider async commit records */
02097     if (WriteRqstPtr <= LogwrtResult.Flush)
02098     {
02099         /* use volatile pointer to prevent code rearrangement */
02100         volatile XLogCtlData *xlogctl = XLogCtl;
02101 
02102         SpinLockAcquire(&xlogctl->info_lck);
02103         WriteRqstPtr = xlogctl->asyncXactLSN;
02104         SpinLockRelease(&xlogctl->info_lck);
02105         flexible = false;       /* ensure it all gets written */
02106     }
02107 
02108     /*
02109      * If already known flushed, we're done. Just need to check if we are
02110      * holding an open file handle to a logfile that's no longer in use,
02111      * preventing the file from being deleted.
02112      */
02113     if (WriteRqstPtr <= LogwrtResult.Flush)
02114     {
02115         if (openLogFile >= 0)
02116         {
02117             if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
02118             {
02119                 XLogFileClose();
02120             }
02121         }
02122         return false;
02123     }
02124 
02125 #ifdef WAL_DEBUG
02126     if (XLOG_DEBUG)
02127         elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
02128              (uint32) (WriteRqstPtr >> 32), (uint32) WriteRqstPtr,
02129              (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
02130              (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
02131 #endif
02132 
02133     START_CRIT_SECTION();
02134 
02135     /* now wait for the write lock */
02136     LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
          /* re-read the shared result now that we hold the lock, then recheck */
02137     LogwrtResult = XLogCtl->LogwrtResult;
02138     if (WriteRqstPtr > LogwrtResult.Flush)
02139     {
02140         XLogwrtRqst WriteRqst;
02141 
              /* write and flush targets are the same position */
02142         WriteRqst.Write = WriteRqstPtr;
02143         WriteRqst.Flush = WriteRqstPtr;
02144         XLogWrite(WriteRqst, flexible, false);
02145         wrote_something = true;
02146     }
02147     LWLockRelease(WALWriteLock);
02148 
02149     END_CRIT_SECTION();
02150 
02151     /* wake up walsenders now that we've released heavily contended locks */
02152     WalSndWakeupProcessRequests();
02153 
02154     return wrote_something;
02155 }
02156 
02157 /*
02158  * Test whether XLOG data has been flushed up to (at least) the given position.
02159  *
02160  * Returns true if a flush is still needed.  (It may be that someone else
02161  * is already in process of flushing that far, however.)
       *
       * record: position to test.
       *
       * Side effects: may refresh this process's local copies of shared state
       * (minRecoveryPoint and minRecoveryPointTLI during recovery, LogwrtResult
       * otherwise), and may clear the local updateMinRecoveryPoint flag.
02162  */
02163 bool
02164 XLogNeedsFlush(XLogRecPtr record)
02165 {
02166     /*
02167      * During recovery, we don't flush WAL but update minRecoveryPoint
02168      * instead. So "needs flush" is taken to mean whether minRecoveryPoint
02169      * would need to be updated.
02170      */
02171     if (RecoveryInProgress())
02172     {
02173         /* Quick exit if already known updated */
02174         if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
02175             return false;
02176 
02177         /*
02178          * Update local copy of minRecoveryPoint. But if the lock is busy,
02179          * just return a conservative guess.
02180          */
02181         if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
02182             return true;
02183         minRecoveryPoint = ControlFile->minRecoveryPoint;
02184         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
02185         LWLockRelease(ControlFileLock);
02186 
02187         /*
02188          * An invalid minRecoveryPoint means that we need to recover all the
02189          * WAL, i.e., we're doing crash recovery.  We never modify the control
02190          * file's value in that case, so we can short-circuit future checks
02191          * here too.
02192          */
02193         if (minRecoveryPoint == 0)
02194             updateMinRecoveryPoint = false;
02195 
02196         /* check again */
02197         if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
02198             return false;
02199         else
02200             return true;
02201     }
02202 
02203     /* Quick exit if already known flushed */
          /* (local LogwrtResult copy only; no lock taken yet) */
02204     if (record <= LogwrtResult.Flush)
02205         return false;
02206 
02207     /* read LogwrtResult and update local state */
02208     {
02209         /* use volatile pointer to prevent code rearrangement */
02210         volatile XLogCtlData *xlogctl = XLogCtl;
02211 
02212         SpinLockAcquire(&xlogctl->info_lck);
02213         LogwrtResult = xlogctl->LogwrtResult;
02214         SpinLockRelease(&xlogctl->info_lck);
02215     }
02216 
02217     /* check again */
02218     if (record <= LogwrtResult.Flush)
02219         return false;
02220 
02221     return true;
02222 }
02223 
02224 /*
02225  * Create a new XLOG file segment, or open a pre-existing one.
02226  *
02227  * logsegno: identify segment to be created/opened.
02228  *
02229  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
02230  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
02231  * file was used.
02232  *
02233  * use_lock: if TRUE, acquire ControlFileLock while moving file into
02234  * place.  This should be TRUE except during bootstrap log creation.  The
02235  * caller must *not* hold the lock at call.
02236  *
02237  * Returns FD of opened file.
02238  *
02239  * Note: errors here are ERROR not PANIC because we might or might not be
02240  * inside a critical section (eg, during checkpoint there is no reason to
02241  * take down the system on failure).  They will promote to PANIC if we are
02242  * in a critical section.
02243  */
02244 int
02245 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
02246 {
02247     char        path[MAXPGPATH];
02248     char        tmppath[MAXPGPATH];
02249     char       *zbuffer;
02250     XLogSegNo   installed_segno;
02251     int         max_advance;
02252     int         fd;
02253     int         nbytes;
02254 
02255     XLogFilePath(path, ThisTimeLineID, logsegno);
02256 
02257     /*
02258      * Try to use existent file (checkpoint maker may have created it already)
02259      */
02260     if (*use_existent)
02261     {
02262         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
02263                            S_IRUSR | S_IWUSR);
02264         if (fd < 0)
02265         {
02266             if (errno != ENOENT)
02267                 ereport(ERROR,
02268                         (errcode_for_file_access(),
02269                          errmsg("could not open file \"%s\": %m", path)));
02270         }
02271         else
02272             return fd;
02273     }
02274 
02275     /*
02276      * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
02277      * another process is doing the same thing.  If so, we will end up
02278      * pre-creating an extra log segment.  That seems OK, and better than
02279      * holding the lock throughout this lengthy process.
02280      */
02281     elog(DEBUG2, "creating and filling new WAL file");
02282 
02283     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
02284 
02285     unlink(tmppath);
02286 
02287     /*
02288      * Allocate a buffer full of zeros. This is done before opening the file
02289      * so that we don't leak the file descriptor if palloc fails.
02290      *
02291      * Note: palloc zbuffer, instead of just using a local char array, to
02292      * ensure it is reasonably well-aligned; this may save a few cycles
02293      * transferring data to the kernel.
02294      */
02295     zbuffer = (char *) palloc0(XLOG_BLCKSZ);
02296 
02297     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
02298     fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
02299                        S_IRUSR | S_IWUSR);
02300     if (fd < 0)
02301         ereport(ERROR,
02302                 (errcode_for_file_access(),
02303                  errmsg("could not create file \"%s\": %m", tmppath)));
02304 
02305     /*
02306      * Zero-fill the file.  We have to do this the hard way to ensure that all
02307      * the file space has really been allocated --- on platforms that allow
02308      * "holes" in files, just seeking to the end doesn't allocate intermediate
02309      * space.  This way, we know that we have all the space and (after the
02310      * fsync below) that all the indirect blocks are down on disk.  Therefore,
02311      * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
02312      * log file.
02313      */
02314     for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
02315     {
02316         errno = 0;
02317         if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
02318         {
02319             int         save_errno = errno;
02320 
02321             /*
02322              * If we fail to make the file, delete it to release disk space
02323              */
02324             unlink(tmppath);
02325 
02326             close(fd);
02327 
02328             /* if write didn't set errno, assume problem is no disk space */
02329             errno = save_errno ? save_errno : ENOSPC;
02330 
02331             ereport(ERROR,
02332                     (errcode_for_file_access(),
02333                      errmsg("could not write to file \"%s\": %m", tmppath)));
02334         }
02335     }
02336     pfree(zbuffer);
02337 
02338     if (pg_fsync(fd) != 0)
02339     {
02340         close(fd);
02341         ereport(ERROR,
02342                 (errcode_for_file_access(),
02343                  errmsg("could not fsync file \"%s\": %m", tmppath)));
02344     }
02345 
02346     if (close(fd))
02347         ereport(ERROR,
02348                 (errcode_for_file_access(),
02349                  errmsg("could not close file \"%s\": %m", tmppath)));
02350 
02351     /*
02352      * Now move the segment into place with its final name.
02353      *
02354      * If caller didn't want to use a pre-existing file, get rid of any
02355      * pre-existing file.  Otherwise, cope with possibility that someone else
02356      * has created the file while we were filling ours: if so, use ours to
02357      * pre-create a future log segment.
02358      */
02359     installed_segno = logsegno;
02360     max_advance = XLOGfileslop;
02361     if (!InstallXLogFileSegment(&installed_segno, tmppath,
02362                                 *use_existent, &max_advance,
02363                                 use_lock))
02364     {
02365         /*
02366          * No need for any more future segments, or InstallXLogFileSegment()
02367          * failed to rename the file into place. If the rename failed, opening
02368          * the file below will fail.
02369          */
02370         unlink(tmppath);
02371     }
02372 
02373     /* Set flag to tell caller there was no existent file */
02374     *use_existent = false;
02375 
02376     /* Now open original target segment (might not be file I just made) */
02377     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
02378                        S_IRUSR | S_IWUSR);
02379     if (fd < 0)
02380         ereport(ERROR,
02381                 (errcode_for_file_access(),
02382            errmsg("could not open file \"%s\": %m", path)));
02383 
02384     elog(DEBUG2, "done creating and filling new WAL file");
02385 
02386     return fd;
02387 }
02388 
02389 /*
02390  * Create a new XLOG file segment by copying a pre-existing one.
02391  *
02392  * destsegno: identify segment to be created.
02393  *
02394  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
02395  *      a different timeline)
02396  *
02397  * Currently this is only used during recovery, and so there are no locking
02398  * considerations.  But we should be just as tense as XLogFileInit to avoid
02399  * emplacing a bogus file.
02400  */
02401 static void
02402 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno)
02403 {
02404     char        path[MAXPGPATH];
02405     char        tmppath[MAXPGPATH];
02406     char        buffer[XLOG_BLCKSZ];
02407     int         srcfd;
02408     int         fd;
02409     int         nbytes;
02410 
02411     /*
02412      * Open the source file
02413      */
02414     XLogFilePath(path, srcTLI, srcsegno);
02415     srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
02416     if (srcfd < 0)
02417         ereport(ERROR,
02418                 (errcode_for_file_access(),
02419                  errmsg("could not open file \"%s\": %m", path)));
02420 
02421     /*
02422      * Copy into a temp file name.
02423      */
02424     snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
02425 
02426     unlink(tmppath);
02427 
02428     /* do not use get_sync_bit() here --- want to fsync only at end of fill */
02429     fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
02430                            S_IRUSR | S_IWUSR);
02431     if (fd < 0)
02432         ereport(ERROR,
02433                 (errcode_for_file_access(),
02434                  errmsg("could not create file \"%s\": %m", tmppath)));
02435 
02436     /*
02437      * Do the data copying.
02438      */
02439     for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
02440     {
02441         errno = 0;
02442         if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
02443         {
02444             if (errno != 0)
02445                 ereport(ERROR,
02446                         (errcode_for_file_access(),
02447                          errmsg("could not read file \"%s\": %m", path)));
02448             else
02449                 ereport(ERROR,
02450                         (errmsg("not enough data in file \"%s\"", path)));
02451         }
02452         errno = 0;
02453         if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
02454         {
02455             int         save_errno = errno;
02456 
02457             /*
02458              * If we fail to make the file, delete it to release disk space
02459              */
02460             unlink(tmppath);
02461             /* if write didn't set errno, assume problem is no disk space */
02462             errno = save_errno ? save_errno : ENOSPC;
02463 
02464             ereport(ERROR,
02465                     (errcode_for_file_access(),
02466                      errmsg("could not write to file \"%s\": %m", tmppath)));
02467         }
02468     }
02469 
02470     if (pg_fsync(fd) != 0)
02471         ereport(ERROR,
02472                 (errcode_for_file_access(),
02473                  errmsg("could not fsync file \"%s\": %m", tmppath)));
02474 
02475     if (CloseTransientFile(fd))
02476         ereport(ERROR,
02477                 (errcode_for_file_access(),
02478                  errmsg("could not close file \"%s\": %m", tmppath)));
02479 
02480     CloseTransientFile(srcfd);
02481 
02482     /*
02483      * Now move the segment into place with its final name.
02484      */
02485     if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false))
02486         elog(ERROR, "InstallXLogFileSegment should not have failed");
02487 }
02488 
02489 /*
02490  * Install a new XLOG segment file as a current or future log segment.
02491  *
02492  * This is used both to install a newly-created segment (which has a temp
02493  * filename while it's being created) and to recycle an old segment.
02494  *
02495  * *segno: identify segment to install as (or first possible target).
02496  * When find_free is TRUE, this is modified on return to indicate the
02497  * actual installation location or last segment searched.
02498  *
02499  * tmppath: initial name of file to install.  It will be renamed into place.
02500  *
02501  * find_free: if TRUE, install the new segment at the first empty segno
02502  * number at or after the passed numbers.  If FALSE, install the new segment
02503  * exactly where specified, deleting any existing segment file there.
02504  *
02505  * *max_advance: maximum number of segno slots to advance past the starting
02506  * point.  Fail if no free slot is found in this range.  On return, reduced
02507  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
02508  * when find_free is FALSE.)
02509  *
02510  * use_lock: if TRUE, acquire ControlFileLock while moving file into
02511  * place.  This should be TRUE except during bootstrap log creation.  The
02512  * caller must *not* hold the lock at call.
02513  *
02514  * Returns TRUE if the file was installed successfully.  FALSE indicates that
02515  * max_advance limit was exceeded, or an error occurred while renaming the
02516  * file into place.
02517  */
02518 static bool
02519 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
02520                        bool find_free, int *max_advance,
02521                        bool use_lock)
02522 {
02523     char        path[MAXPGPATH];
02524     struct stat stat_buf;
02525 
02526     XLogFilePath(path, ThisTimeLineID, *segno);
02527 
02528     /*
02529      * We want to be sure that only one process does this at a time.
02530      */
02531     if (use_lock)
02532         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
02533 
02534     if (!find_free)
02535     {
02536         /* Force installation: get rid of any pre-existing segment file */
02537         unlink(path);
02538     }
02539     else
02540     {
02541         /* Find a free slot to put it in */
02542         while (stat(path, &stat_buf) == 0)
02543         {
02544             if (*max_advance <= 0)
02545             {
02546                 /* Failed to find a free slot within specified range */
02547                 if (use_lock)
02548                     LWLockRelease(ControlFileLock);
02549                 return false;
02550             }
02551             (*segno)++;
02552             (*max_advance)--;
02553             XLogFilePath(path, ThisTimeLineID, *segno);
02554         }
02555     }
02556 
02557     /*
02558      * Prefer link() to rename() here just to be really sure that we don't
02559      * overwrite an existing logfile.  However, there shouldn't be one, so
02560      * rename() is an acceptable substitute except for the truly paranoid.
02561      */
02562 #if HAVE_WORKING_LINK
02563     if (link(tmppath, path) < 0)
02564     {
02565         if (use_lock)
02566             LWLockRelease(ControlFileLock);
02567         ereport(LOG,
02568                 (errcode_for_file_access(),
02569                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
02570                         tmppath, path)));
02571         return false;
02572     }
02573     unlink(tmppath);
02574 #else
02575     if (rename(tmppath, path) < 0)
02576     {
02577         if (use_lock)
02578             LWLockRelease(ControlFileLock);
02579         ereport(LOG,
02580                 (errcode_for_file_access(),
02581                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
02582                         tmppath, path)));
02583         return false;
02584     }
02585 #endif
02586 
02587     if (use_lock)
02588         LWLockRelease(ControlFileLock);
02589 
02590     return true;
02591 }
02592 
02593 /*
02594  * Open a pre-existing logfile segment for writing.
02595  */
02596 int
02597 XLogFileOpen(XLogSegNo segno)
02598 {
02599     char        path[MAXPGPATH];
02600     int         fd;
02601 
02602     XLogFilePath(path, ThisTimeLineID, segno);
02603 
02604     fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
02605                        S_IRUSR | S_IWUSR);
02606     if (fd < 0)
02607         ereport(PANIC,
02608                 (errcode_for_file_access(),
02609                  errmsg("could not open xlog file \"%s\": %m", path)));
02610 
02611     return fd;
02612 }
02613 
02614 /*
02615  * Open a logfile segment for reading (during recovery).
02616  *
02617  * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
02618  * Otherwise, it's assumed to be already available in pg_xlog.
02619  */
02620 static int
02621 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
02622              int source, bool notfoundOk)
02623 {
02624     char        xlogfname[MAXFNAMELEN];
02625     char        activitymsg[MAXFNAMELEN + 16];
02626     char        path[MAXPGPATH];
02627     int         fd;
02628 
02629     XLogFileName(xlogfname, tli, segno);
02630 
02631     switch (source)
02632     {
02633         case XLOG_FROM_ARCHIVE:
02634             /* Report recovery progress in PS display */
02635             snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
02636                      xlogfname);
02637             set_ps_display(activitymsg, false);
02638 
02639             restoredFromArchive = RestoreArchivedFile(path, xlogfname,
02640                                                       "RECOVERYXLOG",
02641                                                       XLogSegSize,
02642                                                       InRedo);
02643             if (!restoredFromArchive)
02644                 return -1;
02645             break;
02646 
02647         case XLOG_FROM_PG_XLOG:
02648         case XLOG_FROM_STREAM:
02649             XLogFilePath(path, tli, segno);
02650             restoredFromArchive = false;
02651             break;
02652 
02653         default:
02654             elog(ERROR, "invalid XLogFileRead source %d", source);
02655     }
02656 
02657     /*
02658      * If the segment was fetched from archival storage, replace the existing
02659      * xlog segment (if any) with the archival version.
02660      */
02661     if (source == XLOG_FROM_ARCHIVE)
02662     {
02663         KeepFileRestoredFromArchive(path, xlogfname);
02664 
02665         /*
02666          * Set path to point at the new file in pg_xlog.
02667          */
02668         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
02669     }
02670 
02671     fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
02672     if (fd >= 0)
02673     {
02674         /* Success! */
02675         curFileTLI = tli;
02676 
02677         /* Report recovery progress in PS display */
02678         snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
02679                  xlogfname);
02680         set_ps_display(activitymsg, false);
02681 
02682         /* Track source of data in assorted state variables */
02683         readSource = source;
02684         XLogReceiptSource = source;
02685         /* In FROM_STREAM case, caller tracks receipt time, not me */
02686         if (source != XLOG_FROM_STREAM)
02687             XLogReceiptTime = GetCurrentTimestamp();
02688 
02689         return fd;
02690     }
02691     if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
02692         ereport(PANIC,
02693                 (errcode_for_file_access(),
02694                  errmsg("could not open file \"%s\": %m", path)));
02695     return -1;
02696 }
02697 
02698 /*
02699  * Open a logfile segment for reading (during recovery).
02700  *
02701  * This version searches for the segment with any TLI listed in expectedTLEs.
02702  */
02703 static int
02704 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
02705 {
02706     char        path[MAXPGPATH];
02707     ListCell   *cell;
02708     int         fd;
02709     List       *tles;
02710 
02711     /*
02712      * Loop looking for a suitable timeline ID: we might need to read any of
02713      * the timelines listed in expectedTLEs.
02714      *
02715      * We expect curFileTLI on entry to be the TLI of the preceding file in
02716      * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
02717      * to go backwards; this prevents us from picking up the wrong file when a
02718      * parent timeline extends to higher segment numbers than the child we
02719      * want to read.
02720      *
02721      * If we haven't read the timeline history file yet, read it now, so that
02722      * we know which TLIs to scan.  We don't save the list in expectedTLEs,
02723      * however, unless we actually find a valid segment.  That way if there is
02724      * neither a timeline history file nor a WAL segment in the archive, and
02725      * streaming replication is set up, we'll read the timeline history file
02726      * streamed from the master when we start streaming, instead of recovering
02727      * with a dummy history generated here.
02728      */
02729     if (expectedTLEs)
02730         tles = expectedTLEs;
02731     else
02732         tles = readTimeLineHistory(recoveryTargetTLI);
02733 
02734     foreach(cell, tles)
02735     {
02736         TimeLineID  tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
02737 
02738         if (tli < curFileTLI)
02739             break;              /* don't bother looking at too-old TLIs */
02740 
02741         if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
02742         {
02743             fd = XLogFileRead(segno, emode, tli,
02744                               XLOG_FROM_ARCHIVE, true);
02745             if (fd != -1)
02746             {
02747                 elog(DEBUG1, "got WAL segment from archive");
02748                 if (!expectedTLEs)
02749                     expectedTLEs = tles;
02750                 return fd;
02751             }
02752         }
02753 
02754         if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
02755         {
02756             fd = XLogFileRead(segno, emode, tli,
02757                               XLOG_FROM_PG_XLOG, true);
02758             if (fd != -1)
02759             {
02760                 if (!expectedTLEs)
02761                     expectedTLEs = tles;
02762                 return fd;
02763             }
02764         }
02765     }
02766 
02767     /* Couldn't find it.  For simplicity, complain about front timeline */
02768     XLogFilePath(path, recoveryTargetTLI, segno);
02769     errno = ENOENT;
02770     ereport(emode,
02771             (errcode_for_file_access(),
02772              errmsg("could not open file \"%s\": %m", path)));
02773     return -1;
02774 }
02775 
02776 /*
02777  * Close the current logfile segment for writing.
02778  */
02779 static void
02780 XLogFileClose(void)
02781 {
02782     Assert(openLogFile >= 0);
02783 
02784     /*
02785      * WAL segment files will not be re-read in normal operation, so we advise
02786      * the OS to release any cached pages.  But do not do so if WAL archiving
02787      * or streaming is active, because archiver and walsender process could
02788      * use the cache to read the WAL segment.
02789      */
02790 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
02791     if (!XLogIsNeeded())
02792         (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
02793 #endif
02794 
02795     if (close(openLogFile))
02796         ereport(PANIC,
02797                 (errcode_for_file_access(),
02798                  errmsg("could not close log file %s: %m",
02799                         XLogFileNameP(ThisTimeLineID, openLogSegNo))));
02800     openLogFile = -1;
02801 }
02802 
02803 /*
02804  * Preallocate log files beyond the specified log endpoint.
02805  *
02806  * XXX this is currently extremely conservative, since it forces only one
02807  * future log segment to exist, and even that only if we are 75% done with
02808  * the current one.  This is only appropriate for very low-WAL-volume systems.
02809  * High-volume systems will be OK once they've built up a sufficient set of
02810  * recycled log segments, but the startup transient is likely to include
02811  * a lot of segment creations by foreground processes, which is not so good.
02812  */
02813 static void
02814 PreallocXlogFiles(XLogRecPtr endptr)
02815 {
02816     XLogSegNo   _logSegNo;
02817     int         lf;
02818     bool        use_existent;
02819 
02820     XLByteToPrevSeg(endptr, _logSegNo);
02821     if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
02822     {
02823         _logSegNo++;
02824         use_existent = true;
02825         lf = XLogFileInit(_logSegNo, &use_existent, true);
02826         close(lf);
02827         if (!use_existent)
02828             CheckpointStats.ckpt_segs_added++;
02829     }
02830 }
02831 
02832 /*
02833  * Throws an error if the given log segment has already been removed or
02834  * recycled. The caller should only pass a segment that it knows to have
02835  * existed while the server has been running, as this function always
02836  * succeeds if no WAL segments have been removed since startup.
02837  * 'tli' is only used in the error message.
02838  */
02839 void
02840 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
02841 {
02842     /* use volatile pointer to prevent code rearrangement */
02843     volatile XLogCtlData *xlogctl = XLogCtl;
02844     XLogSegNo   lastRemovedSegNo;
02845 
02846     SpinLockAcquire(&xlogctl->info_lck);
02847     lastRemovedSegNo = xlogctl->lastRemovedSegNo;
02848     SpinLockRelease(&xlogctl->info_lck);
02849 
02850     if (segno <= lastRemovedSegNo)
02851     {
02852         char        filename[MAXFNAMELEN];
02853 
02854         XLogFileName(filename, tli, segno);
02855         ereport(ERROR,
02856                 (errcode_for_file_access(),
02857                  errmsg("requested WAL segment %s has already been removed",
02858                         filename)));
02859     }
02860 }
02861 
02862 /*
02863  * Update the last removed segno pointer in shared memory, to reflect
02864  * that the given XLOG file has been removed.
02865  */
02866 static void
02867 UpdateLastRemovedPtr(char *filename)
02868 {
02869     /* use volatile pointer to prevent code rearrangement */
02870     volatile XLogCtlData *xlogctl = XLogCtl;
02871     uint32      tli;
02872     XLogSegNo   segno;
02873 
02874     XLogFromFileName(filename, &tli, &segno);
02875 
02876     SpinLockAcquire(&xlogctl->info_lck);
02877     if (segno > xlogctl->lastRemovedSegNo)
02878         xlogctl->lastRemovedSegNo = segno;
02879     SpinLockRelease(&xlogctl->info_lck);
02880 }
02881 
02882 /*
02883  * Recycle or remove all log files older or equal to passed segno
02884  *
02885  * endptr is current (or recent) end of xlog; this is used to determine
02886  * whether we want to recycle rather than delete no-longer-wanted log files.
02887  */
02888 static void
02889 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
02890 {
02891     XLogSegNo   endlogSegNo;
02892     int         max_advance;
02893     DIR        *xldir;
02894     struct dirent *xlde;
02895     char        lastoff[MAXFNAMELEN];
02896     char        path[MAXPGPATH];
02897 
02898 #ifdef WIN32
02899     char        newpath[MAXPGPATH];
02900 #endif
02901     struct stat statbuf;
02902 
02903     /*
02904      * Initialize info about where to try to recycle to.  We allow recycling
02905      * segments up to XLOGfileslop segments beyond the current XLOG location.
02906      */
02907     XLByteToPrevSeg(endptr, endlogSegNo);
02908     max_advance = XLOGfileslop;
02909 
02910     xldir = AllocateDir(XLOGDIR);
02911     if (xldir == NULL)
02912         ereport(ERROR,
02913                 (errcode_for_file_access(),
02914                  errmsg("could not open transaction log directory \"%s\": %m",
02915                         XLOGDIR)));
02916 
02917     /*
02918      * Construct a filename of the last segment to be kept. The timeline ID
02919      * doesn't matter, we ignore that in the comparison. (During recovery,
02920      * ThisTimeLineID isn't set, so we can't use that.)
02921      */
02922     XLogFileName(lastoff, 0, segno);
02923 
02924     elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
02925          lastoff);
02926 
02927     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
02928     {
02929         /*
02930          * We ignore the timeline part of the XLOG segment identifiers in
02931          * deciding whether a segment is still needed.  This ensures that we
02932          * won't prematurely remove a segment from a parent timeline. We could
02933          * probably be a little more proactive about removing segments of
02934          * non-parent timelines, but that would be a whole lot more
02935          * complicated.
02936          *
02937          * We use the alphanumeric sorting property of the filenames to decide
02938          * which ones are earlier than the lastoff segment.
02939          */
02940         if (strlen(xlde->d_name) == 24 &&
02941             strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
02942             strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
02943         {
02944             if (XLogArchiveCheckDone(xlde->d_name))
02945             {
02946                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
02947 
02948                 /* Update the last removed location in shared memory first */
02949                 UpdateLastRemovedPtr(xlde->d_name);
02950 
02951                 /*
02952                  * Before deleting the file, see if it can be recycled as a
02953                  * future log segment. Only recycle normal files, pg_standby
02954                  * for example can create symbolic links pointing to a
02955                  * separate archive directory.
02956                  */
02957                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
02958                     InstallXLogFileSegment(&endlogSegNo, path,
02959                                            true, &max_advance, true))
02960                 {
02961                     ereport(DEBUG2,
02962                             (errmsg("recycled transaction log file \"%s\"",
02963                                     xlde->d_name)));
02964                     CheckpointStats.ckpt_segs_recycled++;
02965                     /* Needn't recheck that slot on future iterations */
02966                     if (max_advance > 0)
02967                     {
02968                         endlogSegNo++;
02969                         max_advance--;
02970                     }
02971                 }
02972                 else
02973                 {
02974                     /* No need for any more future segments... */
02975                     int         rc;
02976 
02977                     ereport(DEBUG2,
02978                             (errmsg("removing transaction log file \"%s\"",
02979                                     xlde->d_name)));
02980 
02981 #ifdef WIN32
02982 
02983                     /*
02984                      * On Windows, if another process (e.g another backend)
02985                      * holds the file open in FILE_SHARE_DELETE mode, unlink
02986                      * will succeed, but the file will still show up in
02987                      * directory listing until the last handle is closed. To
02988                      * avoid confusing the lingering deleted file for a live
02989                      * WAL file that needs to be archived, rename it before
02990                      * deleting it.
02991                      *
02992                      * If another process holds the file open without
02993                      * FILE_SHARE_DELETE flag, rename will fail. We'll try
02994                      * again at the next checkpoint.
02995                      */
02996                     snprintf(newpath, MAXPGPATH, "%s.deleted", path);
02997                     if (rename(path, newpath) != 0)
02998                     {
02999                         ereport(LOG,
03000                                 (errcode_for_file_access(),
03001                                  errmsg("could not rename old transaction log file \"%s\": %m",
03002                                         path)));
03003                         continue;
03004                     }
03005                     rc = unlink(newpath);
03006 #else
03007                     rc = unlink(path);
03008 #endif
03009                     if (rc != 0)
03010                     {
03011                         ereport(LOG,
03012                                 (errcode_for_file_access(),
03013                                  errmsg("could not remove old transaction log file \"%s\": %m",
03014                                         path)));
03015                         continue;
03016                     }
03017                     CheckpointStats.ckpt_segs_removed++;
03018                 }
03019 
03020                 XLogArchiveCleanup(xlde->d_name);
03021             }
03022         }
03023     }
03024 
03025     FreeDir(xldir);
03026 }
03027 
03028 /*
03029  * Verify whether pg_xlog and pg_xlog/archive_status exist.
03030  * If the latter does not exist, recreate it.
03031  *
03032  * It is not the goal of this function to verify the contents of these
03033  * directories, but to help in cases where someone has performed a cluster
03034  * copy for PITR purposes but omitted pg_xlog from the copy.
03035  *
03036  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
03037  * policy decision was made not to.  It is fairly common for pg_xlog to be
03038  * a symlink, and if that was the DBA's intent then automatically making a
03039  * plain directory would result in degraded performance with no notice.
03040  */
03041 static void
03042 ValidateXLOGDirectoryStructure(void)
03043 {
03044     char        path[MAXPGPATH];
03045     struct stat stat_buf;
03046 
03047     /* Check for pg_xlog; if it doesn't exist, error out */
03048     if (stat(XLOGDIR, &stat_buf) != 0 ||
03049         !S_ISDIR(stat_buf.st_mode))
03050         ereport(FATAL,
03051                 (errmsg("required WAL directory \"%s\" does not exist",
03052                         XLOGDIR)));
03053 
03054     /* Check for archive_status */
03055     snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
03056     if (stat(path, &stat_buf) == 0)
03057     {
03058         /* Check for weird cases where it exists but isn't a directory */
03059         if (!S_ISDIR(stat_buf.st_mode))
03060             ereport(FATAL,
03061                     (errmsg("required WAL directory \"%s\" does not exist",
03062                             path)));
03063     }
03064     else
03065     {
03066         ereport(LOG,
03067                 (errmsg("creating missing WAL directory \"%s\"", path)));
03068         if (mkdir(path, S_IRWXU) < 0)
03069             ereport(FATAL,
03070                     (errmsg("could not create missing directory \"%s\": %m",
03071                             path)));
03072     }
03073 }
03074 
03075 /*
03076  * Remove previous backup history files.  This also retries creation of
03077  * .ready files for any backup history files for which XLogArchiveNotify
03078  * failed earlier.
03079  */
03080 static void
03081 CleanupBackupHistory(void)
03082 {
03083     DIR        *xldir;
03084     struct dirent *xlde;
03085     char        path[MAXPGPATH];
03086 
03087     xldir = AllocateDir(XLOGDIR);
03088     if (xldir == NULL)
03089         ereport(ERROR,
03090                 (errcode_for_file_access(),
03091                  errmsg("could not open transaction log directory \"%s\": %m",
03092                         XLOGDIR)));
03093 
03094     while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
03095     {
03096         if (strlen(xlde->d_name) > 24 &&
03097             strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
03098             strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
03099                    ".backup") == 0)
03100         {
03101             if (XLogArchiveCheckDone(xlde->d_name))
03102             {
03103                 ereport(DEBUG2,
03104                 (errmsg("removing transaction log backup history file \"%s\"",
03105                         xlde->d_name)));
03106                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
03107                 unlink(path);
03108                 XLogArchiveCleanup(xlde->d_name);
03109             }
03110         }
03111     }
03112 
03113     FreeDir(xldir);
03114 }
03115 
03116 /*
03117  * Restore a full-page image from a backup block attached to an XLOG record.
03118  *
03119  * lsn: LSN of the XLOG record being replayed
03120  * record: the complete XLOG record
03121  * block_index: which backup block to restore (0 .. XLR_MAX_BKP_BLOCKS - 1)
03122  * get_cleanup_lock: TRUE to get a cleanup rather than plain exclusive lock
03123  * keep_buffer: TRUE to return the buffer still locked and pinned
03124  *
03125  * Returns the buffer number containing the page.  Note this is not terribly
03126  * useful unless keep_buffer is specified as TRUE.
03127  *
03128  * Note: when a backup block is available in XLOG, we restore it
03129  * unconditionally, even if the page in the database appears newer.
03130  * This is to protect ourselves against database pages that were partially
03131  * or incorrectly written during a crash.  We assume that the XLOG data
03132  * must be good because it has passed a CRC check, while the database
03133  * page might not be.  This will force us to replay all subsequent
03134  * modifications of the page that appear in XLOG, rather than possibly
03135  * ignoring them as already applied, but that's not a huge drawback.
03136  *
03137  * If 'get_cleanup_lock' is true, a cleanup lock is obtained on the buffer,
03138  * else a normal exclusive lock is used.  During crash recovery, that's just
03139  * pro forma because there can't be any regular backends in the system, but
03140  * in hot standby mode the distinction is important.
03141  *
03142  * If 'keep_buffer' is true, return without releasing the buffer lock and pin;
03143  * then caller is responsible for doing UnlockReleaseBuffer() later.  This
03144  * is needed in some cases when replaying XLOG records that touch multiple
03145  * pages, to prevent inconsistent states from being visible to other backends.
03146  * (Again, that's only important in hot standby mode.)
03147  */
03148 Buffer
03149 RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
03150                    bool get_cleanup_lock, bool keep_buffer)
03151 {
03152     BkpBlock    bkpb;
03153     char       *blk;
03154     int         i;
03155 
03156     /* Locate requested BkpBlock in the record */
03157     blk = (char *) XLogRecGetData(record) + record->xl_len;
03158     for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
03159     {
03160         if (!(record->xl_info & XLR_BKP_BLOCK(i)))
03161             continue;
03162 
03163         memcpy(&bkpb, blk, sizeof(BkpBlock));
03164         blk += sizeof(BkpBlock);
03165 
03166         if (i == block_index)
03167         {
03168             /* Found it, apply the update */
03169             return RestoreBackupBlockContents(lsn, bkpb, blk, get_cleanup_lock,
03170                                               keep_buffer);
03171         }
03172 
03173         blk += BLCKSZ - bkpb.hole_length;
03174     }
03175 
03176     /* Caller specified a bogus block_index */
03177     elog(ERROR, "failed to restore block_index %d", block_index);
03178     return InvalidBuffer;       /* keep compiler quiet */
03179 }
03180 
03181 /*
03182  * Workhorse for RestoreBackupBlock usable without an xlog record
03183  *
03184  * Restores a full-page image from BkpBlock and a data pointer.
03185  */
03186 static Buffer
03187 RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
03188                            bool get_cleanup_lock, bool keep_buffer)
03189 {
03190     Buffer      buffer;
03191     Page        page;
03192 
03193     buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
03194                                     RBM_ZERO);
03195     Assert(BufferIsValid(buffer));
03196     if (get_cleanup_lock)
03197         LockBufferForCleanup(buffer);
03198     else
03199         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
03200 
03201     page = (Page) BufferGetPage(buffer);
03202 
03203     if (bkpb.hole_length == 0)
03204     {
03205         memcpy((char *) page, blk, BLCKSZ);
03206     }
03207     else
03208     {
03209         memcpy((char *) page, blk, bkpb.hole_offset);
03210         /* must zero-fill the hole */
03211         MemSet((char *) page + bkpb.hole_offset, 0, bkpb.hole_length);
03212         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
03213                blk + bkpb.hole_offset,
03214                BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
03215     }
03216 
03217     /*
03218      * The checksum value on this page is currently invalid. We don't
03219      * need to reset it here since it will be set before being written.
03220      */
03221 
03222     PageSetLSN(page, lsn);
03223     MarkBufferDirty(buffer);
03224 
03225     if (!keep_buffer)
03226         UnlockReleaseBuffer(buffer);
03227 
03228     return buffer;
03229 }
03230 
03231 /*
03232  * Attempt to read an XLOG record.
03233  *
03234  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
03235  * try to read a record just after the last one previously read.
03236  *
03237  * If no valid record is available, returns NULL, or fails if emode is PANIC.
03238  * (emode must be either PANIC, LOG). In standby mode, retries until a valid
03239  * record is available.
03240  *
03241  * The record is copied into readRecordBuf, so that on successful return,
03242  * the returned record pointer always points there.
03243  */
03244 static XLogRecord *
03245 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
03246            bool fetching_ckpt)
03247 {
03248     XLogRecord *record;
03249     XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
03250 
03251     /* Pass through parameters to XLogPageRead */
03252     private->fetching_ckpt = fetching_ckpt;
03253     private->emode = emode;
03254     private->randAccess = (RecPtr != InvalidXLogRecPtr);
03255 
03256     /* This is the first attempt to read this page. */
03257     lastSourceFailed = false;
03258 
03259     for (;;)
03260     {
03261         char   *errormsg;
03262 
03263         record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
03264         ReadRecPtr = xlogreader->ReadRecPtr;
03265         EndRecPtr = xlogreader->EndRecPtr;
03266         if (record == NULL)
03267         {
03268             if (readFile >= 0)
03269             {
03270                 close(readFile);
03271                 readFile = -1;
03272             }
03273 
03274             /*
03275              * We only end up here without a message when XLogPageRead() failed
03276              * - in that case we already logged something.
03277              * In StandbyMode that only happens if we have been triggered, so
03278              * we shouldn't loop anymore in that case.
03279              */
03280             if (errormsg)
03281                 ereport(emode_for_corrupt_record(emode,
03282                                                  RecPtr ? RecPtr : EndRecPtr),
03283                         (errmsg_internal("%s", errormsg) /* already translated */));
03284         }
03285         /*
03286          * Check page TLI is one of the expected values.
03287          */
03288         else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
03289         {
03290             char        fname[MAXFNAMELEN];
03291             XLogSegNo segno;
03292             int32 offset;
03293 
03294             XLByteToSeg(xlogreader->latestPagePtr, segno);
03295             offset = xlogreader->latestPagePtr % XLogSegSize;
03296             XLogFileName(fname, xlogreader->readPageTLI, segno);
03297             ereport(emode_for_corrupt_record(emode,
03298                                              RecPtr ? RecPtr : EndRecPtr),
03299                     (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
03300                             xlogreader->latestPageTLI,
03301                             fname,
03302                             offset)));
03303             record = NULL;
03304         }
03305 
03306         if (record)
03307         {
03308             /* Great, got a record */
03309             return record;
03310         }
03311         else
03312         {
03313             /* No valid record available from this source */
03314             lastSourceFailed = true;
03315 
03316             /*
03317              * If archive recovery was requested, but we were still doing crash
03318              * recovery, switch to archive recovery and retry using the offline
03319              * archive. We have now replayed all the valid WAL in pg_xlog, so
03320              * we are presumably now consistent.
03321              *
03322              * We require that there's at least some valid WAL present in
03323              * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
03324              * from the archive, even if pg_xlog is completely empty, but we'd
03325              * have no idea how far we'd have to replay to reach consistency.
03326              * So err on the safe side and give up.
03327              */
03328             if (!InArchiveRecovery && ArchiveRecoveryRequested &&
03329                 !fetching_ckpt)
03330             {
03331                 ereport(DEBUG1,
03332                         (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
03333                 InArchiveRecovery = true;
03334                 if (StandbyModeRequested)
03335                     StandbyMode = true;
03336 
03337                 /* initialize minRecoveryPoint to this record */
03338                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
03339                 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
03340                 if (ControlFile->minRecoveryPoint < EndRecPtr)
03341                 {
03342                     ControlFile->minRecoveryPoint = EndRecPtr;
03343                     ControlFile->minRecoveryPointTLI = ThisTimeLineID;
03344                 }
03345                 /* update local copy */
03346                 minRecoveryPoint = ControlFile->minRecoveryPoint;
03347                 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
03348 
03349                 UpdateControlFile();
03350                 LWLockRelease(ControlFileLock);
03351 
03352                 CheckRecoveryConsistency();
03353 
03354                 /*
03355                  * Before we retry, reset lastSourceFailed and currentSource
03356                  * so that we will check the archive next.
03357                  */
03358                 lastSourceFailed = false;
03359                 currentSource = 0;
03360 
03361                 continue;
03362             }
03363 
03364             /* In standby mode, loop back to retry. Otherwise, give up. */
03365             if (StandbyMode && !CheckForStandbyTrigger())
03366                 continue;
03367             else
03368                 return NULL;
03369         }
03370     }
03371 }
03372 
03373 /*
03374  * Scan for new timelines that might have appeared in the archive since we
03375  * started recovery.
03376  *
03377  * If there are any, the function changes recovery target TLI to the latest
03378  * one and returns 'true'.
03379  */
03380 static bool
03381 rescanLatestTimeLine(void)
03382 {
03383     List       *newExpectedTLEs;
03384     bool        found;
03385     ListCell   *cell;
03386     TimeLineID  newtarget;
03387     TimeLineID  oldtarget = recoveryTargetTLI;
03388     TimeLineHistoryEntry *currentTle = NULL;
03389 
03390     newtarget = findNewestTimeLine(recoveryTargetTLI);
03391     if (newtarget == recoveryTargetTLI)
03392     {
03393         /* No new timelines found */
03394         return false;
03395     }
03396 
03397     /*
03398      * Determine the list of expected TLIs for the new TLI
03399      */
03400 
03401     newExpectedTLEs = readTimeLineHistory(newtarget);
03402 
03403     /*
03404      * If the current timeline is not part of the history of the new
03405      * timeline, we cannot proceed to it.
03406      */
03407     found = false;
03408     foreach (cell, newExpectedTLEs)
03409     {
03410         currentTle = (TimeLineHistoryEntry *) lfirst(cell);
03411 
03412         if (currentTle->tli == recoveryTargetTLI)
03413         {
03414             found = true;
03415             break;
03416         }
03417     }
03418     if (!found)
03419     {
03420         ereport(LOG,
03421                 (errmsg("new timeline %u is not a child of database system timeline %u",
03422                         newtarget,
03423                         ThisTimeLineID)));
03424         return false;
03425     }
03426 
03427     /*
03428      * The current timeline was found in the history file, but check that the
03429      * next timeline was forked off from it *after* the current recovery
03430      * location.
03431      */
03432     if (currentTle->end < EndRecPtr)
03433     {
03434         ereport(LOG,
03435                 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
03436                         newtarget,
03437                         ThisTimeLineID,
03438                         (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
03439         return false;
03440     }
03441 
03442     /* The new timeline history seems valid. Switch target */
03443     recoveryTargetTLI = newtarget;
03444     list_free_deep(expectedTLEs);
03445     expectedTLEs = newExpectedTLEs;
03446 
03447     /*
03448      * As in StartupXLOG(), try to ensure we have all the history files
03449      * between the old target and new target in pg_xlog.
03450      */
03451     restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
03452 
03453     ereport(LOG,
03454             (errmsg("new target timeline is %u",
03455                     recoveryTargetTLI)));
03456 
03457     return true;
03458 }
03459 
03460 /*
03461  * I/O routines for pg_control
03462  *
03463  * *ControlFile is a buffer in shared memory that holds an image of the
03464  * contents of pg_control.  WriteControlFile() initializes pg_control
03465  * given a preloaded buffer, ReadControlFile() loads the buffer from
03466  * the pg_control file (during postmaster or standalone-backend startup),
03467  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
03468  *
03469  * For simplicity, WriteControlFile() initializes the fields of pg_control
03470  * that are related to checking backend/database compatibility, and
03471  * ReadControlFile() verifies they are correct.  We could split out the
03472  * I/O and compatibility-check functions, but there seems no need currently.
03473  */
03474 static void
03475 WriteControlFile(void)
03476 {
03477     int         fd;
03478     char        buffer[PG_CONTROL_SIZE];        /* need not be aligned */
03479 
03480     /*
03481      * Initialize version and compatibility-check fields
03482      */
03483     ControlFile->pg_control_version = PG_CONTROL_VERSION;
03484     ControlFile->catalog_version_no = CATALOG_VERSION_NO;
03485 
03486     ControlFile->maxAlign = MAXIMUM_ALIGNOF;
03487     ControlFile->floatFormat = FLOATFORMAT_VALUE;
03488 
03489     ControlFile->blcksz = BLCKSZ;
03490     ControlFile->relseg_size = RELSEG_SIZE;
03491     ControlFile->xlog_blcksz = XLOG_BLCKSZ;
03492     ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
03493 
03494     ControlFile->nameDataLen = NAMEDATALEN;
03495     ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
03496 
03497     ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
03498 
03499 #ifdef HAVE_INT64_TIMESTAMP
03500     ControlFile->enableIntTimes = true;
03501 #else
03502     ControlFile->enableIntTimes = false;
03503 #endif
03504     ControlFile->float4ByVal = FLOAT4PASSBYVAL;
03505     ControlFile->float8ByVal = FLOAT8PASSBYVAL;
03506 
03507     /* Contents are protected with a CRC */
03508     INIT_CRC32(ControlFile->crc);
03509     COMP_CRC32(ControlFile->crc,
03510                (char *) ControlFile,
03511                offsetof(ControlFileData, crc));
03512     FIN_CRC32(ControlFile->crc);
03513 
03514     /*
03515      * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
03516      * excess over sizeof(ControlFileData).  This reduces the odds of
03517      * premature-EOF errors when reading pg_control.  We'll still fail when we
03518      * check the contents of the file, but hopefully with a more specific
03519      * error than "couldn't read pg_control".
03520      */
03521     if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
03522         elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
03523 
03524     memset(buffer, 0, PG_CONTROL_SIZE);
03525     memcpy(buffer, ControlFile, sizeof(ControlFileData));
03526 
03527     fd = BasicOpenFile(XLOG_CONTROL_FILE,
03528                        O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
03529                        S_IRUSR | S_IWUSR);
03530     if (fd < 0)
03531         ereport(PANIC,
03532                 (errcode_for_file_access(),
03533                  errmsg("could not create control file \"%s\": %m",
03534                         XLOG_CONTROL_FILE)));
03535 
03536     errno = 0;
03537     if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
03538     {
03539         /* if write didn't set errno, assume problem is no disk space */
03540         if (errno == 0)
03541             errno = ENOSPC;
03542         ereport(PANIC,
03543                 (errcode_for_file_access(),
03544                  errmsg("could not write to control file: %m")));
03545     }
03546 
03547     if (pg_fsync(fd) != 0)
03548         ereport(PANIC,
03549                 (errcode_for_file_access(),
03550                  errmsg("could not fsync control file: %m")));
03551 
03552     if (close(fd))
03553         ereport(PANIC,
03554                 (errcode_for_file_access(),
03555                  errmsg("could not close control file: %m")));
03556 }
03557 
03558 static void
03559 ReadControlFile(void)
03560 {
03561     pg_crc32    crc;
03562     int         fd;
03563 
03564     /*
03565      * Read data...
03566      */
03567     fd = BasicOpenFile(XLOG_CONTROL_FILE,
03568                        O_RDWR | PG_BINARY,
03569                        S_IRUSR | S_IWUSR);
03570     if (fd < 0)
03571         ereport(PANIC,
03572                 (errcode_for_file_access(),
03573                  errmsg("could not open control file \"%s\": %m",
03574                         XLOG_CONTROL_FILE)));
03575 
03576     if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
03577         ereport(PANIC,
03578                 (errcode_for_file_access(),
03579                  errmsg("could not read from control file: %m")));
03580 
03581     close(fd);
03582 
03583     /*
03584      * Check for expected pg_control format version.  If this is wrong, the
03585      * CRC check will likely fail because we'll be checking the wrong number
03586      * of bytes.  Complaining about wrong version will probably be more
03587      * enlightening than complaining about wrong CRC.
03588      */
03589 
03590     if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
03591         ereport(FATAL,
03592                 (errmsg("database files are incompatible with server"),
03593                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
03594          " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
03595             ControlFile->pg_control_version, ControlFile->pg_control_version,
03596                            PG_CONTROL_VERSION, PG_CONTROL_VERSION),
03597                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
03598 
03599     if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
03600         ereport(FATAL,
03601                 (errmsg("database files are incompatible with server"),
03602                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
03603                   " but the server was compiled with PG_CONTROL_VERSION %d.",
03604                         ControlFile->pg_control_version, PG_CONTROL_VERSION),
03605                  errhint("It looks like you need to initdb.")));
03606 
03607     /* Now check the CRC. */
03608     INIT_CRC32(crc);
03609     COMP_CRC32(crc,
03610                (char *) ControlFile,
03611                offsetof(ControlFileData, crc));
03612     FIN_CRC32(crc);
03613 
03614     if (!EQ_CRC32(crc, ControlFile->crc))
03615         ereport(FATAL,
03616                 (errmsg("incorrect checksum in control file")));
03617 
03618     /*
03619      * Do compatibility checking immediately.  If the database isn't
03620      * compatible with the backend executable, we want to abort before we can
03621      * possibly do any damage.
03622      */
03623     if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
03624         ereport(FATAL,
03625                 (errmsg("database files are incompatible with server"),
03626                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
03627                   " but the server was compiled with CATALOG_VERSION_NO %d.",
03628                         ControlFile->catalog_version_no, CATALOG_VERSION_NO),
03629                  errhint("It looks like you need to initdb.")));
03630     if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
03631         ereport(FATAL,
03632                 (errmsg("database files are incompatible with server"),
03633            errdetail("The database cluster was initialized with MAXALIGN %d,"
03634                      " but the server was compiled with MAXALIGN %d.",
03635                      ControlFile->maxAlign, MAXIMUM_ALIGNOF),
03636                  errhint("It looks like you need to initdb.")));
03637     if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
03638         ereport(FATAL,
03639                 (errmsg("database files are incompatible with server"),
03640                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
03641                  errhint("It looks like you need to initdb.")));
03642     if (ControlFile->blcksz != BLCKSZ)
03643         ereport(FATAL,
03644                 (errmsg("database files are incompatible with server"),
03645              errdetail("The database cluster was initialized with BLCKSZ %d,"
03646                        " but the server was compiled with BLCKSZ %d.",
03647                        ControlFile->blcksz, BLCKSZ),
03648                  errhint("It looks like you need to recompile or initdb.")));
03649     if (ControlFile->relseg_size != RELSEG_SIZE)
03650         ereport(FATAL,
03651                 (errmsg("database files are incompatible with server"),
03652         errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
03653                   " but the server was compiled with RELSEG_SIZE %d.",
03654                   ControlFile->relseg_size, RELSEG_SIZE),
03655                  errhint("It looks like you need to recompile or initdb.")));
03656     if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
03657         ereport(FATAL,
03658                 (errmsg("database files are incompatible with server"),
03659         errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
03660                   " but the server was compiled with XLOG_BLCKSZ %d.",
03661                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
03662                  errhint("It looks like you need to recompile or initdb.")));
03663     if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
03664         ereport(FATAL,
03665                 (errmsg("database files are incompatible with server"),
03666                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
03667                        " but the server was compiled with XLOG_SEG_SIZE %d.",
03668                            ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
03669                  errhint("It looks like you need to recompile or initdb.")));
03670     if (ControlFile->nameDataLen != NAMEDATALEN)
03671         ereport(FATAL,
03672                 (errmsg("database files are incompatible with server"),
03673         errdetail("The database cluster was initialized with NAMEDATALEN %d,"
03674                   " but the server was compiled with NAMEDATALEN %d.",
03675                   ControlFile->nameDataLen, NAMEDATALEN),
03676                  errhint("It looks like you need to recompile or initdb.")));
03677     if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
03678         ereport(FATAL,
03679                 (errmsg("database files are incompatible with server"),
03680                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
03681                       " but the server was compiled with INDEX_MAX_KEYS %d.",
03682                            ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
03683                  errhint("It looks like you need to recompile or initdb.")));
03684     if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
03685         ereport(FATAL,
03686                 (errmsg("database files are incompatible with server"),
03687                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
03688                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
03689               ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
03690                  errhint("It looks like you need to recompile or initdb.")));
03691 
03692 #ifdef HAVE_INT64_TIMESTAMP
03693     if (ControlFile->enableIntTimes != true)
03694         ereport(FATAL,
03695                 (errmsg("database files are incompatible with server"),
03696                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
03697                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
03698                  errhint("It looks like you need to recompile or initdb.")));
03699 #else
03700     if (ControlFile->enableIntTimes != false)
03701         ereport(FATAL,
03702                 (errmsg("database files are incompatible with server"),
03703                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
03704                " but the server was compiled without HAVE_INT64_TIMESTAMP."),
03705                  errhint("It looks like you need to recompile or initdb.")));
03706 #endif
03707 
03708 #ifdef USE_FLOAT4_BYVAL
03709     if (ControlFile->float4ByVal != true)
03710         ereport(FATAL,
03711                 (errmsg("database files are incompatible with server"),
03712                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
03713                       " but the server was compiled with USE_FLOAT4_BYVAL."),
03714                  errhint("It looks like you need to recompile or initdb.")));
03715 #else
03716     if (ControlFile->float4ByVal != false)
03717         ereport(FATAL,
03718                 (errmsg("database files are incompatible with server"),
03719         errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
03720                   " but the server was compiled without USE_FLOAT4_BYVAL."),
03721                  errhint("It looks like you need to recompile or initdb.")));
03722 #endif
03723 
03724 #ifdef USE_FLOAT8_BYVAL
03725     if (ControlFile->float8ByVal != true)
03726         ereport(FATAL,
03727                 (errmsg("database files are incompatible with server"),
03728                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
03729                       " but the server was compiled with USE_FLOAT8_BYVAL."),
03730                  errhint("It looks like you need to recompile or initdb.")));
03731 #else
03732     if (ControlFile->float8ByVal != false)
03733         ereport(FATAL,
03734                 (errmsg("database files are incompatible with server"),
03735         errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
03736                   " but the server was compiled without USE_FLOAT8_BYVAL."),
03737                  errhint("It looks like you need to recompile or initdb.")));
03738 #endif
03739 }
03740 
03741 void
03742 UpdateControlFile(void)
03743 {
03744     int         fd;
03745 
03746     INIT_CRC32(ControlFile->crc);
03747     COMP_CRC32(ControlFile->crc,
03748                (char *) ControlFile,
03749                offsetof(ControlFileData, crc));
03750     FIN_CRC32(ControlFile->crc);
03751 
03752     fd = BasicOpenFile(XLOG_CONTROL_FILE,
03753                        O_RDWR | PG_BINARY,
03754                        S_IRUSR | S_IWUSR);
03755     if (fd < 0)
03756         ereport(PANIC,
03757                 (errcode_for_file_access(),
03758                  errmsg("could not open control file \"%s\": %m",
03759                         XLOG_CONTROL_FILE)));
03760 
03761     errno = 0;
03762     if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
03763     {
03764         /* if write didn't set errno, assume problem is no disk space */
03765         if (errno == 0)
03766             errno = ENOSPC;
03767         ereport(PANIC,
03768                 (errcode_for_file_access(),
03769                  errmsg("could not write to control file: %m")));
03770     }
03771 
03772     if (pg_fsync(fd) != 0)
03773         ereport(PANIC,
03774                 (errcode_for_file_access(),
03775                  errmsg("could not fsync control file: %m")));
03776 
03777     if (close(fd))
03778         ereport(PANIC,
03779                 (errcode_for_file_access(),
03780                  errmsg("could not close control file: %m")));
03781 }
03782 
03783 /*
03784  * Returns the unique system identifier from control file.
03785  */
03786 uint64
03787 GetSystemIdentifier(void)
03788 {
03789     Assert(ControlFile != NULL);
03790     return ControlFile->system_identifier;
03791 }
03792 
03793 /*
03794  * Are checksums enabled for data pages?
03795  */
03796 bool
03797 DataChecksumsEnabled(void)
03798 {
03799     Assert(ControlFile != NULL);
03800     return (ControlFile->data_checksum_version > 0);
03801 }
03802 
03803 /*
03804  * Returns a fake LSN for unlogged relations.
03805  *
03806  * Each call generates an LSN that is greater than any previous value
03807  * returned. The current counter value is saved and restored across clean
03808  * shutdowns, but like unlogged relations, does not survive a crash. This can
03809  * be used in lieu of real LSN values returned by XLogInsert, if you need an
03810  * LSN-like increasing sequence of numbers without writing any WAL.
03811  */
03812 XLogRecPtr
03813 GetFakeLSNForUnloggedRel(void)
03814 {
03815     XLogRecPtr nextUnloggedLSN;
03816 
03817     /* use volatile pointer to prevent code rearrangement */
03818     volatile XLogCtlData *xlogctl = XLogCtl;
03819 
03820     /* increment the unloggedLSN counter, need SpinLock */
03821     SpinLockAcquire(&xlogctl->ulsn_lck);
03822     nextUnloggedLSN = xlogctl->unloggedLSN++;
03823     SpinLockRelease(&xlogctl->ulsn_lck);
03824 
03825     return nextUnloggedLSN;
03826 }
03827 
03828 /*
03829  * Auto-tune the number of XLOG buffers.
03830  *
03831  * The preferred setting for wal_buffers is about 3% of shared_buffers, with
03832  * a maximum of one XLOG segment (there is little reason to think that more
03833  * is helpful, at least so long as we force an fsync when switching log files)
03834  * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
03835  * 9.1, when auto-tuning was added).
03836  *
03837  * This should not be called until NBuffers has received its final value.
03838  */
03839 static int
03840 XLOGChooseNumBuffers(void)
03841 {
03842     int         xbuffers;
03843 
03844     xbuffers = NBuffers / 32;
03845     if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
03846         xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
03847     if (xbuffers < 8)
03848         xbuffers = 8;
03849     return xbuffers;
03850 }
03851 
03852 /*
03853  * GUC check_hook for wal_buffers
03854  */
03855 bool
03856 check_wal_buffers(int *newval, void **extra, GucSource source)
03857 {
03858     /*
03859      * -1 indicates a request for auto-tune.
03860      */
03861     if (*newval == -1)
03862     {
03863         /*
03864          * If we haven't yet changed the boot_val default of -1, just let it
03865          * be.  We'll fix it when XLOGShmemSize is called.
03866          */
03867         if (XLOGbuffers == -1)
03868             return true;
03869 
03870         /* Otherwise, substitute the auto-tune value */
03871         *newval = XLOGChooseNumBuffers();
03872     }
03873 
03874     /*
03875      * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
03876      * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
03877      * the case, we just silently treat such values as a request for the
03878      * minimum.  (We could throw an error instead, but that doesn't seem very
03879      * helpful.)
03880      */
03881     if (*newval < 4)
03882         *newval = 4;
03883 
03884     return true;
03885 }
03886 
03887 /*
03888  * Initialization of shared memory for XLOG
03889  */
03890 Size
03891 XLOGShmemSize(void)
03892 {
03893     Size        size;
03894 
03895     /*
03896      * If the value of wal_buffers is -1, use the preferred auto-tune value.
03897      * This isn't an amazingly clean place to do this, but we must wait till
03898      * NBuffers has received its final value, and must do it before using the
03899      * value of XLOGbuffers to do anything important.
03900      */
03901     if (XLOGbuffers == -1)
03902     {
03903         char        buf[32];
03904 
03905         snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
03906         SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
03907     }
03908     Assert(XLOGbuffers > 0);
03909 
03910     /* XLogCtl */
03911     size = sizeof(XLogCtlData);
03912     /* xlblocks array */
03913     size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
03914     /* extra alignment padding for XLOG I/O buffers */
03915     size = add_size(size, ALIGNOF_XLOG_BUFFER);
03916     /* and the buffers themselves */
03917     size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
03918 
03919     /*
03920      * Note: we don't count ControlFileData, it comes out of the "slop factor"
03921      * added by CreateSharedMemoryAndSemaphores.  This lets us use this
03922      * routine again below to compute the actual allocation size.
03923      */
03924 
03925     return size;
03926 }
03927 
03928 void
03929 XLOGShmemInit(void)
03930 {
03931     bool        foundCFile,
03932                 foundXLog;
03933     char       *allocptr;
03934 
03935     ControlFile = (ControlFileData *)
03936         ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
03937     XLogCtl = (XLogCtlData *)
03938         ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
03939 
03940     if (foundCFile || foundXLog)
03941     {
03942         /* both should be present or neither */
03943         Assert(foundCFile && foundXLog);
03944         return;
03945     }
03946 
03947     memset(XLogCtl, 0, sizeof(XLogCtlData));
03948 
03949     /*
03950      * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
03951      * multiple of the alignment for same, so no extra alignment padding is
03952      * needed here.
03953      */
03954     allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
03955     XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
03956     memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
03957     allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
03958 
03959     /*
03960      * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
03961      */
03962     allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
03963     XLogCtl->pages = allocptr;
03964     memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
03965 
03966     /*
03967      * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
03968      * in additional info.)
03969      */
03970     XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
03971     XLogCtl->SharedRecoveryInProgress = true;
03972     XLogCtl->SharedHotStandbyActive = false;
03973     XLogCtl->WalWriterSleeping = false;
03974     XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
03975     SpinLockInit(&XLogCtl->info_lck);
03976     SpinLockInit(&XLogCtl->ulsn_lck);
03977     InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
03978 
03979     /*
03980      * If we are not in bootstrap mode, pg_control should already exist. Read
03981      * and validate it immediately (see comments in ReadControlFile() for the
03982      * reasons why).
03983      */
03984     if (!IsBootstrapProcessingMode())
03985         ReadControlFile();
03986 }
03987 
03988 /*
03989  * This func must be called ONCE on system install.  It creates pg_control
03990  * and the initial XLOG segment.
03991  */
03992 void
03993 BootStrapXLOG(void)
03994 {
03995     CheckPoint  checkPoint;
03996     char       *buffer;
03997     XLogPageHeader page;
03998     XLogLongPageHeader longpage;
03999     XLogRecord *record;
04000     bool        use_existent;
04001     uint64      sysidentifier;
04002     struct timeval tv;
04003     pg_crc32    crc;
04004 
04005     /*
04006      * Select a hopefully-unique system identifier code for this installation.
04007      * We use the result of gettimeofday(), including the fractional seconds
04008      * field, as being about as unique as we can easily get.  (Think not to
04009      * use random(), since it hasn't been seeded and there's no portable way
04010      * to seed it other than the system clock value...)  The upper half of the
04011      * uint64 value is just the tv_sec part, while the lower half is the XOR
04012      * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
04013      * unnecessarily if "uint64" is really only 32 bits wide.  A person
04014      * knowing this encoding can determine the initialization time of the
04015      * installation, which could perhaps be useful sometimes.
04016      */
04017     gettimeofday(&tv, NULL);
04018     sysidentifier = ((uint64) tv.tv_sec) << 32;
04019     sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
04020 
04021     /* First timeline ID is always 1 */
04022     ThisTimeLineID = 1;
04023 
04024     /* page buffer must be aligned suitably for O_DIRECT */
04025     buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
04026     page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
04027     memset(page, 0, XLOG_BLCKSZ);
04028 
04029     /*
04030      * Set up information for the initial checkpoint record
04031      *
04032      * The initial checkpoint record is written to the beginning of the WAL
04033      * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
04034      * used, so that we can use 0/0 to mean "before any valid WAL segment".
04035      */
04036     checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
04037     checkPoint.ThisTimeLineID = ThisTimeLineID;
04038     checkPoint.PrevTimeLineID = ThisTimeLineID;
04039     checkPoint.fullPageWrites = fullPageWrites;
04040     checkPoint.nextXidEpoch = 0;
04041     checkPoint.nextXid = FirstNormalTransactionId;
04042     checkPoint.nextOid = FirstBootstrapObjectId;
04043     checkPoint.nextMulti = FirstMultiXactId;
04044     checkPoint.nextMultiOffset = 0;
04045     checkPoint.oldestXid = FirstNormalTransactionId;
04046     checkPoint.oldestXidDB = TemplateDbOid;
04047     checkPoint.oldestMulti = FirstMultiXactId;
04048     checkPoint.oldestMultiDB = TemplateDbOid;
04049     checkPoint.time = (pg_time_t) time(NULL);
04050     checkPoint.oldestActiveXid = InvalidTransactionId;
04051 
04052     ShmemVariableCache->nextXid = checkPoint.nextXid;
04053     ShmemVariableCache->nextOid = checkPoint.nextOid;
04054     ShmemVariableCache->oidCount = 0;
04055     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
04056     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
04057     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
04058 
04059     /* Set up the XLOG page header */
04060     page->xlp_magic = XLOG_PAGE_MAGIC;
04061     page->xlp_info = XLP_LONG_HEADER;
04062     page->xlp_tli = ThisTimeLineID;
04063     page->xlp_pageaddr = XLogSegSize;
04064     longpage = (XLogLongPageHeader) page;
04065     longpage->xlp_sysid = sysidentifier;
04066     longpage->xlp_seg_size = XLogSegSize;
04067     longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
04068 
04069     /* Insert the initial checkpoint record */
04070     record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
04071     record->xl_prev = 0;
04072     record->xl_xid = InvalidTransactionId;
04073     record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
04074     record->xl_len = sizeof(checkPoint);
04075     record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
04076     record->xl_rmid = RM_XLOG_ID;
04077     memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
04078 
04079     INIT_CRC32(crc);
04080     COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
04081     COMP_CRC32(crc, (char *) record, offsetof(XLogRecord, xl_crc));
04082     FIN_CRC32(crc);
04083     record->xl_crc = crc;
04084 
04085     /* Create first XLOG segment file */
04086     use_existent = false;
04087     openLogFile = XLogFileInit(1, &use_existent, false);
04088 
04089     /* Write the first page with the initial record */
04090     errno = 0;
04091     if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
04092     {
04093         /* if write didn't set errno, assume problem is no disk space */
04094         if (errno == 0)
04095             errno = ENOSPC;
04096         ereport(PANIC,
04097                 (errcode_for_file_access(),
04098               errmsg("could not write bootstrap transaction log file: %m")));
04099     }
04100 
04101     if (pg_fsync(openLogFile) != 0)
04102         ereport(PANIC,
04103                 (errcode_for_file_access(),
04104               errmsg("could not fsync bootstrap transaction log file: %m")));
04105 
04106     if (close(openLogFile))
04107         ereport(PANIC,
04108                 (errcode_for_file_access(),
04109               errmsg("could not close bootstrap transaction log file: %m")));
04110 
04111     openLogFile = -1;
04112 
04113     /* Now create pg_control */
04114 
04115     memset(ControlFile, 0, sizeof(ControlFileData));
04116     /* Initialize pg_control status fields */
04117     ControlFile->system_identifier = sysidentifier;
04118     ControlFile->state = DB_SHUTDOWNED;
04119     ControlFile->time = checkPoint.time;
04120     ControlFile->checkPoint = checkPoint.redo;
04121     ControlFile->checkPointCopy = checkPoint;
04122     ControlFile->unloggedLSN = 1;
04123 
04124     /* Set important parameter values for use when replaying WAL */
04125     ControlFile->MaxConnections = MaxConnections;
04126     ControlFile->max_prepared_xacts = max_prepared_xacts;
04127     ControlFile->max_locks_per_xact = max_locks_per_xact;
04128     ControlFile->wal_level = wal_level;
04129     ControlFile->data_checksum_version = bootstrap_data_checksum_version;
04130 
04131     /* some additional ControlFile fields are set in WriteControlFile() */
04132 
04133     WriteControlFile();
04134 
04135     /* Bootstrap the commit log, too */
04136     BootStrapCLOG();
04137     BootStrapSUBTRANS();
04138     BootStrapMultiXact();
04139 
04140     pfree(buffer);
04141 }
04142 
04143 static char *
04144 str_time(pg_time_t tnow)
04145 {
04146     static char buf[128];
04147 
04148     pg_strftime(buf, sizeof(buf),
04149                 "%Y-%m-%d %H:%M:%S %Z",
04150                 pg_localtime(&tnow, log_timezone));
04151 
04152     return buf;
04153 }
04154 
04155 /*
04156  * See if there is a recovery command file (recovery.conf), and if so
04157  * read in parameters for archive recovery and XLOG streaming.
04158  *
04159  * The file is parsed using the main configuration parser.
04160  */
04161 static void
04162 readRecoveryCommandFile(void)
04163 {
04164     FILE       *fd;
04165     TimeLineID  rtli = 0;
04166     bool        rtliGiven = false;
04167     ConfigVariable *item,
04168                *head = NULL,
04169                *tail = NULL;
04170 
04171     fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
04172     if (fd == NULL)
04173     {
04174         if (errno == ENOENT)
04175             return;             /* not there, so no archive recovery */
04176         ereport(FATAL,
04177                 (errcode_for_file_access(),
04178                  errmsg("could not open recovery command file \"%s\": %m",
04179                         RECOVERY_COMMAND_FILE)));
04180     }
04181 
04182     /*
04183      * Since we're asking ParseConfigFp() to report errors as FATAL, there's
04184      * no need to check the return value.
04185      */
04186     (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
04187 
04188     FreeFile(fd);
04189 
04190     for (item = head; item; item = item->next)
04191     {
04192         if (strcmp(item->name, "restore_command") == 0)
04193         {
04194             recoveryRestoreCommand = pstrdup(item->value);
04195             ereport(DEBUG2,
04196                     (errmsg_internal("restore_command = '%s'",
04197                                      recoveryRestoreCommand)));
04198         }
04199         else if (strcmp(item->name, "recovery_end_command") == 0)
04200         {
04201             recoveryEndCommand = pstrdup(item->value);
04202             ereport(DEBUG2,
04203                     (errmsg_internal("recovery_end_command = '%s'",
04204                                      recoveryEndCommand)));
04205         }
04206         else if (strcmp(item->name, "archive_cleanup_command") == 0)
04207         {
04208             archiveCleanupCommand = pstrdup(item->value);
04209             ereport(DEBUG2,
04210                     (errmsg_internal("archive_cleanup_command = '%s'",
04211                                      archiveCleanupCommand)));
04212         }
04213         else if (strcmp(item->name, "pause_at_recovery_target") == 0)
04214         {
04215             if (!parse_bool(item->value, &recoveryPauseAtTarget))
04216                 ereport(ERROR,
04217                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
04218                          errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
04219             ereport(DEBUG2,
04220                     (errmsg_internal("pause_at_recovery_target = '%s'",
04221                                      item->value)));
04222         }
04223         else if (strcmp(item->name, "recovery_target_timeline") == 0)
04224         {
04225             rtliGiven = true;
04226             if (strcmp(item->value, "latest") == 0)
04227                 rtli = 0;
04228             else
04229             {
04230                 errno = 0;
04231                 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
04232                 if (errno == EINVAL || errno == ERANGE)
04233                     ereport(FATAL,
04234                             (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
04235                                     item->value)));
04236             }
04237             if (rtli)
04238                 ereport(DEBUG2,
04239                    (errmsg_internal("recovery_target_timeline = %u", rtli)));
04240             else
04241                 ereport(DEBUG2,
04242                      (errmsg_internal("recovery_target_timeline = latest")));
04243         }
04244         else if (strcmp(item->name, "recovery_target_xid") == 0)
04245         {
04246             errno = 0;
04247             recoveryTargetXid = (TransactionId) strtoul(item->value, NULL, 0);
04248             if (errno == EINVAL || errno == ERANGE)
04249                 ereport(FATAL,
04250                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
04251                          item->value)));
04252             ereport(DEBUG2,
04253                     (errmsg_internal("recovery_target_xid = %u",
04254                                      recoveryTargetXid)));
04255             recoveryTarget = RECOVERY_TARGET_XID;
04256         }
04257         else if (strcmp(item->name, "recovery_target_time") == 0)
04258         {
04259             /*
04260              * if recovery_target_xid or recovery_target_name specified, then
04261              * this overrides recovery_target_time
04262              */
04263             if (recoveryTarget == RECOVERY_TARGET_XID ||
04264                 recoveryTarget == RECOVERY_TARGET_NAME)
04265                 continue;
04266             recoveryTarget = RECOVERY_TARGET_TIME;
04267 
04268             /*
04269              * Convert the time string given by the user to TimestampTz form.
04270              */
04271             recoveryTargetTime =
04272                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
04273                                                 CStringGetDatum(item->value),
04274                                                 ObjectIdGetDatum(InvalidOid),
04275                                                         Int32GetDatum(-1)));
04276             ereport(DEBUG2,
04277                     (errmsg_internal("recovery_target_time = '%s'",
04278                                    timestamptz_to_str(recoveryTargetTime))));
04279         }
04280         else if (strcmp(item->name, "recovery_target_name") == 0)
04281         {
04282             /*
04283              * if recovery_target_xid specified, then this overrides
04284              * recovery_target_name
04285              */
04286             if (recoveryTarget == RECOVERY_TARGET_XID)
04287                 continue;
04288             recoveryTarget = RECOVERY_TARGET_NAME;
04289 
04290             recoveryTargetName = pstrdup(item->value);
04291             if (strlen(recoveryTargetName) >= MAXFNAMELEN)
04292                 ereport(FATAL,
04293                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
04294                          errmsg("recovery_target_name is too long (maximum %d characters)",
04295                                 MAXFNAMELEN - 1)));
04296 
04297             ereport(DEBUG2,
04298                     (errmsg_internal("recovery_target_name = '%s'",
04299                                      recoveryTargetName)));
04300         }
04301         else if (strcmp(item->name, "recovery_target_inclusive") == 0)
04302         {
04303             /*
04304              * does nothing if a recovery_target is not also set
04305              */
04306             if (!parse_bool(item->value, &recoveryTargetInclusive))
04307                 ereport(ERROR,
04308                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
04309                          errmsg("parameter \"%s\" requires a Boolean value",
04310                                 "recovery_target_inclusive")));
04311             ereport(DEBUG2,
04312                     (errmsg_internal("recovery_target_inclusive = %s",
04313                                      item->value)));
04314         }
04315         else if (strcmp(item->name, "standby_mode") == 0)
04316         {
04317             if (!parse_bool(item->value, &StandbyModeRequested))
04318                 ereport(ERROR,
04319                         (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
04320                          errmsg("parameter \"%s\" requires a Boolean value",
04321                                 "standby_mode")));
04322             ereport(DEBUG2,
04323                     (errmsg_internal("standby_mode = '%s'", item->value)));
04324         }
04325         else if (strcmp(item->name, "primary_conninfo") == 0)
04326         {
04327             PrimaryConnInfo = pstrdup(item->value);
04328             ereport(DEBUG2,
04329                     (errmsg_internal("primary_conninfo = '%s'",
04330                                      PrimaryConnInfo)));
04331         }
04332         else if (strcmp(item->name, "trigger_file") == 0)
04333         {
04334             TriggerFile = pstrdup(item->value);
04335             ereport(DEBUG2,
04336                     (errmsg_internal("trigger_file = '%s'",
04337                                      TriggerFile)));
04338         }
04339         else
04340             ereport(FATAL,
04341                     (errmsg("unrecognized recovery parameter \"%s\"",
04342                             item->name)));
04343     }
04344 
04345     /*
04346      * Check for compulsory parameters
04347      */
04348     if (StandbyModeRequested)
04349     {
04350         if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
04351             ereport(WARNING,
04352                     (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
04353                             RECOVERY_COMMAND_FILE),
04354                      errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
04355     }
04356     else
04357     {
04358         if (recoveryRestoreCommand == NULL)
04359             ereport(FATAL,
04360                     (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
04361                             RECOVERY_COMMAND_FILE)));
04362     }
04363 
04364     /* Enable fetching from archive recovery area */
04365     ArchiveRecoveryRequested = true;
04366 
04367     /*
04368      * If user specified recovery_target_timeline, validate it or compute the
04369      * "latest" value.  We can't do this until after we've gotten the restore
04370      * command and set InArchiveRecovery, because we need to fetch timeline
04371      * history files from the archive.
04372      */
04373     if (rtliGiven)
04374     {
04375         if (rtli)
04376         {
04377             /* Timeline 1 does not have a history file, all else should */
04378             if (rtli != 1 && !existsTimeLineHistory(rtli))
04379                 ereport(FATAL,
04380                         (errmsg("recovery target timeline %u does not exist",
04381                                 rtli)));
04382             recoveryTargetTLI = rtli;
04383             recoveryTargetIsLatest = false;
04384         }
04385         else
04386         {
04387             /* We start the "latest" search from pg_control's timeline */
04388             recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
04389             recoveryTargetIsLatest = true;
04390         }
04391     }
04392 
04393     FreeConfigVariables(head);
04394 }
04395 
04396 /*
04397  * Exit archive-recovery state
04398  */
04399 static void
04400 exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
04401 {
04402     char        recoveryPath[MAXPGPATH];
04403     char        xlogpath[MAXPGPATH];
04404 
04405     /*
04406      * We are no longer in archive recovery state.
04407      */
04408     InArchiveRecovery = false;
04409 
04410     /*
04411      * Update min recovery point one last time.
04412      */
04413     UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
04414 
04415     /*
04416      * If the ending log segment is still open, close it (to avoid problems on
04417      * Windows with trying to rename or delete an open file).
04418      */
04419     if (readFile >= 0)
04420     {
04421         close(readFile);
04422         readFile = -1;
04423     }
04424 
04425     /*
04426      * If we are establishing a new timeline, we have to copy data from the
04427      * last WAL segment of the old timeline to create a starting WAL segment
04428      * for the new timeline.
04429      *
04430      * Notify the archiver that the last WAL segment of the old timeline is
04431      * ready to copy to archival storage. Otherwise, it is not archived for a
04432      * while.
04433      */
04434     if (endTLI != ThisTimeLineID)
04435     {
04436         XLogFileCopy(endLogSegNo, endTLI, endLogSegNo);
04437 
04438         if (XLogArchivingActive())
04439         {
04440             XLogFileName(xlogpath, endTLI, endLogSegNo);
04441             XLogArchiveNotify(xlogpath);
04442         }
04443     }
04444 
04445     /*
04446      * Let's just make real sure there are not .ready or .done flags posted
04447      * for the new segment.
04448      */
04449     XLogFileName(xlogpath, ThisTimeLineID, endLogSegNo);
04450     XLogArchiveCleanup(xlogpath);
04451 
04452     /*
04453      * Since there might be a partial WAL segment named RECOVERYXLOG, get rid
04454      * of it.
04455      */
04456     snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
04457     unlink(recoveryPath);       /* ignore any error */
04458 
04459     /* Get rid of any remaining recovered timeline-history file, too */
04460     snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
04461     unlink(recoveryPath);       /* ignore any error */
04462 
04463     /*
04464      * Rename the config file out of the way, so that we don't accidentally
04465      * re-enter archive recovery mode in a subsequent crash.
04466      */
04467     unlink(RECOVERY_COMMAND_DONE);
04468     if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
04469         ereport(FATAL,
04470                 (errcode_for_file_access(),
04471                  errmsg("could not rename file \"%s\" to \"%s\": %m",
04472                         RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
04473 
04474     ereport(LOG,
04475             (errmsg("archive recovery complete")));
04476 }
04477 
04478 /*
04479  * For point-in-time recovery, this function decides whether we want to
04480  * stop applying the XLOG at or after the current record.
04481  *
04482  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
04483  * *includeThis is set TRUE if we should apply this record before stopping.
04484  *
04485  * We also track the timestamp of the latest applied COMMIT/ABORT
04486  * record in XLogCtl->recoveryLastXTime, for logging purposes.
04487  * Also, some information is saved in recoveryStopXid et al for use in
04488  * annotating the new timeline's history file.
04489  */
04490 static bool
04491 recoveryStopsHere(XLogRecord *record, bool *includeThis)
04492 {
04493     bool        stopsHere;
04494     uint8       record_info;
04495     TimestampTz recordXtime;
04496     char        recordRPName[MAXFNAMELEN];
04497 
04498     /* We only consider stopping at COMMIT, ABORT or RESTORE POINT records */
04499     if (record->xl_rmid != RM_XACT_ID && record->xl_rmid != RM_XLOG_ID)
04500         return false;
04501     record_info = record->xl_info & ~XLR_INFO_MASK;
04502     if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT)
04503     {
04504         xl_xact_commit_compact *recordXactCommitData;
04505 
04506         recordXactCommitData = (xl_xact_commit_compact *) XLogRecGetData(record);
04507         recordXtime = recordXactCommitData->xact_time;
04508     }
04509     else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_COMMIT)
04510     {
04511         xl_xact_commit *recordXactCommitData;
04512 
04513         recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
04514         recordXtime = recordXactCommitData->xact_time;
04515     }
04516     else if (record->xl_rmid == RM_XACT_ID && record_info == XLOG_XACT_ABORT)
04517     {
04518         xl_xact_abort *recordXactAbortData;
04519 
04520         recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
04521         recordXtime = recordXactAbortData->xact_time;
04522     }
04523     else if (record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
04524     {
04525         xl_restore_point *recordRestorePointData;
04526 
04527         recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
04528         recordXtime = recordRestorePointData->rp_time;
04529         strncpy(recordRPName, recordRestorePointData->rp_name, MAXFNAMELEN);
04530     }
04531     else
04532         return false;
04533 
04534     /* Do we have a PITR target at all? */
04535     if (recoveryTarget == RECOVERY_TARGET_UNSET)
04536     {
04537         /*
04538          * Save timestamp of latest transaction commit/abort if this is a
04539          * transaction record
04540          */
04541         if (record->xl_rmid == RM_XACT_ID)
04542             SetLatestXTime(recordXtime);
04543         return false;
04544     }
04545 
04546     if (recoveryTarget == RECOVERY_TARGET_XID)
04547     {
04548         /*
04549          * There can be only one transaction end record with this exact
04550          * transactionid
04551          *
04552          * when testing for an xid, we MUST test for equality only, since
04553          * transactions are numbered in the order they start, not the order
04554          * they complete. A higher numbered xid will complete before you about
04555          * 50% of the time...
04556          */
04557         stopsHere = (record->xl_xid == recoveryTargetXid);
04558         if (stopsHere)
04559             *includeThis = recoveryTargetInclusive;
04560     }
04561     else if (recoveryTarget == RECOVERY_TARGET_NAME)
04562     {
04563         /*
04564          * There can be many restore points that share the same name, so we
04565          * stop at the first one
04566          */
04567         stopsHere = (strcmp(recordRPName, recoveryTargetName) == 0);
04568 
04569         /*
04570          * Ignore recoveryTargetInclusive because this is not a transaction
04571          * record
04572          */
04573         *includeThis = false;
04574     }
04575     else
04576     {
04577         /*
04578          * There can be many transactions that share the same commit time, so
04579          * we stop after the last one, if we are inclusive, or stop at the
04580          * first one if we are exclusive
04581          */
04582         if (recoveryTargetInclusive)
04583             stopsHere = (recordXtime > recoveryTargetTime);
04584         else
04585             stopsHere = (recordXtime >= recoveryTargetTime);
04586         if (stopsHere)
04587             *includeThis = false;
04588     }
04589 
04590     if (stopsHere)
04591     {
04592         recoveryStopXid = record->xl_xid;
04593         recoveryStopTime = recordXtime;
04594         recoveryStopAfter = *includeThis;
04595 
04596         if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT)
04597         {
04598             if (recoveryStopAfter)
04599                 ereport(LOG,
04600                         (errmsg("recovery stopping after commit of transaction %u, time %s",
04601                                 recoveryStopXid,
04602                                 timestamptz_to_str(recoveryStopTime))));
04603             else
04604                 ereport(LOG,
04605                         (errmsg("recovery stopping before commit of transaction %u, time %s",
04606                                 recoveryStopXid,
04607                                 timestamptz_to_str(recoveryStopTime))));
04608         }
04609         else if (record_info == XLOG_XACT_ABORT)
04610         {
04611             if (recoveryStopAfter)
04612                 ereport(LOG,
04613                         (errmsg("recovery stopping after abort of transaction %u, time %s",
04614                                 recoveryStopXid,
04615                                 timestamptz_to_str(recoveryStopTime))));
04616             else
04617                 ereport(LOG,
04618                         (errmsg("recovery stopping before abort of transaction %u, time %s",
04619                                 recoveryStopXid,
04620                                 timestamptz_to_str(recoveryStopTime))));
04621         }
04622         else
04623         {
04624             strncpy(recoveryStopName, recordRPName, MAXFNAMELEN);
04625 
04626             ereport(LOG,
04627                 (errmsg("recovery stopping at restore point \"%s\", time %s",
04628                         recoveryStopName,
04629                         timestamptz_to_str(recoveryStopTime))));
04630         }
04631 
04632         /*
04633          * Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
04634          * restore point since they are timestamped, though the latest
04635          * transaction time is not updated.
04636          */
04637         if (record->xl_rmid == RM_XACT_ID && recoveryStopAfter)
04638             SetLatestXTime(recordXtime);
04639     }
04640     else if (record->xl_rmid == RM_XACT_ID)
04641         SetLatestXTime(recordXtime);
04642 
04643     return stopsHere;
04644 }
04645 
04646 /*
04647  * Wait until shared recoveryPause flag is cleared.
04648  *
04649  * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
04650  * Probably not worth the trouble though.  This state shouldn't be one that
04651  * anyone cares about server power consumption in.
04652  */
04653 static void
04654 recoveryPausesHere(void)
04655 {
04656     /* Don't pause unless users can connect! */
04657     if (!LocalHotStandbyActive)
04658         return;
04659 
04660     ereport(LOG,
04661             (errmsg("recovery has paused"),
04662              errhint("Execute pg_xlog_replay_resume() to continue.")));
04663 
04664     while (RecoveryIsPaused())
04665     {
04666         pg_usleep(1000000L);    /* 1000 ms */
04667         HandleStartupProcInterrupts();
04668     }
04669 }
04670 
04671 bool
04672 RecoveryIsPaused(void)
04673 {
04674     /* use volatile pointer to prevent code rearrangement */
04675     volatile XLogCtlData *xlogctl = XLogCtl;
04676     bool        recoveryPause;
04677 
04678     SpinLockAcquire(&xlogctl->info_lck);
04679     recoveryPause = xlogctl->recoveryPause;
04680     SpinLockRelease(&xlogctl->info_lck);
04681 
04682     return recoveryPause;
04683 }
04684 
04685 void
04686 SetRecoveryPause(bool recoveryPause)
04687 {
04688     /* use volatile pointer to prevent code rearrangement */
04689     volatile XLogCtlData *xlogctl = XLogCtl;
04690 
04691     SpinLockAcquire(&xlogctl->info_lck);
04692     xlogctl->recoveryPause = recoveryPause;
04693     SpinLockRelease(&xlogctl->info_lck);
04694 }
04695 
04696 /*
04697  * Save timestamp of latest processed commit/abort record.
04698  *
04699  * We keep this in XLogCtl, not a simple static variable, so that it can be
04700  * seen by processes other than the startup process.  Note in particular
04701  * that CreateRestartPoint is executed in the checkpointer.
04702  */
04703 static void
04704 SetLatestXTime(TimestampTz xtime)
04705 {
04706     /* use volatile pointer to prevent code rearrangement */
04707     volatile XLogCtlData *xlogctl = XLogCtl;
04708 
04709     SpinLockAcquire(&xlogctl->info_lck);
04710     xlogctl->recoveryLastXTime = xtime;
04711     SpinLockRelease(&xlogctl->info_lck);
04712 }
04713 
04714 /*
04715  * Fetch timestamp of latest processed commit/abort record.
04716  */
04717 TimestampTz
04718 GetLatestXTime(void)
04719 {
04720     /* use volatile pointer to prevent code rearrangement */
04721     volatile XLogCtlData *xlogctl = XLogCtl;
04722     TimestampTz xtime;
04723 
04724     SpinLockAcquire(&xlogctl->info_lck);
04725     xtime = xlogctl->recoveryLastXTime;
04726     SpinLockRelease(&xlogctl->info_lck);
04727 
04728     return xtime;
04729 }
04730 
04731 /*
04732  * Save timestamp of the next chunk of WAL records to apply.
04733  *
04734  * We keep this in XLogCtl, not a simple static variable, so that it can be
04735  * seen by all backends.
04736  */
04737 static void
04738 SetCurrentChunkStartTime(TimestampTz xtime)
04739 {
04740     /* use volatile pointer to prevent code rearrangement */
04741     volatile XLogCtlData *xlogctl = XLogCtl;
04742 
04743     SpinLockAcquire(&xlogctl->info_lck);
04744     xlogctl->currentChunkStartTime = xtime;
04745     SpinLockRelease(&xlogctl->info_lck);
04746 }
04747 
04748 /*
04749  * Fetch timestamp of latest processed commit/abort record.
04750  * Startup process maintains an accurate local copy in XLogReceiptTime
04751  */
04752 TimestampTz
04753 GetCurrentChunkReplayStartTime(void)
04754 {
04755     /* use volatile pointer to prevent code rearrangement */
04756     volatile XLogCtlData *xlogctl = XLogCtl;
04757     TimestampTz xtime;
04758 
04759     SpinLockAcquire(&xlogctl->info_lck);
04760     xtime = xlogctl->currentChunkStartTime;
04761     SpinLockRelease(&xlogctl->info_lck);
04762 
04763     return xtime;
04764 }
04765 
04766 /*
04767  * Returns time of receipt of current chunk of XLOG data, as well as
04768  * whether it was received from streaming replication or from archives.
04769  */
04770 void
04771 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
04772 {
04773     /*
04774      * This must be executed in the startup process, since we don't export the
04775      * relevant state to shared memory.
04776      */
04777     Assert(InRecovery);
04778 
04779     *rtime = XLogReceiptTime;
04780     *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
04781 }
04782 
/*
 * Raise an ERROR when an integer setting on this server is lower than the
 * value that was in effect on the master server.
 *
 * param_name is the setting's name; it is a parameter name and therefore
 * does not require translation.  currValue is the local value and minValue
 * the smallest acceptable one.  Both are reported with %d, so they are
 * captured into int locals first; this also guarantees that each of them
 * is evaluated exactly once even when the error is raised.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
    int         rrip_curr_ = (currValue); \
    int         rrip_min_ = (minValue); \
\
    if (rrip_curr_ < rrip_min_) \
        ereport(ERROR, \
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                 errmsg("hot standby is not possible because " \
                        "%s = %d is a lower setting than on the master server " \
                        "(its value was %d)", \
                        (param_name), \
                        rrip_curr_, \
                        rrip_min_))); \
} while(0)
04799 
04800 /*
04801  * Check to see if required parameters are set high enough on this server
04802  * for various aspects of recovery operation.
04803  */
04804 static void
04805 CheckRequiredParameterValues(void)
04806 {
04807     /*
04808      * For archive recovery, the WAL must be generated with at least 'archive'
04809      * wal_level.
04810      */
04811     if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
04812     {
04813         ereport(WARNING,
04814                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
04815                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
04816     }
04817 
04818     /*
04819      * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
04820      * we must have at least as many backend slots as the primary.
04821      */
04822     if (InArchiveRecovery && EnableHotStandby)
04823     {
04824         if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
04825             ereport(ERROR,
04826                     (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
04827                      errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
04828 
04829         /* We ignore autovacuum_max_workers when we make this test. */
04830         RecoveryRequiresIntParameter("max_connections",
04831                                      MaxConnections,
04832                                      ControlFile->MaxConnections);
04833         RecoveryRequiresIntParameter("max_prepared_transactions",
04834                                      max_prepared_xacts,
04835                                      ControlFile->max_prepared_xacts);
04836         RecoveryRequiresIntParameter("max_locks_per_transaction",
04837                                      max_locks_per_xact,
04838                                      ControlFile->max_locks_per_xact);
04839     }
04840 }
04841 
04842 /*
04843  * This must be called ONCE during postmaster or standalone-backend startup
04844  */
04845 void
04846 StartupXLOG(void)
04847 {
04848     XLogCtlInsert *Insert;
04849     CheckPoint  checkPoint;
04850     bool        wasShutdown;
04851     bool        reachedStopPoint = false;
04852     bool        haveBackupLabel = false;
04853     XLogRecPtr  RecPtr,
04854                 checkPointLoc,
04855                 EndOfLog;
04856     XLogSegNo   endLogSegNo;
04857     TimeLineID  PrevTimeLineID;
04858     XLogRecord *record;
04859     uint32      freespace;
04860     TransactionId oldestActiveXID;
04861     bool        backupEndRequired = false;
04862     bool        backupFromStandby = false;
04863     DBState     dbstate_at_startup;
04864     XLogReaderState *xlogreader;
04865     XLogPageReadPrivate private;
04866     bool        fast_promoted = false;
04867 
04868     /*
04869      * Read control file and check XLOG status looks valid.
04870      *
04871      * Note: in most control paths, *ControlFile is already valid and we need
04872      * not do ReadControlFile() here, but might as well do it to be sure.
04873      */
04874     ReadControlFile();
04875 
04876     if (ControlFile->state < DB_SHUTDOWNED ||
04877         ControlFile->state > DB_IN_PRODUCTION ||
04878         !XRecOffIsValid(ControlFile->checkPoint))
04879         ereport(FATAL,
04880                 (errmsg("control file contains invalid data")));
04881 
04882     if (ControlFile->state == DB_SHUTDOWNED)
04883         ereport(LOG,
04884                 (errmsg("database system was shut down at %s",
04885                         str_time(ControlFile->time))));
04886     else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
04887         ereport(LOG,
04888                 (errmsg("database system was shut down in recovery at %s",
04889                         str_time(ControlFile->time))));
04890     else if (ControlFile->state == DB_SHUTDOWNING)
04891         ereport(LOG,
04892                 (errmsg("database system shutdown was interrupted; last known up at %s",
04893                         str_time(ControlFile->time))));
04894     else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
04895         ereport(LOG,
04896            (errmsg("database system was interrupted while in recovery at %s",
04897                    str_time(ControlFile->time)),
04898             errhint("This probably means that some data is corrupted and"
04899                     " you will have to use the last backup for recovery.")));
04900     else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
04901         ereport(LOG,
04902                 (errmsg("database system was interrupted while in recovery at log time %s",
04903                         str_time(ControlFile->checkPointCopy.time)),
04904                  errhint("If this has occurred more than once some data might be corrupted"
04905               " and you might need to choose an earlier recovery target.")));
04906     else if (ControlFile->state == DB_IN_PRODUCTION)
04907         ereport(LOG,
04908               (errmsg("database system was interrupted; last known up at %s",
04909                       str_time(ControlFile->time))));
04910 
04911     /* This is just to allow attaching to startup process with a debugger */
04912 #ifdef XLOG_REPLAY_DELAY
04913     if (ControlFile->state != DB_SHUTDOWNED)
04914         pg_usleep(60000000L);
04915 #endif
04916 
04917     /*
04918      * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
04919      * someone has performed a copy for PITR, these directories may have been
04920      * excluded and need to be re-created.
04921      */
04922     ValidateXLOGDirectoryStructure();
04923 
04924     /*
04925      * Clear out any old relcache cache files.  This is *necessary* if we do
04926      * any WAL replay, since that would probably result in the cache files
04927      * being out of sync with database reality.  In theory we could leave them
04928      * in place if the database had been cleanly shut down, but it seems
04929      * safest to just remove them always and let them be rebuilt during the
04930      * first backend startup.
04931      */
04932     RelationCacheInitFileRemove();
04933 
04934     /*
04935      * Initialize on the assumption we want to recover to the same timeline
04936      * that's active according to pg_control.
04937      */
04938     recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
04939 
04940     /*
04941      * Check for recovery control file, and if so set up state for offline
04942      * recovery
04943      */
04944     readRecoveryCommandFile();
04945 
04946     /*
04947      * Save archive_cleanup_command in shared memory so that other processes
04948      * can see it.
04949      */
04950     strncpy(XLogCtl->archiveCleanupCommand,
04951             archiveCleanupCommand ? archiveCleanupCommand : "",
04952             sizeof(XLogCtl->archiveCleanupCommand));
04953 
04954     if (ArchiveRecoveryRequested)
04955     {
04956         if (StandbyModeRequested)
04957             ereport(LOG,
04958                     (errmsg("entering standby mode")));
04959         else if (recoveryTarget == RECOVERY_TARGET_XID)
04960             ereport(LOG,
04961                     (errmsg("starting point-in-time recovery to XID %u",
04962                             recoveryTargetXid)));
04963         else if (recoveryTarget == RECOVERY_TARGET_TIME)
04964             ereport(LOG,
04965                     (errmsg("starting point-in-time recovery to %s",
04966                             timestamptz_to_str(recoveryTargetTime))));
04967         else if (recoveryTarget == RECOVERY_TARGET_NAME)
04968             ereport(LOG,
04969                     (errmsg("starting point-in-time recovery to \"%s\"",
04970                             recoveryTargetName)));
04971         else
04972             ereport(LOG,
04973                     (errmsg("starting archive recovery")));
04974     }
04975     else if (ControlFile->minRecoveryPointTLI > 0)
04976     {
04977         /*
04978          * If the minRecoveryPointTLI is set when not in Archive Recovery
04979          * it means that we have crashed after ending recovery and
04980          * yet before we wrote a new checkpoint on the new timeline.
04981          * That means we are doing a crash recovery that needs to cross
04982          * timelines to get to our newly assigned timeline again.
04983          * The timeline we are headed for is exact and not 'latest'.
04984          * As soon as we hit a checkpoint, the minRecoveryPointTLI is
04985          * reset, so we will not enter crash recovery again.
04986          */
04987         Assert(ControlFile->minRecoveryPointTLI != 1);
04988         recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
04989         recoveryTargetIsLatest = false;
04990     }
04991 
04992     /*
04993      * Take ownership of the wakeup latch if we're going to sleep during
04994      * recovery.
04995      */
04996     if (StandbyModeRequested)
04997         OwnLatch(&XLogCtl->recoveryWakeupLatch);
04998 
04999     /* Set up XLOG reader facility */
05000     MemSet(&private, 0, sizeof(XLogPageReadPrivate));
05001     xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
05002     if (!xlogreader)
05003         ereport(ERROR,
05004                 (errcode(ERRCODE_OUT_OF_MEMORY),
05005                  errmsg("out of memory"),
05006                  errdetail("Failed while allocating an XLog reading processor")));
05007     xlogreader->system_identifier = ControlFile->system_identifier;
05008 
05009     if (read_backup_label(&checkPointLoc, &backupEndRequired,
05010                           &backupFromStandby))
05011     {
05012         /*
05013          * Archive recovery was requested, and thanks to the backup label file,
05014          * we know how far we need to replay to reach consistency. Enter
05015          * archive recovery directly.
05016          */
05017         InArchiveRecovery = true;
05018         if (StandbyModeRequested)
05019             StandbyMode = true;
05020 
05021         /*
05022          * When a backup_label file is present, we want to roll forward from
05023          * the checkpoint it identifies, rather than using pg_control.
05024          */
05025         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
05026         if (record != NULL)
05027         {
05028             memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
05029             wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
05030             ereport(DEBUG1,
05031                     (errmsg("checkpoint record is at %X/%X",
05032                             (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
05033             InRecovery = true;  /* force recovery even if SHUTDOWNED */
05034 
05035             /*
05036              * Make sure that REDO location exists. This may not be the case
05037              * if there was a crash during an online backup, which left a
05038              * backup_label around that references a WAL segment that's
05039              * already been archived.
05040              */
05041             if (checkPoint.redo < checkPointLoc)
05042             {
05043                 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
05044                     ereport(FATAL,
05045                             (errmsg("could not find redo location referenced by checkpoint record"),
05046                              errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
05047             }
05048         }
05049         else
05050         {
05051             ereport(FATAL,
05052                     (errmsg("could not locate required checkpoint record"),
05053                      errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
05054             wasShutdown = false;    /* keep compiler quiet */
05055         }
05056         /* set flag to delete it later */
05057         haveBackupLabel = true;
05058     }
05059     else
05060     {
05061         /*
05062          * It's possible that archive recovery was requested, but we don't
05063          * know how far we need to replay the WAL before we reach consistency.
05064          * This can happen for example if a base backup is taken from a running
05065          * server using an atomic filesystem snapshot, without calling
05066          * pg_start/stop_backup. Or if you just kill a running master server
05067          * and put it into archive recovery by creating a recovery.conf file.
05068          *
05069          * Our strategy in that case is to perform crash recovery first,
05070          * replaying all the WAL present in pg_xlog, and only enter archive
05071          * recovery after that.
05072          *
05073          * But usually we already know how far we need to replay the WAL (up to
05074          * minRecoveryPoint, up to backupEndPoint, or until we see an
05075          * end-of-backup record), and we can enter archive recovery directly.
05076          */
05077         if (ArchiveRecoveryRequested &&
05078             (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
05079              ControlFile->backupEndRequired ||
05080              ControlFile->backupEndPoint != InvalidXLogRecPtr ||
05081              ControlFile->state == DB_SHUTDOWNED))
05082         {
05083             InArchiveRecovery = true;
05084             if (StandbyModeRequested)
05085                 StandbyMode = true;
05086         }
05087 
05088         /*
05089          * Get the last valid checkpoint record.  If the latest one according
05090          * to pg_control is broken, try the next-to-last one.
05091          */
05092         checkPointLoc = ControlFile->checkPoint;
05093         RedoStartLSN = ControlFile->checkPointCopy.redo;
05094         record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
05095         if (record != NULL)
05096         {
05097             ereport(DEBUG1,
05098                     (errmsg("checkpoint record is at %X/%X",
05099                             (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
05100         }
05101         else if (StandbyMode)
05102         {
05103             /*
05104              * The last valid checkpoint record required for a streaming
05105              * recovery exists in neither standby nor the primary.
05106              */
05107             ereport(PANIC,
05108                     (errmsg("could not locate a valid checkpoint record")));
05109         }
05110         else
05111         {
05112             checkPointLoc = ControlFile->prevCheckPoint;
05113             record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
05114             if (record != NULL)
05115             {
05116                 ereport(LOG,
05117                         (errmsg("using previous checkpoint record at %X/%X",
05118                                 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
05119                 InRecovery = true;      /* force recovery even if SHUTDOWNED */
05120             }
05121             else
05122                 ereport(PANIC,
05123                      (errmsg("could not locate a valid checkpoint record")));
05124         }
05125         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
05126         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
05127     }
05128 
05129     /*
05130      * If the location of the checkpoint record is not on the expected
05131      * timeline in the history of the requested timeline, we cannot proceed:
05132      * the backup is not part of the history of the requested timeline.
05133      */
05134     Assert(expectedTLEs); /* was initialized by reading checkpoint record */
05135     if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
05136             checkPoint.ThisTimeLineID)
05137     {
05138         XLogRecPtr switchpoint;
05139 
05140         /*
05141          * tliSwitchPoint will throw an error if the checkpoint's timeline
05142          * is not in expectedTLEs at all.
05143          */
05144         switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
05145         ereport(FATAL,
05146                 (errmsg("requested timeline %u is not a child of this server's history",
05147                         recoveryTargetTLI),
05148                  errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X",
05149                            (uint32) (ControlFile->checkPoint >> 32),
05150                            (uint32) ControlFile->checkPoint,
05151                            ControlFile->checkPointCopy.ThisTimeLineID,
05152                            (uint32) (switchpoint >> 32),
05153                            (uint32) switchpoint)));
05154     }
05155 
05156     /*
05157      * The min recovery point should be part of the requested timeline's
05158      * history, too.
05159      */
05160     if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
05161         tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
05162             ControlFile->minRecoveryPointTLI)
05163         ereport(FATAL,
05164                 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
05165                         recoveryTargetTLI,
05166                         (uint32) (ControlFile->minRecoveryPoint >> 32),
05167                         (uint32) ControlFile->minRecoveryPoint,
05168                         ControlFile->minRecoveryPointTLI)));
05169 
05170     LastRec = RecPtr = checkPointLoc;
05171 
05172     ereport(DEBUG1,
05173             (errmsg("redo record is at %X/%X; shutdown %s",
05174                     (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
05175                     wasShutdown ? "TRUE" : "FALSE")));
05176     ereport(DEBUG1,
05177             (errmsg("next transaction ID: %u/%u; next OID: %u",
05178                     checkPoint.nextXidEpoch, checkPoint.nextXid,
05179                     checkPoint.nextOid)));
05180     ereport(DEBUG1,
05181             (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
05182                     checkPoint.nextMulti, checkPoint.nextMultiOffset)));
05183     ereport(DEBUG1,
05184             (errmsg("oldest unfrozen transaction ID: %u, in database %u",
05185                     checkPoint.oldestXid, checkPoint.oldestXidDB)));
05186     ereport(DEBUG1,
05187             (errmsg("oldest MultiXactId: %u, in database %u",
05188                     checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
05189     if (!TransactionIdIsNormal(checkPoint.nextXid))
05190         ereport(PANIC,
05191                 (errmsg("invalid next transaction ID")));
05192 
05193     /* initialize shared memory variables from the checkpoint record */
05194     ShmemVariableCache->nextXid = checkPoint.nextXid;
05195     ShmemVariableCache->nextOid = checkPoint.nextOid;
05196     ShmemVariableCache->oidCount = 0;
05197     MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
05198     SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
05199     SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
05200     XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
05201     XLogCtl->ckptXid = checkPoint.nextXid;
05202 
05203     /*
05204      * Initialize unlogged LSN. On a clean shutdown, it's restored from the
05205      * control file. On recovery, all unlogged relations are blown away, so
05206      * the unlogged LSN counter can be reset too.
05207      */
05208     if (ControlFile->state == DB_SHUTDOWNED)
05209         XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
05210     else
05211         XLogCtl->unloggedLSN = 1;
05212 
05213     /*
05214      * We must replay WAL entries using the same TimeLineID they were created
05215      * under, so temporarily adopt the TLI indicated by the checkpoint (see
05216      * also xlog_redo()).
05217      */
05218     ThisTimeLineID = checkPoint.ThisTimeLineID;
05219 
05220     /*
05221      * Copy any missing timeline history files between 'now' and the
05222      * recovery target timeline from archive to pg_xlog. While we don't need
05223      * those files ourselves - the history file of the recovery target
05224      * timeline covers all the previous timelines in the history too - a
05225      * cascading standby server might be interested in them. Or, if you
05226      * archive the WAL from this server to a different archive than the
05227      * master, it'd be good for all the history files to get archived there
05228      * after failover, so that you can use one of the old timelines as a
05229      * PITR target. Timeline history files are small, so it's better to copy
05230      * them unnecessarily than not copy them and regret later.
05231      */
05232     restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
05233 
05234     lastFullPageWrites = checkPoint.fullPageWrites;
05235 
05236     RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
05237 
05238     if (RecPtr < checkPoint.redo)
05239         ereport(PANIC,
05240                 (errmsg("invalid redo in checkpoint record")));
05241 
05242     /*
05243      * Check whether we need to force recovery from WAL.  If it appears to
05244      * have been a clean shutdown and we did not have a recovery.conf file,
05245      * then assume no recovery needed.
05246      */
05247     if (checkPoint.redo < RecPtr)
05248     {
05249         if (wasShutdown)
05250             ereport(PANIC,
05251                     (errmsg("invalid redo record in shutdown checkpoint")));
05252         InRecovery = true;
05253     }
05254     else if (ControlFile->state != DB_SHUTDOWNED)
05255         InRecovery = true;
05256     else if (ArchiveRecoveryRequested)
05257     {
05258         /* force recovery due to presence of recovery.conf */
05259         InRecovery = true;
05260     }
05261 
05262     /* REDO */
05263     if (InRecovery)
05264     {
05265         int         rmid;
05266 
05267         /* use volatile pointer to prevent code rearrangement */
05268         volatile XLogCtlData *xlogctl = XLogCtl;
05269 
05270         /*
05271          * Update pg_control to show that we are recovering and to show the
05272          * selected checkpoint as the place we are starting from. We also mark
05273          * pg_control with any minimum recovery stop point obtained from a
05274          * backup history file.
05275          */
05276         dbstate_at_startup = ControlFile->state;
05277         if (InArchiveRecovery)
05278             ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
05279         else
05280         {
05281             ereport(LOG,
05282                     (errmsg("database system was not properly shut down; "
05283                             "automatic recovery in progress")));
05284             if (recoveryTargetTLI > 0)
05285                 ereport(LOG,
05286                     (errmsg("crash recovery starts in timeline %u "
05287                             "and has target timeline %u",
05288                             ControlFile->checkPointCopy.ThisTimeLineID,
05289                             recoveryTargetTLI)));
05290             ControlFile->state = DB_IN_CRASH_RECOVERY;
05291         }
05292         ControlFile->prevCheckPoint = ControlFile->checkPoint;
05293         ControlFile->checkPoint = checkPointLoc;
05294         ControlFile->checkPointCopy = checkPoint;
05295         if (InArchiveRecovery)
05296         {
05297             /* initialize minRecoveryPoint if not set yet */
05298             if (ControlFile->minRecoveryPoint < checkPoint.redo)
05299             {
05300                 ControlFile->minRecoveryPoint = checkPoint.redo;
05301                 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
05302             }
05303         }
05304 
05305         /*
05306          * Set backupStartPoint if we're starting recovery from a base backup.
05307          *
05308          * Set backupEndPoint and use minRecoveryPoint as the backup end
05309          * location if we're starting recovery from a base backup which was
05310          * taken from the standby. In this case, the database system status in
05311          * pg_control must indicate DB_IN_ARCHIVE_RECOVERY. If not, which
05312          * means that backup is corrupted, so we cancel recovery.
05313          */
05314         if (haveBackupLabel)
05315         {
05316             ControlFile->backupStartPoint = checkPoint.redo;
05317             ControlFile->backupEndRequired = backupEndRequired;
05318 
05319             if (backupFromStandby)
05320             {
05321                 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY)
05322                     ereport(FATAL,
05323                             (errmsg("backup_label contains data inconsistent with control file"),
05324                              errhint("This means that the backup is corrupted and you will "
05325                                "have to use another backup for recovery.")));
05326                 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
05327             }
05328         }
05329         ControlFile->time = (pg_time_t) time(NULL);
05330         /* No need to hold ControlFileLock yet, we aren't up far enough */
05331         UpdateControlFile();
05332 
05333         /* initialize our local copy of minRecoveryPoint */
05334         minRecoveryPoint = ControlFile->minRecoveryPoint;
05335         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
05336 
05337         /*
05338          * Reset pgstat data, because it may be invalid after recovery.
05339          */
05340         pgstat_reset_all();
05341 
05342         /*
05343          * If there was a backup label file, it's done its job and the info
05344          * has now been propagated into pg_control.  We must get rid of the
05345          * label file so that if we crash during recovery, we'll pick up at
05346          * the latest recovery restartpoint instead of going all the way back
05347          * to the backup start point.  It seems prudent though to just rename
05348          * the file out of the way rather than delete it completely.
05349          */
05350         if (haveBackupLabel)
05351         {
05352             unlink(BACKUP_LABEL_OLD);
05353             if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
05354                 ereport(FATAL,
05355                         (errcode_for_file_access(),
05356                          errmsg("could not rename file \"%s\" to \"%s\": %m",
05357                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
05358         }
05359 
05360         /* Check that the GUCs used to generate the WAL allow recovery */
05361         CheckRequiredParameterValues();
05362 
05363         /*
05364          * We're in recovery, so unlogged relations may be trashed and must be
05365          * reset.  This should be done BEFORE allowing Hot Standby
05366          * connections, so that read-only backends don't try to read whatever
05367          * garbage is left over from before.
05368          */
05369         ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
05370 
05371         /*
05372          * Likewise, delete any saved transaction snapshot files that got left
05373          * behind by crashed backends.
05374          */
05375         DeleteAllExportedSnapshotFiles();
05376 
05377         /*
05378          * Initialize for Hot Standby, if enabled. We won't let backends in
05379          * yet, not until we've reached the min recovery point specified in
05380          * control file and we've established a recovery snapshot from a
05381          * running-xacts WAL record.
05382          */
05383         if (ArchiveRecoveryRequested && EnableHotStandby)
05384         {
05385             TransactionId *xids;
05386             int         nxids;
05387 
05388             ereport(DEBUG1,
05389                     (errmsg("initializing for hot standby")));
05390 
05391             InitRecoveryTransactionEnvironment();
05392 
05393             if (wasShutdown)
05394                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
05395             else
05396                 oldestActiveXID = checkPoint.oldestActiveXid;
05397             Assert(TransactionIdIsValid(oldestActiveXID));
05398 
05399             /*
05400              * Startup commit log and subtrans only. Other SLRUs are not
05401              * maintained during recovery and need not be started yet.
05402              */
05403             StartupCLOG();
05404             StartupSUBTRANS(oldestActiveXID);
05405 
05406             /*
05407              * If we're beginning at a shutdown checkpoint, we know that
05408              * nothing was running on the master at this point. So fake-up an
05409              * empty running-xacts record and use that here and now. Recover
05410              * additional standby state for prepared transactions.
05411              */
05412             if (wasShutdown)
05413             {
05414                 RunningTransactionsData running;
05415                 TransactionId latestCompletedXid;
05416 
05417                 /*
05418                  * Construct a RunningTransactions snapshot representing a
05419                  * shut down server, with only prepared transactions still
05420                  * alive. We're never overflowed at this point because all
05421                  * subxids are listed with their parent prepared transactions.
05422                  */
05423                 running.xcnt = nxids;
05424                 running.subxcnt = 0;
05425                 running.subxid_overflow = false;
05426                 running.nextXid = checkPoint.nextXid;
05427                 running.oldestRunningXid = oldestActiveXID;
05428                 latestCompletedXid = checkPoint.nextXid;
05429                 TransactionIdRetreat(latestCompletedXid);
05430                 Assert(TransactionIdIsNormal(latestCompletedXid));
05431                 running.latestCompletedXid = latestCompletedXid;
05432                 running.xids = xids;
05433 
05434                 ProcArrayApplyRecoveryInfo(&running);
05435 
05436                 StandbyRecoverPreparedTransactions(false);
05437             }
05438         }
05439 
05440         /* Initialize resource managers */
05441         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
05442         {
05443             if (RmgrTable[rmid].rm_startup != NULL)
05444                 RmgrTable[rmid].rm_startup();
05445         }
05446 
05447         /*
05448          * Initialize shared replayEndRecPtr, lastReplayedEndRecPtr, and
05449          * recoveryLastXTime.
05450          *
05451          * This is slightly confusing if we're starting from an online
05452          * checkpoint; we've just read and replayed the checkpoint record, but
05453          * we're going to start replay from its redo pointer, which precedes
05454          * the location of the checkpoint record itself. So even though the
05455          * last record we've replayed is indeed ReadRecPtr, we haven't
05456          * replayed all the preceding records yet. That's OK for the current
05457          * use of these variables.
05458          */
05459         SpinLockAcquire(&xlogctl->info_lck);
05460         xlogctl->replayEndRecPtr = ReadRecPtr;
05461         xlogctl->replayEndTLI = ThisTimeLineID;
05462         xlogctl->lastReplayedEndRecPtr = EndRecPtr;
05463         xlogctl->lastReplayedTLI = ThisTimeLineID;
05464         xlogctl->recoveryLastXTime = 0;
05465         xlogctl->currentChunkStartTime = 0;
05466         xlogctl->recoveryPause = false;
05467         SpinLockRelease(&xlogctl->info_lck);
05468 
05469         /* Also ensure XLogReceiptTime has a sane value */
05470         XLogReceiptTime = GetCurrentTimestamp();
05471 
05472         /*
05473          * Let postmaster know we've started redo now, so that it can launch
05474          * checkpointer to perform restartpoints.  We don't bother during
05475          * crash recovery as restartpoints can only be performed during
05476          * archive recovery.  And we'd like to keep crash recovery simple, to
05477          * avoid introducing bugs that could affect you when recovering after
05478          * crash.
05479          *
05480          * After this point, we can no longer assume that we're the only
05481          * process in addition to postmaster!  Also, fsync requests are
05482          * subsequently to be handled by the checkpointer, not locally.
05483          */
05484         if (ArchiveRecoveryRequested && IsUnderPostmaster)
05485         {
05486             PublishStartupProcessInformation();
05487             SetForwardFsyncRequests();
05488             SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
05489             bgwriterLaunched = true;
05490         }
05491 
05492         /*
05493          * Allow read-only connections immediately if we're consistent
05494          * already.
05495          */
05496         CheckRecoveryConsistency();
05497 
05498         /*
05499          * Find the first record that logically follows the checkpoint --- it
05500          * might physically precede it, though.
05501          */
05502         if (checkPoint.redo < RecPtr)
05503         {
05504             /* back up to find the record */
05505             record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
05506         }
05507         else
05508         {
05509             /* just have to read next record after CheckPoint */
05510             record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
05511         }
05512 
05513         if (record != NULL)
05514         {
05515             bool        recoveryContinue = true;
05516             bool        recoveryApply = true;
05517             ErrorContextCallback errcallback;
05518             TimestampTz xtime;
05519 
05520             InRedo = true;
05521 
05522             ereport(LOG,
05523                     (errmsg("redo starts at %X/%X",
05524                             (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
05525 
05526             /*
05527              * main redo apply loop
05528              */
05529             do
05530             {
05531                 bool switchedTLI = false;
05532 #ifdef WAL_DEBUG
05533                 if (XLOG_DEBUG ||
05534                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
05535                     (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
05536                 {
05537                     StringInfoData buf;
05538 
05539                     initStringInfo(&buf);
05540                     appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
05541                                      (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
05542                                      (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
05543                     xlog_outrec(&buf, record);
05544                     appendStringInfo(&buf, " - ");
05545                     RmgrTable[record->xl_rmid].rm_desc(&buf,
05546                                                        record->xl_info,
05547                                                      XLogRecGetData(record));
05548                     elog(LOG, "%s", buf.data);
05549                     pfree(buf.data);
05550                 }
05551 #endif
05552 
05553                 /* Handle interrupt signals of startup process */
05554                 HandleStartupProcInterrupts();
05555 
05556                 /*
05557                  * Pause WAL replay, if requested by a hot-standby session via
05558                  * SetRecoveryPause().
05559                  *
05560                  * Note that we intentionally don't take the info_lck spinlock
05561                  * here.  We might therefore read a slightly stale value of
05562                  * the recoveryPause flag, but it can't be very stale (no
05563                  * worse than the last spinlock we did acquire).  Since a
05564                  * pause request is a pretty asynchronous thing anyway,
05565                  * possibly responding to it one WAL record later than we
05566                  * otherwise would is a minor issue, so it doesn't seem worth
05567                  * adding another spinlock cycle to prevent that.
05568                  */
05569                 if (xlogctl->recoveryPause)
05570                     recoveryPausesHere();
05571 
05572                 /*
05573                  * Have we reached our recovery target?
05574                  */
05575                 if (recoveryStopsHere(record, &recoveryApply))
05576                 {
05577                     if (recoveryPauseAtTarget)
05578                     {
05579                         SetRecoveryPause(true);
05580                         recoveryPausesHere();
05581                     }
05582                     reachedStopPoint = true;    /* see below */
05583                     recoveryContinue = false;
05584 
05585                     /* Exit loop if we reached non-inclusive recovery target */
05586                     if (!recoveryApply)
05587                         break;
05588                 }
05589 
05590                 /* Setup error traceback support for ereport() */
05591                 errcallback.callback = rm_redo_error_callback;
05592                 errcallback.arg = (void *) record;
05593                 errcallback.previous = error_context_stack;
05594                 error_context_stack = &errcallback;
05595 
05596                 /*
05597                  * ShmemVariableCache->nextXid must be beyond record's xid.
05598                  *
05599                  * We don't expect anyone else to modify nextXid, hence we
05600                  * don't need to hold a lock while examining it.  We still
05601                  * acquire the lock to modify it, though.
05602                  */
05603                 if (TransactionIdFollowsOrEquals(record->xl_xid,
05604                                                  ShmemVariableCache->nextXid))
05605                 {
05606                     LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
05607                     ShmemVariableCache->nextXid = record->xl_xid;
05608                     TransactionIdAdvance(ShmemVariableCache->nextXid);
05609                     LWLockRelease(XidGenLock);
05610                 }
05611 
05612                 /*
05613                  * Before replaying this record, check if this record
05614                  * causes the current timeline to change. The record is
05615                  * already considered to be part of the new timeline,
05616                  * so we update ThisTimeLineID before replaying it.
05617                  * That's important so that replayEndTLI, which is
05618                  * recorded as the minimum recovery point's TLI if
05619                  * recovery stops after this record, is set correctly.
05620                  */
05621                 if (record->xl_rmid == RM_XLOG_ID)
05622                 {
05623                     TimeLineID  newTLI = ThisTimeLineID;
05624                     TimeLineID  prevTLI = ThisTimeLineID;
05625                     uint8       info = record->xl_info & ~XLR_INFO_MASK;
05626 
05627                     if (info == XLOG_CHECKPOINT_SHUTDOWN)
05628                     {
05629                         CheckPoint  checkPoint;
05630 
05631                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
05632                         newTLI = checkPoint.ThisTimeLineID;
05633                         prevTLI = checkPoint.PrevTimeLineID;
05634                     }
05635                     else if (info == XLOG_END_OF_RECOVERY)
05636                     {
05637                         xl_end_of_recovery  xlrec;
05638 
05639                         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
05640                         newTLI = xlrec.ThisTimeLineID;
05641                         prevTLI = xlrec.PrevTimeLineID;
05642                     }
05643 
05644                     if (newTLI != ThisTimeLineID)
05645                     {
05646                         /* Check that it's OK to switch to this TLI */
05647                         checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
05648 
05649                         /* Following WAL records should be run with new TLI */
05650                         ThisTimeLineID = newTLI;
05651                         switchedTLI = true;
05652                     }
05653                 }
05654 
05655                 /*
05656                  * Update shared replayEndRecPtr before replaying this record,
05657                  * so that XLogFlush will update minRecoveryPoint correctly.
05658                  */
05659                 SpinLockAcquire(&xlogctl->info_lck);
05660                 xlogctl->replayEndRecPtr = EndRecPtr;
05661                 xlogctl->replayEndTLI = ThisTimeLineID;
05662                 SpinLockRelease(&xlogctl->info_lck);
05663 
05664                 /*
05665                  * If we are attempting to enter Hot Standby mode, process
05666                  * XIDs we see
05667                  */
05668                 if (standbyState >= STANDBY_INITIALIZED &&
05669                     TransactionIdIsValid(record->xl_xid))
05670                     RecordKnownAssignedTransactionIds(record->xl_xid);
05671 
05672                 /* Now apply the WAL record itself */
05673                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
05674 
05675                 /* Pop the error context stack */
05676                 error_context_stack = errcallback.previous;
05677 
05678                 /*
05679                  * Update lastReplayedEndRecPtr after this record has been
05680                  * successfully replayed.
05681                  */
05682                 SpinLockAcquire(&xlogctl->info_lck);
05683                 xlogctl->lastReplayedEndRecPtr = EndRecPtr;
05684                 xlogctl->lastReplayedTLI = ThisTimeLineID;
05685                 SpinLockRelease(&xlogctl->info_lck);
05686 
05687                 /* Remember this record as the last-applied one */
05688                 LastRec = ReadRecPtr;
05689 
05690                 /* Allow read-only connections if we're consistent now */
05691                 CheckRecoveryConsistency();
05692 
05693                 /*
05694                  * If this record was a timeline switch, wake up any
05695                  * walsenders to notice that we are on a new timeline.
05696                  */
05697                 if (switchedTLI && AllowCascadeReplication())
05698                     WalSndWakeup();
05699 
05700                 /* Exit loop if we reached inclusive recovery target */
05701                 if (!recoveryContinue)
05702                     break;
05703 
05704                 /* Else, try to fetch the next WAL record */
05705                 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
05706             } while (record != NULL);
05707 
05708             /*
05709              * end of main redo apply loop
05710              */
05711 
05712             ereport(LOG,
05713                     (errmsg("redo done at %X/%X",
05714                             (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
05715             xtime = GetLatestXTime();
05716             if (xtime)
05717                 ereport(LOG,
05718                      (errmsg("last completed transaction was at log time %s",
05719                              timestamptz_to_str(xtime))));
05720             InRedo = false;
05721         }
05722         else
05723         {
05724             /* there are no WAL records following the checkpoint */
05725             ereport(LOG,
05726                     (errmsg("redo is not required")));
05727         }
05728     }
05729 
05730     /*
05731      * Kill WAL receiver, if it's still running, before we continue to write
05732      * the startup checkpoint record. It will trump over the checkpoint and
05733      * subsequent records if it's still alive when we start writing WAL.
05734      */
05735     ShutdownWalRcv();
05736 
05737     /*
05738      * We don't need the latch anymore. It's not strictly necessary to disown
05739      * it, but let's do it for the sake of tidiness.
05740      */
05741     if (StandbyModeRequested)
05742         DisownLatch(&XLogCtl->recoveryWakeupLatch);
05743 
05744     /*
05745      * We are now done reading the xlog from stream. Turn off streaming
05746      * recovery to force fetching the files (which would be required at end of
05747      * recovery, e.g., timeline history file) from archive or pg_xlog.
05748      */
05749     StandbyMode = false;
05750 
05751     /*
05752      * Re-fetch the last valid or last applied record, so we can identify the
05753      * exact endpoint of what we consider the valid portion of WAL.
05754      */
05755     record = ReadRecord(xlogreader, LastRec, PANIC, false);
05756     EndOfLog = EndRecPtr;
05757     XLByteToPrevSeg(EndOfLog, endLogSegNo);
05758 
05759     /*
05760      * Complain if we did not roll forward far enough to render the backup
05761      * dump consistent.  Note: it is indeed okay to look at the local variable
05762      * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
05763      * be further ahead --- ControlFile->minRecoveryPoint cannot have been
05764      * advanced beyond the WAL we processed.
05765      */
05766     if (InRecovery &&
05767         (EndOfLog < minRecoveryPoint ||
05768          !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
05769     {
05770         if (reachedStopPoint)
05771         {
05772             /* stopped because of stop request */
05773             ereport(FATAL,
05774                     (errmsg("requested recovery stop point is before consistent recovery point")));
05775         }
05776 
05777         /*
05778          * Ran off end of WAL before reaching end-of-backup WAL record, or
05779          * minRecoveryPoint. That's usually a bad sign, indicating that you
05780          * tried to recover from an online backup but never called
05781          * pg_stop_backup(), or you didn't archive all the WAL up to that
05782          * point. However, this also happens in crash recovery, if the system
05783          * crashes while an online backup is in progress. We must not treat
05784          * that as an error, or the database will refuse to start up.
05785          */
05786         if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
05787         {
05788             if (ControlFile->backupEndRequired)
05789                 ereport(FATAL,
05790                         (errmsg("WAL ends before end of online backup"),
05791                          errhint("All WAL generated while online backup was taken must be available at recovery.")));
05792             else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
05793                 ereport(FATAL,
05794                         (errmsg("WAL ends before end of online backup"),
05795                          errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
05796             else
05797                 ereport(FATAL,
05798                       (errmsg("WAL ends before consistent recovery point")));
05799         }
05800     }
05801 
05802     /*
05803      * Consider whether we need to assign a new timeline ID.
05804      *
05805      * If we are doing an archive recovery, we always assign a new ID.  This
05806      * handles a couple of issues.  If we stopped short of the end of WAL
05807      * during recovery, then we are clearly generating a new timeline and must
05808      * assign it a unique new ID.  Even if we ran to the end, modifying the
05809      * current last segment is problematic because it may result in trying to
05810      * overwrite an already-archived copy of that segment, and we encourage
05811      * DBAs to make their archive_commands reject that.  We can dodge the
05812      * problem by making the new active segment have a new timeline ID.
05813      *
05814      * In a normal crash recovery, we can just extend the timeline we were in.
05815      */
05816     PrevTimeLineID = ThisTimeLineID;
05817     if (ArchiveRecoveryRequested)
05818     {
05819         char    reason[200];
05820 
05821         Assert(InArchiveRecovery);
05822 
05823         ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
05824         ereport(LOG,
05825                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
05826 
05827         /*
05828          * Create a comment for the history file to explain why and where
05829          * timeline changed.
05830          */
05831         if (recoveryTarget == RECOVERY_TARGET_XID)
05832             snprintf(reason, sizeof(reason),
05833                      "%s transaction %u",
05834                      recoveryStopAfter ? "after" : "before",
05835                      recoveryStopXid);
05836         else if (recoveryTarget == RECOVERY_TARGET_TIME)
05837             snprintf(reason, sizeof(reason),
05838                      "%s %s\n",
05839                      recoveryStopAfter ? "after" : "before",
05840                      timestamptz_to_str(recoveryStopTime));
05841         else if (recoveryTarget == RECOVERY_TARGET_NAME)
05842             snprintf(reason, sizeof(reason),
05843                      "at restore point \"%s\"",
05844                      recoveryStopName);
05845         else
05846             snprintf(reason, sizeof(reason), "no recovery target specified");
05847 
05848         writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
05849                              EndRecPtr, reason);
05850     }
05851 
05852     /* Save the selected TimeLineID in shared memory, too */
05853     XLogCtl->ThisTimeLineID = ThisTimeLineID;
05854     XLogCtl->PrevTimeLineID = PrevTimeLineID;
05855 
05856     /*
05857      * We are now done reading the old WAL.  Turn off archive fetching if it
05858      * was active, and make a writable copy of the last WAL segment. (Note
05859      * that we also have a copy of the last block of the old WAL in readBuf;
05860      * we will use that below.)
05861      */
05862     if (ArchiveRecoveryRequested)
05863         exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
05864 
05865     /*
05866      * Prepare to write WAL starting at EndOfLog position, and init xlog
05867      * buffer cache using the block containing the last record from the
05868      * previous incarnation.
05869      */
05870     openLogSegNo = endLogSegNo;
05871     openLogFile = XLogFileOpen(openLogSegNo);
05872     openLogOff = 0;
05873     Insert = &XLogCtl->Insert;
05874     Insert->PrevRecord = LastRec;
05875     XLogCtl->xlblocks[0] = ((EndOfLog - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
05876 
05877     /*
05878      * Tricky point here: readBuf contains the *last* block that the LastRec
05879      * record spans, not the one it starts in.  The last block is indeed the
05880      * one we want to use.
05881      */
05882     if (EndOfLog % XLOG_BLCKSZ == 0)
05883     {
05884         memset(Insert->currpage, 0, XLOG_BLCKSZ);
05885     }
05886     else
05887     {
05888         Assert(readOff == (XLogCtl->xlblocks[0] - XLOG_BLCKSZ) % XLogSegSize);
05889         memcpy((char *) Insert->currpage, xlogreader->readBuf, XLOG_BLCKSZ);
05890     }
05891     Insert->currpos = (char *) Insert->currpage +
05892         (EndOfLog + XLOG_BLCKSZ - XLogCtl->xlblocks[0]);
05893 
05894     LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
05895 
05896     XLogCtl->LogwrtResult = LogwrtResult;
05897 
05898     XLogCtl->LogwrtRqst.Write = EndOfLog;
05899     XLogCtl->LogwrtRqst.Flush = EndOfLog;
05900 
05901     freespace = INSERT_FREESPACE(Insert);
05902     if (freespace > 0)
05903     {
05904         /* Make sure rest of page is zero */
05905         MemSet(Insert->currpos, 0, freespace);
05906         XLogCtl->Write.curridx = 0;
05907     }
05908     else
05909     {
05910         /*
05911          * Whenever LogwrtResult points to exactly the end of a page,
05912          * Write.curridx must point to the *next* page (see XLogWrite()).
05913          *
05914          * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
05915          * this is sufficient.  The first actual attempt to insert a log
05916          * record will advance the insert state.
05917          */
05918         XLogCtl->Write.curridx = NextBufIdx(0);
05919     }
05920 
05921     /* Pre-scan prepared transactions to find out the range of XIDs present */
05922     oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
05923 
05924     /*
05925      * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
05926      * record before resource manager writes cleanup WAL records or checkpoint
05927      * record is written.
05928      */
05929     Insert->fullPageWrites = lastFullPageWrites;
05930     LocalSetXLogInsertAllowed();
05931     UpdateFullPageWrites();
05932     LocalXLogInsertAllowed = -1;
05933 
05934     if (InRecovery)
05935     {
05936         int         rmid;
05937 
05938         /*
05939          * Resource managers might need to write WAL records, eg, to record
05940          * index cleanup actions.  So temporarily enable XLogInsertAllowed in
05941          * this process only.
05942          */
05943         LocalSetXLogInsertAllowed();
05944 
05945         /*
05946          * Allow resource managers to do any required cleanup.
05947          */
05948         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
05949         {
05950             if (RmgrTable[rmid].rm_cleanup != NULL)
05951                 RmgrTable[rmid].rm_cleanup();
05952         }
05953 
05954         /* Disallow XLogInsert again */
05955         LocalXLogInsertAllowed = -1;
05956 
05957         /*
05958          * Perform a checkpoint to update all our recovery activity to disk.
05959          *
05960          * Note that we write a shutdown checkpoint rather than an on-line
05961          * one. This is not particularly critical, but since we may be
05962          * assigning a new TLI, using a shutdown checkpoint allows us to have
05963          * the rule that TLI only changes in shutdown checkpoints, which
05964          * allows some extra error checking in xlog_redo.
05965          *
05966          * In fast promotion, only create a lightweight end-of-recovery record
05967          * instead of a full checkpoint. A checkpoint is requested later, after
05968          * we're fully out of recovery mode and already accepting queries.
05969          */
05970         if (bgwriterLaunched)
05971         {
05972             if (fast_promote)
05973             {
05974                 checkPointLoc = ControlFile->prevCheckPoint;
05975 
05976                 /*
05977                  * Confirm the last checkpoint is available for us to recover
05978                  * from if we fail. Note that we don't check for the secondary
05979                  * checkpoint since that isn't available in most base backups.
05980                  */
05981                 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
05982                 if (record != NULL)
05983                 {
05984                     fast_promoted = true;
05985                     CreateEndOfRecoveryRecord();
05986                 }
05987             }
05988 
05989             if (!fast_promoted)
05990                 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
05991                                     CHECKPOINT_IMMEDIATE |
05992                                     CHECKPOINT_WAIT);
05993         }
05994         else
05995             CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
05996 
05997         /*
05998          * And finally, execute the recovery_end_command, if any.
05999          */
06000         if (recoveryEndCommand)
06001             ExecuteRecoveryCommand(recoveryEndCommand,
06002                                    "recovery_end_command",
06003                                    true);
06004     }
06005 
06006     /*
06007      * Preallocate additional log files, if wanted.
06008      */
06009     PreallocXlogFiles(EndOfLog);
06010 
06011     /*
06012      * Reset initial contents of unlogged relations.  This has to be done
06013      * AFTER recovery is complete so that any unlogged relations created
06014      * during recovery also get picked up.
06015      */
06016     if (InRecovery)
06017         ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
06018 
06019     /*
06020      * Okay, we're officially UP.
06021      */
06022     InRecovery = false;
06023 
06024     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
06025     ControlFile->state = DB_IN_PRODUCTION;
06026     ControlFile->time = (pg_time_t) time(NULL);
06027     UpdateControlFile();
06028     LWLockRelease(ControlFileLock);
06029 
06030     /* start the archive_timeout timer running */
06031     XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
06032 
06033     /* also initialize latestCompletedXid, to nextXid - 1 */
06034     LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
06035     ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
06036     TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
06037     LWLockRelease(ProcArrayLock);
06038 
06039     /*
06040      * Start up the commit log and subtrans, if not already done for hot
06041      * standby.
06042      */
06043     if (standbyState == STANDBY_DISABLED)
06044     {
06045         StartupCLOG();
06046         StartupSUBTRANS(oldestActiveXID);
06047     }
06048 
06049     /*
06050      * Perform end of recovery actions for any SLRUs that need it.
06051      */
06052     StartupMultiXact();
06053     TrimCLOG();
06054 
06055     /* Reload shared-memory state for prepared transactions */
06056     RecoverPreparedTransactions();
06057 
06058     /*
06059      * Shutdown the recovery environment. This must occur after
06060      * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
06061      */
06062     if (standbyState != STANDBY_DISABLED)
06063         ShutdownRecoveryTransactionEnvironment();
06064 
06065     /* Shut down xlogreader */
06066     if (readFile >= 0)
06067     {
06068         close(readFile);
06069         readFile = -1;
06070     }
06071     XLogReaderFree(xlogreader);
06072 
06073     /*
06074      * If any of the critical GUCs have changed, log them before we allow
06075      * backends to write WAL.
06076      */
06077     LocalSetXLogInsertAllowed();
06078     XLogReportParameters();
06079 
06080     /*
06081      * All done.  Allow backends to write WAL.  (Although the bool flag is
06082      * probably atomic in itself, we use the info_lck here to ensure that
06083      * there are no race conditions concerning visibility of other recent
06084      * updates to shared memory.)
06085      */
06086     {
06087         /* use volatile pointer to prevent code rearrangement */
06088         volatile XLogCtlData *xlogctl = XLogCtl;
06089 
06090         SpinLockAcquire(&xlogctl->info_lck);
06091         xlogctl->SharedRecoveryInProgress = false;
06092         SpinLockRelease(&xlogctl->info_lck);
06093     }
06094 
06095     /*
06096      * If there were cascading standby servers connected to us, nudge any
06097      * wal sender processes to notice that we've been promoted.
06098      */
06099     WalSndWakeup();
06100 
06101     /*
06102      * If this was a fast promotion, request an (online) checkpoint now. This
06103      * isn't required for consistency, but the last restartpoint might be far
06104      * back, and in case of a crash, recovering from it might take longer
06105      * than is appropriate now that we're not in standby mode anymore.
06106      */
06107     if (fast_promoted)
06108         RequestCheckpoint(0);
06109 }
06110 
06111 /*
06112  * Checks if recovery has reached a consistent state. When consistency is
06113  * reached and we have a valid starting standby snapshot, tell postmaster
06114  * that it can start accepting read-only connections.
06115  */
06116 static void
06117 CheckRecoveryConsistency(void)
06118 {
06119     /*
06120      * During crash recovery, we don't reach a consistent state until we've
06121      * replayed all the WAL.
06122      */
06123     if (XLogRecPtrIsInvalid(minRecoveryPoint))
06124         return;
06125 
06126     /*
06127      * Have we reached the point where our base backup was completed?
06128      */
06129     if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
06130         ControlFile->backupEndPoint <= EndRecPtr)
06131     {
06132         /*
06133          * We have reached the end of base backup, as indicated by pg_control.
06134          * The data on disk is now consistent. Reset backupStartPoint and
06135          * backupEndPoint, and update minRecoveryPoint to make sure we don't
06136          * allow starting up at an earlier point even if recovery is stopped
06137          * and restarted soon after this.
06138          */
06139         elog(DEBUG1, "end of backup reached");
06140 
06141         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
06142 
06143         if (ControlFile->minRecoveryPoint < EndRecPtr)
06144             ControlFile->minRecoveryPoint = EndRecPtr;
06145 
06146         ControlFile->backupStartPoint = InvalidXLogRecPtr;
06147         ControlFile->backupEndPoint = InvalidXLogRecPtr;
06148         ControlFile->backupEndRequired = false;
06149         UpdateControlFile();
06150 
06151         LWLockRelease(ControlFileLock);
06152     }
06153 
06154     /*
06155      * Have we passed our safe starting point? Note that minRecoveryPoint
06156      * is known to be incorrectly set if ControlFile->backupEndRequired,
06157      * until the XLOG_BACKUP_RECORD arrives to advise us of the correct
06158      * minRecoveryPoint. All we know prior to that is that we're not
06159      * consistent yet.
06160      */
06161     if (!reachedConsistency && !ControlFile->backupEndRequired &&
06162         minRecoveryPoint <= XLogCtl->lastReplayedEndRecPtr &&
06163         XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
06164     {
06165         /*
06166          * Check to see if the XLOG sequence contained any unresolved
06167          * references to uninitialized pages.
06168          */
06169         XLogCheckInvalidPages();
06170 
06171         reachedConsistency = true;
06172         ereport(LOG,
06173                 (errmsg("consistent recovery state reached at %X/%X",
06174                         (uint32) (XLogCtl->lastReplayedEndRecPtr >> 32),
06175                         (uint32) XLogCtl->lastReplayedEndRecPtr)));
06176     }
06177 
06178     /*
06179      * Have we got a valid starting snapshot that will allow queries to be
06180      * run? If so, we can tell postmaster that the database is consistent now,
06181      * enabling connections.
06182      */
06183     if (standbyState == STANDBY_SNAPSHOT_READY &&
06184         !LocalHotStandbyActive &&
06185         reachedConsistency &&
06186         IsUnderPostmaster)
06187     {
06188         /* use volatile pointer to prevent code rearrangement */
06189         volatile XLogCtlData *xlogctl = XLogCtl;
06190 
06191         SpinLockAcquire(&xlogctl->info_lck);
06192         xlogctl->SharedHotStandbyActive = true;
06193         SpinLockRelease(&xlogctl->info_lck);
06194 
06195         LocalHotStandbyActive = true;
06196 
06197         SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
06198     }
06199 }
06200 
06201 /*
06202  * Is the system still in recovery?
06203  *
06204  * Unlike testing InRecovery, this works in any process that's connected to
06205  * shared memory.
06206  *
06207  * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
06208  * variables the first time we see that recovery is finished.
06209  */
06210 bool
06211 RecoveryInProgress(void)
06212 {
06213     /*
06214      * We check shared state each time only until we leave recovery mode. We
06215      * can't re-enter recovery, so there's no need to keep checking after the
06216      * shared variable has once been seen false.
06217      */
06218     if (!LocalRecoveryInProgress)
06219         return false;
06220     else
06221     {
06222         /* use volatile pointer to prevent code rearrangement */
06223         volatile XLogCtlData *xlogctl = XLogCtl;
06224 
06225         /* spinlock is essential on machines with weak memory ordering! */
06226         SpinLockAcquire(&xlogctl->info_lck);
06227         LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
06228         SpinLockRelease(&xlogctl->info_lck);
06229 
06230         /*
06231          * Initialize TimeLineID and RedoRecPtr when we discover that recovery
06232          * is finished. InitPostgres() relies upon this behaviour to ensure
06233          * that InitXLOGAccess() is called at backend startup.  (If you change
06234          * this, see also LocalSetXLogInsertAllowed.)
06235          */
06236         if (!LocalRecoveryInProgress)
06237             InitXLOGAccess();
06238 
06239         return LocalRecoveryInProgress;
06240     }
06241 }
06242 
06243 /*
06244  * Is HotStandby active yet? This is only important in special backends
06245  * since normal backends won't ever be able to connect until this returns
06246  * true. Postmaster knows this by way of signal, not via shared memory.
06247  *
06248  * Unlike testing standbyState, this works in any process that's connected to
06249  * shared memory.
06250  */
06251 bool
06252 HotStandbyActive(void)
06253 {
06254     /*
06255      * We check shared state each time only until Hot Standby is active. We
06256      * can't de-activate Hot Standby, so there's no need to keep checking
06257      * after the shared variable has once been seen true.
06258      */
06259     if (LocalHotStandbyActive)
06260         return true;
06261     else
06262     {
06263         /* use volatile pointer to prevent code rearrangement */
06264         volatile XLogCtlData *xlogctl = XLogCtl;
06265 
06266         /* spinlock is essential on machines with weak memory ordering! */
06267         SpinLockAcquire(&xlogctl->info_lck);
06268         LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
06269         SpinLockRelease(&xlogctl->info_lck);
06270 
06271         return LocalHotStandbyActive;
06272     }
06273 }
06274 
06275 /*
06276  * Is this process allowed to insert new WAL records?
06277  *
06278  * Ordinarily this is essentially equivalent to !RecoveryInProgress().
06279  * But we also have provisions for forcing the result "true" or "false"
06280  * within specific processes regardless of the global state.
06281  */
06282 bool
06283 XLogInsertAllowed(void)
06284 {
06285     /*
06286      * If value is "unconditionally true" or "unconditionally false", just
06287      * return it.  This provides the normal fast path once recovery is known
06288      * done.
06289      */
06290     if (LocalXLogInsertAllowed >= 0)
06291         return (bool) LocalXLogInsertAllowed;
06292 
06293     /*
06294      * Else, must check to see if we're still in recovery.
06295      */
06296     if (RecoveryInProgress())
06297         return false;
06298 
06299     /*
06300      * On exit from recovery, reset to "unconditionally true", since there is
06301      * no need to keep checking.
06302      */
06303     LocalXLogInsertAllowed = 1;
06304     return true;
06305 }
06306 
06307 /*
06308  * Make XLogInsertAllowed() return true in the current process only.
06309  *
06310  * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
06311  * and even call LocalSetXLogInsertAllowed() again after that.
06312  */
06313 static void
06314 LocalSetXLogInsertAllowed(void)
06315 {
06316     Assert(LocalXLogInsertAllowed == -1);
06317     LocalXLogInsertAllowed = 1;
06318 
06319     /* Initialize as RecoveryInProgress() would do when switching state */
06320     InitXLOGAccess();
06321 }
06322 
06323 /*
06324  * Subroutine to try to fetch and validate a prior checkpoint record.
06325  *
06326  * whichChkpt identifies the checkpoint (merely for reporting purposes).
06327  * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
06328  */
06329 static XLogRecord *
06330 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
06331                      int whichChkpt, bool report)
06332 {
06333     XLogRecord *record;
06334 
06335     if (!XRecOffIsValid(RecPtr))
06336     {
06337         if (!report)
06338             return NULL;
06339 
06340         switch (whichChkpt)
06341         {
06342             case 1:
06343                 ereport(LOG,
06344                 (errmsg("invalid primary checkpoint link in control file")));
06345                 break;
06346             case 2:
06347                 ereport(LOG,
06348                         (errmsg("invalid secondary checkpoint link in control file")));
06349                 break;
06350             default:
06351                 ereport(LOG,
06352                    (errmsg("invalid checkpoint link in backup_label file")));
06353                 break;
06354         }
06355         return NULL;
06356     }
06357 
06358     record = ReadRecord(xlogreader, RecPtr, LOG, true);
06359 
06360     if (record == NULL)
06361     {
06362         if (!report)
06363             return NULL;
06364 
06365         switch (whichChkpt)
06366         {
06367             case 1:
06368                 ereport(LOG,
06369                         (errmsg("invalid primary checkpoint record")));
06370                 break;
06371             case 2:
06372                 ereport(LOG,
06373                         (errmsg("invalid secondary checkpoint record")));
06374                 break;
06375             default:
06376                 ereport(LOG,
06377                         (errmsg("invalid checkpoint record")));
06378                 break;
06379         }
06380         return NULL;
06381     }
06382     if (record->xl_rmid != RM_XLOG_ID)
06383     {
06384         switch (whichChkpt)
06385         {
06386             case 1:
06387                 ereport(LOG,
06388                         (errmsg("invalid resource manager ID in primary checkpoint record")));
06389                 break;
06390             case 2:
06391                 ereport(LOG,
06392                         (errmsg("invalid resource manager ID in secondary checkpoint record")));
06393                 break;
06394             default:
06395                 ereport(LOG,
06396                 (errmsg("invalid resource manager ID in checkpoint record")));
06397                 break;
06398         }
06399         return NULL;
06400     }
06401     if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
06402         record->xl_info != XLOG_CHECKPOINT_ONLINE)
06403     {
06404         switch (whichChkpt)
06405         {
06406             case 1:
06407                 ereport(LOG,
06408                    (errmsg("invalid xl_info in primary checkpoint record")));
06409                 break;
06410             case 2:
06411                 ereport(LOG,
06412                  (errmsg("invalid xl_info in secondary checkpoint record")));
06413                 break;
06414             default:
06415                 ereport(LOG,
06416                         (errmsg("invalid xl_info in checkpoint record")));
06417                 break;
06418         }
06419         return NULL;
06420     }
06421     if (record->xl_len != sizeof(CheckPoint) ||
06422         record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
06423     {
06424         switch (whichChkpt)
06425         {
06426             case 1:
06427                 ereport(LOG,
06428                     (errmsg("invalid length of primary checkpoint record")));
06429                 break;
06430             case 2:
06431                 ereport(LOG,
06432                   (errmsg("invalid length of secondary checkpoint record")));
06433                 break;
06434             default:
06435                 ereport(LOG,
06436                         (errmsg("invalid length of checkpoint record")));
06437                 break;
06438         }
06439         return NULL;
06440     }
06441     return record;
06442 }
06443 
06444 /*
06445  * This must be called during startup of a backend process, except that
06446  * it need not be called in a standalone backend (which does StartupXLOG
06447  * instead).  We need to initialize the local copies of ThisTimeLineID and
06448  * RedoRecPtr.
06449  *
06450  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
06451  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
06452  * unnecessary however, since the postmaster itself never touches XLOG anyway.
06453  */
06454 void
06455 InitXLOGAccess(void)
06456 {
06457     /* ThisTimeLineID doesn't change so we need no lock to copy it */
06458     ThisTimeLineID = XLogCtl->ThisTimeLineID;
06459     Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
06460 
06461     /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
06462     (void) GetRedoRecPtr();
06463 }
06464 
06465 /*
06466  * Once spawned, a backend may update its local RedoRecPtr from
06467  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
06468  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
06469  */
06470 XLogRecPtr
06471 GetRedoRecPtr(void)
06472 {
06473     /* use volatile pointer to prevent code rearrangement */
06474     volatile XLogCtlData *xlogctl = XLogCtl;
06475 
06476     SpinLockAcquire(&xlogctl->info_lck);
06477     Assert(RedoRecPtr <= xlogctl->Insert.RedoRecPtr);
06478     RedoRecPtr = xlogctl->Insert.RedoRecPtr;
06479     SpinLockRelease(&xlogctl->info_lck);
06480 
06481     return RedoRecPtr;
06482 }
06483 
06484 /*
06485  * GetInsertRecPtr -- Returns the current insert position.
06486  *
06487  * NOTE: The value *actually* returned is the position of the last full
06488  * xlog page. It lags behind the real insert position by at most 1 page.
06489  * For that, we don't need to acquire WALInsertLock which can be quite
06490  * heavily contended, and an approximation is enough for the current
06491  * usage of this function.
06492  */
06493 XLogRecPtr
06494 GetInsertRecPtr(void)
06495 {
06496     /* use volatile pointer to prevent code rearrangement */
06497     volatile XLogCtlData *xlogctl = XLogCtl;
06498     XLogRecPtr  recptr;
06499 
06500     SpinLockAcquire(&xlogctl->info_lck);
06501     recptr = xlogctl->LogwrtRqst.Write;
06502     SpinLockRelease(&xlogctl->info_lck);
06503 
06504     return recptr;
06505 }
06506 
06507 /*
06508  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
06509  * position known to be fsync'd to disk.
06510  */
06511 XLogRecPtr
06512 GetFlushRecPtr(void)
06513 {
06514     /* use volatile pointer to prevent code rearrangement */
06515     volatile XLogCtlData *xlogctl = XLogCtl;
06516     XLogRecPtr  recptr;
06517 
06518     SpinLockAcquire(&xlogctl->info_lck);
06519     recptr = xlogctl->LogwrtResult.Flush;
06520     SpinLockRelease(&xlogctl->info_lck);
06521 
06522     return recptr;
06523 }
06524 
06525 /*
06526  * Get the time of the last xlog segment switch
06527  */
06528 pg_time_t
06529 GetLastSegSwitchTime(void)
06530 {
06531     pg_time_t   result;
06532 
06533     /* Need WALWriteLock, but shared lock is sufficient */
06534     LWLockAcquire(WALWriteLock, LW_SHARED);
06535     result = XLogCtl->Write.lastSegSwitchTime;
06536     LWLockRelease(WALWriteLock);
06537 
06538     return result;
06539 }
06540 
06541 /*
06542  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
06543  *
06544  * This is exported for use by code that would like to have 64-bit XIDs.
06545  * We don't really support such things, but all XIDs within the system
06546  * can be presumed "close to" the result, and thus the epoch associated
06547  * with them can be determined.
06548  */
06549 void
06550 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
06551 {
06552     uint32      ckptXidEpoch;
06553     TransactionId ckptXid;
06554     TransactionId nextXid;
06555 
06556     /* Must read checkpoint info first, else have race condition */
06557     {
06558         /* use volatile pointer to prevent code rearrangement */
06559         volatile XLogCtlData *xlogctl = XLogCtl;
06560 
06561         SpinLockAcquire(&xlogctl->info_lck);
06562         ckptXidEpoch = xlogctl->ckptXidEpoch;
06563         ckptXid = xlogctl->ckptXid;
06564         SpinLockRelease(&xlogctl->info_lck);
06565     }
06566 
06567     /* Now fetch current nextXid */
06568     nextXid = ReadNewTransactionId();
06569 
06570     /*
06571      * nextXid is certainly logically later than ckptXid.  So if it's
06572      * numerically less, it must have wrapped into the next epoch.
06573      */
06574     if (nextXid < ckptXid)
06575         ckptXidEpoch++;
06576 
06577     *xid = nextXid;
06578     *epoch = ckptXidEpoch;
06579 }
06580 
06581 /*
06582  * This must be called ONCE during postmaster or standalone-backend shutdown
06583  */
06584 void
06585 ShutdownXLOG(int code, Datum arg)
06586 {
06587     ereport(LOG,
06588             (errmsg("shutting down")));
06589 
06590     if (RecoveryInProgress())
06591         CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
06592     else
06593     {
06594         /*
06595          * If archiving is enabled, rotate the last XLOG file so that all the
06596          * remaining records are archived (postmaster wakes up the archiver
06597          * process one more time at the end of shutdown). The checkpoint
06598          * record will go to the next XLOG file and won't be archived (yet).
06599          */
06600         if (XLogArchivingActive() && XLogArchiveCommandSet())
06601             RequestXLogSwitch();
06602 
06603         CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
06604     }
06605     ShutdownCLOG();
06606     ShutdownSUBTRANS();
06607     ShutdownMultiXact();
06608 
06609     ereport(LOG,
06610             (errmsg("database system is shut down")));
06611 }
06612 
06613 /*
06614  * Log start of a checkpoint.
06615  */
06616 static void
06617 LogCheckpointStart(int flags, bool restartpoint)
06618 {
06619     const char *msg;
06620 
06621     /*
06622      * XXX: This is hopelessly untranslatable. We could call gettext_noop for
06623      * the main message, but what about all the flags?
06624      */
06625     if (restartpoint)
06626         msg = "restartpoint starting:%s%s%s%s%s%s%s";
06627     else
06628         msg = "checkpoint starting:%s%s%s%s%s%s%s";
06629 
06630     elog(LOG, msg,
06631          (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
06632          (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
06633          (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
06634          (flags & CHECKPOINT_FORCE) ? " force" : "",
06635          (flags & CHECKPOINT_WAIT) ? " wait" : "",
06636          (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
06637          (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
06638 }
06639 
06640 /*
06641  * Log end of a checkpoint.
06642  */
06643 static void
06644 LogCheckpointEnd(bool restartpoint)
06645 {
06646     long        write_secs,
06647                 sync_secs,
06648                 total_secs,
06649                 longest_secs,
06650                 average_secs;
06651     int         write_usecs,
06652                 sync_usecs,
06653                 total_usecs,
06654                 longest_usecs,
06655                 average_usecs;
06656     uint64      average_sync_time;
06657 
06658     CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
06659 
06660     TimestampDifference(CheckpointStats.ckpt_write_t,
06661                         CheckpointStats.ckpt_sync_t,
06662                         &write_secs, &write_usecs);
06663 
06664     TimestampDifference(CheckpointStats.ckpt_sync_t,
06665                         CheckpointStats.ckpt_sync_end_t,
06666                         &sync_secs, &sync_usecs);
06667 
06668     /* Accumulate checkpoint timing summary data, in milliseconds. */
06669     BgWriterStats.m_checkpoint_write_time +=
06670         write_secs * 1000 + write_usecs / 1000;
06671     BgWriterStats.m_checkpoint_sync_time +=
06672         sync_secs * 1000 + sync_usecs / 1000;
06673 
06674     /*
06675      * All of the published timing statistics are accounted for.  Only
06676      * continue if a log message is to be written.
06677      */
06678     if (!log_checkpoints)
06679         return;
06680 
06681     TimestampDifference(CheckpointStats.ckpt_start_t,
06682                         CheckpointStats.ckpt_end_t,
06683                         &total_secs, &total_usecs);
06684 
06685     /*
06686      * Timing values returned from CheckpointStats are in microseconds.
06687      * Convert to the second plus microsecond form that TimestampDifference
06688      * returns for homogeneous printing.
06689      */
06690     longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
06691     longest_usecs = CheckpointStats.ckpt_longest_sync -
06692         (uint64) longest_secs *1000000;
06693 
06694     average_sync_time = 0;
06695     if (CheckpointStats.ckpt_sync_rels > 0)
06696         average_sync_time = CheckpointStats.ckpt_agg_sync_time /
06697             CheckpointStats.ckpt_sync_rels;
06698     average_secs = (long) (average_sync_time / 1000000);
06699     average_usecs = average_sync_time - (uint64) average_secs *1000000;
06700 
06701     if (restartpoint)
06702         elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
06703              "%d transaction log file(s) added, %d removed, %d recycled; "
06704              "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
06705              "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
06706              CheckpointStats.ckpt_bufs_written,
06707              (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
06708              CheckpointStats.ckpt_segs_added,
06709              CheckpointStats.ckpt_segs_removed,
06710              CheckpointStats.ckpt_segs_recycled,
06711              write_secs, write_usecs / 1000,
06712              sync_secs, sync_usecs / 1000,
06713              total_secs, total_usecs / 1000,
06714              CheckpointStats.ckpt_sync_rels,
06715              longest_secs, longest_usecs / 1000,
06716              average_secs, average_usecs / 1000);
06717     else
06718         elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
06719              "%d transaction log file(s) added, %d removed, %d recycled; "
06720              "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
06721              "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
06722              CheckpointStats.ckpt_bufs_written,
06723              (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
06724              CheckpointStats.ckpt_segs_added,
06725              CheckpointStats.ckpt_segs_removed,
06726              CheckpointStats.ckpt_segs_recycled,
06727              write_secs, write_usecs / 1000,
06728              sync_secs, sync_usecs / 1000,
06729              total_secs, total_usecs / 1000,
06730              CheckpointStats.ckpt_sync_rels,
06731              longest_secs, longest_usecs / 1000,
06732              average_secs, average_usecs / 1000);
06733 }
06734 
06735 /*
06736  * Perform a checkpoint --- either during shutdown, or on-the-fly
06737  *
06738  * flags is a bitwise OR of the following:
06739  *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
06740  *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
06741  *  CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
06742  *      ignoring checkpoint_completion_target parameter.
06743  *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
06744  *      since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
06745  *      CHECKPOINT_END_OF_RECOVERY).
06746  *
06747  * Note: flags contains other bits, of interest here only for logging purposes.
06748  * In particular note that this routine is synchronous and does not pay
06749  * attention to CHECKPOINT_WAIT.
06750  *
06751  * If !shutdown then we are writing an online checkpoint. This is a very special
06752  * kind of operation and WAL record because the checkpoint action occurs over
06753  * a period of time yet logically occurs at just a single LSN. The logical
06754  * position of the WAL record (redo ptr) is the same or earlier than the
06755  * physical position. When we replay WAL we locate the checkpoint via its
06756  * physical position then read the redo ptr and actually start replay at the
06757  * earlier logical position. Note that we don't write *anything* to WAL at
06758  * the logical position, so that location could be any other kind of WAL record.
06759  * All of this mechanism allows us to continue working while we checkpoint.
06760  * As a result, timing of actions is critical here and be careful to note that
06761  * this function will likely take minutes to execute on a busy system.
06762  */
06763 void
06764 CreateCheckPoint(int flags)
06765 {
06766     bool        shutdown;
06767     CheckPoint  checkPoint;
06768     XLogRecPtr  recptr;
06769     XLogCtlInsert *Insert = &XLogCtl->Insert;
06770     XLogRecData rdata;
06771     uint32      freespace;
06772     XLogSegNo   _logSegNo;
06773     VirtualTransactionId *vxids;
06774     int nvxids;
06775 
06776     /*
06777      * An end-of-recovery checkpoint is really a shutdown checkpoint, just
06778      * issued at a different time.
06779      */
06780     if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
06781         shutdown = true;
06782     else
06783         shutdown = false;
06784 
06785     /* sanity check */
06786     if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
06787         elog(ERROR, "can't create a checkpoint during recovery");
06788 
06789     /*
06790      * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
06791      * (This is just pro forma, since in the present system structure there is
06792      * only one process that is allowed to issue checkpoints at any given
06793      * time.)
06794      */
06795     LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
06796 
06797     /*
06798      * Prepare to accumulate statistics.
06799      *
06800      * Note: because it is possible for log_checkpoints to change while a
06801      * checkpoint proceeds, we always accumulate stats, even if
06802      * log_checkpoints is currently off.
06803      */
06804     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
06805     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
06806 
06807     /*
06808      * Use a critical section to force system panic if we have trouble.
06809      */
06810     START_CRIT_SECTION();
06811 
06812     if (shutdown)
06813     {
06814         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
06815         ControlFile->state = DB_SHUTDOWNING;
06816         ControlFile->time = (pg_time_t) time(NULL);
06817         UpdateControlFile();
06818         LWLockRelease(ControlFileLock);
06819     }
06820 
06821     /*
06822      * Let smgr prepare for checkpoint; this has to happen before we determine
06823      * the REDO pointer.  Note that smgr must not do anything that'd have to
06824      * be undone if we decide no checkpoint is needed.
06825      */
06826     smgrpreckpt();
06827 
06828     /* Begin filling in the checkpoint WAL record */
06829     MemSet(&checkPoint, 0, sizeof(checkPoint));
06830     checkPoint.time = (pg_time_t) time(NULL);
06831 
06832     /*
06833      * For Hot Standby, derive the oldestActiveXid before we fix the redo
06834      * pointer. This allows us to begin accumulating changes to assemble our
06835      * starting snapshot of locks and transactions.
06836      */
06837     if (!shutdown && XLogStandbyInfoActive())
06838         checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
06839     else
06840         checkPoint.oldestActiveXid = InvalidTransactionId;
06841 
06842     /*
06843      * We must hold WALInsertLock while examining insert state to determine
06844      * the checkpoint REDO pointer.
06845      */
06846     LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
06847 
06848     /*
06849      * If this isn't a shutdown or forced checkpoint, and we have not inserted
06850      * any XLOG records since the start of the last checkpoint, skip the
06851      * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
06852      * when the system is idle. That wastes log space, and more importantly it
06853      * exposes us to possible loss of both current and previous checkpoint
06854      * records if the machine crashes just as we're writing the update.
06855      * (Perhaps it'd make even more sense to checkpoint only when the previous
06856      * checkpoint record is in a different xlog page?)
06857      *
06858      * We have to make two tests to determine that nothing has happened since
06859      * the start of the last checkpoint: current insertion point must match
06860      * the end of the last checkpoint record, and its redo pointer must point
06861      * to itself.
06862      */
06863     if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
06864                   CHECKPOINT_FORCE)) == 0)
06865     {
06866         XLogRecPtr  curInsert;
06867 
06868         INSERT_RECPTR(curInsert, Insert, Insert->curridx);
06869         if (curInsert == ControlFile->checkPoint + 
06870             MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
06871             ControlFile->checkPoint == ControlFile->checkPointCopy.redo)
06872         {
06873             LWLockRelease(WALInsertLock);
06874             LWLockRelease(CheckpointLock);
06875             END_CRIT_SECTION();
06876             return;
06877         }
06878     }
06879 
06880     /*
06881      * An end-of-recovery checkpoint is created before anyone is allowed to
06882      * write WAL. To allow us to write the checkpoint record, temporarily
06883      * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
06884      * initialized, which we need here and in AdvanceXLInsertBuffer.)
06885      */
06886     if (flags & CHECKPOINT_END_OF_RECOVERY)
06887         LocalSetXLogInsertAllowed();
06888 
06889     checkPoint.ThisTimeLineID = ThisTimeLineID;
06890     if (flags & CHECKPOINT_END_OF_RECOVERY)
06891         checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
06892     else
06893         checkPoint.PrevTimeLineID = ThisTimeLineID;
06894 
06895     checkPoint.fullPageWrites = Insert->fullPageWrites;
06896 
06897     /*
06898      * Compute new REDO record ptr = location of next XLOG record.
06899      *
06900      * NB: this is NOT necessarily where the checkpoint record itself will be,
06901      * since other backends may insert more XLOG records while we're off doing
06902      * the buffer flush work.  Those XLOG records are logically after the
06903      * checkpoint, even though physically before it.  Got that?
06904      */
06905     freespace = INSERT_FREESPACE(Insert);
06906     if (freespace == 0)
06907     {
06908         (void) AdvanceXLInsertBuffer(false);
06909         /* OK to ignore update return flag, since we will do flush anyway */
06910         freespace = INSERT_FREESPACE(Insert);
06911     }
06912     INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
06913 
06914     /*
06915      * Here we update the shared RedoRecPtr for future XLogInsert calls; this
06916      * must be done while holding the insert lock AND the info_lck.
06917      *
06918      * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
06919      * pointing past where it really needs to point.  This is okay; the only
06920      * consequence is that XLogInsert might back up whole buffers that it
06921      * didn't really need to.  We can't postpone advancing RedoRecPtr because
06922      * XLogInserts that happen while we are dumping buffers must assume that
06923      * their buffer changes are not included in the checkpoint.
06924      */
06925     {
06926         /* use volatile pointer to prevent code rearrangement */
06927         volatile XLogCtlData *xlogctl = XLogCtl;
06928 
06929         SpinLockAcquire(&xlogctl->info_lck);
06930         RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
06931         SpinLockRelease(&xlogctl->info_lck);
06932     }
06933 
06934     /*
06935      * Now we can release WAL insert lock, allowing other xacts to proceed
06936      * while we are flushing disk buffers.
06937      */
06938     LWLockRelease(WALInsertLock);
06939 
06940     /*
06941      * If enabled, log checkpoint start.  We postpone this until now so as not
06942      * to log anything if we decided to skip the checkpoint.
06943      */
06944     if (log_checkpoints)
06945         LogCheckpointStart(flags, false);
06946 
06947     TRACE_POSTGRESQL_CHECKPOINT_START(flags);
06948 
06949     /*
06950      * In some cases there are groups of actions that must all occur on
06951      * one side or the other of a checkpoint record. Before flushing the
06952      * checkpoint record we must explicitly wait for any backend currently
06953      * performing those groups of actions.
06954      *
06955      * One example is end of transaction, so we must wait for any transactions
06956      * that are currently in commit critical sections.  If an xact inserted
06957      * its commit record into XLOG just before the REDO point, then a crash
06958      * restart from the REDO point would not replay that record, which means
06959      * that our flushing had better include the xact's update of pg_clog.  So
06960      * we wait till he's out of his commit critical section before proceeding.
06961      * See notes in RecordTransactionCommit().
06962      *
06963      * Because we've already released WALInsertLock, this test is a bit fuzzy:
06964      * it is possible that we will wait for xacts we didn't really need to
06965      * wait for.  But the delay should be short and it seems better to make
06966      * checkpoint take a bit longer than to hold locks longer than necessary.
06967      * (In fact, the whole reason we have this issue is that xact.c does
06968      * commit record XLOG insertion and clog update as two separate steps
06969      * protected by different locks, but again that seems best on grounds of
06970      * minimizing lock contention.)
06971      *
06972      * A transaction that has not yet set delayChkpt when we look cannot be at
06973      * risk, since he's not inserted his commit record yet; and one that's
06974      * already cleared it is not at risk either, since he's done fixing clog
06975      * and we will correctly flush the update below.  So we cannot miss any
06976      * xacts we need to wait for.
06977      */
06978     vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
06979     if (nvxids > 0)
06980     {
06981         uint32  nwaits = 0;
06982 
06983         do
06984         {
06985             pg_usleep(10000L);  /* wait for 10 msec */
06986             nwaits++;
06987         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
06988     }
06989     pfree(vxids);
06990 
06991     /*
06992      * Get the other info we need for the checkpoint record.
06993      */
06994     LWLockAcquire(XidGenLock, LW_SHARED);
06995     checkPoint.nextXid = ShmemVariableCache->nextXid;
06996     checkPoint.oldestXid = ShmemVariableCache->oldestXid;
06997     checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
06998     LWLockRelease(XidGenLock);
06999 
07000     /* Increase XID epoch if we've wrapped around since last checkpoint */
07001     checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
07002     if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
07003         checkPoint.nextXidEpoch++;
07004 
07005     LWLockAcquire(OidGenLock, LW_SHARED);
07006     checkPoint.nextOid = ShmemVariableCache->nextOid;
07007     if (!shutdown)
07008         checkPoint.nextOid += ShmemVariableCache->oidCount;
07009     LWLockRelease(OidGenLock);
07010 
07011     MultiXactGetCheckptMulti(shutdown,
07012                              &checkPoint.nextMulti,
07013                              &checkPoint.nextMultiOffset,
07014                              &checkPoint.oldestMulti,
07015                              &checkPoint.oldestMultiDB);
07016 
07017     /*
07018      * Having constructed the checkpoint record, ensure all shmem disk buffers
07019      * and commit-log buffers are flushed to disk.
07020      *
07021      * This I/O could fail for various reasons.  If so, we will fail to
07022      * complete the checkpoint, but there is no reason to force a system
07023      * panic. Accordingly, exit critical section while doing it.
07024      */
07025     END_CRIT_SECTION();
07026 
07027     CheckPointGuts(checkPoint.redo, flags);
07028 
07029     /*
07030      * Take a snapshot of running transactions and write this to WAL. This
07031      * allows us to reconstruct the state of running transactions during
07032      * archive recovery, if required. Skip, if this info disabled.
07033      *
07034      * If we are shutting down, or Startup process is completing crash
07035      * recovery we don't need to write running xact data.
07036      */
07037     if (!shutdown && XLogStandbyInfoActive())
07038         LogStandbySnapshot();
07039 
07040     START_CRIT_SECTION();
07041 
07042     /*
07043      * Now insert the checkpoint record into XLOG.
07044      */
07045     rdata.data = (char *) (&checkPoint);
07046     rdata.len = sizeof(checkPoint);
07047     rdata.buffer = InvalidBuffer;
07048     rdata.next = NULL;
07049 
07050     recptr = XLogInsert(RM_XLOG_ID,
07051                         shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
07052                         XLOG_CHECKPOINT_ONLINE,
07053                         &rdata);
07054 
07055     XLogFlush(recptr);
07056 
07057     /*
07058      * We mustn't write any new WAL after a shutdown checkpoint, or it will be
07059      * overwritten at next startup.  No-one should even try, this just allows
07060      * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
07061      * to just temporarily disable writing until the system has exited
07062      * recovery.
07063      */
07064     if (shutdown)
07065     {
07066         if (flags & CHECKPOINT_END_OF_RECOVERY)
07067             LocalXLogInsertAllowed = -1;        /* return to "check" state */
07068         else
07069             LocalXLogInsertAllowed = 0; /* never again write WAL */
07070     }
07071 
07072     /*
07073      * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
07074      * = end of actual checkpoint record.
07075      */
07076     if (shutdown && checkPoint.redo != ProcLastRecPtr)
07077         ereport(PANIC,
07078                 (errmsg("concurrent transaction log activity while database system is shutting down")));
07079 
07080     /*
07081      * Select point at which we can truncate the log, which we base on the
07082      * prior checkpoint's earliest info.
07083      */
07084     XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
07085 
07086     /*
07087      * Update the control file.
07088      */
07089     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
07090     if (shutdown)
07091         ControlFile->state = DB_SHUTDOWNED;
07092     ControlFile->prevCheckPoint = ControlFile->checkPoint;
07093     ControlFile->checkPoint = ProcLastRecPtr;
07094     ControlFile->checkPointCopy = checkPoint;
07095     ControlFile->time = (pg_time_t) time(NULL);
07096     /* crash recovery should always recover to the end of WAL */
07097     ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
07098     ControlFile->minRecoveryPointTLI = 0;
07099 
07100     /*
07101      * Persist unloggedLSN value. It's reset on crash recovery, so this goes
07102      * unused on non-shutdown checkpoints, but seems useful to store it always
07103      * for debugging purposes.
07104      */
07105     SpinLockAcquire(&XLogCtl->ulsn_lck);
07106     ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
07107     SpinLockRelease(&XLogCtl->ulsn_lck);
07108 
07109     UpdateControlFile();
07110     LWLockRelease(ControlFileLock);
07111 
07112     /* Update shared-memory copy of checkpoint XID/epoch */
07113     {
07114         /* use volatile pointer to prevent code rearrangement */
07115         volatile XLogCtlData *xlogctl = XLogCtl;
07116 
07117         SpinLockAcquire(&xlogctl->info_lck);
07118         xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
07119         xlogctl->ckptXid = checkPoint.nextXid;
07120         SpinLockRelease(&xlogctl->info_lck);
07121     }
07122 
07123     /*
07124      * We are now done with critical updates; no need for system panic if we
07125      * have trouble while fooling with old log segments.
07126      */
07127     END_CRIT_SECTION();
07128 
07129     /*
07130      * Let smgr do post-checkpoint cleanup (eg, deleting old files).
07131      */
07132     smgrpostckpt();
07133 
07134     /*
07135      * Delete old log files (those no longer needed even for previous
07136      * checkpoint or the standbys in XLOG streaming).
07137      */
07138     if (_logSegNo)
07139     {
07140         KeepLogSeg(recptr, &_logSegNo);
07141         _logSegNo--;
07142         RemoveOldXlogFiles(_logSegNo, recptr);
07143     }
07144 
07145     /*
07146      * Make more log segments if needed.  (Do this after recycling old log
07147      * segments, since that may supply some of the needed files.)
07148      */
07149     if (!shutdown)
07150         PreallocXlogFiles(recptr);
07151 
07152     /*
07153      * Truncate pg_subtrans if possible.  We can throw away all data before
07154      * the oldest XMIN of any running transaction.  No future transaction will
07155      * attempt to reference any pg_subtrans entry older than that (see Asserts
07156      * in subtrans.c).  During recovery, though, we mustn't do this because
07157      * StartupSUBTRANS hasn't been called yet.
07158      */
07159     if (!RecoveryInProgress())
07160         TruncateSUBTRANS(GetOldestXmin(true, false));
07161 
07162     /* Real work is done, but log and update stats before releasing lock. */
07163     LogCheckpointEnd(false);
07164 
07165     TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
07166                                      NBuffers,
07167                                      CheckpointStats.ckpt_segs_added,
07168                                      CheckpointStats.ckpt_segs_removed,
07169                                      CheckpointStats.ckpt_segs_recycled);
07170 
07171     LWLockRelease(CheckpointLock);
07172 }
07173 
07174 /*
07175  * Mark the end of recovery in WAL though without running a full checkpoint.
07176  * We can expect that a restartpoint is likely to be in progress as we
07177  * do this, though we are unwilling to wait for it to complete. So be
07178  * careful to avoid taking the CheckpointLock anywhere here.
07179  *
07180  * CreateRestartPoint() allows for the case where recovery may end before
07181  * the restartpoint completes so there is no concern of concurrent behaviour.
07182  */
07183 void
07184 CreateEndOfRecoveryRecord(void)
07185 {
07186     xl_end_of_recovery  xlrec;
07187     XLogRecData         rdata;
07188     XLogRecPtr          recptr;
07189 
07190     /* sanity check */
07191     if (!RecoveryInProgress())
07192         elog(ERROR, "can only be used to end recovery");
07193 
07194     xlrec.end_time = time(NULL);
07195 
07196     LWLockAcquire(WALInsertLock, LW_SHARED);
07197     xlrec.ThisTimeLineID = ThisTimeLineID;
07198     xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
07199     LWLockRelease(WALInsertLock);
07200 
07201     LocalSetXLogInsertAllowed();
07202 
07203     START_CRIT_SECTION();
07204 
07205     rdata.data = (char *) &xlrec;
07206     rdata.len = sizeof(xl_end_of_recovery);
07207     rdata.buffer = InvalidBuffer;
07208     rdata.next = NULL;
07209 
07210     recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
07211 
07212     XLogFlush(recptr);
07213 
07214     /*
07215      * Update the control file so that crash recovery can follow
07216      * the timeline changes to this point.
07217      */
07218     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
07219     ControlFile->time = (pg_time_t) xlrec.end_time;
07220     ControlFile->minRecoveryPoint = recptr;
07221     ControlFile->minRecoveryPointTLI = ThisTimeLineID;
07222     UpdateControlFile();
07223     LWLockRelease(ControlFileLock);
07224 
07225     END_CRIT_SECTION();
07226 
07227     LocalXLogInsertAllowed = -1;        /* return to "check" state */
07228 }
07229 
07230 /*
07231  * Flush all data in shared memory to disk, and fsync
07232  *
07233  * This is the common code shared between regular checkpoints and
07234  * recovery restartpoints.
07235  */
07236 static void
07237 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
07238 {
07239     CheckPointCLOG();
07240     CheckPointSUBTRANS();
07241     CheckPointMultiXact();
07242     CheckPointPredicate();
07243     CheckPointRelationMap();
07244     CheckPointBuffers(flags);   /* performs all required fsyncs */
07245     /* We deliberately delay 2PC checkpointing as long as possible */
07246     CheckPointTwoPhase(checkPointRedo);
07247 }
07248 
07249 /*
07250  * Save a checkpoint for recovery restart if appropriate
07251  *
07252  * This function is called each time a checkpoint record is read from XLOG.
07253  * It must determine whether the checkpoint represents a safe restartpoint or
07254  * not.  If so, the checkpoint record is stashed in shared memory so that
07255  * CreateRestartPoint can consult it.  (Note that the latter function is
07256  * executed by the checkpointer, while this one will be executed by the
07257  * startup process.)
07258  */
07259 static void
07260 RecoveryRestartPoint(const CheckPoint *checkPoint)
07261 {
07262     int         rmid;
07263 
07264     /* use volatile pointer to prevent code rearrangement */
07265     volatile XLogCtlData *xlogctl = XLogCtl;
07266 
07267     /*
07268      * Is it safe to restartpoint?  We must ask each of the resource managers
07269      * whether they have any partial state information that might prevent a
07270      * correct restart from this point.  If so, we skip this opportunity, but
07271      * return at the next checkpoint record for another try.
07272      */
07273     for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
07274     {
07275         if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
07276             if (!(RmgrTable[rmid].rm_safe_restartpoint()))
07277             {
07278                 elog(trace_recovery(DEBUG2),
07279                      "RM %d not safe to record restart point at %X/%X",
07280                      rmid,
07281                      (uint32) (checkPoint->redo >> 32),
07282                      (uint32) checkPoint->redo);
07283                 return;
07284             }
07285     }
07286 
07287     /*
07288      * Also refrain from creating a restartpoint if we have seen any
07289      * references to non-existent pages. Restarting recovery from the
07290      * restartpoint would not see the references, so we would lose the
07291      * cross-check that the pages belonged to a relation that was dropped
07292      * later.
07293      */
07294     if (XLogHaveInvalidPages())
07295     {
07296         elog(trace_recovery(DEBUG2),
07297              "could not record restart point at %X/%X because there "
07298              "are unresolved references to invalid pages",
07299              (uint32) (checkPoint->redo >> 32),
07300              (uint32) checkPoint->redo);
07301         return;
07302     }
07303 
07304     /*
07305      * Copy the checkpoint record to shared memory, so that checkpointer can
07306      * work out the next time it wants to perform a restartpoint.
07307      */
07308     SpinLockAcquire(&xlogctl->info_lck);
07309     xlogctl->lastCheckPointRecPtr = ReadRecPtr;
07310     xlogctl->lastCheckPoint = *checkPoint;
07311     SpinLockRelease(&xlogctl->info_lck);
07312 }
07313 
07314 /*
07315  * Establish a restartpoint if possible.
07316  *
07317  * This is similar to CreateCheckPoint, but is used during WAL recovery
07318  * to establish a point from which recovery can roll forward without
07319  * replaying the entire recovery log.
07320  *
07321  * Returns true if a new restartpoint was established. We can only establish
07322  * a restartpoint if we have replayed a safe checkpoint record since last
07323  * restartpoint.
07324  */
07325 bool
07326 CreateRestartPoint(int flags)
07327 {
07328     XLogRecPtr  lastCheckPointRecPtr;
07329     CheckPoint  lastCheckPoint;
07330     XLogSegNo   _logSegNo;
07331     TimestampTz xtime;
07332 
07333     /* use volatile pointer to prevent code rearrangement */
07334     volatile XLogCtlData *xlogctl = XLogCtl;
07335 
07336     /*
07337      * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
07338      * happens at a time.
07339      */
07340     LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
07341 
07342     /* Get a local copy of the last safe checkpoint record. */
07343     SpinLockAcquire(&xlogctl->info_lck);
07344     lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
07345     lastCheckPoint = xlogctl->lastCheckPoint;
07346     SpinLockRelease(&xlogctl->info_lck);
07347 
07348     /*
07349      * Check that we're still in recovery mode. It's ok if we exit recovery
07350      * mode after this check, the restart point is valid anyway.
07351      */
07352     if (!RecoveryInProgress())
07353     {
07354         ereport(DEBUG2,
07355               (errmsg("skipping restartpoint, recovery has already ended")));
07356         LWLockRelease(CheckpointLock);
07357         return false;
07358     }
07359 
07360     /*
07361      * If the last checkpoint record we've replayed is already our last
07362      * restartpoint, we can't perform a new restart point. We still update
07363      * minRecoveryPoint in that case, so that if this is a shutdown restart
07364      * point, we won't start up earlier than before. That's not strictly
07365      * necessary, but when hot standby is enabled, it would be rather weird if
07366      * the database opened up for read-only connections at a point-in-time
07367      * before the last shutdown. Such time travel is still possible in case of
07368      * immediate shutdown, though.
07369      *
07370      * We don't explicitly advance minRecoveryPoint when we do create a
07371      * restartpoint. It's assumed that flushing the buffers will do that as a
07372      * side-effect.
07373      */
07374     if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
07375         lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
07376     {
07377         ereport(DEBUG2,
07378                 (errmsg("skipping restartpoint, already performed at %X/%X",
07379                         (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo)));
07380 
07381         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
07382         if (flags & CHECKPOINT_IS_SHUTDOWN)
07383         {
07384             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
07385             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
07386             ControlFile->time = (pg_time_t) time(NULL);
07387             UpdateControlFile();
07388             LWLockRelease(ControlFileLock);
07389         }
07390         LWLockRelease(CheckpointLock);
07391         return false;
07392     }
07393 
07394     /*
07395      * Update the shared RedoRecPtr so that the startup process can calculate
07396      * the number of segments replayed since last restartpoint, and request a
07397      * restartpoint if it exceeds checkpoint_segments.
07398      *
07399      * You need to hold WALInsertLock and info_lck to update it, although
07400      * during recovery acquiring WALInsertLock is just pro forma, because
07401      * there is no other processes updating Insert.RedoRecPtr.
07402      */
07403     LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
07404     SpinLockAcquire(&xlogctl->info_lck);
07405     xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
07406     SpinLockRelease(&xlogctl->info_lck);
07407     LWLockRelease(WALInsertLock);
07408 
07409     /*
07410      * Prepare to accumulate statistics.
07411      *
07412      * Note: because it is possible for log_checkpoints to change while a
07413      * checkpoint proceeds, we always accumulate stats, even if
07414      * log_checkpoints is currently off.
07415      */
07416     MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
07417     CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
07418 
07419     if (log_checkpoints)
07420         LogCheckpointStart(flags, true);
07421 
07422     CheckPointGuts(lastCheckPoint.redo, flags);
07423 
07424     /*
07425      * Select point at which we can truncate the xlog, which we base on the
07426      * prior checkpoint's earliest info.
07427      */
07428     XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo);
07429 
07430     /*
07431      * Update pg_control, using current time.  Check that it still shows
07432      * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
07433      * this is a quick hack to make sure nothing really bad happens if somehow
07434      * we get here after the end-of-recovery checkpoint.
07435      */
07436     LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
07437     if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
07438         ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
07439     {
07440         ControlFile->prevCheckPoint = ControlFile->checkPoint;
07441         ControlFile->checkPoint = lastCheckPointRecPtr;
07442         ControlFile->checkPointCopy = lastCheckPoint;
07443         ControlFile->time = (pg_time_t) time(NULL);
07444         if (flags & CHECKPOINT_IS_SHUTDOWN)
07445             ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
07446         UpdateControlFile();
07447     }
07448     LWLockRelease(ControlFileLock);
07449 
07450     /*
07451      * Delete old log files (those no longer needed even for previous
07452      * checkpoint/restartpoint) to prevent the disk holding the xlog from
07453      * growing full.
07454      */
07455     if (_logSegNo)
07456     {
07457         XLogRecPtr  receivePtr;
07458         XLogRecPtr  replayPtr;
07459         XLogRecPtr  endptr;
07460 
07461         /*
07462          * Get the current end of xlog replayed or received, whichever is later.
07463          */
07464         receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
07465         replayPtr = GetXLogReplayRecPtr(NULL);
07466         endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
07467 
07468         KeepLogSeg(endptr, &_logSegNo);
07469         _logSegNo--;
07470 
07471         /*
07472          * Update ThisTimeLineID to the timeline we're currently replaying,
07473          * so that we install any recycled segments on that timeline.
07474          *
07475          * There is no guarantee that the WAL segments will be useful on the
07476          * current timeline; if recovery proceeds to a new timeline right
07477          * after this, the pre-allocated WAL segments on this timeline will
07478          * not be used, and will go wasted until recycled on the next
07479          * restartpoint. We'll live with that.
07480          */
07481         (void) GetXLogReplayRecPtr(&ThisTimeLineID);
07482 
07483         RemoveOldXlogFiles(_logSegNo, endptr);
07484 
07485         /*
07486          * Make more log segments if needed.  (Do this after recycling old log
07487          * segments, since that may supply some of the needed files.)
07488          */
07489         PreallocXlogFiles(endptr);
07490     }
07491 
07492     /*
07493      * Truncate pg_subtrans if possible.  We can throw away all data before
07494      * the oldest XMIN of any running transaction.  No future transaction will
07495      * attempt to reference any pg_subtrans entry older than that (see Asserts
07496      * in subtrans.c).  When hot standby is disabled, though, we mustn't do
07497      * this because StartupSUBTRANS hasn't been called yet.
07498      */
07499     if (EnableHotStandby)
07500         TruncateSUBTRANS(GetOldestXmin(true, false));
07501 
07502     /* Real work is done, but log and update before releasing lock. */
07503     LogCheckpointEnd(true);
07504 
07505     xtime = GetLatestXTime();
07506     ereport((log_checkpoints ? LOG : DEBUG2),
07507             (errmsg("recovery restart point at %X/%X",
07508                     (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
07509            xtime ? errdetail("last completed transaction was at log time %s",
07510                              timestamptz_to_str(xtime)) : 0));
07511 
07512     LWLockRelease(CheckpointLock);
07513 
07514     /*
07515      * Finally, execute archive_cleanup_command, if any.
07516      */
07517     if (XLogCtl->archiveCleanupCommand[0])
07518         ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
07519                                "archive_cleanup_command",
07520                                false);
07521 
07522     return true;
07523 }
07524 
07525 /*
07526  * Retreat *logSegNo to the last segment that we need to retain because of
07527  * wal_keep_segments. This is calculated by subtracting wal_keep_segments
07528  * from the given xlog location, recptr.
07529  */
07530 static void
07531 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
07532 {
07533     XLogSegNo   segno;
07534 
07535     if (wal_keep_segments == 0)
07536         return;
07537 
07538     XLByteToSeg(recptr, segno);
07539 
07540     /* avoid underflow, don't go below 1 */
07541     if (segno <= wal_keep_segments)
07542         segno = 1;
07543     else
07544         segno = segno - wal_keep_segments;
07545 
07546     /* don't delete WAL segments newer than the calculated segment */
07547     if (segno < *logSegNo)
07548         *logSegNo = segno;
07549 }
07550 
07551 /*
07552  * Write a NEXTOID log record
07553  */
07554 void
07555 XLogPutNextOid(Oid nextOid)
07556 {
07557     XLogRecData rdata;
07558 
07559     rdata.data = (char *) (&nextOid);
07560     rdata.len = sizeof(Oid);
07561     rdata.buffer = InvalidBuffer;
07562     rdata.next = NULL;
07563     (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
07564 
07565     /*
07566      * We need not flush the NEXTOID record immediately, because any of the
07567      * just-allocated OIDs could only reach disk as part of a tuple insert or
07568      * update that would have its own XLOG record that must follow the NEXTOID
07569      * record.  Therefore, the standard buffer LSN interlock applied to those
07570      * records will ensure no such OID reaches disk before the NEXTOID record
07571      * does.
07572      *
07573      * Note, however, that the above statement only covers state "within" the
07574      * database.  When we use a generated OID as a file or directory name, we
07575      * are in a sense violating the basic WAL rule, because that filesystem
07576      * change may reach disk before the NEXTOID WAL record does.  The impact
07577      * of this is that if a database crash occurs immediately afterward, we
07578      * might after restart re-generate the same OID and find that it conflicts
07579      * with the leftover file or directory.  But since for safety's sake we
07580      * always loop until finding a nonconflicting filename, this poses no real
07581      * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
07582      */
07583 }
07584 
07585 /*
07586  * Write an XLOG SWITCH record.
07587  *
07588  * Here we just blindly issue an XLogInsert request for the record.
07589  * All the magic happens inside XLogInsert.
07590  *
07591  * The return value is either the end+1 address of the switch record,
07592  * or the end+1 address of the prior segment if we did not need to
07593  * write a switch record because we are already at segment start.
07594  */
07595 XLogRecPtr
07596 RequestXLogSwitch(void)
07597 {
07598     XLogRecPtr  RecPtr;
07599     XLogRecData rdata;
07600 
07601     /* XLOG SWITCH, alone among xlog record types, has no data */
07602     rdata.buffer = InvalidBuffer;
07603     rdata.data = NULL;
07604     rdata.len = 0;
07605     rdata.next = NULL;
07606 
07607     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
07608 
07609     return RecPtr;
07610 }
07611 
07612 /*
07613  * Write a RESTORE POINT record
07614  */
07615 XLogRecPtr
07616 XLogRestorePoint(const char *rpName)
07617 {
07618     XLogRecPtr  RecPtr;
07619     XLogRecData rdata;
07620     xl_restore_point xlrec;
07621 
07622     xlrec.rp_time = GetCurrentTimestamp();
07623     strncpy(xlrec.rp_name, rpName, MAXFNAMELEN);
07624 
07625     rdata.buffer = InvalidBuffer;
07626     rdata.data = (char *) &xlrec;
07627     rdata.len = sizeof(xl_restore_point);
07628     rdata.next = NULL;
07629 
07630     RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT, &rdata);
07631 
07632     ereport(LOG,
07633             (errmsg("restore point \"%s\" created at %X/%X",
07634                     rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
07635 
07636     return RecPtr;
07637 }
07638 
07639 /*
07640  * Write a backup block if needed when we are setting a hint. Note that
07641  * this may be called for a variety of page types, not just heaps.
07642  *
07643  * Callable while holding just share lock on the buffer content.
07644  *
07645  * We can't use the plain backup block mechanism since that relies on the
07646  * Buffer being exclusively locked. Since some modifications (setting LSN, hint
07647  * bits) are allowed in a sharelocked buffer that can lead to wal checksum
07648  * failures. So instead we copy the page and insert the copied data as normal
07649  * record data.
07650  *
07651  * We only need to do something if page has not yet been full page written in
07652  * this checkpoint round. The LSN of the inserted wal record is returned if we
07653  * had to write, InvalidXLogRecPtr otherwise.
07654  *
07655  * It is possible that multiple concurrent backends could attempt to write WAL
07656  * records. In that case, multiple copies of the same block would be recorded
07657  * in separate WAL records by different backends, though that is still OK from
07658  * a correctness perspective.
07659  *
07660  * Note that this only works for buffers that fit the standard page model,
07661  * i.e. those for which buffer_std == true
07662  */
07663 XLogRecPtr
07664 XLogSaveBufferForHint(Buffer buffer)
07665 {
07666     XLogRecPtr recptr = InvalidXLogRecPtr;
07667     XLogRecPtr lsn;
07668     XLogRecData rdata[2];
07669     BkpBlock bkpb;
07670 
07671     /*
07672      * Ensure no checkpoint can change our view of RedoRecPtr.
07673      */
07674     Assert(MyPgXact->delayChkpt);
07675 
07676     /*
07677      * Update RedoRecPtr so XLogCheckBuffer can make the right decision
07678      */
07679     GetRedoRecPtr();
07680 
07681     /*
07682      * Setup phony rdata element for use within XLogCheckBuffer only.
07683      * We reuse and reset rdata for any actual WAL record insert.
07684      */
07685     rdata[0].buffer = buffer;
07686     rdata[0].buffer_std = true;
07687 
07688     /*
07689      * Check buffer while not holding an exclusive lock.
07690      */
07691     if (XLogCheckBuffer(rdata, false, &lsn, &bkpb))
07692     {
07693         char copied_buffer[BLCKSZ];
07694         char *origdata = (char *) BufferGetBlock(buffer);
07695 
07696         /*
07697          * Copy buffer so we don't have to worry about concurrent hint bit or
07698          * lsn updates. We assume pd_lower/upper cannot be changed without an
07699          * exclusive lock, so the contents bkp are not racy.
07700          */
07701         memcpy(copied_buffer, origdata, bkpb.hole_offset);
07702         memcpy(copied_buffer + bkpb.hole_offset,
07703                 origdata + bkpb.hole_offset + bkpb.hole_length,
07704                 BLCKSZ - bkpb.hole_offset - bkpb.hole_length);
07705 
07706         /*
07707          * Header for backup block.
07708          */
07709         rdata[0].data = (char *) &bkpb;
07710         rdata[0].len = sizeof(BkpBlock);
07711         rdata[0].buffer = InvalidBuffer;
07712         rdata[0].next = &(rdata[1]);
07713 
07714         /*
07715          * Save copy of the buffer.
07716          */
07717         rdata[1].data = copied_buffer;
07718         rdata[1].len = BLCKSZ - bkpb.hole_length;
07719         rdata[1].buffer = InvalidBuffer;
07720         rdata[1].next = NULL;
07721 
07722         recptr = XLogInsert(RM_XLOG_ID, XLOG_HINT, rdata);
07723     }
07724 
07725     return recptr;
07726 }
07727 
07728 /*
07729  * Check if any of the GUC parameters that are critical for hot standby
07730  * have changed, and update the value in pg_control file if necessary.
07731  */
07732 static void
07733 XLogReportParameters(void)
07734 {
07735     if (wal_level != ControlFile->wal_level ||
07736         MaxConnections != ControlFile->MaxConnections ||
07737         max_prepared_xacts != ControlFile->max_prepared_xacts ||
07738         max_locks_per_xact != ControlFile->max_locks_per_xact)
07739     {
07740         /*
07741          * The change in number of backend slots doesn't need to be WAL-logged
07742          * if archiving is not enabled, as you can't start archive recovery
07743          * with wal_level=minimal anyway. We don't really care about the
07744          * values in pg_control either if wal_level=minimal, but seems better
07745          * to keep them up-to-date to avoid confusion.
07746          */
07747         if (wal_level != ControlFile->wal_level || XLogIsNeeded())
07748         {
07749             XLogRecData rdata;
07750             xl_parameter_change xlrec;
07751 
07752             xlrec.MaxConnections = MaxConnections;
07753             xlrec.max_prepared_xacts = max_prepared_xacts;
07754             xlrec.max_locks_per_xact = max_locks_per_xact;
07755             xlrec.wal_level = wal_level;
07756 
07757             rdata.buffer = InvalidBuffer;
07758             rdata.data = (char *) &xlrec;
07759             rdata.len = sizeof(xlrec);
07760             rdata.next = NULL;
07761 
07762             XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
07763         }
07764 
07765         ControlFile->MaxConnections = MaxConnections;
07766         ControlFile->max_prepared_xacts = max_prepared_xacts;
07767         ControlFile->max_locks_per_xact = max_locks_per_xact;
07768         ControlFile->wal_level = wal_level;
07769         UpdateControlFile();
07770     }
07771 }
07772 
07773 /*
07774  * Update full_page_writes in shared memory, and write an
07775  * XLOG_FPW_CHANGE record if necessary.
07776  *
07777  * Note: this function assumes there is no other process running
07778  * concurrently that could update it.
07779  */
07780 void
07781 UpdateFullPageWrites(void)
07782 {
07783     XLogCtlInsert *Insert = &XLogCtl->Insert;
07784 
07785     /*
07786      * Do nothing if full_page_writes has not been changed.
07787      *
07788      * It's safe to check the shared full_page_writes without the lock,
07789      * because we assume that there is no concurrently running process which
07790      * can update it.
07791      */
07792     if (fullPageWrites == Insert->fullPageWrites)
07793         return;
07794 
07795     START_CRIT_SECTION();
07796 
07797     /*
07798      * It's always safe to take full page images, even when not strictly
07799      * required, but not the other round. So if we're setting full_page_writes
07800      * to true, first set it true and then write the WAL record. If we're
07801      * setting it to false, first write the WAL record and then set the global
07802      * flag.
07803      */
07804     if (fullPageWrites)
07805     {
07806         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
07807         Insert->fullPageWrites = true;
07808         LWLockRelease(WALInsertLock);
07809     }
07810 
07811     /*
07812      * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
07813      * full_page_writes during archive recovery, if required.
07814      */
07815     if (XLogStandbyInfoActive() && !RecoveryInProgress())
07816     {
07817         XLogRecData rdata;
07818 
07819         rdata.data = (char *) (&fullPageWrites);
07820         rdata.len = sizeof(bool);
07821         rdata.buffer = InvalidBuffer;
07822         rdata.next = NULL;
07823 
07824         XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE, &rdata);
07825     }
07826 
07827     if (!fullPageWrites)
07828     {
07829         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
07830         Insert->fullPageWrites = false;
07831         LWLockRelease(WALInsertLock);
07832     }
07833     END_CRIT_SECTION();
07834 }
07835 
07836 /*
07837  * Check that it's OK to switch to new timeline during recovery.
07838  *
07839  * 'lsn' is the address of the shutdown checkpoint record we're about to
07840  * replay. (Currently, timeline can only change at a shutdown checkpoint).
07841  */
07842 static void
07843 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
07844 {
07845     /* Check that the record agrees on what the current (old) timeline is */
07846     if (prevTLI != ThisTimeLineID)
07847         ereport(PANIC,
07848                 (errmsg("unexpected prev timeline ID %u (current timeline ID %u) in checkpoint record",
07849                         prevTLI, ThisTimeLineID)));
07850     /*
07851      * The new timeline better be in the list of timelines we expect
07852      * to see, according to the timeline history. It should also not
07853      * decrease.
07854      */
07855     if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
07856         ereport(PANIC,
07857                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
07858                         newTLI, ThisTimeLineID)));
07859 
07860     /*
07861      * If we have not yet reached min recovery point, and we're about
07862      * to switch to a timeline greater than the timeline of the min
07863      * recovery point: trouble. After switching to the new timeline,
07864      * we could not possibly visit the min recovery point on the
07865      * correct timeline anymore. This can happen if there is a newer
07866      * timeline in the archive that branched before the timeline the
07867      * min recovery point is on, and you attempt to do PITR to the
07868      * new timeline.
07869      */
07870     if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
07871         lsn < minRecoveryPoint &&
07872         newTLI > minRecoveryPointTLI)
07873         ereport(PANIC,
07874                 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
07875                         newTLI,
07876                         (uint32) (minRecoveryPoint >> 32),
07877                         (uint32) minRecoveryPoint,
07878                         minRecoveryPointTLI)));
07879 
07880     /* Looks good */
07881 }
07882 
07883 /*
07884  * XLOG resource manager's routines
07885  *
07886  * Definitions of info values are in include/catalog/pg_control.h, though
07887  * not all record types are related to control file updates.
07888  */
07889 void
07890 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
07891 {
07892     uint8       info = record->xl_info & ~XLR_INFO_MASK;
07893 
07894     /* Backup blocks are not used by XLOG rmgr */
07895     Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
07896 
07897     if (info == XLOG_NEXTOID)
07898     {
07899         Oid         nextOid;
07900 
07901         /*
07902          * We used to try to take the maximum of ShmemVariableCache->nextOid
07903          * and the recorded nextOid, but that fails if the OID counter wraps
07904          * around.  Since no OID allocation should be happening during replay
07905          * anyway, better to just believe the record exactly.  We still take
07906          * OidGenLock while setting the variable, just in case.
07907          */
07908         memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
07909         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
07910         ShmemVariableCache->nextOid = nextOid;
07911         ShmemVariableCache->oidCount = 0;
07912         LWLockRelease(OidGenLock);
07913     }
07914     else if (info == XLOG_CHECKPOINT_SHUTDOWN)
07915     {
07916         CheckPoint  checkPoint;
07917 
07918         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
07919         /* In a SHUTDOWN checkpoint, believe the counters exactly */
07920         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
07921         ShmemVariableCache->nextXid = checkPoint.nextXid;
07922         LWLockRelease(XidGenLock);
07923         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
07924         ShmemVariableCache->nextOid = checkPoint.nextOid;
07925         ShmemVariableCache->oidCount = 0;
07926         LWLockRelease(OidGenLock);
07927         MultiXactSetNextMXact(checkPoint.nextMulti,
07928                               checkPoint.nextMultiOffset);
07929         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
07930         SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
07931 
07932         /*
07933          * If we see a shutdown checkpoint while waiting for an end-of-backup
07934          * record, the backup was canceled and the end-of-backup record will
07935          * never arrive.
07936          */
07937         if (ArchiveRecoveryRequested &&
07938             !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
07939             XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
07940             ereport(PANIC,
07941             (errmsg("online backup was canceled, recovery cannot continue")));
07942 
07943         /*
07944          * If we see a shutdown checkpoint, we know that nothing was running
07945          * on the master at this point. So fake-up an empty running-xacts
07946          * record and use that here and now. Recover additional standby state
07947          * for prepared transactions.
07948          */
07949         if (standbyState >= STANDBY_INITIALIZED)
07950         {
07951             TransactionId *xids;
07952             int         nxids;
07953             TransactionId oldestActiveXID;
07954             TransactionId latestCompletedXid;
07955             RunningTransactionsData running;
07956 
07957             oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
07958 
07959             /*
07960              * Construct a RunningTransactions snapshot representing a shut
07961              * down server, with only prepared transactions still alive. We're
07962              * never overflowed at this point because all subxids are listed
07963              * with their parent prepared transactions.
07964              */
07965             running.xcnt = nxids;
07966             running.subxcnt = 0;
07967             running.subxid_overflow = false;
07968             running.nextXid = checkPoint.nextXid;
07969             running.oldestRunningXid = oldestActiveXID;
07970             latestCompletedXid = checkPoint.nextXid;
07971             TransactionIdRetreat(latestCompletedXid);
07972             Assert(TransactionIdIsNormal(latestCompletedXid));
07973             running.latestCompletedXid = latestCompletedXid;
07974             running.xids = xids;
07975 
07976             ProcArrayApplyRecoveryInfo(&running);
07977 
07978             StandbyRecoverPreparedTransactions(true);
07979         }
07980 
07981         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
07982         ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
07983         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
07984 
07985         /* Update shared-memory copy of checkpoint XID/epoch */
07986         {
07987             /* use volatile pointer to prevent code rearrangement */
07988             volatile XLogCtlData *xlogctl = XLogCtl;
07989 
07990             SpinLockAcquire(&xlogctl->info_lck);
07991             xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
07992             xlogctl->ckptXid = checkPoint.nextXid;
07993             SpinLockRelease(&xlogctl->info_lck);
07994         }
07995 
07996         /*
07997          * We should've already switched to the new TLI before replaying this
07998          * record.
07999          */
08000         if (checkPoint.ThisTimeLineID != ThisTimeLineID)
08001             ereport(PANIC,
08002                     (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
08003                             checkPoint.ThisTimeLineID, ThisTimeLineID)));
08004 
08005         RecoveryRestartPoint(&checkPoint);
08006     }
08007     else if (info == XLOG_CHECKPOINT_ONLINE)
08008     {
08009         CheckPoint  checkPoint;
08010 
08011         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
08012         /* In an ONLINE checkpoint, treat the XID counter as a minimum */
08013         LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
08014         if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
08015                                   checkPoint.nextXid))
08016             ShmemVariableCache->nextXid = checkPoint.nextXid;
08017         LWLockRelease(XidGenLock);
08018         /* ... but still treat OID counter as exact */
08019         LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
08020         ShmemVariableCache->nextOid = checkPoint.nextOid;
08021         ShmemVariableCache->oidCount = 0;
08022         LWLockRelease(OidGenLock);
08023         MultiXactAdvanceNextMXact(checkPoint.nextMulti,
08024                                   checkPoint.nextMultiOffset);
08025         if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
08026                                   checkPoint.oldestXid))
08027             SetTransactionIdLimit(checkPoint.oldestXid,
08028                                   checkPoint.oldestXidDB);
08029         MultiXactAdvanceOldest(checkPoint.oldestMulti,
08030                                checkPoint.oldestMultiDB);
08031 
08032         /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
08033         ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
08034         ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
08035 
08036         /* Update shared-memory copy of checkpoint XID/epoch */
08037         {
08038             /* use volatile pointer to prevent code rearrangement */
08039             volatile XLogCtlData *xlogctl = XLogCtl;
08040 
08041             SpinLockAcquire(&xlogctl->info_lck);
08042             xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
08043             xlogctl->ckptXid = checkPoint.nextXid;
08044             SpinLockRelease(&xlogctl->info_lck);
08045         }
08046 
08047         /* TLI should not change in an on-line checkpoint */
08048         if (checkPoint.ThisTimeLineID != ThisTimeLineID)
08049             ereport(PANIC,
08050                     (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
08051                             checkPoint.ThisTimeLineID, ThisTimeLineID)));
08052 
08053         RecoveryRestartPoint(&checkPoint);
08054     }
08055     else if (info == XLOG_END_OF_RECOVERY)
08056     {
08057         xl_end_of_recovery xlrec;
08058 
08059         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
08060 
08061         /*
08062          * For Hot Standby, we could treat this like a Shutdown Checkpoint,
08063          * but this case is rarer and harder to test, so the benefit doesn't
08064          * outweigh the potential extra cost of maintenance.
08065          */
08066 
08067         /*
08068          * We should've already switched to the new TLI before replaying this
08069          * record.
08070          */
08071         if (xlrec.ThisTimeLineID != ThisTimeLineID)
08072             ereport(PANIC,
08073                     (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
08074                             xlrec.ThisTimeLineID, ThisTimeLineID)));
08075     }
08076     else if (info == XLOG_NOOP)
08077     {
08078         /* nothing to do here */
08079     }
08080     else if (info == XLOG_SWITCH)
08081     {
08082         /* nothing to do here */
08083     }
08084     else if (info == XLOG_RESTORE_POINT)
08085     {
08086         /* nothing to do here */
08087     }
08088     else if (info == XLOG_HINT)
08089     {
08090         char *data;
08091         BkpBlock bkpb;
08092 
08093         /*
08094          * Hint bit records contain a backup block stored "inline" in the normal
08095          * data since the locking when writing hint records isn't sufficient to
08096          * use the normal backup block mechanism, which assumes exclusive lock
08097          * on the buffer supplied.
08098          *
08099          * Since the only change in these backup block are hint bits, there are
08100          * no recovery conflicts generated.
08101          *
08102          * This also means there is no corresponding API call for this,
08103          * so an smgr implementation has no need to implement anything.
08104          * Which means nothing is needed in md.c etc
08105          */
08106         data = XLogRecGetData(record);
08107         memcpy(&bkpb, data, sizeof(BkpBlock));
08108         data += sizeof(BkpBlock);
08109 
08110         RestoreBackupBlockContents(lsn, bkpb, data, false, false);
08111     }
08112     else if (info == XLOG_BACKUP_END)
08113     {
08114         XLogRecPtr  startpoint;
08115 
08116         memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
08117 
08118         if (ControlFile->backupStartPoint == startpoint)
08119         {
08120             /*
08121              * We have reached the end of base backup, the point where
08122              * pg_stop_backup() was done. The data on disk is now consistent.
08123              * Reset backupStartPoint, and update minRecoveryPoint to make
08124              * sure we don't allow starting up at an earlier point even if
08125              * recovery is stopped and restarted soon after this.
08126              */
08127             elog(DEBUG1, "end of backup reached");
08128 
08129             LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
08130 
08131             if (ControlFile->minRecoveryPoint < lsn)
08132             {
08133                 ControlFile->minRecoveryPoint = lsn;
08134                 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
08135             }
08136             ControlFile->backupStartPoint = InvalidXLogRecPtr;
08137             ControlFile->backupEndRequired = false;
08138             UpdateControlFile();
08139 
08140             LWLockRelease(ControlFileLock);
08141         }
08142     }
08143     else if (info == XLOG_PARAMETER_CHANGE)
08144     {
08145         xl_parameter_change xlrec;
08146 
08147         /* Update our copy of the parameters in pg_control */
08148         memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
08149 
08150         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
08151         ControlFile->MaxConnections = xlrec.MaxConnections;
08152         ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
08153         ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
08154         ControlFile->wal_level = xlrec.wal_level;
08155 
08156         /*
08157          * Update minRecoveryPoint to ensure that if recovery is aborted, we
08158          * recover back up to this point before allowing hot standby again.
08159          * This is particularly important if wal_level was set to 'archive'
08160          * before, and is now 'hot_standby', to ensure you don't run queries
08161          * against the WAL preceding the wal_level change. Same applies to
08162          * decreasing max_* settings.
08163          */
08164         minRecoveryPoint = ControlFile->minRecoveryPoint;
08165         minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
08166         if (minRecoveryPoint != 0 && minRecoveryPoint < lsn)
08167         {
08168             ControlFile->minRecoveryPoint = lsn;
08169             ControlFile->minRecoveryPointTLI = ThisTimeLineID;
08170         }
08171 
08172         UpdateControlFile();
08173         LWLockRelease(ControlFileLock);
08174 
08175         /* Check to see if any changes to max_connections give problems */
08176         CheckRequiredParameterValues();
08177     }
08178     else if (info == XLOG_FPW_CHANGE)
08179     {
08180         /* use volatile pointer to prevent code rearrangement */
08181         volatile XLogCtlData *xlogctl = XLogCtl;
08182         bool        fpw;
08183 
08184         memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
08185 
08186         /*
08187          * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
08188          * do_pg_start_backup() and do_pg_stop_backup() can check whether
08189          * full_page_writes has been disabled during online backup.
08190          */
08191         if (!fpw)
08192         {
08193             SpinLockAcquire(&xlogctl->info_lck);
08194             if (xlogctl->lastFpwDisableRecPtr < ReadRecPtr)
08195                 xlogctl->lastFpwDisableRecPtr = ReadRecPtr;
08196             SpinLockRelease(&xlogctl->info_lck);
08197         }
08198 
08199         /* Keep track of full_page_writes */
08200         lastFullPageWrites = fpw;
08201     }
08202 }
08203 
08204 #ifdef WAL_DEBUG
08205 
08206 static void
08207 xlog_outrec(StringInfo buf, XLogRecord *record)
08208 {
08209     int         i;
08210 
08211     appendStringInfo(buf, "prev %X/%X; xid %u",
08212                      (uint32) (record->xl_prev >> 32),
08213                      (uint32) record->xl_prev,
08214                      record->xl_xid);
08215 
08216     appendStringInfo(buf, "; len %u",
08217                      record->xl_len);
08218 
08219     for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
08220     {
08221         if (record->xl_info & XLR_BKP_BLOCK(i))
08222             appendStringInfo(buf, "; bkpb%d", i);
08223     }
08224 
08225     appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
08226 }
08227 #endif   /* WAL_DEBUG */
08228 
08229 
08230 /*
08231  * Return the (possible) sync flag used for opening a file, depending on the
08232  * value of the GUC wal_sync_method.
08233  */
08234 static int
08235 get_sync_bit(int method)
08236 {
08237     int         o_direct_flag = 0;
08238 
08239     /* If fsync is disabled, never open in sync mode */
08240     if (!enableFsync)
08241         return 0;
08242 
08243     /*
08244      * Optimize writes by bypassing kernel cache with O_DIRECT when using
08245      * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
08246      * disabled, otherwise the archive command or walsender process will read
08247      * the WAL soon after writing it, which is guaranteed to cause a physical
08248      * read if we bypassed the kernel cache. We also skip the
08249      * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
08250      * reason.
08251      *
08252      * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
08253      * written by walreceiver is normally read by the startup process soon
08254      * after its written. Also, walreceiver performs unaligned writes, which
08255      * don't work with O_DIRECT, so it is required for correctness too.
08256      */
08257     if (!XLogIsNeeded() && !AmWalReceiverProcess())
08258         o_direct_flag = PG_O_DIRECT;
08259 
08260     switch (method)
08261     {
08262             /*
08263              * enum values for all sync options are defined even if they are
08264              * not supported on the current platform.  But if not, they are
08265              * not included in the enum option array, and therefore will never
08266              * b