Header And Logo

PostgreSQL
| The world's most advanced open source database.

multixact.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * multixact.c
00004  *      PostgreSQL multi-transaction-log manager
00005  *
00006  * The pg_multixact manager is a pg_clog-like manager that stores an array of
00007  * MultiXactMember for each MultiXactId.  It is a fundamental part of the
00008  * shared-row-lock implementation.  Each MultiXactMember is comprised of a
00009  * TransactionId and a set of flag bits.  The name is a bit historical:
00010  * originally, a MultiXactId consisted of more than one TransactionId (except
00011  * in rare corner cases), hence "multi".  Nowadays, however, it's perfectly
00012  * legitimate to have MultiXactIds that only include a single Xid.
00013  *
00014  * The meaning of the flag bits is opaque to this module, but they are mostly
00015  * used in heapam.c to identify lock modes that each of the member transactions
00016  * is holding on any given tuple.  This module just contains support to store
00017  * and retrieve the arrays.
00018  *
00019  * We use two SLRU areas, one for storing the offsets at which the data
00020  * starts for each MultiXactId in the other one.  This trick allows us to
00021  * store variable length arrays of TransactionIds.  (We could alternatively
00022  * use one area containing counts and TransactionIds, with valid MultiXactId
00023  * values pointing at slots containing counts; but that way seems less robust
00024  * since it would get completely confused if someone inquired about a bogus
00025  * MultiXactId that pointed to an intermediate slot containing an XID.)
00026  *
00027  * XLOG interactions: this module generates an XLOG record whenever a new
00028  * OFFSETs or MEMBERs page is initialized to zeroes, as well as an XLOG record
00029  * whenever a new MultiXactId is defined.  This allows us to completely
00030  * rebuild the data entered since the last checkpoint during XLOG replay.
00031  * Because this is possible, we need not follow the normal rule of
00032  * "write WAL before data"; the only correctness guarantee needed is that
00033  * we flush and sync all dirty OFFSETs and MEMBERs pages to disk before a
00034  * checkpoint is considered complete.  If a page does make it to disk ahead
00035  * of corresponding WAL records, it will be forcibly zeroed before use anyway.
00036  * Therefore, we don't need to mark our pages with LSN information; we have
00037  * enough synchronization already.
00038  *
00039  * Like clog.c, and unlike subtrans.c, we have to preserve state across
00040  * crashes and ensure that MXID and offset numbering increases monotonically
00041  * across a crash.  We do this in the same way as it's done for transaction
00042  * IDs: the WAL record is guaranteed to contain evidence of every MXID we
00043  * could need to worry about, and we just make sure that at the end of
00044  * replay, the next-MXID and next-offset counters are at least as large as
00045  * anything we saw during replay.
00046  *
00047  * We are able to remove segments no longer necessary by carefully tracking
00048  * each table's used values: during vacuum, any multixact older than a
00049  * certain value is removed; the cutoff value is stored in pg_class.
00050  * The minimum value in each database is stored in pg_database, and the
00051  * global minimum is part of pg_control.  Any vacuum that is able to
00052  * advance its database's minimum value also computes a new global minimum,
00053  * and uses this value to truncate older segments.  When new multixactid
00054  * values are to be created, care is taken that the counter does not
00055  * fall within the wraparound horizon considering the global minimum value.
00056  *
00057  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00058  * Portions Copyright (c) 1994, Regents of the University of California
00059  *
00060  * src/backend/access/transam/multixact.c
00061  *
00062  *-------------------------------------------------------------------------
00063  */
00064 #include "postgres.h"
00065 
00066 #include "access/multixact.h"
00067 #include "access/slru.h"
00068 #include "access/transam.h"
00069 #include "access/twophase.h"
00070 #include "access/twophase_rmgr.h"
00071 #include "access/xact.h"
00072 #include "catalog/pg_type.h"
00073 #include "commands/dbcommands.h"
00074 #include "funcapi.h"
00075 #include "miscadmin.h"
00076 #include "pg_trace.h"
00077 #include "storage/lmgr.h"
00078 #include "storage/pmsignal.h"
00079 #include "storage/procarray.h"
00080 #include "utils/builtins.h"
00081 #include "utils/memutils.h"
00082 #include "utils/snapmgr.h"
00083 
00084 
00085 /*
00086  * Defines for MultiXactOffset page sizes.  A page is the same BLCKSZ as is
00087  * used everywhere else in Postgres.
00088  *
00089  * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
00090  * MultiXact page numbering also wraps around at
00091  * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
00092  * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.    We need
00093  * take no explicit notice of that fact in this module, except when comparing
00094  * segment and page numbers in TruncateMultiXact (see
00095  * MultiXactOffsetPagePrecedes).
00096  */
00097 
00098 /* We need four bytes per offset; this is how many offsets fit in one page */
00099 #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
00100 
      /* Number of the offsets page that holds the entry for MultiXactId "xid" */
00101 #define MultiXactIdToOffsetPage(xid) \
00102     ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
      /* Index of that entry within its page, counted in entries (not bytes) */
00103 #define MultiXactIdToOffsetEntry(xid) \
00104     ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
00105 
00106 /*
00107  * The situation for members is a bit more complex: we store one byte of
00108  * additional flag bits for each TransactionId.  To do this without getting
00109  * into alignment issues, we store four bytes of flags, and then the
00110  * corresponding 4 Xids.  Each such 5-word (20-byte) set we call a "group", and
00111  * groups are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep 409 groups
00112  * per page.  This wastes 12 bytes per page, but that's OK -- simplicity (and
00113  * performance) trumps space efficiency here.
00114  *
00115  * Note that the "offset" macros work with byte offset, not array indexes, so
00116  * arithmetic must be done using "char *" pointers.
00117  */
00118 /* We need eight bits per xact, so one xact fits in a byte */
00119 #define MXACT_MEMBER_BITS_PER_XACT          8
00120 #define MXACT_MEMBER_FLAGS_PER_BYTE         1
      /* all-ones mask exactly MXACT_MEMBER_BITS_PER_XACT bits wide */
00121 #define MXACT_MEMBER_XACT_BITMASK   ((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
00122 
00123 /* how many full bytes of flags are there in a group? */
00124 #define MULTIXACT_FLAGBYTES_PER_GROUP       4
      /* members per group: flag bytes per group times flags per byte */
00125 #define MULTIXACT_MEMBERS_PER_MEMBERGROUP   \
00126     (MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
00127 /* size in bytes of a complete group */
00128 #define MULTIXACT_MEMBERGROUP_SIZE \
00129     (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
      /* whole groups per page; integer division leaves any remainder unused */
00130 #define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
      /* total number of members stored on one page */
00131 #define MULTIXACT_MEMBERS_PER_PAGE  \
00132     (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
00133 
      /*
       * In the macros below the parameter is spelled "xid", but going by the
       * macro names and their comments it is a member offset rather than a
       * TransactionId.
       */
00134 /* page in which a member is to be found */
00135 #define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE)
00136 
00137 /* Location (byte offset within page) of flag word for a given member */
00138 #define MXOffsetToFlagsOffset(xid) \
00139     ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \
00140       (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \
00141      (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)
      /* Bit position, within the group's flag word, of the given member's flag bits */
00142 #define MXOffsetToFlagsBitShift(xid) \
00143     (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \
00144      MXACT_MEMBER_BITS_PER_XACT)
00145 
00146 /*
       * Location (byte offset within page) of TransactionId of given member:
       * start of the group, skip the group's flag bytes, then index by the
       * member's position within the group.
       */
00147 #define MXOffsetToMemberOffset(xid) \
00148     (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \
00149      ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId))
00150 
00151 
00152 /*
00153  * Links to shared-memory data structures for MultiXact control
       *
       * There is one SlruCtlData per SLRU area; going by the names, the first
       * belongs to the offsets area and the second to the members area.  The
       * pointer macros below simply take the address of each.
00154  */
00155 static SlruCtlData MultiXactOffsetCtlData;
00156 static SlruCtlData MultiXactMemberCtlData;
00157 
00158 #define MultiXactOffsetCtl  (&MultiXactOffsetCtlData)
00159 #define MultiXactMemberCtl  (&MultiXactMemberCtlData)
00160 
00161 /*
00162  * MultiXact state shared across all backends.  All this state is protected
00163  * by MultiXactGenLock.  (We also use MultiXactOffsetControlLock and
00164  * MultiXactMemberControlLock to guard accesses to the two sets of SLRU
00165  * buffers.  For concurrency's sake, we avoid holding more than one of these
00166  * locks at a time.)
00167  */
00168 typedef struct MultiXactStateData
00169 {
00170     /* next-to-be-assigned MultiXactId */
00171     MultiXactId nextMXact;
00172 
00173     /* next-to-be-assigned offset */
00174     MultiXactOffset nextOffset;
00175 
00176     /* the Offset SLRU area was last truncated at this MultiXactId */
00177     MultiXactId lastTruncationPoint;
00178 
00179     /*
00180      * oldest multixact that is still on disk.  Anything older than this should
00181      * not be consulted.
           *
           * NOTE(review): oldestMultiXactDB is presumably the database that
           * oldestMultiXactId comes from -- confirm where it is set.
00182      */
00183     MultiXactId     oldestMultiXactId;
00184     Oid             oldestMultiXactDB;
00185 
00186     /*
           * support for anti-wraparound measures
           *
           * NOTE(review): the meaning of each individual limit is not documented
           * in this part of the file; see the code that sets them.
           */
00187     MultiXactId     multiVacLimit;
00188     MultiXactId     multiWarnLimit;
00189     MultiXactId     multiStopLimit;
00190     MultiXactId     multiWrapLimit;
00191 
00192     /*
00193      * Per-backend data starts here.  We have two arrays stored in the area
00194      * immediately following the MultiXactStateData struct. Each is indexed by
00195      * BackendId.
00196      *
00197      * In both arrays, there's a slot for all normal backends (1..MaxBackends)
00198      * followed by a slot for max_prepared_xacts prepared transactions. Valid
00199      * BackendIds start from 1; element zero of each array is never used.
00200      *
00201      * OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
00202      * transaction(s) could possibly be a member of, or InvalidMultiXactId
00203      * when the backend has no live transaction that could possibly be a
00204      * member of a MultiXact.  Each backend sets its entry to the current
00205      * nextMXact counter just before first acquiring a shared lock in a given
00206      * transaction, and clears it at transaction end. (This works because only
00207      * during or after acquiring a shared lock could an XID possibly become a
00208      * member of a MultiXact, and that MultiXact would have to be created
00209      * during or after the lock acquisition.)
00210      *
00211      * OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
00212      * current transaction(s) think is potentially live, or InvalidMultiXactId
00213      * when not in a transaction or not in a transaction that's paid any
00214      * attention to MultiXacts yet.  This is computed when first needed in a
00215      * given transaction, and cleared at transaction end.  We can compute it
00216      * as the minimum of the valid OldestMemberMXactId[] entries at the time
00217      * we compute it (using nextMXact if none are valid).  Each backend is
00218      * required not to attempt to access any SLRU data for MultiXactIds older
00219      * than its own OldestVisibleMXactId[] setting; this is necessary because
00220      * the checkpointer could truncate away such data at any instant.
00221      *
00222      * The checkpointer can compute the safe truncation point as the oldest
00223      * valid value among all the OldestMemberMXactId[] and
00224      * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
00225      * Clearly, it is not possible for any later-computed OldestVisibleMXactId
00226      * value to be older than this, and so there is no risk of truncating data
00227      * that is still needed.
           *
           * The member below is declared with a single element; the real extent
           * of the per-backend area is decided by whoever sizes and allocates
           * this struct (not shown in this part of the file).
00228      */
00229     MultiXactId perBackendXactIds[1];   /* VARIABLE LENGTH ARRAY */
00230 } MultiXactStateData;
00231 
00232 /*
00233  * Last element of OldestMemberMXactId and OldestVisibleMXactId arrays.
00234  * Valid elements are (1..MaxOldestSlot); element 0 is never used.
       *
       * The count is one slot per normal backend (MaxBackends) plus one per
       * prepared transaction (max_prepared_xacts).
00235  */
00236 #define MaxOldestSlot   (MaxBackends + max_prepared_xacts)
00237 
00238 /* Pointers to the state data in shared memory */
00239 static MultiXactStateData *MultiXactState;
00240 static MultiXactId *OldestMemberMXactId;     /* indexed 1..MaxOldestSlot */
00241 static MultiXactId *OldestVisibleMXactId;    /* indexed 1..MaxOldestSlot */
00242 
00243 
00244 /*
00245  * Definitions for the backend-local MultiXactId cache.
00246  *
00247  * We use this cache to store known MultiXacts, so we don't need to go to
00248  * SLRU areas every time.
00249  *
00250  * The cache lasts for the duration of a single transaction, the rationale
00251  * for this being that most entries will contain our own TransactionId and
00252  * so they will be uninteresting by the time our next transaction starts.
00253  * (XXX not clear that this is correct --- other members of the MultiXact
00254  * could hang around longer than we did.  However, it's not clear what a
00255  * better policy for flushing old cache entries would be.)  FIXME actually
00256  * this is plain wrong now that multixact's may contain update Xids.
00257  *
00258  * We allocate the cache entries in a memory context that is deleted at
00259  * transaction end, so we don't need to do retail freeing of entries.
00260  */
00261 typedef struct mXactCacheEnt
00262 {
00263     struct mXactCacheEnt *next;     /* link to another cache entry */
00264     MultiXactId multi;              /* the MultiXactId this entry describes */
00265     int         nmembers;           /* number of entries in members[] */
00266     MultiXactMember members[FLEXIBLE_ARRAY_MEMBER];  /* the member list itself */
00267 } mXactCacheEnt;
00268 
      /* NOTE(review): presumably the head of the entry list linked via "next" -- confirm */
00269 static mXactCacheEnt *MXactCache = NULL;
      /* memory context holding the cache entries; deleted at transaction end */
00270 static MemoryContext MXactContext = NULL;
00271 
00272 #ifdef MULTIXACT_DEBUG
      /*
       * debug_elogN takes N arguments in total and forwards them unchanged to
       * elog().  Without MULTIXACT_DEBUG the macros expand to nothing, so their
       * argument expressions are not evaluated at all.
       */
00273 #define debug_elog2(a,b) elog(a,b)
00274 #define debug_elog3(a,b,c) elog(a,b,c)
00275 #define debug_elog4(a,b,c,d) elog(a,b,c,d)
00276 #define debug_elog5(a,b,c,d,e) elog(a,b,c,d,e)
00277 #define debug_elog6(a,b,c,d,e,f) elog(a,b,c,d,e,f)
00278 #else
00279 #define debug_elog2(a,b)
00280 #define debug_elog3(a,b,c)
00281 #define debug_elog4(a,b,c,d)
00282 #define debug_elog5(a,b,c,d,e)
00283 #define debug_elog6(a,b,c,d,e,f)
00284 #endif
00285 
00286 /* internal MultiXactId management */
00287 static void MultiXactIdSetOldestVisible(void);
      /* sorts members[] in place; returns the MultiXactId for that member set */
00288 static MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members);
      /* writes a new multixact into the offsets and members files */
00289 static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
00290                    int nmembers, MultiXactMember *members);
      /* assigns the MXID and offset range; does START_CRIT_SECTION() */
00291 static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset);
00292 
00293 /* MultiXact cache management */
00294 static int mxactMemberComparator(const void *arg1, const void *arg2);
      /* result is tested with MultiXactIdIsValid() by the caller */
00295 static MultiXactId mXactCacheGetBySet(int nmembers, MultiXactMember *members);
00296 static int  mXactCacheGetById(MultiXactId multi, MultiXactMember **members);
00297 static void mXactCachePut(MultiXactId multi, int nmembers,
00298               MultiXactMember *members);
00299 
      /* used only to format debug output in the code shown here */
00300 static char *mxstatus_to_string(MultiXactStatus status);
00301 
00302 /* management of SLRU infrastructure */
00303 static int  ZeroMultiXactOffsetPage(int pageno, bool writeXlog);
00304 static int  ZeroMultiXactMemberPage(int pageno, bool writeXlog);
00305 static bool MultiXactOffsetPagePrecedes(int page1, int page2);
00306 static bool MultiXactMemberPagePrecedes(int page1, int page2);
00307 static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
00308                         MultiXactOffset offset2);
00309 static void ExtendMultiXactOffset(MultiXactId multi);
00310 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
00311 static void WriteMZeroPageXlogRec(int pageno, uint8 info);
00312 
00313 
00314 /*
00315  * MultiXactIdCreate
00316  *      Construct a MultiXactId representing two TransactionIds.
00317  *
00318  * The two XIDs must be different, or be requesting different statuses.
00319  *
00320  * NB - we don't worry about our local MultiXactId cache here, because that
00321  * is handled by the lower-level routines.
       *
       * xid1, status1: first member and its status; xid1 must be valid
       * xid2, status2: second member and its status; xid2 must be valid
       *
       * Returns whatever CreateMultiXactId() hands back for the two-member set.
       * The validity and distinctness requirements above are checked only
       * through AssertArg/Assert.
00322  */
00323 MultiXactId
00324 MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
00325                   TransactionId xid2, MultiXactStatus status2)
00326 {
00327     MultiXactId newMulti;
00328     MultiXactMember members[2];
00329 
00330     AssertArg(TransactionIdIsValid(xid1));
00331     AssertArg(TransactionIdIsValid(xid2));
00332 
          /* same xid is acceptable only if the two statuses differ */
00333     Assert(!TransactionIdEquals(xid1, xid2) || (status1 != status2));
00334 
00335     /*
00336      * Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
00337      * are still running.  In typical usage, xid2 will be our own XID and the
00338      * caller just did a check on xid1, so it'd be wasted effort.
00339      */
00340 
00341     members[0].xid = xid1;
00342     members[0].status = status1;
00343     members[1].xid = xid2;
00344     members[1].status = status2;
00345 
          /* CreateMultiXactId sorts members[] in place, so order may change */
00346     newMulti = CreateMultiXactId(2, members);
00347 
          /* expands to nothing unless MULTIXACT_DEBUG is defined */
00348     debug_elog3(DEBUG2, "Create: %s",
00349                 mxid_to_string(newMulti, 2, members));
00350 
00351     return newMulti;
00352 }
00353 
00354 /*
00355  * MultiXactIdExpand
00356  *      Add a TransactionId to a pre-existing MultiXactId.
00357  *
00358  * If the TransactionId is already a member of the passed MultiXactId with the
00359  * same status, just return it as-is.
00360  *
00361  * Note that we do NOT actually modify the membership of a pre-existing
00362  * MultiXactId; instead we create a new one.  This is necessary to avoid
00363  * a race condition against code trying to wait for one MultiXactId to finish;
00364  * see notes in heapam.c.
00365  *
00366  * NB - we don't worry about our local MultiXactId cache here, because that
00367  * is handled by the lower-level routines.
00368  *
00369  * Note: It is critical that MultiXactIds that come from an old cluster (i.e.
00370  * one upgraded by pg_upgrade from a cluster older than this feature) are not
00371  * passed in.
       *
       * multi: existing MultiXactId to extend; must be valid
       * xid, status: member to add; xid must be valid
       *
       * Returns "multi" itself when (xid, status) is already a member.  Otherwise
       * returns the MultiXactId obtained from CreateMultiXactId() for the
       * still-interesting old members plus the new one, or for the new member
       * alone when GetMultiXactIdMembers() reports a negative member count.
00372  */
00373 MultiXactId
00374 MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
00375 {
00376     MultiXactId newMulti;
00377     MultiXactMember *members;
00378     MultiXactMember *newMembers;
00379     int         nmembers;
00380     int         i;
00381     int         j;
00382 
00383     AssertArg(MultiXactIdIsValid(multi));
00384     AssertArg(TransactionIdIsValid(xid));
00385 
00386     debug_elog5(DEBUG2, "Expand: received multi %u, xid %u status %s",
00387                 multi, xid, mxstatus_to_string(status));
00388 
00389     /*
00390      * Note: we don't allow for old multis here.  The reason is that the
00391      * only caller of this function does a check that the multixact is
00392      * no longer running.
00393      */
00394     nmembers = GetMultiXactIdMembers(multi, &members, false);
00395 
00396     if (nmembers < 0)
00397     {
00398         MultiXactMember     member;
00399 
00400         /*
00401          * The MultiXactId is obsolete.  This can only happen if all the
00402          * MultiXactId members stop running between the caller checking and
00403          * passing it to us.  It would be better to return that fact to the
00404          * caller, but it would complicate the API and it's unlikely to happen
00405          * too often, so just deal with it by creating a singleton MultiXact.
               *
               * "members" is not freed on this path.  NOTE(review): presumably
               * nothing is allocated when the count is negative -- confirm
               * against GetMultiXactIdMembers.
00406          */
00407         member.xid = xid;
00408         member.status = status;
00409         newMulti = CreateMultiXactId(1, &member);
00410 
00411         debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
00412                     multi, newMulti);
00413         return newMulti;
00414     }
00415 
00416     /*
00417      * If the TransactionId is already a member of the MultiXactId with the
00418      * same status, just return the existing MultiXactId.
00419      */
00420     for (i = 0; i < nmembers; i++)
00421     {
00422         if (TransactionIdEquals(members[i].xid, xid) &&
00423             (members[i].status == status))
00424         {
00425             debug_elog4(DEBUG2, "Expand: %u is already a member of %u",
00426                         xid, multi);
00427             pfree(members);
00428             return multi;
00429         }
00430     }
00431 
00432     /*
00433      * Determine which of the members of the MultiXactId are still of interest.
00434      * This is any running transaction, and also any transaction that grabbed
00435      * something stronger than just a lock and was committed.  (An update that
00436      * aborted is of no interest here.)
00437      *
00438      * (Removing dead members is just an optimization, but a useful one.
00439      * Note we have the same race condition here as above: j could be 0 at the
00440      * end of the loop.)
           *
           * The new array gets one slot more than the old member count, so the
           * member being added always fits even if every old member is kept.
00441      */
00442     newMembers = (MultiXactMember *)
00443         palloc(sizeof(MultiXactMember) * (nmembers + 1));
00444 
          /* i walks the old members; j counts the ones copied so far */
00445     for (i = 0, j = 0; i < nmembers; i++)
00446     {
00447         if (TransactionIdIsInProgress(members[i].xid) ||
00448             ((members[i].status > MultiXactStatusForUpdate) &&
00449              TransactionIdDidCommit(members[i].xid)))
00450         {
00451             newMembers[j].xid = members[i].xid;
00452             newMembers[j++].status = members[i].status;
00453         }
00454     }
00455 
          /* append the new member; after this j is the final member count */
00456     newMembers[j].xid = xid;
00457     newMembers[j++].status = status;
00458     newMulti = CreateMultiXactId(j, newMembers);
00459 
00460     pfree(members);
00461     pfree(newMembers);
00462 
00463     debug_elog3(DEBUG2, "Expand: returning new multi %u", newMulti);
00464 
00465     return newMulti;
00466 }
00467 
00468 /*
00469  * MultiXactIdIsRunning
00470  *      Returns whether a MultiXactId is "running".
00471  *
00472  * We return true if at least one member of the given MultiXactId is still
00473  * running.  Note that a "false" result is certain not to change,
00474  * because it is not legal to add members to an existing MultiXactId.
00475  *
00476  * Caller is expected to have verified that the multixact does not come from
00477  * a pg_upgraded share-locked tuple.
       *
       * multi: the MultiXactId to test
       *
       * Returns true as soon as one member passes either the
       * TransactionIdIsCurrentTransactionId() or the TransactionIdIsInProgress()
       * test.  Returns false when GetMultiXactIdMembers() reports a negative
       * member count or when no member passes.  The member array is pfree'd on
       * every path that obtained a non-negative count.
00478  */
00479 bool
00480 MultiXactIdIsRunning(MultiXactId multi)
00481 {
00482     MultiXactMember *members;
00483     int         nmembers;
00484     int         i;
00485 
00486     debug_elog3(DEBUG2, "IsRunning %u?", multi);
00487 
00488     /*
00489      * "false" here means we assume our callers have checked that the given
00490      * multi cannot possibly come from a pg_upgraded database.
00491      */
00492     nmembers = GetMultiXactIdMembers(multi, &members, false);
00493 
00494     if (nmembers < 0)
00495     {
00496         debug_elog2(DEBUG2, "IsRunning: no members");
00497         return false;
00498     }
00499 
00500     /*
00501      * Checking for myself is cheap compared to looking in shared memory;
00502      * return true if any live subtransaction of the current top-level
00503      * transaction is a member.
00504      *
00505      * This is not needed for correctness, it's just a fast path.
00506      */
00507     for (i = 0; i < nmembers; i++)
00508     {
00509         if (TransactionIdIsCurrentTransactionId(members[i].xid))
00510         {
00511             debug_elog3(DEBUG2, "IsRunning: I (%d) am running!", i);
00512             pfree(members);
00513             return true;
00514         }
00515     }
00516 
00517     /*
00518      * This could be made faster by having another entry point in procarray.c,
00519      * walking the PGPROC array only once for all the members.  But in most
00520      * cases nmembers should be small enough that it doesn't much matter.
00521      */
00522     for (i = 0; i < nmembers; i++)
00523     {
00524         if (TransactionIdIsInProgress(members[i].xid))
00525         {
00526             debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running",
00527                         i, members[i].xid);
00528             pfree(members);
00529             return true;
00530         }
00531     }
00532 
          /* neither loop found a running member */
00533     pfree(members);
00534 
00535     debug_elog3(DEBUG2, "IsRunning: %u is not running", multi);
00536 
00537     return false;
00538 }
00539 
00540 /*
00541  * MultiXactIdSetOldestMember
00542  *      Save the oldest MultiXactId this transaction could be a member of.
00543  *
00544  * We set the OldestMemberMXactId for a given transaction the first time it's
00545  * going to do some operation that might require a MultiXactId (tuple lock,
00546  * update or delete).  We need to do this even if we end up using a
00547  * TransactionId instead of a MultiXactId, because there is a chance that
00548  * another transaction would add our XID to a MultiXactId.
00549  *
00550  * The value to set is the next-to-be-assigned MultiXactId, so this is meant to
00551  * be called just before doing any such possibly-MultiXactId-able operation.
       *
       * Takes no arguments and returns nothing.  It is a no-op when
       * OldestMemberMXactId[MyBackendId] already holds a valid value; otherwise
       * its only effect is the store into that slot, done while holding
       * MultiXactGenLock exclusively.
00552  */
00553 void
00554 MultiXactIdSetOldestMember(void)
00555 {
00556     if (!MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]))
00557     {
00558         MultiXactId nextMXact;
00559 
00560         /*
00561          * You might think we don't need to acquire a lock here, since
00562          * fetching and storing of TransactionIds is probably atomic, but in
00563          * fact we do: suppose we pick up nextMXact and then lose the CPU for
00564          * a long time.  Someone else could advance nextMXact, and then
00565          * another someone else could compute an OldestVisibleMXactId that
00566          * would be after the value we are going to store when we get control
00567          * back.  Which would be wrong.
00568          */
00569         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
00570 
00571         /*
00572          * We have to beware of the possibility that nextMXact is in the
00573          * wrapped-around state.  We don't fix the counter itself here, but we
00574          * must be sure to store a valid value in our array entry.
00575          */
00576         nextMXact = MultiXactState->nextMXact;
00577         if (nextMXact < FirstMultiXactId)
00578             nextMXact = FirstMultiXactId;
00579 
              /* only the local copy was adjusted; the shared counter is untouched */
00580         OldestMemberMXactId[MyBackendId] = nextMXact;
00581 
00582         LWLockRelease(MultiXactGenLock);
00583 
00584         debug_elog4(DEBUG2, "MultiXact: setting OldestMember[%d] = %u",
00585                     MyBackendId, nextMXact);
00586     }
00587 }
00588 
00589 /*
00590  * MultiXactIdSetOldestVisible
00591  *      Save the oldest MultiXactId this transaction considers possibly live.
00592  *
00593  * We set the OldestVisibleMXactId for a given transaction the first time
00594  * it's going to inspect any MultiXactId.  Once we have set this, we are
00595  * guaranteed that the checkpointer won't truncate off SLRU data for
00596  * MultiXactIds at or after our OldestVisibleMXactId.
00597  *
00598  * The value to set is the oldest of nextMXact and all the valid per-backend
00599  * OldestMemberMXactId[] entries.  Because of the locking we do, we can be
00600  * certain that no subsequent call to MultiXactIdSetOldestMember can set
00601  * an OldestMemberMXactId[] entry older than what we compute here.  Therefore
00602  * there is no live transaction, now or later, that can be a member of any
00603  * MultiXactId older than the OldestVisibleMXactId we compute here.
       *
       * Takes no arguments and returns nothing.  It is a no-op when
       * OldestVisibleMXactId[MyBackendId] already holds a valid value.  The
       * whole computation and the store into our slot happen while holding
       * MultiXactGenLock exclusively.
00604  */
00605 static void
00606 MultiXactIdSetOldestVisible(void)
00607 {
00608     if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId]))
00609     {
00610         MultiXactId oldestMXact;
00611         int         i;
00612 
00613         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
00614 
00615         /*
00616          * We have to beware of the possibility that nextMXact is in the
00617          * wrapped-around state.  We don't fix the counter itself here, but we
00618          * must be sure to store a valid value in our array entry.
00619          */
00620         oldestMXact = MultiXactState->nextMXact;
00621         if (oldestMXact < FirstMultiXactId)
00622             oldestMXact = FirstMultiXactId;
00623 
              /* scan slots 1..MaxOldestSlot (slot 0 is never used), keeping the oldest */
00624         for (i = 1; i <= MaxOldestSlot; i++)
00625         {
00626             MultiXactId thisoldest = OldestMemberMXactId[i];
00627 
                  /* invalid entries are skipped; ordering is per MultiXactIdPrecedes() */
00628             if (MultiXactIdIsValid(thisoldest) &&
00629                 MultiXactIdPrecedes(thisoldest, oldestMXact))
00630                 oldestMXact = thisoldest;
00631         }
00632 
00633         OldestVisibleMXactId[MyBackendId] = oldestMXact;
00634 
00635         LWLockRelease(MultiXactGenLock);
00636 
00637         debug_elog4(DEBUG2, "MultiXact: setting OldestVisible[%d] = %u",
00638                     MyBackendId, oldestMXact);
00639     }
00640 }
00641 
00642 /*
00643  * ReadNextMultiXactId
00644  *      Return the next MultiXactId to be assigned, but don't allocate it
       *
       * The shared counter is read while holding MultiXactGenLock in shared
       * mode.  A value below FirstMultiXactId is reported as FirstMultiXactId;
       * the shared counter itself is left unmodified.
00645  */
00646 MultiXactId
00647 ReadNextMultiXactId(void)
00648 {
00649     MultiXactId     mxid;
00650 
00651     /* XXX we could presumably do this without a lock. */
00652     LWLockAcquire(MultiXactGenLock, LW_SHARED);
00653     mxid = MultiXactState->nextMXact;
00654     LWLockRelease(MultiXactGenLock);
00655 
          /* adjust only the local copy, after the lock has been released */
00656     if (mxid < FirstMultiXactId)
00657         mxid = FirstMultiXactId;
00658 
00659     return mxid;
00660 }
00661 
00662 /*
00663  * CreateMultiXactId
00664  *      Make a new MultiXactId
00665  *
00666  * Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
00667  * given TransactionIds as members.  Returns the newly created MultiXactId.
00668  *
00669  * NB: the passed members[] array will be sorted in-place.
       *
       * nmembers: number of entries in members[]
       * members: the member set to record
       *
       * When mXactCacheGetBySet() already knows a valid MultiXactId for this
       * member set, that one is returned immediately and no XLOG, SLRU or cache
       * entry is made.
00670  */
00671 static MultiXactId
00672 CreateMultiXactId(int nmembers, MultiXactMember *members)
00673 {
00674     MultiXactId multi;
00675     MultiXactOffset offset;
00676     XLogRecData rdata[2];
00677     xl_multixact_create xlrec;
00678 
00679     debug_elog3(DEBUG2, "Create: %s",
00680                 mxid_to_string(InvalidMultiXactId, nmembers, members));
00681 
00682     /*
00683      * See if the same set of members already exists in our cache; if so, just
00684      * re-use that MultiXactId.  (Note: it might seem that looking in our
00685      * cache is insufficient, and we ought to search disk to see if a
00686      * duplicate definition already exists.  But since we only ever create
00687      * MultiXacts containing our own XID, in most cases any such MultiXacts
00688      * were in fact created by us, and so will be in our cache.  There are
00689      * corner cases where someone else added us to a MultiXact without our
00690      * knowledge, but it's not worth checking for.)
00691      */
00692     multi = mXactCacheGetBySet(nmembers, members);
00693     if (MultiXactIdIsValid(multi))
00694     {
00695         debug_elog2(DEBUG2, "Create: in cache!");
              /* returning here is before GetNewMultiXactId, so no critical section is open */
00696         return multi;
00697     }
00698 
00699     /*
00700      * Assign the MXID and offsets range to use, and make sure there is space
00701      * in the OFFSETs and MEMBERs files.  NB: this routine does
00702      * START_CRIT_SECTION().
00703      */
00704     multi = GetNewMultiXactId(nmembers, &offset);
00705 
00706     /*
00707      * Make an XLOG entry describing the new MXID.
00708      *
00709      * Note: we need not flush this XLOG entry to disk before proceeding. The
00710      * only way for the MXID to be referenced from any data page is for
00711      * heap_lock_tuple() to have put it there, and heap_lock_tuple() generates
00712      * an XLOG record that must follow ours.  The normal LSN interlock between
00713      * the data page and that XLOG record will ensure that our XLOG record
00714      * reaches disk first.  If the SLRU members/offsets data reaches disk
00715      * sooner than the XLOG record, we do not care because we'll overwrite it
00716      * with zeroes unless the XLOG record is there too; see notes at top of
00717      * this file.
00718      */
00719     xlrec.mid = multi;
00720     xlrec.moff = offset;
00721     xlrec.nmembers = nmembers;
00722 
00723     /*
00724      * XXX Note: there's a lot of padding space in MultiXactMember.  We could
00725      * find a more compact representation of this Xlog record -- perhaps all the
00726      * status flags in one XLogRecData, then all the xids in another one?  Not
00727      * clear that it's worth the trouble though.
           *
           * The record is a two-element chain: rdata[0] carries the fixed-size
           * xlrec header, rdata[1] carries the members[] array as-is.
00728      */
00729     rdata[0].data = (char *) (&xlrec);
00730     rdata[0].len = SizeOfMultiXactCreate;
00731     rdata[0].buffer = InvalidBuffer;
00732     rdata[0].next = &(rdata[1]);
00733 
00734     rdata[1].data = (char *) members;
00735     rdata[1].len = nmembers * sizeof(MultiXactMember);
00736     rdata[1].buffer = InvalidBuffer;
00737     rdata[1].next = NULL;
00738 
          /* the value returned by XLogInsert is deliberately discarded */
00739     (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID, rdata);
00740 
00741     /* Now enter the information into the OFFSETs and MEMBERs logs */
00742     RecordNewMultiXact(multi, offset, nmembers, members);
00743 
00744     /* Done with critical section (the one GetNewMultiXactId started) */
00745     END_CRIT_SECTION();
00746 
00747     /* Store the new MultiXactId in the local cache, too */
00748     mXactCachePut(multi, nmembers, members);
00749 
00750     debug_elog2(DEBUG2, "Create: all done");
00751 
00752     return multi;
00753 }
00754 
00755 /*
00756  * RecordNewMultiXact
00757  *      Write info about a new multixact into the offsets and members files
00758  *
00759  * This is broken out of CreateMultiXactId so that xlog replay can use it.
00760  */
00761 static void
00762 RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
00763                    int nmembers, MultiXactMember *members)
00764 {
00765     int         pageno;
00766     int         prev_pageno;
00767     int         entryno;
00768     int         slotno;
00769     MultiXactOffset *offptr;
00770     int         i;
00771 
00772     LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
00773 
00774     pageno = MultiXactIdToOffsetPage(multi);
00775     entryno = MultiXactIdToOffsetEntry(multi);
00776 
00777     /*
00778      * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
00779      * to complain about if there's any I/O error.  This is kinda bogus, but
00780      * since the errors will always give the full pathname, it should be clear
00781      * enough that a MultiXactId is really involved.  Perhaps someday we'll
00782      * take the trouble to generalize the slru.c error reporting code.
00783      */
00784     slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
00785     offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
00786     offptr += entryno;
00787 
00788     *offptr = offset;
00789 
00790     MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
00791 
00792     /* Exchange our lock */
00793     LWLockRelease(MultiXactOffsetControlLock);
00794 
00795     LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
00796 
00797     prev_pageno = -1;
00798 
00799     for (i = 0; i < nmembers; i++, offset++)
00800     {
00801         TransactionId *memberptr;
00802         uint32     *flagsptr;
00803         uint32      flagsval;
00804         int         bshift;
00805         int         flagsoff;
00806         int         memberoff;
00807 
00808         Assert(members[i].status <= MultiXactStatusUpdate);
00809 
00810         pageno = MXOffsetToMemberPage(offset);
00811         memberoff = MXOffsetToMemberOffset(offset);
00812         flagsoff = MXOffsetToFlagsOffset(offset);
00813         bshift = MXOffsetToFlagsBitShift(offset);
00814 
00815         if (pageno != prev_pageno)
00816         {
00817             slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
00818             prev_pageno = pageno;
00819         }
00820 
00821         memberptr = (TransactionId *)
00822             (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
00823 
00824         *memberptr = members[i].xid;
00825 
00826         flagsptr = (uint32 *)
00827             (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
00828 
00829         flagsval = *flagsptr;
00830         flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
00831         flagsval |= (members[i].status << bshift);
00832         *flagsptr = flagsval;
00833 
00834         MultiXactMemberCtl->shared->page_dirty[slotno] = true;
00835     }
00836 
00837     LWLockRelease(MultiXactMemberControlLock);
00838 }
00839 
00840 /*
00841  * GetNewMultiXactId
00842  *      Get the next MultiXactId.
00843  *
00844  * Also, reserve the needed amount of space in the "members" area.  The
00845  * starting offset of the reserved space is returned in *offset.
00846  *
00847  * This may generate XLOG records for expansion of the offsets and/or members
00848  * files.  Unfortunately, we have to do that while holding MultiXactGenLock
00849  * to avoid race conditions --- the XLOG record for zeroing a page must appear
00850  * before any backend can possibly try to store data in that page!
00851  *
00852  * We start a critical section before advancing the shared counters.  The
00853  * caller must end the critical section after writing SLRU data.
00854  */
00855 static MultiXactId
00856 GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
00857 {
00858     MultiXactId result;
00859     MultiXactOffset nextOffset;
00860 
00861     debug_elog3(DEBUG2, "GetNew: for %d xids", nmembers);
00862 
00863     /* MultiXactIdSetOldestMember() must have been called already */
00864     Assert(MultiXactIdIsValid(OldestMemberMXactId[MyBackendId]));
00865 
00866     /* safety check, we should never get this far in a HS slave */
00867     if (RecoveryInProgress())
00868         elog(ERROR, "cannot assign MultiXactIds during recovery");
00869 
00870     LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
00871 
00872     /* Handle wraparound of the nextMXact counter */
00873     if (MultiXactState->nextMXact < FirstMultiXactId)
00874         MultiXactState->nextMXact = FirstMultiXactId;
00875 
00876     /* Assign the MXID */
00877     result = MultiXactState->nextMXact;
00878 
00879     /*----------
00880      * Check to see if it's safe to assign another MultiXactId.  This protects
00881      * against catastrophic data loss due to multixact wraparound.  The basic
00882      * rules are:
00883      *
00884      * If we're past multiVacLimit, start trying to force autovacuum cycles.
00885      * If we're past multiWarnLimit, start issuing warnings.
00886      * If we're past multiStopLimit, refuse to create new MultiXactIds.
00887      *
00888      * Note these are pretty much the same protections in GetNewTransactionId.
00889      *----------
00890      */
00891     if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit))
00892     {
00893         /*
00894          * For safety's sake, we release MultiXactGenLock while sending
00895          * signals, warnings, etc.  This is not so much because we care about
00896          * preserving concurrency in this situation, as to avoid any
00897          * possibility of deadlock while doing get_database_name(). First,
00898          * copy all the shared values we'll need in this path.
00899          */
00900         MultiXactId multiWarnLimit = MultiXactState->multiWarnLimit;
00901         MultiXactId multiStopLimit = MultiXactState->multiStopLimit;
00902         MultiXactId multiWrapLimit = MultiXactState->multiWrapLimit;
00903         Oid         oldest_datoid = MultiXactState->oldestMultiXactDB;
00904 
00905         LWLockRelease(MultiXactGenLock);
00906 
00907         /*
00908          * To avoid swamping the postmaster with signals, we issue the autovac
00909          * request only once per 64K transaction starts.  This still gives
00910          * plenty of chances before we get into real trouble.
00911          */
00912         if (IsUnderPostmaster && (result % 65536) == 0)
00913             SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
00914 
00915         if (IsUnderPostmaster &&
00916             !MultiXactIdPrecedes(result, multiStopLimit))
00917         {
00918             char       *oldest_datname = get_database_name(oldest_datoid);
00919 
00920             /* complain even if that DB has disappeared */
00921             if (oldest_datname)
00922                 ereport(ERROR,
00923                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00924                          errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database \"%s\"",
00925                                 oldest_datname),
00926                          errhint("Execute a database-wide VACUUM in that database.\n"
00927                                  "You might also need to commit or roll back old prepared transactions.")));
00928             else
00929                 ereport(ERROR,
00930                         (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00931                          errmsg("database is not accepting commands that generate new MultiXactIds to avoid wraparound data loss in database with OID %u",
00932                                 oldest_datoid),
00933                          errhint("Execute a database-wide VACUUM in that database.\n"
00934                                  "You might also need to commit or roll back old prepared transactions.")));
00935         }
00936         else if (!MultiXactIdPrecedes(result, multiWarnLimit))
00937         {
00938             char       *oldest_datname = get_database_name(oldest_datoid);
00939 
00940             /* complain even if that DB has disappeared */
00941             if (oldest_datname)
00942                 ereport(WARNING,
00943                         (errmsg("database \"%s\" must be vacuumed before %u more MultiXactIds are used",
00944                                 oldest_datname,
00945                                 multiWrapLimit - result),
00946                          errhint("Execute a database-wide VACUUM in that database.\n"
00947                                  "You might also need to commit or roll back old prepared transactions.")));
00948             else
00949                 ereport(WARNING,
00950                         (errmsg("database with OID %u must be vacuumed before %u more MultiXactIds are used",
00951                                 oldest_datoid,
00952                                 multiWrapLimit - result),
00953                          errhint("Execute a database-wide VACUUM in that database.\n"
00954                                  "You might also need to commit or roll back old prepared transactions.")));
00955         }
00956 
00957         /* Re-acquire lock and start over */
00958         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
00959         result = MultiXactState->nextMXact;
00960         if (result < FirstMultiXactId)
00961             result = FirstMultiXactId;
00962     }
00963 
00964     /* Make sure there is room for the MXID in the file.  */
00965     ExtendMultiXactOffset(result);
00966 
00967     /*
00968      * Reserve the members space, similarly to above.  Also, be careful not to
00969      * return zero as the starting offset for any multixact. See
00970      * GetMultiXactIdMembers() for motivation.
00971      */
00972     nextOffset = MultiXactState->nextOffset;
00973     if (nextOffset == 0)
00974     {
00975         *offset = 1;
00976         nmembers++;             /* allocate member slot 0 too */
00977     }
00978     else
00979         *offset = nextOffset;
00980 
00981     ExtendMultiXactMember(nextOffset, nmembers);
00982 
00983     /*
00984      * Critical section from here until caller has written the data into the
00985      * just-reserved SLRU space; we don't want to error out with a partly
00986      * written MultiXact structure.  (In particular, failing to write our
00987      * start offset after advancing nextMXact would effectively corrupt the
00988      * previous MultiXact.)
00989      */
00990     START_CRIT_SECTION();
00991 
00992     /*
00993      * Advance counters.  As in GetNewTransactionId(), this must not happen
00994      * until after file extension has succeeded!
00995      *
00996      * We don't care about MultiXactId wraparound here; it will be handled by
00997      * the next iteration.  But note that nextMXact may be InvalidMultiXactId
00998      * or the first value on a segment-beginning page after this routine exits,
00999      * so anyone else looking at the variable must be prepared to deal with
01000      * either case.  Similarly, nextOffset may be zero, but we won't use that
01001      * as the actual start offset of the next multixact.
01002      */
01003     (MultiXactState->nextMXact)++;
01004 
01005     MultiXactState->nextOffset += nmembers;
01006 
01007     LWLockRelease(MultiXactGenLock);
01008 
01009     debug_elog4(DEBUG2, "GetNew: returning %u offset %u", result, *offset);
01010     return result;
01011 }
01012 
01013 /*
01014  * GetMultiXactIdMembers
01015  *      Returns the set of MultiXactMembers that make up a MultiXactId
       *
       * On success the return value is the number of members and *members points
       * to a palloc'd array holding them.  The backend-local cache is consulted
       * first; on a miss the members are read from the offsets and members SLRU
       * areas and the result is added to the cache.
01016  *
01017  * If the given MultiXactId is older than the value we know to be oldest, we
01018  * return -1.  The caller is expected to allow that only in permissible cases,
01019  * i.e. when the infomask lets it presuppose that the tuple had been
01020  * share-locked before a pg_upgrade; this means that the HEAP_XMAX_LOCK_ONLY
01021  * needs to be set, but HEAP_XMAX_KEYSHR_LOCK and HEAP_XMAX_EXCL_LOCK are not
01022  * set.
       * In that case *members is not assigned.  "Permissible" is signalled by
       * passing allow_old = true, which downgrades the report for a too-old ID
       * from ERROR to DEBUG1.
01023  *
01024  * Other border conditions, such as trying to read a value that's larger than
01025  * the value currently known as the next to assign, raise an error.  Previously
01026  * these also returned -1, but since this can lead to the wrong visibility
01027  * results, it is dangerous to do that.
01028  */
01029 int
01030 GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
01031                       bool allow_old)
01032 {
01033     int         pageno;
01034     int         prev_pageno;
01035     int         entryno;
01036     int         slotno;
01037     MultiXactOffset *offptr;
01038     MultiXactOffset offset;
01039     int         length;
01040     int         truelength;
01041     int         i;
01042     MultiXactId oldestMXact;
01043     MultiXactId nextMXact;
01044     MultiXactId tmpMXact;
01045     MultiXactOffset nextOffset;
01046     MultiXactMember *ptr;
01047 
01048     debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
01049 
01050     Assert(MultiXactIdIsValid(multi));
01051 
01052     /* See if the MultiXactId is in the local cache */
01053     length = mXactCacheGetById(multi, members);
01054     if (length >= 0)
01055     {
01056         debug_elog3(DEBUG2, "GetMembers: found %s in the cache",
01057                     mxid_to_string(multi, length, *members));
01058         return length;
01059     }
01060 
01061     /* Set our OldestVisibleMXactId[] entry if we didn't already */
01062     MultiXactIdSetOldestVisible();
01063 
01064     /*
01065      * We check known limits on MultiXact before resorting to the SLRU area.
01066      *
01067      * An ID older than MultiXactState->oldestMultiXactId cannot possibly be
01068      * useful; it should have already been frozen by vacuum.  We've truncated
01069      * the on-disk structures anyway.  Returning the wrong values could lead to
01070      * an incorrect visibility result.  However, to support pg_upgrade we need
01071      * to allow an empty set to be returned regardless, if the caller is
01072      * willing to accept it; the caller is expected to check that it's an
01073      * allowed condition (such as ensuring that the infomask bits set on the
01074      * tuple are consistent with the pg_upgrade scenario).  If the caller is
01075      * expecting this to be called only on recently created multis, then we
01076      * raise an error.
01077      *
01078      * Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is
01079      * seen, it implies undetected ID wraparound has occurred.  This raises
01080      * a hard error.
01081      *
01082      * Shared lock is enough here since we aren't modifying any global state.
01083      * Acquire it just long enough to grab the current counter values.  We may
01084      * need both nextMXact and nextOffset; see below.
01085      */
01086     LWLockAcquire(MultiXactGenLock, LW_SHARED);
01087 
01088     oldestMXact = MultiXactState->oldestMultiXactId;
01089     nextMXact = MultiXactState->nextMXact;
01090     nextOffset = MultiXactState->nextOffset;
01091 
01092     LWLockRelease(MultiXactGenLock);
01093 
          /* Too old: reported at DEBUG1 or ERROR depending on allow_old */
01094     if (MultiXactIdPrecedes(multi, oldestMXact))
01095     {
01096         ereport(allow_old ? DEBUG1 : ERROR,
01097                 (errcode(ERRCODE_INTERNAL_ERROR),
01098                  errmsg("MultiXactId %u does no longer exist -- apparent wraparound",
01099                         multi)));
01100         return -1;
01101     }
01102 
          /* Not yet assigned: always an ERROR, regardless of allow_old */
01103     if (!MultiXactIdPrecedes(multi, nextMXact))
01104         ereport(ERROR,
01105                 (errcode(ERRCODE_INTERNAL_ERROR),
01106                  errmsg("MultiXactId %u has not been created yet -- apparent wraparound",
01107                         multi)));
01108 
01109     /*
01110      * Find out the offset at which we need to start reading MultiXactMembers
01111      * and the number of members in the multixact.  We determine the latter as
01112      * the difference between this multixact's starting offset and the next
01113      * one's.  However, there are some corner cases to worry about:
01114      *
01115      * 1. This multixact may be the latest one created, in which case there is
01116      * no next one to look at.  In this case the nextOffset value we just
01117      * saved is the correct endpoint.
01118      *
01119      * 2. The next multixact may still be in process of being filled in: that
01120      * is, another process may have done GetNewMultiXactId but not yet written
01121      * the offset entry for that ID.  In that scenario, it is guaranteed that
01122      * the offset entry for that multixact exists (because GetNewMultiXactId
01123      * won't release MultiXactGenLock until it does) but contains zero
01124      * (because we are careful to pre-zero offset pages). Because
01125      * GetNewMultiXactId will never return zero as the starting offset for a
01126      * multixact, when we read zero as the next multixact's offset, we know we
01127      * have this case.  We sleep for a bit and try again.
01128      *
01129      * 3. Because GetNewMultiXactId increments offset zero to offset one to
01130      * handle case #2, there is an ambiguity near the point of offset
01131      * wraparound.  If we see next multixact's offset is one, is that our
01132      * multixact's actual endpoint, or did it end at zero with a subsequent
01133      * increment?  We handle this using the knowledge that if the zero'th
01134      * member slot wasn't filled, it'll contain zero, and zero isn't a valid
01135      * transaction ID so it can't be a multixact member.  Therefore, if we
01136      * read a zero from the members array, just ignore it.
01137      *
01138      * This is all pretty messy, but the mess occurs only in infrequent corner
01139      * cases, so it seems better than holding the MultiXactGenLock for a long
01140      * time on every multixact creation.
01141      */
01142 retry:
01143     LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
01144 
01145     pageno = MultiXactIdToOffsetPage(multi);
01146     entryno = MultiXactIdToOffsetEntry(multi);
01147 
01148     slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
01149     offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
01150     offptr += entryno;
01151     offset = *offptr;
01152 
01153     Assert(offset != 0);
01154 
01155     /*
01156      * Use the same increment rule as GetNewMultiXactId(), that is, don't
01157      * handle wraparound explicitly until needed.
01158      */
01159     tmpMXact = multi + 1;
01160 
01161     if (nextMXact == tmpMXact)
01162     {
01163         /* Corner case 1: there is no next multixact */
01164         length = nextOffset - offset;
01165     }
01166     else
01167     {
01168         MultiXactOffset nextMXOffset;
01169 
01170         /* handle wraparound if needed */
01171         if (tmpMXact < FirstMultiXactId)
01172             tmpMXact = FirstMultiXactId;
01173 
01174         prev_pageno = pageno;
01175 
01176         pageno = MultiXactIdToOffsetPage(tmpMXact);
01177         entryno = MultiXactIdToOffsetEntry(tmpMXact);
01178 
              /* Reuse the slot already read above when both IDs share a page */
01179         if (pageno != prev_pageno)
01180             slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
01181 
01182         offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
01183         offptr += entryno;
01184         nextMXOffset = *offptr;
01185 
01186         if (nextMXOffset == 0)
01187         {
01188             /* Corner case 2: next multixact is still being filled in */
01189             LWLockRelease(MultiXactOffsetControlLock);
01190             pg_usleep(1000L);
01191             goto retry;
01192         }
01193 
01194         length = nextMXOffset - offset;
01195     }
01196 
01197     LWLockRelease(MultiXactOffsetControlLock);
01198 
          /*
           * "length" is only an upper bound on the member count: slots holding
           * an invalid (zero) XID are skipped in the loop below, so the array
           * may end up only partly filled and "truelength" is what we report.
           */
01199     ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
01200     *members = ptr;
01201 
01202     /* Now get the members themselves. */
01203     LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
01204 
01205     truelength = 0;
01206     prev_pageno = -1;
01207     for (i = 0; i < length; i++, offset++)
01208     {
01209         TransactionId *xactptr;
01210         uint32     *flagsptr;
01211         int         flagsoff;
01212         int         bshift;
01213         int         memberoff;
01214 
01215         pageno = MXOffsetToMemberPage(offset);
01216         memberoff = MXOffsetToMemberOffset(offset);
01217 
              /* Only re-read when this member falls on a different page */
01218         if (pageno != prev_pageno)
01219         {
01220             slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi);
01221             prev_pageno = pageno;
01222         }
01223 
01224         xactptr = (TransactionId *)
01225             (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
01226 
01227         if (!TransactionIdIsValid(*xactptr))
01228         {
01229             /* Corner case 3: we must be looking at unused slot zero */
01230             Assert(offset == 0);
01231             continue;
01232         }
01233 
              /* Extract this member's status bits from the shared flags word */
01234         flagsoff = MXOffsetToFlagsOffset(offset);
01235         bshift = MXOffsetToFlagsBitShift(offset);
01236         flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
01237 
01238         ptr[truelength].xid = *xactptr;
01239         ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
01240         truelength++;
01241     }
01242 
01243     LWLockRelease(MultiXactMemberControlLock);
01244 
01245     /*
01246      * Copy the result into the local cache.
01247      */
01248     mXactCachePut(multi, truelength, ptr);
01249 
          /* This message marks the cache-miss path: members came from SLRU */
01250     debug_elog3(DEBUG2, "GetMembers: no cache for %s",
01251                 mxid_to_string(multi, truelength, ptr));
01252     return truelength;
01253 }
01254 
01255 /*
01256  * mxactMemberComparator
01257  *      qsort comparison function for MultiXactMember
01258  *
01259  * We can't use wraparound comparison for XIDs because that does not respect
01260  * the triangle inequality!  Any old sort order will do.
01261  */
01262 static int
01263 mxactMemberComparator(const void *arg1, const void *arg2)
01264 {
01265     MultiXactMember member1 = *(const MultiXactMember *) arg1;
01266     MultiXactMember member2 = *(const MultiXactMember *) arg2;
01267 
01268     if (member1.xid > member2.xid)
01269         return 1;
01270     if (member1.xid < member2.xid)
01271         return -1;
01272     if (member1.status > member2.status)
01273         return 1;
01274     if (member1.status < member2.status)
01275         return -1;
01276     return 0;
01277 }
01278 
01279 /*
01280  * mXactCacheGetBySet
01281  *      returns a MultiXactId from the cache based on the set of
01282  *      TransactionIds that compose it, or InvalidMultiXactId if
01283  *      none matches.
01284  *
01285  * This is helpful, for example, if two transactions want to lock a huge
01286  * table.  By using the cache, the second will use the same MultiXactId
01287  * for the majority of tuples, thus keeping MultiXactId usage low (saving
01288  * both I/O and wraparound issues).
01289  *
01290  * NB: the passed members array will be sorted in-place.
01291  */
01292 static MultiXactId
01293 mXactCacheGetBySet(int nmembers, MultiXactMember *members)
01294 {
01295     mXactCacheEnt *entry;
01296 
01297     debug_elog3(DEBUG2, "CacheGet: looking for %s",
01298                 mxid_to_string(InvalidMultiXactId, nmembers, members));
01299 
01300     /* sort the array so comparison is easy */
01301     qsort(members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
01302 
01303     for (entry = MXactCache; entry != NULL; entry = entry->next)
01304     {
01305         if (entry->nmembers != nmembers)
01306             continue;
01307 
01308         /*
01309          * We assume the cache entries are sorted, and that the unused bits in
01310          * "status" are zeroed.
01311          */
01312         if (memcmp(members, entry->members, nmembers * sizeof(MultiXactMember)) == 0)
01313         {
01314             debug_elog3(DEBUG2, "CacheGet: found %u", entry->multi);
01315             return entry->multi;
01316         }
01317     }
01318 
01319     debug_elog2(DEBUG2, "CacheGet: not found :-(");
01320     return InvalidMultiXactId;
01321 }
01322 
01323 /*
01324  * mXactCacheGetById
01325  *      returns the composing MultiXactMember set from the cache for a
01326  *      given MultiXactId, if present.
01327  *
01328  * If successful, *xids is set to the address of a palloc'd copy of the
01329  * MultiXactMember set.  Return value is number of members, or -1 on failure.
01330  */
01331 static int
01332 mXactCacheGetById(MultiXactId multi, MultiXactMember **members)
01333 {
01334     mXactCacheEnt *entry;
01335 
01336     debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
01337 
01338     for (entry = MXactCache; entry != NULL; entry = entry->next)
01339     {
01340         if (entry->multi == multi)
01341         {
01342             MultiXactMember *ptr;
01343             Size        size;
01344 
01345             size = sizeof(MultiXactMember) * entry->nmembers;
01346             ptr = (MultiXactMember *) palloc(size);
01347             *members = ptr;
01348 
01349             memcpy(ptr, entry->members, size);
01350 
01351             debug_elog3(DEBUG2, "CacheGet: found %s",
01352                         mxid_to_string(multi, entry->nmembers, entry->members));
01353             return entry->nmembers;
01354         }
01355     }
01356 
01357     debug_elog2(DEBUG2, "CacheGet: not found");
01358     return -1;
01359 }
01360 
01361 /*
01362  * mXactCachePut
01363  *      Add a new MultiXactId and its composing set into the local cache.
01364  */
01365 static void
01366 mXactCachePut(MultiXactId multi, int nmembers, MultiXactMember *members)
01367 {
01368     mXactCacheEnt *entry;
01369 
01370     debug_elog3(DEBUG2, "CachePut: storing %s",
01371                 mxid_to_string(multi, nmembers, members));
01372 
01373     if (MXactContext == NULL)
01374     {
01375         /* The cache only lives as long as the current transaction */
01376         debug_elog2(DEBUG2, "CachePut: initializing memory context");
01377         MXactContext = AllocSetContextCreate(TopTransactionContext,
01378                                              "MultiXact Cache Context",
01379                                              ALLOCSET_SMALL_MINSIZE,
01380                                              ALLOCSET_SMALL_INITSIZE,
01381                                              ALLOCSET_SMALL_MAXSIZE);
01382     }
01383 
01384     entry = (mXactCacheEnt *)
01385         MemoryContextAlloc(MXactContext,
01386                            offsetof(mXactCacheEnt, members) +
01387                            nmembers * sizeof(MultiXactMember));
01388 
01389     entry->multi = multi;
01390     entry->nmembers = nmembers;
01391     memcpy(entry->members, members, nmembers * sizeof(MultiXactMember));
01392 
01393     /* mXactCacheGetBySet assumes the entries are sorted, so sort them */
01394     qsort(entry->members, nmembers, sizeof(MultiXactMember), mxactMemberComparator);
01395 
01396     entry->next = MXactCache;
01397     MXactCache = entry;
01398 }
01399 
01400 static char *
01401 mxstatus_to_string(MultiXactStatus status)
01402 {
01403     switch (status)
01404     {
01405         case MultiXactStatusForKeyShare:
01406             return "keysh";
01407         case MultiXactStatusForShare:
01408             return "sh";
01409         case MultiXactStatusForNoKeyUpdate:
01410             return "fornokeyupd";
01411         case MultiXactStatusForUpdate:
01412             return "forupd";
01413         case MultiXactStatusNoKeyUpdate:
01414             return "nokeyupd";
01415         case MultiXactStatusUpdate:
01416             return "upd";
01417         default:
01418             elog(ERROR, "unrecognized multixact status %d", status);
01419             return "";
01420     }
01421 }
01422 
01423 char *
01424 mxid_to_string(MultiXactId multi, int nmembers, MultiXactMember *members)
01425 {
01426     static char    *str = NULL;
01427     StringInfoData  buf;
01428     int         i;
01429 
01430     if (str != NULL)
01431         pfree(str);
01432 
01433     initStringInfo(&buf);
01434 
01435     appendStringInfo(&buf, "%u %d[%u (%s)", multi, nmembers, members[0].xid,
01436                      mxstatus_to_string(members[0].status));
01437 
01438     for (i = 1; i < nmembers; i++)
01439         appendStringInfo(&buf, ", %u (%s)", members[i].xid,
01440                          mxstatus_to_string(members[i].status));
01441 
01442     appendStringInfoChar(&buf, ']');
01443     str = MemoryContextStrdup(TopMemoryContext, buf.data);
01444     pfree(buf.data);
01445     return str;
01446 }
01447 
01448 /*
01449  * AtEOXact_MultiXact
01450  *      Handle transaction end for MultiXact
01451  *
01452  * This is called at top transaction commit or abort (we don't care which).
01453  */
01454 void
01455 AtEOXact_MultiXact(void)
01456 {
01457     /*
01458      * Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
01459      * which should only be valid while within a transaction.
01460      *
01461      * We assume that storing a MultiXactId is atomic and so we need not take
01462      * MultiXactGenLock to do this.
01463      */
01464     OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
01465     OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
01466 
01467     /*
01468      * Discard the local MultiXactId cache.  Since MXactContext was created as
01469      * a child of TopTransactionContext, we needn't delete it explicitly.
01470      */
01471     MXactContext = NULL;
01472     MXactCache = NULL;
01473 }
01474 
01475 /*
01476  * AtPrepare_MultiXact
01477  *      Save multixact state at 2PC tranasction prepare
01478  *
01479  * In this phase, we only store our OldestMemberMXactId value in the two-phase
01480  * state file.
01481  */
01482 void
01483 AtPrepare_MultiXact(void)
01484 {
01485     MultiXactId myOldestMember = OldestMemberMXactId[MyBackendId];
01486 
01487     if (MultiXactIdIsValid(myOldestMember))
01488         RegisterTwoPhaseRecord(TWOPHASE_RM_MULTIXACT_ID, 0,
01489                                &myOldestMember, sizeof(MultiXactId));
01490 }
01491 
01492 /*
01493  * PostPrepare_MultiXact
01494  *      Clean up after successful PREPARE TRANSACTION
01495  */
01496 void
01497 PostPrepare_MultiXact(TransactionId xid)
01498 {
01499     MultiXactId myOldestMember;
01500 
01501     /*
01502      * Transfer our OldestMemberMXactId value to the slot reserved for the
01503      * prepared transaction.
01504      */
01505     myOldestMember = OldestMemberMXactId[MyBackendId];
01506     if (MultiXactIdIsValid(myOldestMember))
01507     {
01508         BackendId   dummyBackendId = TwoPhaseGetDummyBackendId(xid);
01509 
01510         /*
01511          * Even though storing MultiXactId is atomic, acquire lock to make
01512          * sure others see both changes, not just the reset of the slot of the
01513          * current backend. Using a volatile pointer might suffice, but this
01514          * isn't a hot spot.
01515          */
01516         LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
01517 
01518         OldestMemberMXactId[dummyBackendId] = myOldestMember;
01519         OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
01520 
01521         LWLockRelease(MultiXactGenLock);
01522     }
01523 
01524     /*
01525      * We don't need to transfer OldestVisibleMXactId value, because the
01526      * transaction is not going to be looking at any more multixacts once it's
01527      * prepared.
01528      *
01529      * We assume that storing a MultiXactId is atomic and so we need not take
01530      * MultiXactGenLock to do this.
01531      */
01532     OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
01533 
01534     /*
01535      * Discard the local MultiXactId cache like in AtEOX_MultiXact
01536      */
01537     MXactContext = NULL;
01538     MXactCache = NULL;
01539 }
01540 
01541 /*
01542  * multixact_twophase_recover
01543  *      Recover the state of a prepared transaction at startup
01544  */
01545 void
01546 multixact_twophase_recover(TransactionId xid, uint16 info,
01547                            void *recdata, uint32 len)
01548 {
01549     BackendId   dummyBackendId = TwoPhaseGetDummyBackendId(xid);
01550     MultiXactId oldestMember;
01551 
01552     /*
01553      * Get the oldest member XID from the state file record, and set it in the
01554      * OldestMemberMXactId slot reserved for this prepared transaction.
01555      */
01556     Assert(len == sizeof(MultiXactId));
01557     oldestMember = *((MultiXactId *) recdata);
01558 
01559     OldestMemberMXactId[dummyBackendId] = oldestMember;
01560 }
01561 
01562 /*
01563  * multixact_twophase_postcommit
01564  *      Similar to AtEOX_MultiXact but for COMMIT PREPARED
01565  */
01566 void
01567 multixact_twophase_postcommit(TransactionId xid, uint16 info,
01568                               void *recdata, uint32 len)
01569 {
01570     BackendId   dummyBackendId = TwoPhaseGetDummyBackendId(xid);
01571 
01572     Assert(len == sizeof(MultiXactId));
01573 
01574     OldestMemberMXactId[dummyBackendId] = InvalidMultiXactId;
01575 }
01576 
/*
 * multixact_twophase_postabort
 *      Cleanup for ROLLBACK PREPARED.
 *
 * The work is exactly the same as for the COMMIT PREPARED case, so simply
 * delegate to multixact_twophase_postcommit with the arguments unchanged.
 */
void
multixact_twophase_postabort(TransactionId xid, uint16 info,
                             void *recdata, uint32 len)
{
    multixact_twophase_postcommit(xid, info, recdata, len);
}
01587 
01588 /*
01589  * Initialization of shared memory for MultiXact.  We use two SLRU areas,
01590  * thus double memory.  Also, reserve space for the shared MultiXactState
01591  * struct and the per-backend MultiXactId arrays (two of those, too).
01592  */
01593 Size
01594 MultiXactShmemSize(void)
01595 {
01596     Size        size;
01597 
01598 #define SHARED_MULTIXACT_STATE_SIZE \
01599     add_size(sizeof(MultiXactStateData), \
01600              mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
01601 
01602     size = SHARED_MULTIXACT_STATE_SIZE;
01603     size = add_size(size, SimpleLruShmemSize(NUM_MXACTOFFSET_BUFFERS, 0));
01604     size = add_size(size, SimpleLruShmemSize(NUM_MXACTMEMBER_BUFFERS, 0));
01605 
01606     return size;
01607 }
01608 
01609 void
01610 MultiXactShmemInit(void)
01611 {
01612     bool        found;
01613 
01614     debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
01615 
01616     MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes;
01617     MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
01618 
01619     SimpleLruInit(MultiXactOffsetCtl,
01620                   "MultiXactOffset Ctl", NUM_MXACTOFFSET_BUFFERS, 0,
01621                   MultiXactOffsetControlLock, "pg_multixact/offsets");
01622     SimpleLruInit(MultiXactMemberCtl,
01623                   "MultiXactMember Ctl", NUM_MXACTMEMBER_BUFFERS, 0,
01624                   MultiXactMemberControlLock, "pg_multixact/members");
01625 
01626     /* Initialize our shared state struct */
01627     MultiXactState = ShmemInitStruct("Shared MultiXact State",
01628                                      SHARED_MULTIXACT_STATE_SIZE,
01629                                      &found);
01630     if (!IsUnderPostmaster)
01631     {
01632         Assert(!found);
01633 
01634         /* Make sure we zero out the per-backend state */
01635         MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE);
01636     }
01637     else
01638         Assert(found);
01639 
01640     /*
01641      * Set up array pointers.  Note that perBackendXactIds[0] is wasted space
01642      * since we only use indexes 1..MaxOldestSlot in each array.
01643      */
01644     OldestMemberMXactId = MultiXactState->perBackendXactIds;
01645     OldestVisibleMXactId = OldestMemberMXactId + MaxOldestSlot;
01646 }
01647 
01648 /*
01649  * This func must be called ONCE on system install.  It creates the initial
01650  * MultiXact segments.  (The MultiXacts directories are assumed to have been
01651  * created by initdb, and MultiXactShmemInit must have been called already.)
01652  */
01653 void
01654 BootStrapMultiXact(void)
01655 {
01656     int         slotno;
01657 
01658     LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
01659 
01660     /* Create and zero the first page of the offsets log */
01661     slotno = ZeroMultiXactOffsetPage(0, false);
01662 
01663     /* Make sure it's written out */
01664     SimpleLruWritePage(MultiXactOffsetCtl, slotno);
01665     Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
01666 
01667     LWLockRelease(MultiXactOffsetControlLock);
01668 
01669     LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
01670 
01671     /* Create and zero the first page of the members log */
01672     slotno = ZeroMultiXactMemberPage(0, false);
01673 
01674     /* Make sure it's written out */
01675     SimpleLruWritePage(MultiXactMemberCtl, slotno);
01676     Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
01677 
01678     LWLockRelease(MultiXactMemberControlLock);
01679 }
01680 
01681 /*
01682  * Initialize (or reinitialize) a page of MultiXactOffset to zeroes.
01683  * If writeXlog is TRUE, also emit an XLOG record saying we did this.
01684  *
01685  * The page is not actually written, just set up in shared memory.
01686  * The slot number of the new page is returned.
01687  *
01688  * Control lock must be held at entry, and will be held at exit.
01689  */
01690 static int
01691 ZeroMultiXactOffsetPage(int pageno, bool writeXlog)
01692 {
01693     int         slotno;
01694 
01695     slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
01696 
01697     if (writeXlog)
01698         WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
01699 
01700     return slotno;
01701 }
01702 
01703 /*
01704  * Ditto, for MultiXactMember
01705  */
01706 static int
01707 ZeroMultiXactMemberPage(int pageno, bool writeXlog)
01708 {
01709     int         slotno;
01710 
01711     slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
01712 
01713     if (writeXlog)
01714         WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
01715 
01716     return slotno;
01717 }
01718 
01719 /*
01720  * This must be called ONCE during postmaster or standalone-backend startup.
01721  *
01722  * StartupXLOG has already established nextMXact/nextOffset by calling
01723  * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
01724  * info from pg_control and/or MultiXactAdvanceOldest.  Note that we may
01725  * already have replayed WAL data into the SLRU files.
01726  *
01727  * We don't need any locks here, really; the SLRU locks are taken
01728  * only because slru.c expects to be called with locks held.
01729  */
01730 void
01731 StartupMultiXact(void)
01732 {
01733     MultiXactId multi = MultiXactState->nextMXact;
01734     MultiXactOffset offset = MultiXactState->nextOffset;
01735     int         pageno;
01736     int         entryno;
01737     int         flagsoff;
01738 
01739     /* Clean up offsets state */
01740     LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
01741 
01742     /*
01743      * Initialize our idea of the latest page number.
01744      */
01745     pageno = MultiXactIdToOffsetPage(multi);
01746     MultiXactOffsetCtl->shared->latest_page_number = pageno;
01747 
01748     /*
01749      * Zero out the remainder of the current offsets page.  See notes in
01750      * StartupCLOG() for motivation.
01751      */
01752     entryno = MultiXactIdToOffsetEntry(multi);
01753     if (entryno != 0)
01754     {
01755         int         slotno;
01756         MultiXactOffset *offptr;
01757 
01758         slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
01759         offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
01760         offptr += entryno;
01761 
01762         MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
01763 
01764         MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
01765     }
01766 
01767     LWLockRelease(MultiXactOffsetControlLock);
01768 
01769     /* And the same for members */
01770     LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
01771 
01772     /*
01773      * Initialize our idea of the latest page number.
01774      */
01775     pageno = MXOffsetToMemberPage(offset);
01776     MultiXactMemberCtl->shared->latest_page_number = pageno;
01777 
01778     /*
01779      * Zero out the remainder of the current members page.  See notes in
01780      * TrimCLOG() for motivation.
01781      */
01782     flagsoff = MXOffsetToFlagsOffset(offset);
01783     if (flagsoff != 0)
01784     {
01785         int         slotno;
01786         TransactionId *xidptr;
01787         int         memberoff;
01788 
01789         memberoff = MXOffsetToMemberOffset(offset);
01790         slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
01791         xidptr = (TransactionId *)
01792             (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
01793 
01794         MemSet(xidptr, 0, BLCKSZ - memberoff);
01795 
01796         /*
01797          * Note: we don't need to zero out the flag bits in the remaining
01798          * members of the current group, because they are always reset before
01799          * writing.
01800          */
01801 
01802         MultiXactMemberCtl->shared->page_dirty[slotno] = true;
01803     }
01804 
01805     LWLockRelease(MultiXactMemberControlLock);
01806 }
01807 
/*
 * Shutdown-time flush of MultiXact state.
 *
 * This must be called ONCE during postmaster or standalone-backend shutdown.
 * Both SLRU areas (offsets, then members) are flushed, bracketed by the
 * checkpoint start/done trace points.
 */
void
ShutdownMultiXact(void)
{
    /* Flush dirty MultiXact pages to disk */
    TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false);
    SimpleLruFlush(MultiXactOffsetCtl, false);
    SimpleLruFlush(MultiXactMemberCtl, false);
    TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false);
}
01820 
01821 /*
01822  * Get the MultiXact data to save in a checkpoint record
01823  */
01824 void
01825 MultiXactGetCheckptMulti(bool is_shutdown,
01826                          MultiXactId *nextMulti,
01827                          MultiXactOffset *nextMultiOffset,
01828                          MultiXactId *oldestMulti,
01829                          Oid *oldestMultiDB)
01830 {
01831     LWLockAcquire(MultiXactGenLock, LW_SHARED);
01832     *nextMulti = MultiXactState->nextMXact;
01833     *nextMultiOffset = MultiXactState->nextOffset;
01834     *oldestMulti = MultiXactState->oldestMultiXactId;
01835     *oldestMultiDB = MultiXactState->oldestMultiXactDB;
01836     LWLockRelease(MultiXactGenLock);
01837 
01838     debug_elog6(DEBUG2,
01839                 "MultiXact: checkpoint is nextMulti %u, nextOffset %u, oldestMulti %u in DB %u",
01840                 *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB);
01841 }
01842 
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 *
 * Flushes both SLRU areas (offsets, then members), bracketed by the
 * checkpoint start/done trace points.
 */
void
CheckPointMultiXact(void)
{
    TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);

    /* Flush dirty MultiXact pages to disk */
    SimpleLruFlush(MultiXactOffsetCtl, true);
    SimpleLruFlush(MultiXactMemberCtl, true);

    TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
}
01857 
/*
 * Set the next-to-be-assigned MultiXactId and offset
 *
 * This is used when we can determine the correct next ID/offset exactly
 * from a checkpoint record.  Although this is only called during bootstrap
 * and XLog replay, we take the lock in case any hot-standby backends are
 * examining the values.
 *
 * Both values are stored unconditionally, overwriting whatever the shared
 * state held before.
 */
void
MultiXactSetNextMXact(MultiXactId nextMulti,
                      MultiXactOffset nextMultiOffset)
{
    debug_elog4(DEBUG2, "MultiXact: setting next multi to %u offset %u",
                nextMulti, nextMultiOffset);
    /* Update both fields under one exclusive lock so readers see a pair */
    LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
    MultiXactState->nextMXact = nextMulti;
    MultiXactState->nextOffset = nextMultiOffset;
    LWLockRelease(MultiXactGenLock);
}
01877 
01878 /*
01879  * Determine the last safe MultiXactId to allocate given the currently oldest
01880  * datminmxid (ie, the oldest MultiXactId that might exist in any database
01881  * of our cluster), and the OID of the (or a) database with that value.
01882  */
01883 void
01884 SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
01885 {
01886     MultiXactId multiVacLimit;
01887     MultiXactId multiWarnLimit;
01888     MultiXactId multiStopLimit;
01889     MultiXactId multiWrapLimit;
01890     MultiXactId curMulti;
01891 
01892     Assert(MultiXactIdIsValid(oldest_datminmxid));
01893 
01894     /*
01895      * The place where we actually get into deep trouble is halfway around
01896      * from the oldest potentially-existing XID/multi.  (This calculation is
01897      * probably off by one or two counts for Xids, because the special XIDs
01898      * reduce the size of the loop a little bit.  But we throw in plenty of
01899      * slop below, so it doesn't matter.)
01900      */
01901     multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
01902     if (multiWrapLimit < FirstMultiXactId)
01903         multiWrapLimit += FirstMultiXactId;
01904 
01905     /*
01906      * We'll refuse to continue assigning MultiXactIds once we get within 100
01907      * multi of data loss.
01908      */
01909     multiStopLimit = multiWrapLimit - 100;
01910     if (multiStopLimit < FirstMultiXactId)
01911         multiStopLimit -= FirstMultiXactId;
01912 
01913     /*
01914      * We'll start complaining loudly when we get within 10M multis of the stop
01915      * point.   This is kind of arbitrary, but if you let your gas gauge get
01916      * down to 1% of full, would you be looking for the next gas station?  We
01917      * need to be fairly liberal about this number because there are lots of
01918      * scenarios where most transactions are done by automatic clients that
01919      * won't pay attention to warnings. (No, we're not gonna make this
01920      * configurable.  If you know enough to configure it, you know enough to
01921      * not get in this kind of trouble in the first place.)
01922      */
01923     multiWarnLimit = multiStopLimit - 10000000;
01924     if (multiWarnLimit < FirstMultiXactId)
01925         multiWarnLimit -= FirstMultiXactId;
01926 
01927     /*
01928      * We'll start trying to force autovacuums when oldest_datminmxid gets
01929      * to be more than 200 million transactions old.
01930      */
01931     multiVacLimit = oldest_datminmxid + 200000000;
01932     if (multiVacLimit < FirstMultiXactId)
01933         multiVacLimit += FirstMultiXactId;
01934 
01935     /* Grab lock for just long enough to set the new limit values */
01936     LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
01937     MultiXactState->oldestMultiXactId = oldest_datminmxid;
01938     MultiXactState->oldestMultiXactDB = oldest_datoid;
01939     MultiXactState->multiVacLimit = multiVacLimit;
01940     MultiXactState->multiWarnLimit = multiWarnLimit;
01941     MultiXactState->multiStopLimit = multiStopLimit;
01942     MultiXactState->multiWrapLimit = multiWrapLimit;
01943     curMulti = MultiXactState->nextMXact;
01944     LWLockRelease(MultiXactGenLock);
01945 
01946     /* Log the info */
01947     ereport(DEBUG1,
01948             (errmsg("MultiXactId wrap limit is %u, limited by database with OID %u",
01949                     multiWrapLimit, oldest_datoid)));
01950 
01951     /*
01952      * If past the autovacuum force point, immediately signal an autovac
01953      * request.  The reason for this is that autovac only processes one
01954      * database per invocation.  Once it's finished cleaning up the oldest
01955      * database, it'll call here, and we'll signal the postmaster to start
01956      * another iteration immediately if there are still any old databases.
01957      */
01958     if (MultiXactIdPrecedes(multiVacLimit, curMulti) &&
01959         IsUnderPostmaster && !InRecovery)
01960         SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
01961 
01962     /* Give an immediate warning if past the wrap warn point */
01963     if (MultiXactIdPrecedes(multiWarnLimit, curMulti) && !InRecovery)
01964     {
01965         char       *oldest_datname;
01966 
01967         /*
01968          * We can be called when not inside a transaction, for example during
01969          * StartupXLOG().  In such a case we cannot do database access, so we
01970          * must just report the oldest DB's OID.
01971          *
01972          * Note: it's also possible that get_database_name fails and returns
01973          * NULL, for example because the database just got dropped.  We'll
01974          * still warn, even though the warning might now be unnecessary.
01975          */
01976         if (IsTransactionState())
01977             oldest_datname = get_database_name(oldest_datoid);
01978         else
01979             oldest_datname = NULL;
01980 
01981         if (oldest_datname)
01982             ereport(WARNING,
01983                     (errmsg("database \"%s\" must be vacuumed before %u more MultiXactIds are used",
01984                             oldest_datname,
01985                             multiWrapLimit - curMulti),
01986                      errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
01987                              "You might also need to commit or roll back old prepared transactions.")));
01988         else
01989             ereport(WARNING,
01990                     (errmsg("database with OID %u must be vacuumed before %u more MultiXactIds are used",
01991                             oldest_datoid,
01992                             multiWrapLimit - curMulti),
01993                      errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
01994                              "You might also need to commit or roll back old prepared transactions.")));
01995     }
01996 }
01997 
/*
 * Ensure the next-to-be-assigned MultiXactId is at least minMulti,
 * and similarly nextOffset is at least minMultiOffset.
 *
 * This is used when we can determine minimum safe values from an XLog
 * record (either an on-line checkpoint or an mxact creation log entry).
 * Although this is only called during XLog replay, we take the lock in case
 * any hot-standby backends are examining the values.
 *
 * The two counters are advanced independently of each other, and neither is
 * ever moved backwards; "at least" is judged with the wraparound-aware
 * comparison functions.
 */
void
MultiXactAdvanceNextMXact(MultiXactId minMulti,
                          MultiXactOffset minMultiOffset)
{
    LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
    /* Advance nextMXact only if it currently lags behind minMulti */
    if (MultiXactIdPrecedes(MultiXactState->nextMXact, minMulti))
    {
        debug_elog3(DEBUG2, "MultiXact: setting next multi to %u", minMulti);
        MultiXactState->nextMXact = minMulti;
    }
    /* Likewise for nextOffset versus minMultiOffset */
    if (MultiXactOffsetPrecedes(MultiXactState->nextOffset, minMultiOffset))
    {
        debug_elog3(DEBUG2, "MultiXact: setting next offset to %u",
                    minMultiOffset);
        MultiXactState->nextOffset = minMultiOffset;
    }
    LWLockRelease(MultiXactGenLock);
}
02025 
02026 /*
02027  * Update our oldestMultiXactId value, but only if it's more recent than
02028  * what we had.
02029  */
02030 void
02031 MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
02032 {
02033     if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
02034         SetMultiXactIdLimit(oldestMulti, oldestMultiDB);
02035 }
02036 
02037 /*
02038  * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId.
02039  *
02040  * NB: this is called while holding MultiXactGenLock.  We want it to be very
02041  * fast most of the time; even when it's not so fast, no actual I/O need
02042  * happen unless we're forced to write out a dirty log or xlog page to make
02043  * room in shared memory.
02044  */
02045 static void
02046 ExtendMultiXactOffset(MultiXactId multi)
02047 {
02048     int         pageno;
02049 
02050     /*
02051      * No work except at first MultiXactId of a page.  But beware: just after
02052      * wraparound, the first MultiXactId of page zero is FirstMultiXactId.
02053      */
02054     if (MultiXactIdToOffsetEntry(multi) != 0 &&
02055         multi != FirstMultiXactId)
02056         return;
02057 
02058     pageno = MultiXactIdToOffsetPage(multi);
02059 
02060     LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
02061 
02062     /* Zero the page and make an XLOG entry about it */
02063     ZeroMultiXactOffsetPage(pageno, true);
02064 
02065     LWLockRelease(MultiXactOffsetControlLock);
02066 }
02067 
02068 /*
02069  * Make sure that MultiXactMember has room for the members of a newly-
02070  * allocated MultiXactId.
02071  *
02072  * Like the above routine, this is called while holding MultiXactGenLock;
02073  * same comments apply.
02074  */
02075 static void
02076 ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
02077 {
02078     /*
02079      * It's possible that the members span more than one page of the members
02080      * file, so we loop to ensure we consider each page.  The coding is not
02081      * optimal if the members span several pages, but that seems unusual
02082      * enough to not worry much about.
02083      */
02084     while (nmembers > 0)
02085     {
02086         int         flagsoff;
02087         int         flagsbit;
02088         int         difference;
02089 
02090         /*
02091          * Only zero when at first entry of a page.
02092          */
02093         flagsoff = MXOffsetToFlagsOffset(offset);
02094         flagsbit = MXOffsetToFlagsBitShift(offset);
02095         if (flagsoff == 0 && flagsbit == 0)
02096         {
02097             int         pageno;
02098 
02099             pageno = MXOffsetToMemberPage(offset);
02100 
02101             LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
02102 
02103             /* Zero the page and make an XLOG entry about it */
02104             ZeroMultiXactMemberPage(pageno, true);
02105 
02106             LWLockRelease(MultiXactMemberControlLock);
02107         }
02108 
02109         /* Advance to next page (OK if nmembers goes negative) */
02110         difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
02111         offset += difference;
02112         nmembers -= difference;
02113     }
02114 }
02115 
02116 /*
02117  * GetOldestMultiXactId
02118  *
02119  * Return the oldest MultiXactId that's still possibly still seen as live by
02120  * any running transaction.  Older ones might still exist on disk, but they no
02121  * longer have any running member transaction.
02122  *
02123  * It's not safe to truncate MultiXact SLRU segments on the value returned by
02124  * this function; however, it can be used by a full-table vacuum to set the
02125  * point at which it will be possible to truncate SLRU for that table.
02126  */
02127 MultiXactId
02128 GetOldestMultiXactId(void)
02129 {
02130     MultiXactId     oldestMXact;
02131     MultiXactId     nextMXact;
02132     int             i;
02133 
02134     /*
02135      * This is the oldest valid value among all the OldestMemberMXactId[] and
02136      * OldestVisibleMXactId[] entries, or nextMXact if none are valid.
02137      */
02138     LWLockAcquire(MultiXactGenLock, LW_SHARED);
02139 
02140     /*
02141      * We have to beware of the possibility that nextMXact is in the
02142      * wrapped-around state.  We don't fix the counter itself here, but we
02143      * must be sure to use a valid value in our calculation.
02144      */
02145     nextMXact = MultiXactState->nextMXact;
02146     if (nextMXact < FirstMultiXactId)
02147         nextMXact = FirstMultiXactId;
02148 
02149     oldestMXact = nextMXact;
02150     for (i = 1; i <= MaxOldestSlot; i++)
02151     {
02152         MultiXactId thisoldest;
02153 
02154         thisoldest = OldestMemberMXactId[i];
02155         if (MultiXactIdIsValid(thisoldest) &&
02156             MultiXactIdPrecedes(thisoldest, oldestMXact))
02157             oldestMXact = thisoldest;
02158         thisoldest = OldestVisibleMXactId[i];
02159         if (MultiXactIdIsValid(thisoldest) &&
02160             MultiXactIdPrecedes(thisoldest, oldestMXact))
02161             oldestMXact = thisoldest;
02162     }
02163 
02164     LWLockRelease(MultiXactGenLock);
02165 
02166     return oldestMXact;
02167 }
02168 
/*
 * State passed to SlruScanDirCbFindEarliest through SlruScanDirectory.
 */
typedef struct mxtruncinfo
{
    /* earliest page number seen so far; -1 means none has been seen yet */
    int     earliestExistingPage;
} mxtruncinfo;
02173 
02174 /*
02175  * SlruScanDirectory callback
02176  *      This callback determines the earliest existing page number.
02177  */
02178 static bool
02179 SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data)
02180 {
02181     mxtruncinfo     *trunc = (mxtruncinfo *) data;
02182 
02183     if (trunc->earliestExistingPage == -1 ||
02184         ctl->PagePrecedes(segpage, trunc->earliestExistingPage))
02185     {
02186         trunc->earliestExistingPage = segpage;
02187     }
02188 
02189     return false;   /* keep going */
02190 }
02191 
02192 /*
02193  * Remove all MultiXactOffset and MultiXactMember segments before the oldest
02194  * ones still of interest.
02195  *
02196  * This is called by vacuum after it has successfully advanced a database's
02197  * datminmxid value; the cutoff value we're passed is the minimum of all
02198  * databases' datminmxid values.
02199  */
02200 void
02201 TruncateMultiXact(MultiXactId oldestMXact)
02202 {
02203     MultiXactOffset oldestOffset;
02204     mxtruncinfo     trunc;
02205     MultiXactId     earliest;
02206 
02207     /*
02208      * Note we can't just plow ahead with the truncation; it's possible that
02209      * there are no segments to truncate, which is a problem because we are
02210      * going to attempt to read the offsets page to determine where to truncate
02211      * the members SLRU.  So we first scan the directory to determine the
02212      * earliest offsets page number that we can read without error.
02213      */
02214     trunc.earliestExistingPage = -1;
02215     SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc);
02216     earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE;
02217 
02218     /* nothing to do */
02219     if (MultiXactIdPrecedes(oldestMXact, earliest))
02220         return;
02221 
02222     /*
02223      * First, compute the safe truncation point for MultiXactMember.
02224      * This is the starting offset of the multixact we were passed
02225      * as MultiXactOffset cutoff.
02226      */
02227     {
02228         int         pageno;
02229         int         slotno;
02230         int         entryno;
02231         MultiXactOffset *offptr;
02232 
02233         /* lock is acquired by SimpleLruReadPage_ReadOnly */
02234 
02235         pageno = MultiXactIdToOffsetPage(oldestMXact);
02236         entryno = MultiXactIdToOffsetEntry(oldestMXact);
02237 
02238         slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno,
02239                                             oldestMXact);
02240         offptr = (MultiXactOffset *)
02241             MultiXactOffsetCtl->shared->page_buffer[slotno];
02242         offptr += entryno;
02243         oldestOffset = *offptr;
02244 
02245         LWLockRelease(MultiXactOffsetControlLock);
02246     }
02247 
02248     /* truncate MultiXactOffset */
02249     SimpleLruTruncate(MultiXactOffsetCtl,
02250                       MultiXactIdToOffsetPage(oldestMXact));
02251 
02252     /* truncate MultiXactMembers and we're done */
02253     SimpleLruTruncate(MultiXactMemberCtl,
02254                       MXOffsetToMemberPage(oldestOffset));
02255 }
02256 
02257 /*
02258  * Decide which of two MultiXactOffset page numbers is "older" for truncation
02259  * purposes.
02260  *
02261  * We need to use comparison of MultiXactId here in order to do the right
02262  * thing with wraparound.  However, if we are asked about page number zero, we
02263  * don't want to hand InvalidMultiXactId to MultiXactIdPrecedes: it'll get
02264  * weird.  So, offset both multis by FirstMultiXactId to avoid that.
02265  * (Actually, the current implementation doesn't do anything weird with
02266  * InvalidMultiXactId, but there's no harm in leaving this code like this.)
02267  */
02268 static bool
02269 MultiXactOffsetPagePrecedes(int page1, int page2)
02270 {
02271     MultiXactId multi1;
02272     MultiXactId multi2;
02273 
02274     multi1 = ((MultiXactId) page1) * MULTIXACT_OFFSETS_PER_PAGE;
02275     multi1 += FirstMultiXactId;
02276     multi2 = ((MultiXactId) page2) * MULTIXACT_OFFSETS_PER_PAGE;
02277     multi2 += FirstMultiXactId;
02278 
02279     return MultiXactIdPrecedes(multi1, multi2);
02280 }
02281 
02282 /*
02283  * Decide which of two MultiXactMember page numbers is "older" for truncation
02284  * purposes.  There is no "invalid offset number" so use the numbers verbatim.
02285  */
02286 static bool
02287 MultiXactMemberPagePrecedes(int page1, int page2)
02288 {
02289     MultiXactOffset offset1;
02290     MultiXactOffset offset2;
02291 
02292     offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
02293     offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
02294 
02295     return MultiXactOffsetPrecedes(offset1, offset2);
02296 }
02297 
02298 /*
02299  * Decide which of two MultiXactIds is earlier.
02300  *
02301  * XXX do we need to do something special for InvalidMultiXactId?
02302  * (Doesn't look like it.)
02303  */
02304 bool
02305 MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
02306 {
02307     int32       diff = (int32) (multi1 - multi2);
02308 
02309     return (diff < 0);
02310 }
02311 
02312 /*
02313  * Decide which of two offsets is earlier.
02314  */
02315 static bool
02316 MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
02317 {
02318     int32       diff = (int32) (offset1 - offset2);
02319 
02320     return (diff < 0);
02321 }
02322 
02323 /*
02324  * Write an xlog record reflecting the zeroing of either a MEMBERs or
02325  * OFFSETs page (info shows which)
02326  */
02327 static void
02328 WriteMZeroPageXlogRec(int pageno, uint8 info)
02329 {
02330     XLogRecData rdata;
02331 
02332     rdata.data = (char *) (&pageno);
02333     rdata.len = sizeof(int);
02334     rdata.buffer = InvalidBuffer;
02335     rdata.next = NULL;
02336     (void) XLogInsert(RM_MULTIXACT_ID, info, &rdata);
02337 }
02338 
02339 /*
02340  * MULTIXACT resource manager's routines
02341  */
02342 void
02343 multixact_redo(XLogRecPtr lsn, XLogRecord *record)
02344 {
02345     uint8       info = record->xl_info & ~XLR_INFO_MASK;
02346 
02347     /* Backup blocks are not used in multixact records */
02348     Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
02349 
02350     if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
02351     {
02352         int         pageno;
02353         int         slotno;
02354 
02355         memcpy(&pageno, XLogRecGetData(record), sizeof(int));
02356 
02357         LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
02358 
02359         slotno = ZeroMultiXactOffsetPage(pageno, false);
02360         SimpleLruWritePage(MultiXactOffsetCtl, slotno);
02361         Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
02362 
02363         LWLockRelease(MultiXactOffsetControlLock);
02364     }
02365     else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE)
02366     {
02367         int         pageno;
02368         int         slotno;
02369 
02370         memcpy(&pageno, XLogRecGetData(record), sizeof(int));
02371 
02372         LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
02373 
02374         slotno = ZeroMultiXactMemberPage(pageno, false);
02375         SimpleLruWritePage(MultiXactMemberCtl, slotno);
02376         Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
02377 
02378         LWLockRelease(MultiXactMemberControlLock);
02379     }
02380     else if (info == XLOG_MULTIXACT_CREATE_ID)
02381     {
02382         xl_multixact_create *xlrec =
02383             (xl_multixact_create *) XLogRecGetData(record);
02384         TransactionId max_xid;
02385         int         i;
02386 
02387         /* Store the data back into the SLRU files */
02388         RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers,
02389                            xlrec->members);
02390 
02391         /* Make sure nextMXact/nextOffset are beyond what this record has */
02392         MultiXactAdvanceNextMXact(xlrec->mid + 1,
02393                                   xlrec->moff + xlrec->nmembers);
02394 
02395         /*
02396          * Make sure nextXid is beyond any XID mentioned in the record. This
02397          * should be unnecessary, since any XID found here ought to have other
02398          * evidence in the XLOG, but let's be safe.
02399          */
02400         max_xid = record->xl_xid;
02401         for (i = 0; i < xlrec->nmembers; i++)
02402         {
02403             if (TransactionIdPrecedes(max_xid, xlrec->members[i].xid))
02404                 max_xid = xlrec->members[i].xid;
02405         }
02406 
02407         /*
02408          * We don't expect anyone else to modify nextXid, hence startup
02409          * process doesn't need to hold a lock while checking this. We still
02410          * acquire the lock to modify it, though.
02411          */
02412         if (TransactionIdFollowsOrEquals(max_xid,
02413                                          ShmemVariableCache->nextXid))
02414         {
02415             LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
02416             ShmemVariableCache->nextXid = max_xid;
02417             TransactionIdAdvance(ShmemVariableCache->nextXid);
02418             LWLockRelease(XidGenLock);
02419         }
02420     }
02421     else
02422         elog(PANIC, "multixact_redo: unknown op code %u", info);
02423 }
02424 
02425 Datum
02426 pg_get_multixact_members(PG_FUNCTION_ARGS)
02427 {
02428     typedef struct
02429     {
02430         MultiXactMember *members;
02431         int             nmembers;
02432         int             iter;
02433     } mxact;
02434     MultiXactId     mxid = PG_GETARG_UINT32(0);
02435     mxact          *multi;
02436     FuncCallContext *funccxt;
02437 
02438     if (mxid < FirstMultiXactId)
02439         ereport(ERROR,
02440                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
02441                  errmsg("invalid MultiXactId: %u", mxid)));
02442 
02443     if (SRF_IS_FIRSTCALL())
02444     {
02445         MemoryContext oldcxt;
02446         TupleDesc   tupdesc;
02447 
02448         funccxt = SRF_FIRSTCALL_INIT();
02449         oldcxt = MemoryContextSwitchTo(funccxt->multi_call_memory_ctx);
02450 
02451         multi = palloc(sizeof(mxact));
02452         /* no need to allow for old values here */
02453         multi->nmembers = GetMultiXactIdMembers(mxid, &multi->members, false);
02454         multi->iter = 0;
02455 
02456         tupdesc = CreateTemplateTupleDesc(2, false);
02457         TupleDescInitEntry(tupdesc, (AttrNumber) 1, "xid",
02458                            XIDOID, -1, 0);
02459         TupleDescInitEntry(tupdesc, (AttrNumber) 2, "mode",
02460                            TEXTOID, -1, 0);
02461 
02462         funccxt->attinmeta = TupleDescGetAttInMetadata(tupdesc);
02463         funccxt->user_fctx = multi;
02464 
02465         MemoryContextSwitchTo(oldcxt);
02466     }
02467 
02468     funccxt = SRF_PERCALL_SETUP();
02469     multi = (mxact *) funccxt->user_fctx;
02470 
02471     while (multi->iter < multi->nmembers)
02472     {
02473         HeapTuple   tuple;
02474         char       *values[2];
02475 
02476         values[0] = palloc(32);
02477         sprintf(values[0], "%u", multi->members[multi->iter].xid);
02478         values[1] = mxstatus_to_string(multi->members[multi->iter].status);
02479 
02480         tuple = BuildTupleFromCStrings(funccxt->attinmeta, values);
02481 
02482         multi->iter++;
02483         pfree(values[0]);
02484         SRF_RETURN_NEXT(funccxt, HeapTupleGetDatum(tuple));
02485     }
02486 
02487     if (multi->nmembers > 0)
02488         pfree(multi->members);
02489     pfree(multi);
02490 
02491     SRF_RETURN_DONE(funccxt);
02492 }