Header And Logo

PostgreSQL
| The world's most advanced open source database.

snapmgr.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  * snapmgr.c
00003  *      PostgreSQL snapshot manager
00004  *
00005  * We keep track of snapshots in two ways: those "registered" by resowner.c,
00006  * and the "active snapshot" stack.  All snapshots in either of them live in
00007  * persistent memory.  When a snapshot is no longer in any of these lists
00008  * (tracked by separate refcounts on each snapshot), its memory can be freed.
00009  *
00010  * The FirstXactSnapshot, if any, is treated a bit specially: we increment its
00011  * regd_count and count it in RegisteredSnapshots, but this reference is not
00012  * tracked by a resource owner. We used to use the TopTransactionResourceOwner
00013  * to track this snapshot reference, but that introduces logical circularity
00014  * and thus makes it impossible to clean up in a sane fashion.  It's better to
00015  * handle this reference as an internally-tracked registration, so that this
00016  * module is entirely lower-level than ResourceOwners.
00017  *
00018  * Likewise, any snapshots that have been exported by pg_export_snapshot
00019  * have regd_count = 1 and are counted in RegisteredSnapshots, but are not
00020  * tracked by any resource owner.
00021  *
00022  * These arrangements let us reset MyPgXact->xmin when there are no snapshots
00023  * referenced by this transaction.  (One possible improvement would be to be
00024  * able to advance Xmin when the snapshot with the earliest Xmin is no longer
00025  * referenced.  That's a bit harder though, it requires more locking, and
00026  * anyway it should be rather uncommon to keep temporary snapshots referenced
00027  * for too long.)
00028  *
00029  *
00030  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00031  * Portions Copyright (c) 1994, Regents of the University of California
00032  *
00033  * IDENTIFICATION
00034  *    src/backend/utils/time/snapmgr.c
00035  *
00036  *-------------------------------------------------------------------------
00037  */
00038 #include "postgres.h"
00039 
00040 #include <sys/stat.h>
00041 #include <unistd.h>
00042 
00043 #include "access/transam.h"
00044 #include "access/xact.h"
00045 #include "miscadmin.h"
00046 #include "storage/predicate.h"
00047 #include "storage/proc.h"
00048 #include "storage/procarray.h"
00049 #include "utils/builtins.h"
00050 #include "utils/memutils.h"
00051 #include "utils/resowner_private.h"
00052 #include "utils/snapmgr.h"
00053 #include "utils/tqual.h"
00054 
00055 
00056 /*
00057  * CurrentSnapshot points to the only snapshot taken in transaction-snapshot
00058  * mode, and to the latest one taken in a read-committed transaction.
00059  * SecondarySnapshot is a snapshot that's always up-to-date as of the current
00060  * instant, even in transaction-snapshot mode.  It should only be used for
00061  * special-purpose code (say, RI checking.)
00062  *
00063  * These SnapshotData structs are static to simplify memory allocation
00064  * (see the hack in GetSnapshotData to avoid repeated malloc/free).
00065  */
00066 static SnapshotData CurrentSnapshotData = {HeapTupleSatisfiesMVCC};
00067 static SnapshotData SecondarySnapshotData = {HeapTupleSatisfiesMVCC};
00068 
00069 /* Pointers to valid snapshots */
00070 static Snapshot CurrentSnapshot = NULL;
00071 static Snapshot SecondarySnapshot = NULL;
00072 
00073 /*
00074  * These are updated by GetSnapshotData.  We initialize them this way
00075  * for the convenience of TransactionIdIsInProgress: even in bootstrap
00076  * mode, we don't want it to say that BootstrapTransactionId is in progress.
00077  *
00078  * RecentGlobalXmin is initialized to InvalidTransactionId, to ensure that no
00079  * one tries to use a stale value.  Readers should ensure that it has been set
00080  * to something else before using it.
00081  */
00082 TransactionId TransactionXmin = FirstNormalTransactionId;
00083 TransactionId RecentXmin = FirstNormalTransactionId;
00084 TransactionId RecentGlobalXmin = InvalidTransactionId;
00085 
00086 /*
00087  * Elements of the active snapshot stack.
00088  *
00089  * Each element here accounts for exactly one active_count on SnapshotData.
00090  *
00091  * NB: the code assumes that elements in this list are in non-increasing
00092  * order of as_level; also, the list must be NULL-terminated.
00093  */
00094 typedef struct ActiveSnapshotElt
00095 {
00096     Snapshot    as_snap;
00097     int         as_level;
00098     struct ActiveSnapshotElt *as_next;
00099 } ActiveSnapshotElt;
00100 
00101 /* Top of the stack of active snapshots */
00102 static ActiveSnapshotElt *ActiveSnapshot = NULL;
00103 
00104 /*
00105  * How many snapshots is resowner.c tracking for us?
00106  *
00107  * Note: for now, a simple counter is enough.  However, if we ever want to be
00108  * smarter about advancing our MyPgXact->xmin we will need to be more
00109  * sophisticated about this, perhaps keeping our own list of snapshots.
00110  */
00111 static int  RegisteredSnapshots = 0;
00112 
00113 /* first GetTransactionSnapshot call in a transaction? */
00114 bool        FirstSnapshotSet = false;
00115 
00116 /*
00117  * Remember the serializable transaction snapshot, if any.  We cannot trust
00118  * FirstSnapshotSet in combination with IsolationUsesXactSnapshot(), because
00119  * GUC may be reset before us, changing the value of IsolationUsesXactSnapshot.
00120  */
00121 static Snapshot FirstXactSnapshot = NULL;
00122 
00123 /* Define pathname of exported-snapshot files */
00124 #define SNAPSHOT_EXPORT_DIR "pg_snapshots"
00125 #define XactExportFilePath(path, xid, num, suffix) \
00126     snprintf(path, sizeof(path), SNAPSHOT_EXPORT_DIR "/%08X-%d%s", \
00127              xid, num, suffix)
00128 
00129 /* Current xact's exported snapshots (a list of Snapshot structs) */
00130 static List *exportedSnapshots = NIL;
00131 
00132 
00133 static Snapshot CopySnapshot(Snapshot snapshot);
00134 static void FreeSnapshot(Snapshot snapshot);
00135 static void SnapshotResetXmin(void);
00136 
00137 
00138 /*
00139  * GetTransactionSnapshot
00140  *      Get the appropriate snapshot for a new query in a transaction.
00141  *
00142  * Note that the return value may point at static storage that will be modified
00143  * by future calls and by CommandCounterIncrement().  Callers should call
00144  * RegisterSnapshot or PushActiveSnapshot on the returned snap if it is to be
00145  * used very long.
00146  */
00147 Snapshot
00148 GetTransactionSnapshot(void)
00149 {
00150     /* First call in transaction? */
00151     if (!FirstSnapshotSet)
00152     {
00153         Assert(RegisteredSnapshots == 0);
00154         Assert(FirstXactSnapshot == NULL);
00155 
00156         /*
00157          * In transaction-snapshot mode, the first snapshot must live until
00158          * end of xact regardless of what the caller does with it, so we must
00159          * make a copy of it rather than returning CurrentSnapshotData
00160          * directly.  Furthermore, if we're running in serializable mode,
00161          * predicate.c needs to wrap the snapshot fetch in its own processing.
00162          */
00163         if (IsolationUsesXactSnapshot())
00164         {
00165             /* First, create the snapshot in CurrentSnapshotData */
00166             if (IsolationIsSerializable())
00167                 CurrentSnapshot = GetSerializableTransactionSnapshot(&CurrentSnapshotData);
00168             else
00169                 CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
00170             /* Make a saved copy */
00171             CurrentSnapshot = CopySnapshot(CurrentSnapshot);
00172             FirstXactSnapshot = CurrentSnapshot;
00173             /* Mark it as "registered" in FirstXactSnapshot */
00174             FirstXactSnapshot->regd_count++;
00175             RegisteredSnapshots++;
00176         }
00177         else
00178             CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
00179 
00180         FirstSnapshotSet = true;
00181         return CurrentSnapshot;
00182     }
00183 
00184     if (IsolationUsesXactSnapshot())
00185         return CurrentSnapshot;
00186 
00187     CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
00188 
00189     return CurrentSnapshot;
00190 }
00191 
00192 /*
00193  * GetLatestSnapshot
00194  *      Get a snapshot that is up-to-date as of the current instant,
00195  *      even if we are executing in transaction-snapshot mode.
00196  */
00197 Snapshot
00198 GetLatestSnapshot(void)
00199 {
00200     /* If first call in transaction, go ahead and set the xact snapshot */
00201     if (!FirstSnapshotSet)
00202         return GetTransactionSnapshot();
00203 
00204     SecondarySnapshot = GetSnapshotData(&SecondarySnapshotData);
00205 
00206     return SecondarySnapshot;
00207 }
00208 
00209 /*
00210  * SnapshotSetCommandId
00211  *      Propagate CommandCounterIncrement into the static snapshots, if set
00212  */
00213 void
00214 SnapshotSetCommandId(CommandId curcid)
00215 {
00216     if (!FirstSnapshotSet)
00217         return;
00218 
00219     if (CurrentSnapshot)
00220         CurrentSnapshot->curcid = curcid;
00221     if (SecondarySnapshot)
00222         SecondarySnapshot->curcid = curcid;
00223 }
00224 
00225 /*
00226  * SetTransactionSnapshot
00227  *      Set the transaction's snapshot from an imported MVCC snapshot.
00228  *
00229  * Note that this is very closely tied to GetTransactionSnapshot --- it
00230  * must take care of all the same considerations as the first-snapshot case
00231  * in GetTransactionSnapshot.
00232  */
00233 static void
00234 SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid)
00235 {
00236     /* Caller should have checked this already */
00237     Assert(!FirstSnapshotSet);
00238 
00239     Assert(RegisteredSnapshots == 0);
00240     Assert(FirstXactSnapshot == NULL);
00241 
00242     /*
00243      * Even though we are not going to use the snapshot it computes, we must
00244      * call GetSnapshotData, for two reasons: (1) to be sure that
00245      * CurrentSnapshotData's XID arrays have been allocated, and (2) to update
00246      * RecentXmin and RecentGlobalXmin.  (We could alternatively include those
00247      * two variables in exported snapshot files, but it seems better to have
00248      * snapshot importers compute reasonably up-to-date values for them.)
00249      */
00250     CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData);
00251 
00252     /*
00253      * Now copy appropriate fields from the source snapshot.
00254      */
00255     CurrentSnapshot->xmin = sourcesnap->xmin;
00256     CurrentSnapshot->xmax = sourcesnap->xmax;
00257     CurrentSnapshot->xcnt = sourcesnap->xcnt;
00258     Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount());
00259     memcpy(CurrentSnapshot->xip, sourcesnap->xip,
00260            sourcesnap->xcnt * sizeof(TransactionId));
00261     CurrentSnapshot->subxcnt = sourcesnap->subxcnt;
00262     Assert(sourcesnap->subxcnt <= GetMaxSnapshotSubxidCount());
00263     memcpy(CurrentSnapshot->subxip, sourcesnap->subxip,
00264            sourcesnap->subxcnt * sizeof(TransactionId));
00265     CurrentSnapshot->suboverflowed = sourcesnap->suboverflowed;
00266     CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery;
00267     /* NB: curcid should NOT be copied, it's a local matter */
00268 
00269     /*
00270      * Now we have to fix what GetSnapshotData did with MyPgXact->xmin and
00271      * TransactionXmin.  There is a race condition: to make sure we are not
00272      * causing the global xmin to go backwards, we have to test that the
00273      * source transaction is still running, and that has to be done
00274      * atomically. So let procarray.c do it.
00275      *
00276      * Note: in serializable mode, predicate.c will do this a second time. It
00277      * doesn't seem worth contorting the logic here to avoid two calls,
00278      * especially since it's not clear that predicate.c *must* do this.
00279      */
00280     if (!ProcArrayInstallImportedXmin(CurrentSnapshot->xmin, sourcexid))
00281         ereport(ERROR,
00282                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
00283                  errmsg("could not import the requested snapshot"),
00284                errdetail("The source transaction %u is not running anymore.",
00285                          sourcexid)));
00286 
00287     /*
00288      * In transaction-snapshot mode, the first snapshot must live until end of
00289      * xact, so we must make a copy of it.  Furthermore, if we're running in
00290      * serializable mode, predicate.c needs to do its own processing.
00291      */
00292     if (IsolationUsesXactSnapshot())
00293     {
00294         if (IsolationIsSerializable())
00295             SetSerializableTransactionSnapshot(CurrentSnapshot, sourcexid);
00296         /* Make a saved copy */
00297         CurrentSnapshot = CopySnapshot(CurrentSnapshot);
00298         FirstXactSnapshot = CurrentSnapshot;
00299         /* Mark it as "registered" in FirstXactSnapshot */
00300         FirstXactSnapshot->regd_count++;
00301         RegisteredSnapshots++;
00302     }
00303 
00304     FirstSnapshotSet = true;
00305 }
00306 
00307 /*
00308  * CopySnapshot
00309  *      Copy the given snapshot.
00310  *
00311  * The copy is palloc'd in TopTransactionContext and has initial refcounts set
00312  * to 0.  The returned snapshot has the copied flag set.
00313  */
00314 static Snapshot
00315 CopySnapshot(Snapshot snapshot)
00316 {
00317     Snapshot    newsnap;
00318     Size        subxipoff;
00319     Size        size;
00320 
00321     Assert(snapshot != InvalidSnapshot);
00322 
00323     /* We allocate any XID arrays needed in the same palloc block. */
00324     size = subxipoff = sizeof(SnapshotData) +
00325         snapshot->xcnt * sizeof(TransactionId);
00326     if (snapshot->subxcnt > 0)
00327         size += snapshot->subxcnt * sizeof(TransactionId);
00328 
00329     newsnap = (Snapshot) MemoryContextAlloc(TopTransactionContext, size);
00330     memcpy(newsnap, snapshot, sizeof(SnapshotData));
00331 
00332     newsnap->regd_count = 0;
00333     newsnap->active_count = 0;
00334     newsnap->copied = true;
00335 
00336     /* setup XID array */
00337     if (snapshot->xcnt > 0)
00338     {
00339         newsnap->xip = (TransactionId *) (newsnap + 1);
00340         memcpy(newsnap->xip, snapshot->xip,
00341                snapshot->xcnt * sizeof(TransactionId));
00342     }
00343     else
00344         newsnap->xip = NULL;
00345 
00346     /*
00347      * Setup subXID array. Don't bother to copy it if it had overflowed,
00348      * though, because it's not used anywhere in that case. Except if it's a
00349      * snapshot taken during recovery; all the top-level XIDs are in subxip as
00350      * well in that case, so we mustn't lose them.
00351      */
00352     if (snapshot->subxcnt > 0 &&
00353         (!snapshot->suboverflowed || snapshot->takenDuringRecovery))
00354     {
00355         newsnap->subxip = (TransactionId *) ((char *) newsnap + subxipoff);
00356         memcpy(newsnap->subxip, snapshot->subxip,
00357                snapshot->subxcnt * sizeof(TransactionId));
00358     }
00359     else
00360         newsnap->subxip = NULL;
00361 
00362     return newsnap;
00363 }
00364 
00365 /*
00366  * FreeSnapshot
00367  *      Free the memory associated with a snapshot.
00368  */
00369 static void
00370 FreeSnapshot(Snapshot snapshot)
00371 {
00372     Assert(snapshot->regd_count == 0);
00373     Assert(snapshot->active_count == 0);
00374     Assert(snapshot->copied);
00375 
00376     pfree(snapshot);
00377 }
00378 
00379 /*
00380  * PushActiveSnapshot
00381  *      Set the given snapshot as the current active snapshot
00382  *
00383  * If the passed snapshot is a statically-allocated one, or it is possibly
00384  * subject to a future command counter update, create a new long-lived copy
00385  * with active refcount=1.  Otherwise, only increment the refcount.
00386  */
00387 void
00388 PushActiveSnapshot(Snapshot snap)
00389 {
00390     ActiveSnapshotElt *newactive;
00391 
00392     Assert(snap != InvalidSnapshot);
00393 
00394     newactive = MemoryContextAlloc(TopTransactionContext, sizeof(ActiveSnapshotElt));
00395 
00396     /*
00397      * Checking SecondarySnapshot is probably useless here, but it seems
00398      * better to be sure.
00399      */
00400     if (snap == CurrentSnapshot || snap == SecondarySnapshot || !snap->copied)
00401         newactive->as_snap = CopySnapshot(snap);
00402     else
00403         newactive->as_snap = snap;
00404 
00405     newactive->as_next = ActiveSnapshot;
00406     newactive->as_level = GetCurrentTransactionNestLevel();
00407 
00408     newactive->as_snap->active_count++;
00409 
00410     ActiveSnapshot = newactive;
00411 }
00412 
00413 /*
00414  * PushCopiedSnapshot
00415  *      As above, except forcibly copy the presented snapshot.
00416  *
00417  * This should be used when the ActiveSnapshot has to be modifiable, for
00418  * example if the caller intends to call UpdateActiveSnapshotCommandId.
00419  * The new snapshot will be released when popped from the stack.
00420  */
00421 void
00422 PushCopiedSnapshot(Snapshot snapshot)
00423 {
00424     PushActiveSnapshot(CopySnapshot(snapshot));
00425 }
00426 
00427 /*
00428  * UpdateActiveSnapshotCommandId
00429  *
00430  * Update the current CID of the active snapshot.  This can only be applied
00431  * to a snapshot that is not referenced elsewhere.
00432  */
00433 void
00434 UpdateActiveSnapshotCommandId(void)
00435 {
00436     Assert(ActiveSnapshot != NULL);
00437     Assert(ActiveSnapshot->as_snap->active_count == 1);
00438     Assert(ActiveSnapshot->as_snap->regd_count == 0);
00439 
00440     ActiveSnapshot->as_snap->curcid = GetCurrentCommandId(false);
00441 }
00442 
00443 /*
00444  * PopActiveSnapshot
00445  *
00446  * Remove the topmost snapshot from the active snapshot stack, decrementing the
00447  * reference count, and free it if this was the last reference.
00448  */
00449 void
00450 PopActiveSnapshot(void)
00451 {
00452     ActiveSnapshotElt *newstack;
00453 
00454     newstack = ActiveSnapshot->as_next;
00455 
00456     Assert(ActiveSnapshot->as_snap->active_count > 0);
00457 
00458     ActiveSnapshot->as_snap->active_count--;
00459 
00460     if (ActiveSnapshot->as_snap->active_count == 0 &&
00461         ActiveSnapshot->as_snap->regd_count == 0)
00462         FreeSnapshot(ActiveSnapshot->as_snap);
00463 
00464     pfree(ActiveSnapshot);
00465     ActiveSnapshot = newstack;
00466 
00467     SnapshotResetXmin();
00468 }
00469 
00470 /*
00471  * GetActiveSnapshot
00472  *      Return the topmost snapshot in the Active stack.
00473  */
00474 Snapshot
00475 GetActiveSnapshot(void)
00476 {
00477     Assert(ActiveSnapshot != NULL);
00478 
00479     return ActiveSnapshot->as_snap;
00480 }
00481 
00482 /*
00483  * ActiveSnapshotSet
00484  *      Return whether there is at least one snapshot in the Active stack
00485  */
00486 bool
00487 ActiveSnapshotSet(void)
00488 {
00489     return ActiveSnapshot != NULL;
00490 }
00491 
00492 /*
00493  * RegisterSnapshot
00494  *      Register a snapshot as being in use by the current resource owner
00495  *
00496  * If InvalidSnapshot is passed, it is not registered.
00497  */
00498 Snapshot
00499 RegisterSnapshot(Snapshot snapshot)
00500 {
00501     if (snapshot == InvalidSnapshot)
00502         return InvalidSnapshot;
00503 
00504     return RegisterSnapshotOnOwner(snapshot, CurrentResourceOwner);
00505 }
00506 
00507 /*
00508  * RegisterSnapshotOnOwner
00509  *      As above, but use the specified resource owner
00510  */
00511 Snapshot
00512 RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner)
00513 {
00514     Snapshot    snap;
00515 
00516     if (snapshot == InvalidSnapshot)
00517         return InvalidSnapshot;
00518 
00519     /* Static snapshot?  Create a persistent copy */
00520     snap = snapshot->copied ? snapshot : CopySnapshot(snapshot);
00521 
00522     /* and tell resowner.c about it */
00523     ResourceOwnerEnlargeSnapshots(owner);
00524     snap->regd_count++;
00525     ResourceOwnerRememberSnapshot(owner, snap);
00526 
00527     RegisteredSnapshots++;
00528 
00529     return snap;
00530 }
00531 
00532 /*
00533  * UnregisterSnapshot
00534  *
00535  * Decrement the reference count of a snapshot, remove the corresponding
00536  * reference from CurrentResourceOwner, and free the snapshot if no more
00537  * references remain.
00538  */
00539 void
00540 UnregisterSnapshot(Snapshot snapshot)
00541 {
00542     if (snapshot == NULL)
00543         return;
00544 
00545     UnregisterSnapshotFromOwner(snapshot, CurrentResourceOwner);
00546 }
00547 
00548 /*
00549  * UnregisterSnapshotFromOwner
00550  *      As above, but use the specified resource owner
00551  */
00552 void
00553 UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner)
00554 {
00555     if (snapshot == NULL)
00556         return;
00557 
00558     Assert(snapshot->regd_count > 0);
00559     Assert(RegisteredSnapshots > 0);
00560 
00561     ResourceOwnerForgetSnapshot(owner, snapshot);
00562     RegisteredSnapshots--;
00563     if (--snapshot->regd_count == 0 && snapshot->active_count == 0)
00564     {
00565         FreeSnapshot(snapshot);
00566         SnapshotResetXmin();
00567     }
00568 }
00569 
00570 /*
00571  * SnapshotResetXmin
00572  *
00573  * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid.
00574  * Note we can do this without locking because we assume that storing an Xid
00575  * is atomic.
00576  */
00577 static void
00578 SnapshotResetXmin(void)
00579 {
00580     if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
00581         MyPgXact->xmin = InvalidTransactionId;
00582 }
00583 
00584 /*
00585  * AtSubCommit_Snapshot
00586  */
00587 void
00588 AtSubCommit_Snapshot(int level)
00589 {
00590     ActiveSnapshotElt *active;
00591 
00592     /*
00593      * Relabel the active snapshots set in this subtransaction as though they
00594      * are owned by the parent subxact.
00595      */
00596     for (active = ActiveSnapshot; active != NULL; active = active->as_next)
00597     {
00598         if (active->as_level < level)
00599             break;
00600         active->as_level = level - 1;
00601     }
00602 }
00603 
00604 /*
00605  * AtSubAbort_Snapshot
00606  *      Clean up snapshots after a subtransaction abort
00607  */
00608 void
00609 AtSubAbort_Snapshot(int level)
00610 {
00611     /* Forget the active snapshots set by this subtransaction */
00612     while (ActiveSnapshot && ActiveSnapshot->as_level >= level)
00613     {
00614         ActiveSnapshotElt *next;
00615 
00616         next = ActiveSnapshot->as_next;
00617 
00618         /*
00619          * Decrement the snapshot's active count.  If it's still registered or
00620          * marked as active by an outer subtransaction, we can't free it yet.
00621          */
00622         Assert(ActiveSnapshot->as_snap->active_count >= 1);
00623         ActiveSnapshot->as_snap->active_count -= 1;
00624 
00625         if (ActiveSnapshot->as_snap->active_count == 0 &&
00626             ActiveSnapshot->as_snap->regd_count == 0)
00627             FreeSnapshot(ActiveSnapshot->as_snap);
00628 
00629         /* and free the stack element */
00630         pfree(ActiveSnapshot);
00631 
00632         ActiveSnapshot = next;
00633     }
00634 
00635     SnapshotResetXmin();
00636 }
00637 
00638 /*
00639  * AtEOXact_Snapshot
00640  *      Snapshot manager's cleanup function for end of transaction
00641  */
00642 void
00643 AtEOXact_Snapshot(bool isCommit)
00644 {
00645     /*
00646      * In transaction-snapshot mode we must release our privately-managed
00647      * reference to the transaction snapshot.  We must decrement
00648      * RegisteredSnapshots to keep the check below happy.  But we don't bother
00649      * to do FreeSnapshot, for two reasons: the memory will go away with
00650      * TopTransactionContext anyway, and if someone has left the snapshot
00651      * stacked as active, we don't want the code below to be chasing through a
00652      * dangling pointer.
00653      */
00654     if (FirstXactSnapshot != NULL)
00655     {
00656         Assert(FirstXactSnapshot->regd_count > 0);
00657         Assert(RegisteredSnapshots > 0);
00658         RegisteredSnapshots--;
00659     }
00660     FirstXactSnapshot = NULL;
00661 
00662     /*
00663      * If we exported any snapshots, clean them up.
00664      */
00665     if (exportedSnapshots != NIL)
00666     {
00667         TransactionId myxid = GetTopTransactionId();
00668         int         i;
00669         char        buf[MAXPGPATH];
00670 
00671         /*
00672          * Get rid of the files.  Unlink failure is only a WARNING because (1)
00673          * it's too late to abort the transaction, and (2) leaving a leaked
00674          * file around has little real consequence anyway.
00675          */
00676         for (i = 1; i <= list_length(exportedSnapshots); i++)
00677         {
00678             XactExportFilePath(buf, myxid, i, "");
00679             if (unlink(buf))
00680                 elog(WARNING, "could not unlink file \"%s\": %m", buf);
00681         }
00682 
00683         /*
00684          * As with the FirstXactSnapshot, we needn't spend any effort on
00685          * cleaning up the per-snapshot data structures, but we do need to
00686          * adjust the RegisteredSnapshots count to prevent a warning below.
00687          *
00688          * Note: you might be thinking "why do we have the exportedSnapshots
00689          * list at all?  All we need is a counter!".  You're right, but we do
00690          * it this way in case we ever feel like improving xmin management.
00691          */
00692         Assert(RegisteredSnapshots >= list_length(exportedSnapshots));
00693         RegisteredSnapshots -= list_length(exportedSnapshots);
00694 
00695         exportedSnapshots = NIL;
00696     }
00697 
00698     /* On commit, complain about leftover snapshots */
00699     if (isCommit)
00700     {
00701         ActiveSnapshotElt *active;
00702 
00703         if (RegisteredSnapshots != 0)
00704             elog(WARNING, "%d registered snapshots seem to remain after cleanup",
00705                  RegisteredSnapshots);
00706 
00707         /* complain about unpopped active snapshots */
00708         for (active = ActiveSnapshot; active != NULL; active = active->as_next)
00709             elog(WARNING, "snapshot %p still active", active);
00710     }
00711 
00712     /*
00713      * And reset our state.  We don't need to free the memory explicitly --
00714      * it'll go away with TopTransactionContext.
00715      */
00716     ActiveSnapshot = NULL;
00717     RegisteredSnapshots = 0;
00718 
00719     CurrentSnapshot = NULL;
00720     SecondarySnapshot = NULL;
00721 
00722     FirstSnapshotSet = false;
00723 
00724     SnapshotResetXmin();
00725 }
00726 
00727 
00728 /*
00729  * ExportSnapshot
00730  *      Export the snapshot to a file so that other backends can import it.
00731  *      Returns the token (the file name) that can be used to import this
00732  *      snapshot.
00733  */
00734 static char *
00735 ExportSnapshot(Snapshot snapshot)
00736 {
00737     TransactionId topXid;
00738     TransactionId *children;
00739     int         nchildren;
00740     int         addTopXid;
00741     StringInfoData buf;
00742     FILE       *f;
00743     int         i;
00744     MemoryContext oldcxt;
00745     char        path[MAXPGPATH];
00746     char        pathtmp[MAXPGPATH];
00747 
00748     /*
00749      * It's tempting to call RequireTransactionChain here, since it's not very
00750      * useful to export a snapshot that will disappear immediately afterwards.
00751      * However, we haven't got enough information to do that, since we don't
00752      * know if we're at top level or not.  For example, we could be inside a
00753      * plpgsql function that is going to fire off other transactions via
00754      * dblink.  Rather than disallow perfectly legitimate usages, don't make a
00755      * check.
00756      *
00757      * Also note that we don't make any restriction on the transaction's
00758      * isolation level; however, importers must check the level if they are
00759      * serializable.
00760      */
00761 
00762     /*
00763      * This will assign a transaction ID if we do not yet have one.
00764      */
00765     topXid = GetTopTransactionId();
00766 
00767     /*
00768      * We cannot export a snapshot from a subtransaction because there's no
00769      * easy way for importers to verify that the same subtransaction is still
00770      * running.
00771      */
00772     if (IsSubTransaction())
00773         ereport(ERROR,
00774                 (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
00775                  errmsg("cannot export a snapshot from a subtransaction")));
00776 
00777     /*
00778      * We do however allow previous committed subtransactions to exist.
00779      * Importers of the snapshot must see them as still running, so get their
00780      * XIDs to add them to the snapshot.
00781      */
00782     nchildren = xactGetCommittedChildren(&children);
00783 
00784     /*
00785      * Copy the snapshot into TopTransactionContext, add it to the
00786      * exportedSnapshots list, and mark it pseudo-registered.  We do this to
00787      * ensure that the snapshot's xmin is honored for the rest of the
00788      * transaction.  (Right now, because SnapshotResetXmin is so stupid, this
00789      * is overkill; but later we might make that routine smarter.)
00790      */
00791     snapshot = CopySnapshot(snapshot);
00792 
00793     oldcxt = MemoryContextSwitchTo(TopTransactionContext);
00794     exportedSnapshots = lappend(exportedSnapshots, snapshot);
00795     MemoryContextSwitchTo(oldcxt);
00796 
00797     snapshot->regd_count++;
00798     RegisteredSnapshots++;
00799 
00800     /*
00801      * Fill buf with a text serialization of the snapshot, plus identification
00802      * data about this transaction.  The format expected by ImportSnapshot is
00803      * pretty rigid: each line must be fieldname:value.
00804      */
00805     initStringInfo(&buf);
00806 
00807     appendStringInfo(&buf, "xid:%u\n", topXid);
00808     appendStringInfo(&buf, "dbid:%u\n", MyDatabaseId);
00809     appendStringInfo(&buf, "iso:%d\n", XactIsoLevel);
00810     appendStringInfo(&buf, "ro:%d\n", XactReadOnly);
00811 
00812     appendStringInfo(&buf, "xmin:%u\n", snapshot->xmin);
00813     appendStringInfo(&buf, "xmax:%u\n", snapshot->xmax);
00814 
00815     /*
00816      * We must include our own top transaction ID in the top-xid data, since
00817      * by definition we will still be running when the importing transaction
00818      * adopts the snapshot, but GetSnapshotData never includes our own XID in
00819      * the snapshot.  (There must, therefore, be enough room to add it.)
00820      *
00821      * However, it could be that our topXid is after the xmax, in which case
00822      * we shouldn't include it because xip[] members are expected to be before
00823      * xmax.  (We need not make the same check for subxip[] members, see
00824      * snapshot.h.)
00825      */
00826     addTopXid = TransactionIdPrecedes(topXid, snapshot->xmax) ? 1 : 0;
00827     appendStringInfo(&buf, "xcnt:%d\n", snapshot->xcnt + addTopXid);
00828     for (i = 0; i < snapshot->xcnt; i++)
00829         appendStringInfo(&buf, "xip:%u\n", snapshot->xip[i]);
00830     if (addTopXid)
00831         appendStringInfo(&buf, "xip:%u\n", topXid);
00832 
00833     /*
00834      * Similarly, we add our subcommitted child XIDs to the subxid data. Here,
00835      * we have to cope with possible overflow.
00836      */
00837     if (snapshot->suboverflowed ||
00838         snapshot->subxcnt + nchildren > GetMaxSnapshotSubxidCount())
00839         appendStringInfoString(&buf, "sof:1\n");
00840     else
00841     {
00842         appendStringInfoString(&buf, "sof:0\n");
00843         appendStringInfo(&buf, "sxcnt:%d\n", snapshot->subxcnt + nchildren);
00844         for (i = 0; i < snapshot->subxcnt; i++)
00845             appendStringInfo(&buf, "sxp:%u\n", snapshot->subxip[i]);
00846         for (i = 0; i < nchildren; i++)
00847             appendStringInfo(&buf, "sxp:%u\n", children[i]);
00848     }
00849     appendStringInfo(&buf, "rec:%u\n", snapshot->takenDuringRecovery);
00850 
00851     /*
00852      * Now write the text representation into a file.  We first write to a
00853      * ".tmp" filename, and rename to final filename if no error.  This
00854      * ensures that no other backend can read an incomplete file
00855      * (ImportSnapshot won't allow it because of its valid-characters check).
00856      */
00857     XactExportFilePath(pathtmp, topXid, list_length(exportedSnapshots), ".tmp");
00858     if (!(f = AllocateFile(pathtmp, PG_BINARY_W)))
00859         ereport(ERROR,
00860                 (errcode_for_file_access(),
00861                  errmsg("could not create file \"%s\": %m", pathtmp)));
00862 
00863     if (fwrite(buf.data, buf.len, 1, f) != 1)
00864         ereport(ERROR,
00865                 (errcode_for_file_access(),
00866                  errmsg("could not write to file \"%s\": %m", pathtmp)));
00867 
00868     /* no fsync() since file need not survive a system crash */
00869 
00870     if (FreeFile(f))
00871         ereport(ERROR,
00872                 (errcode_for_file_access(),
00873                  errmsg("could not write to file \"%s\": %m", pathtmp)));
00874 
00875     /*
00876      * Now that we have written everything into a .tmp file, rename the file
00877      * to remove the .tmp suffix.
00878      */
00879     XactExportFilePath(path, topXid, list_length(exportedSnapshots), "");
00880 
00881     if (rename(pathtmp, path) < 0)
00882         ereport(ERROR,
00883                 (errcode_for_file_access(),
00884                  errmsg("could not rename file \"%s\" to \"%s\": %m",
00885                         pathtmp, path)));
00886 
00887     /*
00888      * The basename of the file is what we return from pg_export_snapshot().
00889      * It's already in path in a textual format and we know that the path
00890      * starts with SNAPSHOT_EXPORT_DIR.  Skip over the prefix and the slash
00891      * and pstrdup it so as not to return the address of a local variable.
00892      */
00893     return pstrdup(path + strlen(SNAPSHOT_EXPORT_DIR) + 1);
00894 }
00895 
00896 /*
00897  * pg_export_snapshot
00898  *      SQL-callable wrapper for ExportSnapshot.
00899  */
00900 Datum
00901 pg_export_snapshot(PG_FUNCTION_ARGS)
00902 {
00903     char       *snapshotName;
00904 
00905     snapshotName = ExportSnapshot(GetActiveSnapshot());
00906     PG_RETURN_TEXT_P(cstring_to_text(snapshotName));
00907 }
00908 
00909 
00910 /*
00911  * Parsing subroutines for ImportSnapshot: parse a line with the given
00912  * prefix followed by a value, and advance *s to the next line.  The
00913  * filename is provided for use in error messages.
00914  */
00915 static int
00916 parseIntFromText(const char *prefix, char **s, const char *filename)
00917 {
00918     char       *ptr = *s;
00919     int         prefixlen = strlen(prefix);
00920     int         val;
00921 
00922     if (strncmp(ptr, prefix, prefixlen) != 0)
00923         ereport(ERROR,
00924                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
00925                  errmsg("invalid snapshot data in file \"%s\"", filename)));
00926     ptr += prefixlen;
00927     if (sscanf(ptr, "%d", &val) != 1)
00928         ereport(ERROR,
00929                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
00930                  errmsg("invalid snapshot data in file \"%s\"", filename)));
00931     ptr = strchr(ptr, '\n');
00932     if (!ptr)
00933         ereport(ERROR,
00934                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
00935                  errmsg("invalid snapshot data in file \"%s\"", filename)));
00936     *s = ptr + 1;
00937     return val;
00938 }
00939 
00940 static TransactionId
00941 parseXidFromText(const char *prefix, char **s, const char *filename)
00942 {
00943     char       *ptr = *s;
00944     int         prefixlen = strlen(prefix);
00945     TransactionId val;
00946 
00947     if (strncmp(ptr, prefix, prefixlen) != 0)
00948         ereport(ERROR,
00949                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
00950                  errmsg("invalid snapshot data in file \"%s\"", filename)));
00951     ptr += prefixlen;
00952     if (sscanf(ptr, "%u", &val) != 1)
00953         ereport(ERROR,
00954                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
00955                  errmsg("invalid snapshot data in file \"%s\"", filename)));
00956     ptr = strchr(ptr, '\n');
00957     if (!ptr)
00958         ereport(ERROR,
00959                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
00960                  errmsg("invalid snapshot data in file \"%s\"", filename)));
00961     *s = ptr + 1;
00962     return val;
00963 }
00964 
00965 /*
00966  * ImportSnapshot
00967  *      Import a previously exported snapshot.  The argument should be a
00968  *      filename in SNAPSHOT_EXPORT_DIR.  Load the snapshot from that file.
00969  *      This is called by "SET TRANSACTION SNAPSHOT 'foo'".
00970  */
00971 void
00972 ImportSnapshot(const char *idstr)
00973 {
00974     char        path[MAXPGPATH];
00975     FILE       *f;
00976     struct stat stat_buf;
00977     char       *filebuf;
00978     int         xcnt;
00979     int         i;
00980     TransactionId src_xid;
00981     Oid         src_dbid;
00982     int         src_isolevel;
00983     bool        src_readonly;
00984     SnapshotData snapshot;
00985 
00986     /*
00987      * Must be at top level of a fresh transaction.  Note in particular that
00988      * we check we haven't acquired an XID --- if we have, it's conceivable
00989      * that the snapshot would show it as not running, making for very screwy
00990      * behavior.
00991      */
00992     if (FirstSnapshotSet ||
00993         GetTopTransactionIdIfAny() != InvalidTransactionId ||
00994         IsSubTransaction())
00995         ereport(ERROR,
00996                 (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
00997         errmsg("SET TRANSACTION SNAPSHOT must be called before any query")));
00998 
00999     /*
01000      * If we are in read committed mode then the next query would execute with
01001      * a new snapshot thus making this function call quite useless.
01002      */
01003     if (!IsolationUsesXactSnapshot())
01004         ereport(ERROR,
01005                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
01006                  errmsg("a snapshot-importing transaction must have isolation level SERIALIZABLE or REPEATABLE READ")));
01007 
01008     /*
01009      * Verify the identifier: only 0-9, A-F and hyphens are allowed.  We do
01010      * this mainly to prevent reading arbitrary files.
01011      */
01012     if (strspn(idstr, "0123456789ABCDEF-") != strlen(idstr))
01013         ereport(ERROR,
01014                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
01015                  errmsg("invalid snapshot identifier: \"%s\"", idstr)));
01016 
01017     /* OK, read the file */
01018     snprintf(path, MAXPGPATH, SNAPSHOT_EXPORT_DIR "/%s", idstr);
01019 
01020     f = AllocateFile(path, PG_BINARY_R);
01021     if (!f)
01022         ereport(ERROR,
01023                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
01024                  errmsg("invalid snapshot identifier: \"%s\"", idstr)));
01025 
01026     /* get the size of the file so that we know how much memory we need */
01027     if (fstat(fileno(f), &stat_buf))
01028         elog(ERROR, "could not stat file \"%s\": %m", path);
01029 
01030     /* and read the file into a palloc'd string */
01031     filebuf = (char *) palloc(stat_buf.st_size + 1);
01032     if (fread(filebuf, stat_buf.st_size, 1, f) != 1)
01033         elog(ERROR, "could not read file \"%s\": %m", path);
01034 
01035     filebuf[stat_buf.st_size] = '\0';
01036 
01037     FreeFile(f);
01038 
01039     /*
01040      * Construct a snapshot struct by parsing the file content.
01041      */
01042     memset(&snapshot, 0, sizeof(snapshot));
01043 
01044     src_xid = parseXidFromText("xid:", &filebuf, path);
01045     /* we abuse parseXidFromText a bit here ... */
01046     src_dbid = parseXidFromText("dbid:", &filebuf, path);
01047     src_isolevel = parseIntFromText("iso:", &filebuf, path);
01048     src_readonly = parseIntFromText("ro:", &filebuf, path);
01049 
01050     snapshot.xmin = parseXidFromText("xmin:", &filebuf, path);
01051     snapshot.xmax = parseXidFromText("xmax:", &filebuf, path);
01052 
01053     snapshot.xcnt = xcnt = parseIntFromText("xcnt:", &filebuf, path);
01054 
01055     /* sanity-check the xid count before palloc */
01056     if (xcnt < 0 || xcnt > GetMaxSnapshotXidCount())
01057         ereport(ERROR,
01058                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
01059                  errmsg("invalid snapshot data in file \"%s\"", path)));
01060 
01061     snapshot.xip = (TransactionId *) palloc(xcnt * sizeof(TransactionId));
01062     for (i = 0; i < xcnt; i++)
01063         snapshot.xip[i] = parseXidFromText("xip:", &filebuf, path);
01064 
01065     snapshot.suboverflowed = parseIntFromText("sof:", &filebuf, path);
01066 
01067     if (!snapshot.suboverflowed)
01068     {
01069         snapshot.subxcnt = xcnt = parseIntFromText("sxcnt:", &filebuf, path);
01070 
01071         /* sanity-check the xid count before palloc */
01072         if (xcnt < 0 || xcnt > GetMaxSnapshotSubxidCount())
01073             ereport(ERROR,
01074                     (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
01075                      errmsg("invalid snapshot data in file \"%s\"", path)));
01076 
01077         snapshot.subxip = (TransactionId *) palloc(xcnt * sizeof(TransactionId));
01078         for (i = 0; i < xcnt; i++)
01079             snapshot.subxip[i] = parseXidFromText("sxp:", &filebuf, path);
01080     }
01081     else
01082     {
01083         snapshot.subxcnt = 0;
01084         snapshot.subxip = NULL;
01085     }
01086 
01087     snapshot.takenDuringRecovery = parseIntFromText("rec:", &filebuf, path);
01088 
01089     /*
01090      * Do some additional sanity checking, just to protect ourselves.  We
01091      * don't trouble to check the array elements, just the most critical
01092      * fields.
01093      */
01094     if (!TransactionIdIsNormal(src_xid) ||
01095         !OidIsValid(src_dbid) ||
01096         !TransactionIdIsNormal(snapshot.xmin) ||
01097         !TransactionIdIsNormal(snapshot.xmax))
01098         ereport(ERROR,
01099                 (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
01100                  errmsg("invalid snapshot data in file \"%s\"", path)));
01101 
01102     /*
01103      * If we're serializable, the source transaction must be too, otherwise
01104      * predicate.c has problems (SxactGlobalXmin could go backwards).  Also, a
01105      * non-read-only transaction can't adopt a snapshot from a read-only
01106      * transaction, as predicate.c handles the cases very differently.
01107      */
01108     if (IsolationIsSerializable())
01109     {
01110         if (src_isolevel != XACT_SERIALIZABLE)
01111             ereport(ERROR,
01112                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
01113                      errmsg("a serializable transaction cannot import a snapshot from a non-serializable transaction")));
01114         if (src_readonly && !XactReadOnly)
01115             ereport(ERROR,
01116                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
01117                      errmsg("a non-read-only serializable transaction cannot import a snapshot from a read-only transaction")));
01118     }
01119 
01120     /*
01121      * We cannot import a snapshot that was taken in a different database,
01122      * because vacuum calculates OldestXmin on a per-database basis; so the
01123      * source transaction's xmin doesn't protect us from data loss.  This
01124      * restriction could be removed if the source transaction were to mark its
01125      * xmin as being globally applicable.  But that would require some
01126      * additional syntax, since that has to be known when the snapshot is
01127      * initially taken.  (See pgsql-hackers discussion of 2011-10-21.)
01128      */
01129     if (src_dbid != MyDatabaseId)
01130         ereport(ERROR,
01131                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
01132               errmsg("cannot import a snapshot from a different database")));
01133 
01134     /* OK, install the snapshot */
01135     SetTransactionSnapshot(&snapshot, src_xid);
01136 }
01137 
01138 /*
01139  * XactHasExportedSnapshots
01140  *      Test whether current transaction has exported any snapshots.
01141  */
01142 bool
01143 XactHasExportedSnapshots(void)
01144 {
01145     return (exportedSnapshots != NIL);
01146 }
01147 
01148 /*
01149  * DeleteAllExportedSnapshotFiles
01150  *      Clean up any files that have been left behind by a crashed backend
01151  *      that had exported snapshots before it died.
01152  *
01153  * This should be called during database startup or crash recovery.
01154  */
01155 void
01156 DeleteAllExportedSnapshotFiles(void)
01157 {
01158     char        buf[MAXPGPATH];
01159     DIR        *s_dir;
01160     struct dirent *s_de;
01161 
01162     if (!(s_dir = AllocateDir(SNAPSHOT_EXPORT_DIR)))
01163     {
01164         /*
01165          * We really should have that directory in a sane cluster setup. But
01166          * then again if we don't, it's not fatal enough to make it FATAL.
01167          * Since we're running in the postmaster, LOG is our best bet.
01168          */
01169         elog(LOG, "could not open directory \"%s\": %m", SNAPSHOT_EXPORT_DIR);
01170         return;
01171     }
01172 
01173     while ((s_de = ReadDir(s_dir, SNAPSHOT_EXPORT_DIR)) != NULL)
01174     {
01175         if (strcmp(s_de->d_name, ".") == 0 ||
01176             strcmp(s_de->d_name, "..") == 0)
01177             continue;
01178 
01179         snprintf(buf, MAXPGPATH, SNAPSHOT_EXPORT_DIR "/%s", s_de->d_name);
01180         /* Again, unlink failure is not worthy of FATAL */
01181         if (unlink(buf))
01182             elog(LOG, "could not unlink file \"%s\": %m", buf);
01183     }
01184 
01185     FreeDir(s_dir);
01186 }
01187 
01188 bool
01189 ThereAreNoPriorRegisteredSnapshots(void)
01190 {
01191     if (RegisteredSnapshots <= 1)
01192         return true;
01193 
01194     return false;
01195 }