Header And Logo

PostgreSQL
| The world's most advanced open source database.

predicate_internals.h

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * predicate_internals.h
00004  *    POSTGRES internal predicate locking definitions.
00005  *
00006  *
00007  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00008  * Portions Copyright (c) 1994, Regents of the University of California
00009  *
00010  * src/include/storage/predicate_internals.h
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 #ifndef PREDICATE_INTERNALS_H
00015 #define PREDICATE_INTERNALS_H
00016 
00017 #include "storage/lock.h"
00018 
00019 /*
00020  * Commit number.
00021  */
00022 typedef uint64 SerCommitSeqNo;
00023 
00024 /*
00025  * Reserved commit sequence numbers:
00026  *  - 0 is reserved to indicate a non-existent SLRU entry; it cannot be
00027  *    used as a SerCommitSeqNo, even an invalid one
00028  *  - InvalidSerCommitSeqNo is used to indicate a transaction that
00029  *    hasn't committed yet, so use a number greater than all valid
00030  *    ones to make comparison do the expected thing
00031  *  - RecoverySerCommitSeqNo is used to refer to transactions that
00032  *    happened before a crash/recovery, since we restart the sequence
00033  *    at that point.  It's earlier than all normal sequence numbers,
00034  *    and is only used by recovered prepared transactions
00035  */
00036 #define InvalidSerCommitSeqNo       ((SerCommitSeqNo) UINT64CONST(0xFFFFFFFFFFFFFFFF))
00037 #define RecoverySerCommitSeqNo      ((SerCommitSeqNo) 1)
00038 #define FirstNormalSerCommitSeqNo   ((SerCommitSeqNo) 2)
00039 
00040 /*
00041  * The SERIALIZABLEXACT struct contains information needed for each
00042  * serializable database transaction to support SSI techniques.
00043  *
00044  * A home-grown list is maintained in shared memory to manage these.
00045  * An entry is used when the serializable transaction acquires a snapshot.
00046  * Unless the transaction is rolled back, this entry must generally remain
00047  * until all concurrent transactions have completed.  (There are special
00048  * optimizations for READ ONLY transactions which often allow them to be
00049  * cleaned up earlier.)  A transaction which is rolled back is cleaned up
00050  * as soon as possible.
00051  *
00052  * Eligibility for cleanup of committed transactions is generally determined
00053  * by comparing the transaction's finishedBefore field to
00054  * SerializableGlobalXmin.
00055  */
00056 typedef struct SERIALIZABLEXACT
00057 {
00058     VirtualTransactionId vxid;  /* The executing process always has one of
00059                                  * these. */
00060 
00061     /*
00062      * We use two numbers to track the order that transactions commit. Before
00063      * commit, a transaction is marked as prepared, and prepareSeqNo is set.
00064      * Shortly after commit, it's marked as committed, and commitSeqNo is set.
00065      * This doesn't give a strict commit order, but these two values together
00066      * are good enough for us, as we can always err on the safe side and
00067      * assume that there's a conflict, if we can't be sure of the exact
00068      * ordering of two commits.
00069      *
00070      * Note that a transaction is marked as prepared for a short period during
00071      * commit processing, even if two-phase commit is not used. But with
00072      * two-phase commit, a transaction can stay in prepared state for some
00073      * time.
00074      */
00075     SerCommitSeqNo prepareSeqNo;
00076     SerCommitSeqNo commitSeqNo;
00077 
00078     /* these values are not both interesting at the same time */
00079     union
00080     {
00081         SerCommitSeqNo earliestOutConflictCommit;       /* when committed with
00082                                                          * conflict out */
00083         SerCommitSeqNo lastCommitBeforeSnapshot;        /* when not committed or
00084                                                          * no conflict out */
00085     }           SeqNo;
00086     SHM_QUEUE   outConflicts;   /* list of write transactions whose data we
00087                                  * couldn't read. */
00088     SHM_QUEUE   inConflicts;    /* list of read transactions which couldn't
00089                                  * see our write. */
00090     SHM_QUEUE   predicateLocks; /* list of associated PREDICATELOCK objects */
00091     SHM_QUEUE   finishedLink;   /* list link in
00092                                  * FinishedSerializableTransactions */
00093 
00094     /*
00095      * for r/o transactions: list of concurrent r/w transactions that we could
00096      * potentially have conflicts with, and vice versa for r/w transactions
00097      */
00098     SHM_QUEUE   possibleUnsafeConflicts;
00099 
00100     TransactionId topXid;       /* top level xid for the transaction, if one
00101                                  * exists; else invalid */
00102     TransactionId finishedBefore;       /* invalid means still running; else
00103                                          * the struct expires when no
00104                                          * serializable xids are before this. */
00105     TransactionId xmin;         /* the transaction's snapshot xmin */
00106     uint32      flags;          /* OR'd combination of values defined below */
00107     int         pid;            /* pid of associated process */
00108 } SERIALIZABLEXACT;
00109 
00110 #define SXACT_FLAG_COMMITTED            0x00000001      /* already committed */
00111 #define SXACT_FLAG_PREPARED             0x00000002      /* about to commit */
00112 #define SXACT_FLAG_ROLLED_BACK          0x00000004      /* already rolled back */
00113 #define SXACT_FLAG_DOOMED               0x00000008      /* will roll back */
00114 /*
00115  * The following flag actually means that the flagged transaction has a
00116  * conflict out *to a transaction which committed ahead of it*.  It's hard
00117  * to get that into a name of a reasonable length.
00118  */
00119 #define SXACT_FLAG_CONFLICT_OUT         0x00000010
00120 #define SXACT_FLAG_READ_ONLY            0x00000020
00121 #define SXACT_FLAG_DEFERRABLE_WAITING   0x00000040
00122 #define SXACT_FLAG_RO_SAFE              0x00000080
00123 #define SXACT_FLAG_RO_UNSAFE            0x00000100
00124 #define SXACT_FLAG_SUMMARY_CONFLICT_IN  0x00000200
00125 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400
00126 
00127 /*
00128  * The following types are used to provide an ad hoc list for holding
00129  * SERIALIZABLEXACT objects.  An HTAB is overkill, since there is no need to
00130  * access these by key -- there are direct pointers to these objects where
00131  * needed.  If a shared memory list is created, these types can probably be
00132  * eliminated in favor of using the general solution.
00133  */
00134 typedef struct PredXactListElementData
00135 {
00136     SHM_QUEUE   link;
00137     SERIALIZABLEXACT sxact;
00138 }   PredXactListElementData;
00139 
00140 typedef struct PredXactListElementData *PredXactListElement;
00141 
00142 #define PredXactListElementDataSize \
00143         ((Size)MAXALIGN(sizeof(PredXactListElementData)))
00144 
00145 typedef struct PredXactListData
00146 {
00147     SHM_QUEUE   availableList;
00148     SHM_QUEUE   activeList;
00149 
00150     /*
00151      * These global variables are maintained when registering and cleaning up
00152      * serializable transactions.  They must be global across all backends,
00153      * but are not needed outside the predicate.c source file. Protected by
00154      * SerializableXactHashLock.
00155      */
00156     TransactionId SxactGlobalXmin;      /* global xmin for active serializable
00157                                          * transactions */
00158     int         SxactGlobalXminCount;   /* how many active serializable
00159                                          * transactions have this xmin */
00160     int         WritableSxactCount;     /* how many non-read-only serializable
00161                                          * transactions are active */
00162     SerCommitSeqNo LastSxactCommitSeqNo;        /* a strictly monotonically
00163                                                  * increasing number for
00164                                                  * commits of serializable
00165                                                  * transactions */
00166     /* Protected by SerializableXactHashLock. */
00167     SerCommitSeqNo CanPartialClearThrough;      /* can clear predicate locks
00168                                                  * and inConflicts for
00169                                                  * committed transactions
00170                                                  * through this seq no */
00171     /* Protected by SerializableFinishedListLock. */
00172     SerCommitSeqNo HavePartialClearedThrough;   /* have cleared through this
00173                                                  * seq no */
00174     SERIALIZABLEXACT *OldCommittedSxact;        /* shared copy of dummy sxact */
00175 
00176     PredXactListElement element;
00177 }   PredXactListData;
00178 
00179 typedef struct PredXactListData *PredXactList;
00180 
00181 #define PredXactListDataSize \
00182         ((Size)MAXALIGN(sizeof(PredXactListData)))
00183 
00184 
00185 /*
00186  * The following types are used to provide lists of rw-conflicts between
00187  * pairs of transactions.  Since exactly the same information is needed,
00188  * they are also used to record possible unsafe transaction relationships
00189  * for purposes of identifying safe snapshots for read-only transactions.
00190  *
00191  * When a RWConflictData is not in use to record either type of relationship
00192  * between a pair of transactions, it is kept on an "available" list.  The
00193  * outLink field is used for maintaining that list.
00194  */
00195 typedef struct RWConflictData
00196 {
00197     SHM_QUEUE   outLink;        /* link for list of conflicts out from a sxact */
00198     SHM_QUEUE   inLink;         /* link for list of conflicts in to a sxact */
00199     SERIALIZABLEXACT *sxactOut;
00200     SERIALIZABLEXACT *sxactIn;
00201 }   RWConflictData;
00202 
00203 typedef struct RWConflictData *RWConflict;
00204 
00205 #define RWConflictDataSize \
00206         ((Size)MAXALIGN(sizeof(RWConflictData)))
00207 
00208 typedef struct RWConflictPoolHeaderData
00209 {
00210     SHM_QUEUE   availableList;
00211     RWConflict  element;
00212 }   RWConflictPoolHeaderData;
00213 
00214 typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader;
00215 
00216 #define RWConflictPoolHeaderDataSize \
00217         ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData)))
00218 
00219 
00220 /*
00221  * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable
00222  * transaction or any of its subtransactions.
00223  */
00224 typedef struct SERIALIZABLEXIDTAG
00225 {
00226     TransactionId xid;
00227 } SERIALIZABLEXIDTAG;
00228 
00229 /*
00230  * The SERIALIZABLEXID struct provides a link from a TransactionId for a
00231  * serializable transaction to the related SERIALIZABLEXACT record, even if
00232  * the transaction has completed and its connection has been closed.
00233  *
00234  * These are created as new top level transaction IDs are first assigned to
00235  * transactions which are participating in predicate locking.  This may
00236  * never happen for a particular transaction if it doesn't write anything.
00237  * They are removed with their related serializable transaction objects.
00238  *
00239  * The SubTransGetTopmostTransaction method is used where necessary to get
00240  * from an XID which might be from a subtransaction to the top level XID.
00241  */
00242 typedef struct SERIALIZABLEXID
00243 {
00244     /* hash key */
00245     SERIALIZABLEXIDTAG tag;
00246 
00247     /* data */
00248     SERIALIZABLEXACT *myXact;   /* pointer to the top level transaction data */
00249 } SERIALIZABLEXID;
00250 
00251 
00252 /*
00253  * The PREDICATELOCKTARGETTAG struct identifies a database object which can
00254  * be the target of predicate locks.
00255  *
00256  * Note that the hash function being used doesn't properly respect tag
00257  * length -- it will go to a four byte boundary past the end of the tag.
00258  * If you change this struct, make sure any slack space is initialized,
00259  * so that any random bytes in the middle or at the end are not included
00260  * in the hash.
00261  *
00262  * TODO SSI: If we always use the same fields for the same type of value, we
00263  * should rename these.  Holding off until it's clear there are no exceptions.
00264  * Since indexes are relations with blocks and tuples, it's looking likely that
00265  * the rename will be possible.  If not, we may need to divide the last field
00266  * and use part of it for a target type, so that we know how to interpret the
00267  * data..
00268  */
00269 typedef struct PREDICATELOCKTARGETTAG
00270 {
00271     uint32      locktag_field1; /* a 32-bit ID field */
00272     uint32      locktag_field2; /* a 32-bit ID field */
00273     uint32      locktag_field3; /* a 32-bit ID field */
00274     uint32      locktag_field4; /* a 32-bit ID field */
00275     uint32      locktag_field5; /* a 32-bit ID field */
00276 } PREDICATELOCKTARGETTAG;
00277 
00278 /*
00279  * The PREDICATELOCKTARGET struct represents a database object on which there
00280  * are predicate locks.
00281  *
00282  * A hash list of these objects is maintained in shared memory.  An entry is
00283  * added when a predicate lock is requested on an object which doesn't
00284  * already have one.  An entry is removed when the last lock is removed from
00285  * its list.
00286  *
00287  * Because a particular target might become obsolete, due to update to a new
00288  * version, before the reading transaction is obsolete, we need some way to
00289  * prevent errors from reuse of a tuple ID.  Rather than attempting to clean
00290  * up the targets as the related tuples are pruned or vacuumed, we check the
00291  * xmin on access.  This should be far less costly.
00292  */
00293 typedef struct PREDICATELOCKTARGET
00294 {
00295     /* hash key */
00296     PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
00297 
00298     /* data */
00299     SHM_QUEUE   predicateLocks; /* list of PREDICATELOCK objects assoc. with
00300                                  * predicate lock target */
00301 } PREDICATELOCKTARGET;
00302 
00303 
00304 /*
00305  * The PREDICATELOCKTAG struct identifies an individual predicate lock.
00306  *
00307  * It is the combination of predicate lock target (which is a lockable
00308  * object) and a serializable transaction which has acquired a lock on that
00309  * target.
00310  */
00311 typedef struct PREDICATELOCKTAG
00312 {
00313     PREDICATELOCKTARGET *myTarget;
00314     SERIALIZABLEXACT *myXact;
00315 } PREDICATELOCKTAG;
00316 
00317 /*
00318  * The PREDICATELOCK struct represents an individual lock.
00319  *
00320  * An entry can be created here when the related database object is read, or
00321  * by promotion of multiple finer-grained targets.  All entries related to a
00322  * serializable transaction are removed when that serializable transaction is
00323  * cleaned up.  Entries can also be removed when they are combined into a
00324  * single coarser-grained lock entry.
00325  */
00326 typedef struct PREDICATELOCK
00327 {
00328     /* hash key */
00329     PREDICATELOCKTAG tag;       /* unique identifier of lock */
00330 
00331     /* data */
00332     SHM_QUEUE   targetLink;     /* list link in PREDICATELOCKTARGET's list of
00333                                  * predicate locks */
00334     SHM_QUEUE   xactLink;       /* list link in SERIALIZABLEXACT's list of
00335                                  * predicate locks */
00336     SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */
00337 } PREDICATELOCK;
00338 
00339 
00340 /*
00341  * The LOCALPREDICATELOCK struct represents a local copy of data which is
00342  * also present in the PREDICATELOCK table, organized for fast access without
00343  * needing to acquire a LWLock.  It is strictly for optimization.
00344  *
00345  * Each serializable transaction creates its own local hash table to hold a
00346  * collection of these.  This information is used to determine when a number
00347  * of fine-grained locks should be promoted to a single coarser-grained lock.
00348  * The information is maintained more-or-less in parallel to the
00349  * PREDICATELOCK data, but because this data is not protected by locks and is
00350  * only used in an optimization heuristic, it is allowed to drift in a few
00351  * corner cases where maintaining exact data would be expensive.
00352  *
00353  * The hash table is created when the serializable transaction acquires its
00354  * snapshot, and its memory is released upon completion of the transaction.
00355  */
00356 typedef struct LOCALPREDICATELOCK
00357 {
00358     /* hash key */
00359     PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */
00360 
00361     /* data */
00362     bool        held;           /* is lock held, or just its children?  */
00363     int         childLocks;     /* number of child locks currently held */
00364 } LOCALPREDICATELOCK;
00365 
00366 
00367 /*
00368  * The types of predicate locks which can be acquired.
00369  */
00370 typedef enum PredicateLockTargetType
00371 {
00372     PREDLOCKTAG_RELATION,
00373     PREDLOCKTAG_PAGE,
00374     PREDLOCKTAG_TUPLE
00375     /* TODO SSI: Other types may be needed for index locking */
00376 } PredicateLockTargetType;
00377 
00378 
00379 /*
00380  * This structure is used to quickly capture a copy of all predicate
00381  * locks.  This is currently used only by the pg_lock_status function,
00382  * which in turn is used by the pg_locks view.
00383  */
00384 typedef struct PredicateLockData
00385 {
00386     int         nelements;
00387     PREDICATELOCKTARGETTAG *locktags;
00388     SERIALIZABLEXACT *xacts;
00389 } PredicateLockData;
00390 
00391 
00392 /*
00393  * These macros define how we map logical IDs of lockable objects into the
00394  * physical fields of PREDICATELOCKTARGETTAG.   Use these to set up values,
00395  * rather than accessing the fields directly.  Note multiple eval of target!
00396  */
00397 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \
00398     ((locktag).locktag_field1 = (dboid), \
00399      (locktag).locktag_field2 = (reloid), \
00400      (locktag).locktag_field3 = InvalidBlockNumber, \
00401      (locktag).locktag_field4 = InvalidOffsetNumber, \
00402      (locktag).locktag_field5 = InvalidTransactionId)
00403 
00404 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \
00405     ((locktag).locktag_field1 = (dboid), \
00406      (locktag).locktag_field2 = (reloid), \
00407      (locktag).locktag_field3 = (blocknum), \
00408      (locktag).locktag_field4 = InvalidOffsetNumber, \
00409      (locktag).locktag_field5 = InvalidTransactionId)
00410 
00411 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum,xmin) \
00412     ((locktag).locktag_field1 = (dboid), \
00413      (locktag).locktag_field2 = (reloid), \
00414      (locktag).locktag_field3 = (blocknum), \
00415      (locktag).locktag_field4 = (offnum), \
00416      (locktag).locktag_field5 = (xmin))
00417 
00418 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \
00419     ((Oid) (locktag).locktag_field1)
00420 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \
00421     ((Oid) (locktag).locktag_field2)
00422 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \
00423     ((BlockNumber) (locktag).locktag_field3)
00424 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \
00425     ((OffsetNumber) (locktag).locktag_field4)
00426 #define GET_PREDICATELOCKTARGETTAG_XMIN(locktag) \
00427     ((TransactionId) (locktag).locktag_field5)
00428 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag)                             \
00429     (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \
00430      (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE :   \
00431       PREDLOCKTAG_RELATION))
00432 
00433 /*
00434  * Two-phase commit statefile records. There are two types: for each
00435  * transaction, we generate one per-transaction record and a variable
00436  * number of per-predicate-lock records.
00437  */
00438 typedef enum TwoPhasePredicateRecordType
00439 {
00440     TWOPHASEPREDICATERECORD_XACT,
00441     TWOPHASEPREDICATERECORD_LOCK
00442 } TwoPhasePredicateRecordType;
00443 
00444 /*
00445  * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not
00446  * much is needed because most of it not meaningful for a recovered
00447  * prepared transaction.
00448  *
00449  * In particular, we do not record the in and out conflict lists for a
00450  * prepared transaction because the associated SERIALIZABLEXACTs will
00451  * not be available after recovery. Instead, we simply record the
00452  * existence of each type of conflict by setting the transaction's
00453  * summary conflict in/out flag.
00454  */
00455 typedef struct TwoPhasePredicateXactRecord
00456 {
00457     TransactionId xmin;
00458     uint32      flags;
00459 } TwoPhasePredicateXactRecord;
00460 
00461 /* Per-lock state */
00462 typedef struct TwoPhasePredicateLockRecord
00463 {
00464     PREDICATELOCKTARGETTAG target;
00465 } TwoPhasePredicateLockRecord;
00466 
00467 typedef struct TwoPhasePredicateRecord
00468 {
00469     TwoPhasePredicateRecordType type;
00470     union
00471     {
00472         TwoPhasePredicateXactRecord xactRecord;
00473         TwoPhasePredicateLockRecord lockRecord;
00474     }           data;
00475 } TwoPhasePredicateRecord;
00476 
00477 /*
00478  * Define a macro to use for an "empty" SERIALIZABLEXACT reference.
00479  */
00480 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL)
00481 
00482 
00483 /*
00484  * Function definitions for functions needing awareness of predicate
00485  * locking internals.
00486  */
00487 extern PredicateLockData *GetPredicateLockStatusData(void);
00488 
00489 
00490 #endif   /* PREDICATE_INTERNALS_H */