00001 /*------------------------------------------------------------------------- 00002 * 00003 * predicate_internals.h 00004 * POSTGRES internal predicate locking definitions. 00005 * 00006 * 00007 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group 00008 * Portions Copyright (c) 1994, Regents of the University of California 00009 * 00010 * src/include/storage/predicate_internals.h 00011 * 00012 *------------------------------------------------------------------------- 00013 */ 00014 #ifndef PREDICATE_INTERNALS_H 00015 #define PREDICATE_INTERNALS_H 00016 00017 #include "storage/lock.h" 00018 00019 /* 00020 * Commit number. 00021 */ 00022 typedef uint64 SerCommitSeqNo; 00023 00024 /* 00025 * Reserved commit sequence numbers: 00026 * - 0 is reserved to indicate a non-existent SLRU entry; it cannot be 00027 * used as a SerCommitSeqNo, even an invalid one 00028 * - InvalidSerCommitSeqNo is used to indicate a transaction that 00029 * hasn't committed yet, so use a number greater than all valid 00030 * ones to make comparison do the expected thing 00031 * - RecoverySerCommitSeqNo is used to refer to transactions that 00032 * happened before a crash/recovery, since we restart the sequence 00033 * at that point. It's earlier than all normal sequence numbers, 00034 * and is only used by recovered prepared transactions 00035 */ 00036 #define InvalidSerCommitSeqNo ((SerCommitSeqNo) UINT64CONST(0xFFFFFFFFFFFFFFFF)) 00037 #define RecoverySerCommitSeqNo ((SerCommitSeqNo) 1) 00038 #define FirstNormalSerCommitSeqNo ((SerCommitSeqNo) 2) 00039 00040 /* 00041 * The SERIALIZABLEXACT struct contains information needed for each 00042 * serializable database transaction to support SSI techniques. 00043 * 00044 * A home-grown list is maintained in shared memory to manage these. 00045 * An entry is used when the serializable transaction acquires a snapshot. 00046 * Unless the transaction is rolled back, this entry must generally remain 00047 * until all concurrent transactions have completed. (There are special 00048 * optimizations for READ ONLY transactions which often allow them to be 00049 * cleaned up earlier.) A transaction which is rolled back is cleaned up 00050 * as soon as possible. 00051 * 00052 * Eligibility for cleanup of committed transactions is generally determined 00053 * by comparing the transaction's finishedBefore field to 00054 * SerializableGlobalXmin. 00055 */ 00056 typedef struct SERIALIZABLEXACT 00057 { 00058 VirtualTransactionId vxid; /* The executing process always has one of 00059 * these. */ 00060 00061 /* 00062 * We use two numbers to track the order that transactions commit. Before 00063 * commit, a transaction is marked as prepared, and prepareSeqNo is set. 00064 * Shortly after commit, it's marked as committed, and commitSeqNo is set. 00065 * This doesn't give a strict commit order, but these two values together 00066 * are good enough for us, as we can always err on the safe side and 00067 * assume that there's a conflict, if we can't be sure of the exact 00068 * ordering of two commits. 00069 * 00070 * Note that a transaction is marked as prepared for a short period during 00071 * commit processing, even if two-phase commit is not used. But with 00072 * two-phase commit, a transaction can stay in prepared state for some 00073 * time. 00074 */ 00075 SerCommitSeqNo prepareSeqNo; 00076 SerCommitSeqNo commitSeqNo; 00077 00078 /* these values are not both interesting at the same time */ 00079 union 00080 { 00081 SerCommitSeqNo earliestOutConflictCommit; /* when committed with 00082 * conflict out */ 00083 SerCommitSeqNo lastCommitBeforeSnapshot; /* when not committed or 00084 * no conflict out */ 00085 } SeqNo; 00086 SHM_QUEUE outConflicts; /* list of write transactions whose data we 00087 * couldn't read. */ 00088 SHM_QUEUE inConflicts; /* list of read transactions which couldn't 00089 * see our write. */ 00090 SHM_QUEUE predicateLocks; /* list of associated PREDICATELOCK objects */ 00091 SHM_QUEUE finishedLink; /* list link in 00092 * FinishedSerializableTransactions */ 00093 00094 /* 00095 * for r/o transactions: list of concurrent r/w transactions that we could 00096 * potentially have conflicts with, and vice versa for r/w transactions 00097 */ 00098 SHM_QUEUE possibleUnsafeConflicts; 00099 00100 TransactionId topXid; /* top level xid for the transaction, if one 00101 * exists; else invalid */ 00102 TransactionId finishedBefore; /* invalid means still running; else 00103 * the struct expires when no 00104 * serializable xids are before this. */ 00105 TransactionId xmin; /* the transaction's snapshot xmin */ 00106 uint32 flags; /* OR'd combination of values defined below */ 00107 int pid; /* pid of associated process */ 00108 } SERIALIZABLEXACT; 00109 00110 #define SXACT_FLAG_COMMITTED 0x00000001 /* already committed */ 00111 #define SXACT_FLAG_PREPARED 0x00000002 /* about to commit */ 00112 #define SXACT_FLAG_ROLLED_BACK 0x00000004 /* already rolled back */ 00113 #define SXACT_FLAG_DOOMED 0x00000008 /* will roll back */ 00114 /* 00115 * The following flag actually means that the flagged transaction has a 00116 * conflict out *to a transaction which committed ahead of it*. It's hard 00117 * to get that into a name of a reasonable length. 00118 */ 00119 #define SXACT_FLAG_CONFLICT_OUT 0x00000010 00120 #define SXACT_FLAG_READ_ONLY 0x00000020 00121 #define SXACT_FLAG_DEFERRABLE_WAITING 0x00000040 00122 #define SXACT_FLAG_RO_SAFE 0x00000080 00123 #define SXACT_FLAG_RO_UNSAFE 0x00000100 00124 #define SXACT_FLAG_SUMMARY_CONFLICT_IN 0x00000200 00125 #define SXACT_FLAG_SUMMARY_CONFLICT_OUT 0x00000400 00126 00127 /* 00128 * The following types are used to provide an ad hoc list for holding 00129 * SERIALIZABLEXACT objects. An HTAB is overkill, since there is no need to 00130 * access these by key -- there are direct pointers to these objects where 00131 * needed. If a shared memory list is created, these types can probably be 00132 * eliminated in favor of using the general solution. 00133 */ 00134 typedef struct PredXactListElementData 00135 { 00136 SHM_QUEUE link; 00137 SERIALIZABLEXACT sxact; 00138 } PredXactListElementData; 00139 00140 typedef struct PredXactListElementData *PredXactListElement; 00141 00142 #define PredXactListElementDataSize \ 00143 ((Size)MAXALIGN(sizeof(PredXactListElementData))) 00144 00145 typedef struct PredXactListData 00146 { 00147 SHM_QUEUE availableList; 00148 SHM_QUEUE activeList; 00149 00150 /* 00151 * These global variables are maintained when registering and cleaning up 00152 * serializable transactions. They must be global across all backends, 00153 * but are not needed outside the predicate.c source file. Protected by 00154 * SerializableXactHashLock. 00155 */ 00156 TransactionId SxactGlobalXmin; /* global xmin for active serializable 00157 * transactions */ 00158 int SxactGlobalXminCount; /* how many active serializable 00159 * transactions have this xmin */ 00160 int WritableSxactCount; /* how many non-read-only serializable 00161 * transactions are active */ 00162 SerCommitSeqNo LastSxactCommitSeqNo; /* a strictly monotonically 00163 * increasing number for 00164 * commits of serializable 00165 * transactions */ 00166 /* Protected by SerializableXactHashLock. */ 00167 SerCommitSeqNo CanPartialClearThrough; /* can clear predicate locks 00168 * and inConflicts for 00169 * committed transactions 00170 * through this seq no */ 00171 /* Protected by SerializableFinishedListLock. */ 00172 SerCommitSeqNo HavePartialClearedThrough; /* have cleared through this 00173 * seq no */ 00174 SERIALIZABLEXACT *OldCommittedSxact; /* shared copy of dummy sxact */ 00175 00176 PredXactListElement element; 00177 } PredXactListData; 00178 00179 typedef struct PredXactListData *PredXactList; 00180 00181 #define PredXactListDataSize \ 00182 ((Size)MAXALIGN(sizeof(PredXactListData))) 00183 00184 00185 /* 00186 * The following types are used to provide lists of rw-conflicts between 00187 * pairs of transactions. Since exactly the same information is needed, 00188 * they are also used to record possible unsafe transaction relationships 00189 * for purposes of identifying safe snapshots for read-only transactions. 00190 * 00191 * When a RWConflictData is not in use to record either type of relationship 00192 * between a pair of transactions, it is kept on an "available" list. The 00193 * outLink field is used for maintaining that list. 00194 */ 00195 typedef struct RWConflictData 00196 { 00197 SHM_QUEUE outLink; /* link for list of conflicts out from a sxact */ 00198 SHM_QUEUE inLink; /* link for list of conflicts in to a sxact */ 00199 SERIALIZABLEXACT *sxactOut; 00200 SERIALIZABLEXACT *sxactIn; 00201 } RWConflictData; 00202 00203 typedef struct RWConflictData *RWConflict; 00204 00205 #define RWConflictDataSize \ 00206 ((Size)MAXALIGN(sizeof(RWConflictData))) 00207 00208 typedef struct RWConflictPoolHeaderData 00209 { 00210 SHM_QUEUE availableList; 00211 RWConflict element; 00212 } RWConflictPoolHeaderData; 00213 00214 typedef struct RWConflictPoolHeaderData *RWConflictPoolHeader; 00215 00216 #define RWConflictPoolHeaderDataSize \ 00217 ((Size)MAXALIGN(sizeof(RWConflictPoolHeaderData))) 00218 00219 00220 /* 00221 * The SERIALIZABLEXIDTAG struct identifies an xid assigned to a serializable 00222 * transaction or any of its subtransactions. 00223 */ 00224 typedef struct SERIALIZABLEXIDTAG 00225 { 00226 TransactionId xid; 00227 } SERIALIZABLEXIDTAG; 00228 00229 /* 00230 * The SERIALIZABLEXID struct provides a link from a TransactionId for a 00231 * serializable transaction to the related SERIALIZABLEXACT record, even if 00232 * the transaction has completed and its connection has been closed. 00233 * 00234 * These are created as new top level transaction IDs are first assigned to 00235 * transactions which are participating in predicate locking. This may 00236 * never happen for a particular transaction if it doesn't write anything. 00237 * They are removed with their related serializable transaction objects. 00238 * 00239 * The SubTransGetTopmostTransaction method is used where necessary to get 00240 * from an XID which might be from a subtransaction to the top level XID. 00241 */ 00242 typedef struct SERIALIZABLEXID 00243 { 00244 /* hash key */ 00245 SERIALIZABLEXIDTAG tag; 00246 00247 /* data */ 00248 SERIALIZABLEXACT *myXact; /* pointer to the top level transaction data */ 00249 } SERIALIZABLEXID; 00250 00251 00252 /* 00253 * The PREDICATELOCKTARGETTAG struct identifies a database object which can 00254 * be the target of predicate locks. 00255 * 00256 * Note that the hash function being used doesn't properly respect tag 00257 * length -- it will go to a four byte boundary past the end of the tag. 00258 * If you change this struct, make sure any slack space is initialized, 00259 * so that any random bytes in the middle or at the end are not included 00260 * in the hash. 00261 * 00262 * TODO SSI: If we always use the same fields for the same type of value, we 00263 * should rename these. Holding off until it's clear there are no exceptions. 00264 * Since indexes are relations with blocks and tuples, it's looking likely that 00265 * the rename will be possible. If not, we may need to divide the last field 00266 * and use part of it for a target type, so that we know how to interpret the 00267 * data.. 00268 */ 00269 typedef struct PREDICATELOCKTARGETTAG 00270 { 00271 uint32 locktag_field1; /* a 32-bit ID field */ 00272 uint32 locktag_field2; /* a 32-bit ID field */ 00273 uint32 locktag_field3; /* a 32-bit ID field */ 00274 uint32 locktag_field4; /* a 32-bit ID field */ 00275 uint32 locktag_field5; /* a 32-bit ID field */ 00276 } PREDICATELOCKTARGETTAG; 00277 00278 /* 00279 * The PREDICATELOCKTARGET struct represents a database object on which there 00280 * are predicate locks. 00281 * 00282 * A hash list of these objects is maintained in shared memory. An entry is 00283 * added when a predicate lock is requested on an object which doesn't 00284 * already have one. An entry is removed when the last lock is removed from 00285 * its list. 00286 * 00287 * Because a particular target might become obsolete, due to update to a new 00288 * version, before the reading transaction is obsolete, we need some way to 00289 * prevent errors from reuse of a tuple ID. Rather than attempting to clean 00290 * up the targets as the related tuples are pruned or vacuumed, we check the 00291 * xmin on access. This should be far less costly. 00292 */ 00293 typedef struct PREDICATELOCKTARGET 00294 { 00295 /* hash key */ 00296 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ 00297 00298 /* data */ 00299 SHM_QUEUE predicateLocks; /* list of PREDICATELOCK objects assoc. with 00300 * predicate lock target */ 00301 } PREDICATELOCKTARGET; 00302 00303 00304 /* 00305 * The PREDICATELOCKTAG struct identifies an individual predicate lock. 00306 * 00307 * It is the combination of predicate lock target (which is a lockable 00308 * object) and a serializable transaction which has acquired a lock on that 00309 * target. 00310 */ 00311 typedef struct PREDICATELOCKTAG 00312 { 00313 PREDICATELOCKTARGET *myTarget; 00314 SERIALIZABLEXACT *myXact; 00315 } PREDICATELOCKTAG; 00316 00317 /* 00318 * The PREDICATELOCK struct represents an individual lock. 00319 * 00320 * An entry can be created here when the related database object is read, or 00321 * by promotion of multiple finer-grained targets. All entries related to a 00322 * serializable transaction are removed when that serializable transaction is 00323 * cleaned up. Entries can also be removed when they are combined into a 00324 * single coarser-grained lock entry. 00325 */ 00326 typedef struct PREDICATELOCK 00327 { 00328 /* hash key */ 00329 PREDICATELOCKTAG tag; /* unique identifier of lock */ 00330 00331 /* data */ 00332 SHM_QUEUE targetLink; /* list link in PREDICATELOCKTARGET's list of 00333 * predicate locks */ 00334 SHM_QUEUE xactLink; /* list link in SERIALIZABLEXACT's list of 00335 * predicate locks */ 00336 SerCommitSeqNo commitSeqNo; /* only used for summarized predicate locks */ 00337 } PREDICATELOCK; 00338 00339 00340 /* 00341 * The LOCALPREDICATELOCK struct represents a local copy of data which is 00342 * also present in the PREDICATELOCK table, organized for fast access without 00343 * needing to acquire a LWLock. It is strictly for optimization. 00344 * 00345 * Each serializable transaction creates its own local hash table to hold a 00346 * collection of these. This information is used to determine when a number 00347 * of fine-grained locks should be promoted to a single coarser-grained lock. 00348 * The information is maintained more-or-less in parallel to the 00349 * PREDICATELOCK data, but because this data is not protected by locks and is 00350 * only used in an optimization heuristic, it is allowed to drift in a few 00351 * corner cases where maintaining exact data would be expensive. 00352 * 00353 * The hash table is created when the serializable transaction acquires its 00354 * snapshot, and its memory is released upon completion of the transaction. 00355 */ 00356 typedef struct LOCALPREDICATELOCK 00357 { 00358 /* hash key */ 00359 PREDICATELOCKTARGETTAG tag; /* unique identifier of lockable object */ 00360 00361 /* data */ 00362 bool held; /* is lock held, or just its children? */ 00363 int childLocks; /* number of child locks currently held */ 00364 } LOCALPREDICATELOCK; 00365 00366 00367 /* 00368 * The types of predicate locks which can be acquired. 00369 */ 00370 typedef enum PredicateLockTargetType 00371 { 00372 PREDLOCKTAG_RELATION, 00373 PREDLOCKTAG_PAGE, 00374 PREDLOCKTAG_TUPLE 00375 /* TODO SSI: Other types may be needed for index locking */ 00376 } PredicateLockTargetType; 00377 00378 00379 /* 00380 * This structure is used to quickly capture a copy of all predicate 00381 * locks. This is currently used only by the pg_lock_status function, 00382 * which in turn is used by the pg_locks view. 00383 */ 00384 typedef struct PredicateLockData 00385 { 00386 int nelements; 00387 PREDICATELOCKTARGETTAG *locktags; 00388 SERIALIZABLEXACT *xacts; 00389 } PredicateLockData; 00390 00391 00392 /* 00393 * These macros define how we map logical IDs of lockable objects into the 00394 * physical fields of PREDICATELOCKTARGETTAG. Use these to set up values, 00395 * rather than accessing the fields directly. Note multiple eval of target! 00396 */ 00397 #define SET_PREDICATELOCKTARGETTAG_RELATION(locktag,dboid,reloid) \ 00398 ((locktag).locktag_field1 = (dboid), \ 00399 (locktag).locktag_field2 = (reloid), \ 00400 (locktag).locktag_field3 = InvalidBlockNumber, \ 00401 (locktag).locktag_field4 = InvalidOffsetNumber, \ 00402 (locktag).locktag_field5 = InvalidTransactionId) 00403 00404 #define SET_PREDICATELOCKTARGETTAG_PAGE(locktag,dboid,reloid,blocknum) \ 00405 ((locktag).locktag_field1 = (dboid), \ 00406 (locktag).locktag_field2 = (reloid), \ 00407 (locktag).locktag_field3 = (blocknum), \ 00408 (locktag).locktag_field4 = InvalidOffsetNumber, \ 00409 (locktag).locktag_field5 = InvalidTransactionId) 00410 00411 #define SET_PREDICATELOCKTARGETTAG_TUPLE(locktag,dboid,reloid,blocknum,offnum,xmin) \ 00412 ((locktag).locktag_field1 = (dboid), \ 00413 (locktag).locktag_field2 = (reloid), \ 00414 (locktag).locktag_field3 = (blocknum), \ 00415 (locktag).locktag_field4 = (offnum), \ 00416 (locktag).locktag_field5 = (xmin)) 00417 00418 #define GET_PREDICATELOCKTARGETTAG_DB(locktag) \ 00419 ((Oid) (locktag).locktag_field1) 00420 #define GET_PREDICATELOCKTARGETTAG_RELATION(locktag) \ 00421 ((Oid) (locktag).locktag_field2) 00422 #define GET_PREDICATELOCKTARGETTAG_PAGE(locktag) \ 00423 ((BlockNumber) (locktag).locktag_field3) 00424 #define GET_PREDICATELOCKTARGETTAG_OFFSET(locktag) \ 00425 ((OffsetNumber) (locktag).locktag_field4) 00426 #define GET_PREDICATELOCKTARGETTAG_XMIN(locktag) \ 00427 ((TransactionId) (locktag).locktag_field5) 00428 #define GET_PREDICATELOCKTARGETTAG_TYPE(locktag) \ 00429 (((locktag).locktag_field4 != InvalidOffsetNumber) ? PREDLOCKTAG_TUPLE : \ 00430 (((locktag).locktag_field3 != InvalidBlockNumber) ? PREDLOCKTAG_PAGE : \ 00431 PREDLOCKTAG_RELATION)) 00432 00433 /* 00434 * Two-phase commit statefile records. There are two types: for each 00435 * transaction, we generate one per-transaction record and a variable 00436 * number of per-predicate-lock records. 00437 */ 00438 typedef enum TwoPhasePredicateRecordType 00439 { 00440 TWOPHASEPREDICATERECORD_XACT, 00441 TWOPHASEPREDICATERECORD_LOCK 00442 } TwoPhasePredicateRecordType; 00443 00444 /* 00445 * Per-transaction information to reconstruct a SERIALIZABLEXACT. Not 00446 * much is needed because most of it not meaningful for a recovered 00447 * prepared transaction. 00448 * 00449 * In particular, we do not record the in and out conflict lists for a 00450 * prepared transaction because the associated SERIALIZABLEXACTs will 00451 * not be available after recovery. Instead, we simply record the 00452 * existence of each type of conflict by setting the transaction's 00453 * summary conflict in/out flag. 00454 */ 00455 typedef struct TwoPhasePredicateXactRecord 00456 { 00457 TransactionId xmin; 00458 uint32 flags; 00459 } TwoPhasePredicateXactRecord; 00460 00461 /* Per-lock state */ 00462 typedef struct TwoPhasePredicateLockRecord 00463 { 00464 PREDICATELOCKTARGETTAG target; 00465 } TwoPhasePredicateLockRecord; 00466 00467 typedef struct TwoPhasePredicateRecord 00468 { 00469 TwoPhasePredicateRecordType type; 00470 union 00471 { 00472 TwoPhasePredicateXactRecord xactRecord; 00473 TwoPhasePredicateLockRecord lockRecord; 00474 } data; 00475 } TwoPhasePredicateRecord; 00476 00477 /* 00478 * Define a macro to use for an "empty" SERIALIZABLEXACT reference. 00479 */ 00480 #define InvalidSerializableXact ((SERIALIZABLEXACT *) NULL) 00481 00482 00483 /* 00484 * Function definitions for functions needing awareness of predicate 00485 * locking internals. 00486 */ 00487 extern PredicateLockData *GetPredicateLockStatusData(void); 00488 00489 00490 #endif /* PREDICATE_INTERNALS_H */