Header And Logo

PostgreSQL
| The world's most advanced open source database.

storage.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * storage.c
00004  *    code to create and destroy physical storage for relations
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  * Portions Copyright (c) 1994, Regents of the University of California
00008  *
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/catalog/storage.c
00012  *
00013  * NOTES
00014  *    Some of this code used to be in storage/smgr/smgr.c, and the
00015  *    function names still reflect that.
00016  *
00017  *-------------------------------------------------------------------------
00018  */
00019 
00020 #include "postgres.h"
00021 
00022 #include "access/visibilitymap.h"
00023 #include "access/xact.h"
00024 #include "access/xlogutils.h"
00025 #include "catalog/catalog.h"
00026 #include "catalog/storage.h"
00027 #include "catalog/storage_xlog.h"
00028 #include "storage/freespace.h"
00029 #include "storage/smgr.h"
00030 #include "utils/memutils.h"
00031 #include "utils/rel.h"
00032 
00033 /*
00034  * We keep a list of all relations (represented as RelFileNode values)
00035  * that have been created or deleted in the current transaction.  When
00036  * a relation is created, we create the physical file immediately, but
00037  * remember it so that we can delete the file again if the current
00038  * transaction is aborted.  Conversely, a deletion request is NOT
00039  * executed immediately, but is just entered in the list.  When and if
00040  * the transaction commits, we can delete the physical file.
00041  *
00042  * To handle subtransactions, every entry is marked with its transaction
00043  * nesting level.  At subtransaction commit, we reassign the subtransaction's
00044  * entries to the parent nesting level.  At subtransaction abort, we can
00045  * immediately execute the abort-time actions for all entries of the current
00046  * nesting level.
00047  *
00048  * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
00049  * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
00050  * but I'm being paranoid.
00051  */
00052 
00053 typedef struct PendingRelDelete
00054 {
00055     RelFileNode relnode;        /* relation that may need to be deleted */
00056     BackendId   backend;        /* InvalidBackendId if not a temp rel */
00057     bool        atCommit;       /* T=delete at commit; F=delete at abort */
00058     int         nestLevel;      /* xact nesting level of request */
00059     struct PendingRelDelete *next;      /* linked-list link */
00060 } PendingRelDelete;
00061 
00062 static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
00063 
00064 /*
00065  * RelationCreateStorage
00066  *      Create physical storage for a relation.
00067  *
00068  * Create the underlying disk file storage for the relation. This only
00069  * creates the main fork; additional forks are created lazily by the
00070  * modules that need them.
00071  *
00072  * This function is transactional. The creation is WAL-logged, and if the
00073  * transaction aborts later on, the storage will be destroyed.
00074  */
00075 void
00076 RelationCreateStorage(RelFileNode rnode, char relpersistence)
00077 {
00078     PendingRelDelete *pending;
00079     SMgrRelation srel;
00080     BackendId   backend;
00081     bool        needs_wal;
00082 
00083     switch (relpersistence)
00084     {
00085         case RELPERSISTENCE_TEMP:
00086             backend = MyBackendId;
00087             needs_wal = false;
00088             break;
00089         case RELPERSISTENCE_UNLOGGED:
00090             backend = InvalidBackendId;
00091             needs_wal = false;
00092             break;
00093         case RELPERSISTENCE_PERMANENT:
00094             backend = InvalidBackendId;
00095             needs_wal = true;
00096             break;
00097         default:
00098             elog(ERROR, "invalid relpersistence: %c", relpersistence);
00099             return;             /* placate compiler */
00100     }
00101 
00102     srel = smgropen(rnode, backend);
00103     smgrcreate(srel, MAIN_FORKNUM, false);
00104 
00105     if (needs_wal)
00106         log_smgrcreate(&srel->smgr_rnode.node, MAIN_FORKNUM);
00107 
00108     /* Add the relation to the list of stuff to delete at abort */
00109     pending = (PendingRelDelete *)
00110         MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
00111     pending->relnode = rnode;
00112     pending->backend = backend;
00113     pending->atCommit = false;  /* delete if abort */
00114     pending->nestLevel = GetCurrentTransactionNestLevel();
00115     pending->next = pendingDeletes;
00116     pendingDeletes = pending;
00117 }
00118 
00119 /*
00120  * Perform XLogInsert of a XLOG_SMGR_CREATE record to WAL.
00121  */
00122 void
00123 log_smgrcreate(RelFileNode *rnode, ForkNumber forkNum)
00124 {
00125     xl_smgr_create xlrec;
00126     XLogRecData rdata;
00127 
00128     /*
00129      * Make an XLOG entry reporting the file creation.
00130      */
00131     xlrec.rnode = *rnode;
00132     xlrec.forkNum = forkNum;
00133 
00134     rdata.data = (char *) &xlrec;
00135     rdata.len = sizeof(xlrec);
00136     rdata.buffer = InvalidBuffer;
00137     rdata.next = NULL;
00138 
00139     XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata);
00140 }
00141 
00142 /*
00143  * RelationDropStorage
00144  *      Schedule unlinking of physical storage at transaction commit.
00145  */
00146 void
00147 RelationDropStorage(Relation rel)
00148 {
00149     PendingRelDelete *pending;
00150 
00151     /* Add the relation to the list of stuff to delete at commit */
00152     pending = (PendingRelDelete *)
00153         MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
00154     pending->relnode = rel->rd_node;
00155     pending->backend = rel->rd_backend;
00156     pending->atCommit = true;   /* delete if commit */
00157     pending->nestLevel = GetCurrentTransactionNestLevel();
00158     pending->next = pendingDeletes;
00159     pendingDeletes = pending;
00160 
00161     /*
00162      * NOTE: if the relation was created in this transaction, it will now be
00163      * present in the pending-delete list twice, once with atCommit true and
00164      * once with atCommit false.  Hence, it will be physically deleted at end
00165      * of xact in either case (and the other entry will be ignored by
00166      * smgrDoPendingDeletes, so no error will occur).  We could instead remove
00167      * the existing list entry and delete the physical file immediately, but
00168      * for now I'll keep the logic simple.
00169      */
00170 
00171     RelationCloseSmgr(rel);
00172 }
00173 
00174 /*
00175  * RelationPreserveStorage
00176  *      Mark a relation as not to be deleted after all.
00177  *
00178  * We need this function because relation mapping changes are committed
00179  * separately from commit of the whole transaction, so it's still possible
00180  * for the transaction to abort after the mapping update is done.
00181  * When a new physical relation is installed in the map, it would be
00182  * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
00183  * The relation mapper fixes this by telling us to not delete such relations
00184  * after all as part of its commit.
00185  *
00186  * We also use this to reuse an old build of an index during ALTER TABLE, this
00187  * time removing the delete-at-commit entry.
00188  *
00189  * No-op if the relation is not among those scheduled for deletion.
00190  */
00191 void
00192 RelationPreserveStorage(RelFileNode rnode, bool atCommit)
00193 {
00194     PendingRelDelete *pending;
00195     PendingRelDelete *prev;
00196     PendingRelDelete *next;
00197 
00198     prev = NULL;
00199     for (pending = pendingDeletes; pending != NULL; pending = next)
00200     {
00201         next = pending->next;
00202         if (RelFileNodeEquals(rnode, pending->relnode)
00203             && pending->atCommit == atCommit)
00204         {
00205             /* unlink and delete list entry */
00206             if (prev)
00207                 prev->next = next;
00208             else
00209                 pendingDeletes = next;
00210             pfree(pending);
00211             /* prev does not change */
00212         }
00213         else
00214         {
00215             /* unrelated entry, don't touch it */
00216             prev = pending;
00217         }
00218     }
00219 }
00220 
00221 /*
00222  * RelationTruncate
00223  *      Physically truncate a relation to the specified number of blocks.
00224  *
00225  * This includes getting rid of any buffers for the blocks that are to be
00226  * dropped.
00227  */
00228 void
00229 RelationTruncate(Relation rel, BlockNumber nblocks)
00230 {
00231     bool        fsm;
00232     bool        vm;
00233 
00234     /* Open it at the smgr level if not already done */
00235     RelationOpenSmgr(rel);
00236 
00237     /*
00238      * Make sure smgr_targblock etc aren't pointing somewhere past new end
00239      */
00240     rel->rd_smgr->smgr_targblock = InvalidBlockNumber;
00241     rel->rd_smgr->smgr_fsm_nblocks = InvalidBlockNumber;
00242     rel->rd_smgr->smgr_vm_nblocks = InvalidBlockNumber;
00243 
00244     /* Truncate the FSM first if it exists */
00245     fsm = smgrexists(rel->rd_smgr, FSM_FORKNUM);
00246     if (fsm)
00247         FreeSpaceMapTruncateRel(rel, nblocks);
00248 
00249     /* Truncate the visibility map too if it exists. */
00250     vm = smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
00251     if (vm)
00252         visibilitymap_truncate(rel, nblocks);
00253 
00254     /*
00255      * We WAL-log the truncation before actually truncating, which means
00256      * trouble if the truncation fails. If we then crash, the WAL replay
00257      * likely isn't going to succeed in the truncation either, and cause a
00258      * PANIC. It's tempting to put a critical section here, but that cure
00259      * would be worse than the disease. It would turn a usually harmless
00260      * failure to truncate, that might spell trouble at WAL replay, into a
00261      * certain PANIC.
00262      */
00263     if (RelationNeedsWAL(rel))
00264     {
00265         /*
00266          * Make an XLOG entry reporting the file truncation.
00267          */
00268         XLogRecPtr  lsn;
00269         XLogRecData rdata;
00270         xl_smgr_truncate xlrec;
00271 
00272         xlrec.blkno = nblocks;
00273         xlrec.rnode = rel->rd_node;
00274 
00275         rdata.data = (char *) &xlrec;
00276         rdata.len = sizeof(xlrec);
00277         rdata.buffer = InvalidBuffer;
00278         rdata.next = NULL;
00279 
00280         lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE, &rdata);
00281 
00282         /*
00283          * Flush, because otherwise the truncation of the main relation might
00284          * hit the disk before the WAL record, and the truncation of the FSM
00285          * or visibility map. If we crashed during that window, we'd be left
00286          * with a truncated heap, but the FSM or visibility map would still
00287          * contain entries for the non-existent heap pages.
00288          */
00289         if (fsm || vm)
00290             XLogFlush(lsn);
00291     }
00292 
00293     /* Do the real work */
00294     smgrtruncate(rel->rd_smgr, MAIN_FORKNUM, nblocks);
00295 }
00296 
00297 /*
00298  *  smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
00299  *
00300  * This also runs when aborting a subxact; we want to clean up a failed
00301  * subxact immediately.
00302  *
00303  * Note: It's possible that we're being asked to remove a relation that has
00304  * no physical storage in any fork. In particular, it's possible that we're
00305  * cleaning up an old temporary relation for which RemovePgTempFiles has
00306  * already recovered the physical storage.
00307  */
00308 void
00309 smgrDoPendingDeletes(bool isCommit)
00310 {
00311     int         nestLevel = GetCurrentTransactionNestLevel();
00312     PendingRelDelete *pending;
00313     PendingRelDelete *prev;
00314     PendingRelDelete *next;
00315     int         nrels = 0,
00316                 i = 0,
00317                 maxrels = 8;
00318     SMgrRelation *srels = palloc(maxrels * sizeof(SMgrRelation));
00319 
00320     prev = NULL;
00321     for (pending = pendingDeletes; pending != NULL; pending = next)
00322     {
00323         next = pending->next;
00324         if (pending->nestLevel < nestLevel)
00325         {
00326             /* outer-level entries should not be processed yet */
00327             prev = pending;
00328         }
00329         else
00330         {
00331             /* unlink list entry first, so we don't retry on failure */
00332             if (prev)
00333                 prev->next = next;
00334             else
00335                 pendingDeletes = next;
00336             /* do deletion if called for */
00337             if (pending->atCommit == isCommit)
00338             {
00339                 SMgrRelation srel;
00340 
00341                 srel = smgropen(pending->relnode, pending->backend);
00342 
00343                 /* extend the array if needed (double the size) */
00344                 if (maxrels <= nrels)
00345                 {
00346                     maxrels *= 2;
00347                     srels = repalloc(srels, sizeof(SMgrRelation) * maxrels);
00348                 }
00349 
00350                 srels[nrels++] = srel;
00351             }
00352             /* must explicitly free the list entry */
00353             pfree(pending);
00354             /* prev does not change */
00355         }
00356     }
00357 
00358     if (nrels > 0)
00359     {
00360         smgrdounlinkall(srels, nrels, false);
00361 
00362         for (i = 0; i < nrels; i++)
00363             smgrclose(srels[i]);
00364     }
00365 
00366     pfree(srels);
00367 
00368 }
00369 
00370 /*
00371  * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
00372  *
00373  * The return value is the number of relations scheduled for termination.
00374  * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
00375  * If there are no relations to be deleted, *ptr is set to NULL.
00376  *
00377  * Only non-temporary relations are included in the returned list.  This is OK
00378  * because the list is used only in contexts where temporary relations don't
00379  * matter: we're either writing to the two-phase state file (and transactions
00380  * that have touched temp tables can't be prepared) or we're writing to xlog
00381  * (and all temporary files will be zapped if we restart anyway, so no need
00382  * for redo to do it also).
00383  *
00384  * Note that the list does not include anything scheduled for termination
00385  * by upper-level transactions.
00386  */
00387 int
00388 smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
00389 {
00390     int         nestLevel = GetCurrentTransactionNestLevel();
00391     int         nrels;
00392     RelFileNode *rptr;
00393     PendingRelDelete *pending;
00394 
00395     nrels = 0;
00396     for (pending = pendingDeletes; pending != NULL; pending = pending->next)
00397     {
00398         if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
00399             && pending->backend == InvalidBackendId)
00400             nrels++;
00401     }
00402     if (nrels == 0)
00403     {
00404         *ptr = NULL;
00405         return 0;
00406     }
00407     rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
00408     *ptr = rptr;
00409     for (pending = pendingDeletes; pending != NULL; pending = pending->next)
00410     {
00411         if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit
00412             && pending->backend == InvalidBackendId)
00413         {
00414             *rptr = pending->relnode;
00415             rptr++;
00416         }
00417     }
00418     return nrels;
00419 }
00420 
00421 /*
00422  *  PostPrepare_smgr -- Clean up after a successful PREPARE
00423  *
00424  * What we have to do here is throw away the in-memory state about pending
00425  * relation deletes.  It's all been recorded in the 2PC state file and
00426  * it's no longer smgr's job to worry about it.
00427  */
00428 void
00429 PostPrepare_smgr(void)
00430 {
00431     PendingRelDelete *pending;
00432     PendingRelDelete *next;
00433 
00434     for (pending = pendingDeletes; pending != NULL; pending = next)
00435     {
00436         next = pending->next;
00437         pendingDeletes = next;
00438         /* must explicitly free the list entry */
00439         pfree(pending);
00440     }
00441 }
00442 
00443 
00444 /*
00445  * AtSubCommit_smgr() --- Take care of subtransaction commit.
00446  *
00447  * Reassign all items in the pending-deletes list to the parent transaction.
00448  */
00449 void
00450 AtSubCommit_smgr(void)
00451 {
00452     int         nestLevel = GetCurrentTransactionNestLevel();
00453     PendingRelDelete *pending;
00454 
00455     for (pending = pendingDeletes; pending != NULL; pending = pending->next)
00456     {
00457         if (pending->nestLevel >= nestLevel)
00458             pending->nestLevel = nestLevel - 1;
00459     }
00460 }
00461 
00462 /*
00463  * AtSubAbort_smgr() --- Take care of subtransaction abort.
00464  *
00465  * Delete created relations and forget about deleted relations.
00466  * We can execute these operations immediately because we know this
00467  * subtransaction will not commit.
00468  */
00469 void
00470 AtSubAbort_smgr(void)
00471 {
00472     smgrDoPendingDeletes(false);
00473 }
00474 
00475 void
00476 smgr_redo(XLogRecPtr lsn, XLogRecord *record)
00477 {
00478     uint8       info = record->xl_info & ~XLR_INFO_MASK;
00479 
00480     /* Backup blocks are not used in smgr records */
00481     Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
00482 
00483     if (info == XLOG_SMGR_CREATE)
00484     {
00485         xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
00486         SMgrRelation reln;
00487 
00488         reln = smgropen(xlrec->rnode, InvalidBackendId);
00489         smgrcreate(reln, xlrec->forkNum, true);
00490     }
00491     else if (info == XLOG_SMGR_TRUNCATE)
00492     {
00493         xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
00494         SMgrRelation reln;
00495         Relation    rel;
00496 
00497         reln = smgropen(xlrec->rnode, InvalidBackendId);
00498 
00499         /*
00500          * Forcibly create relation if it doesn't exist (which suggests that
00501          * it was dropped somewhere later in the WAL sequence).  As in
00502          * XLogReadBuffer, we prefer to recreate the rel and replay the log as
00503          * best we can until the drop is seen.
00504          */
00505         smgrcreate(reln, MAIN_FORKNUM, true);
00506 
00507         /*
00508          * Before we perform the truncation, update minimum recovery point
00509          * to cover this WAL record. Once the relation is truncated, there's
00510          * no going back. The buffer manager enforces the WAL-first rule
00511          * for normal updates to relation files, so that the minimum recovery
00512          * point is always updated before the corresponding change in the
00513          * data file is flushed to disk. We have to do the same manually
00514          * here.
00515          *
00516          * Doing this before the truncation means that if the truncation fails
00517          * for some reason, you cannot start up the system even after restart,
00518          * until you fix the underlying situation so that the truncation will
00519          * succeed. Alternatively, we could update the minimum recovery point
00520          * after truncation, but that would leave a small window where the
00521          * WAL-first rule could be violated.
00522          */
00523         XLogFlush(lsn);
00524 
00525         smgrtruncate(reln, MAIN_FORKNUM, xlrec->blkno);
00526 
00527         /* Also tell xlogutils.c about it */
00528         XLogTruncateRelation(xlrec->rnode, MAIN_FORKNUM, xlrec->blkno);
00529 
00530         /* Truncate FSM and VM too */
00531         rel = CreateFakeRelcacheEntry(xlrec->rnode);
00532 
00533         if (smgrexists(reln, FSM_FORKNUM))
00534             FreeSpaceMapTruncateRel(rel, xlrec->blkno);
00535         if (smgrexists(reln, VISIBILITYMAP_FORKNUM))
00536             visibilitymap_truncate(rel, xlrec->blkno);
00537 
00538         FreeFakeRelcacheEntry(rel);
00539     }
00540     else
00541         elog(PANIC, "smgr_redo: unknown op code %u", info);
00542 }