Header And Logo

PostgreSQL
| The world's most advanced open source database.

clog.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * clog.c
00004  *      PostgreSQL transaction-commit-log manager
00005  *
00006  * This module replaces the old "pg_log" access code, which treated pg_log
00007  * essentially like a relation, in that it went through the regular buffer
00008  * manager.  The problem with that was that there wasn't any good way to
00009  * recycle storage space for transactions so old that they'll never be
00010  * looked up again.  Now we use specialized access code so that the commit
00011  * log can be broken into relatively small, independent segments.
00012  *
00013  * XLOG interactions: this module generates an XLOG record whenever a new
00014  * CLOG page is initialized to zeroes.  Other writes of CLOG come from
00015  * recording of transaction commit or abort in xact.c, which generates its
00016  * own XLOG records for these events and will re-perform the status update
00017  * on redo; so we need make no additional XLOG entry here.  For synchronous
00018  * transaction commits, the XLOG is guaranteed flushed through the XLOG commit
00019  * record before we are called to log a commit, so the WAL rule "write xlog
00020  * before data" is satisfied automatically.  However, for async commits we
00021  * must track the latest LSN affecting each CLOG page, so that we can flush
00022  * XLOG that far and satisfy the WAL rule.  We don't have to worry about this
00023  * for aborts (whether sync or async), since the post-crash assumption would
00024  * be that such transactions failed anyway.
00025  *
00026  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00027  * Portions Copyright (c) 1994, Regents of the University of California
00028  *
00029  * src/backend/access/transam/clog.c
00030  *
00031  *-------------------------------------------------------------------------
00032  */
00033 #include "postgres.h"
00034 
00035 #include "access/clog.h"
00036 #include "access/slru.h"
00037 #include "access/transam.h"
00038 #include "miscadmin.h"
00039 #include "pg_trace.h"
00040 
00041 /*
00042  * Defines for CLOG page sizes.  A page is the same BLCKSZ as is used
00043  * everywhere else in Postgres.
00044  *
00045  * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
00046  * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE,
00047  * and CLOG segment numbering at
00048  * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
00049  * explicit notice of that fact in this module, except when comparing segment
00050  * and page numbers in TruncateCLOG (see CLOGPagePrecedes).
00051  */
00052 
/*
 * Each transaction's status takes two bits, which means one byte holds the
 * status of four transactions.
 */
#define CLOG_BITS_PER_XACT  2
#define CLOG_XACTS_PER_BYTE 4
#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK   ((1 << CLOG_BITS_PER_XACT) - 1)

/*
 * Map a TransactionId to: its CLOG page, its index within that page, the
 * byte within the page holding its status, and its position within that byte.
 */
#define TransactionIdToPage(xid) \
    ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE)
#define TransactionIdToPgIndex(xid) \
    ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE)
#define TransactionIdToByte(xid) \
    (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid) \
    ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)

/*
 * One "latest async-commit LSN" is remembered per group of transactions,
 * rather than one per transaction.
 */
#define CLOG_XACTS_PER_LSN_GROUP    32  /* keep this a power of 2 */
#define CLOG_LSNS_PER_PAGE  (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)

/* Index of xid's LSN group within the group_lsn array, given its buffer slot */
#define GetLSNIndex(slotno, xid) \
    ((slotno) * CLOG_LSNS_PER_PAGE + \
     TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_LSN_GROUP)
00070 
00071 
00072 /*
00073  * Link to shared-memory data structures for CLOG control
00074  */
00075 static SlruCtlData ClogCtlData;
00076 
00077 #define ClogCtl (&ClogCtlData)
00078 
00079 
00080 static int  ZeroCLOGPage(int pageno, bool writeXlog);
00081 static bool CLOGPagePrecedes(int page1, int page2);
00082 static void WriteZeroPageXlogRec(int pageno);
00083 static void WriteTruncateXlogRec(int pageno);
00084 static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
00085                            TransactionId *subxids, XidStatus status,
00086                            XLogRecPtr lsn, int pageno);
00087 static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status,
00088                           XLogRecPtr lsn, int slotno);
00089 static void set_status_by_pages(int nsubxids, TransactionId *subxids,
00090                     XidStatus status, XLogRecPtr lsn);
00091 
00092 
00093 /*
00094  * TransactionIdSetTreeStatus
00095  *
00096  * Record the final state of transaction entries in the commit log for
00097  * a transaction and its subtransaction tree. Take care to ensure this is
00098  * efficient, and as atomic as possible.
00099  *
00100  * xid is a single xid to set status for. This will typically be
00101  * the top level transactionid for a top level commit or abort. It can
00102  * also be a subtransaction when we record transaction aborts.
00103  *
00104  * subxids is an array of xids of length nsubxids, representing subtransactions
00105  * in the tree of xid. In various cases nsubxids may be zero.
00106  *
00107  * lsn must be the WAL location of the commit record when recording an async
00108  * commit.  For a synchronous commit it can be InvalidXLogRecPtr, since the
00109  * caller guarantees the commit record is already flushed in that case.  It
00110  * should be InvalidXLogRecPtr for abort cases, too.
00111  *
00112  * In the commit case, atomicity is limited by whether all the subxids are in
00113  * the same CLOG page as xid.  If they all are, then the lock will be grabbed
00114  * only once, and the status will be set to committed directly.  Otherwise
00115  * we must
00116  *   1. set sub-committed all subxids that are not on the same page as the
00117  *      main xid
00118  *   2. atomically set committed the main xid and the subxids on the same page
00119  *   3. go over the first bunch again and set them committed
00120  * Note that as far as concurrent checkers are concerned, main transaction
00121  * commit as a whole is still atomic.
00122  *
00123  * Example:
00124  *      TransactionId t commits and has subxids t1, t2, t3, t4
00125  *      t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
00126  *      1. update pages2-3:
00127  *                  page2: set t2,t3 as sub-committed
00128  *                  page3: set t4 as sub-committed
00129  *      2. update page1:
00130  *                  set t1 as sub-committed,
00131  *                  then set t as committed,
00132  *                  then set t1 as committed
00133  *      3. update pages2-3:
00134  *                  page2: set t2,t3 as committed
00135  *                  page3: set t4 as committed
00136  *
00137  * NB: this is a low-level routine and is NOT the preferred entry point
00138  * for most uses; functions in transam.c are the intended callers.
00139  *
00140  * XXX Think about issuing FADVISE_WILLNEED on pages that we will need,
00141  * but aren't yet in cache, as well as hinting pages not to fall out of
00142  * cache yet.
00143  */
00144 void
00145 TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
00146                     TransactionId *subxids, XidStatus status, XLogRecPtr lsn)
00147 {
00148     int         pageno = TransactionIdToPage(xid);      /* get page of parent */
00149     int         i;
00150 
00151     Assert(status == TRANSACTION_STATUS_COMMITTED ||
00152            status == TRANSACTION_STATUS_ABORTED);
00153 
00154     /*
00155      * See how many subxids, if any, are on the same page as the parent, if
00156      * any.
00157      */
00158     for (i = 0; i < nsubxids; i++)
00159     {
00160         if (TransactionIdToPage(subxids[i]) != pageno)
00161             break;
00162     }
00163 
00164     /*
00165      * Do all items fit on a single page?
00166      */
00167     if (i == nsubxids)
00168     {
00169         /*
00170          * Set the parent and all subtransactions in a single call
00171          */
00172         TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn,
00173                                    pageno);
00174     }
00175     else
00176     {
00177         int         nsubxids_on_first_page = i;
00178 
00179         /*
00180          * If this is a commit then we care about doing this correctly (i.e.
00181          * using the subcommitted intermediate status).  By here, we know
00182          * we're updating more than one page of clog, so we must mark entries
00183          * that are *not* on the first page so that they show as subcommitted
00184          * before we then return to update the status to fully committed.
00185          *
00186          * To avoid touching the first page twice, skip marking subcommitted
00187          * for the subxids on that first page.
00188          */
00189         if (status == TRANSACTION_STATUS_COMMITTED)
00190             set_status_by_pages(nsubxids - nsubxids_on_first_page,
00191                                 subxids + nsubxids_on_first_page,
00192                                 TRANSACTION_STATUS_SUB_COMMITTED, lsn);
00193 
00194         /*
00195          * Now set the parent and subtransactions on same page as the parent,
00196          * if any
00197          */
00198         pageno = TransactionIdToPage(xid);
00199         TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status,
00200                                    lsn, pageno);
00201 
00202         /*
00203          * Now work through the rest of the subxids one clog page at a time,
00204          * starting from the second page onwards, like we did above.
00205          */
00206         set_status_by_pages(nsubxids - nsubxids_on_first_page,
00207                             subxids + nsubxids_on_first_page,
00208                             status, lsn);
00209     }
00210 }
00211 
00212 /*
00213  * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
00214  * transactions, chunking in the separate CLOG pages involved. We never
00215  * pass the whole transaction tree to this function, only subtransactions
00216  * that are on different pages to the top level transaction id.
00217  */
00218 static void
00219 set_status_by_pages(int nsubxids, TransactionId *subxids,
00220                     XidStatus status, XLogRecPtr lsn)
00221 {
00222     int         pageno = TransactionIdToPage(subxids[0]);
00223     int         offset = 0;
00224     int         i = 0;
00225 
00226     while (i < nsubxids)
00227     {
00228         int         num_on_page = 0;
00229 
00230         while (TransactionIdToPage(subxids[i]) == pageno && i < nsubxids)
00231         {
00232             num_on_page++;
00233             i++;
00234         }
00235 
00236         TransactionIdSetPageStatus(InvalidTransactionId,
00237                                    num_on_page, subxids + offset,
00238                                    status, lsn, pageno);
00239         offset = i;
00240         pageno = TransactionIdToPage(subxids[offset]);
00241     }
00242 }
00243 
00244 /*
00245  * Record the final state of transaction entries in the commit log for
00246  * all entries on a single page.  Atomic only on this page.
00247  *
00248  * Otherwise API is same as TransactionIdSetTreeStatus()
00249  */
00250 static void
00251 TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
00252                            TransactionId *subxids, XidStatus status,
00253                            XLogRecPtr lsn, int pageno)
00254 {
00255     int         slotno;
00256     int         i;
00257 
00258     Assert(status == TRANSACTION_STATUS_COMMITTED ||
00259            status == TRANSACTION_STATUS_ABORTED ||
00260            (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
00261 
00262     LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
00263 
00264     /*
00265      * If we're doing an async commit (ie, lsn is valid), then we must wait
00266      * for any active write on the page slot to complete.  Otherwise our
00267      * update could reach disk in that write, which will not do since we
00268      * mustn't let it reach disk until we've done the appropriate WAL flush.
00269      * But when lsn is invalid, it's OK to scribble on a page while it is
00270      * write-busy, since we don't care if the update reaches disk sooner than
00271      * we think.
00272      */
00273     slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);
00274 
00275     /*
00276      * Set the main transaction id, if any.
00277      *
00278      * If we update more than one xid on this page while it is being written
00279      * out, we might find that some of the bits go to disk and others don't.
00280      * If we are updating commits on the page with the top-level xid that
00281      * could break atomicity, so we subcommit the subxids first before we mark
00282      * the top-level commit.
00283      */
00284     if (TransactionIdIsValid(xid))
00285     {
00286         /* Subtransactions first, if needed ... */
00287         if (status == TRANSACTION_STATUS_COMMITTED)
00288         {
00289             for (i = 0; i < nsubxids; i++)
00290             {
00291                 Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
00292                 TransactionIdSetStatusBit(subxids[i],
00293                                           TRANSACTION_STATUS_SUB_COMMITTED,
00294                                           lsn, slotno);
00295             }
00296         }
00297 
00298         /* ... then the main transaction */
00299         TransactionIdSetStatusBit(xid, status, lsn, slotno);
00300     }
00301 
00302     /* Set the subtransactions */
00303     for (i = 0; i < nsubxids; i++)
00304     {
00305         Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
00306         TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
00307     }
00308 
00309     ClogCtl->shared->page_dirty[slotno] = true;
00310 
00311     LWLockRelease(CLogControlLock);
00312 }
00313 
00314 /*
00315  * Sets the commit status of a single transaction.
00316  *
00317  * Must be called with CLogControlLock held
00318  */
00319 static void
00320 TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno)
00321 {
00322     int         byteno = TransactionIdToByte(xid);
00323     int         bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
00324     char       *byteptr;
00325     char        byteval;
00326     char        curval;
00327 
00328     byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
00329     curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
00330 
00331     /*
00332      * When replaying transactions during recovery we still need to perform
00333      * the two phases of subcommit and then commit. However, some transactions
00334      * are already correctly marked, so we just treat those as a no-op which
00335      * allows us to keep the following Assert as restrictive as possible.
00336      */
00337     if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED &&
00338         curval == TRANSACTION_STATUS_COMMITTED)
00339         return;
00340 
00341     /*
00342      * Current state change should be from 0 or subcommitted to target state
00343      * or we should already be there when replaying changes during recovery.
00344      */
00345     Assert(curval == 0 ||
00346            (curval == TRANSACTION_STATUS_SUB_COMMITTED &&
00347             status != TRANSACTION_STATUS_IN_PROGRESS) ||
00348            curval == status);
00349 
00350     /* note this assumes exclusive access to the clog page */
00351     byteval = *byteptr;
00352     byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift);
00353     byteval |= (status << bshift);
00354     *byteptr = byteval;
00355 
00356     /*
00357      * Update the group LSN if the transaction completion LSN is higher.
00358      *
00359      * Note: lsn will be invalid when supplied during InRecovery processing,
00360      * so we don't need to do anything special to avoid LSN updates during
00361      * recovery. After recovery completes the next clog change will set the
00362      * LSN correctly.
00363      */
00364     if (!XLogRecPtrIsInvalid(lsn))
00365     {
00366         int         lsnindex = GetLSNIndex(slotno, xid);
00367 
00368         if (ClogCtl->shared->group_lsn[lsnindex] < lsn)
00369             ClogCtl->shared->group_lsn[lsnindex] = lsn;
00370     }
00371 }
00372 
00373 /*
00374  * Interrogate the state of a transaction in the commit log.
00375  *
00376  * Aside from the actual commit status, this function returns (into *lsn)
00377  * an LSN that is late enough to be able to guarantee that if we flush up to
00378  * that LSN then we will have flushed the transaction's commit record to disk.
00379  * The result is not necessarily the exact LSN of the transaction's commit
00380  * record!  For example, for long-past transactions (those whose clog pages
00381  * already migrated to disk), we'll return InvalidXLogRecPtr.  Also, because
00382  * we group transactions on the same clog page to conserve storage, we might
00383  * return the LSN of a later transaction that falls into the same group.
00384  *
00385  * NB: this is a low-level routine and is NOT the preferred entry point
00386  * for most uses; TransactionLogFetch() in transam.c is the intended caller.
00387  */
00388 XidStatus
00389 TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
00390 {
00391     int         pageno = TransactionIdToPage(xid);
00392     int         byteno = TransactionIdToByte(xid);
00393     int         bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
00394     int         slotno;
00395     int         lsnindex;
00396     char       *byteptr;
00397     XidStatus   status;
00398 
00399     /* lock is acquired by SimpleLruReadPage_ReadOnly */
00400 
00401     slotno = SimpleLruReadPage_ReadOnly(ClogCtl, pageno, xid);
00402     byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
00403 
00404     status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
00405 
00406     lsnindex = GetLSNIndex(slotno, xid);
00407     *lsn = ClogCtl->shared->group_lsn[lsnindex];
00408 
00409     LWLockRelease(CLogControlLock);
00410 
00411     return status;
00412 }
00413 
00414 /*
00415  * Number of shared CLOG buffers.
00416  *
00417  * Testing during the PostgreSQL 9.2 development cycle revealed that on a
00418  * large multi-processor system, it was possible to have more CLOG page
00419  * requests in flight at one time than the number of CLOG buffers which existed
00420  * at that time, which was hardcoded to 8.  Further testing revealed that
00421  * performance dropped off with more than 32 CLOG buffers, possibly because
00422  * the linear buffer search algorithm doesn't scale well.
00423  *
00424  * Unconditionally increasing the number of CLOG buffers to 32 did not seem
00425  * like a good idea, because it would increase the minimum amount of shared
00426  * memory required to start, which could be a problem for people running very
00427  * small configurations.  The following formula seems to represent a reasonable
00428  * compromise: people with very low values for shared_buffers will get fewer
00429  * CLOG buffers as well, and everyone else will get 32.
00430  *
00431  * It is likely that some further work will be needed here in future releases;
00432  * for example, on a 64-core server, the maximum number of CLOG requests that
00433  * can be simultaneously in flight will be even larger.  But that will
00434  * apparently require more than just changing the formula, so for now we take
00435  * the easy way out.
00436  */
00437 Size
00438 CLOGShmemBuffers(void)
00439 {
00440     return Min(32, Max(4, NBuffers / 512));
00441 }
00442 
00443 /*
00444  * Initialization of shared memory for CLOG
00445  */
00446 Size
00447 CLOGShmemSize(void)
00448 {
00449     return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
00450 }
00451 
00452 void
00453 CLOGShmemInit(void)
00454 {
00455     ClogCtl->PagePrecedes = CLOGPagePrecedes;
00456     SimpleLruInit(ClogCtl, "CLOG Ctl", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
00457                   CLogControlLock, "pg_clog");
00458 }
00459 
00460 /*
00461  * This func must be called ONCE on system install.  It creates
00462  * the initial CLOG segment.  (The CLOG directory is assumed to
00463  * have been created by initdb, and CLOGShmemInit must have been
00464  * called already.)
00465  */
00466 void
00467 BootStrapCLOG(void)
00468 {
00469     int         slotno;
00470 
00471     LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
00472 
00473     /* Create and zero the first page of the commit log */
00474     slotno = ZeroCLOGPage(0, false);
00475 
00476     /* Make sure it's written out */
00477     SimpleLruWritePage(ClogCtl, slotno);
00478     Assert(!ClogCtl->shared->page_dirty[slotno]);
00479 
00480     LWLockRelease(CLogControlLock);
00481 }
00482 
00483 /*
00484  * Initialize (or reinitialize) a page of CLOG to zeroes.
00485  * If writeXlog is TRUE, also emit an XLOG record saying we did this.
00486  *
00487  * The page is not actually written, just set up in shared memory.
00488  * The slot number of the new page is returned.
00489  *
00490  * Control lock must be held at entry, and will be held at exit.
00491  */
00492 static int
00493 ZeroCLOGPage(int pageno, bool writeXlog)
00494 {
00495     int         slotno;
00496 
00497     slotno = SimpleLruZeroPage(ClogCtl, pageno);
00498 
00499     if (writeXlog)
00500         WriteZeroPageXlogRec(pageno);
00501 
00502     return slotno;
00503 }
00504 
00505 /*
00506  * This must be called ONCE during postmaster or standalone-backend startup,
00507  * after StartupXLOG has initialized ShmemVariableCache->nextXid.
00508  */
00509 void
00510 StartupCLOG(void)
00511 {
00512     TransactionId xid = ShmemVariableCache->nextXid;
00513     int         pageno = TransactionIdToPage(xid);
00514 
00515     LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
00516 
00517     /*
00518      * Initialize our idea of the latest page number.
00519      */
00520     ClogCtl->shared->latest_page_number = pageno;
00521 
00522     LWLockRelease(CLogControlLock);
00523 }
00524 
00525 /*
00526  * This must be called ONCE at the end of startup/recovery.
00527  */
00528 void
00529 TrimCLOG(void)
00530 {
00531     TransactionId xid = ShmemVariableCache->nextXid;
00532     int         pageno = TransactionIdToPage(xid);
00533 
00534     LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
00535 
00536     /*
00537      * Re-Initialize our idea of the latest page number.
00538      */
00539     ClogCtl->shared->latest_page_number = pageno;
00540 
00541     /*
00542      * Zero out the remainder of the current clog page.  Under normal
00543      * circumstances it should be zeroes already, but it seems at least
00544      * theoretically possible that XLOG replay will have settled on a nextXID
00545      * value that is less than the last XID actually used and marked by the
00546      * previous database lifecycle (since subtransaction commit writes clog
00547      * but makes no WAL entry).  Let's just be safe. (We need not worry about
00548      * pages beyond the current one, since those will be zeroed when first
00549      * used.  For the same reason, there is no need to do anything when
00550      * nextXid is exactly at a page boundary; and it's likely that the
00551      * "current" page doesn't exist yet in that case.)
00552      */
00553     if (TransactionIdToPgIndex(xid) != 0)
00554     {
00555         int         byteno = TransactionIdToByte(xid);
00556         int         bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
00557         int         slotno;
00558         char       *byteptr;
00559 
00560         slotno = SimpleLruReadPage(ClogCtl, pageno, false, xid);
00561         byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
00562 
00563         /* Zero so-far-unused positions in the current byte */
00564         *byteptr &= (1 << bshift) - 1;
00565         /* Zero the rest of the page */
00566         MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
00567 
00568         ClogCtl->shared->page_dirty[slotno] = true;
00569     }
00570 
00571     LWLockRelease(CLogControlLock);
00572 }
00573 
00574 /*
00575  * This must be called ONCE during postmaster or standalone-backend shutdown
00576  */
00577 void
00578 ShutdownCLOG(void)
00579 {
00580     /* Flush dirty CLOG pages to disk */
00581     TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false);
00582     SimpleLruFlush(ClogCtl, false);
00583     TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false);
00584 }
00585 
00586 /*
00587  * Perform a checkpoint --- either during shutdown, or on-the-fly
00588  */
00589 void
00590 CheckPointCLOG(void)
00591 {
00592     /* Flush dirty CLOG pages to disk */
00593     TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
00594     SimpleLruFlush(ClogCtl, true);
00595     TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
00596 }
00597 
00598 
00599 /*
00600  * Make sure that CLOG has room for a newly-allocated XID.
00601  *
00602  * NB: this is called while holding XidGenLock.  We want it to be very fast
00603  * most of the time; even when it's not so fast, no actual I/O need happen
00604  * unless we're forced to write out a dirty clog or xlog page to make room
00605  * in shared memory.
00606  */
00607 void
00608 ExtendCLOG(TransactionId newestXact)
00609 {
00610     int         pageno;
00611 
00612     /*
00613      * No work except at first XID of a page.  But beware: just after
00614      * wraparound, the first XID of page zero is FirstNormalTransactionId.
00615      */
00616     if (TransactionIdToPgIndex(newestXact) != 0 &&
00617         !TransactionIdEquals(newestXact, FirstNormalTransactionId))
00618         return;
00619 
00620     pageno = TransactionIdToPage(newestXact);
00621 
00622     LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
00623 
00624     /* Zero the page and make an XLOG entry about it */
00625     ZeroCLOGPage(pageno, !InRecovery);
00626 
00627     LWLockRelease(CLogControlLock);
00628 }
00629 
00630 
00631 /*
00632  * Remove all CLOG segments before the one holding the passed transaction ID
00633  *
00634  * Before removing any CLOG data, we must flush XLOG to disk, to ensure
00635  * that any recently-emitted HEAP_FREEZE records have reached disk; otherwise
00636  * a crash and restart might leave us with some unfrozen tuples referencing
00637  * removed CLOG data.  We choose to emit a special TRUNCATE XLOG record too.
00638  * Replaying the deletion from XLOG is not critical, since the files could
00639  * just as well be removed later, but doing so prevents a long-running hot
00640  * standby server from acquiring an unreasonably bloated CLOG directory.
00641  *
00642  * Since CLOG segments hold a large number of transactions, the opportunity to
00643  * actually remove a segment is fairly rare, and so it seems best not to do
00644  * the XLOG flush unless we have confirmed that there is a removable segment.
00645  */
00646 void
00647 TruncateCLOG(TransactionId oldestXact)
00648 {
00649     int         cutoffPage;
00650 
00651     /*
00652      * The cutoff point is the start of the segment containing oldestXact. We
00653      * pass the *page* containing oldestXact to SimpleLruTruncate.
00654      */
00655     cutoffPage = TransactionIdToPage(oldestXact);
00656 
00657     /* Check to see if there's any files that could be removed */
00658     if (!SlruScanDirectory(ClogCtl, SlruScanDirCbReportPresence, &cutoffPage))
00659         return;                 /* nothing to remove */
00660 
00661     /* Write XLOG record and flush XLOG to disk */
00662     WriteTruncateXlogRec(cutoffPage);
00663 
00664     /* Now we can remove the old CLOG segment(s) */
00665     SimpleLruTruncate(ClogCtl, cutoffPage);
00666 }
00667 
00668 
00669 /*
00670  * Decide which of two CLOG page numbers is "older" for truncation purposes.
00671  *
00672  * We need to use comparison of TransactionIds here in order to do the right
00673  * thing with wraparound XID arithmetic.  However, if we are asked about
00674  * page number zero, we don't want to hand InvalidTransactionId to
00675  * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
00676  * offset both xids by FirstNormalTransactionId to avoid that.
00677  */
00678 static bool
00679 CLOGPagePrecedes(int page1, int page2)
00680 {
00681     TransactionId xid1;
00682     TransactionId xid2;
00683 
00684     xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE;
00685     xid1 += FirstNormalTransactionId;
00686     xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE;
00687     xid2 += FirstNormalTransactionId;
00688 
00689     return TransactionIdPrecedes(xid1, xid2);
00690 }
00691 
00692 
00693 /*
00694  * Write a ZEROPAGE xlog record
00695  */
00696 static void
00697 WriteZeroPageXlogRec(int pageno)
00698 {
00699     XLogRecData rdata;
00700 
00701     rdata.data = (char *) (&pageno);
00702     rdata.len = sizeof(int);
00703     rdata.buffer = InvalidBuffer;
00704     rdata.next = NULL;
00705     (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE, &rdata);
00706 }
00707 
00708 /*
00709  * Write a TRUNCATE xlog record
00710  *
00711  * We must flush the xlog record to disk before returning --- see notes
00712  * in TruncateCLOG().
00713  */
00714 static void
00715 WriteTruncateXlogRec(int pageno)
00716 {
00717     XLogRecData rdata;
00718     XLogRecPtr  recptr;
00719 
00720     rdata.data = (char *) (&pageno);
00721     rdata.len = sizeof(int);
00722     rdata.buffer = InvalidBuffer;
00723     rdata.next = NULL;
00724     recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE, &rdata);
00725     XLogFlush(recptr);
00726 }
00727 
00728 /*
00729  * CLOG resource manager's routines
00730  */
00731 void
00732 clog_redo(XLogRecPtr lsn, XLogRecord *record)
00733 {
00734     uint8       info = record->xl_info & ~XLR_INFO_MASK;
00735 
00736     /* Backup blocks are not used in clog records */
00737     Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
00738 
00739     if (info == CLOG_ZEROPAGE)
00740     {
00741         int         pageno;
00742         int         slotno;
00743 
00744         memcpy(&pageno, XLogRecGetData(record), sizeof(int));
00745 
00746         LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
00747 
00748         slotno = ZeroCLOGPage(pageno, false);
00749         SimpleLruWritePage(ClogCtl, slotno);
00750         Assert(!ClogCtl->shared->page_dirty[slotno]);
00751 
00752         LWLockRelease(CLogControlLock);
00753     }
00754     else if (info == CLOG_TRUNCATE)
00755     {
00756         int         pageno;
00757 
00758         memcpy(&pageno, XLogRecGetData(record), sizeof(int));
00759 
00760         /*
00761          * During XLOG replay, latest_page_number isn't set up yet; insert a
00762          * suitable value to bypass the sanity test in SimpleLruTruncate.
00763          */
00764         ClogCtl->shared->latest_page_number = pageno;
00765 
00766         SimpleLruTruncate(ClogCtl, pageno);
00767     }
00768     else
00769         elog(PANIC, "clog_redo: unknown op code %u", info);
00770 }