00001 /*------------------------------------------------------------------------- 00002 * 00003 * clog.c 00004 * PostgreSQL transaction-commit-log manager 00005 * 00006 * This module replaces the old "pg_log" access code, which treated pg_log 00007 * essentially like a relation, in that it went through the regular buffer 00008 * manager. The problem with that was that there wasn't any good way to 00009 * recycle storage space for transactions so old that they'll never be 00010 * looked up again. Now we use specialized access code so that the commit 00011 * log can be broken into relatively small, independent segments. 00012 * 00013 * XLOG interactions: this module generates an XLOG record whenever a new 00014 * CLOG page is initialized to zeroes. Other writes of CLOG come from 00015 * recording of transaction commit or abort in xact.c, which generates its 00016 * own XLOG records for these events and will re-perform the status update 00017 * on redo; so we need make no additional XLOG entry here. For synchronous 00018 * transaction commits, the XLOG is guaranteed flushed through the XLOG commit 00019 * record before we are called to log a commit, so the WAL rule "write xlog 00020 * before data" is satisfied automatically. However, for async commits we 00021 * must track the latest LSN affecting each CLOG page, so that we can flush 00022 * XLOG that far and satisfy the WAL rule. We don't have to worry about this 00023 * for aborts (whether sync or async), since the post-crash assumption would 00024 * be that such transactions failed anyway. 00025 * 00026 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group 00027 * Portions Copyright (c) 1994, Regents of the University of California 00028 * 00029 * src/backend/access/transam/clog.c 00030 * 00031 *------------------------------------------------------------------------- 00032 */ 00033 #include "postgres.h" 00034 00035 #include "access/clog.h" 00036 #include "access/slru.h" 00037 #include "access/transam.h" 00038 #include "miscadmin.h" 00039 #include "pg_trace.h" 00040 00041 /* 00042 * Defines for CLOG page sizes. A page is the same BLCKSZ as is used 00043 * everywhere else in Postgres. 00044 * 00045 * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, 00046 * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE, 00047 * and CLOG segment numbering at 00048 * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no 00049 * explicit notice of that fact in this module, except when comparing segment 00050 * and page numbers in TruncateCLOG (see CLOGPagePrecedes). 00051 */ 00052 00053 /* We need two bits per xact, so four xacts fit in a byte */ 00054 #define CLOG_BITS_PER_XACT 2 00055 #define CLOG_XACTS_PER_BYTE 4 00056 #define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) 00057 #define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1) 00058 00059 #define TransactionIdToPage(xid) ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE) 00060 #define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) 00061 #define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE) 00062 #define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE) 00063 00064 /* We store the latest async LSN for each group of transactions */ 00065 #define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */ 00066 #define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP) 00067 00068 #define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \ 00069 ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP) 00070 00071 00072 /* 00073 * Link to shared-memory data structures for CLOG control 00074 */ 00075 static SlruCtlData ClogCtlData; 00076 00077 #define ClogCtl (&ClogCtlData) 00078 00079 00080 static int ZeroCLOGPage(int pageno, bool writeXlog); 00081 static bool CLOGPagePrecedes(int page1, int page2); 00082 static void WriteZeroPageXlogRec(int pageno); 00083 static void WriteTruncateXlogRec(int pageno); 00084 static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, 00085 TransactionId *subxids, XidStatus status, 00086 XLogRecPtr lsn, int pageno); 00087 static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status, 00088 XLogRecPtr lsn, int slotno); 00089 static void set_status_by_pages(int nsubxids, TransactionId *subxids, 00090 XidStatus status, XLogRecPtr lsn); 00091 00092 00093 /* 00094 * TransactionIdSetTreeStatus 00095 * 00096 * Record the final state of transaction entries in the commit log for 00097 * a transaction and its subtransaction tree. Take care to ensure this is 00098 * efficient, and as atomic as possible. 00099 * 00100 * xid is a single xid to set status for. This will typically be 00101 * the top level transactionid for a top level commit or abort. It can 00102 * also be a subtransaction when we record transaction aborts. 00103 * 00104 * subxids is an array of xids of length nsubxids, representing subtransactions 00105 * in the tree of xid. In various cases nsubxids may be zero. 00106 * 00107 * lsn must be the WAL location of the commit record when recording an async 00108 * commit. For a synchronous commit it can be InvalidXLogRecPtr, since the 00109 * caller guarantees the commit record is already flushed in that case. It 00110 * should be InvalidXLogRecPtr for abort cases, too. 00111 * 00112 * In the commit case, atomicity is limited by whether all the subxids are in 00113 * the same CLOG page as xid. If they all are, then the lock will be grabbed 00114 * only once, and the status will be set to committed directly. Otherwise 00115 * we must 00116 * 1. set sub-committed all subxids that are not on the same page as the 00117 * main xid 00118 * 2. atomically set committed the main xid and the subxids on the same page 00119 * 3. go over the first bunch again and set them committed 00120 * Note that as far as concurrent checkers are concerned, main transaction 00121 * commit as a whole is still atomic. 00122 * 00123 * Example: 00124 * TransactionId t commits and has subxids t1, t2, t3, t4 00125 * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3 00126 * 1. update pages2-3: 00127 * page2: set t2,t3 as sub-committed 00128 * page3: set t4 as sub-committed 00129 * 2. update page1: 00130 * set t1 as sub-committed, 00131 * then set t as committed, 00132 then set t1 as committed 00133 * 3. update pages2-3: 00134 * page2: set t2,t3 as committed 00135 * page3: set t4 as committed 00136 * 00137 * NB: this is a low-level routine and is NOT the preferred entry point 00138 * for most uses; functions in transam.c are the intended callers. 00139 * 00140 * XXX Think about issuing FADVISE_WILLNEED on pages that we will need, 00141 * but aren't yet in cache, as well as hinting pages not to fall out of 00142 * cache yet. 00143 */ 00144 void 00145 TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, 00146 TransactionId *subxids, XidStatus status, XLogRecPtr lsn) 00147 { 00148 int pageno = TransactionIdToPage(xid); /* get page of parent */ 00149 int i; 00150 00151 Assert(status == TRANSACTION_STATUS_COMMITTED || 00152 status == TRANSACTION_STATUS_ABORTED); 00153 00154 /* 00155 * See how many subxids, if any, are on the same page as the parent, if 00156 * any. 00157 */ 00158 for (i = 0; i < nsubxids; i++) 00159 { 00160 if (TransactionIdToPage(subxids[i]) != pageno) 00161 break; 00162 } 00163 00164 /* 00165 * Do all items fit on a single page? 00166 */ 00167 if (i == nsubxids) 00168 { 00169 /* 00170 * Set the parent and all subtransactions in a single call 00171 */ 00172 TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn, 00173 pageno); 00174 } 00175 else 00176 { 00177 int nsubxids_on_first_page = i; 00178 00179 /* 00180 * If this is a commit then we care about doing this correctly (i.e. 00181 * using the subcommitted intermediate status). By here, we know 00182 * we're updating more than one page of clog, so we must mark entries 00183 * that are *not* on the first page so that they show as subcommitted 00184 * before we then return to update the status to fully committed. 00185 * 00186 * To avoid touching the first page twice, skip marking subcommitted 00187 * for the subxids on that first page. 00188 */ 00189 if (status == TRANSACTION_STATUS_COMMITTED) 00190 set_status_by_pages(nsubxids - nsubxids_on_first_page, 00191 subxids + nsubxids_on_first_page, 00192 TRANSACTION_STATUS_SUB_COMMITTED, lsn); 00193 00194 /* 00195 * Now set the parent and subtransactions on same page as the parent, 00196 * if any 00197 */ 00198 pageno = TransactionIdToPage(xid); 00199 TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status, 00200 lsn, pageno); 00201 00202 /* 00203 * Now work through the rest of the subxids one clog page at a time, 00204 * starting from the second page onwards, like we did above. 00205 */ 00206 set_status_by_pages(nsubxids - nsubxids_on_first_page, 00207 subxids + nsubxids_on_first_page, 00208 status, lsn); 00209 } 00210 } 00211 00212 /* 00213 * Helper for TransactionIdSetTreeStatus: set the status for a bunch of 00214 * transactions, chunking in the separate CLOG pages involved. We never 00215 * pass the whole transaction tree to this function, only subtransactions 00216 * that are on different pages to the top level transaction id. 00217 */ 00218 static void 00219 set_status_by_pages(int nsubxids, TransactionId *subxids, 00220 XidStatus status, XLogRecPtr lsn) 00221 { 00222 int pageno = TransactionIdToPage(subxids[0]); 00223 int offset = 0; 00224 int i = 0; 00225 00226 while (i < nsubxids) 00227 { 00228 int num_on_page = 0; 00229 00230 while (TransactionIdToPage(subxids[i]) == pageno && i < nsubxids) 00231 { 00232 num_on_page++; 00233 i++; 00234 } 00235 00236 TransactionIdSetPageStatus(InvalidTransactionId, 00237 num_on_page, subxids + offset, 00238 status, lsn, pageno); 00239 offset = i; 00240 pageno = TransactionIdToPage(subxids[offset]); 00241 } 00242 } 00243 00244 /* 00245 * Record the final state of transaction entries in the commit log for 00246 * all entries on a single page. Atomic only on this page. 00247 * 00248 * Otherwise API is same as TransactionIdSetTreeStatus() 00249 */ 00250 static void 00251 TransactionIdSetPageStatus(TransactionId xid, int nsubxids, 00252 TransactionId *subxids, XidStatus status, 00253 XLogRecPtr lsn, int pageno) 00254 { 00255 int slotno; 00256 int i; 00257 00258 Assert(status == TRANSACTION_STATUS_COMMITTED || 00259 status == TRANSACTION_STATUS_ABORTED || 00260 (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid))); 00261 00262 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); 00263 00264 /* 00265 * If we're doing an async commit (ie, lsn is valid), then we must wait 00266 * for any active write on the page slot to complete. Otherwise our 00267 * update could reach disk in that write, which will not do since we 00268 * mustn't let it reach disk until we've done the appropriate WAL flush. 00269 * But when lsn is invalid, it's OK to scribble on a page while it is 00270 * write-busy, since we don't care if the update reaches disk sooner than 00271 * we think. 00272 */ 00273 slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid); 00274 00275 /* 00276 * Set the main transaction id, if any. 00277 * 00278 * If we update more than one xid on this page while it is being written 00279 * out, we might find that some of the bits go to disk and others don't. 00280 * If we are updating commits on the page with the top-level xid that 00281 * could break atomicity, so we subcommit the subxids first before we mark 00282 * the top-level commit. 00283 */ 00284 if (TransactionIdIsValid(xid)) 00285 { 00286 /* Subtransactions first, if needed ... */ 00287 if (status == TRANSACTION_STATUS_COMMITTED) 00288 { 00289 for (i = 0; i < nsubxids; i++) 00290 { 00291 Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); 00292 TransactionIdSetStatusBit(subxids[i], 00293 TRANSACTION_STATUS_SUB_COMMITTED, 00294 lsn, slotno); 00295 } 00296 } 00297 00298 /* ... then the main transaction */ 00299 TransactionIdSetStatusBit(xid, status, lsn, slotno); 00300 } 00301 00302 /* Set the subtransactions */ 00303 for (i = 0; i < nsubxids; i++) 00304 { 00305 Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); 00306 TransactionIdSetStatusBit(subxids[i], status, lsn, slotno); 00307 } 00308 00309 ClogCtl->shared->page_dirty[slotno] = true; 00310 00311 LWLockRelease(CLogControlLock); 00312 } 00313 00314 /* 00315 * Sets the commit status of a single transaction. 00316 * 00317 * Must be called with CLogControlLock held 00318 */ 00319 static void 00320 TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno) 00321 { 00322 int byteno = TransactionIdToByte(xid); 00323 int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; 00324 char *byteptr; 00325 char byteval; 00326 char curval; 00327 00328 byteptr = ClogCtl->shared->page_buffer[slotno] + byteno; 00329 curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK; 00330 00331 /* 00332 * When replaying transactions during recovery we still need to perform 00333 * the two phases of subcommit and then commit. However, some transactions 00334 * are already correctly marked, so we just treat those as a no-op which 00335 * allows us to keep the following Assert as restrictive as possible. 00336 */ 00337 if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED && 00338 curval == TRANSACTION_STATUS_COMMITTED) 00339 return; 00340 00341 /* 00342 * Current state change should be from 0 or subcommitted to target state 00343 * or we should already be there when replaying changes during recovery. 00344 */ 00345 Assert(curval == 0 || 00346 (curval == TRANSACTION_STATUS_SUB_COMMITTED && 00347 status != TRANSACTION_STATUS_IN_PROGRESS) || 00348 curval == status); 00349 00350 /* note this assumes exclusive access to the clog page */ 00351 byteval = *byteptr; 00352 byteval &= ~(((1 << CLOG_BITS_PER_XACT) - 1) << bshift); 00353 byteval |= (status << bshift); 00354 *byteptr = byteval; 00355 00356 /* 00357 * Update the group LSN if the transaction completion LSN is higher. 00358 * 00359 * Note: lsn will be invalid when supplied during InRecovery processing, 00360 * so we don't need to do anything special to avoid LSN updates during 00361 * recovery. After recovery completes the next clog change will set the 00362 * LSN correctly. 00363 */ 00364 if (!XLogRecPtrIsInvalid(lsn)) 00365 { 00366 int lsnindex = GetLSNIndex(slotno, xid); 00367 00368 if (ClogCtl->shared->group_lsn[lsnindex] < lsn) 00369 ClogCtl->shared->group_lsn[lsnindex] = lsn; 00370 } 00371 } 00372 00373 /* 00374 * Interrogate the state of a transaction in the commit log. 00375 * 00376 * Aside from the actual commit status, this function returns (into *lsn) 00377 * an LSN that is late enough to be able to guarantee that if we flush up to 00378 * that LSN then we will have flushed the transaction's commit record to disk. 00379 * The result is not necessarily the exact LSN of the transaction's commit 00380 * record! For example, for long-past transactions (those whose clog pages 00381 * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because 00382 * we group transactions on the same clog page to conserve storage, we might 00383 * return the LSN of a later transaction that falls into the same group. 00384 * 00385 * NB: this is a low-level routine and is NOT the preferred entry point 00386 * for most uses; TransactionLogFetch() in transam.c is the intended caller. 00387 */ 00388 XidStatus 00389 TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) 00390 { 00391 int pageno = TransactionIdToPage(xid); 00392 int byteno = TransactionIdToByte(xid); 00393 int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; 00394 int slotno; 00395 int lsnindex; 00396 char *byteptr; 00397 XidStatus status; 00398 00399 /* lock is acquired by SimpleLruReadPage_ReadOnly */ 00400 00401 slotno = SimpleLruReadPage_ReadOnly(ClogCtl, pageno, xid); 00402 byteptr = ClogCtl->shared->page_buffer[slotno] + byteno; 00403 00404 status = (*byteptr >> bshift) & CLOG_XACT_BITMASK; 00405 00406 lsnindex = GetLSNIndex(slotno, xid); 00407 *lsn = ClogCtl->shared->group_lsn[lsnindex]; 00408 00409 LWLockRelease(CLogControlLock); 00410 00411 return status; 00412 } 00413 00414 /* 00415 * Number of shared CLOG buffers. 00416 * 00417 * Testing during the PostgreSQL 9.2 development cycle revealed that on a 00418 * large multi-processor system, it was possible to have more CLOG page 00419 * requests in flight at one time than the numebr of CLOG buffers which existed 00420 * at that time, which was hardcoded to 8. Further testing revealed that 00421 * performance dropped off with more than 32 CLOG buffers, possibly because 00422 * the linear buffer search algorithm doesn't scale well. 00423 * 00424 * Unconditionally increasing the number of CLOG buffers to 32 did not seem 00425 * like a good idea, because it would increase the minimum amount of shared 00426 * memory required to start, which could be a problem for people running very 00427 * small configurations. The following formula seems to represent a reasonable 00428 * compromise: people with very low values for shared_buffers will get fewer 00429 * CLOG buffers as well, and everyone else will get 32. 00430 * 00431 * It is likely that some further work will be needed here in future releases; 00432 * for example, on a 64-core server, the maximum number of CLOG requests that 00433 * can be simultaneously in flight will be even larger. But that will 00434 * apparently require more than just changing the formula, so for now we take 00435 * the easy way out. 00436 */ 00437 Size 00438 CLOGShmemBuffers(void) 00439 { 00440 return Min(32, Max(4, NBuffers / 512)); 00441 } 00442 00443 /* 00444 * Initialization of shared memory for CLOG 00445 */ 00446 Size 00447 CLOGShmemSize(void) 00448 { 00449 return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE); 00450 } 00451 00452 void 00453 CLOGShmemInit(void) 00454 { 00455 ClogCtl->PagePrecedes = CLOGPagePrecedes; 00456 SimpleLruInit(ClogCtl, "CLOG Ctl", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, 00457 CLogControlLock, "pg_clog"); 00458 } 00459 00460 /* 00461 * This func must be called ONCE on system install. It creates 00462 * the initial CLOG segment. (The CLOG directory is assumed to 00463 * have been created by initdb, and CLOGShmemInit must have been 00464 * called already.) 00465 */ 00466 void 00467 BootStrapCLOG(void) 00468 { 00469 int slotno; 00470 00471 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); 00472 00473 /* Create and zero the first page of the commit log */ 00474 slotno = ZeroCLOGPage(0, false); 00475 00476 /* Make sure it's written out */ 00477 SimpleLruWritePage(ClogCtl, slotno); 00478 Assert(!ClogCtl->shared->page_dirty[slotno]); 00479 00480 LWLockRelease(CLogControlLock); 00481 } 00482 00483 /* 00484 * Initialize (or reinitialize) a page of CLOG to zeroes. 00485 * If writeXlog is TRUE, also emit an XLOG record saying we did this. 00486 * 00487 * The page is not actually written, just set up in shared memory. 00488 * The slot number of the new page is returned. 00489 * 00490 * Control lock must be held at entry, and will be held at exit. 00491 */ 00492 static int 00493 ZeroCLOGPage(int pageno, bool writeXlog) 00494 { 00495 int slotno; 00496 00497 slotno = SimpleLruZeroPage(ClogCtl, pageno); 00498 00499 if (writeXlog) 00500 WriteZeroPageXlogRec(pageno); 00501 00502 return slotno; 00503 } 00504 00505 /* 00506 * This must be called ONCE during postmaster or standalone-backend startup, 00507 * after StartupXLOG has initialized ShmemVariableCache->nextXid. 00508 */ 00509 void 00510 StartupCLOG(void) 00511 { 00512 TransactionId xid = ShmemVariableCache->nextXid; 00513 int pageno = TransactionIdToPage(xid); 00514 00515 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); 00516 00517 /* 00518 * Initialize our idea of the latest page number. 00519 */ 00520 ClogCtl->shared->latest_page_number = pageno; 00521 00522 LWLockRelease(CLogControlLock); 00523 } 00524 00525 /* 00526 * This must be called ONCE at the end of startup/recovery. 00527 */ 00528 void 00529 TrimCLOG(void) 00530 { 00531 TransactionId xid = ShmemVariableCache->nextXid; 00532 int pageno = TransactionIdToPage(xid); 00533 00534 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); 00535 00536 /* 00537 * Re-Initialize our idea of the latest page number. 00538 */ 00539 ClogCtl->shared->latest_page_number = pageno; 00540 00541 /* 00542 * Zero out the remainder of the current clog page. Under normal 00543 * circumstances it should be zeroes already, but it seems at least 00544 * theoretically possible that XLOG replay will have settled on a nextXID 00545 * value that is less than the last XID actually used and marked by the 00546 * previous database lifecycle (since subtransaction commit writes clog 00547 * but makes no WAL entry). Let's just be safe. (We need not worry about 00548 * pages beyond the current one, since those will be zeroed when first 00549 * used. For the same reason, there is no need to do anything when 00550 * nextXid is exactly at a page boundary; and it's likely that the 00551 * "current" page doesn't exist yet in that case.) 00552 */ 00553 if (TransactionIdToPgIndex(xid) != 0) 00554 { 00555 int byteno = TransactionIdToByte(xid); 00556 int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; 00557 int slotno; 00558 char *byteptr; 00559 00560 slotno = SimpleLruReadPage(ClogCtl, pageno, false, xid); 00561 byteptr = ClogCtl->shared->page_buffer[slotno] + byteno; 00562 00563 /* Zero so-far-unused positions in the current byte */ 00564 *byteptr &= (1 << bshift) - 1; 00565 /* Zero the rest of the page */ 00566 MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1); 00567 00568 ClogCtl->shared->page_dirty[slotno] = true; 00569 } 00570 00571 LWLockRelease(CLogControlLock); 00572 } 00573 00574 /* 00575 * This must be called ONCE during postmaster or standalone-backend shutdown 00576 */ 00577 void 00578 ShutdownCLOG(void) 00579 { 00580 /* Flush dirty CLOG pages to disk */ 00581 TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false); 00582 SimpleLruFlush(ClogCtl, false); 00583 TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false); 00584 } 00585 00586 /* 00587 * Perform a checkpoint --- either during shutdown, or on-the-fly 00588 */ 00589 void 00590 CheckPointCLOG(void) 00591 { 00592 /* Flush dirty CLOG pages to disk */ 00593 TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); 00594 SimpleLruFlush(ClogCtl, true); 00595 TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); 00596 } 00597 00598 00599 /* 00600 * Make sure that CLOG has room for a newly-allocated XID. 00601 * 00602 * NB: this is called while holding XidGenLock. We want it to be very fast 00603 * most of the time; even when it's not so fast, no actual I/O need happen 00604 * unless we're forced to write out a dirty clog or xlog page to make room 00605 * in shared memory. 00606 */ 00607 void 00608 ExtendCLOG(TransactionId newestXact) 00609 { 00610 int pageno; 00611 00612 /* 00613 * No work except at first XID of a page. But beware: just after 00614 * wraparound, the first XID of page zero is FirstNormalTransactionId. 00615 */ 00616 if (TransactionIdToPgIndex(newestXact) != 0 && 00617 !TransactionIdEquals(newestXact, FirstNormalTransactionId)) 00618 return; 00619 00620 pageno = TransactionIdToPage(newestXact); 00621 00622 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); 00623 00624 /* Zero the page and make an XLOG entry about it */ 00625 ZeroCLOGPage(pageno, !InRecovery); 00626 00627 LWLockRelease(CLogControlLock); 00628 } 00629 00630 00631 /* 00632 * Remove all CLOG segments before the one holding the passed transaction ID 00633 * 00634 * Before removing any CLOG data, we must flush XLOG to disk, to ensure 00635 * that any recently-emitted HEAP_FREEZE records have reached disk; otherwise 00636 * a crash and restart might leave us with some unfrozen tuples referencing 00637 * removed CLOG data. We choose to emit a special TRUNCATE XLOG record too. 00638 * Replaying the deletion from XLOG is not critical, since the files could 00639 * just as well be removed later, but doing so prevents a long-running hot 00640 * standby server from acquiring an unreasonably bloated CLOG directory. 00641 * 00642 * Since CLOG segments hold a large number of transactions, the opportunity to 00643 * actually remove a segment is fairly rare, and so it seems best not to do 00644 * the XLOG flush unless we have confirmed that there is a removable segment. 00645 */ 00646 void 00647 TruncateCLOG(TransactionId oldestXact) 00648 { 00649 int cutoffPage; 00650 00651 /* 00652 * The cutoff point is the start of the segment containing oldestXact. We 00653 * pass the *page* containing oldestXact to SimpleLruTruncate. 00654 */ 00655 cutoffPage = TransactionIdToPage(oldestXact); 00656 00657 /* Check to see if there's any files that could be removed */ 00658 if (!SlruScanDirectory(ClogCtl, SlruScanDirCbReportPresence, &cutoffPage)) 00659 return; /* nothing to remove */ 00660 00661 /* Write XLOG record and flush XLOG to disk */ 00662 WriteTruncateXlogRec(cutoffPage); 00663 00664 /* Now we can remove the old CLOG segment(s) */ 00665 SimpleLruTruncate(ClogCtl, cutoffPage); 00666 } 00667 00668 00669 /* 00670 * Decide which of two CLOG page numbers is "older" for truncation purposes. 00671 * 00672 * We need to use comparison of TransactionIds here in order to do the right 00673 * thing with wraparound XID arithmetic. However, if we are asked about 00674 * page number zero, we don't want to hand InvalidTransactionId to 00675 * TransactionIdPrecedes: it'll get weird about permanent xact IDs. So, 00676 * offset both xids by FirstNormalTransactionId to avoid that. 00677 */ 00678 static bool 00679 CLOGPagePrecedes(int page1, int page2) 00680 { 00681 TransactionId xid1; 00682 TransactionId xid2; 00683 00684 xid1 = ((TransactionId) page1) * CLOG_XACTS_PER_PAGE; 00685 xid1 += FirstNormalTransactionId; 00686 xid2 = ((TransactionId) page2) * CLOG_XACTS_PER_PAGE; 00687 xid2 += FirstNormalTransactionId; 00688 00689 return TransactionIdPrecedes(xid1, xid2); 00690 } 00691 00692 00693 /* 00694 * Write a ZEROPAGE xlog record 00695 */ 00696 static void 00697 WriteZeroPageXlogRec(int pageno) 00698 { 00699 XLogRecData rdata; 00700 00701 rdata.data = (char *) (&pageno); 00702 rdata.len = sizeof(int); 00703 rdata.buffer = InvalidBuffer; 00704 rdata.next = NULL; 00705 (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE, &rdata); 00706 } 00707 00708 /* 00709 * Write a TRUNCATE xlog record 00710 * 00711 * We must flush the xlog record to disk before returning --- see notes 00712 * in TruncateCLOG(). 00713 */ 00714 static void 00715 WriteTruncateXlogRec(int pageno) 00716 { 00717 XLogRecData rdata; 00718 XLogRecPtr recptr; 00719 00720 rdata.data = (char *) (&pageno); 00721 rdata.len = sizeof(int); 00722 rdata.buffer = InvalidBuffer; 00723 rdata.next = NULL; 00724 recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE, &rdata); 00725 XLogFlush(recptr); 00726 } 00727 00728 /* 00729 * CLOG resource manager's routines 00730 */ 00731 void 00732 clog_redo(XLogRecPtr lsn, XLogRecord *record) 00733 { 00734 uint8 info = record->xl_info & ~XLR_INFO_MASK; 00735 00736 /* Backup blocks are not used in clog records */ 00737 Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); 00738 00739 if (info == CLOG_ZEROPAGE) 00740 { 00741 int pageno; 00742 int slotno; 00743 00744 memcpy(&pageno, XLogRecGetData(record), sizeof(int)); 00745 00746 LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); 00747 00748 slotno = ZeroCLOGPage(pageno, false); 00749 SimpleLruWritePage(ClogCtl, slotno); 00750 Assert(!ClogCtl->shared->page_dirty[slotno]); 00751 00752 LWLockRelease(CLogControlLock); 00753 } 00754 else if (info == CLOG_TRUNCATE) 00755 { 00756 int pageno; 00757 00758 memcpy(&pageno, XLogRecGetData(record), sizeof(int)); 00759 00760 /* 00761 * During XLOG replay, latest_page_number isn't set up yet; insert a 00762 * suitable value to bypass the sanity test in SimpleLruTruncate. 00763 */ 00764 ClogCtl->shared->latest_page_number = pageno; 00765 00766 SimpleLruTruncate(ClogCtl, pageno); 00767 } 00768 else 00769 elog(PANIC, "clog_redo: unknown op code %u", info); 00770 }