/*-------------------------------------------------------------------------
 *
 * freelist.c
 *	  routines for managing the buffer pool's replacement strategy.
 *
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/buffer/freelist.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "storage/buf_internals.h"
#include "storage/bufmgr.h"


/*
 * The shared freelist control information.
 *
 * Lives in shared memory; all fields are protected by BufFreelistLock
 * (see the individual functions below for the exact locking protocol).
 */
typedef struct
{
	/* Clock sweep hand: index of next buffer to consider grabbing */
	int			nextVictimBuffer;

	int			firstFreeBuffer;	/* Head of list of unused buffers */
	int			lastFreeBuffer; /* Tail of list of unused buffers */

	/*
	 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
	 * when the list is empty)
	 */

	/*
	 * Statistics.  These counters should be wide enough that they can't
	 * overflow during a single bgwriter cycle.
	 */
	uint32		completePasses; /* Complete cycles of the clock sweep */
	uint32		numBufferAllocs;	/* Buffers allocated since last reset */

	/*
	 * Notification latch, or NULL if none.  See StrategyNotifyBgWriter.
	 */
	Latch	   *bgwriterLatch;
} BufferStrategyControl;

/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;

/*
 * Private (non-shared) state for managing a ring of shared buffers to re-use.
 * This is currently the only kind of BufferAccessStrategy object, but someday
 * we might have more kinds.
 */
typedef struct BufferAccessStrategyData
{
	/* Overall strategy type */
	BufferAccessStrategyType btype;
	/* Number of elements in buffers[] array */
	int			ring_size;

	/*
	 * Index of the "current" slot in the ring, ie, the one most recently
	 * returned by GetBufferFromRing.
	 */
	int			current;

	/*
	 * True if the buffer just returned by StrategyGetBuffer had been in the
	 * ring already.
	 */
	bool		current_was_in_ring;

	/*
	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
	 * have not yet selected a buffer for this ring slot.  For allocation
	 * simplicity this is palloc'd together with the fixed fields of the
	 * struct.
	 */
	Buffer		buffers[1];		/* VARIABLE SIZE ARRAY */
} BufferAccessStrategyData;


/* Prototypes for internal functions */
static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
static void AddBufferToRing(BufferAccessStrategy strategy,
				volatile BufferDesc *buf);


/*
 * StrategyGetBuffer
 *
 *	Called by the bufmgr to get the next candidate buffer to use in
 *	BufferAlloc().  The only hard requirement BufferAlloc() has is that
 *	the selected buffer must not currently be pinned by anyone.
 *
 *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *	To ensure that no one else can pin the buffer before we do, we must
 *	return the buffer with the buffer header spinlock still held.  If
 *	*lock_held is set on exit, we have returned with the BufFreelistLock
 *	still held, as well; the caller must release that lock once the spinlock
 *	is dropped.  We do it that way because releasing the BufFreelistLock
 *	might awaken other processes, and it would be bad to do the associated
 *	kernel calls while holding the buffer header spinlock.
 */
volatile BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
{
	volatile BufferDesc *buf;
	Latch	   *bgwriterLatch;
	int			trycounter;

	/*
	 * If given a strategy object, see whether it can select a buffer. We
	 * assume strategy objects don't need the BufFreelistLock.
	 */
	if (strategy != NULL)
	{
		buf = GetBufferFromRing(strategy);
		if (buf != NULL)
		{
			/* GetBufferFromRing returned with the header spinlock held */
			*lock_held = false;
			return buf;
		}
	}

	/* Nope, so lock the freelist */
	*lock_held = true;
	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);

	/*
	 * We count buffer allocation requests so that the bgwriter can estimate
	 * the rate of buffer consumption.  Note that buffers recycled by a
	 * strategy object are intentionally not counted here.
	 */
	StrategyControl->numBufferAllocs++;

	/*
	 * If bgwriterLatch is set, we need to waken the bgwriter, but we should
	 * not do so while holding BufFreelistLock; so release and re-grab.  This
	 * is annoyingly tedious, but it happens at most once per bgwriter cycle,
	 * so the performance hit is minimal.
	 */
	bgwriterLatch = StrategyControl->bgwriterLatch;
	if (bgwriterLatch)
	{
		/* clear the latch pointer first so we notify at most once */
		StrategyControl->bgwriterLatch = NULL;
		LWLockRelease(BufFreelistLock);
		SetLatch(bgwriterLatch);
		LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
	}

	/*
	 * Try to get a buffer from the freelist.  Note that the freeNext fields
	 * are considered to be protected by the BufFreelistLock not the
	 * individual buffer spinlocks, so it's OK to manipulate them without
	 * holding the spinlock.
	 */
	while (StrategyControl->firstFreeBuffer >= 0)
	{
		buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
		Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

		/* Unconditionally remove buffer from freelist */
		StrategyControl->firstFreeBuffer = buf->freeNext;
		buf->freeNext = FREENEXT_NOT_IN_LIST;

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
		 * it; discard it and retry.  (This can only happen if VACUUM put a
		 * valid buffer in the freelist and then someone else used it before
		 * we got to it.  It's probably impossible altogether as of 8.3, but
		 * we'd better check anyway.)
		 */
		LockBufHdr(buf);
		if (buf->refcount == 0 && buf->usage_count == 0)
		{
			if (strategy != NULL)
				AddBufferToRing(strategy, buf);
			/* return with BufFreelistLock AND the header spinlock held */
			return buf;
		}
		UnlockBufHdr(buf);
	}

	/* Nothing on the freelist, so run the "clock sweep" algorithm */
	trycounter = NBuffers;
	for (;;)
	{
		buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];

		/* advance the sweep hand, wrapping (and counting complete passes) */
		if (++StrategyControl->nextVictimBuffer >= NBuffers)
		{
			StrategyControl->nextVictimBuffer = 0;
			StrategyControl->completePasses++;
		}

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
		 * it; decrement the usage_count (unless pinned) and keep scanning.
		 */
		LockBufHdr(buf);
		if (buf->refcount == 0)
		{
			if (buf->usage_count > 0)
			{
				buf->usage_count--;
				/* we changed state, so reset the give-up counter */
				trycounter = NBuffers;
			}
			else
			{
				/* Found a usable buffer */
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				/* return with BufFreelistLock AND the header spinlock held */
				return buf;
			}
		}
		else if (--trycounter == 0)
		{
			/*
			 * We've scanned all the buffers without making any state changes,
			 * so all the buffers are pinned (or were when we looked at them).
			 * We could hope that someone will free one eventually, but it's
			 * probably better to fail than to risk getting stuck in an
			 * infinite loop.
			 */
			UnlockBufHdr(buf);
			elog(ERROR, "no unpinned buffers available");
		}
		UnlockBufHdr(buf);
	}
}

/*
 * StrategyFreeBuffer: put a buffer on the freelist
 */
void
StrategyFreeBuffer(volatile BufferDesc *buf)
{
	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);

	/*
	 * It is possible that we are told to put something in the freelist that
	 * is already in it; don't screw up the list if so.
	 */
	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
	{
		buf->freeNext = StrategyControl->firstFreeBuffer;
		if (buf->freeNext < 0)
			StrategyControl->lastFreeBuffer = buf->buf_id;
		StrategyControl->firstFreeBuffer = buf->buf_id;
	}

	LWLockRelease(BufFreelistLock);
}

/*
 * StrategySyncStart -- tell BufferSync where to start syncing
 *
 * The result is the buffer index of the best buffer to sync first.
 * BufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
 * the higher-order bits of nextVictimBuffer) and the count of recent buffer
 * allocs if non-NULL pointers are passed.  The alloc count is reset after
 * being read.
00271 */ 00272 int 00273 StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) 00274 { 00275 int result; 00276 00277 LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE); 00278 result = StrategyControl->nextVictimBuffer; 00279 if (complete_passes) 00280 *complete_passes = StrategyControl->completePasses; 00281 if (num_buf_alloc) 00282 { 00283 *num_buf_alloc = StrategyControl->numBufferAllocs; 00284 StrategyControl->numBufferAllocs = 0; 00285 } 00286 LWLockRelease(BufFreelistLock); 00287 return result; 00288 } 00289 00290 /* 00291 * StrategyNotifyBgWriter -- set or clear allocation notification latch 00292 * 00293 * If bgwriterLatch isn't NULL, the next invocation of StrategyGetBuffer will 00294 * set that latch. Pass NULL to clear the pending notification before it 00295 * happens. This feature is used by the bgwriter process to wake itself up 00296 * from hibernation, and is not meant for anybody else to use. 00297 */ 00298 void 00299 StrategyNotifyBgWriter(Latch *bgwriterLatch) 00300 { 00301 /* 00302 * We acquire the BufFreelistLock just to ensure that the store appears 00303 * atomic to StrategyGetBuffer. The bgwriter should call this rather 00304 * infrequently, so there's no performance penalty from being safe. 00305 */ 00306 LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE); 00307 StrategyControl->bgwriterLatch = bgwriterLatch; 00308 LWLockRelease(BufFreelistLock); 00309 } 00310 00311 00312 /* 00313 * StrategyShmemSize 00314 * 00315 * estimate the size of shared memory used by the freelist-related structures. 00316 * 00317 * Note: for somewhat historical reasons, the buffer lookup hashtable size 00318 * is also determined here. 00319 */ 00320 Size 00321 StrategyShmemSize(void) 00322 { 00323 Size size = 0; 00324 00325 /* size of lookup hash table ... 
see comment in StrategyInitialize */ 00326 size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS)); 00327 00328 /* size of the shared replacement strategy control block */ 00329 size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl))); 00330 00331 return size; 00332 } 00333 00334 /* 00335 * StrategyInitialize -- initialize the buffer cache replacement 00336 * strategy. 00337 * 00338 * Assumes: All of the buffers are already built into a linked list. 00339 * Only called by postmaster and only during initialization. 00340 */ 00341 void 00342 StrategyInitialize(bool init) 00343 { 00344 bool found; 00345 00346 /* 00347 * Initialize the shared buffer lookup hashtable. 00348 * 00349 * Since we can't tolerate running out of lookup table entries, we must be 00350 * sure to specify an adequate table size here. The maximum steady-state 00351 * usage is of course NBuffers entries, but BufferAlloc() tries to insert 00352 * a new entry before deleting the old. In principle this could be 00353 * happening in each partition concurrently, so we could need as many as 00354 * NBuffers + NUM_BUFFER_PARTITIONS entries. 00355 */ 00356 InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS); 00357 00358 /* 00359 * Get or create the shared strategy control block 00360 */ 00361 StrategyControl = (BufferStrategyControl *) 00362 ShmemInitStruct("Buffer Strategy Status", 00363 sizeof(BufferStrategyControl), 00364 &found); 00365 00366 if (!found) 00367 { 00368 /* 00369 * Only done once, usually in postmaster 00370 */ 00371 Assert(init); 00372 00373 /* 00374 * Grab the whole linked list of free buffers for our strategy. We 00375 * assume it was previously set up by InitBufferPool(). 
00376 */ 00377 StrategyControl->firstFreeBuffer = 0; 00378 StrategyControl->lastFreeBuffer = NBuffers - 1; 00379 00380 /* Initialize the clock sweep pointer */ 00381 StrategyControl->nextVictimBuffer = 0; 00382 00383 /* Clear statistics */ 00384 StrategyControl->completePasses = 0; 00385 StrategyControl->numBufferAllocs = 0; 00386 00387 /* No pending notification */ 00388 StrategyControl->bgwriterLatch = NULL; 00389 } 00390 else 00391 Assert(!init); 00392 } 00393 00394 00395 /* ---------------------------------------------------------------- 00396 * Backend-private buffer ring management 00397 * ---------------------------------------------------------------- 00398 */ 00399 00400 00401 /* 00402 * GetAccessStrategy -- create a BufferAccessStrategy object 00403 * 00404 * The object is allocated in the current memory context. 00405 */ 00406 BufferAccessStrategy 00407 GetAccessStrategy(BufferAccessStrategyType btype) 00408 { 00409 BufferAccessStrategy strategy; 00410 int ring_size; 00411 00412 /* 00413 * Select ring size to use. See buffer/README for rationales. 00414 * 00415 * Note: if you change the ring size for BAS_BULKREAD, see also 00416 * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c. 
00417 */ 00418 switch (btype) 00419 { 00420 case BAS_NORMAL: 00421 /* if someone asks for NORMAL, just give 'em a "default" object */ 00422 return NULL; 00423 00424 case BAS_BULKREAD: 00425 ring_size = 256 * 1024 / BLCKSZ; 00426 break; 00427 case BAS_BULKWRITE: 00428 ring_size = 16 * 1024 * 1024 / BLCKSZ; 00429 break; 00430 case BAS_VACUUM: 00431 ring_size = 256 * 1024 / BLCKSZ; 00432 break; 00433 00434 default: 00435 elog(ERROR, "unrecognized buffer access strategy: %d", 00436 (int) btype); 00437 return NULL; /* keep compiler quiet */ 00438 } 00439 00440 /* Make sure ring isn't an undue fraction of shared buffers */ 00441 ring_size = Min(NBuffers / 8, ring_size); 00442 00443 /* Allocate the object and initialize all elements to zeroes */ 00444 strategy = (BufferAccessStrategy) 00445 palloc0(offsetof(BufferAccessStrategyData, buffers) + 00446 ring_size * sizeof(Buffer)); 00447 00448 /* Set fields that don't start out zero */ 00449 strategy->btype = btype; 00450 strategy->ring_size = ring_size; 00451 00452 return strategy; 00453 } 00454 00455 /* 00456 * FreeAccessStrategy -- release a BufferAccessStrategy object 00457 * 00458 * A simple pfree would do at the moment, but we would prefer that callers 00459 * don't assume that much about the representation of BufferAccessStrategy. 00460 */ 00461 void 00462 FreeAccessStrategy(BufferAccessStrategy strategy) 00463 { 00464 /* don't crash if called on a "default" strategy */ 00465 if (strategy != NULL) 00466 pfree(strategy); 00467 } 00468 00469 /* 00470 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the 00471 * ring is empty. 00472 * 00473 * The bufhdr spin lock is held on the returned buffer. 
00474 */ 00475 static volatile BufferDesc * 00476 GetBufferFromRing(BufferAccessStrategy strategy) 00477 { 00478 volatile BufferDesc *buf; 00479 Buffer bufnum; 00480 00481 /* Advance to next ring slot */ 00482 if (++strategy->current >= strategy->ring_size) 00483 strategy->current = 0; 00484 00485 /* 00486 * If the slot hasn't been filled yet, tell the caller to allocate a new 00487 * buffer with the normal allocation strategy. He will then fill this 00488 * slot by calling AddBufferToRing with the new buffer. 00489 */ 00490 bufnum = strategy->buffers[strategy->current]; 00491 if (bufnum == InvalidBuffer) 00492 { 00493 strategy->current_was_in_ring = false; 00494 return NULL; 00495 } 00496 00497 /* 00498 * If the buffer is pinned we cannot use it under any circumstances. 00499 * 00500 * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, 00501 * since our own previous usage of the ring element would have left it 00502 * there, but it might've been decremented by clock sweep since then). A 00503 * higher usage_count indicates someone else has touched the buffer, so we 00504 * shouldn't re-use it. 00505 */ 00506 buf = &BufferDescriptors[bufnum - 1]; 00507 LockBufHdr(buf); 00508 if (buf->refcount == 0 && buf->usage_count <= 1) 00509 { 00510 strategy->current_was_in_ring = true; 00511 return buf; 00512 } 00513 UnlockBufHdr(buf); 00514 00515 /* 00516 * Tell caller to allocate a new buffer with the normal allocation 00517 * strategy. He'll then replace this ring element via AddBufferToRing. 00518 */ 00519 strategy->current_was_in_ring = false; 00520 return NULL; 00521 } 00522 00523 /* 00524 * AddBufferToRing -- add a buffer to the buffer ring 00525 * 00526 * Caller must hold the buffer header spinlock on the buffer. Since this 00527 * is called with the spinlock held, it had better be quite cheap. 
00528 */ 00529 static void 00530 AddBufferToRing(BufferAccessStrategy strategy, volatile BufferDesc *buf) 00531 { 00532 strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf); 00533 } 00534 00535 /* 00536 * StrategyRejectBuffer -- consider rejecting a dirty buffer 00537 * 00538 * When a nondefault strategy is used, the buffer manager calls this function 00539 * when it turns out that the buffer selected by StrategyGetBuffer needs to 00540 * be written out and doing so would require flushing WAL too. This gives us 00541 * a chance to choose a different victim. 00542 * 00543 * Returns true if buffer manager should ask for a new victim, and false 00544 * if this buffer should be written and re-used. 00545 */ 00546 bool 00547 StrategyRejectBuffer(BufferAccessStrategy strategy, volatile BufferDesc *buf) 00548 { 00549 /* We only do this in bulkread mode */ 00550 if (strategy->btype != BAS_BULKREAD) 00551 return false; 00552 00553 /* Don't muck with behavior of normal buffer-replacement strategy */ 00554 if (!strategy->current_was_in_ring || 00555 strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf)) 00556 return false; 00557 00558 /* 00559 * Remove the dirty buffer from the ring; necessary to prevent infinite 00560 * loop if all ring members are dirty. 00561 */ 00562 strategy->buffers[strategy->current] = InvalidBuffer; 00563 00564 return true; 00565 }