Header And Logo

PostgreSQL
| The world's most advanced open source database.

freelist.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * freelist.c
00004  *    routines for managing the buffer pool's replacement strategy.
00005  *
00006  *
00007  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00008  * Portions Copyright (c) 1994, Regents of the University of California
00009  *
00010  *
00011  * IDENTIFICATION
00012  *    src/backend/storage/buffer/freelist.c
00013  *
00014  *-------------------------------------------------------------------------
00015  */
00016 #include "postgres.h"
00017 
00018 #include "storage/buf_internals.h"
00019 #include "storage/bufmgr.h"
00020 
00021 
/*
 * The shared freelist control information.
 *
 * This struct lives in shared memory (see StrategyInitialize) and, per the
 * locking in this file, is always read and written while holding
 * BufFreelistLock.  It tracks the list of unused buffers plus the
 * clock-sweep state used when that list is empty.
 */
typedef struct
{
    /* Clock sweep hand: index of next buffer to consider grabbing */
    int         nextVictimBuffer;

    int         firstFreeBuffer;    /* Head of list of unused buffers */
    int         lastFreeBuffer; /* Tail of list of unused buffers */

    /*
     * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
     * when the list is empty)
     */

    /*
     * Statistics.  These counters should be wide enough that they can't
     * overflow during a single bgwriter cycle.  completePasses is, in
     * effect, the high-order bits of nextVictimBuffer; numBufferAllocs is
     * reset each time StrategySyncStart reads it.
     */
    uint32      completePasses; /* Complete cycles of the clock sweep */
    uint32      numBufferAllocs;    /* Buffers allocated since last reset */

    /*
     * Notification latch, or NULL if none.  See StrategyNotifyBgWriter.
     */
    Latch      *bgwriterLatch;
} BufferStrategyControl;
00050 
/*
 * Pointer to the shared BufferStrategyControl block; attached/initialized
 * by StrategyInitialize() and NULL until then.
 */
static BufferStrategyControl *StrategyControl = NULL;
00053 
00054 /*
00055  * Private (non-shared) state for managing a ring of shared buffers to re-use.
00056  * This is currently the only kind of BufferAccessStrategy object, but someday
00057  * we might have more kinds.
00058  */
00059 typedef struct BufferAccessStrategyData
00060 {
00061     /* Overall strategy type */
00062     BufferAccessStrategyType btype;
00063     /* Number of elements in buffers[] array */
00064     int         ring_size;
00065 
00066     /*
00067      * Index of the "current" slot in the ring, ie, the one most recently
00068      * returned by GetBufferFromRing.
00069      */
00070     int         current;
00071 
00072     /*
00073      * True if the buffer just returned by StrategyGetBuffer had been in the
00074      * ring already.
00075      */
00076     bool        current_was_in_ring;
00077 
00078     /*
00079      * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
00080      * have not yet selected a buffer for this ring slot.  For allocation
00081      * simplicity this is palloc'd together with the fixed fields of the
00082      * struct.
00083      */
00084     Buffer      buffers[1];     /* VARIABLE SIZE ARRAY */
00085 }   BufferAccessStrategyData;
00086 
00087 
00088 /* Prototypes for internal functions */
00089 static volatile BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy);
00090 static void AddBufferToRing(BufferAccessStrategy strategy,
00091                 volatile BufferDesc *buf);
00092 
00093 
/*
 * StrategyGetBuffer
 *
 *  Called by the bufmgr to get the next candidate buffer to use in
 *  BufferAlloc(). The only hard requirement BufferAlloc() has is that
 *  the selected buffer must not currently be pinned by anyone.
 *
 *  strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *  To ensure that no one else can pin the buffer before we do, we must
 *  return the buffer with the buffer header spinlock still held.  If
 *  *lock_held is set on exit, we have returned with the BufFreelistLock
 *  still held, as well; the caller must release that lock once the spinlock
 *  is dropped.  We do it that way because releasing the BufFreelistLock
 *  might awaken other processes, and it would be bad to do the associated
 *  kernel calls while holding the buffer header spinlock.
 */
volatile BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, bool *lock_held)
{
    volatile BufferDesc *buf;
    Latch      *bgwriterLatch;
    int         trycounter;

    /*
     * If given a strategy object, see whether it can select a buffer. We
     * assume strategy objects don't need the BufFreelistLock.
     */
    if (strategy != NULL)
    {
        buf = GetBufferFromRing(strategy);
        if (buf != NULL)
        {
            /* Ring buffer comes back with only its header spinlock held */
            *lock_held = false;
            return buf;
        }
    }

    /* Nope, so lock the freelist */
    *lock_held = true;
    LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);

    /*
     * We count buffer allocation requests so that the bgwriter can estimate
     * the rate of buffer consumption.  Note that buffers recycled by a
     * strategy object are intentionally not counted here.
     */
    StrategyControl->numBufferAllocs++;

    /*
     * If bgwriterLatch is set, we need to waken the bgwriter, but we should
     * not do so while holding BufFreelistLock; so release and re-grab.  This
     * is annoyingly tedious, but it happens at most once per bgwriter cycle,
     * so the performance hit is minimal.
     */
    bgwriterLatch = StrategyControl->bgwriterLatch;
    if (bgwriterLatch)
    {
        /* Clear the field before releasing, so we only notify once */
        StrategyControl->bgwriterLatch = NULL;
        LWLockRelease(BufFreelistLock);
        SetLatch(bgwriterLatch);
        LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
    }

    /*
     * Try to get a buffer from the freelist.  Note that the freeNext fields
     * are considered to be protected by the BufFreelistLock not the
     * individual buffer spinlocks, so it's OK to manipulate them without
     * holding the spinlock.
     */
    while (StrategyControl->firstFreeBuffer >= 0)
    {
        buf = &BufferDescriptors[StrategyControl->firstFreeBuffer];
        Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

        /* Unconditionally remove buffer from freelist */
        StrategyControl->firstFreeBuffer = buf->freeNext;
        buf->freeNext = FREENEXT_NOT_IN_LIST;

        /*
         * If the buffer is pinned or has a nonzero usage_count, we cannot use
         * it; discard it and retry.  (This can only happen if VACUUM put a
         * valid buffer in the freelist and then someone else used it before
         * we got to it.  It's probably impossible altogether as of 8.3, but
         * we'd better check anyway.)
         */
        LockBufHdr(buf);
        if (buf->refcount == 0 && buf->usage_count == 0)
        {
            if (strategy != NULL)
                AddBufferToRing(strategy, buf);
            /* Success: both BufFreelistLock and the header spinlock are held */
            return buf;
        }
        UnlockBufHdr(buf);
    }

    /* Nothing on the freelist, so run the "clock sweep" algorithm */
    trycounter = NBuffers;
    for (;;)
    {
        buf = &BufferDescriptors[StrategyControl->nextVictimBuffer];

        /* Advance the clock hand, wrapping and counting complete passes */
        if (++StrategyControl->nextVictimBuffer >= NBuffers)
        {
            StrategyControl->nextVictimBuffer = 0;
            StrategyControl->completePasses++;
        }

        /*
         * If the buffer is pinned or has a nonzero usage_count, we cannot use
         * it; decrement the usage_count (unless pinned) and keep scanning.
         */
        LockBufHdr(buf);
        if (buf->refcount == 0)
        {
            if (buf->usage_count > 0)
            {
                buf->usage_count--;
                /* Made state change, so reset the give-up counter */
                trycounter = NBuffers;
            }
            else
            {
                /* Found a usable buffer */
                if (strategy != NULL)
                    AddBufferToRing(strategy, buf);
                /* Return with BufFreelistLock and the header spinlock held */
                return buf;
            }
        }
        else if (--trycounter == 0)
        {
            /*
             * We've scanned all the buffers without making any state changes,
             * so all the buffers are pinned (or were when we looked at them).
             * We could hope that someone will free one eventually, but it's
             * probably better to fail than to risk getting stuck in an
             * infinite loop.
             */
            UnlockBufHdr(buf);
            elog(ERROR, "no unpinned buffers available");
        }
        UnlockBufHdr(buf);
    }
}
00237 
00238 /*
00239  * StrategyFreeBuffer: put a buffer on the freelist
00240  */
00241 void
00242 StrategyFreeBuffer(volatile BufferDesc *buf)
00243 {
00244     LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
00245 
00246     /*
00247      * It is possible that we are told to put something in the freelist that
00248      * is already in it; don't screw up the list if so.
00249      */
00250     if (buf->freeNext == FREENEXT_NOT_IN_LIST)
00251     {
00252         buf->freeNext = StrategyControl->firstFreeBuffer;
00253         if (buf->freeNext < 0)
00254             StrategyControl->lastFreeBuffer = buf->buf_id;
00255         StrategyControl->firstFreeBuffer = buf->buf_id;
00256     }
00257 
00258     LWLockRelease(BufFreelistLock);
00259 }
00260 
00261 /*
00262  * StrategySyncStart -- tell BufferSync where to start syncing
00263  *
00264  * The result is the buffer index of the best buffer to sync first.
00265  * BufferSync() will proceed circularly around the buffer array from there.
00266  *
00267  * In addition, we return the completed-pass count (which is effectively
00268  * the higher-order bits of nextVictimBuffer) and the count of recent buffer
00269  * allocs if non-NULL pointers are passed.  The alloc count is reset after
00270  * being read.
00271  */
00272 int
00273 StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
00274 {
00275     int         result;
00276 
00277     LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
00278     result = StrategyControl->nextVictimBuffer;
00279     if (complete_passes)
00280         *complete_passes = StrategyControl->completePasses;
00281     if (num_buf_alloc)
00282     {
00283         *num_buf_alloc = StrategyControl->numBufferAllocs;
00284         StrategyControl->numBufferAllocs = 0;
00285     }
00286     LWLockRelease(BufFreelistLock);
00287     return result;
00288 }
00289 
00290 /*
00291  * StrategyNotifyBgWriter -- set or clear allocation notification latch
00292  *
00293  * If bgwriterLatch isn't NULL, the next invocation of StrategyGetBuffer will
00294  * set that latch.  Pass NULL to clear the pending notification before it
00295  * happens.  This feature is used by the bgwriter process to wake itself up
00296  * from hibernation, and is not meant for anybody else to use.
00297  */
00298 void
00299 StrategyNotifyBgWriter(Latch *bgwriterLatch)
00300 {
00301     /*
00302      * We acquire the BufFreelistLock just to ensure that the store appears
00303      * atomic to StrategyGetBuffer.  The bgwriter should call this rather
00304      * infrequently, so there's no performance penalty from being safe.
00305      */
00306     LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
00307     StrategyControl->bgwriterLatch = bgwriterLatch;
00308     LWLockRelease(BufFreelistLock);
00309 }
00310 
00311 
00312 /*
00313  * StrategyShmemSize
00314  *
00315  * estimate the size of shared memory used by the freelist-related structures.
00316  *
00317  * Note: for somewhat historical reasons, the buffer lookup hashtable size
00318  * is also determined here.
00319  */
00320 Size
00321 StrategyShmemSize(void)
00322 {
00323     Size        size = 0;
00324 
00325     /* size of lookup hash table ... see comment in StrategyInitialize */
00326     size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
00327 
00328     /* size of the shared replacement strategy control block */
00329     size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
00330 
00331     return size;
00332 }
00333 
00334 /*
00335  * StrategyInitialize -- initialize the buffer cache replacement
00336  *      strategy.
00337  *
00338  * Assumes: All of the buffers are already built into a linked list.
00339  *      Only called by postmaster and only during initialization.
00340  */
00341 void
00342 StrategyInitialize(bool init)
00343 {
00344     bool        found;
00345 
00346     /*
00347      * Initialize the shared buffer lookup hashtable.
00348      *
00349      * Since we can't tolerate running out of lookup table entries, we must be
00350      * sure to specify an adequate table size here.  The maximum steady-state
00351      * usage is of course NBuffers entries, but BufferAlloc() tries to insert
00352      * a new entry before deleting the old.  In principle this could be
00353      * happening in each partition concurrently, so we could need as many as
00354      * NBuffers + NUM_BUFFER_PARTITIONS entries.
00355      */
00356     InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
00357 
00358     /*
00359      * Get or create the shared strategy control block
00360      */
00361     StrategyControl = (BufferStrategyControl *)
00362         ShmemInitStruct("Buffer Strategy Status",
00363                         sizeof(BufferStrategyControl),
00364                         &found);
00365 
00366     if (!found)
00367     {
00368         /*
00369          * Only done once, usually in postmaster
00370          */
00371         Assert(init);
00372 
00373         /*
00374          * Grab the whole linked list of free buffers for our strategy. We
00375          * assume it was previously set up by InitBufferPool().
00376          */
00377         StrategyControl->firstFreeBuffer = 0;
00378         StrategyControl->lastFreeBuffer = NBuffers - 1;
00379 
00380         /* Initialize the clock sweep pointer */
00381         StrategyControl->nextVictimBuffer = 0;
00382 
00383         /* Clear statistics */
00384         StrategyControl->completePasses = 0;
00385         StrategyControl->numBufferAllocs = 0;
00386 
00387         /* No pending notification */
00388         StrategyControl->bgwriterLatch = NULL;
00389     }
00390     else
00391         Assert(!init);
00392 }
00393 
00394 
00395 /* ----------------------------------------------------------------
00396  *              Backend-private buffer ring management
00397  * ----------------------------------------------------------------
00398  */
00399 
00400 
00401 /*
00402  * GetAccessStrategy -- create a BufferAccessStrategy object
00403  *
00404  * The object is allocated in the current memory context.
00405  */
00406 BufferAccessStrategy
00407 GetAccessStrategy(BufferAccessStrategyType btype)
00408 {
00409     BufferAccessStrategy strategy;
00410     int         ring_size;
00411 
00412     /*
00413      * Select ring size to use.  See buffer/README for rationales.
00414      *
00415      * Note: if you change the ring size for BAS_BULKREAD, see also
00416      * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
00417      */
00418     switch (btype)
00419     {
00420         case BAS_NORMAL:
00421             /* if someone asks for NORMAL, just give 'em a "default" object */
00422             return NULL;
00423 
00424         case BAS_BULKREAD:
00425             ring_size = 256 * 1024 / BLCKSZ;
00426             break;
00427         case BAS_BULKWRITE:
00428             ring_size = 16 * 1024 * 1024 / BLCKSZ;
00429             break;
00430         case BAS_VACUUM:
00431             ring_size = 256 * 1024 / BLCKSZ;
00432             break;
00433 
00434         default:
00435             elog(ERROR, "unrecognized buffer access strategy: %d",
00436                  (int) btype);
00437             return NULL;        /* keep compiler quiet */
00438     }
00439 
00440     /* Make sure ring isn't an undue fraction of shared buffers */
00441     ring_size = Min(NBuffers / 8, ring_size);
00442 
00443     /* Allocate the object and initialize all elements to zeroes */
00444     strategy = (BufferAccessStrategy)
00445         palloc0(offsetof(BufferAccessStrategyData, buffers) +
00446                 ring_size * sizeof(Buffer));
00447 
00448     /* Set fields that don't start out zero */
00449     strategy->btype = btype;
00450     strategy->ring_size = ring_size;
00451 
00452     return strategy;
00453 }
00454 
00455 /*
00456  * FreeAccessStrategy -- release a BufferAccessStrategy object
00457  *
00458  * A simple pfree would do at the moment, but we would prefer that callers
00459  * don't assume that much about the representation of BufferAccessStrategy.
00460  */
00461 void
00462 FreeAccessStrategy(BufferAccessStrategy strategy)
00463 {
00464     /* don't crash if called on a "default" strategy */
00465     if (strategy != NULL)
00466         pfree(strategy);
00467 }
00468 
00469 /*
00470  * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
00471  *      ring is empty.
00472  *
00473  * The bufhdr spin lock is held on the returned buffer.
00474  */
00475 static volatile BufferDesc *
00476 GetBufferFromRing(BufferAccessStrategy strategy)
00477 {
00478     volatile BufferDesc *buf;
00479     Buffer      bufnum;
00480 
00481     /* Advance to next ring slot */
00482     if (++strategy->current >= strategy->ring_size)
00483         strategy->current = 0;
00484 
00485     /*
00486      * If the slot hasn't been filled yet, tell the caller to allocate a new
00487      * buffer with the normal allocation strategy.  He will then fill this
00488      * slot by calling AddBufferToRing with the new buffer.
00489      */
00490     bufnum = strategy->buffers[strategy->current];
00491     if (bufnum == InvalidBuffer)
00492     {
00493         strategy->current_was_in_ring = false;
00494         return NULL;
00495     }
00496 
00497     /*
00498      * If the buffer is pinned we cannot use it under any circumstances.
00499      *
00500      * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
00501      * since our own previous usage of the ring element would have left it
00502      * there, but it might've been decremented by clock sweep since then). A
00503      * higher usage_count indicates someone else has touched the buffer, so we
00504      * shouldn't re-use it.
00505      */
00506     buf = &BufferDescriptors[bufnum - 1];
00507     LockBufHdr(buf);
00508     if (buf->refcount == 0 && buf->usage_count <= 1)
00509     {
00510         strategy->current_was_in_ring = true;
00511         return buf;
00512     }
00513     UnlockBufHdr(buf);
00514 
00515     /*
00516      * Tell caller to allocate a new buffer with the normal allocation
00517      * strategy.  He'll then replace this ring element via AddBufferToRing.
00518      */
00519     strategy->current_was_in_ring = false;
00520     return NULL;
00521 }
00522 
00523 /*
00524  * AddBufferToRing -- add a buffer to the buffer ring
00525  *
00526  * Caller must hold the buffer header spinlock on the buffer.  Since this
00527  * is called with the spinlock held, it had better be quite cheap.
00528  */
00529 static void
00530 AddBufferToRing(BufferAccessStrategy strategy, volatile BufferDesc *buf)
00531 {
00532     strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
00533 }
00534 
00535 /*
00536  * StrategyRejectBuffer -- consider rejecting a dirty buffer
00537  *
00538  * When a nondefault strategy is used, the buffer manager calls this function
00539  * when it turns out that the buffer selected by StrategyGetBuffer needs to
00540  * be written out and doing so would require flushing WAL too.  This gives us
00541  * a chance to choose a different victim.
00542  *
00543  * Returns true if buffer manager should ask for a new victim, and false
00544  * if this buffer should be written and re-used.
00545  */
00546 bool
00547 StrategyRejectBuffer(BufferAccessStrategy strategy, volatile BufferDesc *buf)
00548 {
00549     /* We only do this in bulkread mode */
00550     if (strategy->btype != BAS_BULKREAD)
00551         return false;
00552 
00553     /* Don't muck with behavior of normal buffer-replacement strategy */
00554     if (!strategy->current_was_in_ring ||
00555       strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
00556         return false;
00557 
00558     /*
00559      * Remove the dirty buffer from the ring; necessary to prevent infinite
00560      * loop if all ring members are dirty.
00561      */
00562     strategy->buffers[strategy->current] = InvalidBuffer;
00563 
00564     return true;
00565 }