
mp_alloc.c

/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2005
 *      Sleepycat Software.  All rights reserved.
 *
 * $Id: mp_alloc.c,v 12.6 2005/08/12 13:17:22 bostic Exp $
 */

#include "db_config.h"

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
#include <string.h>
#endif

#include "db_int.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"

static void __memp_bad_buffer __P((DB_MPOOL_HASH *));

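/*
 * Note: __P is the portability wrapper for prototypes -- it expands to
 * its argument list under ANSI C and to an empty list under K&R C,
 * which is why the function definitions below use old-style parameter
 * declarations.
 */
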
/*
 * __memp_alloc --
 *      Allocate some space from a cache region.
 *
 * PUBLIC: int __memp_alloc __P((DB_MPOOL *,
 * PUBLIC:     REGINFO *, MPOOLFILE *, size_t, roff_t *, void *));
 */
int
__memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
        DB_MPOOL *dbmp;
        REGINFO *infop;
        MPOOLFILE *mfp;
        size_t len;
        roff_t *offsetp;
        void *retp;
{
        BH *bhp;
        DB_ENV *dbenv;
        DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_tmp;
        MPOOL *c_mp;
        MPOOLFILE *bh_mfp;
        size_t freed_space;
        db_mutex_t mutex;
        u_int32_t buckets, buffers, high_priority, priority;
        u_int32_t put_counter, total_buckets;
        int aggressive, giveup, ret;
        void *p;

        dbenv = dbmp->dbenv;
        c_mp = infop->primary;
        dbht = R_ADDR(infop, c_mp->htab);
        hp_end = &dbht[c_mp->htab_buckets];

        buckets = buffers = put_counter = total_buckets = 0;
        aggressive = giveup = 0;
        hp_tmp = NULL;

        c_mp->stat.st_alloc++;

        /*
         * If we're allocating a buffer, and the one we're discarding is the
         * same size, we don't want to waste the time to re-integrate it into
         * the shared memory free list.  If the MPOOLFILE argument isn't
         * NULL, we'll compare the underlying page sizes of the two buffers
         * before freeing and re-allocating buffers.
         */
        if (mfp != NULL)
                len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize;
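        /*
         * (The arithmetic above relies on BH's layout: the struct ends in a
         * one-byte placeholder array for the page data, so we subtract the
         * placeholder and add the real page size.)
         */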

        MPOOL_REGION_LOCK(dbenv, infop);

        /*
         * Anything newer than 1/10th of the buffer pool is ignored during
         * allocation (unless allocation starts failing).
         */
        high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10;
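        /*
         * For example (hypothetical numbers): with st_pages == 1,000 and
         * lru_count == 5,000, high_priority is 4,900, so buckets whose best
         * candidate is newer than that -- roughly the newest tenth of the
         * pool -- are skipped until we get aggressive below.
         */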

        /*
         * First we try to allocate from free memory.  If that fails, scan the
         * buffer pool to find buffers with low priorities.  We consider small
         * sets of hash buckets each time to limit the amount of work needing
         * to be done.  This approximates LRU, but not very well.  We either
         * find a buffer of the same size to reuse, or we free 3 times what
         * we need in the hope that the freed space will coalesce into a
         * contiguous chunk of the right size.  In the latter case we branch
         * back here and try again.
         */
alloc:  if ((ret = __db_shalloc(infop, len, 0, &p)) == 0) {
                if (mfp != NULL)
                        c_mp->stat.st_pages++;
                MPOOL_REGION_UNLOCK(dbenv, infop);

found:          if (offsetp != NULL)
                        *offsetp = R_OFFSET(infop, p);
                *(void **)retp = p;

                /*
                 * Update the search statistics.
                 *
                 * We're not holding the region locked here, so these
                 * statistics can't be trusted.
                 */
                total_buckets += buckets;
                if (total_buckets != 0) {
                        if (total_buckets > c_mp->stat.st_alloc_max_buckets)
                                c_mp->stat.st_alloc_max_buckets = total_buckets;
                        c_mp->stat.st_alloc_buckets += total_buckets;
                }
                if (buffers != 0) {
                        if (buffers > c_mp->stat.st_alloc_max_pages)
                                c_mp->stat.st_alloc_max_pages = buffers;
                        c_mp->stat.st_alloc_pages += buffers;
                }
                return (0);
        } else if (giveup || c_mp->stat.st_pages == 0) {
                MPOOL_REGION_UNLOCK(dbenv, infop);

                __db_err(dbenv,
                    "unable to allocate space from the buffer cache");
                return (ret);
        }
        /* The allocation failed; clear the error before retrying. */
        ret = 0;

        /*
         * We re-attempt the allocation every time we've freed 3 times what
         * we need.  Reset our free-space counter.
         */
        freed_space = 0;
        total_buckets += buckets;
        buckets = 0;

        /*
         * Walk the hash buckets and find the next two with potentially useful
         * buffers.  Free the buffer with the lowest priority from the buckets'
         * chains.
         */
        for (;;) {
                /* All pages have been freed; make one last try. */
                if (c_mp->stat.st_pages == 0)
                        goto alloc;

                /* Check for wrap-around. */
                hp = &dbht[c_mp->last_checked++];
                if (hp >= hp_end) {
                        c_mp->last_checked = 0;
                        hp = &dbht[c_mp->last_checked++];
                }

                /*
                 * Skip empty buckets.
                 *
                 * We can check for empty buckets before locking as we
                 * only care if the pointer is zero or non-zero.
                 */
                if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
                        continue;

                /*
                 * The failure mode is when there are too many buffers we can't
                 * write or there's not enough memory in the system.  We have
                 * no way of knowing whether an allocation can ever succeed,
                 * so we fail if no pages were returned to the cache after
                 * we've been trying for a relatively long time.
                 *
                 * Get aggressive if we've tried to flush as many hash buckets
                 * as there are in the system and have not found any more
                 * space.  Aggressive means:
                 *
                 * a: set a flag to attempt to flush high priority buffers as
                 *    well as other buffers.
                 * b: sync the mpool to force out queue extent pages.  While we
                 *    might not have enough space for what we want and flushing
                 *    is expensive, why not?
                 * c: look at a buffer in every hash bucket rather than choose
                 *    the more preferable of two.
                 * d: start to think about giving up.
                 *
                 * If we get here twice, sleep for a second; hopefully someone
                 * else will run and free up some memory.
                 *
                 * Always try to allocate memory too, in case some other thread
                 * returns its memory to the region.
                 *
                 * !!!
                 * This test ignores pathological cases like no buffers in the
                 * system -- that shouldn't be possible.
                 */
                if ((++buckets % c_mp->htab_buckets) == 0) {
                        if (freed_space > 0)
                                goto alloc;
                        MPOOL_REGION_UNLOCK(dbenv, infop);

                        switch (++aggressive) {
                        case 1:
                                /* Step (a): retry, considering all buffers. */
                                break;
                        case 2:
                                /* Remember where the put counter started. */
                                put_counter = c_mp->put_counter;
                                /* FALLTHROUGH */
                        case 3:
                        case 4:
                        case 5:
                        case 6:
                                /* Step (b): flush, then let others run. */
                                (void)__memp_sync_int(
                                    dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);

                                __os_sleep(dbenv, 1, 0);
                                break;
                        default:
                                /*
                                 * Step (d): if no buffers were returned to
                                 * the pool while we were trying, give up.
                                 */
                                aggressive = 1;
                                if (put_counter == c_mp->put_counter)
                                        giveup = 1;
                                break;
                        }

                        MPOOL_REGION_LOCK(dbenv, infop);
                        goto alloc;
                }

                if (!aggressive) {
                        /* Skip high priority buckets. */
                        if (hp->hash_priority > high_priority)
                                continue;

                        /*
                         * Find two buckets and select the one with the lowest
                         * priority.  Performance testing shows that looking
                         * at two improves the LRUness and looking at more only
                         * does a little better.
                         */
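                        /*
                         * (This is in the spirit of the "power of two
                         * choices" heuristic: comparing two candidates
                         * captures most of the benefit of scanning many.)
                         */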
                        if (hp_tmp == NULL) {
                                hp_tmp = hp;
                                continue;
                        }
                        if (hp->hash_priority > hp_tmp->hash_priority)
                                hp = hp_tmp;
                        hp_tmp = NULL;
                }

                /* Remember the priority of the buffer we're looking for. */
                priority = hp->hash_priority;

                /* Unlock the region and lock the hash bucket. */
                MPOOL_REGION_UNLOCK(dbenv, infop);
                mutex = hp->mtx_hash;
                MUTEX_LOCK(dbenv, mutex);

#ifdef DIAGNOSTIC
                __memp_check_order(hp);
#endif
                /*
                 * The lowest priority page is first in the bucket, as they are
                 * maintained in sorted order.
                 *
                 * The buffer may have been freed or its priority changed while
                 * we switched from the region lock to the hash lock.  If so,
                 * we have to restart.  We will still take the first buffer on
                 * the bucket's list, though, if it has a low enough priority.
                 */
                if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL ||
                    bhp->ref != 0 || bhp->priority > priority)
                        goto next_hb;

                buffers++;

                /* Find the associated MPOOLFILE. */
                bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

                /* If the page is dirty, pin it and write it. */
                ret = 0;
                if (F_ISSET(bhp, BH_DIRTY)) {
                        ++bhp->ref;
                        ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
                        --bhp->ref;
                        if (ret == 0)
                                ++c_mp->stat.st_rw_evict;
                } else
                        ++c_mp->stat.st_ro_evict;

                /*
                 * If a write fails for any reason, we can't proceed.
                 *
                 * We released the hash bucket lock while doing I/O, so another
                 * thread may have acquired this buffer and incremented the ref
                 * count after we wrote it, in which case we can't have it.
                 *
                 * If there's a write error and we're having problems finding
                 * something to allocate, avoid selecting this buffer again
                 * by making it the bucket's least-desirable buffer.
                 */
                if (ret != 0 || bhp->ref != 0) {
                        if (ret != 0 && aggressive)
                                __memp_bad_buffer(hp);
                        goto next_hb;
                }

                /*
                 * Check to see if the buffer is the size we're looking for.
                 * If so, we can simply reuse it.  Otherwise, free the buffer
                 * and its space and keep looking.
                 */
                if (mfp != NULL &&
                    mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
                        if ((ret = __memp_bhfree(dbmp, hp, bhp, 0)) != 0) {
                                MUTEX_UNLOCK(dbenv, mutex);
                                return (ret);
                        }
                        p = bhp;
                        goto found;
                }

                freed_space += __db_shalloc_sizeof(bhp);
                if ((ret =
                    __memp_bhfree(dbmp, hp, bhp, BH_FREE_FREEMEM)) != 0) {
                        MUTEX_UNLOCK(dbenv, mutex);
                        return (ret);
                }
                if (aggressive > 1)
                        aggressive = 1;

                /*
                 * Unlock this hash bucket and re-acquire the region lock.  If
                 * we're reaching here as a result of calling memp_bhfree, the
                 * hash bucket lock has already been discarded.
                 */
                if (0) {
next_hb:                MUTEX_UNLOCK(dbenv, mutex);
                }
                MPOOL_REGION_LOCK(dbenv, infop);

                /*
                 * Retry the allocation as soon as we've freed up sufficient
                 * space.  We're likely to have to coalesce memory to satisfy
                 * the request, so don't try until it's likely (possible?)
                 * we'll succeed.
                 */
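                /*
                 * For example (hypothetical sizes): for a 4KB page, len is
                 * roughly 4KB plus the buffer header, so we keep evicting
                 * until about 12KB has been freed before retrying
                 * __db_shalloc above.
                 */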
                if (freed_space >= 3 * len)
                        goto alloc;
        }
        /* NOTREACHED */
}

/*
 * __memp_bad_buffer --
 *      Make the first buffer in a hash bucket the least desirable buffer.
 */
static void
__memp_bad_buffer(hp)
        DB_MPOOL_HASH *hp;
{
        BH *bhp;
        u_int32_t priority;

        /* Remove the first buffer from the bucket. */
        bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
        SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);

        /*
         * Find the highest priority buffer in the bucket.  Buffers are
         * sorted by priority, so it's the last one in the bucket.
         */
        priority = SH_TAILQ_EMPTY(&hp->hash_bucket) ? bhp->priority :
            SH_TAILQ_LASTP(&hp->hash_bucket, hq, __bh)->priority;

        /*
         * Set our buffer's priority to be just as bad, and append it to
         * the bucket.
         */
        bhp->priority = priority;
        SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
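
        /*
         * For example (hypothetical priorities): a bucket holding buffers
         * with priorities [10, 40, 90] becomes [40, 90, 90] -- the bad
         * buffer is re-queued at the tail with the bucket's worst priority,
         * and hash_priority is then reset to 40 below.
         */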

        /* Reset the hash bucket's priority. */
        hp->hash_priority = SH_TAILQ_FIRSTP(&hp->hash_bucket, __bh)->priority;
#ifdef DIAGNOSTIC
        __memp_check_order(hp);
#endif
}

#ifdef DIAGNOSTIC
/*
 * __memp_check_order --
 *      Verify the priority ordering of a hash bucket chain.
 *
 * PUBLIC: #ifdef DIAGNOSTIC
 * PUBLIC: void __memp_check_order __P((DB_MPOOL_HASH *));
 * PUBLIC: #endif
 */
void
__memp_check_order(hp)
        DB_MPOOL_HASH *hp;
{
        BH *bhp;
        u_int32_t priority;

        /*
         * Assumes the hash bucket is locked.
         */
        if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
                return;

        DB_ASSERT(bhp->priority == hp->hash_priority);

        for (priority = bhp->priority;
            (bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) != NULL;
            priority = bhp->priority)
                DB_ASSERT(priority <= bhp->priority);
}
#endif
