Berkeley DB 4.4.16: /home/huihoo/src/db/db-4.4.16/mp/mp

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: mp_fget.c,v 12.8 2005/10/12 17:53:36 bostic Exp $
00008  */
00009 
00010 #include "db_config.h"
00011 
00012 #ifndef NO_SYSTEM_INCLUDES
00013 #include <sys/types.h>
00014 
00015 #include <string.h>
00016 #endif
00017 
00018 #include "db_int.h"
00019 #include "dbinc/db_shash.h"
00020 #include "dbinc/log.h"
00021 #include "dbinc/mp.h"
00022 
00023 /*
00024  * __memp_fget_pp --
00025  *      DB_MPOOLFILE->get pre/post processing.
00026  *
00027  * PUBLIC: int __memp_fget_pp
00028  * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
00029  */
00030 int
00031 __memp_fget_pp(dbmfp, pgnoaddr, flags, addrp)
00032         DB_MPOOLFILE *dbmfp;
00033         db_pgno_t *pgnoaddr;
00034         u_int32_t flags;
00035         void *addrp;
00036 {
00037         DB_ENV *dbenv;
00038         DB_THREAD_INFO *ip;
00039         int rep_check, ret;
00040 
00041         dbenv = dbmfp->dbenv;
00042 
00043         PANIC_CHECK(dbenv);
00044         MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get");
00045 
00046         /*
00047          * Validate arguments.
00048          *
00049          * !!!
00050          * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
00051          * files here, and create non-existent pages in readonly files if the
00052          * flags are set, later.  The reason is that the hash access method
00053          * wants to get empty pages that don't really exist in readonly files.
00054          * The only alternative is for hash to write the last "bucket" all the
00055          * time, which we don't want to do because one of our big goals in life
00056          * is to keep database files small.  It's sleazy as hell, but we catch
00057          * any attempt to actually write the file in memp_fput().
00058          */
00059 #define OKFLAGS         (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
00060         if (flags != 0) {
00061                 if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
00062                         return (ret);
00063 
00064                 switch (flags) {
00065                 case DB_MPOOL_CREATE:
00066                 case DB_MPOOL_LAST:
00067                 case DB_MPOOL_NEW:
00068                         break;
00069                 default:
00070                         return (__db_ferr(dbenv, "memp_fget", 1));
00071                 }
00072         }
00073 
00074         ENV_ENTER(dbenv, ip);
00075 
00076         rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0;
00077         if (rep_check && (ret = __op_rep_enter(dbenv)) != 0)
00078                 goto err;
00079         ret = __memp_fget(dbmfp, pgnoaddr, flags, addrp);
00080         /*
00081          * We only decrement the count in op_rep_exit if the operation fails.
00082          * Otherwise the count will be decremented when the page is no longer
00083          * pinned in memp_fput.
00084          */
00085         if (ret != 0 && rep_check)
00086                 (void)__op_rep_exit(dbenv);
00087 
00088         /* Similarly if an app has a page pinned it is ACTIVE. */
00089 err:    if (ret != 0)
00090                 ENV_LEAVE(dbenv, ip);
00091 
00092         return (ret);
00093 }
00094 
00095 /*
00096  * __memp_fget --
00097  *      Get a page from the file.
00098  *
00099  * PUBLIC: int __memp_fget
00100  * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
00101  */
00102 int
00103 __memp_fget(dbmfp, pgnoaddr, flags, addrp)
00104         DB_MPOOLFILE *dbmfp;
00105         db_pgno_t *pgnoaddr;
00106         u_int32_t flags;
00107         void *addrp;
00108 {
00109         enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
00110         BH *alloc_bhp, *bhp;
00111         DB_ENV *dbenv;
00112         DB_MPOOL *dbmp;
00113         DB_MPOOL_HASH *hp;
00114         MPOOL *c_mp, *mp;
00115         MPOOLFILE *mfp;
00116         roff_t mf_offset;
00117         u_int32_t n_cache, st_hsearch;
00118         int b_incr, extending, first, ret;
00119 
00120         *(void **)addrp = NULL;
00121 
00122         dbenv = dbmfp->dbenv;
00123         dbmp = dbenv->mp_handle;
00124 
00125         c_mp = NULL;
00126         mp = dbmp->reginfo[0].primary;
00127         mfp = dbmfp->mfp;
00128         mf_offset = R_OFFSET(dbmp->reginfo, mfp);
00129         alloc_bhp = bhp = NULL;
00130         hp = NULL;
00131         b_incr = extending = ret = 0;
00132 
00133         switch (flags) {
00134         case DB_MPOOL_LAST:
00135                 /* Get the last page number in the file. */
00136                 MPOOL_SYSTEM_LOCK(dbenv);
00137                 *pgnoaddr = mfp->last_pgno;
00138                 MPOOL_SYSTEM_UNLOCK(dbenv);
00139                 break;
00140         case DB_MPOOL_NEW:
00141                 /*
00142                  * If always creating a page, skip the first search
00143                  * of the hash bucket.
00144                  */
00145                 goto alloc;
00146         case DB_MPOOL_CREATE:
00147         default:
00148                 break;
00149         }
00150 
00151         /*
00152          * If mmap'ing the file and the page is not past the end of the file,
00153          * just return a pointer.  We can't use R_ADDR here: this is an offset
00154          * into an mmap'd file, not a shared region, and doesn't change for
00155          * private environments.
00156          *
00157          * The page may be past the end of the file, so check the page number
00158          * argument against the original length of the file.  If we previously
00159          * returned pages past the original end of the file, last_pgno will
00160          * have been updated to match the "new" end of the file, and checking
00161          * against it would return pointers past the end of the mmap'd region.
00162          *
00163          * If another process has opened the file for writing since we mmap'd
00164          * it, we will start playing the game by their rules, i.e. everything
00165          * goes through the cache.  All pages previously returned will be safe,
00166          * as long as the correct locking protocol was observed.
00167          *
00168          * We don't discard the map because we don't know when all of the
00169          * pages will have been discarded from the process' address space.
00170          * It would be possible to do so by reference counting the open
00171          * pages from the mmap, but it's unclear to me that it's worth it.
00172          */
00173         if (dbmfp->addr != NULL &&
00174             F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
00175                 *(void **)addrp = (u_int8_t *)dbmfp->addr +
00176                     (*pgnoaddr * mfp->stat.st_pagesize);
00177                 ++mfp->stat.st_map;
00178                 return (0);
00179         }
00180 
00181 hb_search:
00182         /*
00183          * Determine the cache and hash bucket where this page lives and get
00184          * local pointers to them.  Reset on each pass through this code, the
00185          * page number can change.
00186          */
00187         n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
00188         c_mp = dbmp->reginfo[n_cache].primary;
00189         hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
00190         hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
00191 
00192         /* Search the hash chain for the page. */
00193 retry:  st_hsearch = 0;
00194         MUTEX_LOCK(dbenv, hp->mtx_hash);
00195         for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
00196             bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
00197                 ++st_hsearch;
00198                 if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
00199                         continue;
00200 
00201                 /*
00202                  * Increment the reference count.  We may discard the hash
00203                  * bucket lock as we evaluate and/or read the buffer, so we
00204                  * need to ensure it doesn't move and its contents remain
00205                  * unchanged.
00206                  */
00207                 if (bhp->ref == UINT16_MAX) {
00208                         MUTEX_UNLOCK(dbenv, hp->mtx_hash);
00209 
00210                         __db_err(dbenv,
00211                             "%s: page %lu: reference count overflow",
00212                             __memp_fn(dbmfp), (u_long)bhp->pgno);
00213                         ret = __db_panic(dbenv, EINVAL);
00214                         goto err;
00215                 }
00216                 ++bhp->ref;
00217                 b_incr = 1;
00218 
00219                 /*
00220                  * BH_LOCKED --
00221                  * I/O is in progress or sync is waiting on the buffer to write
00222                  * it.  Because we've incremented the buffer reference count,
00223                  * we know the buffer can't move.  Unlock the bucket lock, wait
00224                  * for the buffer to become available, reacquire the bucket.
00225                  */
00226                 for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
00227                     !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
00228                         /*
00229                          * If someone is trying to sync this buffer and the
00230                          * buffer is hot, they may never get in.  Give up
00231                          * and try again.
00232                          */
00233                         if (!first && bhp->ref_sync != 0) {
00234                                 --bhp->ref;
00235                                 b_incr = 0;
00236                                 MUTEX_UNLOCK(dbenv, hp->mtx_hash);
00237                                 __os_yield(dbenv, 1);
00238                                 goto retry;
00239                         }
00240 
00241                         MUTEX_UNLOCK(dbenv, hp->mtx_hash);
00242                         /*
00243                          * Explicitly yield the processor if not the first pass
00244                          * through this loop -- if we don't, we might run to the
00245                          * end of our CPU quantum as we will simply be swapping
00246                          * between the two locks.
00247                          */
00248                         if (!first)
00249                                 __os_yield(dbenv, 1);
00250 
00251                         MUTEX_LOCK(dbenv, bhp->mtx_bh);
00252                         /* Wait for I/O to finish... */
00253                         MUTEX_UNLOCK(dbenv, bhp->mtx_bh);
00254                         MUTEX_LOCK(dbenv, hp->mtx_hash);
00255                 }
00256 
00257                 ++mfp->stat.st_cache_hit;
00258                 break;
00259         }
00260 
00261         /*
00262          * Update the hash bucket search statistics -- do now because our next
00263          * search may be for a different bucket.
00264          */
00265         ++c_mp->stat.st_hash_searches;
00266         if (st_hsearch > c_mp->stat.st_hash_longest)
00267                 c_mp->stat.st_hash_longest = st_hsearch;
00268         c_mp->stat.st_hash_examined += st_hsearch;
00269 
00270         /*
00271          * There are 4 possible paths to this location:
00272          *
00273          * FIRST_MISS:
00274          *      Didn't find the page in the hash bucket on our first pass:
00275          *      bhp == NULL, alloc_bhp == NULL
00276          *
00277          * FIRST_FOUND:
00278          *      Found the page in the hash bucket on our first pass:
00279          *      bhp != NULL, alloc_bhp == NULL
00280          *
00281          * SECOND_FOUND:
00282          *      Didn't find the page in the hash bucket on the first pass,
00283          *      allocated space, and found the page in the hash bucket on
00284          *      our second pass:
00285          *      bhp != NULL, alloc_bhp != NULL
00286          *
00287          * SECOND_MISS:
00288          *      Didn't find the page in the hash bucket on the first pass,
00289          *      allocated space, and didn't find the page in the hash bucket
00290          *      on our second pass:
00291          *      bhp == NULL, alloc_bhp != NULL
00292          */
00293         state = bhp == NULL ?
00294             (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
00295             (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
00296         switch (state) {
00297         case FIRST_FOUND:
00298                 /*
00299                  * If we are to free the buffer, then this had better
00300                  * be the only reference. If so, just free the buffer.
00301                  * If not, complain and get out.
00302                  */
00303                 if (flags == DB_MPOOL_FREE) {
00304                         if (bhp->ref == 1)
00305                                 return (__memp_bhfree(
00306                                     dbmp, hp, bhp, BH_FREE_FREEMEM));
00307                         __db_err(dbenv,
00308                             "File %s: freeing pinned buffer for page %lu",
00309                                 __memp_fns(dbmp, mfp), (u_long)*pgnoaddr);
00310                         ret = __db_panic(dbenv, EINVAL);
00311                         goto err;
00312                 }
00313 
00314                 /* We found the buffer in our first check -- we're done. */
00315                 break;
00316         case FIRST_MISS:
00317                 /*
00318                  * We didn't find the buffer in our first check.  Figure out
00319                  * if the page exists, and allocate structures so we can add
00320                  * the page to the buffer pool.
00321                  */
00322                 MUTEX_UNLOCK(dbenv, hp->mtx_hash);
00323 
00324                 /*
00325                  * The buffer is not in the pool, so we don't need to free it.
00326                  */
00327                 if (flags == DB_MPOOL_FREE)
00328                         return (0);
00329 
00330 alloc:          /*
00331                  * If DB_MPOOL_NEW is set, we have to allocate a page number.
00332                  * If neither DB_MPOOL_CREATE or DB_MPOOL_CREATE is set, then
00333                  * it's an error to try and get a page past the end of file.
00334                  */
00335                 COMPQUIET(n_cache, 0);
00336 
00337                 extending = ret = 0;
00338                 MPOOL_SYSTEM_LOCK(dbenv);
00339                 switch (flags) {
00340                 case DB_MPOOL_NEW:
00341                         extending = 1;
00342                         if (mfp->maxpgno != 0 &&
00343                             mfp->last_pgno >= mfp->maxpgno) {
00344                                 __db_err(dbenv, "%s: file limited to %lu pages",
00345                                     __memp_fn(dbmfp), (u_long)mfp->maxpgno);
00346                                 ret = ENOSPC;
00347                         } else
00348                                 *pgnoaddr = mfp->last_pgno + 1;
00349                         break;
00350                 case DB_MPOOL_CREATE:
00351                         if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) {
00352                                 __db_err(dbenv, "%s: file limited to %lu pages",
00353                                     __memp_fn(dbmfp), (u_long)mfp->maxpgno);
00354                                 ret = ENOSPC;
00355                         } else
00356                                 extending = *pgnoaddr > mfp->last_pgno;
00357                         break;
00358                 default:
00359                         ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
00360                         break;
00361                 }
00362                 MPOOL_SYSTEM_UNLOCK(dbenv);
00363                 if (ret != 0)
00364                         goto err;
00365 
00366                 /*
00367                  * !!!
00368                  * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
00369                  * not yet been initialized.
00370                  */
00371                 mf_offset = R_OFFSET(dbmp->reginfo, mfp);
00372                 n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
00373                 c_mp = dbmp->reginfo[n_cache].primary;
00374 
00375                 /* Allocate a new buffer header and data space. */
00376                 if ((ret = __memp_alloc(dbmp,
00377                     &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
00378                         goto err;
00379 #ifdef DIAGNOSTIC
00380                 if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
00381                         __db_err(dbenv,
00382                     "DB_MPOOLFILE->get: buffer data is NOT size_t aligned");
00383                         ret = __db_panic(dbenv, EINVAL);
00384                         goto err;
00385                 }
00386 #endif
00387                 /*
00388                  * If we are extending the file, we'll need the region lock
00389                  * again.
00390                  */
00391                 if (extending)
00392                         MPOOL_SYSTEM_LOCK(dbenv);
00393 
00394                 /*
00395                  * DB_MPOOL_NEW does not guarantee you a page unreferenced by
00396                  * any other thread of control.  (That guarantee is interesting
00397                  * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
00398                  * did not specify the page number, and so, may reasonably not
00399                  * have any way to lock the page outside of mpool.) Regardless,
00400                  * if we allocate the page, and some other thread of control
00401                  * requests the page by number, we will not detect that and the
00402                  * thread of control that allocated using DB_MPOOL_NEW may not
00403                  * have a chance to initialize the page.  (Note: we *could*
00404                  * detect this case if we set a flag in the buffer header which
00405                  * guaranteed that no gets of the page would succeed until the
00406                  * reference count went to 0, that is, until the creating page
00407                  * put the page.)  What we do guarantee is that if two threads
00408                  * of control are both doing DB_MPOOL_NEW calls, they won't
00409                  * collide, that is, they won't both get the same page.
00410                  *
00411                  * There's a possibility that another thread allocated the page
00412                  * we were planning to allocate while we were off doing buffer
00413                  * allocation.  We can do that by making sure the page number
00414                  * we were going to use is still available.  If it's not, then
00415                  * we check to see if the next available page number hashes to
00416                  * the same mpool region as the old one -- if it does, we can
00417                  * continue, otherwise, we have to start over.
00418                  */
00419                 if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
00420                         *pgnoaddr = mfp->last_pgno + 1;
00421                         if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
00422                                 /*
00423                                  * flags == DB_MPOOL_NEW, so extending is set
00424                                  * and we're holding the region locked.
00425                                  */
00426                                 MPOOL_SYSTEM_UNLOCK(dbenv);
00427 
00428                                 MPOOL_REGION_LOCK(
00429                                     dbenv, &dbmp->reginfo[n_cache]);
00430                                 __db_shalloc_free(
00431                                     &dbmp->reginfo[n_cache], alloc_bhp);
00432                                 c_mp->stat.st_pages--;
00433                                 MPOOL_REGION_UNLOCK(
00434                                     dbenv, &dbmp->reginfo[n_cache]);
00435 
00436                                 alloc_bhp = NULL;
00437                                 goto alloc;
00438                         }
00439                 }
00440 
00441                 /*
00442                  * We released the region lock, so another thread might have
00443                  * extended the file.  Update the last_pgno and initialize
00444                  * the file, as necessary, if we extended the file.
00445                  */
00446                 if (extending) {
00447                         if (*pgnoaddr > mfp->last_pgno)
00448                                 mfp->last_pgno = *pgnoaddr;
00449 
00450                         MPOOL_SYSTEM_UNLOCK(dbenv);
00451                         if (ret != 0)
00452                                 goto err;
00453                 }
00454                 goto hb_search;
00455         case SECOND_FOUND:
00456                 /*
00457                  * We allocated buffer space for the requested page, but then
00458                  * found the page in the buffer cache on our second check.
00459                  * That's OK -- we can use the page we found in the pool,
00460                  * unless DB_MPOOL_NEW is set.
00461                  *
00462                  * Free the allocated memory, we no longer need it.  Since we
00463                  * can't acquire the region lock while holding the hash bucket
00464                  * lock, we have to release the hash bucket and re-acquire it.
00465                  * That's OK, because we have the buffer pinned down.
00466                  */
00467                 MUTEX_UNLOCK(dbenv, hp->mtx_hash);
00468                 MPOOL_REGION_LOCK(dbenv, &dbmp->reginfo[n_cache]);
00469                 __db_shalloc_free(&dbmp->reginfo[n_cache], alloc_bhp);
00470                 c_mp->stat.st_pages--;
00471                 alloc_bhp = NULL;
00472                 MPOOL_REGION_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
00473 
00474                 /*
00475                  * We can't use the page we found in the pool if DB_MPOOL_NEW
00476                  * was set.  (For details, see the above comment beginning
00477                  * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
00478                  * any other thread of control".)  If DB_MPOOL_NEW is set, we
00479                  * release our pin on this particular buffer, and try to get
00480                  * another one.
00481                  */
00482                 if (flags == DB_MPOOL_NEW) {
00483                         --bhp->ref;
00484                         b_incr = 0;
00485                         goto alloc;
00486                 }
00487 
00488                 /* We can use the page -- get the bucket lock. */
00489                 MUTEX_LOCK(dbenv, hp->mtx_hash);
00490                 break;
00491         case SECOND_MISS:
00492                 /*
00493                  * We allocated buffer space for the requested page, and found
00494                  * the page still missing on our second pass through the buffer
00495                  * cache.  Instantiate the page.
00496                  */
00497                 bhp = alloc_bhp;
00498                 alloc_bhp = NULL;
00499 
00500                 /*
00501                  * Initialize all the BH and hash bucket fields so we can call
00502                  * __memp_bhfree if an error occurs.
00503                  *
00504                  * Append the buffer to the tail of the bucket list and update
00505                  * the hash bucket's priority.
00506                  */
00507                 b_incr = 1;
00508 
00509                 /*lint --e{668} (flexelint: bhp cannot be NULL). */
00510                 memset(bhp, 0, sizeof(BH));
00511                 bhp->ref = 1;
00512                 bhp->priority = UINT32_MAX;
00513                 bhp->pgno = *pgnoaddr;
00514                 bhp->mf_offset = mf_offset;
00515                 SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
00516 
00517                 /*
00518                  * Allocate the mutex.  This is the last BH initialization step,
00519                  * because it's the only one that can fail, and everything else
00520                  * must be set up or we can't jump to the err label because it
00521                  * will call __memp_bhfree.
00522                  */
00523                 if ((ret = __mutex_alloc(
00524                     dbenv, MTX_MPOOL_BUFFER, 0, &bhp->mtx_bh)) != 0)
00525                         goto err;
00526 
00527                 hp->hash_priority =
00528                     SH_TAILQ_FIRSTP(&hp->hash_bucket, __bh)->priority;
00529 
00530                 /* If we extended the file, make sure the page is never lost. */
00531                 if (extending) {
00532                         ++hp->hash_page_dirty;
00533                         F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
00534                 }
00535 
00536                 /*
00537                  * If we created the page, zero it out.  If we didn't create
00538                  * the page, read from the backing file.
00539                  *
00540                  * !!!
00541                  * DB_MPOOL_NEW doesn't call the pgin function.
00542                  *
00543                  * If DB_MPOOL_CREATE is used, then the application's pgin
00544                  * function has to be able to handle pages of 0's -- if it
00545                  * uses DB_MPOOL_NEW, it can detect all of its page creates,
00546                  * and not bother.
00547                  *
00548                  * If we're running in diagnostic mode, smash any bytes on the
00549                  * page that are unknown quantities for the caller.
00550                  *
00551                  * Otherwise, read the page into memory, optionally creating it
00552                  * if DB_MPOOL_CREATE is set.
00553                  */
00554                 if (extending) {
00555                         if (mfp->clear_len == DB_CLEARLEN_NOTSET)
00556                                 memset(bhp->buf, 0, mfp->stat.st_pagesize);
00557                         else {
00558                                 memset(bhp->buf, 0, mfp->clear_len);
00559 #if defined(DIAGNOSTIC) || defined(UMRW)
00560                                 memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
00561                                     mfp->stat.st_pagesize - mfp->clear_len);
00562 #endif
00563                         }
00564 
00565                         if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
00566                                 F_SET(bhp, BH_CALLPGIN);
00567 
00568                         ++mfp->stat.st_page_create;
00569                 } else {
00570                         F_SET(bhp, BH_TRASH);
00571                         ++mfp->stat.st_cache_miss;
00572                 }
00573 
00574                 /* Increment buffer count referenced by MPOOLFILE. */
00575                 MUTEX_LOCK(dbenv, mfp->mutex);
00576                 ++mfp->block_cnt;
00577                 MUTEX_UNLOCK(dbenv, mfp->mutex);
00578         }
00579 
00580         DB_ASSERT(bhp->ref != 0);
00581 
00582         /*
00583          * If we're the only reference, update buffer and bucket priorities.
00584          * We may be about to release the hash bucket lock, and everything
00585          * should be correct, first.  (We've already done this if we created
00586          * the buffer, so there is no need to do it again.)
00587          */
00588         if (state != SECOND_MISS && bhp->ref == 1) {
00589                 bhp->priority = UINT32_MAX;
00590                 SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
00591                 SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
00592                 hp->hash_priority =
00593                     SH_TAILQ_FIRSTP(&hp->hash_bucket, __bh)->priority;
00594         }
00595 
00596         /*
00597          * BH_TRASH --
00598          * The buffer we found may need to be filled from the disk.
00599          *
00600          * It's possible for the read function to fail, which means we fail as
00601          * well.  Note, the __memp_pgread() function discards and reacquires
00602          * the hash lock, so the buffer must be pinned down so that it cannot
00603          * move and its contents are unchanged.  Discard the buffer on failure
00604          * unless another thread is waiting on our I/O to complete.  It's OK to
00605          * leave the buffer around, as the waiting thread will see the BH_TRASH
00606          * flag set, and will also attempt to discard it.  If there's a waiter,
00607          * we need to decrement our reference count.
00608          */
00609         if (F_ISSET(bhp, BH_TRASH) &&
00610             (ret = __memp_pgread(dbmfp,
00611             hp->mtx_hash, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
00612                 goto err;
00613 
00614         /*
00615          * BH_CALLPGIN --
00616          * The buffer was processed for being written to disk, and now has
00617          * to be re-converted for use.
00618          */
00619         if (F_ISSET(bhp, BH_CALLPGIN)) {
00620                 if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
00621                         goto err;
00622                 F_CLR(bhp, BH_CALLPGIN);
00623         }
00624 #ifdef DIAGNOSTIC
00625         __memp_check_order(hp);
00626 #endif
00627 
00628         MUTEX_UNLOCK(dbenv, hp->mtx_hash);
00629 
00630 #ifdef DIAGNOSTIC
00631         /* Update the file's pinned reference count. */
00632         MPOOL_SYSTEM_LOCK(dbenv);
00633         ++dbmfp->pinref;
00634         MPOOL_SYSTEM_UNLOCK(dbenv);
00635 
00636         /*
00637          * We want to switch threads as often as possible, and at awkward
00638          * times.  Yield every time we get a new page to ensure contention.
00639          */
00640         if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
00641                 __os_yield(dbenv, 1);
00642 #endif
00643 
00644         *(void **)addrp = bhp->buf;
00645         return (0);
00646 
00647 err:    /*
00648          * Discard our reference.  If we're the only reference, discard the
00649          * the buffer entirely.  If we held a reference to a buffer, we are
00650          * also still holding the hash bucket mutex.
00651          */
00652         if (b_incr) {
00653                 if (bhp->ref == 1)
00654                         (void)__memp_bhfree(dbmp, hp, bhp, BH_FREE_FREEMEM);
00655                 else {
00656                         --bhp->ref;
00657                         MUTEX_UNLOCK(dbenv, hp->mtx_hash);
00658                 }
00659         }
00660 
00661         /* If alloc_bhp is set, free the memory. */
00662         if (alloc_bhp != NULL) {
00663                 MPOOL_REGION_LOCK(dbenv, &dbmp->reginfo[n_cache]);
00664                 __db_shalloc_free(&dbmp->reginfo[n_cache], alloc_bhp);
00665                 c_mp->stat.st_pages--;
00666                 MPOOL_REGION_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
00667         }
00668 
00669         return (ret);
00670 }