Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

mp_bh.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: mp_bh.c,v 12.11 2005/10/20 18:57:07 bostic Exp $
00008  */
00009 
00010 #include "db_config.h"
00011 
00012 #ifndef NO_SYSTEM_INCLUDES
00013 #include <sys/types.h>
00014 
00015 #include <string.h>
00016 #endif
00017 
00018 #include "db_int.h"
00019 #include "dbinc/db_shash.h"
00020 #include "dbinc/mp.h"
00021 #include "dbinc/log.h"
00022 #include "dbinc/db_page.h"
00023 
00024 static int __memp_pgwrite
00025                __P((DB_ENV *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *));
00026 
00027 /*
00028  * __memp_bhwrite --
00029  *      Write the page associated with a given buffer header.
00030  *
00031  * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *,
00032  * PUBLIC:      DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
00033  */
00034 int
00035 __memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
00036         DB_MPOOL *dbmp;
00037         DB_MPOOL_HASH *hp;
00038         MPOOLFILE *mfp;
00039         BH *bhp;
00040         int open_extents;
00041 {
00042         DB_ENV *dbenv;
00043         DB_MPOOLFILE *dbmfp;
00044         DB_MPREG *mpreg;
00045         int ret;
00046 
00047         dbenv = dbmp->dbenv;
00048 
00049         /*
00050          * If the file has been removed or is a closed temporary file, we're
00051          * done -- the page-write function knows how to handle the fact that
00052          * we don't have (or need!) any real file descriptor information.
00053          */
00054         if (mfp->deadfile)
00055                 return (__memp_pgwrite(dbenv, NULL, hp, bhp));
00056 
00057         /*
00058          * Walk the process' DB_MPOOLFILE list and find a file descriptor for
00059          * the file.  We also check that the descriptor is open for writing.
00060          */
00061         MUTEX_LOCK(dbenv, dbmp->mutex);
00062         for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
00063             dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
00064                 if (dbmfp->mfp == mfp && !F_ISSET(dbmfp, MP_READONLY)) {
00065                         ++dbmfp->ref;
00066                         break;
00067                 }
00068         MUTEX_UNLOCK(dbenv, dbmp->mutex);
00069 
00070         if (dbmfp != NULL) {
00071                 /*
00072                  * Temporary files may not have been created.  We only handle
00073                  * temporary files in this path, because only the process that
00074                  * created a temporary file will ever flush buffers to it.
00075                  */
00076                 if (dbmfp->fhp == NULL) {
00077                         /* We may not be allowed to create backing files. */
00078                         if (mfp->no_backing_file)
00079                                 return (EPERM);
00080 
00081                         MUTEX_LOCK(dbenv, dbmp->mutex);
00082                         if (dbmfp->fhp == NULL)
00083                                 ret = __db_appname(dbenv, DB_APP_TMP, NULL,
00084                                     F_ISSET(dbenv, DB_ENV_DIRECT_DB) ?
00085                                     DB_OSO_DIRECT : 0, &dbmfp->fhp, NULL);
00086                         else
00087                                 ret = 0;
00088                         MUTEX_UNLOCK(dbenv, dbmp->mutex);
00089                         if (ret != 0) {
00090                                 __db_err(dbenv,
00091                                     "unable to create temporary backing file");
00092                                 return (ret);
00093                         }
00094                 }
00095 
00096                 goto pgwrite;
00097         }
00098 
00099         /*
00100          * There's no file handle for this file in our process.
00101          *
00102          * !!!
00103          * It's the caller's choice if we're going to open extent files.
00104          */
00105         if (!open_extents && F_ISSET(mfp, MP_EXTENT))
00106                 return (EPERM);
00107 
00108         /*
00109          * !!!
00110          * Don't try to attach to temporary files.  There are two problems in
00111          * trying to do that.  First, if we have different privileges than the
00112          * process that "owns" the temporary file, we might create the backing
00113          * disk file such that the owning process couldn't read/write its own
00114          * buffers, e.g., memp_trickle running as root creating a file owned
00115          * as root, mode 600.  Second, if the temporary file has already been
00116          * created, we don't have any way of finding out what its real name is,
00117          * and, even if we did, it was already unlinked (so that it won't be
00118          * left if the process dies horribly).  This decision causes a problem,
00119          * however: if the temporary file consumes the entire buffer cache,
00120          * and the owner doesn't flush the buffers to disk, we could end up
00121          * with resource starvation, and the memp_trickle thread couldn't do
00122          * anything about it.  That's a pretty unlikely scenario, though.
00123          *
00124          * Note we should never get here when the temporary file in question
00125          * has already been closed in another process, in which case it should
00126          * be marked dead.
00127          */
00128         if (F_ISSET(mfp, MP_TEMP) || mfp->no_backing_file)
00129                 return (EPERM);
00130 
00131         /*
00132          * It's not a page from a file we've opened.  If the file requires
00133          * application-specific input/output processing, see if this process
00134          * has ever registered information as to how to write this type of
00135          * file.  If not, there's nothing we can do.
00136          */
00137         if (mfp->ftype != 0 && mfp->ftype != DB_FTYPE_SET) {
00138                 MUTEX_LOCK(dbenv, dbmp->mutex);
00139                 for (mpreg = LIST_FIRST(&dbmp->dbregq);
00140                     mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
00141                         if (mpreg->ftype == mfp->ftype)
00142                                 break;
00143                 MUTEX_UNLOCK(dbenv, dbmp->mutex);
00144                 if (mpreg == NULL)
00145                         return (EPERM);
00146         }
00147 
00148         /*
00149          * Try and open the file, specifying the known underlying shared area.
00150          *
00151          * !!!
00152          * There's no negative cache, so we may repeatedly try and open files
00153          * that we have previously tried (and failed) to open.
00154          */
00155         if ((ret = __memp_fcreate(dbenv, &dbmfp)) != 0)
00156                 return (ret);
00157         if ((ret = __memp_fopen(dbmfp,
00158             mfp, NULL, DB_DURABLE_UNKNOWN, 0, mfp->stat.st_pagesize)) != 0) {
00159                 (void)__memp_fclose(dbmfp, 0);
00160 
00161                 /*
00162                  * Ignore any error if the file is marked dead, assume the file
00163                  * was removed from under us.
00164                  */
00165                 if (!mfp->deadfile)
00166                         return (ret);
00167 
00168                 dbmfp = NULL;
00169         }
00170 
00171 pgwrite:
00172         ret = __memp_pgwrite(dbenv, dbmfp, hp, bhp);
00173         if (dbmfp == NULL)
00174                 return (ret);
00175 
00176         /*
00177          * Discard our reference, and, if we're the last reference, make sure
00178          * the file eventually gets closed.
00179          */
00180         MUTEX_LOCK(dbenv, dbmp->mutex);
00181         if (dbmfp->ref == 1)
00182                 F_SET(dbmfp, MP_FLUSH);
00183         else
00184                 --dbmfp->ref;
00185         MUTEX_UNLOCK(dbenv, dbmp->mutex);
00186 
00187         return (ret);
00188 }
00189 
00190 /*
00191  * __memp_pgread --
00192  *      Read a page from a file.
00193  *
00194  * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, db_mutex_t, BH *, int));
00195  */
00196 int
00197 __memp_pgread(dbmfp, mutex, bhp, can_create)
00198         DB_MPOOLFILE *dbmfp;
00199         db_mutex_t mutex;
00200         BH *bhp;
00201         int can_create;
00202 {
00203         DB_ENV *dbenv;
00204         MPOOLFILE *mfp;
00205         size_t len, nr;
00206         u_int32_t pagesize;
00207         int ret;
00208 
00209         dbenv = dbmfp->dbenv;
00210         mfp = dbmfp->mfp;
00211         pagesize = mfp->stat.st_pagesize;
00212 
00213         /* We should never be called with a dirty or a locked buffer. */
00214         DB_ASSERT(!F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED));
00215 
00216         /* Lock the buffer and swap the hash bucket lock for the buffer lock. */
00217         F_SET(bhp, BH_LOCKED | BH_TRASH);
00218         MUTEX_LOCK(dbenv, bhp->mtx_bh);
00219         MUTEX_UNLOCK(dbenv, mutex);
00220 
00221         /*
00222          * Temporary files may not yet have been created.  We don't create
00223          * them now, we create them when the pages have to be flushed.
00224          */
00225         nr = 0;
00226         if (dbmfp->fhp != NULL)
00227                 if ((ret = __os_io(dbenv, DB_IO_READ,
00228                     dbmfp->fhp, bhp->pgno, pagesize, bhp->buf, &nr)) != 0)
00229                         goto err;
00230 
00231         /*
00232          * The page may not exist; if it doesn't, nr may well be 0, but we
00233          * expect the underlying OS calls not to return an error code in
00234          * this case.
00235          */
00236         if (nr < pagesize) {
00237                 /*
00238                  * Don't output error messages for short reads.  In particular,
00239                  * DB recovery processing may request pages never written to
00240                  * disk or for which only some part have been written to disk,
00241                  * in which case we won't find the page.  The caller must know
00242                  * how to handle the error.
00243                  */
00244                 if (can_create == 0) {
00245                         ret = DB_PAGE_NOTFOUND;
00246                         goto err;
00247                 }
00248 
00249                 /* Clear any bytes that need to be cleared. */
00250                 len = mfp->clear_len == DB_CLEARLEN_NOTSET ?
00251                     pagesize : mfp->clear_len;
00252                 memset(bhp->buf, 0, len);
00253 
00254 #if defined(DIAGNOSTIC) || defined(UMRW)
00255                 /*
00256                  * If we're running in diagnostic mode, corrupt any bytes on
00257                  * the page that are unknown quantities for the caller.
00258                  */
00259                 if (len < pagesize)
00260                         memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
00261 #endif
00262                 ++mfp->stat.st_page_create;
00263         } else
00264                 ++mfp->stat.st_page_in;
00265 
00266         /* Call any pgin function. */
00267         ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
00268 
00269         /* Unlock the buffer and reacquire the hash bucket lock. */
00270 err:    MUTEX_UNLOCK(dbenv, bhp->mtx_bh);
00271         MUTEX_LOCK(dbenv, mutex);
00272 
00273         /*
00274          * If no errors occurred, the data is now valid, clear the BH_TRASH
00275          * flag; regardless, clear the lock bit and let other threads proceed.
00276          */
00277         F_CLR(bhp, BH_LOCKED);
00278         if (ret == 0)
00279                 F_CLR(bhp, BH_TRASH);
00280 
00281         return (ret);
00282 }
00283 
00284 /*
00285  * __memp_pgwrite --
00286  *      Write a page to a file.
00287  */
00288 static int
00289 __memp_pgwrite(dbenv, dbmfp, hp, bhp)
00290         DB_ENV *dbenv;
00291         DB_MPOOLFILE *dbmfp;
00292         DB_MPOOL_HASH *hp;
00293         BH *bhp;
00294 {
00295         DB_LSN lsn;
00296         MPOOLFILE *mfp;
00297         size_t nw;
00298         int callpgin, ret;
00299 
00300         mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
00301         callpgin = ret = 0;
00302 
00303         /*
00304          * We should never be called with a clean or trash buffer.
00305          * The sync code does call us with already locked buffers.
00306          */
00307         DB_ASSERT(F_ISSET(bhp, BH_DIRTY));
00308         DB_ASSERT(!F_ISSET(bhp, BH_TRASH));
00309 
00310         /*
00311          * If we have not already traded the hash bucket lock for the buffer
00312          * lock, do so now.
00313          */
00314         if (!F_ISSET(bhp, BH_LOCKED)) {
00315                 F_SET(bhp, BH_LOCKED);
00316                 MUTEX_LOCK(dbenv, bhp->mtx_bh);
00317                 MUTEX_UNLOCK(dbenv, hp->mtx_hash);
00318         }
00319 
00320         /*
00321          * It's possible that the underlying file doesn't exist, either
00322          * because of an outright removal or because it was a temporary
00323          * file that's been closed.
00324          *
00325          * !!!
00326          * Once we pass this point, we know that dbmfp and mfp aren't NULL,
00327          * and that we have a valid file reference.
00328          */
00329         if (mfp == NULL || mfp->deadfile)
00330                 goto file_dead;
00331 
00332         /*
00333          * If the page is in a file for which we have LSN information, we have
00334          * to ensure the appropriate log records are on disk.
00335          */
00336         if (LOGGING_ON(dbenv) && mfp->lsn_off != -1 &&
00337             !IS_CLIENT_PGRECOVER(dbenv)) {
00338                 memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
00339                 if (!IS_NOT_LOGGED_LSN(lsn) &&
00340                     (ret = __log_flush(dbenv, &lsn)) != 0)
00341                         goto err;
00342         }
00343 
00344 #ifdef DIAGNOSTIC
00345         /*
00346          * Verify write-ahead logging semantics.
00347          *
00348          * !!!
00349          * Two special cases.  There is a single field on the meta-data page,
00350          * the last-page-number-in-the-file field, for which we do not log
00351          * changes.  If the page was originally created in a database that
00352          * didn't have logging turned on, we can see a page marked dirty but
00353          * for which no corresponding log record has been written.  However,
00354          * the only way that a page can be created for which there isn't a
00355          * previous log record and valid LSN is when the page was created
00356          * without logging turned on, and so we check for that special-case
00357          * LSN value.
00358          *
00359          * Second, when a client is reading database pages from a master
00360          * during an internal backup, we may get pages modified after
00361          * the current end-of-log.
00362          */
00363         if (LOGGING_ON(dbenv) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf)) &&
00364             !IS_CLIENT_PGRECOVER(dbenv)) {
00365                 /*
00366                  * There is a potential race here.  If we are in the midst of
00367                  * switching log files, it's possible we could test against the
00368                  * old file and the new offset in the log region's LSN.  If we
00369                  * fail the first test, acquire the log mutex and check again.
00370                  */
00371                 DB_LOG *dblp;
00372                 LOG *lp;
00373 
00374                 dblp = dbenv->lg_handle;
00375                 lp = dblp->reginfo.primary;
00376                 if (!lp->db_log_inmemory &&
00377                     log_compare(&lp->s_lsn, &LSN(bhp->buf)) <= 0) {
00378                         MUTEX_LOCK(dbenv, lp->mtx_flush);
00379                         DB_ASSERT(log_compare(&lp->s_lsn, &LSN(bhp->buf)) > 0);
00380                         MUTEX_UNLOCK(dbenv, lp->mtx_flush);
00381                 }
00382         }
00383 #endif
00384 
00385         /*
00386          * Call any pgout function.  We set the callpgin flag so that we flag
00387          * that the contents of the buffer will need to be passed through pgin
00388          * before they are reused.
00389          */
00390         if (mfp->ftype != 0 && !F_ISSET(bhp, BH_CALLPGIN)) {
00391                 callpgin = 1;
00392                 if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
00393                         goto err;
00394         }
00395 
00396         /* Write the page. */
00397         if ((ret = __os_io(dbenv, DB_IO_WRITE, dbmfp->fhp,
00398             bhp->pgno, mfp->stat.st_pagesize, bhp->buf, &nw)) != 0) {
00399                 __db_err(dbenv, "%s: write failed for page %lu",
00400                     __memp_fn(dbmfp), (u_long)bhp->pgno);
00401                 goto err;
00402         }
00403         ++mfp->stat.st_page_out;
00404 
00405 err:
00406 file_dead:
00407         /*
00408          * !!!
00409          * Once we pass this point, dbmfp and mfp may be NULL, we may not have
00410          * a valid file reference.
00411          *
00412          * Unlock the buffer and reacquire the hash lock.
00413          */
00414         MUTEX_UNLOCK(dbenv, bhp->mtx_bh);
00415         MUTEX_LOCK(dbenv, hp->mtx_hash);
00416 
00417         /*
00418          * If we rewrote the page, it will need processing by the pgin
00419          * routine before reuse.
00420          */
00421         if (callpgin)
00422                 F_SET(bhp, BH_CALLPGIN);
00423 
00424         /*
00425          * Update the hash bucket statistics, reset the flags.
00426          * If we were successful, the page is no longer dirty.
00427          */
00428         if (ret == 0) {
00429                 DB_ASSERT(hp->hash_page_dirty != 0);
00430                 --hp->hash_page_dirty;
00431 
00432                 F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
00433         }
00434 
00435         /* Regardless, clear any sync wait-for count and remove our lock. */
00436         bhp->ref_sync = 0;
00437         F_CLR(bhp, BH_LOCKED);
00438 
00439         return (ret);
00440 }
00441 
00442 /*
00443  * __memp_pg --
00444  *      Call the pgin/pgout routine.
00445  *
00446  * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int));
00447  */
00448 int
00449 __memp_pg(dbmfp, bhp, is_pgin)
00450         DB_MPOOLFILE *dbmfp;
00451         BH *bhp;
00452         int is_pgin;
00453 {
00454         DBT dbt, *dbtp;
00455         DB_ENV *dbenv;
00456         DB_MPOOL *dbmp;
00457         DB_MPREG *mpreg;
00458         MPOOLFILE *mfp;
00459         int ftype, ret;
00460 
00461         dbenv = dbmfp->dbenv;
00462         dbmp = dbenv->mp_handle;
00463         mfp = dbmfp->mfp;
00464 
00465         if ((ftype = mfp->ftype) == DB_FTYPE_SET)
00466                 mpreg = dbmp->pg_inout;
00467         else {
00468                 MUTEX_LOCK(dbenv, dbmp->mutex);
00469                 for (mpreg = LIST_FIRST(&dbmp->dbregq);
00470                     mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
00471                         if (ftype == mpreg->ftype)
00472                                 break;
00473                 MUTEX_UNLOCK(dbenv, dbmp->mutex);
00474         }
00475         if (mpreg == NULL)
00476                 return (0);
00477 
00478         if (mfp->pgcookie_len == 0)
00479                 dbtp = NULL;
00480         else {
00481                 dbt.size = (u_int32_t)mfp->pgcookie_len;
00482                 dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off);
00483                 dbtp = &dbt;
00484         }
00485 
00486         if (is_pgin) {
00487                 if (mpreg->pgin != NULL &&
00488                     (ret = mpreg->pgin(dbenv, bhp->pgno, bhp->buf, dbtp)) != 0)
00489                         goto err;
00490         } else
00491                 if (mpreg->pgout != NULL &&
00492                     (ret = mpreg->pgout(dbenv, bhp->pgno, bhp->buf, dbtp)) != 0)
00493                         goto err;
00494 
00495         return (0);
00496 
00497 err:    __db_err(dbenv, "%s: %s failed for page %lu",
00498             __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
00499         return (ret);
00500 }
00501 
00502 /*
00503  * __memp_bhfree --
00504  *      Free a bucket header and its referenced data.
00505  *
00506  * PUBLIC: int __memp_bhfree
00507  * PUBLIC:     __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, u_int32_t));
00508  */
00509 int
00510 __memp_bhfree(dbmp, hp, bhp, flags)
00511         DB_MPOOL *dbmp;
00512         DB_MPOOL_HASH *hp;
00513         BH *bhp;
00514         u_int32_t flags;
00515 {
00516         DB_ENV *dbenv;
00517         MPOOL *c_mp, *mp;
00518         MPOOLFILE *mfp;
00519         u_int32_t n_cache;
00520         int ret, t_ret;
00521 
00522         /*
00523          * Assumes the hash bucket is locked and the MPOOL is not.
00524          */
00525         dbenv = dbmp->dbenv;
00526         mp = dbmp->reginfo[0].primary;
00527         n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno);
00528 
00529         /*
00530          * Delete the buffer header from the hash bucket queue and reset
00531          * the hash bucket's priority, if necessary.
00532          */
00533         SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
00534         if (bhp->priority == hp->hash_priority)
00535                 hp->hash_priority =
00536                     SH_TAILQ_EMPTY(&hp->hash_bucket) ?
00537                     0 : SH_TAILQ_FIRSTP(&hp->hash_bucket, __bh)->priority;
00538 #ifdef DIAGNOSTIC
00539         __memp_check_order(hp);
00540 #endif
00541 
00542         /*
00543          * Discard the hash bucket's mutex, it's no longer needed, and
00544          * we don't want to be holding it when acquiring other locks.
00545          */
00546         if (!LF_ISSET(BH_FREE_UNLOCKED))
00547                 MUTEX_UNLOCK(dbenv, hp->mtx_hash);
00548 
00549         /*
00550          * Find the underlying MPOOLFILE and decrement its reference count.
00551          * If this is its last reference, remove it.
00552          */
00553         mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
00554         MUTEX_LOCK(dbenv, mfp->mutex);
00555         if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
00556                 ret = __memp_mf_discard(dbmp, mfp);
00557         else {
00558                 ret = 0;
00559                 MUTEX_UNLOCK(dbenv, mfp->mutex);
00560         }
00561 
00562         /*
00563          * Free the associated mutex.
00564          *
00565          * XXX
00566          * This is wrong.  We fast-path the allocation of replacement buffers
00567          * by checking the required size, we shouldn't reallocate the mutex in
00568          * that case, either.  (Note that we should probably reset the mutex
00569          * statistics in case we re-use the mutex, though.)
00570          */
00571         if ((t_ret = __mutex_free(dbenv, &bhp->mtx_bh)) != 0 && ret == 0)
00572                 ret = t_ret;
00573 
00574         /*
00575          * If we're not reusing the buffer immediately, free the buffer for
00576          * real.
00577          */
00578         if (LF_ISSET(BH_FREE_FREEMEM)) {
00579                 MPOOL_REGION_LOCK(dbenv, &dbmp->reginfo[n_cache]);
00580 
00581                 __db_shalloc_free(&dbmp->reginfo[n_cache], bhp);
00582                 c_mp = dbmp->reginfo[n_cache].primary;
00583                 c_mp->stat.st_pages--;
00584 
00585                 MPOOL_REGION_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
00586         }
00587 
00588         return (ret);
00589 }

Generated on Sun Dec 25 12:14:41 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2