Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

mp_sync.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: mp_sync.c,v 12.11 2005/10/07 20:21:33 ubell Exp $
00008  */
00009 
00010 #include "db_config.h"
00011 
00012 #ifndef NO_SYSTEM_INCLUDES
00013 #include <sys/types.h>
00014 
00015 #include <stdlib.h>
00016 #endif
00017 
00018 #include "db_int.h"
00019 #include "dbinc/db_shash.h"
00020 #include "dbinc/log.h"
00021 #include "dbinc/mp.h"
00022 
/*
 * BH_TRACK --
 *	Enough information to re-find a buffer header after its hash bucket
 *	mutex has been dropped, and to sort the candidate buffers into
 *	file/page order before writing them.
 */
typedef struct {
	DB_MPOOL_HASH *track_hp;	/* Hash bucket. */

	roff_t    track_off;		/* Page file offset. */
	db_pgno_t track_pgno;		/* Page number. */
} BH_TRACK;
00029 
00030 static int __bhcmp __P((const void *, const void *));
00031 static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *, int));
00032 static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *));
00033 
/*
 * __memp_sync_pp --
 *	DB_ENV->memp_sync pre/post processing.
 *
 * Validates the environment, wraps the real sync in the standard
 * thread-tracking and replication entry/exit handling.
 *
 * PUBLIC: int __memp_sync_pp __P((DB_ENV *, DB_LSN *));
 */
int
__memp_sync_pp(dbenv, lsnp)
	DB_ENV *dbenv;
	DB_LSN *lsnp;
{
	DB_THREAD_INFO *ip;
	int ret;

	/* Fail fast if the environment panicked or mpool isn't configured. */
	PANIC_CHECK(dbenv);
	ENV_REQUIRES_CONFIG(dbenv,
	    dbenv->mp_handle, "memp_sync", DB_INIT_MPOOL);

	/*
	 * If no LSN is provided, flush the entire cache (reasonable usage
	 * even if there's no log subsystem configured).
	 */
	if (lsnp != NULL)
		ENV_REQUIRES_CONFIG(dbenv,
		    dbenv->lg_handle, "memp_sync", DB_INIT_LOG);

	/* Do the work inside the standard env/replication bracketing. */
	ENV_ENTER(dbenv, ip);
	REPLICATION_WRAP(dbenv, (__memp_sync(dbenv, lsnp)), ret);
	ENV_LEAVE(dbenv, ip);
	return (ret);
}
00065 
/*
 * __memp_sync --
 *	DB_ENV->memp_sync.
 *
 * Flush the cache, maintaining the pool-wide high-water LSN (mp->lsn)
 * so that a later call asking for an already-flushed LSN can return
 * without doing any I/O.  Returns 0 on success or an error from the
 * underlying flush.
 *
 * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *));
 */
int
__memp_sync(dbenv, lsnp)
	DB_ENV *dbenv;
	DB_LSN *lsnp;
{
	DB_MPOOL *dbmp;
	MPOOL *mp;
	int ret;

	dbmp = dbenv->mp_handle;
	mp = dbmp->reginfo[0].primary;

	/* If we've flushed to the requested LSN, return that information. */
	if (lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(dbenv);
		if (log_compare(lsnp, &mp->lsn) <= 0) {
			*lsnp = mp->lsn;

			MPOOL_SYSTEM_UNLOCK(dbenv);
			return (0);
		}
		MPOOL_SYSTEM_UNLOCK(dbenv);
	}

	/* Write all dirty buffers in the cache and fsync the files. */
	if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0)
		return (ret);

	/*
	 * Advance the high-water LSN.  Re-check under the region lock:
	 * another thread may have pushed it past ours while we flushed.
	 */
	if (lsnp != NULL) {
		MPOOL_SYSTEM_LOCK(dbenv);
		if (log_compare(lsnp, &mp->lsn) > 0)
			mp->lsn = *lsnp;
		MPOOL_SYSTEM_UNLOCK(dbenv);
	}

	return (0);
}
00108 
/*
 * __memp_fsync_pp --
 *	DB_MPOOLFILE->sync pre/post processing.
 *
 * Validates the handle, then runs the per-file sync inside the
 * standard thread-tracking and replication entry/exit handling.
 *
 * PUBLIC: int __memp_fsync_pp __P((DB_MPOOLFILE *));
 */
int
__memp_fsync_pp(dbmfp)
	DB_MPOOLFILE *dbmfp;
{
	DB_ENV *dbenv;
	DB_THREAD_INFO *ip;
	int ret;

	dbenv = dbmfp->dbenv;

	/* The handle must have been opened before it can be synced. */
	PANIC_CHECK(dbenv);
	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->sync");

	ENV_ENTER(dbenv, ip);
	REPLICATION_WRAP(dbenv, (__memp_fsync(dbmfp)), ret);
	ENV_LEAVE(dbenv, ip);
	return (ret);
}
00133 
00134 /*
00135  * __memp_fsync --
00136  *      DB_MPOOLFILE->sync.
00137  *
00138  * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
00139  */
00140 int
00141 __memp_fsync(dbmfp)
00142         DB_MPOOLFILE *dbmfp;
00143 {
00144         MPOOLFILE *mfp;
00145 
00146         mfp = dbmfp->mfp;
00147 
00148         /*
00149          * If this handle doesn't have a file descriptor that's open for
00150          * writing, or if the file is a temporary, or if the file hasn't
00151          * been written since it was flushed, there's no reason to proceed
00152          * further.
00153          */
00154         if (F_ISSET(dbmfp, MP_READONLY))
00155                 return (0);
00156 
00157         if (F_ISSET(dbmfp->mfp, MP_TEMP) || dbmfp->mfp->no_backing_file)
00158                 return (0);
00159 
00160         if (mfp->file_written == 0)
00161                 return (0);
00162 
00163         return (__memp_sync_int(dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
00164 }
00165 
00166 /*
00167  * __mp_xxx_fh --
00168  *      Return a file descriptor for DB 1.85 compatibility locking.
00169  *
00170  * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **));
00171  */
00172 int
00173 __mp_xxx_fh(dbmfp, fhp)
00174         DB_MPOOLFILE *dbmfp;
00175         DB_FH **fhp;
00176 {
00177         /*
00178          * This is a truly spectacular layering violation, intended ONLY to
00179          * support compatibility for the DB 1.85 DB->fd call.
00180          *
00181          * Sync the database file to disk, creating the file as necessary.
00182          *
00183          * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3).
00184          * The MP_READONLY test isn't interesting because we will either
00185          * already have a file descriptor (we opened the database file for
00186          * reading) or we aren't readonly (we created the database which
00187          * requires write privileges).  The MP_TEMP test isn't interesting
00188          * because we want to write to the backing file regardless so that
00189          * we get a file descriptor to return.
00190          */
00191         if ((*fhp = dbmfp->fhp) != NULL)
00192                 return (0);
00193 
00194         return (__memp_sync_int(dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
00195 }
00196 
/*
 * __memp_sync_int --
 *	Mpool sync internal function.
 *
 * Collects the dirty (and, for checkpoints, pinned) buffers from every
 * cache region, sorts them into file/page order and writes them, then
 * fsyncs the underlying files when the operation requires durability.
 *
 * dbmfp: if non-NULL, restrict the flush to this one file.
 * trickle_max: for DB_SYNC_TRICKLE, maximum number of buffers to write.
 * op: DB_SYNC_CACHE, DB_SYNC_FILE or DB_SYNC_TRICKLE.
 * wrotep: if non-NULL, set to the number of buffers written.
 *
 * PUBLIC: int __memp_sync_int __P((DB_ENV *,
 * PUBLIC:     DB_MPOOLFILE *, u_int32_t, db_sync_op, u_int32_t *));
 */
int
__memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
	DB_ENV *dbenv;
	DB_MPOOLFILE *dbmfp;
	u_int32_t trickle_max, *wrotep;
	db_sync_op op;
{
	BH *bhp;
	BH_TRACK *bharray;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp, *mp;
	MPOOLFILE *mfp;
	db_mutex_t mutex;
	roff_t last_mf_offset;
	u_int32_t ar_cnt, ar_max, i, n_cache, remaining, wrote;
	int filecnt, hb_lock, maxopenfd, maxwrite, maxwrite_sleep;
	int pass, ret, t_ret, wait_cnt, write_cnt;

	dbmp = dbenv->mp_handle;
	mp = dbmp->reginfo[0].primary;
	last_mf_offset = INVALID_ROFF;
	filecnt = pass = wrote = 0;

	/* Get shared configuration information. */
	MPOOL_SYSTEM_LOCK(dbenv);
	maxopenfd = mp->mp_maxopenfd;
	maxwrite = mp->mp_maxwrite;
	maxwrite_sleep = mp->mp_maxwrite_sleep;
	MPOOL_SYSTEM_UNLOCK(dbenv);

	/* Assume one dirty page per bucket. */
	ar_max = mp->nreg * mp->htab_buckets;
	if ((ret =
	    __os_malloc(dbenv, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
		return (ret);

	/*
	 * Walk each cache's list of buffers and mark all dirty buffers to be
	 * written and all pinned buffers to be potentially written, depending
	 * on our flags.
	 */
	for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
		c_mp = dbmp->reginfo[n_cache].primary;

		hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
		for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
			/*
			 * We can check for empty buckets before locking as we
			 * only care if the pointer is zero or non-zero.  We
			 * can ignore empty buckets because we only need write
			 * buffers that were dirty before we started.
			 */
			if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
				continue;

			MUTEX_LOCK(dbenv, hp->mtx_hash);
			for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
			    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
				/* Always ignore unreferenced, clean pages. */
				if (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))
					continue;

				/*
				 * Checkpoints have to wait on all pinned pages,
				 * as pages may be marked dirty when returned to
				 * the cache.
				 *
				 * File syncs only wait on pages both pinned and
				 * dirty.  (We don't care if pages are marked
				 * dirty when returned to the cache, that means
				 * there's another writing thread and flushing
				 * the cache for this handle is meaningless.)
				 */
				if (op == DB_SYNC_FILE &&
				    !F_ISSET(bhp, BH_DIRTY))
					continue;

				mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);

				/*
				 * Ignore in-memory files, even if they are
				 * temp files to whom a backing file has been
				 * allocated.
				 */
				if (mfp->no_backing_file ||
				    F_ISSET(mfp, MP_TEMP))
					continue;

				/*
				 * If we're flushing a specific file, see if
				 * this page is from that file.
				 */
				if (dbmfp != NULL && mfp != dbmfp->mfp)
					continue;

				/*
				 * Ignore files that aren't involved in DB's
				 * transactional operations during checkpoints.
				 */
				if (dbmfp == NULL && mfp->lsn_off == -1)
					continue;

				/* Track the buffer, we want it. */
				bharray[ar_cnt].track_hp = hp;
				bharray[ar_cnt].track_pgno = bhp->pgno;
				bharray[ar_cnt].track_off = bhp->mf_offset;
				ar_cnt++;

				/*
				 * If we run out of space, double and continue.
				 * Don't stop at trickle_max, we want to sort
				 * as large a sample set as possible in order
				 * to minimize disk seeks.
				 */
				if (ar_cnt >= ar_max) {
					if ((ret = __os_realloc(dbenv,
					    (ar_max * 2) * sizeof(BH_TRACK),
					    &bharray)) != 0)
						break;
					ar_max *= 2;
				}
			}
			MUTEX_UNLOCK(dbenv, hp->mtx_hash);

			/* A realloc failure aborts the whole walk. */
			if (ret != 0)
				goto err;
		}
	}

	/* If there are no buffers to write, we're done. */
	if (ar_cnt == 0)
		goto done;

	/*
	 * Write the buffers in file/page order, trying to reduce seeks by the
	 * filesystem and, when pages are smaller than filesystem block sizes,
	 * reduce the actual number of writes.
	 */
	if (ar_cnt > 1)
		qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);

	/*
	 * If we're trickling buffers, only write enough to reach the correct
	 * percentage.
	 */
	if (op == DB_SYNC_TRICKLE && ar_cnt > trickle_max)
		ar_cnt = trickle_max;

	/*
	 * Flush the log.  We have to ensure the log records reflecting the
	 * changes on the database pages we're writing have already made it
	 * to disk.  We still have to check the log each time we write a page
	 * (because pages we are about to write may be modified after we have
	 * flushed the log), but in general this will at least avoid any I/O
	 * on the log's part.
	 */
	if (LOGGING_ON(dbenv) && (ret = __log_flush(dbenv, NULL)) != 0)
		goto err;

	/*
	 * Walk the array, writing buffers.  When we write a buffer, we NULL
	 * out its hash bucket pointer so we don't process a slot more than
	 * once.
	 */
	for (i = pass = write_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
		/* Wrap around for another pass over buffers we skipped. */
		if (i >= ar_cnt) {
			i = 0;
			++pass;
			__os_sleep(dbenv, 1, 0);
		}
		if ((hp = bharray[i].track_hp) == NULL)
			continue;

		/* Lock the hash bucket and find the buffer. */
		mutex = hp->mtx_hash;
		MUTEX_LOCK(dbenv, mutex);
		for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
		    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
			if (bhp->pgno == bharray[i].track_pgno &&
			    bhp->mf_offset == bharray[i].track_off)
				break;

		/*
		 * If we can't find the buffer we're done, somebody else had
		 * to have written it.
		 *
		 * If the buffer isn't pinned or dirty, we're done, there's
		 * no work needed.
		 */
		if (bhp == NULL || (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))) {
			MUTEX_UNLOCK(dbenv, mutex);
			--remaining;
			bharray[i].track_hp = NULL;
			continue;
		}

		/*
		 * If the buffer is locked by another thread, ignore it, we'll
		 * come back to it.
		 *
		 * If the buffer is pinned and it's only the first or second
		 * time we have looked at it, ignore it, we'll come back to
		 * it.
		 *
		 * In either case, skip the buffer if we're not required to
		 * write it.
		 */
		if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) {
			MUTEX_UNLOCK(dbenv, mutex);
			if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) {
				--remaining;
				bharray[i].track_hp = NULL;
			}
			continue;
		}

		/*
		 * The buffer is either pinned or dirty.
		 *
		 * Set the sync wait-for count, used to count down outstanding
		 * references to this buffer as they are returned to the cache.
		 */
		bhp->ref_sync = bhp->ref;

		/* Pin the buffer into memory and lock it. */
		++bhp->ref;
		F_SET(bhp, BH_LOCKED);
		MUTEX_LOCK(dbenv, bhp->mtx_bh);

		/*
		 * Unlock the hash bucket and wait for the wait-for count to
		 * go to 0.   No new thread can acquire the buffer because we
		 * have it locked.
		 *
		 * If a thread attempts to re-pin a page, the wait-for count
		 * will never go to 0 (the thread spins on our buffer lock,
		 * while we spin on the thread's ref count).  Give up if we
		 * don't get the buffer in 3 seconds, we can try again later.
		 *
		 * If, when the wait-for count goes to 0, the buffer is found
		 * to be dirty, write it.
		 */
		MUTEX_UNLOCK(dbenv, mutex);
		for (wait_cnt = 1;
		    bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
			__os_sleep(dbenv, 1, 0);
		MUTEX_LOCK(dbenv, mutex);
		hb_lock = 1;

		/*
		 * If we've switched files, check to see if we're configured
		 * to close file descriptors.
		 */
		if (maxopenfd != 0 && bhp->mf_offset != last_mf_offset) {
			if (++filecnt >= maxopenfd) {
				filecnt = 0;
				if ((ret = __memp_close_flush_files(
				    dbenv, dbmp, 1)) != 0)
					break;
			}
			last_mf_offset = bhp->mf_offset;
		}

		/*
		 * If the ref_sync count has gone to 0, we're going to be done
		 * with this buffer no matter what happens.
		 */
		if (bhp->ref_sync == 0) {
			--remaining;
			bharray[i].track_hp = NULL;
		}

		/*
		 * If the ref_sync count has gone to 0 and the buffer is still
		 * dirty, we write it.  We only try to write the buffer once.
		 */
		if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
			hb_lock = 0;
			MUTEX_UNLOCK(dbenv, mutex);

			mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
			if ((ret = __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0)
				++wrote;
			else
				__db_err(dbenv, "%s: unable to flush page: %lu",
				    __memp_fns(dbmp, mfp), (u_long)bhp->pgno);

			/*
			 * Avoid saturating the disk, sleep once we've done
			 * some number of writes.
			 */
			if (maxwrite != 0 && ++write_cnt >= maxwrite) {
				write_cnt = 0;
				__os_sleep(dbenv, 0, (u_long)maxwrite_sleep);
			}
		}

		/*
		 * If ref_sync count never went to 0, the buffer was written
		 * by another thread, or the write failed, we still have the
		 * buffer locked.
		 *
		 * We may or may not currently hold the hash bucket mutex.  If
		 * the __memp_bhwrite -> __memp_pgwrite call was successful,
		 * then __memp_pgwrite will have swapped the buffer lock for
		 * the hash lock.  All other call paths will leave us without
		 * the hash bucket lock.
		 *
		 * The order of mutexes above was to acquire the buffer lock
		 * while holding the hash bucket lock.  Don't deadlock here,
		 * release the buffer lock and then acquire the hash bucket
		 * lock.
		 */
		if (F_ISSET(bhp, BH_LOCKED)) {
			F_CLR(bhp, BH_LOCKED);
			MUTEX_UNLOCK(dbenv, bhp->mtx_bh);

			if (!hb_lock)
				MUTEX_LOCK(dbenv, mutex);
		}

		/*
		 * Reset the ref_sync count regardless of our success, we're
		 * done with this buffer for now.
		 */
		bhp->ref_sync = 0;

		/* Discard our reference and unlock the bucket. */
		--bhp->ref;
		MUTEX_UNLOCK(dbenv, mutex);

		if (ret != 0)
			break;
	}

done:	/*
	 * If doing a checkpoint or flushing a file for the application, we
	 * have to force the pages to disk.  We don't do this as we go along
	 * because we want to give the OS as much time as possible to lazily
	 * flush, and because we have to flush files that might not even have
	 * had dirty buffers in the cache, so we have to walk the files list.
	 */
	if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) {
		if (dbmfp == NULL)
			ret = __memp_sync_files(dbenv, dbmp);
		else
			ret = __os_fsync(dbenv, dbmfp->fhp);
	}

	/* If we've opened files to flush pages, close them. */
	if ((t_ret = __memp_close_flush_files(dbenv, dbmp, 0)) != 0 && ret == 0)
		ret = t_ret;

err:	__os_free(dbenv, bharray);
	if (wrotep != NULL)
		*wrotep = wrote;

	return (ret);
}
00564 
/*
 * __memp_sync_files --
 *	Sync all the files in the environment, open or not.
 *
 * Called with no locks held; returns 0 or the first error encountered
 * while flushing.  Errors on individual files are reported but do not
 * stop the walk.
 */
static
int __memp_sync_files(dbenv, dbmp)
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
{
	DB_MPOOLFILE *dbmfp;
	MPOOL *mp;
	MPOOLFILE *mfp, *next_mfp;
	int need_discard_pass, ret, t_ret;

	need_discard_pass = ret = 0;
	mp = dbmp->reginfo[0].primary;

	/* Walk the shared list of files; skip ones with nothing to flush. */
	MPOOL_SYSTEM_LOCK(dbenv);
	for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
	    mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
		if (!mfp->file_written || mfp->no_backing_file ||
		    mfp->deadfile || F_ISSET(mfp, MP_TEMP))
			continue;
		/*
		 * Pin the MPOOLFILE structure into memory, and release the
		 * region mutex allowing us to walk the linked list.  We'll
		 * re-acquire that mutex to move to the next entry in the list.
		 *
		 * This works because we only need to flush current entries,
		 * we don't care about new entries being added, and the linked
		 * list is never re-ordered, a single pass is sufficient.  It
		 * requires MPOOLFILE structures removed before we get to them
		 * be flushed to disk, but that's nothing new, they could have
		 * been removed while checkpoint was running, too.
		 *
		 * Once we have the MPOOLFILE lock, re-check the MPOOLFILE is
		 * not being discarded.  (A thread removing the MPOOLFILE
		 * will: hold the MPOOLFILE mutex, set deadfile, drop the
		 * MPOOLFILE mutex and then acquire the region MUTEX to walk
		 * the linked list and remove the MPOOLFILE structure.)  Make
		 * sure the MPOOLFILE wasn't marked dead while we waited for
		 * the mutex.
		 */
		MUTEX_LOCK(dbenv, mfp->mutex);
		if (!mfp->file_written || mfp->deadfile) {
			MUTEX_UNLOCK(dbenv, mfp->mutex);
			continue;
		}
		MPOOL_SYSTEM_UNLOCK(dbenv);
		++mfp->mpf_cnt;
		MUTEX_UNLOCK(dbenv, mfp->mutex);

		/*
		 * Look for an already open, writeable handle (fsync doesn't
		 * work on read-only Windows handles).
		 */
		MUTEX_LOCK(dbenv, dbmp->mutex);
		for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
		    dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) {
			if (dbmfp->mfp != mfp || F_ISSET(dbmfp, MP_READONLY))
				continue;
			/*
			 * We don't want to hold the mutex while calling sync.
			 * Increment the DB_MPOOLFILE handle ref count to pin
			 * it into memory.
			 */
			++dbmfp->ref;
			break;
		}
		MUTEX_UNLOCK(dbenv, dbmp->mutex);

		/* If we don't find a handle we can use, open one. */
		if (dbmfp == NULL) {
			if ((t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0) {
				__db_err(dbenv,
				    "%s: unable to flush: %s", (char *)
				    R_ADDR(dbmp->reginfo, mfp->path_off),
				    db_strerror(t_ret));
				/* Remember the first failure, keep going. */
				if (ret == 0)
					ret = t_ret;
			}
		} else {
			if ((t_ret =
			    __os_fsync(dbenv, dbmfp->fhp)) != 0 && ret == 0)
				ret = t_ret;

			/* Drop the reference taken above. */
			if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0)
				ret = t_ret;
		}

		/*
		 * Re-acquire the region lock, we need it to move to the next
		 * MPOOLFILE.
		 *
		 * Re-acquire the MPOOLFILE mutex, we need it to modify the
		 * reference count.
		 */
		MPOOL_SYSTEM_LOCK(dbenv);
		MUTEX_LOCK(dbenv, mfp->mutex);
		--mfp->mpf_cnt;

		/*
		 * If we wrote the file and there are no open handles (or there
		 * is a single open handle, and it's the one we opened to write
		 * buffers during checkpoint), clear the file_written flag.  We
		 * do this so that applications opening thousands of files don't
		 * loop here opening and flushing those files during checkpoint.
		 *
		 * The danger here is if a buffer were to be written as part of
		 * a checkpoint, and then not be flushed to disk.  This cannot
		 * happen because we only clear file_written when there are no
		 * other users of the MPOOLFILE in the system, and, as we hold
		 * the region lock, no possibility of another thread of control
		 * racing with us to open a MPOOLFILE.
		 */
		if (mfp->mpf_cnt == 0 || (mfp->mpf_cnt == 1 &&
		    dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
			mfp->file_written = 0;

			/*
			 * We may be the last reference for a MPOOLFILE, as we
			 * weren't holding the MPOOLFILE mutex when flushing
			 * its buffers to disk.  If we can discard it, set
			 * a flag to schedule a clean-out pass.   (Not likely,
			 * I mean, what are the chances that there aren't any
			 * buffers in the pool?  Regardless, it might happen.)
			 */
			if (mfp->mpf_cnt == 0 && mfp->block_cnt == 0)
				need_discard_pass = 1;
		}

		/* Unlock the MPOOLFILE, and move to the next entry. */
		MUTEX_UNLOCK(dbenv, mfp->mutex);
	}

	/*
	 * We exit the loop holding the region lock.
	 *
	 * We may need to do a last pass through the MPOOLFILE list -- if we
	 * were the last reference to an MPOOLFILE, we need to clean it out.
	 */
	if (need_discard_pass)
		for (mfp = SH_TAILQ_FIRST(
		    &mp->mpfq, __mpoolfile); mfp != NULL; mfp = next_mfp) {
			next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile);

			/*
			 * Do a fast check -- we can check for zero/non-zero
			 * without a mutex on the MPOOLFILE.  If likely to
			 * succeed, lock the MPOOLFILE down and look for real.
			 */
			if (mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
				continue;

			MUTEX_LOCK(dbenv, mfp->mutex);
			if (mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
				(void)__memp_mf_discard(dbmp, mfp);
			else
				MUTEX_UNLOCK(dbenv, mfp->mutex);
		}
	MPOOL_SYSTEM_UNLOCK(dbenv);

	return (ret);
}
00729 
00730 /*
00731  * __memp_mf_sync --
00732  *      Flush an MPOOLFILE, when no currently open handle is available.
00733  *
00734  * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
00735  */
00736 int
00737 __memp_mf_sync(dbmp, mfp, region_locked)
00738         DB_MPOOL *dbmp;
00739         MPOOLFILE *mfp;
00740         int region_locked;
00741 {
00742         DB_ENV *dbenv;
00743         DB_FH *fhp;
00744         int ret, t_ret;
00745         char *rpath;
00746 
00747         dbenv = dbmp->dbenv;
00748 
00749         /*
00750          * We need to be holding the region lock: we're using the path name
00751          * and __memp_nameop might try and rename the file.
00752          */
00753         if (!region_locked)
00754                 MPOOL_SYSTEM_LOCK(dbenv);
00755 
00756         if ((ret = __db_appname(dbenv, DB_APP_DATA,
00757             R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) {
00758                 if ((ret = __os_open(dbenv, rpath, 0, 0, &fhp)) == 0) {
00759                         ret = __os_fsync(dbenv, fhp);
00760                         if ((t_ret =
00761                             __os_closehandle(dbenv, fhp)) != 0 && ret == 0)
00762                                 ret = t_ret;
00763                 }
00764                 __os_free(dbenv, rpath);
00765         }
00766 
00767         if (!region_locked)
00768                 MPOOL_SYSTEM_UNLOCK(dbenv);
00769 
00770         return (ret);
00771 }
00772 
00773 /*
00774  * __memp_close_flush_files --
00775  *      Close files opened only to flush buffers.
00776  */
00777 static int
00778 __memp_close_flush_files(dbenv, dbmp, dosync)
00779         DB_ENV *dbenv;
00780         DB_MPOOL *dbmp;
00781         int dosync;
00782 {
00783         DB_MPOOLFILE *dbmfp;
00784         MPOOLFILE *mfp;
00785         int ret;
00786 
00787         /*
00788          * The routine exists because we must close files opened by sync to
00789          * flush buffers.  There are two cases: first, extent files have to
00790          * be closed so they may be removed when empty.  Second, regular
00791          * files have to be closed so we don't run out of descriptors (for
00792          * example, an application partitioning its data into databases
00793          * based on timestamps, so there's a continually increasing set of
00794          * files).
00795          *
00796          * We mark files opened in the __memp_bhwrite() function with the
00797          * MP_FLUSH flag.  Here we walk through our file descriptor list,
00798          * and, if a file was opened by __memp_bhwrite(), we close it.
00799          */
00800 retry:  MUTEX_LOCK(dbenv, dbmp->mutex);
00801         for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
00802             dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
00803                 if (F_ISSET(dbmfp, MP_FLUSH)) {
00804                         F_CLR(dbmfp, MP_FLUSH);
00805                         MUTEX_UNLOCK(dbenv, dbmp->mutex);
00806                         if (dosync) {
00807                                 /*
00808                                  * If we have the only open handle on the file,
00809                                  * clear the dirty flag so we don't re-open and
00810                                  * sync it again when discarding the MPOOLFILE
00811                                  * structure.  Clear the flag before the sync
00812                                  * so can't race with a thread writing the file.
00813                                  */
00814                                 mfp = dbmfp->mfp;
00815                                 if (mfp->mpf_cnt == 1) {
00816                                         MUTEX_LOCK(dbenv, mfp->mutex);
00817                                         if (mfp->mpf_cnt == 1)
00818                                                 mfp->file_written = 0;
00819                                         MUTEX_UNLOCK(dbenv, mfp->mutex);
00820                                 }
00821                                 if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0)
00822                                         return (ret);
00823                         }
00824                         if ((ret = __memp_fclose(dbmfp, 0)) != 0)
00825                                 return (ret);
00826                         goto retry;
00827                 }
00828         MUTEX_UNLOCK(dbenv, dbmp->mutex);
00829 
00830         return (0);
00831 }
00832 
00833 static int
00834 __bhcmp(p1, p2)
00835         const void *p1, *p2;
00836 {
00837         BH_TRACK *bhp1, *bhp2;
00838 
00839         bhp1 = (BH_TRACK *)p1;
00840         bhp2 = (BH_TRACK *)p2;
00841 
00842         /* Sort by file (shared memory pool offset). */
00843         if (bhp1->track_off < bhp2->track_off)
00844                 return (-1);
00845         if (bhp1->track_off > bhp2->track_off)
00846                 return (1);
00847 
00848         /*
00849          * !!!
00850          * Defend against badly written quicksort code calling the comparison
00851          * function with two identical pointers (e.g., WATCOM C++ (Power++)).
00852          */
00853         if (bhp1->track_pgno < bhp2->track_pgno)
00854                 return (-1);
00855         if (bhp1->track_pgno > bhp2->track_pgno)
00856                 return (1);
00857         return (0);
00858 }

Generated on Sun Dec 25 12:14:41 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2