Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

env_region.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: env_region.c,v 12.13 2005/10/21 19:13:01 bostic Exp $
00008  */
00009 
00010 #include "db_config.h"
00011 
00012 #ifndef NO_SYSTEM_INCLUDES
00013 #include <sys/types.h>
00014 
00015 #if TIME_WITH_SYS_TIME
00016 #include <sys/time.h>
00017 #include <time.h>
00018 #else
00019 #if HAVE_SYS_TIME_H
00020 #include <sys/time.h>
00021 #else
00022 #include <time.h>
00023 #endif
00024 #endif
00025 
00026 #include <string.h>
00027 #endif
00028 
00029 #include "db_int.h"
00030 #include "dbinc/db_shash.h"
00031 #include "dbinc/crypto.h"
00032 #include "dbinc/mp.h"
00033 
00034 static void __db_des_destroy __P((DB_ENV *, REGION *));
00035 static int  __db_des_get __P((DB_ENV *, REGINFO *, REGINFO *, REGION **));
00036 static int  __db_e_remfile __P((DB_ENV *));
00037 static int  __db_faultmem __P((DB_ENV *, void *, size_t, int));
00038 
00039 /*
00040  * __db_e_attach --
00041  *      Join/create the environment; on success dbenv->reginfo is set.
00042  *
00043  * PUBLIC: int __db_e_attach __P((DB_ENV *, u_int32_t *));
00044  */
00045 int
00046 __db_e_attach(dbenv, init_flagsp)
00047         DB_ENV *dbenv;
00048         u_int32_t *init_flagsp;
00049 {
00050         REGENV *renv;
00051         REGENV_REF ref;
00052         REGINFO *infop;
00053         REGION *rp, tregion;
00054         size_t size;
00055         size_t nrw;
00056         u_int32_t bytes, i, mbytes, nregions;
00057         u_int retry_cnt;
00058         int majver, minver, patchver, ret, segid;
00059         char buf[sizeof(DB_REGION_FMT) + 20];
00060 
00061         /* Initialization */
00062         retry_cnt = 0;
00063 
00064         /* Repeated initialization. */
00065 loop:   renv = NULL;
00066 
00067         /* Set up the DB_ENV's REG_INFO structure. */
00068         if ((ret = __os_calloc(dbenv, 1, sizeof(REGINFO), &infop)) != 0)
00069                 return (ret);
00070         infop->dbenv = dbenv;
00071         infop->type = REGION_TYPE_ENV;
00072         infop->id = REGION_ID_ENV;
00073         infop->flags = REGION_JOIN_OK;
00074         if (F_ISSET(dbenv, DB_ENV_CREATE))
00075                 F_SET(infop, REGION_CREATE_OK);
00076 
00077         /*
00078          * We have to single-thread the creation of the REGENV region.  Once
00079          * it exists, we can serialize using region mutexes, but until then
00080          * we have to be the only player in the game.
00081          *
00082          * If this is a private environment, we are only called once and there
00083          * are no possible race conditions.
00084          *
00085          * If this is a public environment, we use the filesystem to ensure
00086          * the creation of the environment file is single-threaded.
00087          */
00088         if (F_ISSET(dbenv, DB_ENV_PRIVATE)) {
00089                 if ((ret = __os_strdup(dbenv,
00090                     "process-private", &infop->name)) != 0)
00091                         goto err;
00092                 goto creation;
00093         }
00094 
00095         /* Build the region name. */
00096         (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
00097         if ((ret = __db_appname(dbenv,
00098             DB_APP_NONE, buf, 0, NULL, &infop->name)) != 0)
00099                 goto err;
00100 
00101         /*
00102          * Try to create the file, if we have the authority.  We have to ensure
00103          * that multiple threads/processes attempting to simultaneously create
00104          * the file are properly ordered.  Open using the O_CREAT and O_EXCL
00105          * flags so that multiple attempts to create the region will return
00106          * failure in all but one.  POSIX 1003.1 requires that EEXIST be the
00107          * errno return value -- I sure hope they're right.
00108          */
00109         if (F_ISSET(dbenv, DB_ENV_CREATE)) {
00110                 if ((ret = __os_open(dbenv, infop->name,
00111                     DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_REGION,
00112                     dbenv->db_mode, &dbenv->lockfhp)) == 0)
00113                         goto creation;
00114                 if (ret != EEXIST) {
00115                         __db_err(dbenv,
00116                             "%s: %s", infop->name, db_strerror(ret));
00117                         goto err;
00118                 }
00119         }
00120 
00121         /*
00122          * If we couldn't create the file, try and open it.  (If that fails,
00123          * we're done.)
00124          */
00125         if ((ret = __os_open(
00126             dbenv, infop->name, DB_OSO_REGION, 0, &dbenv->lockfhp)) != 0)
00127                 goto err;
00128 
00129         /* The region exists, it's not okay to recreate it. */
00130         F_CLR(infop, REGION_CREATE_OK);
00131 
00132         /*
00133          * !!!
00134          * The region may be in system memory not backed by the filesystem
00135          * (more specifically, not backed by this file), and we're joining
00136          * it.  In that case, the process that created it will have written
00137          * out a REGENV_REF structure as its only contents.  We read that
00138          * structure before we do anything further, e.g., we can't just map
00139          * that file in and then figure out what's going on.
00140          *
00141          * All of this noise is because some systems don't have a coherent VM
00142          * and buffer cache, and what's worse, when you mix operations on the
00143          * VM and buffer cache, half the time you hang the system.
00144          *
00145          * If the file is the size of an REGENV_REF structure, then we know
00146          * the real region is in some other memory.  (The only way you get a
00147          * file that size is to deliberately write it, as it's smaller than
00148          * any possible disk sector created by writing a file or mapping the
00149          * file into memory.)  In which case, retrieve the structure from the
00150          * file and use it to acquire the referenced memory.
00151          *
00152          * If the structure is larger than a REGENV_REF structure, then this
00153          * file is backing the shared memory region, and we just map it into
00154          * memory.
00155          *
00156          * And yes, this makes me want to take somebody and kill them.  (I
00157          * digress -- but you have no freakin' idea.  This is unbelievably
00158          * stupid and gross, and I've probably spent six months of my life,
00159          * now, trying to make different versions of it work.)
00160          */
00161         if ((ret = __os_ioinfo(dbenv, infop->name,
00162             dbenv->lockfhp, &mbytes, &bytes, NULL)) != 0) {
00163                 __db_err(dbenv, "%s: %s", infop->name, db_strerror(ret));
00164                 goto err;
00165         }
00166 
00167         /*
00168          * !!!
00169          * A size_t is OK -- regions get mapped into memory, and so can't
00170          * be larger than a size_t.
00171          */
00172         size = mbytes * MEGABYTE + bytes;
00173 
00174         /*
00175          * If the size is less than the size of a REGENV_REF structure, the
00176          * region (or, possibly, the REGENV_REF structure) has not yet been
00177          * completely written.  Shouldn't be possible, but there's no reason
00178          * not to wait awhile and try again.
00179          *
00180          * Otherwise, if the size is the size of a REGENV_REF structure,
00181          * read it into memory and use it as a reference to the real region.
00182          */
00183         if (size <= sizeof(ref)) {
00184                 if (size != sizeof(ref))
00185                         goto retry;
00186 
00187                 if ((ret = __os_read(dbenv, dbenv->lockfhp, &ref,
00188                     sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) {
00189                         if (ret == 0)
00190                                 ret = EIO;
00191                         __db_err(dbenv,
00192                     "%s: unable to read system-memory information from: %s",
00193                             infop->name, db_strerror(ret));
00194                         goto err;
00195                 }
00196                 size = ref.size;
00197                 segid = ref.segid;
00198 
00199                 F_SET(dbenv, DB_ENV_SYSTEM_MEM);
00200         } else if (F_ISSET(dbenv, DB_ENV_SYSTEM_MEM)) {
00201                 ret = EINVAL;
00202                 __db_err(dbenv,
00203                     "%s: existing environment not created in system memory: %s",
00204                     infop->name, db_strerror(ret));
00205                 goto err;
00206         } else
00207                 segid = INVALID_REGION_SEGID;
00208 
00209 #ifndef HAVE_MUTEX_FCNTL
00210         /*
00211          * If we're not doing fcntl locking, we can close the file handle.  We
00212          * no longer need it and the less contact between the buffer cache and
00213          * the VM, the better.
00214          */
00215         (void)__os_closehandle(dbenv, dbenv->lockfhp);
00216         dbenv->lockfhp = NULL;
00217 #endif
00218 
00219         /* Call the region join routine to acquire the region. */
00220         memset(&tregion, 0, sizeof(tregion));
00221         tregion.size = (roff_t)size;
00222         tregion.segid = segid;
00223         if ((ret = __os_r_attach(dbenv, infop, &tregion)) != 0)
00224                 goto err;
00225 
00226         /*
00227          * The environment's REGENV structure has to live at offset 0 instead
00228          * of the usual shalloc information.  Set the primary reference and
00229          * correct the "addr" value to reference the shalloc region.  Note,
00230          * this means that all of our offsets (R_ADDR/R_OFFSET) get shifted
00231          * as well, but that should be fine.
00232          */
00233         infop->primary = infop->addr;
00234         infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
00235         renv = infop->primary;
00236 
00237         /*
00238          * Make sure the region matches our build.  Special case a region
00239          * that's all nul bytes, just treat it like any other corruption.
00240          *
00241          * !!!
00242          * We don't display the major/minor version from the environment,
00243          * because it may be in a different place in the two regions.
00244          */
00245         if (renv->majver != DB_VERSION_MAJOR ||
00246             renv->minver != DB_VERSION_MINOR) {
00247                 if (renv->majver != 0 || renv->minver != 0) {
00248                         __db_err(dbenv,
00249         "Program version %d.%d doesn't match environment version %d.%d",
00250                             DB_VERSION_MAJOR, DB_VERSION_MINOR,
00251                             renv->majver, renv->minver);
00252                         ret = DB_VERSION_MISMATCH;
00253                 } else
00254                         ret = EINVAL;
00255                 goto err;
00256         }
00257 
00258         /*
00259          * Check if the environment has had a catastrophic failure.
00260          *
00261          * Check the magic number to ensure the region is initialized.  If the
00262          * magic number isn't set, the lock may not have been initialized, and
00263          * an attempt to use it could lead to random behavior.
00264          *
00265          * The panic and magic values aren't protected by any lock, so we never
00266          * use them in any check that's more complex than set/not-set.
00267          *
00268          * !!!
00269          * I'd rather play permissions games using the underlying file, but I
00270          * can't because Windows/NT filesystems won't open files mode 0.
00271          */
00272         if (renv->panic && !F_ISSET(dbenv, DB_ENV_NOPANIC)) {
00273                 ret = __db_panic_msg(dbenv);
00274                 goto err;
00275         }
00276         if (renv->magic != DB_REGION_MAGIC)
00277                 goto retry;
00278 
00279         /*
00280          * Get a reference to the underlying REGION information for this
00281          * environment.
00282          */
00283         if ((ret = __db_des_get(dbenv, infop, infop, &rp)) != 0 || rp == NULL)
00284                 goto find_err;
00285         infop->rp = rp;
00286 
00287         /*
00288          * There's still a possibility for inconsistent data.  When we acquired
00289          * the size of the region and attached to it, it might have still been
00290          * growing as part of its creation.  We can detect this by checking the
00291          * size we originally found against the region's current size.  (The
00292          * region's current size has to be final, the creator finished growing
00293          * it before setting the magic number in the region.)
00294          */
00295         if (rp->size != size)
00296                 goto retry;
00297 
00298         /*
00299          * Check our caller's configuration flags, it's an error to configure
00300          * incompatible or additional subsystems in an existing environment.
00301          * Return the total set of flags to the caller so they initialize the
00302          * correct set of subsystems.  Fail before a reference is taken.
00303          */
00304         if (init_flagsp != NULL) {
00305                 FLD_CLR(*init_flagsp, renv->init_flags);
00306                 if (*init_flagsp != 0) {
00307                         __db_err(dbenv,
00308     "configured environment flags incompatible with existing environment");
00309                         ret = EINVAL;
00310                         goto err;
00311                 }
00312                 *init_flagsp = renv->init_flags;
00313         }
00314 
00315         /* Take our reference last: the error exits never decrement it. */
00316         MUTEX_LOCK(dbenv, renv->mtx_regenv);
00317         ++renv->refcnt;
00318         MUTEX_UNLOCK(dbenv, renv->mtx_regenv);
00319 
00320         /*
00321          * Fault the pages into memory.  Note, do this AFTER releasing the
00322          * lock, because we're only reading the pages, not writing them.
00323          */
00324         (void)__db_faultmem(dbenv, infop->primary, rp->size, 0);
00325 
00326         /* Everything looks good, we're done. */
00327         dbenv->reginfo = infop;
00328         return (0);
00329 
00330 creation:
00331         /* Create the environment region. */
00332         F_SET(infop, REGION_CREATE);
00333 
00334         /*
00335          * Allocate room for REGION structures plus overhead.
00336          *
00337          * XXX
00338          * Overhead is so high because encryption passwds are stored in the
00339          * base environment region, as are replication vote arrays.  This is
00340          * a bug, not a feature, replication needs its own region.
00341          */
00342         memset(&tregion, 0, sizeof(tregion));
00343         nregions = dbenv->mp_ncache + 10;
00344         tregion.size =
00345            (roff_t)(nregions * sizeof(REGION) + dbenv->passwd_len + 16 * 1024);
00346         tregion.segid = INVALID_REGION_SEGID;
00347         if ((ret = __os_r_attach(dbenv, infop, &tregion)) != 0)
00348                 goto err;
00349 
00350         /*
00351          * Fault the pages into memory.  Note, do this BEFORE we initialize
00352          * anything, because we're writing the pages, not just reading them.
00353          */
00354         (void)__db_faultmem(dbenv, infop->addr, tregion.size, 1);
00355 
00356         /*
00357          * The first object in the region is the REGENV structure.  This is
00358          * different from the other regions, and, from everything else in
00359          * this region, where all objects are allocated from the pool, i.e.,
00360          * there aren't any fixed locations.  The remaining space is made
00361          * available for later allocation.
00362          *
00363          * The allocation space must be size_t aligned, because that's what
00364          * the initialization routine is going to store there.  To make sure
00365          * that happens, the REGENV structure was padded with a final size_t.
00366          * No other region needs to worry about it because all of them treat
00367          * the entire region as allocation space.
00368          *
00369          * Set the primary reference and correct the "addr" value to reference
00370          * the shalloc region.  Note, this requires that we "uncorrect" it at
00371          * region detach, and that all of our offsets (R_ADDR/R_OFFSET) will be
00372          * shifted as well, but that should be fine.
00373          */
00374         infop->primary = infop->addr;
00375         infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
00376         __db_shalloc_init(infop, tregion.size - sizeof(REGENV));
00377 
00378         /*
00379          * Initialize the rest of the REGENV structure.  (Don't set the magic
00380          * number to the correct value, that would validate the environment).
00381          */
00382         renv = infop->primary;
00383         renv->magic = 0;
00384         renv->panic = 0;
00385 
00386         (void)db_version(&majver, &minver, &patchver);
00387         renv->majver = (u_int32_t)majver;
00388         renv->minver = (u_int32_t)minver;
00389         renv->patchver = (u_int32_t)patchver;
00390 
00391         (void)time(&renv->timestamp);
00392         __os_unique_id(dbenv, &renv->envid);
00393 
00394         if ((ret = __mutex_alloc(
00395             dbenv, MTX_ENV_REGION, 0, &renv->mtx_regenv)) != 0)
00396                 goto err;
00397         renv->refcnt = 1;
00398 
00399         /*
00400          * Initialize init_flags to store the flags that any other environment
00401          * handle that uses DB_JOINENV to join this environment will need.
00402          */
00403         renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp;
00404 
00405         /*
00406          * Set up the region array.  We use an array rather than a linked list
00407          * as we have to traverse this list after failure in some cases, and
00408          * we don't want to infinitely loop should the application fail while
00409          * we're manipulating the list.
00410          */
00411         renv->region_cnt = nregions;
00412         if ((ret =
00413             __db_shalloc(infop, nregions * sizeof(REGION), 0, &rp)) != 0) {
00414                 __db_err(dbenv, "unable to create new master region array: %s",
00415                     db_strerror(ret));
00416                 goto err;
00417         }
00418         renv->region_off = R_OFFSET(infop, rp);
00419         for (i = 0; i < nregions; ++i, ++rp)
00420                 rp->id = INVALID_REGION_ID;
00421 
00422         renv->cipher_off = INVALID_ROFF;
00423 
00424         renv->rep_off = INVALID_ROFF;
00425         renv->flags = 0;
00426         renv->op_timestamp = renv->rep_timestamp = 0;
00427 
00428         /*
00429          * Get the underlying REGION structure for this environment.  Note,
00430          * we created the underlying OS region before we acquired the REGION
00431          * structure, which is backwards from the normal procedure.  Update
00432          * the REGION structure.
00433          */
00434         if ((ret = __db_des_get(dbenv, infop, infop, &rp)) != 0) {
00435 find_err:       __db_err(dbenv, "%s: unable to find environment", infop->name);
00436                 if (ret == 0)
00437                         ret = EINVAL;
00438                 goto err;
00439         }
00440         infop->rp = rp;
00441         rp->size = tregion.size;
00442         rp->segid = tregion.segid;
00443 
00444         /*
00445          * !!!
00446          * If we create an environment where regions are public and in system
00447          * memory, we have to inform processes joining the environment how to
00448          * attach to the shared memory segment.  So, we write the shared memory
00449          * identifier into the file, to be read by those other processes.
00450          *
00451          * XXX
00452          * This is really OS-layer information, but I can't see any easy way
00453          * to move it down there without passing down information that it has
00454          * no right to know, e.g., that this is the one-and-only REGENV region
00455          * and not some other random region.
00456          */
00457         if (tregion.segid != INVALID_REGION_SEGID) {
00458                 ref.size = tregion.size;
00459                 ref.segid = tregion.segid;
00460                 if ((ret = __os_write(
00461                     dbenv, dbenv->lockfhp, &ref, sizeof(ref), &nrw)) != 0) {
00462                         __db_err(dbenv,
00463                             "%s: unable to write out public environment ID: %s",
00464                             infop->name, db_strerror(ret));
00465                         goto err;
00466                 }
00467         }
00468 
00469 #ifndef HAVE_MUTEX_FCNTL
00470         /*
00471          * If we're not doing fcntl locking, we can close the file handle.  We
00472          * no longer need it and the less contact between the buffer cache and
00473          * the VM, the better.
00474          */
00475         if (dbenv->lockfhp != NULL) {
00476                 (void)__os_closehandle(dbenv, dbenv->lockfhp);
00477                 dbenv->lockfhp = NULL;
00478         }
00479 #endif
00480 
00481         /* Everything looks good, we're done. */
00482         dbenv->reginfo = infop;
00483         return (0);
00484 
00485 err:
00486 retry:  /* Close any open file handle. */
00487         if (dbenv->lockfhp != NULL) {
00488                 (void)__os_closehandle(dbenv, dbenv->lockfhp);
00489                 dbenv->lockfhp = NULL;
00490         }
00491 
00492         /*
00493          * If we joined or created the region, detach from it.  If we created
00494          * it, destroy it.  Note, there's a path in the above code where we're
00495          * using a temporary REGION structure because we haven't yet allocated
00496          * the real one.  In that case the region address (addr) will be filled
00497          * in, but the REGION pointer (rp) won't.  Fix it.
00498          */
00499         if (infop->addr != NULL) {
00500                 if (infop->rp == NULL)
00501                         infop->rp = &tregion;
00502 
00503                 /* Reset the addr value that we "corrected" above. */
00504                 infop->addr = infop->primary;
00505                 (void)__os_r_detach(dbenv,
00506                     infop, F_ISSET(infop, REGION_CREATE));
00507         }
00508 
00509         /* Free the allocated name and/or REGINFO structure. */
00510         if (infop->name != NULL)
00511                 __os_free(dbenv, infop->name);
00512         __os_free(dbenv, infop);
00513 
00514         /* ret == 0 means we came here via "retry": wait awhile, try again. */
00515         if (ret == 0) {
00516                 if (++retry_cnt > 3) {
00517                         __db_err(dbenv, "unable to join the environment");
00518                         ret = EAGAIN;
00519                 } else {
00520                         __os_sleep(dbenv, retry_cnt * 3, 0);
00521                         goto loop;
00522                 }
00523         }
00524 
00525         return (ret);
00526 }
00527 
00528 /*
00529  * __db_e_golive --
00530  *      Mark a newly created environment region as valid.
00531  *
00532  * PUBLIC: int __db_e_golive __P((DB_ENV *));
00533  */
00534 int
00535 __db_e_golive(dbenv)
00536         DB_ENV *dbenv;
00537 {
00538         REGINFO *env_info;
00539         REGENV *env_hdr;
00540 
00541         env_info = dbenv->reginfo;
00542         env_hdr = env_info->primary;
00543 
00544         /*
00545          * Only the handle that created the region has work to do here.
00546          *
00547          * Storing the magic number validates the region: __db_e_attach
00548          * backs off and retries for as long as it finds the value unset,
00549          * so this write is what lets other threads of control join.
00550          */
00551         if (F_ISSET(env_info, REGION_CREATE))
00552                 env_hdr->magic = DB_REGION_MAGIC;
00553 
00554         return (0);     /* There is no failure case. */
00555 }
00556 
00557 /*
00558  * __db_e_detach --
00559  *      Drop this handle's reference to the environment region.
00560  *
00561  * PUBLIC: int __db_e_detach __P((DB_ENV *, int));
00562  */
00563 int
00564 __db_e_detach(dbenv, destroy)
00565         DB_ENV *dbenv;
00566         int destroy;
00567 {
00568         REGINFO *infop;
00569         REGENV *renv;
00570         REGION saved_rp;
00571         int ret, t_ret;
00572 
00573         ret = 0;
00574         infop = dbenv->reginfo;
00575         renv = infop->primary;
00576 
00577         if (F_ISSET(dbenv, DB_ENV_PRIVATE))
00578                 destroy = 1;    /* Private regions are always destroyed. */
00579 
00580         /* Drop our reference, complaining if there was none to drop. */
00581         MUTEX_LOCK(dbenv, renv->mtx_regenv);
00582         if (renv->refcnt != 0)
00583                 --renv->refcnt;
00584         else
00585                 __db_err(dbenv, "environment reference count went negative");
00586         MUTEX_UNLOCK(dbenv, renv->mtx_regenv);
00587 
00588         /* Close the locking file handle, if it is still open. */
00589         if (dbenv->lockfhp != NULL) {
00590                 t_ret = __os_closehandle(dbenv, dbenv->lockfhp);
00591                 if (t_ret != 0 && ret == 0)
00592                         ret = t_ret;
00593                 dbenv->lockfhp = NULL;
00594         }
00595 
00596         /*
00597          * When destroying, release everything allocated inside the region.
00598          */
00599         if (destroy) {
00600 #ifdef HAVE_CRYPTO
00601                 /*
00602                  * Let the crypto subsystem give back any system resources
00603                  * it may have acquired.
00604                  */
00605                 t_ret = __crypto_region_destroy(dbenv);
00606                 if (t_ret != 0 && ret == 0)
00607 		        ret = t_ret;
00608 #endif
00609                 /*
00610                  * Let the replication subsystem give back any system
00611                  * resources it may have acquired.
00612                  */
00613                 if ((t_ret = __rep_region_destroy(dbenv)) != 0 && ret == 0)
00614                         ret = t_ret;
00615 
00616                 /*
00617                  * Free the REGION array.
00618                  *
00619                  * Our own REGION structure is allocated from the primary
00620                  * shared region, which is about to be freed.  Copy it to the
00621                  * stack and point the REGINFO at the copy, for use when it
00622                  * calls down into the OS layer to release the segment.
00623                  */
00624                 saved_rp = *infop->rp;
00625                 infop->rp = &saved_rp;
00626                 if (renv->region_off != INVALID_ROFF)
00627                         __db_shalloc_free(infop,
00628                             R_ADDR(infop, renv->region_off));
00629 
00630                 /* Discard any mutex resources we may have acquired. */
00631                 t_ret = __mutex_free(dbenv, &renv->mtx_regenv);
00632                 if (t_ret != 0 && ret == 0)
00633                         ret = t_ret;
00634         }
00635 
00636         /*
00637          * Clear DB_ENV->reginfo before detaching, for two reasons.
00638          *
00639          * First, DB_ENV->remove calls __env_remove to do the region remove,
00640          * and __env_remove attaches to and then detaches from the region.  If
00641          * we returned to DB_ENV->remove with the field still set, its cleanup
00642          * would attempt to detach a second time.
00643          *
00644          * Second, DB code uses DB_ENV->reginfo to decide if it's OK to read
00645          * the underlying region, and what it references is about to go away.
00646          */
00647         dbenv->reginfo = NULL;
00648 
00649         /* Undo the addr adjustment made at attach time, then let go. */
00650         infop->addr = infop->primary;
00651         t_ret = __os_r_detach(dbenv, infop, destroy);
00652         if (t_ret != 0 && ret == 0)
00653                 ret = t_ret;
00654         if (infop->name != NULL)
00655                 __os_free(dbenv, infop->name);
00656 
00657         /* Discard the memory the DB_ENV->reginfo field referenced. */
00658         __os_free(dbenv, infop);
00659 
00660         return (ret);
00661 }
00662 
00663 /*
00664  * __db_e_remove --
00665  *      Discard an environment if it's not in use.
00666  *
00667  * PUBLIC: int __db_e_remove __P((DB_ENV *, u_int32_t));
00668  */
00669 int
00670 __db_e_remove(dbenv, flags)
00671         DB_ENV *dbenv;
00672         u_int32_t flags;
00673 {
00674         REGENV *renv;
00675         REGINFO *infop, reginfo;
00676         REGION *rp;
00677         u_int32_t db_env_reset, i;
00678         int ret;
00679 
00680         db_env_reset = F_ISSET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
00681 
00682         /*
00683          * This routine has to walk a nasty line between not looking into
00684          * the environment (which may be corrupted after an app or system
00685          * crash), and removing everything that needs removing.  What we
00686          * do is:
00687          *      1. Connect to the environment.
00688          *      2. If the environment is in use (reference count is non-zero),
00689          *         return EBUSY.
00690          *      3. Panic it and overwrite the magic number so any threads of
00691          *         control attempting to connect (or racing with us) backoff
00692          *         and retry or just die.
00693          *      4. Walk the array of regions.  Connect to each region and then
00694          *         disconnect with the destroy flag set.  This shouldn't cause
00695          *         any problems, even if the region is corrupted, because we
00696          *         never look inside the region (with the single exception of
00697          *         mutex regions on systems where we have to return resources
00698          *         to the underlying system).
00699          *      5. Walk the list of files in the directory, unlinking any
00700          *         files that match a region name.  Unlink the environment
00701          *         file last.
00702          *
00703          * If the force flag is set, we do not acquire any locks during this
00704          * process.
00705          *
00706          * We're going to panic the environment, so we'll want to ignore that
00707          * flag.
00708          */
00709         if (LF_ISSET(DB_FORCE))
00710                 F_SET(dbenv, DB_ENV_NOLOCKING);
00711         F_SET(dbenv, DB_ENV_NOPANIC);
00712 
00713         /* Join the environment. */
00714         if ((ret = __db_e_attach(dbenv, NULL)) != 0) {
00715                 /*
00716                  * If we can't join it, we assume that's because it doesn't
00717                  * exist.  It would be better to know why we failed, but it
00718                  * probably isn't important.
00719                  */
00720                 ret = 0;
00721                 if (LF_ISSET(DB_FORCE))
00722                         goto remfiles;
00723                 goto done;
00724         }
00725 
00726         infop = dbenv->reginfo;
00727         renv = infop->primary;
00728 
00729         /* Lock the environment. */
00730         MUTEX_LOCK(dbenv, renv->mtx_regenv);
00731 
00732         /*
00733          * If it's in use, we're done unless we're forcing the issue or the
00734          * environment has panic'd.  (Presumably, if the environment panic'd,
00735          * the thread holding the reference count may not have cleaned up.)
00736          */
00737         if (renv->refcnt == 1 || renv->panic == 1 || LF_ISSET(DB_FORCE)) {
00738                 /*
00739                  * Set the panic flag and overwrite the magic number.
00740                  *
00741                  * !!!
00742                  * From this point on, there's no going back, we pretty
00743                  * much ignore errors, and just whack on whatever we can.
00744                  */
00745                 renv->magic = 0;
00746                 renv->panic = 1;
00747 
00748                 /*
00749                  * Unlock the environment -- nobody should need this lock
00750                  * because we've poisoned the pool.
00751                  */
00752                 MUTEX_UNLOCK(dbenv, renv->mtx_regenv);
00753 
00754                 /* Attach to each sub-region and destroy it. */
00755                 for (rp = R_ADDR(infop, renv->region_off),
00756                     i = 0; i < renv->region_cnt; ++i, ++rp) {
00757                         if (rp->id == INVALID_REGION_ID ||
00758                             rp->type == REGION_TYPE_ENV)
00759                                 continue;
00760                         /*
00761                          * !!!
00762                          * The REGION_CREATE_OK flag is set for Windows/95 --
00763                          * regions are zero'd out when the last reference to
00764                          * the region goes away, in which case the underlying
00765                          * OS region code requires callers be prepared to
00766                          * create the region in order to join it.
00767                          */
00768                         memset(&reginfo, 0, sizeof(reginfo));
00769                         reginfo.id = rp->id;
00770                         reginfo.flags = REGION_CREATE_OK;
00771 
00772                         /*
00773                          * If we get here and can't attach and/or detach to the
00774                          * region, it's a mess.  Ignore errors, there's nothing
00775                          * we can do about them.
00776                          */
00777                         if (__db_r_attach(dbenv, &reginfo, 0) != 0)
00778                                 continue;
00779 
00780 #ifdef  HAVE_MUTEX_SYSTEM_RESOURCES
00781                         /*
00782                          * If destroying the mutex region, return any system
00783                          * resources to the system.
00784                          */
00785                         if (reginfo.type == REGION_TYPE_MUTEX)
00786                                 __mutex_resource_return(dbenv, &reginfo);
00787 #endif
00788                         (void)__db_r_detach(dbenv, &reginfo, 1);
00789                 }
00790 
00791                 /* Destroy the environment's region. */
00792                 (void)__db_e_detach(dbenv, 1);
00793 
00794                 /* Discard any remaining physical files. */
00795 remfiles:       (void)__db_e_remfile(dbenv);
00796         } else {
00797                 /* Unlock the environment. */
00798                 MUTEX_UNLOCK(dbenv, renv->mtx_regenv);
00799 
00800                 /* Discard the environment. */
00801                 (void)__db_e_detach(dbenv, 0);
00802 
00803                 ret = EBUSY;
00804         }
00805 
00806 done:   F_CLR(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
00807         F_SET(dbenv, db_env_reset);
00808 
00809         return (ret);
00810 }
00811 
00812 /*
00813  * __db_e_remfile --
00814  *      Discard any region files in the filesystem.
00815  */
00816 static int
00817 __db_e_remfile(dbenv)
00818         DB_ENV *dbenv;
00819 {
00820         int cnt, fcnt, lastrm, ret;
00821         const char *dir;
00822         char saved_char, *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];
00823 
00824         /* Get the full path of a file in the environment. */
00825         (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
00826         if ((ret = __db_appname(dbenv, DB_APP_NONE, buf, 0, NULL, &path)) != 0)
00827                 return (ret);
00828 
00829         /* Get the parent directory for the environment. */
00830         if ((p = __db_rpath(path)) == NULL) {
00831                 p = path;
00832                 saved_char = *p;
00833 
00834                 dir = PATH_DOT;
00835         } else {
00836                 saved_char = *p;
00837                 *p = '\0';
00838 
00839                 dir = path;
00840         }
00841 
00842         /* Get the list of file names. */
00843         if ((ret = __os_dirlist(dbenv, dir, &names, &fcnt)) != 0)
00844                 __db_err(dbenv, "%s: %s", dir, db_strerror(ret));
00845 
00846         /* Restore the path, and free it. */
00847         *p = saved_char;
00848         __os_free(dbenv, path);
00849 
00850         if (ret != 0)
00851                 return (ret);
00852 
00853         /*
00854          * Remove files from the region directory.
00855          */
00856         for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
00857                 /* Skip anything outside our name space. */
00858                 if (strncmp(names[cnt],
00859                     DB_REGION_PREFIX, sizeof(DB_REGION_PREFIX) - 1))
00860                         continue;
00861 
00862                 /* Skip queue extent files. */
00863                 if (strncmp(names[cnt], "__dbq.", 6) == 0)
00864                         continue;
00865 
00866                 /* Skip registry files. */
00867                 if (strncmp(names[cnt], "__db.register", 13) == 0)
00868                         continue;
00869 
00870                 /* Skip replication files. */
00871                 if (strncmp(names[cnt], "__db.rep.", 9) == 0)
00872                         continue;
00873 
00874                 /*
00875                  * Remove the primary environment region last, because it's
00876                  * the key to this whole mess.
00877                  */
00878                 if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
00879                         lastrm = cnt;
00880                         continue;
00881                 }
00882 
00883                 /* Remove the file. */
00884                 if (__db_appname(dbenv,
00885                     DB_APP_NONE, names[cnt], 0, NULL, &path) == 0) {
00886                         /*
00887                          * Overwrite region files.  Temporary files would have
00888                          * been maintained in encrypted format, so there's no
00889                          * reason to overwrite them.  This is not an exact
00890                          * check on the file being a region file, but it's
00891                          * not likely to be wrong, and the worst thing that can
00892                          * happen is we overwrite a file that didn't need to be
00893                          * overwritten.
00894                          */
00895                         if (F_ISSET(dbenv, DB_ENV_OVERWRITE) &&
00896                             strlen(names[cnt]) == DB_REGION_NAME_LENGTH)
00897                                 (void)__db_file_multi_write(dbenv, path);
00898                         (void)__os_unlink(dbenv, path);
00899                         __os_free(dbenv, path);
00900                 }
00901         }
00902 
00903         if (lastrm != -1)
00904                 if (__db_appname(dbenv,
00905                     DB_APP_NONE, names[lastrm], 0, NULL, &path) == 0) {
00906                         if (F_ISSET(dbenv, DB_ENV_OVERWRITE))
00907                                 (void)__db_file_multi_write(dbenv, path);
00908                         (void)__os_unlink(dbenv, path);
00909                         __os_free(dbenv, path);
00910                 }
00911         __os_dirfree(dbenv, names, fcnt);
00912 
00913         return (0);
00914 }
00915 
00916 /*
00917  * __db_r_attach
00918  *      Join/create a region.
00919  *
00920  * PUBLIC: int __db_r_attach __P((DB_ENV *, REGINFO *, size_t));
00921  */
00922 int
00923 __db_r_attach(dbenv, infop, size)
00924         DB_ENV *dbenv;
00925         REGINFO *infop;
00926         size_t size;
00927 {
00928         REGION *rp;
00929         int ret;
00930         char buf[sizeof(DB_REGION_FMT) + 20];
00931 
00932         /*
00933          * Find or create a REGION structure for this region.  If we create
00934          * it, the REGION_CREATE flag will be set in the infop structure.
00935          */
00936         F_CLR(infop, REGION_CREATE);
00937         if ((ret = __db_des_get(dbenv, dbenv->reginfo, infop, &rp)) != 0)
00938                 return (ret);
00939         infop->dbenv = dbenv;
00940         infop->rp = rp;
00941         infop->type = rp->type;
00942         infop->id = rp->id;
00943 
00944         /*
00945          * __db_des_get may have created the region and reset the create
00946          * flag.  If we're creating the region, set the desired size.
00947          */
00948         if (F_ISSET(infop, REGION_CREATE))
00949                 rp->size = (roff_t)size;
00950 
00951         /* Join/create the underlying region. */
00952         (void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id);
00953         if ((ret = __db_appname(dbenv,
00954             DB_APP_NONE, buf, 0, NULL, &infop->name)) != 0)
00955                 goto err;
00956         if ((ret = __os_r_attach(dbenv, infop, rp)) != 0)
00957                 goto err;
00958 
00959         /*
00960          * Fault the pages into memory.  Note, do this BEFORE we initialize
00961          * anything because we're writing pages in created regions, not just
00962          * reading them.
00963          */
00964         (void)__db_faultmem(dbenv,
00965             infop->addr, rp->size, F_ISSET(infop, REGION_CREATE));
00966 
00967         /*
00968          * !!!
00969          * The underlying layer may have just decided that we are going
00970          * to create the region.  There are various system issues that
00971          * can result in a useless region that requires re-initialization.
00972          *
00973          * If we created the region, initialize it for allocation.
00974          */
00975         if (F_ISSET(infop, REGION_CREATE))
00976                 __db_shalloc_init(infop, rp->size);
00977 
00978         return (0);
00979 
00980 err:    /* Discard the underlying region. */
00981         if (infop->addr != NULL)
00982                 (void)__os_r_detach(dbenv,
00983                     infop, F_ISSET(infop, REGION_CREATE));
00984         infop->rp = NULL;
00985         infop->id = INVALID_REGION_ID;
00986 
00987         /* Discard the REGION structure if we created it. */
00988         if (F_ISSET(infop, REGION_CREATE)) {
00989                 __db_des_destroy(dbenv, rp);
00990                 F_CLR(infop, REGION_CREATE);
00991         }
00992 
00993         return (ret);
00994 }
00995 
00996 /*
00997  * __db_r_detach --
00998  *      Detach from a region.
00999  *
01000  * PUBLIC: int __db_r_detach __P((DB_ENV *, REGINFO *, int));
01001  */
01002 int
01003 __db_r_detach(dbenv, infop, destroy)
01004         DB_ENV *dbenv;
01005         REGINFO *infop;
01006         int destroy;
01007 {
01008         REGION *rp;
01009         int ret;
01010 
01011         rp = infop->rp;
01012         if (F_ISSET(dbenv, DB_ENV_PRIVATE))
01013                 destroy = 1;
01014 
01015         /*
01016          * When discarding the regions as we shut down a database environment,
01017          * discard any allocated shared memory segments.  This is the last time
01018          * we use them, and db_region_destroy is the last region-specific call
01019          * we make.
01020          */
01021         if (F_ISSET(dbenv, DB_ENV_PRIVATE) && infop->primary != NULL)
01022                 __db_shalloc_free(infop, infop->primary);
01023 
01024         /* Detach from the underlying OS region. */
01025         ret = __os_r_detach(dbenv, infop, destroy);
01026 
01027         /* If we destroyed the region, discard the REGION structure. */
01028         if (destroy)
01029                 __db_des_destroy(dbenv, rp);
01030 
01031         /* Destroy the structure. */
01032         if (infop->name != NULL)
01033                 __os_free(dbenv, infop->name);
01034 
01035         return (ret);
01036 }
01037 
01038 /*
01039  * __db_des_get --
01040  *      Return a reference to the shared information for a REGION,
01041  *      optionally creating a new entry.
01042  */
01043 static int
01044 __db_des_get(dbenv, env_infop, infop, rpp)
01045         DB_ENV *dbenv;
01046         REGINFO *env_infop, *infop;
01047         REGION **rpp;
01048 {
01049         REGENV *renv;
01050         REGION *rp, *empty_slot, *first_type;
01051         u_int32_t i, maxid;
01052 
01053         *rpp = NULL;
01054         renv = env_infop->primary;
01055 
01056         /*
01057          * If the caller wants to join a region, walk through the existing
01058          * regions looking for a matching ID (if ID specified) or matching
01059          * type (if type specified).  If we return based on a matching type
01060          * return the "primary" region, that is, the first region that was
01061          * created of this type.
01062          *
01063          * Track the first empty slot and maximum region ID for new region
01064          * allocation.
01065          *
01066          * MaxID starts at REGION_ID_ENV, the ID of the primary environment.
01067          */
01068         maxid = REGION_ID_ENV;
01069         empty_slot = first_type = NULL;
01070         for (rp = R_ADDR(env_infop, renv->region_off),
01071             i = 0; i < renv->region_cnt; ++i, ++rp) {
01072                 if (rp->id == INVALID_REGION_ID) {
01073                         if (empty_slot == NULL)
01074                                 empty_slot = rp;
01075                         continue;
01076                 }
01077                 if (infop->id != INVALID_REGION_ID) {
01078                         if (infop->id == rp->id)
01079                                 break;
01080                         continue;
01081                 }
01082                 if (infop->type == rp->type &&
01083                     F_ISSET(infop, REGION_JOIN_OK) &&
01084                     (first_type == NULL || first_type->id > rp->id))
01085                         first_type = rp;
01086 
01087                 if (rp->id > maxid)
01088                         maxid = rp->id;
01089         }
01090 
01091         /* If we found a matching ID (or a matching type), return it. */
01092         if (i >= renv->region_cnt)
01093                 rp = first_type;
01094         if (rp != NULL) {
01095                 *rpp = rp;
01096                 return (0);
01097         }
01098 
01099         /*
01100          * If we didn't find a region and we don't have permission to create
01101          * the region, fail.  The caller generates any error message.
01102          */
01103         if (!F_ISSET(infop, REGION_CREATE_OK))
01104                 return (ENOENT);
01105 
01106         /*
01107          * If we didn't find a region and don't have room to create the region
01108          * fail with an error message, there's a sizing problem.
01109          */
01110         if (empty_slot == NULL) {
01111                 __db_err(dbenv, "no room remaining for additional REGIONs");
01112                 return (ENOENT);
01113         }
01114 
01115         /*
01116          * Initialize a REGION structure for the caller.  If id was set, use
01117          * that value, otherwise we use the next available ID.
01118          */
01119         memset(empty_slot, 0, sizeof(REGION));
01120         empty_slot->segid = INVALID_REGION_SEGID;
01121 
01122         /*
01123          * Set the type and ID; if no region ID was specified,
01124          * allocate one.
01125          */
01126         empty_slot->type = infop->type;
01127         empty_slot->id = infop->id == INVALID_REGION_ID ? maxid + 1 : infop->id;
01128 
01129         F_SET(infop, REGION_CREATE);
01130 
01131         *rpp = empty_slot;
01132         return (0);
01133 }
01134 
01135 /*
01136  * __db_des_destroy --
01137  *      Destroy a reference to a REGION.
01138  */
01139 static void
01140 __db_des_destroy(dbenv, rp)
01141         DB_ENV *dbenv;
01142         REGION *rp;
01143 {
01144         COMPQUIET(dbenv, NULL);
01145 
01146         rp->id = INVALID_REGION_ID;
01147 }
01148 
01149 /*
01150  * __db_faultmem --
01151  *      Fault the region into memory.
01152  */
01153 static int
01154 __db_faultmem(dbenv, addr, size, created)
01155         DB_ENV *dbenv;
01156         void *addr;
01157         size_t size;
01158         int created;
01159 {
01160         int ret;
01161         u_int8_t *p, *t;
01162 
01163         /* Ignore heap regions. */
01164         if (F_ISSET(dbenv, DB_ENV_PRIVATE))
01165                 return (0);
01166 
01167         /*
01168          * It's sometimes significantly faster to page-fault in all of the
01169          * region's pages before we run the application, as we see nasty
01170          * side-effects when we page-fault while holding various locks, i.e.,
01171          * the lock takes a long time to acquire because of the underlying
01172          * page fault, and the other threads convoy behind the lock holder.
01173          *
01174          * If we created the region, we write a non-zero value so that the
01175          * system can't cheat.  If we're just joining the region, we can
01176          * only read the value and try to confuse the compiler sufficiently
01177          * that it doesn't figure out that we're never really using it.
01178          *
01179          * Touch every page (assuming pages are 512B, the smallest VM page
01180          * size used in any general purpose processor).
01181          */
01182         ret = 0;
01183         if (F_ISSET(dbenv, DB_ENV_REGION_INIT)) {
01184                 if (created)
01185                         for (p = addr,
01186                             t = (u_int8_t *)addr + size; p < t; p += 512)
01187                                 p[0] = 0xdb;
01188                 else
01189                         for (p = addr,
01190                             t = (u_int8_t *)addr + size; p < t; p += 512)
01191                                 ret |= p[0];
01192         }
01193 
01194         return (ret);
01195 }

Generated on Sun Dec 25 12:14:24 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2