Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

env_register.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 2004-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: env_register.c,v 1.15 2005/10/07 20:21:27 ubell Exp $
00008  */
00009 #include "db_config.h"
00010 
00011 #ifndef NO_SYSTEM_INCLUDES
00012 #include <sys/types.h>
00013 
00014 #include <string.h>
00015 #endif
00016 
00017 #include "db_int.h"
00018 
/* Name of the registry file; it is opened relative to the environment home. */
#define REGISTER_FILE   "__db.register"

/*
 * Registry slots are fixed-length text lines.  PID_EMPTY and PID_FMT each
 * format an unsigned long into a 24-character field followed by a newline,
 * which is PID_LEN (25) bytes in total.
 */
#define PID_EMPTY       "X%23lu\n"      /* An unused PID entry. */
#define PID_FMT         "%24lu\n"       /* File PID format. */
/* True if the slot line starting at p is marked empty (leading 'X'). */
#define PID_ISEMPTY(p)  ((p)[0] == 'X')
#define PID_LEN         25              /* Length of PID line. */
00025 
00026 #define REGISTRY_LOCK(dbenv, pos, nowait)                               \
00027         __os_fdlock(dbenv, (dbenv)->registry, (off_t)(pos), 1, nowait)
00028 #define REGISTRY_UNLOCK(dbenv, pos)                                     \
00029         __os_fdlock(dbenv, (dbenv)->registry, (off_t)(pos), 0, 0)
00030 #define REGISTRY_EXCL_LOCK(dbenv, nowait)                               \
00031         REGISTRY_LOCK(dbenv, 1, nowait)
00032 #define REGISTRY_EXCL_UNLOCK(dbenv)                                     \
00033         REGISTRY_UNLOCK(dbenv, 1)
00034 
00035 static  int __envreg_add __P((DB_ENV *, int *));
00036 
00037 /*
00038  * Support for portable, multi-process database environment locking, based on
00039  * the Subversion SR (#11511).
00040  *
00041  * The registry feature is configured by specifying the DB_REGISTER flag to the
00042  * DbEnv.open method.  If DB_REGISTER is specified, DB opens the registry file
00043  * in the database environment home directory.  The registry file is formatted
00044  * as follows:
00045  *
00046  *                          12345               # process ID slot 1
00047  *      X               # empty slot
00048  *                          12346               # process ID slot 2
00049  *      X               # empty slot
00050  *                          12347               # process ID slot 3
00051  *                          12348               # process ID slot 4
00052  *      X                   12349               # empty slot
00053  *      X               # empty slot
00054  *
00055  * All lines are fixed-length.  All lines are process ID slots.  Empty slots
00056  * are marked with leading non-digit characters.
00057  *
00058  * To modify the file, you get an exclusive lock on the first byte of the file.
00059  *
00060  * While holding any DbEnv handle, each process has an exclusive lock on the
00061  * first byte of a process ID slot.  There is a restriction on having more
00062  * than one DbEnv handle open at a time, because Berkeley DB uses per-process
00063  * locking to implement this feature, that is, a process may never have more
00064  * than a single slot locked.
00065  *
00066  * This work requires that if a process dies or the system crashes, locks held
00067  * by the dying processes will be dropped.  (We can't use system shared
00068  * memory-backed or filesystem-backed locks because they're persistent when a
00069  * process dies.)  On POSIX systems, we use fcntl(2) locks; on Win32 we have
00070  * LockFileEx/UnlockFile, except for Win/9X and Win/ME which have to loop on
00071  * LockFile/UnlockFile.
00072  *
00073  * We could implement the same solution with flock locking instead of fcntl,
00074  * but flock would require a separate file for each process of control (and
00075  * probably each DbEnv handle) in the database environment, which is fairly
00076  * ugly.
00077  *
00078  * Whenever a process opens a new DbEnv handle, it walks the registry file and
00079  * verifies it CANNOT acquire the lock for any non-empty slot.  If a lock for
00080  * a non-empty slot is available, we know a process died holding an open handle,
00081  * and recovery needs to be run.
00082  *
00083  * There can still be processes running in the environment when we recover it,
00084  * and, in fact, there can still be processes running in the old environment
00085  * after we're up and running in a new one.  This is safe because performing
00086  * recovery panics (and removes) the existing environment, so the window of
00087  * vulnerability is small.  Further, we check the panic flag in the DB API
00088  * methods, when waking from spinning on a mutex, and whenever we're about to
00089  * write to disk.  The only window of corruption is if the write check of the
00090  * panic were to complete, the region subsequently be recovered, and then the
00091  * write continues.  That's very, very unlikely to happen.  This vulnerability
00092  * already exists in Berkeley DB, too; the registry code doesn't make it any
00093  * worse than it already is.
00094  */
00095 /*
00096  * __envreg_register --
00097  *      Register a DB_ENV handle.
00098  *
00099  * PUBLIC: int __envreg_register __P((DB_ENV *, const char *, int *));
00100  */
00101 int
00102 __envreg_register(dbenv, db_home, need_recoveryp)
00103         DB_ENV *dbenv;
00104         const char *db_home;
00105         int *need_recoveryp;
00106 {
00107         pid_t pid;
00108         db_threadid_t tid;
00109         u_int32_t bytes, mbytes;
00110         int ret;
00111         char path[MAXPATHLEN];
00112 
00113         *need_recoveryp = 0;
00114         dbenv->thread_id(dbenv, &pid, &tid);
00115 
00116         if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
00117                 __db_msg(dbenv, "%lu: register environment", (u_long)pid);
00118 
00119         /* Build the path name and open the registry file. */
00120         (void)snprintf(path, sizeof(path), "%s/%s", db_home, REGISTER_FILE);
00121         if ((ret = __os_open(dbenv, path,
00122             DB_OSO_CREATE, __db_omode("rw-rw----"), &dbenv->registry)) != 0)
00123                 goto err;
00124 
00125         /*
00126          * Wait for an exclusive lock on the file.
00127          *
00128          * !!!
00129          * We're locking bytes that don't yet exist, but that's OK as far as
00130          * I know.
00131          */
00132         if ((ret = REGISTRY_EXCL_LOCK(dbenv, 0)) != 0)
00133                 goto err;
00134 
00135         /*
00136          * If the file size is 0, initialize the file.
00137          *
00138          * Run recovery if we create the file, that means we can clean up the
00139          * system by removing the registry file and restarting the application.
00140          */
00141         if ((ret = __os_ioinfo(
00142             dbenv, path, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
00143                 goto err;
00144         if (mbytes == 0 && bytes == 0) {
00145                 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
00146                         __db_msg(dbenv,
00147                             "%lu: creating %s", (u_long)pid, path);
00148                 *need_recoveryp = 1;
00149         }
00150 
00151         /* Register this process. */
00152         if ((ret = __envreg_add(dbenv, need_recoveryp)) != 0)
00153                 goto err;
00154 
00155         /*
00156          * Release our exclusive lock if we don't need to run recovery.  If
00157          * we need to run recovery, DB_ENV->open will call back into register
00158          * code once recovery has completed.
00159          */
00160         if (*need_recoveryp == 0 && (ret = REGISTRY_EXCL_UNLOCK(dbenv)) != 0)
00161                 goto err;
00162 
00163         if (0) {
00164 err:            *need_recoveryp = 0;
00165 
00166                 /*
00167                  * !!!
00168                  * Closing the file handle must release all of our locks.
00169                  */
00170                 (void)__os_closehandle(dbenv, dbenv->registry);
00171                 dbenv->registry = NULL;
00172         }
00173 
00174         return (ret);
00175 }
00176 
00177 /*
00178  * __envreg_add --
00179  *      Add the process' pid to the register.
00180  */
00181 static int
00182 __envreg_add(dbenv, need_recoveryp)
00183         DB_ENV *dbenv;
00184         int *need_recoveryp;
00185 {
00186         pid_t pid;
00187         db_threadid_t tid;
00188         off_t end, pos;
00189         size_t nr, nw;
00190         u_int lcnt;
00191         u_int32_t bytes, mbytes;
00192         int need_recovery, ret;
00193         char *p, buf[256], pid_buf[256];
00194 
00195         need_recovery = 0;
00196         COMPQUIET(p, NULL);
00197 
00198         /* Get a copy of our process ID. */
00199         dbenv->thread_id(dbenv, &pid, &tid);
00200         snprintf(pid_buf, sizeof(pid_buf), PID_FMT, (u_long)pid);
00201 
00202         if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
00203                 __db_msg(dbenv, "===== %lu: before add", (u_long)pid);
00204 
00205         /*
00206          * Read the file.  Skip empty slots, and check that a lock is held
00207          * for any allocated slots.  An allocated slot which we can lock
00208          * indicates a process died holding a handle and recovery needs to
00209          * be run.
00210          */
00211         for (lcnt = 0;; ++lcnt) {
00212                 if ((ret = __os_read(
00213                     dbenv, dbenv->registry, buf, PID_LEN, &nr)) != 0)
00214                         return (ret);
00215                 if (nr == 0)
00216                         break;
00217                 if (nr != PID_LEN)
00218                         goto corrupt;
00219 
00220                 if (FLD_ISSET(
00221                     dbenv->verbose, DB_VERB_REGISTER) && PID_ISEMPTY(buf)) {
00222                         __db_msg(dbenv, "%02u: EMPTY", lcnt);
00223                         continue;
00224                 }
00225 
00226                 /*
00227                  * !!!
00228                  * DB_REGISTER is implemented using per-process locking, only
00229                  * a single DB_ENV handle may be open per process.  Enforce
00230                  * that restriction.
00231                  */
00232                 if (memcmp(buf, pid_buf, PID_LEN) == 0) {
00233                         __db_err(dbenv,
00234         "DB_REGISTER limits each process to a single open DB_ENV handle");
00235                         return (EINVAL);
00236                 }
00237 
00238                 if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) {
00239                         for (p = buf; *p == ' ';)
00240                                 ++p;
00241                         buf[nr - 1] = '\0';
00242                 }
00243 
00244                 pos = (off_t)lcnt * PID_LEN;
00245                 if (REGISTRY_LOCK(dbenv, pos, 1) == 0) {
00246                         if ((ret = REGISTRY_UNLOCK(dbenv, pos)) != 0)
00247                                 return (ret);
00248 
00249                         if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
00250                                 __db_msg(dbenv, "%02u: %s: FAILED", lcnt, p);
00251 
00252                         need_recovery = 1;
00253                         break;
00254                 } else
00255                         if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
00256                                 __db_msg(dbenv, "%02u: %s: LOCKED", lcnt, p);
00257         }
00258 
00259         /*
00260          * If we have to perform recovery...
00261          *
00262          * Mark all slots empty.  Registry ignores empty slots we can't lock,
00263          * so it doesn't matter if any of the processes are in the middle of
00264          * exiting Berkeley DB -- they'll discard their lock when they exit.
00265          */
00266         if (need_recovery) {
00267                 /* Figure out how big the file is. */
00268                 if ((ret = __os_ioinfo(
00269                     dbenv, NULL, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
00270                         return (ret);
00271                 end = (off_t)mbytes * MEGABYTE + bytes;
00272 
00273                 /* Confirm the file is of a reasonable size. */
00274                 DB_ASSERT(end % PID_LEN == 0);
00275 
00276                 /*
00277                  * Seek to the beginning of the file and overwrite slots to
00278                  * the end of the file.
00279                  */
00280                 if ((ret = __os_seek(
00281                     dbenv, dbenv->registry, 0, 0, 0, 0, DB_OS_SEEK_SET)) != 0)
00282                         return (ret);
00283                 snprintf(buf, sizeof(buf), PID_EMPTY, (u_long)0);
00284                 for (lcnt = (u_int)end / PID_LEN; lcnt > 0; --lcnt)
00285                         if ((ret = __os_write(
00286                             dbenv, dbenv->registry, buf, PID_LEN, &nw)) != 0 ||
00287                             nw != PID_LEN)
00288                                 goto corrupt;
00289         }
00290 
00291         /*
00292          * Seek to the first process slot and add ourselves to the first empty
00293          * slot we can lock.
00294          */
00295         if ((ret = __os_seek(
00296             dbenv, dbenv->registry, 0, 0, 0, 0, DB_OS_SEEK_SET)) != 0)
00297                 return (ret);
00298         for (lcnt = 0;; ++lcnt) {
00299                 if ((ret = __os_read(
00300                     dbenv, dbenv->registry, buf, PID_LEN, &nr)) != 0)
00301                         return (ret);
00302                 if (nr == PID_LEN && !PID_ISEMPTY(buf))
00303                         continue;
00304                 pos = (off_t)lcnt * PID_LEN;
00305                 if (REGISTRY_LOCK(dbenv, pos, 1) == 0) {
00306                         if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
00307                                 __db_msg(dbenv,
00308                                     "%lu: locking slot %02u at offset %lu",
00309                                     (u_long)pid, lcnt, (u_long)pos);
00310 
00311                         if ((ret = __os_seek(dbenv, dbenv->registry,
00312                             0, 0, (u_int32_t)pos, 0, DB_OS_SEEK_SET)) != 0 ||
00313                             (ret = __os_write(dbenv,
00314                             dbenv->registry, pid_buf, PID_LEN, &nw)) != 0 ||
00315                             nw != PID_LEN)
00316                                 return (ret);
00317                         dbenv->registry_off = (u_int32_t)pos;
00318                         break;
00319                 }
00320         }
00321 
00322         if (need_recovery)
00323                 *need_recoveryp = 1;
00324 
00325         if (0) {
00326 corrupt:        __db_err(dbenv, "%s: file contents corrupted", REGISTER_FILE);
00327                 return (ret == 0 ? EACCES : ret);
00328         }
00329 
00330         return (ret);
00331 }
00332 
00333 /*
00334  * __envreg_unregister --
00335  *      Unregister a DB_ENV handle.
00336  *
00337  * PUBLIC: int __envreg_unregister __P((DB_ENV *, int));
00338  */
00339 int
00340 __envreg_unregister(dbenv, recovery_failed)
00341         DB_ENV *dbenv;
00342         int recovery_failed;
00343 {
00344         size_t nw;
00345         int ret, t_ret;
00346         char buf[256];
00347 
00348         ret = 0;
00349 
00350         /*
00351          * If recovery failed, we want to drop our locks and return, but still
00352          * make sure any subsequent process doesn't decide everything is just
00353          * fine and try to get into the database environment.  In the case of
00354          * an error, discard our locks, but leave our slot filled-in.
00355          */
00356         if (recovery_failed)
00357                 goto err;
00358 
00359         /*
00360          * Why isn't an exclusive lock necessary to discard a DB_ENV handle?
00361          *
00362          * We mark our process ID slot empty before we discard the process slot
00363          * lock, and threads of control reviewing the register file ignore any
00364          * slots which they can't lock.
00365          */
00366         snprintf(buf, sizeof(buf), PID_EMPTY, (u_long)0);
00367         if ((ret = __os_seek(dbenv, dbenv->registry,
00368             0, 0, dbenv->registry_off, 0, DB_OS_SEEK_SET)) != 0 ||
00369             (ret = __os_write(
00370             dbenv, dbenv->registry, buf, PID_LEN, &nw)) != 0 ||
00371             nw != PID_LEN)
00372                 goto err;
00373 
00374         /*
00375          * !!!
00376          * This code assumes that closing the file descriptor discards all
00377          * held locks.
00378          *
00379          * !!!
00380          * There is an ordering problem here -- in the case of a process that
00381          * failed in recovery, we're unlocking both the exclusive lock and our
00382          * slot lock.  If the OS unlocked the exclusive lock and then allowed
00383          * another thread of control to acquire the exclusive lock before also
00384          * also releasing our slot lock, we could race.  That can't happen, I
00385          * don't think.
00386          */
00387 err:    if ((t_ret =
00388             __os_closehandle(dbenv, dbenv->registry)) != 0 && ret == 0)
00389                 ret = t_ret;
00390 
00391         dbenv->registry = NULL;
00392         return (ret);
00393 }
00394 
00395 /*
00396  * __envreg_xunlock --
00397  *      Discard the exclusive lock held by the DB_ENV handle.
00398  *
00399  * PUBLIC: int __envreg_xunlock __P((DB_ENV *));
00400  */
00401 int
00402 __envreg_xunlock(dbenv)
00403         DB_ENV *dbenv;
00404 {
00405         pid_t pid;
00406         db_threadid_t tid;
00407         int ret;
00408 
00409         dbenv->thread_id(dbenv, &pid, &tid);
00410 
00411         if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
00412                 __db_msg(dbenv,
00413                     "%lu: recovery completed, unlocking", (u_long)pid);
00414 
00415         if ((ret = REGISTRY_EXCL_UNLOCK(dbenv)) == 0)
00416                 return (ret);
00417 
00418         __db_err(dbenv,
00419             "%s: exclusive file unlock: %s", REGISTER_FILE, db_strerror(ret));
00420         return (__db_panic(dbenv, ret));
00421 }

Generated on Sun Dec 25 12:14:25 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2