Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

region.h

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1998-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: region.h,v 12.7 2005/10/13 00:53:00 bostic Exp $
00008  */
00009 
00010 #ifndef _DB_REGION_H_
00011 #define _DB_REGION_H_
00012 
00013 /*
00014  * The DB environment consists of some number of "regions", which are described
00015  * by the following four structures:
00016  *
00017  *      REGENV     -- shared information about the environment
00018  *      REGENV_REF -- file describing system memory version of REGENV
00019  *      REGION     -- shared information about a single region
00020  *      REGINFO    -- per-process information about a REGION
00021  *
00022  * There are three types of memory that hold regions:
00023  *      per-process heap (malloc)
00024  *      file mapped into memory (mmap, MapViewOfFile)
00025  *      system memory (shmget, CreateFileMapping)
00026  *
00027  * By default, regions are created in filesystem-backed shared memory.  They
00028  * can also be created in system shared memory (DB_SYSTEM_MEM), or, if private
00029  * to a process, in heap memory (DB_PRIVATE).
00030  *
00031  * Regions in the filesystem are named "__db.001", "__db.002" and so on.  If
00032  * we're not using a private environment allocated in heap, "__db.001" will
00033  * always exist, as we use it to synchronize on the regions, whether they are
00034  * in filesystem-backed memory or system memory.
00035  *
00036  * The file "__db.001" contains a REGENV structure and an array of REGION
00037  * structures.  Each REGION structure describes an underlying chunk of
00038  * shared memory.
00039  *
00040  *      __db.001
00041  *      +---------+
00042  *      |REGENV   |
00043  *      +---------+   +----------+
00044  *      |REGION   |-> | __db.002 |
00045  *      |         |   +----------+
00046  *      +---------+   +----------+
00047  *      |REGION   |-> | __db.003 |
00048  *      |         |   +----------+
00049  *      +---------+   +----------+
00050  *      |REGION   |-> | __db.004 |
00051  *      |         |   +----------+
00052  *      +---------+
00053  *
00054  * The tricky part about manipulating the regions is creating or joining the
00055  * database environment.  We have to be sure only a single thread of control
00056  * creates and/or recovers a database environment.  All other threads should
00057  * then join without seeing inconsistent data.
00058  *
00059  * We do this in two parts: first, we use the underlying O_EXCL flag to the
00060  * open system call to serialize creation of the __db.001 file.  The thread
00061  * of control creating that file then proceeds to create the remaining
00062  * regions in the environment, including the mutex region.  Once the mutex
00063  * region has been created, the creating thread of control fills in the
00064  * __db.001 file's magic number.  Other threads of control (the ones that
00065  * didn't create the __db.001 file), wait on the initialization of the
00066  * __db.001 file's magic number.  After it has been initialized, all threads
00067  * of control can proceed, using normal shared mutex locking procedures for
00068  * exclusion.
00069  *
00070  * REGIONs are not moved or removed during the life of the environment, and
00071  * so processes can have long-lived references to them.
00072  *
00073  * One of the REGION structures describes the environment region itself.
00074  *
00075  * The REGION array is not locked in any way.  It's an array so we don't have
00076  * to manipulate data structures after a crash -- on some systems, we have to
00077  * join and clean up the mutex region after application failure.  Using an
00078  * array means we don't have to worry about broken links or other nastiness
00079  * after the failure.
00080  *
00081  * All requests to create or join a region return a REGINFO structure, which
00082  * is held by the caller and used to open and subsequently close the reference
00083  * to the region.  The REGINFO structure contains the per-process information
00084  * that we need to access the region.
00085  *
00086  * One complication remains.  If the regions (including the environment
00087  * region) live in system memory, and the system memory isn't "named" somehow
00088  * in the filesystem name space, we need some way of finding it.  Do this by
00089  * writing the REGENV_REF structure into the "__db.001" file.  When we find
00090  * a __db.001 file that is too small to be a real, on-disk environment, we use
00091  * the information it contains to redirect to the real "__db.001" file/memory.
00092  * This currently only happens when the REGENV file is in shared system memory.
00093  *
00094  * Although DB does not currently grow regions when they run out of memory, it
00095  * would be possible to do so.  To grow a region, allocate a new region of the
00096  * appropriate size, then copy the old region over it and insert the additional
00097  * memory into the already existing shalloc arena.  Region users must reset
00098  * their base addresses and any local pointers into the memory, of course.
00099  * This failed in historic versions of DB because the region mutexes lived in
00100  * the mapped memory, and when it was unmapped and remapped (or copied),
00101  * threads could lose track of it.  Also, some systems didn't support mutex
00102  * copying, e.g., from OSF1 V4.0:
00103  *
00104  *      The address of an msemaphore structure may be significant.  If the
00105  *      msemaphore structure contains any value copied from an msemaphore
00106  *      structure at a different address, the result is undefined.
00107  *
00108  * All mutexes are now maintained in a separate region which is never unmapped,
00109  * so growing regions should be possible.
00110  */
00111 
00112 #if defined(__cplusplus)
00113 extern "C" {
00114 #endif
00115 
00116 #define DB_REGION_PREFIX        "__db"          /* DB file name prefix. */
00117 #define DB_REGION_FMT           "__db.%03d"     /* Region file name format. */
00118 #define DB_REGION_ENV           "__db.001"      /* Primary environment name. */
00119 #define DB_REGION_NAME_LENGTH   8               /* Region file name length, sans NUL. */
00120 
00121 #define INVALID_REGION_ID       0       /* Out-of-band region ID. */
00122 #define REGION_ID_ENV           1       /* Primary environment ID. */
00123 
00124 typedef enum {
00125         INVALID_REGION_TYPE=0,          /* Not a valid region type. */
00126         REGION_TYPE_ENV,
00127         REGION_TYPE_LOCK,
00128         REGION_TYPE_LOG,
00129         REGION_TYPE_MPOOL,
00130         REGION_TYPE_MUTEX,
00131         REGION_TYPE_TXN } reg_type_t;
00132 
00133 #define INVALID_REGION_SEGID    -1      /* Segment IDs are either shmget(2) or
00134                                          * Win16 segment identifiers.  They are
00135                                          * both stored in a "long", and we need
00136                                          * an out-of-band value.
00137                                          */
00138 /*
00139  * Nothing can live at region offset 0, because, in all cases, that's where
00140  * we store *something*.  Lots of code needs an out-of-band value for region
00141  * offsets, so we use 0.
00142  */
00143 #define INVALID_ROFF            0       /* Out-of-band region offset. */
00144 
00145 /* Reference to the system memory version of REGENV, kept in "__db.001". */
00146 typedef struct __db_reg_env_ref {
00147         roff_t     size;                /* Region size. */
00148         long       segid;               /* UNIX shmget ID, VxWorks ID. */
00149 } REGENV_REF;
00150 
00151 /* Per-environment region information. */
00152 typedef struct __db_reg_env {
00153         /*
00154          * !!!
00155          * The magic, panic, version and envid fields of the region are fixed
00156          * in size; the timestamp field is the first field which is variable
00157          * length.  These fields must never change in order, to guarantee we
00158          * can always read them, no matter what Berkeley DB release we have.
00159          *
00160          * !!!
00161          * The magic and panic fields are NOT protected by any mutex, and for
00162          * this reason cannot be anything more complicated than zero/non-zero.
00163          */
00164         u_int32_t magic;                /* Valid region magic number. */
00165         u_int32_t panic;                /* Environment is dead. */
00166 
00167         u_int32_t majver;               /* Major DB version number. */
00168         u_int32_t minver;               /* Minor DB version number. */
00169         u_int32_t patchver;             /* Patch DB version number. */
00170 
00171         u_int32_t envid;                /* Unique environment ID. */
00172 
00173         time_t    timestamp;            /* Creation time. */
00174 
00175         u_int32_t init_flags;           /* Flags environment initialized with.*/
00176 
00177         /*
00178          * The mtx_regenv mutex protects the environment reference count and
00179          * memory allocation from the primary shared region (the crypto and
00180          * replication implementations allocate memory from the primary shared
00181          * region).  The rest of the fields are initialized at creation time,
00182          * and so don't need mutex protection.  The flags, op_timestamp and
00183          * rep_timestamp fields are used by replication only and are
00184          * protected by the replication mutex.  The rep_timestamp is
00185          * not protected when it is used in recovery as that is already
00186          * single threaded.
00187          */
00188         db_mutex_t mtx_regenv;          /* Refcnt, region allocation mutex. */
00189         u_int32_t  refcnt;              /* References to the environment. */
00190 
00191         u_int32_t region_cnt;           /* Number of REGIONs. */
00192         roff_t    region_off;           /* Offset of region array. */
00193 
00194         roff_t    cipher_off;           /* Offset of cipher area. */
00195 
00196         roff_t    rep_off;              /* Offset of the replication area. */
00197 #define DB_REGENV_REPLOCKED     0x0001  /* Env locked for rep backup. */
00198         u_int32_t flags;                /* Shared environment flags. */
00199 #define DB_REGENV_TIMEOUT       30      /* Backup timeout. */
00200         time_t    op_timestamp;         /* Timestamp for operations. */
00201         time_t    rep_timestamp;        /* Timestamp for rep db handles. */
00202 
00203         size_t  pad;                    /* Guarantee that following memory is
00204                                          * size_t aligned.  This is necessary
00205                                          * because we're going to store the
00206                                          * allocation region information there.
00207                                          */
00208 } REGENV;
00209 
00210 /* Shared information about a single region. */
00211 typedef struct __db_region {
00212         u_int32_t       id;             /* Region id. */
00213         reg_type_t      type;           /* Region type. */
00214 
00215         roff_t  size_orig;              /* Region size in bytes (original). */
00216         roff_t  size;                   /* Region size in bytes (adjusted). */
00217 
00218         roff_t  primary;                /* Primary data structure offset. */
00219 
00220         long    segid;                  /* UNIX shmget(2), Win16 segment ID. */
00221 } REGION;
00222 
00223 /*
00224  * Per-process/per-attachment information about a single region.
00225  */
00226 struct __db_reginfo_t {         /* __db_r_attach IN parameters. */
00227         DB_ENV     *dbenv;              /* Enclosing environment. */
00228         reg_type_t  type;               /* Region type. */
00229         u_int32_t   id;                 /* Region id. */
00230 
00231                                 /* __db_r_attach OUT parameters. */
00232         REGION     *rp;                 /* Shared region. */
00233 
00234         char       *name;               /* Region file name. */
00235 
00236         void       *addr_orig;          /* Region address (original). */
00237         void       *addr;               /* Region address (adjusted). */
00238         void       *primary;            /* Primary data structure address. */
00239 
00240         size_t      max_alloc;          /* Maximum bytes allocated. */
00241         size_t      allocated;          /* Bytes allocated. */
00242 
00243 #ifdef DB_WIN32
00244         HANDLE  wnt_handle;             /* Win/NT HANDLE. */
00245 #endif
00246 
00247 #define REGION_CREATE           0x01    /* Caller created region. */
00248 #define REGION_CREATE_OK        0x02    /* Caller willing to create region. */
00249 #define REGION_JOIN_OK          0x04    /* Caller is looking for a match. */
00250         u_int32_t   flags;              /* Presumably the REGION_* bits above. */
00251 };
00252 
00253 /*
00254  * R_ADDR       Offset -> per-process address (plain cast if DB_ENV_PRIVATE).
00255  * R_OFFSET     Per-process address -> offset (plain cast if DB_ENV_PRIVATE).
00256  */
00257 #define R_ADDR(reginfop, offset)                                        \
00258         (F_ISSET((reginfop)->dbenv, DB_ENV_PRIVATE) ? (void *)(offset) :\
00259         (void *)((u_int8_t *)((reginfop)->addr) + (offset)))
00260 #define R_OFFSET(reginfop, p)                                           \
00261         (F_ISSET((reginfop)->dbenv, DB_ENV_PRIVATE) ? (roff_t)(p) :     \
00262         (roff_t)((u_int8_t *)(p) - (u_int8_t *)(reginfop)->addr))
00263 
00264 /* PANIC_CHECK: If the DB environment is dead, return from the *caller*. */
00265 #define PANIC_CHECK(dbenv)                                              \
00266         if ((dbenv)->reginfo != NULL && ((REGENV *)                     \
00267             ((REGINFO *)(dbenv)->reginfo)->primary)->panic != 0 &&      \
00268             !F_ISSET((dbenv), DB_ENV_NOPANIC))                          \
00269                 return (__db_panic_msg(dbenv));
00270 
00271 #if defined(__cplusplus)
00272 }
00273 #endif
00274 #endif /* !_DB_REGION_H_ */

Generated on Sun Dec 25 12:14:22 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2