00001 /*- 00002 * See the file LICENSE for redistribution information. 00003 * 00004 * Copyright (c) 1998-2005 00005 * Sleepycat Software. All rights reserved. 00006 * 00007 * $Id: region.h,v 12.7 2005/10/13 00:53:00 bostic Exp $ 00008 */ 00009 00010 #ifndef _DB_REGION_H_ 00011 #define _DB_REGION_H_ 00012 00013 /* 00014 * The DB environment consists of some number of "regions", which are described 00015 * by the following four structures: 00016 * 00017 * REGENV -- shared information about the environment 00018 * REGENV_REF -- file describing system memory version of REGENV 00019 * REGION -- shared information about a single region 00020 * REGINFO -- per-process information about a REGION 00021 * 00022 * There are three types of memory that hold regions: 00023 * per-process heap (malloc) 00024 * file mapped into memory (mmap, MapViewOfFile) 00025 * system memory (shmget, CreateFileMapping) 00026 * 00027 * By default, regions are created in filesystem-backed shared memory. They 00028 * can also be created in system shared memory (DB_SYSTEM_MEM), or, if private 00029 * to a process, in heap memory (DB_PRIVATE). 00030 * 00031 * Regions in the filesystem are named "__db.001", "__db.002" and so on. If 00032 * we're not using a private environment allocated in heap, "__db.001" will 00033 * always exist, as we use it to synchronize on the regions, whether they are 00034 * in filesystem-backed memory or system memory. 00035 * 00036 * The file "__db.001" contains a REGENV structure and an array of REGION 00037 * structures. Each REGION structures describes an underlying chunk of 00038 * shared memory. 00039 * 00040 * __db.001 00041 * +---------+ 00042 * |REGENV | 00043 * +---------+ +----------+ 00044 * |REGION |-> | __db.002 | 00045 * | | +----------+ 00046 * +---------+ +----------+ 00047 * |REGION |-> | __db.003 | 00048 * | | +----------+ 00049 * +---------+ +----------+ 00050 * |REGION |-> | __db.004 | 00051 * | | +----------+ 00052 * +---------+ 00053 * 00054 * The tricky part about manipulating the regions is creating or joining the 00055 * database environment. We have to be sure only a single thread of control 00056 * creates and/or recovers a database environment. All other threads should 00057 * then join without seeing inconsistent data. 00058 * 00059 * We do this in two parts: first, we use the underlying O_EXCL flag to the 00060 * open system call to serialize creation of the __db.001 file. The thread 00061 * of control creating that file then proceeds to create the remaining 00062 * regions in the environment, including the mutex region. Once the mutex 00063 * region has been created, the creating thread of control fills in the 00064 * __db.001 file's magic number. Other threads of control (the ones that 00065 * didn't create the __db.001 file), wait on the initialization of the 00066 * __db.001 file's magic number. After it has been initialized, all threads 00067 * of control can proceed, using normal shared mutex locking procedures for 00068 * exclusion. 00069 * 00070 * REGIONs are not moved or removed during the life of the environment, and 00071 * so processes can have long-lived references to them. 00072 * 00073 * One of the REGION structures describes the environment region itself. 00074 * 00075 * The REGION array is not locked in any way. It's an array so we don't have 00076 * to manipulate data structures after a crash -- on some systems, we have to 00077 * join and clean up the mutex region after application failure. Using an 00078 * array means we don't have to worry about broken links or other nastiness 00079 * after the failure. 00080 * 00081 * All requests to create or join a region return a REGINFO structure, which 00082 * is held by the caller and used to open and subsequently close the reference 00083 * to the region. The REGINFO structure contains the per-process information 00084 * that we need to access the region. 00085 * 00086 * The one remaining complication. If the regions (including the environment 00087 * region) live in system memory, and the system memory isn't "named" somehow 00088 * in the filesystem name space, we need some way of finding it. Do this by 00089 * by writing the REGENV_REF structure into the "__db.001" file. When we find 00090 * a __db.001 file that is too small to be a real, on-disk environment, we use 00091 * the information it contains to redirect to the real "__db.001" file/memory. 00092 * This currently only happens when the REGENV file is in shared system memory. 00093 * 00094 * Although DB does not currently grow regions when they run out of memory, it 00095 * would be possible to do so. To grow a region, allocate a new region of the 00096 * appropriate size, then copy the old region over it and insert the additional 00097 * memory into the already existing shalloc arena. Region users must reset 00098 * their base addresses and any local pointers into the memory, of course. 00099 * This failed in historic versions of DB because the region mutexes lived in 00100 * the mapped memory, and when it was unmapped and remapped (or copied), 00101 * threads could lose track of it. Also, some systems didn't support mutex 00102 * copying, e.g., from OSF1 V4.0: 00103 * 00104 * The address of an msemaphore structure may be significant. If the 00105 * msemaphore structure contains any value copied from an msemaphore 00106 * structure at a different address, the result is undefined. 00107 * 00108 * All mutexes are now maintained in a separate region which is never unmapped, 00109 * so growing regions should be possible. 00110 */ 00111 00112 #if defined(__cplusplus) 00113 extern "C" { 00114 #endif 00115 00116 #define DB_REGION_PREFIX "__db" /* DB file name prefix. */ 00117 #define DB_REGION_FMT "__db.%03d" /* Region file name format. */ 00118 #define DB_REGION_ENV "__db.001" /* Primary environment name. */ 00119 #define DB_REGION_NAME_LENGTH 8 /* Length of file names. */ 00120 00121 #define INVALID_REGION_ID 0 /* Out-of-band region ID. */ 00122 #define REGION_ID_ENV 1 /* Primary environment ID. */ 00123 00124 typedef enum { 00125 INVALID_REGION_TYPE=0, /* Region type. */ 00126 REGION_TYPE_ENV, 00127 REGION_TYPE_LOCK, 00128 REGION_TYPE_LOG, 00129 REGION_TYPE_MPOOL, 00130 REGION_TYPE_MUTEX, 00131 REGION_TYPE_TXN } reg_type_t; 00132 00133 #define INVALID_REGION_SEGID -1 /* Segment IDs are either shmget(2) or 00134 * Win16 segment identifiers. They are 00135 * both stored in a "long", and we need 00136 * an out-of-band value. 00137 */ 00138 /* 00139 * Nothing can live at region offset 0, because, in all cases, that's where 00140 * we store *something*. Lots of code needs an out-of-band value for region 00141 * offsets, so we use 0. 00142 */ 00143 #define INVALID_ROFF 0 00144 00145 /* Reference describing system memory version of REGENV. */ 00146 typedef struct __db_reg_env_ref { 00147 roff_t size; /* Region size. */ 00148 long segid; /* UNIX shmget ID, VxWorks ID. */ 00149 } REGENV_REF; 00150 00151 /* Per-environment region information. */ 00152 typedef struct __db_reg_env { 00153 /* 00154 * !!! 00155 * The magic, panic, version and envid fields of the region are fixed 00156 * in size, the timestamp field is the first field which is variable 00157 * length. These fields must never change in order, to guarantee we 00158 * can always read them, no matter what Berkeley DB release we have. 00159 * 00160 * !!! 00161 * The magic and panic fields are NOT protected by any mutex, and for 00162 * this reason cannot be anything more complicated than zero/non-zero. 00163 */ 00164 u_int32_t magic; /* Valid region magic number. */ 00165 u_int32_t panic; /* Environment is dead. */ 00166 00167 u_int32_t majver; /* Major DB version number. */ 00168 u_int32_t minver; /* Minor DB version number. */ 00169 u_int32_t patchver; /* Patch DB version number. */ 00170 00171 u_int32_t envid; /* Unique environment ID. */ 00172 00173 time_t timestamp; /* Creation time. */ 00174 00175 u_int32_t init_flags; /* Flags environment initialized with.*/ 00176 00177 /* 00178 * The mtx_regenv mutex protects the environment reference count and 00179 * memory allocation from the primary shared region (the crypto and 00180 * replication implementations allocate memory from the primary shared 00181 * region). The rest of the fields are initialized at creation time, 00182 * and so don't need mutex protection. The flags, op_timestamp and 00183 * rep_timestamp fields are used by replication only and are 00184 * protected * by the replication mutex. The rep_timestamp is 00185 * is not protected when it is used in recovery as that is already 00186 * single threaded. 00187 */ 00188 db_mutex_t mtx_regenv; /* Refcnt, region allocation mutex. */ 00189 u_int32_t refcnt; /* References to the environment. */ 00190 00191 u_int32_t region_cnt; /* Number of REGIONs. */ 00192 roff_t region_off; /* Offset of region array */ 00193 00194 roff_t cipher_off; /* Offset of cipher area */ 00195 00196 roff_t rep_off; /* Offset of the replication area. */ 00197 #define DB_REGENV_REPLOCKED 0x0001 /* Env locked for rep backup. */ 00198 u_int32_t flags; /* Shared environment flags. */ 00199 #define DB_REGENV_TIMEOUT 30 /* Backup timeout. */ 00200 time_t op_timestamp; /* Timestamp for operations. */ 00201 time_t rep_timestamp; /* Timestamp for rep db handles. */ 00202 00203 size_t pad; /* Guarantee that following memory is 00204 * size_t aligned. This is necessary 00205 * because we're going to store the 00206 * allocation region information there. 00207 */ 00208 } REGENV; 00209 00210 /* Per-region shared region information. */ 00211 typedef struct __db_region { 00212 u_int32_t id; /* Region id. */ 00213 reg_type_t type; /* Region type. */ 00214 00215 roff_t size_orig; /* Region size in bytes (original). */ 00216 roff_t size; /* Region size in bytes (adjusted). */ 00217 00218 roff_t primary; /* Primary data structure offset. */ 00219 00220 long segid; /* UNIX shmget(2), Win16 segment ID. */ 00221 } REGION; 00222 00223 /* 00224 * Per-process/per-attachment information about a single region. 00225 */ 00226 struct __db_reginfo_t { /* __db_r_attach IN parameters. */ 00227 DB_ENV *dbenv; /* Enclosing environment. */ 00228 reg_type_t type; /* Region type. */ 00229 u_int32_t id; /* Region id. */ 00230 00231 /* __db_r_attach OUT parameters. */ 00232 REGION *rp; /* Shared region. */ 00233 00234 char *name; /* Region file name. */ 00235 00236 void *addr_orig; /* Region address (original). */ 00237 void *addr; /* Region address (adjusted). */ 00238 void *primary; /* Primary data structure address. */ 00239 00240 size_t max_alloc; /* Maximum bytes allocated. */ 00241 size_t allocated; /* Bytes allocated. */ 00242 00243 #ifdef DB_WIN32 00244 HANDLE wnt_handle; /* Win/NT HANDLE. */ 00245 #endif 00246 00247 #define REGION_CREATE 0x01 /* Caller created region. */ 00248 #define REGION_CREATE_OK 0x02 /* Caller willing to create region. */ 00249 #define REGION_JOIN_OK 0x04 /* Caller is looking for a match. */ 00250 u_int32_t flags; 00251 }; 00252 00253 /* 00254 * R_ADDR Return a per-process address for a shared region offset. 00255 * R_OFFSET Return a shared region offset for a per-process address. 00256 */ 00257 #define R_ADDR(reginfop, offset) \ 00258 (F_ISSET((reginfop)->dbenv, DB_ENV_PRIVATE) ? (void *)(offset) :\ 00259 (void *)((u_int8_t *)((reginfop)->addr) + (offset))) 00260 #define R_OFFSET(reginfop, p) \ 00261 (F_ISSET((reginfop)->dbenv, DB_ENV_PRIVATE) ? (roff_t)(p) : \ 00262 (roff_t)((u_int8_t *)(p) - (u_int8_t *)(reginfop)->addr)) 00263 00264 /* PANIC_CHECK: Check to see if the DB environment is dead. */ 00265 #define PANIC_CHECK(dbenv) \ 00266 if ((dbenv)->reginfo != NULL && ((REGENV *) \ 00267 ((REGINFO *)(dbenv)->reginfo)->primary)->panic != 0 && \ 00268 !F_ISSET((dbenv), DB_ENV_NOPANIC)) \ 00269 return (__db_panic_msg(dbenv)); 00270 00271 #if defined(__cplusplus) 00272 } 00273 #endif 00274 #endif /* !_DB_REGION_H_ */