00001 /*- 00002 * See the file LICENSE for redistribution information. 00003 * 00004 * Copyright (c) 1996-2005 00005 * Sleepycat Software. All rights reserved. 00006 * 00007 * $Id: mp.h,v 12.5 2005/08/08 14:52:30 bostic Exp $ 00008 */ 00009 00010 #ifndef _DB_MP_H_ 00011 #define _DB_MP_H_ 00012 00013 struct __bh; typedef struct __bh BH; 00014 struct __db_mpool_hash; typedef struct __db_mpool_hash DB_MPOOL_HASH; 00015 struct __db_mpreg; typedef struct __db_mpreg DB_MPREG; 00016 struct __mpool; typedef struct __mpool MPOOL; 00017 00018 /* We require at least 20KB of cache. */ 00019 #define DB_CACHESIZE_MIN (20 * 1024) 00020 00021 /* 00022 * DB_MPOOLFILE initialization methods cannot be called after open is called, 00023 * other methods cannot be called before open is called 00024 */ 00025 #define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name) \ 00026 if (F_ISSET(dbmfp, MP_OPEN_CALLED)) \ 00027 return (__db_mi_open((dbmfp)->dbenv, name, 1)); 00028 #define MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name) \ 00029 if (!F_ISSET(dbmfp, MP_OPEN_CALLED)) \ 00030 return (__db_mi_open((dbmfp)->dbenv, name, 0)); 00031 00032 typedef enum { 00033 DB_SYNC_ALLOC, /* Flush for allocation. */ 00034 DB_SYNC_CACHE, /* Checkpoint or flush entire cache. */ 00035 DB_SYNC_FILE, /* Flush file. */ 00036 DB_SYNC_TRICKLE /* Trickle sync. */ 00037 } db_sync_op; 00038 00039 /* 00040 * DB_MPOOL -- 00041 * Per-process memory pool structure. 00042 */ 00043 struct __db_mpool { 00044 /* These fields need to be protected for multi-threaded support. */ 00045 db_mutex_t mutex; /* Thread mutex. */ 00046 00047 /* 00048 * DB_MPREG structure for the DB pgin/pgout routines. 00049 * 00050 * Linked list of application-specified pgin/pgout routines. 00051 */ 00052 DB_MPREG *pg_inout; 00053 LIST_HEAD(__db_mpregh, __db_mpreg) dbregq; 00054 00055 /* List of DB_MPOOLFILE's. */ 00056 TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq; 00057 00058 /* 00059 * The dbenv, nreg and reginfo fields are not thread protected, 00060 * as they are initialized during mpool creation, and not modified 00061 * again. 00062 */ 00063 DB_ENV *dbenv; /* Enclosing environment. */ 00064 00065 u_int32_t nreg; /* N underlying cache regions. */ 00066 REGINFO *reginfo; /* Underlying cache regions. */ 00067 }; 00068 00069 /* 00070 * DB_MPREG -- 00071 * DB_MPOOL registry of pgin/pgout functions. 00072 */ 00073 struct __db_mpreg { 00074 LIST_ENTRY(__db_mpreg) q; /* Linked list. */ 00075 00076 int32_t ftype; /* File type. */ 00077 /* Pgin, pgout routines. */ 00078 int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *)); 00079 int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *)); 00080 }; 00081 00082 /* 00083 * NCACHE -- 00084 * Select a cache based on the file and the page number. Assumes accesses 00085 * are uniform across pages, which is probably OK. What we really want to 00086 * avoid is anything that puts all pages from any single file in the same 00087 * cache, as we expect that file access will be bursty, and to avoid 00088 * putting all page number N pages in the same cache as we expect access 00089 * to the metapages (page 0) and the root of a btree (page 1) to be much 00090 * more frequent than a random data page. 00091 */ 00092 #define NCACHE(mp, mf_offset, pgno) \ 00093 (((pgno) ^ ((u_int32_t)(mf_offset) >> 3)) % ((MPOOL *)mp)->nreg) 00094 00095 /* 00096 * NBUCKET -- 00097 * We make the assumption that early pages of the file are more likely 00098 * to be retrieved than the later pages, which means the top bits will 00099 * be more interesting for hashing as they're less likely to collide. 00100 * That said, as 512 8K pages represents a 4MB file, so only reasonably 00101 * large files will have page numbers with any other than the bottom 9 00102 * bits set. We XOR in the MPOOL offset of the MPOOLFILE that backs the 00103 * page, since that should also be unique for the page. We don't want 00104 * to do anything very fancy -- speed is more important to us than using 00105 * good hashing. 00106 */ 00107 #define NBUCKET(mc, mf_offset, pgno) \ 00108 (((pgno) ^ ((mf_offset) << 9)) % (mc)->htab_buckets) 00109 00110 /* Macros to lock/unlock the mpool region as a whole. */ 00111 #define MPOOL_SYSTEM_LOCK(dbenv) \ 00112 MUTEX_LOCK(dbenv, ((MPOOL *)((DB_MPOOL *) \ 00113 (dbenv)->mp_handle)->reginfo[0].primary)->mtx_region) 00114 #define MPOOL_SYSTEM_UNLOCK(dbenv) \ 00115 MUTEX_UNLOCK(dbenv, ((MPOOL *)((DB_MPOOL *) \ 00116 (dbenv)->mp_handle)->reginfo[0].primary)->mtx_region) 00117 00118 /* Macros to lock/unlock a specific mpool region. */ 00119 #define MPOOL_REGION_LOCK(dbenv, infop) \ 00120 MUTEX_LOCK(dbenv, ((MPOOL *)(infop)->primary)->mtx_region) 00121 #define MPOOL_REGION_UNLOCK(dbenv, infop) \ 00122 MUTEX_UNLOCK(dbenv, ((MPOOL *)(infop)->primary)->mtx_region) 00123 00124 /* 00125 * MPOOL -- 00126 * Shared memory pool region. 00127 */ 00128 struct __mpool { 00129 /* 00130 * The memory pool can be broken up into individual pieces/files. 00131 * Not what we would have liked, but on Solaris you can allocate 00132 * only a little more than 2GB of memory in a contiguous chunk, 00133 * and I expect to see more systems with similar issues. 00134 * 00135 * While this structure is duplicated in each piece of the cache, 00136 * the first of these pieces/files describes the entire pool, the 00137 * second only describe a piece of the cache. 00138 */ 00139 db_mutex_t mtx_region; /* Region mutex. */ 00140 00141 /* 00142 * The lsn field and list of underlying MPOOLFILEs are thread protected 00143 * by the region lock. 00144 */ 00145 DB_LSN lsn; /* Maximum checkpoint LSN. */ 00146 00147 SH_TAILQ_HEAD(__mpfq) mpfq; /* List of MPOOLFILEs. */ 00148 00149 /* Configuration information: protected by the region lock. */ 00150 size_t mp_mmapsize; /* Maximum file size for mmap. */ 00151 int mp_maxopenfd; /* Maximum open file descriptors. */ 00152 int mp_maxwrite; /* Maximum buffers to write. */ 00153 int mp_maxwrite_sleep; /* Sleep after writing max buffers. */ 00154 00155 /* 00156 * The nreg, regids and maint_off fields are not thread protected, 00157 * as they are initialized during mpool creation, and not modified 00158 * again. 00159 */ 00160 u_int32_t nreg; /* Number of underlying REGIONS. */ 00161 roff_t regids; /* Array of underlying REGION Ids. */ 00162 00163 /* 00164 * The following structure fields only describe the per-cache portion 00165 * of the region. 00166 * 00167 * The htab and htab_buckets fields are not thread protected as they 00168 * are initialized during mpool creation, and not modified again. 00169 * 00170 * The last_checked and lru_count fields are thread protected by 00171 * the region lock. 00172 */ 00173 u_int32_t htab_buckets; /* Number of hash table entries. */ 00174 roff_t htab; /* Hash table offset. */ 00175 u_int32_t last_checked; /* Last bucket checked for free. */ 00176 u_int32_t lru_count; /* Counter for buffer LRU */ 00177 00178 /* 00179 * The stat fields are generally not thread protected, and cannot be 00180 * trusted. Note that st_pages is an exception, and is always updated 00181 * inside a region lock (although it is sometimes read outside of the 00182 * region lock). 00183 */ 00184 DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */ 00185 00186 /* 00187 * We track page puts so that we can decide when allocation is never 00188 * going to succeed. We don't lock the field, all we care about is 00189 * if it changes. 00190 */ 00191 u_int32_t put_counter; /* Count of page put calls. */ 00192 }; 00193 00194 struct __db_mpool_hash { 00195 db_mutex_t mtx_hash; /* Per-bucket mutex. */ 00196 00197 DB_HASHTAB hash_bucket; /* Head of bucket. */ 00198 00199 u_int32_t hash_page_dirty;/* Count of dirty pages. */ 00200 u_int32_t hash_priority; /* Minimum priority of bucket buffer. */ 00201 }; 00202 00203 /* 00204 * The base mpool priority is 1/4th of the name space, or just under 2^30. 00205 * When the LRU counter wraps, we shift everybody down to a base-relative 00206 * value. 00207 */ 00208 #define MPOOL_BASE_DECREMENT (UINT32_MAX - (UINT32_MAX / 4)) 00209 00210 /* 00211 * Mpool priorities from low to high. Defined in terms of fractions of the 00212 * buffers in the pool. 00213 */ 00214 #define MPOOL_PRI_VERY_LOW -1 /* Dead duck. Check and set to 0. */ 00215 #define MPOOL_PRI_LOW -2 /* Low. */ 00216 #define MPOOL_PRI_DEFAULT 0 /* No adjustment -- special case.*/ 00217 #define MPOOL_PRI_HIGH 10 /* With the dirty buffers. */ 00218 #define MPOOL_PRI_DIRTY 10 /* Dirty gets a 10% boost. */ 00219 #define MPOOL_PRI_VERY_HIGH 1 /* Add number of buffers in pool. */ 00220 00221 /* 00222 * MPOOLFILE -- 00223 * Shared DB_MPOOLFILE information. 00224 */ 00225 struct __mpoolfile { 00226 db_mutex_t mutex; /* MPOOLFILE mutex. */ 00227 00228 /* Protected by MPOOLFILE mutex. */ 00229 u_int32_t mpf_cnt; /* Ref count: DB_MPOOLFILEs. */ 00230 u_int32_t block_cnt; /* Ref count: blocks in cache. */ 00231 00232 roff_t path_off; /* File name location. */ 00233 00234 /* 00235 * The following are used for file compaction processing. 00236 * They are only used when a thread is in the process 00237 * of trying to move free pages to the end of the file. 00238 * Other threads may look here when freeing a page. 00239 * Protected by a lock on the metapage. 00240 */ 00241 u_int32_t free_ref; /* Refcount to freelist. */ 00242 u_int32_t free_cnt; /* Count of free pages. */ 00243 size_t free_size; /* Allocated size of free list. */ 00244 roff_t free_list; /* Offset to free list. */ 00245 00246 /* 00247 * We normally don't lock the deadfile field when we read it since we 00248 * only care if the field is zero or non-zero. We do lock on read when 00249 * searching for a matching MPOOLFILE -- see that code for more detail. 00250 */ 00251 int32_t deadfile; /* Dirty pages can be discarded. */ 00252 00253 /* Protected by mpool cache 0 region lock. */ 00254 SH_TAILQ_ENTRY q; /* List of MPOOLFILEs */ 00255 db_pgno_t last_pgno; /* Last page in the file. */ 00256 db_pgno_t orig_last_pgno; /* Original last page in the file. */ 00257 db_pgno_t maxpgno; /* Maximum page number. */ 00258 00259 /* 00260 * None of the following fields are thread protected. 00261 * 00262 * There are potential races with the ftype field because it's read 00263 * without holding a lock. However, it has to be set before adding 00264 * any buffers to the cache that depend on it being set, so there 00265 * would need to be incorrect operation ordering to have a problem. 00266 */ 00267 int32_t ftype; /* File type. */ 00268 00269 /* 00270 * There are potential races with the priority field because it's read 00271 * without holding a lock. However, a collision is unlikely and if it 00272 * happens is of little consequence. 00273 */ 00274 int32_t priority; /* Priority when unpinning buffer. */ 00275 00276 /* 00277 * There are potential races with the file_written field (many threads 00278 * may be writing blocks at the same time), and with no_backing_file 00279 * and unlink_on_close fields, as they may be set while other threads 00280 * are reading them. However, we only care if the field value is zero 00281 * or non-zero, so don't lock the memory. 00282 * 00283 * !!! 00284 * Theoretically, a 64-bit architecture could put two of these fields 00285 * in a single memory operation and we could race. I have never seen 00286 * an architecture where that's a problem, and I believe Java requires 00287 * that to never be the case. 00288 * 00289 * File_written is set whenever a buffer is marked dirty in the cache. 00290 * It can be cleared in some cases, after all dirty buffers have been 00291 * written AND the file has been flushed to disk. 00292 */ 00293 int32_t file_written; /* File was written. */ 00294 int32_t no_backing_file; /* Never open a backing file. */ 00295 int32_t unlink_on_close; /* Unlink file on last close. */ 00296 00297 /* 00298 * We do not protect the statistics in "stat" because of the cost of 00299 * the mutex in the get/put routines. There is a chance that a count 00300 * will get lost. 00301 */ 00302 DB_MPOOL_FSTAT stat; /* Per-file mpool statistics. */ 00303 00304 /* 00305 * The remaining fields are initialized at open and never subsequently 00306 * modified. 00307 */ 00308 int32_t lsn_off; /* Page's LSN offset. */ 00309 u_int32_t clear_len; /* Bytes to clear on page create. */ 00310 00311 roff_t fileid_off; /* File ID string location. */ 00312 00313 roff_t pgcookie_len; /* Pgin/pgout cookie length. */ 00314 roff_t pgcookie_off; /* Pgin/pgout cookie location. */ 00315 00316 /* 00317 * The flags are initialized at open and never subsequently modified. 00318 */ 00319 #define MP_CAN_MMAP 0x001 /* If the file can be mmap'd. */ 00320 #define MP_DIRECT 0x002 /* No OS buffering. */ 00321 #define MP_DURABLE_UNKNOWN 0x004 /* We don't care about durability. */ 00322 #define MP_EXTENT 0x008 /* Extent file. */ 00323 #define MP_FAKE_DEADFILE 0x010 /* Deadfile field: fake flag. */ 00324 #define MP_FAKE_FILEWRITTEN 0x020 /* File_written field: fake flag. */ 00325 #define MP_FAKE_NB 0x040 /* No_backing_file field: fake flag. */ 00326 #define MP_FAKE_UOC 0x080 /* Unlink_on_close field: fake flag. */ 00327 #define MP_NOT_DURABLE 0x100 /* File is not durable. */ 00328 #define MP_TEMP 0x200 /* Backing file is a temporary. */ 00329 u_int32_t flags; 00330 }; 00331 00332 /* 00333 * Flags to __memp_bh_free. 00334 */ 00335 #define BH_FREE_FREEMEM 0x01 00336 #define BH_FREE_UNLOCKED 0x02 00337 00338 /* 00339 * BH -- 00340 * Buffer header. 00341 */ 00342 struct __bh { 00343 db_mutex_t mtx_bh; /* Buffer thread/process mutex. */ 00344 00345 u_int16_t ref; /* Reference count. */ 00346 u_int16_t ref_sync; /* Sync wait-for reference count. */ 00347 00348 #define BH_CALLPGIN 0x001 /* Convert the page before use. */ 00349 #define BH_DIRTY 0x002 /* Page was modified. */ 00350 #define BH_DIRTY_CREATE 0x004 /* Page created, must be written. */ 00351 #define BH_DISCARD 0x008 /* Page is useless. */ 00352 #define BH_LOCKED 0x010 /* Page is locked (I/O in progress). */ 00353 #define BH_TRASH 0x020 /* Page is garbage. */ 00354 u_int16_t flags; 00355 00356 u_int32_t priority; /* LRU priority. */ 00357 SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */ 00358 00359 db_pgno_t pgno; /* Underlying MPOOLFILE page number. */ 00360 roff_t mf_offset; /* Associated MPOOLFILE offset. */ 00361 00362 /* 00363 * !!! 00364 * This array must be at least size_t aligned -- the DB access methods 00365 * put PAGE and other structures into it, and then access them directly. 00366 * (We guarantee size_t alignment to applications in the documentation, 00367 * too.) 00368 */ 00369 u_int8_t buf[1]; /* Variable length data. */ 00370 }; 00371 /* 00372 * Flags to __memp_ftruncate. 00373 */ 00374 #define MP_TRUNC_RECOVER 0x01 00375 00376 #include "dbinc_auto/mp_ext.h" 00377 #endif /* !_DB_MP_H_ */