Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

mp.h

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: mp.h,v 12.5 2005/08/08 14:52:30 bostic Exp $
00008  */
00009 
00010 #ifndef _DB_MP_H_
00011 #define _DB_MP_H_
00012 
00013 struct __bh;            typedef struct __bh BH;   /* Buffer header. */
00014 struct __db_mpool_hash; typedef struct __db_mpool_hash DB_MPOOL_HASH; /* Hash bucket. */
00015 struct __db_mpreg;      typedef struct __db_mpreg DB_MPREG;       /* Pgin/pgout registry. */
00016 struct __mpool;         typedef struct __mpool MPOOL;     /* Shared memory pool region. */
00017 
00018                                 /* Minimum cache size, in bytes: we require at least 20KB. */
00019 #define DB_CACHESIZE_MIN        (20 * 1024)
00020 
00021 /*
00022  * DB_MPOOLFILE initialization methods cannot be called after open is called;
00023  * other methods cannot be called before open is called.
00024  */
00025 #define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name)                             \
00026         if (F_ISSET(dbmfp, MP_OPEN_CALLED))                             \
00027                 return (__db_mi_open((dbmfp)->dbenv, name, 1));
00028 #define MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name)                            \
00029         if (!F_ISSET(dbmfp, MP_OPEN_CALLED))                            \
00030                 return (__db_mi_open((dbmfp)->dbenv, name, 0));
00031 
00032 typedef enum {                  /* Sync (flush) operation types. */
00033         DB_SYNC_ALLOC,          /* Flush for allocation. */
00034         DB_SYNC_CACHE,          /* Checkpoint or flush entire cache. */
00035         DB_SYNC_FILE,           /* Flush file. */
00036         DB_SYNC_TRICKLE         /* Trickle sync. */
00037 } db_sync_op;
00038 
00039 /*
00040  * DB_MPOOL --
00041  *      Per-process memory pool structure.
00042  */
00043 struct __db_mpool {
00044         /* These fields need to be protected for multi-threaded support. */
00045         db_mutex_t mutex;               /* Thread mutex. */
00046 
00047         /*
00048          * pg_inout: DB_MPREG structure for the DB pgin/pgout routines.
00049          *
00050          * dbregq: linked list of application-specified pgin/pgout routines.
00051          */
00052         DB_MPREG *pg_inout;
00053         LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;
00054 
00055                                         /* List of DB_MPOOLFILE's. */
00056         TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;
00057 
00058         /*
00059          * The dbenv, nreg and reginfo fields are not thread protected,
00060          * as they are initialized during mpool creation, and not modified
00061          * again.
00062          */
00063         DB_ENV     *dbenv;              /* Enclosing environment. */
00064 
00065         u_int32_t   nreg;               /* N underlying cache regions. */
00066         REGINFO    *reginfo;            /* Underlying cache regions. */
00067 };
00068 
00069 /*
00070  * DB_MPREG --
00071  *      DB_MPOOL registry of pgin/pgout functions.
00072  */
00073 struct __db_mpreg {
00074         LIST_ENTRY(__db_mpreg) q;       /* Linked list entry. */
00075 
00076         int32_t ftype;                  /* File type. */
00077                                         /* Pgin, pgout routines. */
00078         int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
00079         int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
00080 };
00081 
00082 /*
00083  * NCACHE --
00084  *      Select a cache based on the file and the page number.  Assumes accesses
00085  *      are uniform across pages, which is probably OK.  What we really want to
00086  *      avoid is anything that puts all pages from any single file in the same
00087  *      cache, as we expect that file access will be bursty, and to avoid
00088  *      putting all page number N pages in the same cache as we expect access
00089  *      to the metapages (page 0) and the root of a btree (page 1) to be much
00090  *      more frequent than a random data page.
00091  */
00092 #define NCACHE(mp, mf_offset, pgno)                                     \
00093         (((pgno) ^ ((u_int32_t)(mf_offset) >> 3)) % ((MPOOL *)(mp))->nreg)
00094 
00095 /*
00096  * NBUCKET --
00097  *       We make the assumption that early pages of the file are more likely
00098  *       to be retrieved than the later pages, which means the top bits will
00099  *       be more interesting for hashing as they're less likely to collide.
00100  *       That said, as 512 8K pages represent a 4MB file, only reasonably
00101  *       large files will have page numbers with any other than the bottom 9
00102  *       bits set.  We XOR in the MPOOL offset of the MPOOLFILE that backs the
00103  *       page, since that should also be unique for the page.  We don't want
00104  *       to do anything very fancy -- speed is more important to us than using
00105  *       good hashing.
00106  */
00107 #define NBUCKET(mc, mf_offset, pgno)                                    \
00108         (((pgno) ^ ((mf_offset) << 9)) % (mc)->htab_buckets)
00109 
00110 /* Macros to lock/unlock the mpool as a whole, via cache region 0's mtx_region. */
00111 #define MPOOL_SYSTEM_LOCK(dbenv)                                        \
00112         MUTEX_LOCK(dbenv, ((MPOOL *)((DB_MPOOL *)                       \
00113             (dbenv)->mp_handle)->reginfo[0].primary)->mtx_region)
00114 #define MPOOL_SYSTEM_UNLOCK(dbenv)                                      \
00115         MUTEX_UNLOCK(dbenv, ((MPOOL *)((DB_MPOOL *)                     \
00116             (dbenv)->mp_handle)->reginfo[0].primary)->mtx_region)
00117 
00118 /* Macros to lock/unlock a specific mpool region, via infop's mtx_region. */
00119 #define MPOOL_REGION_LOCK(dbenv, infop)                                 \
00120         MUTEX_LOCK(dbenv, ((MPOOL *)(infop)->primary)->mtx_region)
00121 #define MPOOL_REGION_UNLOCK(dbenv, infop)                               \
00122         MUTEX_UNLOCK(dbenv, ((MPOOL *)(infop)->primary)->mtx_region)
00123 
00124 /*
00125  * MPOOL --
00126  *      Shared memory pool region.
00127  */
00128 struct __mpool {
00129         /*
00130          * The memory pool can be broken up into individual pieces/files.
00131          * Not what we would have liked, but on Solaris you can allocate
00132          * only a little more than 2GB of memory in a contiguous chunk,
00133          * and I expect to see more systems with similar issues.
00134          *
00135          * While this structure is duplicated in each piece of the cache,
00136          * the first of these pieces/files describes the entire pool, the
00137          * second only describes a piece of the cache.
00138          */
00139         db_mutex_t      mtx_region;     /* Region mutex. */
00140 
00141         /*
00142          * The lsn field and list of underlying MPOOLFILEs are thread protected
00143          * by the region lock.
00144          */
00145         DB_LSN    lsn;                  /* Maximum checkpoint LSN. */
00146 
00147         SH_TAILQ_HEAD(__mpfq) mpfq;     /* List of MPOOLFILEs. */
00148 
00149         /* Configuration information: protected by the region lock. */
00150         size_t mp_mmapsize;             /* Maximum file size for mmap. */
00151         int    mp_maxopenfd;            /* Maximum open file descriptors. */
00152         int    mp_maxwrite;             /* Maximum buffers to write. */
00153         int    mp_maxwrite_sleep;       /* Sleep after writing max buffers. */
00154 
00155         /*
00156          * The nreg and regids fields are not thread protected, as they
00157          * are initialized during mpool creation, and not modified
00158          * again.
00159          */
00160         u_int32_t nreg;                 /* Number of underlying REGIONS. */
00161         roff_t    regids;               /* Array of underlying REGION Ids. */
00162 
00163         /*
00164          * The following structure fields only describe the per-cache portion
00165          * of the region.
00166          *
00167          * The htab and htab_buckets fields are not thread protected as they
00168          * are initialized during mpool creation, and not modified again.
00169          *
00170          * The last_checked and lru_count fields are thread protected by
00171          * the region lock.
00172          */
00173         u_int32_t htab_buckets; /* Number of hash table entries. */
00174         roff_t    htab;         /* Hash table offset. */
00175         u_int32_t last_checked; /* Last bucket checked for free. */
00176         u_int32_t lru_count;            /* Counter for buffer LRU. */
00177 
00178         /*
00179          * The stat fields are generally not thread protected, and cannot be
00180          * trusted.  Note that st_pages is an exception, and is always updated
00181          * inside a region lock (although it is sometimes read outside of the
00182          * region lock).
00183          */
00184         DB_MPOOL_STAT stat;             /* Per-cache mpool statistics. */
00185 
00186         /*
00187          * We track page puts so that we can decide when allocation is never
00188          * going to succeed.  We don't lock the field, all we care about is
00189          * if it changes.
00190          */
00191         u_int32_t  put_counter;         /* Count of page put calls. */
00192 };
00193 
00194 struct __db_mpool_hash {        /* DB_MPOOL_HASH: one cache hash bucket. */
00195         db_mutex_t      mtx_hash;       /* Per-bucket mutex. */
00196 
00197         DB_HASHTAB      hash_bucket;    /* Head of bucket. */
00198 
00199         u_int32_t       hash_page_dirty;/* Count of dirty pages. */
00200         u_int32_t       hash_priority;  /* Minimum priority of bucket buffer. */
00201 };
00202 
00203 /*
00204  * The base mpool priority is 1/4th of the name space (UINT32_MAX / 4), or
00205  * just under 2^30.  When the LRU counter wraps, we shift everybody down to
00206  * a base-relative value; the decrement is the name space above that base.
00207  */
00208 #define MPOOL_BASE_DECREMENT    (UINT32_MAX - (UINT32_MAX / 4))
00209 
00210 /*
00211  * Mpool priorities from low to high.  Defined in terms of fractions of the
00212  * buffers in the pool.
00213  */
00214 #define MPOOL_PRI_VERY_LOW      -1      /* Dead duck.  Check and set to 0. */
00215 #define MPOOL_PRI_LOW           -2      /* Low. */
00216 #define MPOOL_PRI_DEFAULT       0       /* No adjustment -- special case. */
00217 #define MPOOL_PRI_HIGH          10      /* With the dirty buffers. */
00218 #define MPOOL_PRI_DIRTY         10      /* Dirty gets a 10% boost. */
00219 #define MPOOL_PRI_VERY_HIGH     1       /* Add number of buffers in pool. */
00220 
00221 /*
00222  * MPOOLFILE --
00223  *      Shared DB_MPOOLFILE information.
00224  */
00225 struct __mpoolfile {
00226         db_mutex_t mutex;               /* MPOOLFILE mutex. */
00227 
00228         /* Protected by MPOOLFILE mutex. */
00229         u_int32_t mpf_cnt;              /* Ref count: DB_MPOOLFILEs. */
00230         u_int32_t block_cnt;            /* Ref count: blocks in cache. */
00231 
00232         roff_t    path_off;             /* File name location. */
00233 
00234         /*
00235          * The following are used for file compaction processing.
00236          * They are only used when a thread is in the process
00237          * of trying to move free pages to the end of the file.
00238          * Other threads may look here when freeing a page.
00239          * Protected by a lock on the metapage.
00240          */
00241         u_int32_t free_ref;             /* Refcount to freelist. */
00242         u_int32_t free_cnt;             /* Count of free pages. */
00243         size_t    free_size;            /* Allocated size of free list. */
00244         roff_t    free_list;            /* Offset to free list. */
00245 
00246         /*
00247          * We normally don't lock the deadfile field when we read it since we
00248          * only care if the field is zero or non-zero.  We do lock on read when
00249          * searching for a matching MPOOLFILE -- see that code for more detail.
00250          */
00251         int32_t   deadfile;             /* Dirty pages can be discarded. */
00252 
00253         /* Protected by mpool cache 0 region lock. */
00254         SH_TAILQ_ENTRY q;               /* List of MPOOLFILEs. */
00255         db_pgno_t last_pgno;            /* Last page in the file. */
00256         db_pgno_t orig_last_pgno;       /* Original last page in the file. */
00257         db_pgno_t maxpgno;              /* Maximum page number. */
00258 
00259         /*
00260          * None of the following fields are thread protected.
00261          *
00262          * There are potential races with the ftype field because it's read
00263          * without holding a lock.  However, it has to be set before adding
00264          * any buffers to the cache that depend on it being set, so there
00265          * would need to be incorrect operation ordering to have a problem.
00266          */
00267         int32_t   ftype;                /* File type. */
00268 
00269         /*
00270          * There are potential races with the priority field because it's read
00271          * without holding a lock.  However, a collision is unlikely and if it
00272          * happens is of little consequence.
00273          */
00274         int32_t   priority;             /* Priority when unpinning buffer. */
00275 
00276         /*
00277          * There are potential races with the file_written field (many threads
00278          * may be writing blocks at the same time), and with no_backing_file
00279          * and unlink_on_close fields, as they may be set while other threads
00280          * are reading them.  However, we only care if the field value is zero
00281          * or non-zero, so don't lock the memory.
00282          *
00283          * !!!
00284          * Theoretically, a 64-bit architecture could put two of these fields
00285          * in a single memory operation and we could race.  I have never seen
00286          * an architecture where that's a problem, and I believe Java requires
00287          * that to never be the case.
00288          *
00289          * File_written is set whenever a buffer is marked dirty in the cache.
00290          * It can be cleared in some cases, after all dirty buffers have been
00291          * written AND the file has been flushed to disk.
00292          */
00293         int32_t   file_written;         /* File was written. */
00294         int32_t   no_backing_file;      /* Never open a backing file. */
00295         int32_t   unlink_on_close;      /* Unlink file on last close. */
00296 
00297         /*
00298          * We do not protect the statistics in "stat" because of the cost of
00299          * the mutex in the get/put routines.  There is a chance that a count
00300          * will get lost.
00301          */
00302         DB_MPOOL_FSTAT stat;            /* Per-file mpool statistics. */
00303 
00304         /*
00305          * The remaining fields are initialized at open and never subsequently
00306          * modified.
00307          */
00308         int32_t   lsn_off;              /* Page's LSN offset. */
00309         u_int32_t clear_len;            /* Bytes to clear on page create. */
00310 
00311         roff_t    fileid_off;           /* File ID string location. */
00312 
00313         roff_t    pgcookie_len;         /* Pgin/pgout cookie length. */
00314         roff_t    pgcookie_off;         /* Pgin/pgout cookie location. */
00315 
00316         /*
00317          * The flags are initialized at open and never subsequently modified.
00318          */
00319 #define MP_CAN_MMAP             0x001   /* If the file can be mmap'd. */
00320 #define MP_DIRECT               0x002   /* No OS buffering. */
00321 #define MP_DURABLE_UNKNOWN      0x004   /* We don't care about durability. */
00322 #define MP_EXTENT               0x008   /* Extent file. */
00323 #define MP_FAKE_DEADFILE        0x010   /* Deadfile field: fake flag. */
00324 #define MP_FAKE_FILEWRITTEN     0x020   /* File_written field: fake flag. */
00325 #define MP_FAKE_NB              0x040   /* No_backing_file field: fake flag. */
00326 #define MP_FAKE_UOC             0x080   /* Unlink_on_close field: fake flag. */
00327 #define MP_NOT_DURABLE          0x100   /* File is not durable. */
00328 #define MP_TEMP                 0x200   /* Backing file is a temporary. */
00329         u_int32_t  flags;
00330 };
00331 
00332 /*
00333  * Flags to __memp_bh_free (distinct bit values).
00334  */
00335 #define BH_FREE_FREEMEM         0x01
00336 #define BH_FREE_UNLOCKED        0x02
00337 
00338 /*
00339  * BH --
00340  *      Buffer header; the buffer's data follows in buf.
00341  */
00342 struct __bh {
00343         db_mutex_t      mtx_bh;         /* Buffer thread/process mutex. */
00344 
00345         u_int16_t       ref;            /* Reference count. */
00346         u_int16_t       ref_sync;       /* Sync wait-for reference count. */
00347 
00348 #define BH_CALLPGIN     0x001           /* Convert the page before use. */
00349 #define BH_DIRTY        0x002           /* Page was modified. */
00350 #define BH_DIRTY_CREATE 0x004           /* Page created, must be written. */
00351 #define BH_DISCARD      0x008           /* Page is useless. */
00352 #define BH_LOCKED       0x010           /* Page is locked (I/O in progress). */
00353 #define BH_TRASH        0x020           /* Page is garbage. */
00354         u_int16_t       flags;
00355 
00356         u_int32_t       priority;       /* LRU priority. */
00357         SH_TAILQ_ENTRY  hq;             /* MPOOL hash bucket queue. */
00358 
00359         db_pgno_t pgno;                 /* Underlying MPOOLFILE page number. */
00360         roff_t    mf_offset;            /* Associated MPOOLFILE offset. */
00361 
00362         /*
00363          * !!!
00364          * This array must be at least size_t aligned -- the DB access methods
00365          * put PAGE and other structures into it, and then access them directly.
00366          * (We guarantee size_t alignment to applications in the documentation,
00367          * too.)
00368          */
00369         u_int8_t   buf[1];              /* Variable length data. */
00370 };
00371 /*
00372  * Flags to __memp_ftruncate.  NOTE(review): only one flag is defined here.
00373  */
00374 #define MP_TRUNC_RECOVER        0x01
00375 
00376 #include "dbinc_auto/mp_ext.h"
00377 #endif /* !_DB_MP_H_ */

Generated on Sun Dec 25 12:14:22 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2