00001 /*- 00002 * See the file LICENSE for redistribution information. 00003 * 00004 * Copyright (c) 1996-2005 00005 * Sleepycat Software. All rights reserved. 00006 * 00007 * $Id: log.h,v 12.12 2005/10/20 18:57:05 bostic Exp $ 00008 */ 00009 00010 #ifndef _LOG_H_ 00011 #define _LOG_H_ 00012 00013 /******************************************************* 00014 * DBREG: 00015 * The DB file register code keeps track of open files. It's stored 00016 * in the log subsystem's shared region, and so appears in the log.h 00017 * header file, but is logically separate. 00018 *******************************************************/ 00019 /* 00020 * The per-process table that maps log file-id's to DB structures. 00021 */ 00022 typedef struct __db_entry { 00023 DB *dbp; /* Open dbp for this file id. */ 00024 int deleted; /* File was not found during open. */ 00025 } DB_ENTRY; 00026 00027 /* 00028 * FNAME -- 00029 * File name and id. 00030 */ 00031 struct __fname { 00032 SH_TAILQ_ENTRY q; /* File name queue. */ 00033 00034 int32_t id; /* Logging file id. */ 00035 DBTYPE s_type; /* Saved DB type. */ 00036 00037 roff_t name_off; /* Name offset. */ 00038 db_pgno_t meta_pgno; /* Page number of the meta page. */ 00039 u_int8_t ufid[DB_FILE_ID_LEN]; /* Unique file id. */ 00040 00041 u_int32_t create_txnid; /* 00042 * Txn ID of the DB create, stored so 00043 * we can log it at register time. 00044 */ 00045 #define DB_FNAME_NOTLOGGED 0x01 /* Log of close failed. */ 00046 #define DB_FNAME_DURABLE 0x02 /* File is durable. */ 00047 u_int32_t flags; 00048 }; 00049 00050 /* File open/close register log record opcodes. */ 00051 #define DBREG_CHKPNT 1 /* Checkpoint: file name/id dump. */ 00052 #define DBREG_CLOSE 2 /* File close. */ 00053 #define DBREG_OPEN 3 /* File open. */ 00054 #define DBREG_PREOPEN 4 /* Open in mpool only. */ 00055 #define DBREG_RCLOSE 5 /* File close after recovery. */ 00056 #define DBREG_REOPEN 6 /* Open for in-memory database. */ 00057 00058 /******************************************************* 00059 * LOG: 00060 * The log subsystem information. 00061 *******************************************************/ 00062 struct __db_log; typedef struct __db_log DB_LOG; 00063 struct __hdr; typedef struct __hdr HDR; 00064 struct __log; typedef struct __log LOG; 00065 struct __log_persist; typedef struct __log_persist LOGP; 00066 00067 #define LFPREFIX "log." /* Log file name prefix. */ 00068 #define LFNAME "log.%010d" /* Log file name template. */ 00069 #define LFNAME_V1 "log.%05d" /* Log file name template, rev 1. */ 00070 00071 #define LG_MAX_DEFAULT (10 * MEGABYTE) /* 10 MB. */ 00072 #define LG_MAX_INMEM (256 * 1024) /* 256 KB. */ 00073 #define LG_BSIZE_DEFAULT (32 * 1024) /* 32 KB. */ 00074 #define LG_BSIZE_INMEM (1 * MEGABYTE) /* 1 MB. */ 00075 #define LG_BASE_REGION_SIZE (60 * 1024) /* 60 KB. */ 00076 00077 /* 00078 * DB_LOG 00079 * Per-process log structure. 00080 */ 00081 struct __db_log { 00082 /* 00083 * These fields need to be protected for multi-threaded support. 00084 */ 00085 db_mutex_t mtx_dbreg; /* Mutex for thread protection. */ 00086 00087 DB_ENTRY *dbentry; /* Recovery file-id mapping. */ 00088 #define DB_GROW_SIZE 64 00089 int32_t dbentry_cnt; /* Entries. Grows by DB_GROW_SIZE. */ 00090 00091 /* 00092 * These fields are only accessed when the region lock is held, so 00093 * they do not have to be protected by the thread lock as well. 00094 */ 00095 u_int32_t lfname; /* Log file "name". */ 00096 DB_FH *lfhp; /* Log file handle. */ 00097 00098 u_int8_t *bufp; /* Region buffer. */ 00099 00100 /* These fields are not thread protected. */ 00101 DB_ENV *dbenv; /* Reference to error information. */ 00102 REGINFO reginfo; /* Region information. */ 00103 00104 #define DBLOG_RECOVER 0x01 /* We are in recovery. */ 00105 #define DBLOG_FORCE_OPEN 0x02 /* Force the DB open even if it appears 00106 * to be deleted. */ 00107 u_int32_t flags; 00108 }; 00109 00110 /* 00111 * HDR -- 00112 * Log record header. 00113 */ 00114 struct __hdr { 00115 u_int32_t prev; /* Previous offset. */ 00116 u_int32_t len; /* Current length. */ 00117 u_int8_t chksum[DB_MAC_KEY]; /* Current checksum. */ 00118 u_int8_t iv[DB_IV_BYTES]; /* IV */ 00119 u_int32_t orig_size; /* Original size of log record */ 00120 /* !!! - 'size' is not written to log, must be last in hdr */ 00121 size_t size; /* Size of header to use */ 00122 }; 00123 00124 /* 00125 * We use HDR internally, and then when we write out, we write out 00126 * prev, len, and then a 4-byte checksum if normal operation or 00127 * a crypto-checksum and IV and original size if running in crypto 00128 * mode. We must store the original size in case we pad. Set the 00129 * size when we set up the header. We compute a DB_MAC_KEY size 00130 * checksum regardless, but we can safely just use the first 4 bytes. 00131 */ 00132 #define HDR_NORMAL_SZ 12 00133 #define HDR_CRYPTO_SZ 12 + DB_MAC_KEY + DB_IV_BYTES 00134 00135 struct __log_persist { 00136 u_int32_t magic; /* DB_LOGMAGIC */ 00137 u_int32_t version; /* DB_LOGVERSION */ 00138 00139 u_int32_t log_size; /* Log file size. */ 00140 u_int32_t notused; /* Historically the log file mode. */ 00141 }; 00142 00143 /* Macros to lock/unlock the log region as a whole. */ 00144 #define LOG_SYSTEM_LOCK(dbenv) \ 00145 MUTEX_LOCK(dbenv, ((LOG *)((DB_LOG *) \ 00146 (dbenv)->lg_handle)->reginfo.primary)->mtx_region) 00147 #define LOG_SYSTEM_UNLOCK(dbenv) \ 00148 MUTEX_UNLOCK(dbenv, ((LOG *)((DB_LOG *) \ 00149 (dbenv)->lg_handle)->reginfo.primary)->mtx_region) 00150 00151 /* 00152 * LOG -- 00153 * Shared log region. One of these is allocated in shared memory, 00154 * and describes the log. 00155 */ 00156 struct __log { 00157 db_mutex_t mtx_region; /* Region mutex. */ 00158 00159 db_mutex_t mtx_filelist; /* Mutex guarding file name list. */ 00160 00161 LOGP persist; /* Persistent information. */ 00162 00163 SH_TAILQ_HEAD(__fq1) fq; /* List of file names. */ 00164 int32_t fid_max; /* Max fid allocated. */ 00165 roff_t free_fid_stack; /* Stack of free file ids. */ 00166 u_int free_fids; /* Height of free fid stack. */ 00167 u_int free_fids_alloced; /* N free fid slots allocated. */ 00168 00169 /* 00170 * The lsn LSN is the file offset that we're about to write and which 00171 * we will return to the user. 00172 */ 00173 DB_LSN lsn; /* LSN at current file offset. */ 00174 00175 /* 00176 * The f_lsn LSN is the LSN (returned to the user) that "owns" the 00177 * first byte of the buffer. If the record associated with the LSN 00178 * spans buffers, it may not reflect the physical file location of 00179 * the first byte of the buffer. 00180 */ 00181 DB_LSN f_lsn; /* LSN of first byte in the buffer. */ 00182 size_t b_off; /* Current offset in the buffer. */ 00183 u_int32_t w_off; /* Current write offset in the file. */ 00184 u_int32_t len; /* Length of the last record. */ 00185 00186 DB_LSN active_lsn; /* Oldest active LSN in the buffer. */ 00187 size_t a_off; /* Offset in the buffer of first active 00188 file. */ 00189 00190 /* 00191 * The s_lsn LSN is the last LSN that we know is on disk, not just 00192 * written, but synced. This field is protected by the flush mutex 00193 * rather than by the region mutex. 00194 */ 00195 db_mutex_t mtx_flush; /* Mutex guarding flushing. */ 00196 int in_flush; /* Log flush in progress. */ 00197 DB_LSN s_lsn; /* LSN of the last sync. */ 00198 00199 DB_LOG_STAT stat; /* Log statistics. */ 00200 00201 /* 00202 * !!! 00203 * NOTE: the next 11 fields, waiting_lsn, verify_lsn, max_wait_lsn, 00204 * maxperm_lsn, wait_recs, rcvd_recs, ready_lsn and bulk_* are NOT 00205 * protected by the log region lock. They are protected by 00206 * REP->mtx_clientdb. If you need access to both, you must acquire 00207 * REP->mtx_clientdb before acquiring the log region lock. 00208 * 00209 * The waiting_lsn is used by the replication system. It is the 00210 * first LSN that we are holding without putting in the log, because 00211 * we received one or more log records out of order. Associated with 00212 * the waiting_lsn is the number of log records that we still have to 00213 * receive before we decide that we should request it again. 00214 * 00215 * The max_wait_lsn is used to control retransmission in the face 00216 * of dropped messages. If we are requesting all records from the 00217 * current gap (i.e., chunk of the log that we are missing), then 00218 * the max_wait_lsn contains the first LSN that we are known to have 00219 * in the __db.rep.db. If we requested only a single record, then 00220 * the max_wait_lsn has the LSN of that record we requested. 00221 */ 00222 DB_LSN waiting_lsn; /* First log record after a gap. */ 00223 DB_LSN verify_lsn; /* LSN we are waiting to verify. */ 00224 DB_LSN max_wait_lsn; /* Maximum LSN requested. */ 00225 DB_LSN max_perm_lsn; /* Maximum PERMANENT LSN processed. */ 00226 u_int32_t wait_recs; /* Records to wait before requesting. */ 00227 u_int32_t rcvd_recs; /* Records received while waiting. */ 00228 /* 00229 * The ready_lsn is also used by the replication system. It is the 00230 * next LSN we expect to receive. It's normally equal to "lsn", 00231 * except at the beginning of a log file, at which point it's set 00232 * to the LSN of the first record of the new file (after the 00233 * header), rather than to 0. 00234 */ 00235 DB_LSN ready_lsn; 00236 /* 00237 * The bulk_buf is used by replication for bulk transfer. While this 00238 * is protected by REP->mtx_clientdb, this doesn't contend with the 00239 * above fields because the above are used by clients and the bulk 00240 * fields below are used by a master. 00241 */ 00242 roff_t bulk_buf; /* Bulk transfer buffer in region. */ 00243 uintptr_t bulk_off; /* Current offset into bulk buffer. */ 00244 u_int32_t bulk_len; /* Length of buffer. */ 00245 u_int32_t bulk_flags; /* Bulk buffer flags. */ 00246 00247 /* 00248 * During initialization, the log system walks forward through the 00249 * last log file to find its end. If it runs into a checkpoint 00250 * while it's doing so, it caches it here so that the transaction 00251 * system doesn't need to walk through the file again on its 00252 * initialization. 00253 */ 00254 DB_LSN cached_ckp_lsn; 00255 00256 u_int32_t regionmax; /* Configured size of the region. */ 00257 00258 roff_t buffer_off; /* Log buffer offset in the region. */ 00259 u_int32_t buffer_size; /* Log buffer size. */ 00260 00261 u_int32_t log_size; /* Log file's size. */ 00262 u_int32_t log_nsize; /* Next log file's size. */ 00263 00264 int filemode; /* Log file permissions mode. */ 00265 00266 /* 00267 * DB_LOG_AUTOREMOVE and DB_LOG_INMEMORY: not protected by a mutex, 00268 * all we care about is if they're zero or non-zero. 00269 */ 00270 int db_log_autoremove; 00271 int db_log_inmemory; 00272 00273 u_int32_t ncommit; /* Number of txns waiting to commit. */ 00274 DB_LSN t_lsn; /* LSN of first commit */ 00275 SH_TAILQ_HEAD(__commit) commits;/* list of txns waiting to commit. */ 00276 SH_TAILQ_HEAD(__free) free_commits;/* free list of commit structs. */ 00277 00278 /* 00279 * In-memory logs maintain a list of the start positions of all log 00280 * files currently active in the in-memory buffer. This is to make the 00281 * lookup from LSN to log buffer offset efficient. 00282 */ 00283 SH_TAILQ_HEAD(__logfile) logfiles; 00284 SH_TAILQ_HEAD(__free_logfile) free_logfiles; 00285 }; 00286 00287 /* 00288 * __db_commit structure -- 00289 * One of these is allocated for each transaction waiting to commit. 00290 */ 00291 struct __db_commit { 00292 db_mutex_t mtx_txnwait; /* Mutex for txn to wait on. */ 00293 DB_LSN lsn; /* LSN of commit record. */ 00294 SH_TAILQ_ENTRY links; /* Either on free or waiting list. */ 00295 00296 #define DB_COMMIT_FLUSH 0x0001 /* Flush the log when you wake up. */ 00297 u_int32_t flags; 00298 }; 00299 00300 /* 00301 * Check for the proper progression of Log Sequence Numbers. 00302 * If we are rolling forward the LSN on the page must be greater 00303 * than or equal to the previous LSN in log record. 00304 * We ignore NOT LOGGED LSNs. The user did an unlogged update. 00305 * We should eventually see a log record that matches and continue 00306 * forward. 00307 * If truncate is supported then a ZERO LSN implies a page that was 00308 * allocated prior to the recovery start pont and then truncated 00309 * later in the log. An allocation of a page after this 00310 * page will extend the file, leaving a hole. We want to 00311 * ignore this page until it is truncated again. 00312 * 00313 */ 00314 00315 #ifdef HAVE_FTRUNCATE 00316 #define CHECK_LSN(e, redo, cmp, lsn, prev) \ 00317 if (DB_REDO(redo) && (cmp) < 0 && \ 00318 ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) || \ 00319 IS_REP_CLIENT(e))) { \ 00320 ret = __db_check_lsn(dbenv, lsn, prev); \ 00321 goto out; \ 00322 } 00323 #else 00324 #define CHECK_LSN(e, redo, cmp, lsn, prev) \ 00325 if (DB_REDO(redo) && (cmp) < 0 && \ 00326 (!IS_NOT_LOGGED_LSN(*(lsn)) || IS_REP_CLIENT(e))) { \ 00327 ret = __db_check_lsn(dbenv, lsn, prev); \ 00328 goto out; \ 00329 } 00330 #endif 00331 00332 /* 00333 * Helper for in-memory logs -- check whether an offset is in range 00334 * in a ring buffer (inclusive of start, exclusive of end). 00335 */ 00336 struct __db_filestart { 00337 u_int32_t file; 00338 size_t b_off; 00339 00340 SH_TAILQ_ENTRY links; /* Either on free or waiting list. */ 00341 }; 00342 00343 #define RINGBUF_LEN(lp, start, end) \ 00344 ((start) < (end) ? \ 00345 (end) - (start) : (lp)->buffer_size - ((start) - (end))) 00346 00347 /* 00348 * Internal macro to set pointer to the begin_lsn for generated 00349 * logging routines. If begin_lsn is already set then do nothing. 00350 * Return a pointer to the last lsn too. 00351 */ 00352 #undef DB_SET_TXN_LSNP 00353 #define DB_SET_TXN_LSNP(txn, blsnp, llsnp) do { \ 00354 DB_LSN *__lsnp; \ 00355 TXN_DETAIL *__td; \ 00356 __td = (txn)->td; \ 00357 *(llsnp) = &__td->last_lsn; \ 00358 while (__td->parent != INVALID_ROFF) \ 00359 __td = R_ADDR(&(txn)->mgrp->reginfo, __td->parent); \ 00360 __lsnp = &__td->begin_lsn; \ 00361 if (IS_ZERO_LSN(*__lsnp)) \ 00362 *(blsnp) = __lsnp; \ 00363 } while (0) 00364 00365 /* 00366 * These are used in __log_backup to determine which LSN in the 00367 * checkpoint record to compare and return. 00368 */ 00369 #define CKPLSN_CMP 0 00370 #define LASTCKP_CMP 1 00371 00372 /* 00373 * Status codes indicating the validity of a log file examined by 00374 * __log_valid(). 00375 */ 00376 typedef enum { 00377 DB_LV_INCOMPLETE, 00378 DB_LV_NONEXISTENT, 00379 DB_LV_NORMAL, 00380 DB_LV_OLD_READABLE, 00381 DB_LV_OLD_UNREADABLE 00382 } logfile_validity; 00383 00384 #include "dbinc_auto/dbreg_auto.h" 00385 #include "dbinc_auto/dbreg_ext.h" 00386 #include "dbinc_auto/log_ext.h" 00387 #endif /* !_LOG_H_ */