00001 /*- 00002 * See the file LICENSE for redistribution information. 00003 * 00004 * Copyright (c) 2001-2005 00005 * Sleepycat Software. All rights reserved. 00006 * 00007 * $Id: rep.h,v 12.22 2005/10/27 13:27:01 bostic Exp $ 00008 */ 00009 00010 #ifndef _REP_H_ 00011 #define _REP_H_ 00012 00013 #include "dbinc_auto/rep_auto.h" 00014 00015 /* 00016 * Message types 00017 */ 00018 #define REP_ALIVE 1 /* I am alive message. */ 00019 #define REP_ALIVE_REQ 2 /* Request for alive messages. */ 00020 #define REP_ALL_REQ 3 /* Request all log records greater than LSN. */ 00021 #define REP_BULK_LOG 4 /* Bulk transfer of log records. */ 00022 #define REP_BULK_PAGE 5 /* Bulk transfer of pages. */ 00023 #define REP_DUPMASTER 6 /* Duplicate master detected; propagate. */ 00024 #define REP_FILE 7 /* Page of a database file. NOTUSED */ 00025 #define REP_FILE_FAIL 8 /* File requested does not exist. */ 00026 #define REP_FILE_REQ 9 /* Request for a database file. NOTUSED */ 00027 #define REP_LOG 10 /* Log record. */ 00028 #define REP_LOG_MORE 11 /* There are more log records to request. */ 00029 #define REP_LOG_REQ 12 /* Request for a log record. */ 00030 #define REP_MASTER_REQ 13 /* Who is the master */ 00031 #define REP_NEWCLIENT 14 /* Announces the presence of a new client. */ 00032 #define REP_NEWFILE 15 /* Announce a log file change. */ 00033 #define REP_NEWMASTER 16 /* Announces who the master is. */ 00034 #define REP_NEWSITE 17 /* Announces that a site has heard from a new 00035 * site; like NEWCLIENT, but indirect. A 00036 * NEWCLIENT message comes directly from the new 00037 * client while a NEWSITE comes indirectly from 00038 * someone who heard about a NEWSITE. 00039 */ 00040 #define REP_PAGE 18 /* Database page. */ 00041 #define REP_PAGE_FAIL 19 /* Requested page does not exist. */ 00042 #define REP_PAGE_MORE 20 /* There are more pages to request. */ 00043 #define REP_PAGE_REQ 21 /* Request for a database page. */ 00044 #define REP_REREQUEST 22 /* Force rerequest. */ 00045 #define REP_UPDATE 23 /* Environment hotcopy information. */ 00046 #define REP_UPDATE_REQ 24 /* Request for hotcopy information. */ 00047 #define REP_VERIFY 25 /* A log record for verification. */ 00048 #define REP_VERIFY_FAIL 26 /* The client is outdated. */ 00049 #define REP_VERIFY_REQ 27 /* Request for a log record to verify. */ 00050 #define REP_VOTE1 28 /* Send out your information for an election. */ 00051 #define REP_VOTE2 29 /* Send a "you are master" vote. */ 00052 00053 /* 00054 * REP_PRINT_MESSAGE 00055 * A function to print a debugging message. 00056 * 00057 * RPRINT 00058 * A macro for debug printing. Takes as an arg the arg set for __db_msg. 00059 * 00060 * !!! This function assumes a local DB_MSGBUF variable called 'mb'. 00061 */ 00062 #ifdef DIAGNOSTIC 00063 #define REP_PRINT_MESSAGE(dbenv, eid, rp, str) \ 00064 __rep_print_message(dbenv, eid, rp, str) 00065 #define RPRINT(e, r, x) do { \ 00066 if (FLD_ISSET((e)->verbose, DB_VERB_REPLICATION)) { \ 00067 DB_MSGBUF_INIT(&mb); \ 00068 if ((e)->db_errpfx == NULL) { \ 00069 if (F_ISSET((r), REP_F_CLIENT)) \ 00070 __db_msgadd((e), &mb, "CLIENT: "); \ 00071 else if (F_ISSET((r), REP_F_MASTER)) \ 00072 __db_msgadd((e), &mb, "MASTER: "); \ 00073 else \ 00074 __db_msgadd((e), &mb, "REP_UNDEF: "); \ 00075 } else \ 00076 __db_msgadd((e), &mb, "%s: ",(e)->db_errpfx); \ 00077 __db_msgadd x; \ 00078 DB_MSGBUF_FLUSH((e), &mb); \ 00079 } \ 00080 } while (0) 00081 #else 00082 #define REP_PRINT_MESSAGE(dbenv, eid, rp, str) 00083 #define RPRINT(e, r, x) 00084 #endif 00085 00086 /* 00087 * Election gen file name 00088 * The file contains an egen number for an election this client has NOT 00089 * participated in. I.e. it is the number of a future election. We 00090 * create it when we create the rep region, if it doesn't already exist 00091 * and initialize egen to 1. If it does exist, we read it when we create 00092 * the rep region. We write it immediately before sending our VOTE1 in 00093 * an election. That way, if a client has ever sent a vote for any 00094 * election, the file is already going to be updated to reflect a future 00095 * election, should it crash. 00096 */ 00097 #define REP_EGENNAME "__db.rep.egen" 00098 00099 /* 00100 * Database types for __rep_client_dbinit 00101 */ 00102 typedef enum { 00103 REP_DB, /* Log record database. */ 00104 REP_PG /* Pg database. */ 00105 } repdb_t; 00106 00107 /* Macros to lock/unlock the replication region as a whole. */ 00108 #define REP_SYSTEM_LOCK(dbenv) \ 00109 MUTEX_LOCK(dbenv, ((DB_REP *) \ 00110 (dbenv)->rep_handle)->region->mtx_region) 00111 #define REP_SYSTEM_UNLOCK(dbenv) \ 00112 MUTEX_UNLOCK(dbenv, ((DB_REP *) \ 00113 (dbenv)->rep_handle)->region->mtx_region) 00114 00115 /* 00116 * REP -- 00117 * Shared replication structure. 00118 */ 00119 typedef struct __rep { 00120 db_mutex_t mtx_region; /* Region mutex. */ 00121 db_mutex_t mtx_clientdb; /* Client database mutex. */ 00122 roff_t tally_off; /* Offset of the tally region. */ 00123 roff_t v2tally_off; /* Offset of the vote2 tally region. */ 00124 int eid; /* Environment id. */ 00125 int master_id; /* ID of the master site. */ 00126 u_int32_t egen; /* Replication election generation. */ 00127 u_int32_t gen; /* Replication generation number. */ 00128 u_int32_t recover_gen; /* Last generation number in log. */ 00129 int asites; /* Space allocated for sites. */ 00130 int nsites; /* Number of sites in group. */ 00131 int nvotes; /* Number of votes needed. */ 00132 int priority; /* My priority in an election. */ 00133 u_int32_t gbytes; /* Limit on data sent in single... */ 00134 u_int32_t bytes; /* __rep_process_message call. */ 00135 #define DB_REP_REQUEST_GAP 4 00136 #define DB_REP_MAX_GAP 128 00137 u_int32_t request_gap; /* # of records to receive before we 00138 * request a missing log record. */ 00139 u_int32_t max_gap; /* Maximum number of records before 00140 * requesting a missing log record. */ 00141 /* Status change information */ 00142 int elect_th; /* A thread is in rep_elect. */ 00143 u_int32_t msg_th; /* Number of callers in rep_proc_msg. */ 00144 int start_th; /* A thread is in rep_start. */ 00145 u_int32_t handle_cnt; /* Count of handles in library. */ 00146 u_int32_t op_cnt; /* Multi-step operation count.*/ 00147 int in_recovery; /* Running recovery now. */ 00148 00149 /* Backup information. */ 00150 u_int32_t nfiles; /* Number of files we have info on. */ 00151 u_int32_t curfile; /* Current file we're getting. */ 00152 __rep_fileinfo_args *curinfo; /* Current file info ptr. */ 00153 void *finfo; /* Current file info buffer. */ 00154 void *nextinfo; /* Next file info buffer. */ 00155 void *originfo; /* Original file info buffer. */ 00156 DB_LSN first_lsn; /* Earliest LSN we need. */ 00157 DB_LSN last_lsn; /* Latest LSN we need. */ 00158 db_pgno_t ready_pg; /* Next pg expected. */ 00159 db_pgno_t waiting_pg; /* First pg after gap. */ 00160 db_pgno_t max_wait_pg; /* Maximum pg requested. */ 00161 u_int32_t npages; /* Num of pages rcvd for this file. */ 00162 DB_MPOOLFILE *file_mpf; /* Mpoolfile for in-mem database. */ 00163 DB *file_dbp; /* This file's page info. */ 00164 DB *queue_dbp; /* Dbp for a queue file. */ 00165 00166 /* Vote tallying information. */ 00167 int sites; /* Sites heard from. */ 00168 int winner; /* Current winner. */ 00169 int w_priority; /* Winner priority. */ 00170 u_int32_t w_gen; /* Winner generation. */ 00171 DB_LSN w_lsn; /* Winner LSN. */ 00172 u_int32_t w_tiebreaker; /* Winner tiebreaking value. */ 00173 int votes; /* Number of votes for this site. */ 00174 u_int32_t esec; /* Election start seconds. */ 00175 u_int32_t eusec; /* Election start useconds. */ 00176 00177 /* Statistics. */ 00178 DB_REP_STAT stat; 00179 00180 /* Configuration. */ 00181 #define REP_C_BULK 0x00001 /* Bulk transfer. */ 00182 #define REP_C_DELAYCLIENT 0x00002 /* Delay client sync-up. */ 00183 #define REP_C_NOAUTOINIT 0x00004 /* No auto initialization. */ 00184 #define REP_C_NOWAIT 0x00008 /* Immediate error return. */ 00185 u_int32_t config; /* Configuration flags. */ 00186 00187 #define REP_F_CLIENT 0x00001 /* Client replica. */ 00188 #define REP_F_DELAY 0x00002 /* Delaying client sync-up. */ 00189 #define REP_F_EPHASE1 0x00004 /* In phase 1 of election. */ 00190 #define REP_F_EPHASE2 0x00008 /* In phase 2 of election. */ 00191 #define REP_F_MASTER 0x00010 /* Master replica. */ 00192 #define REP_F_MASTERELECT 0x00020 /* Master elect */ 00193 #define REP_F_NOARCHIVE 0x00040 /* Rep blocks log_archive */ 00194 #define REP_F_READY 0x00080 /* Wait for txn_cnt to be 0. */ 00195 #define REP_F_RECOVER_LOG 0x00100 /* In recovery - log. */ 00196 #define REP_F_RECOVER_PAGE 0x00200 /* In recovery - pages. */ 00197 #define REP_F_RECOVER_UPDATE 0x00400 /* In recovery - files. */ 00198 #define REP_F_RECOVER_VERIFY 0x00800 /* In recovery - verify. */ 00199 #define REP_F_TALLY 0x01000 /* Tallied vote before elect. */ 00200 u_int32_t flags; 00201 } REP; 00202 00203 /* 00204 * Recovery flag mask to easily check any/all recovery bits. That is 00205 * REP_F_READY and all REP_F_RECOVER*. This must change if the values 00206 * of the flags change. 00207 */ 00208 #define REP_F_RECOVER_MASK \ 00209 (REP_F_READY | REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE | \ 00210 REP_F_RECOVER_UPDATE | REP_F_RECOVER_VERIFY) 00211 00212 #define IN_ELECTION(R) F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2) 00213 #define IN_ELECTION_TALLY(R) \ 00214 F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY) 00215 #define IS_REP_MASTER(dbenv) \ 00216 (REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region && \ 00217 F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \ 00218 REP_F_MASTER)) 00219 00220 #define IS_REP_CLIENT(dbenv) \ 00221 (REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region && \ 00222 F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \ 00223 REP_F_CLIENT)) 00224 00225 #define IS_CLIENT_PGRECOVER(dbenv) \ 00226 (IS_REP_CLIENT(dbenv) && \ 00227 F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \ 00228 REP_F_RECOVER_PAGE)) 00229 00230 /* 00231 * Macros to figure out if we need to do replication pre/post-amble processing. 00232 * Skip for specific DB handles owned by the replication layer, either because 00233 * replication is running recovery or because it's a handle entirely owned by 00234 * the replication code (replication opens its own databases to track state). 00235 */ 00236 #define IS_ENV_REPLICATED(E) (REP_ON(E) && \ 00237 ((DB_REP *)((E)->rep_handle))->region != NULL && \ 00238 ((DB_REP *)((E)->rep_handle))->region->flags != 0) 00239 00240 /* 00241 * Gap processing flags. These provide control over the basic 00242 * gap processing algorithm for some special cases. 00243 */ 00244 #define REP_GAP_FORCE 0x001 /* Force a request for a gap. */ 00245 #define REP_GAP_REREQUEST 0x002 /* Gap request is a forced rerequest. */ 00246 /* REREQUEST is a superset of FORCE. */ 00247 00248 /* 00249 * Basic pre/post-amble processing. 00250 */ 00251 #define REPLICATION_WRAP(dbenv, func_call, ret) do { \ 00252 int __rep_check, __t_ret; \ 00253 __rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0; \ 00254 if (__rep_check && ((ret) = __env_rep_enter(dbenv, 0)) != 0) \ 00255 return ((ret)); \ 00256 (ret) = func_call; \ 00257 if (__rep_check && \ 00258 (__t_ret = __env_db_rep_exit(dbenv)) != 0 && (ret) == 0) \ 00259 (ret) = __t_ret; \ 00260 } while (0) 00261 00262 /* 00263 * Per-process replication structure. 00264 * 00265 * There are 2 mutexes used in replication. 00266 * 1. mtx_region - This protects the fields of the rep region above. 00267 * 2. mtx_clientdb - This protects the per-process flags, and bookkeeping 00268 * database and all of the components that maintain it. Those 00269 * components include the following fields in the log region (see log.h): 00270 * a. ready_lsn 00271 * b. waiting_lsn 00272 * c. verify_lsn 00273 * d. wait_recs 00274 * e. rcvd_recs 00275 * f. max_wait_lsn 00276 * These fields in the log region are NOT protected by the log region lock at 00277 * all. 00278 * 00279 * Note that the per-process flags should truly be protected by a special 00280 * per-process thread mutex, but it is currently set in so isolated a manner 00281 * that it didn't make sense to do so and in most case we're already holding 00282 * the mtx_clientdb anyway. 00283 * 00284 * The lock ordering protocol is that mtx_clientdb must be acquired first and 00285 * then either REP->mtx_region, or the LOG->mtx_region mutex may be acquired if 00286 * necessary. 00287 */ 00288 struct __db_rep { 00289 DB *rep_db; /* Bookkeeping database. */ 00290 00291 REP *region; /* In memory structure. */ 00292 u_int8_t *bulk; /* Shared memory bulk area. */ 00293 #define DBREP_OPENFILES 0x0001 /* This handle has opened files. */ 00294 u_int32_t flags; /* per-process flags. */ 00295 }; 00296 00297 /* 00298 * Control structure for replication communication infrastructure. 00299 * 00300 * Note that the version information should be at the beginning of the 00301 * structure, so that we can rearrange the rest of it while letting the 00302 * version checks continue to work. DB_REPVERSION should be revved any time 00303 * the rest of the structure changes or when the message numbers change. 00304 */ 00305 typedef struct __rep_control { 00306 #define DB_REPVERSION 3 00307 u_int32_t rep_version; /* Replication version number. */ 00308 u_int32_t log_version; /* Log version number. */ 00309 00310 DB_LSN lsn; /* Log sequence number. */ 00311 u_int32_t rectype; /* Message type. */ 00312 u_int32_t gen; /* Generation number. */ 00313 u_int32_t flags; /* log_put flag value. */ 00314 } REP_CONTROL; 00315 00316 /* Election vote information. */ 00317 typedef struct __rep_vote { 00318 u_int32_t egen; /* Election generation. */ 00319 int nsites; /* Number of sites I've been in 00320 * communication with. */ 00321 int nvotes; /* Number of votes needed to win. */ 00322 int priority; /* My site's priority. */ 00323 u_int32_t tiebreaker; /* Tie-breaking quasi-random value. */ 00324 } REP_VOTE_INFO; 00325 00326 typedef struct __rep_vtally { 00327 u_int32_t egen; /* Voter's election generation. */ 00328 int eid; /* Voter's ID. */ 00329 } REP_VTALLY; 00330 00331 /* 00332 * The REP_THROTTLE_ONLY flag is used to do throttle processing only. 00333 * If set, it will only allow sending the REP_*_MORE message, but not 00334 * the normal, non-throttled message. It is used to support throttling 00335 * with bulk transfer. 00336 */ 00337 /* Flags for __rep_send_throttle. */ 00338 #define REP_THROTTLE_ONLY 0x0001 /* Send _MORE message only. */ 00339 00340 /* Throttled message processing information. */ 00341 typedef struct __rep_throttle { 00342 DB_LSN lsn; /* LSN of this record. */ 00343 DBT *data_dbt; /* DBT of this record. */ 00344 u_int32_t gbytes; /* This call's max gbytes sent. */ 00345 u_int32_t bytes; /* This call's max bytes sent. */ 00346 u_int32_t type; /* Record type. */ 00347 } REP_THROTTLE; 00348 00349 /* Bulk processing information. */ 00350 /* 00351 * !!! 00352 * We use a uintptr_t for the offset. We'd really like to use a ptrdiff_t 00353 * since that really is what it is. But ptrdiff_t is not portable and 00354 * doesn't exist everywhere. 00355 */ 00356 typedef struct __rep_bulk { 00357 u_int8_t *addr; /* Address of bulk buffer. */ 00358 uintptr_t *offp; /* Ptr to current offset into buffer. */ 00359 u_int32_t len; /* Bulk buffer length. */ 00360 u_int32_t type; /* Item type in buffer (log, page). */ 00361 DB_LSN lsn; /* First LSN in buffer. */ 00362 int eid; /* ID of potential recipients. */ 00363 #define BULK_FORCE 0x001 /* Force buffer after this record. */ 00364 #define BULK_XMIT 0x002 /* Buffer in transit. */ 00365 u_int32_t *flagsp; /* Buffer flags. */ 00366 } REP_BULK; 00367 00368 /* 00369 * This structure takes care of representing a transaction. 00370 * It holds all the records, sorted by page number so that 00371 * we can obtain locks and apply updates in a deadlock free 00372 * order. 00373 */ 00374 typedef struct __lsn_collection { 00375 u_int nlsns; 00376 u_int nalloc; 00377 DB_LSN *array; 00378 } LSN_COLLECTION; 00379 00380 /* 00381 * This is used by the page-prep routines to do the lock_vec call to 00382 * apply the updates for a single transaction or a collection of 00383 * transactions. 00384 */ 00385 typedef struct _linfo { 00386 int n; 00387 DB_LOCKREQ *reqs; 00388 DBT *objs; 00389 } linfo_t; 00390 00391 #include "dbinc_auto/rep_ext.h" 00392 #endif /* !_REP_H_ */