Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

rep.h

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 2001-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: rep.h,v 12.22 2005/10/27 13:27:01 bostic Exp $
00008  */
00009 
00010 #ifndef _REP_H_
00011 #define _REP_H_
00012 
00013 #include "dbinc_auto/rep_auto.h"
00014 
00015 /*
00016  * Message types.  Values are contiguous, 1 through 29.
00017  */
00018 #define REP_ALIVE       1       /* I am alive message. */
00019 #define REP_ALIVE_REQ   2       /* Request for alive messages. */
00020 #define REP_ALL_REQ     3       /* Request all log records greater than LSN. */
00021 #define REP_BULK_LOG    4       /* Bulk transfer of log records. */
00022 #define REP_BULK_PAGE   5       /* Bulk transfer of pages. */
00023 #define REP_DUPMASTER   6       /* Duplicate master detected; propagate. */
00024 #define REP_FILE        7       /* Page of a database file. NOTUSED */
00025 #define REP_FILE_FAIL   8       /* File requested does not exist. */
00026 #define REP_FILE_REQ    9       /* Request for a database file. NOTUSED */
00027 #define REP_LOG         10      /* Log record. */
00028 #define REP_LOG_MORE    11      /* There are more log records to request. */
00029 #define REP_LOG_REQ     12      /* Request for a log record. */
00030 #define REP_MASTER_REQ  13      /* Request: who is the master? */
00031 #define REP_NEWCLIENT   14      /* Announces the presence of a new client. */
00032 #define REP_NEWFILE     15      /* Announce a log file change. */
00033 #define REP_NEWMASTER   16      /* Announces who the master is. */
00034 #define REP_NEWSITE     17      /* Announces that a site has heard from a new
00035                                  * site; like NEWCLIENT, but indirect.  A
00036                                  * NEWCLIENT message comes directly from the new
00037                                  * client while a NEWSITE comes indirectly from
00038                                  * someone who heard about the new site.
00039                                  */
00040 #define REP_PAGE        18      /* Database page. */
00041 #define REP_PAGE_FAIL   19      /* Requested page does not exist. */
00042 #define REP_PAGE_MORE   20      /* There are more pages to request. */
00043 #define REP_PAGE_REQ    21      /* Request for a database page. */
00044 #define REP_REREQUEST   22      /* Force rerequest. */
00045 #define REP_UPDATE      23      /* Environment hotcopy information. */
00046 #define REP_UPDATE_REQ  24      /* Request for hotcopy information. */
00047 #define REP_VERIFY      25      /* A log record for verification. */
00048 #define REP_VERIFY_FAIL 26      /* The client is outdated. */
00049 #define REP_VERIFY_REQ  27      /* Request for a log record to verify. */
00050 #define REP_VOTE1       28      /* Send out your information for an election. */
00051 #define REP_VOTE2       29      /* Send a "you are master" vote. */
00052 
00053 /*
00054  * REP_PRINT_MESSAGE
00055  *      A macro wrapping __rep_print_message, to print a debugging message.
00056  *
00057  * RPRINT
00058  *      A macro for debug printing.  Takes as its last arg the parenthesized
00059  *      arg list for __db_msgadd.  Both expand to nothing without DIAGNOSTIC.
00060  * !!! RPRINT assumes a local DB_MSGBUF variable called 'mb'.
00061  */
00062 #ifdef DIAGNOSTIC
00063 #define REP_PRINT_MESSAGE(dbenv, eid, rp, str)                          \
00064         __rep_print_message(dbenv, eid, rp, str)
00065 #define RPRINT(e, r, x) do {                                            \
00066         if (FLD_ISSET((e)->verbose, DB_VERB_REPLICATION)) {             \
00067                 DB_MSGBUF_INIT(&mb);                                    \
00068                 if ((e)->db_errpfx == NULL) {                           \
00069                         if (F_ISSET((r), REP_F_CLIENT))                 \
00070                                 __db_msgadd((e), &mb, "CLIENT: ");      \
00071                         else if (F_ISSET((r), REP_F_MASTER))            \
00072                                 __db_msgadd((e), &mb, "MASTER: ");      \
00073                         else                                            \
00074                                 __db_msgadd((e), &mb, "REP_UNDEF: ");   \
00075                 } else                                                  \
00076                         __db_msgadd((e), &mb, "%s: ",(e)->db_errpfx);   \
00077                 __db_msgadd x;                                          \
00078                 DB_MSGBUF_FLUSH((e), &mb);                              \
00079         }                                                               \
00080 } while (0)
00081 #else
00082 #define REP_PRINT_MESSAGE(dbenv, eid, rp, str)
00083 #define RPRINT(e, r, x)
00084 #endif
00085 
00086 /*
00087  * Election generation (egen) file name.
00088  * The file contains an egen number for an election this client has NOT
00089  * participated in.  I.e. it is the number of a future election.  We
00090  * create it when we create the rep region, if it doesn't already exist
00091  * and initialize egen to 1.  If it does exist, we read it when we create
00092  * the rep region.  We write it immediately before sending our VOTE1 in
00093  * an election.  That way, if a client has ever sent a vote for any
00094  * election, the file is already going to be updated to reflect a future
00095  * election, should it crash.
00096  */
00097 #define REP_EGENNAME    "__db.rep.egen"
00098 
00099 /*
00100  * Database types for __rep_client_dbinit
00101  */
00102 typedef enum {
00103         REP_DB,         /* Log record database. */
00104         REP_PG          /* Page database. */
00105 } repdb_t;
00106 
00107 /* Lock/unlock the replication region as a whole, via region->mtx_region. */
00108 #define REP_SYSTEM_LOCK(dbenv)                                          \
00109         MUTEX_LOCK(dbenv, ((DB_REP *)                                   \
00110             (dbenv)->rep_handle)->region->mtx_region)
00111 #define REP_SYSTEM_UNLOCK(dbenv)                                        \
00112         MUTEX_UNLOCK(dbenv, ((DB_REP *)                                 \
00113             (dbenv)->rep_handle)->region->mtx_region)
00114 
00115 /*
00116  * REP --
00117  * Shared replication structure; fields are protected by mtx_region.
00118  */
00119 typedef struct __rep {
00120         db_mutex_t      mtx_region;     /* Region mutex. */
00121         db_mutex_t      mtx_clientdb;   /* Client database mutex. */
00122         roff_t          tally_off;      /* Offset of the tally region. */
00123         roff_t          v2tally_off;    /* Offset of the vote2 tally region. */
00124         int             eid;            /* Environment id. */
00125         int             master_id;      /* ID of the master site. */
00126         u_int32_t       egen;           /* Replication election generation. */
00127         u_int32_t       gen;            /* Replication generation number. */
00128         u_int32_t       recover_gen;    /* Last generation number in log. */
00129         int             asites;         /* Space allocated for sites. */
00130         int             nsites;         /* Number of sites in group. */
00131         int             nvotes;         /* Number of votes needed. */
00132         int             priority;       /* My priority in an election. */
00133         u_int32_t       gbytes;         /* Limit on data sent in single... */
00134         u_int32_t       bytes;          /* __rep_process_message call. */
00135 #define DB_REP_REQUEST_GAP      4
00136 #define DB_REP_MAX_GAP          128
00137         u_int32_t       request_gap;    /* # of records to receive before we
00138                                          * request a missing log record. */
00139         u_int32_t       max_gap;        /* Maximum number of records before
00140                                          * requesting a missing log record. */
00141         /* Status change information */
00142         int             elect_th;       /* A thread is in rep_elect. */
00143         u_int32_t       msg_th;         /* Number of callers in rep_proc_msg. */
00144         int             start_th;       /* A thread is in rep_start. */
00145         u_int32_t       handle_cnt;     /* Count of handles in library. */
00146         u_int32_t       op_cnt;         /* Multi-step operation count. */
00147         int             in_recovery;    /* Running recovery now. */
00148 
00149         /* Backup information. */
00150         u_int32_t       nfiles;         /* Number of files we have info on. */
00151         u_int32_t       curfile;        /* Current file we're getting. */
00152         __rep_fileinfo_args     *curinfo;       /* Current file info ptr. */
00153         void            *finfo;         /* Current file info buffer. */
00154         void            *nextinfo;      /* Next file info buffer. */
00155         void            *originfo;      /* Original file info buffer. */
00156         DB_LSN          first_lsn;      /* Earliest LSN we need. */
00157         DB_LSN          last_lsn;       /* Latest LSN we need. */
00158         db_pgno_t       ready_pg;       /* Next pg expected. */
00159         db_pgno_t       waiting_pg;     /* First pg after gap. */
00160         db_pgno_t       max_wait_pg;    /* Maximum pg requested. */
00161         u_int32_t       npages;         /* Num of pages rcvd for this file. */
00162         DB_MPOOLFILE    *file_mpf;      /* Mpoolfile for in-mem database. */
00163         DB              *file_dbp;      /* This file's page info. */
00164         DB              *queue_dbp;     /* Dbp for a queue file. */
00165 
00166         /* Vote tallying information. */
00167         int             sites;          /* Sites heard from. */
00168         int             winner;         /* Current winner. */
00169         int             w_priority;     /* Winner priority. */
00170         u_int32_t       w_gen;          /* Winner generation. */
00171         DB_LSN          w_lsn;          /* Winner LSN. */
00172         u_int32_t       w_tiebreaker;   /* Winner tiebreaking value. */
00173         int             votes;          /* Number of votes for this site. */
00174         u_int32_t       esec;           /* Election start seconds. */
00175         u_int32_t       eusec;          /* Election start useconds. */
00176 
00177         /* Statistics. */
00178         DB_REP_STAT     stat;
00179 
00180         /* Configuration. */
00181 #define REP_C_BULK              0x00001         /* Bulk transfer. */
00182 #define REP_C_DELAYCLIENT       0x00002         /* Delay client sync-up. */
00183 #define REP_C_NOAUTOINIT        0x00004         /* No auto initialization. */
00184 #define REP_C_NOWAIT            0x00008         /* Immediate error return. */
00185         u_int32_t       config;         /* Configuration flags. */
00186 
00187 #define REP_F_CLIENT            0x00001         /* Client replica. */
00188 #define REP_F_DELAY             0x00002         /* Delaying client sync-up. */
00189 #define REP_F_EPHASE1           0x00004         /* In phase 1 of election. */
00190 #define REP_F_EPHASE2           0x00008         /* In phase 2 of election. */
00191 #define REP_F_MASTER            0x00010         /* Master replica. */
00192 #define REP_F_MASTERELECT       0x00020         /* Master elect. */
00193 #define REP_F_NOARCHIVE         0x00040         /* Rep blocks log_archive. */
00194 #define REP_F_READY             0x00080         /* Wait for txn_cnt to be 0. */
00195 #define REP_F_RECOVER_LOG       0x00100         /* In recovery - log. */
00196 #define REP_F_RECOVER_PAGE      0x00200         /* In recovery - pages. */
00197 #define REP_F_RECOVER_UPDATE    0x00400         /* In recovery - files. */
00198 #define REP_F_RECOVER_VERIFY    0x00800         /* In recovery - verify. */
00199 #define REP_F_TALLY             0x01000         /* Tallied vote before elect. */
00200         u_int32_t       flags;          /* REP_F_* state flags. */
00201 } REP;
00202 
00203 /*
00204  * Recovery flag mask to easily check any/all recovery bits.  That is
00205  * REP_F_READY and all REP_F_RECOVER*.  This must be updated if a
00206  * REP_F_RECOVER* flag is added or removed.
00207  */
00208 #define REP_F_RECOVER_MASK                                              \
00209     (REP_F_READY | REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE |             \
00210      REP_F_RECOVER_UPDATE | REP_F_RECOVER_VERIFY)
00211 /* State predicates on a REP (R) or on an environment's rep region. */
00212 #define IN_ELECTION(R)          F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2)
00213 #define IN_ELECTION_TALLY(R) \
00214         F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY)
00215 #define IS_REP_MASTER(dbenv)                                            \
00216         (REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region &&    \
00217             F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region),   \
00218             REP_F_MASTER))
00219 /* As IS_REP_MASTER, but tests REP_F_CLIENT. */
00220 #define IS_REP_CLIENT(dbenv)                                            \
00221         (REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region &&    \
00222             F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region),   \
00223             REP_F_CLIENT))
00224 /* IS_REP_CLIENT and the region also has REP_F_RECOVER_PAGE set. */
00225 #define IS_CLIENT_PGRECOVER(dbenv)                                      \
00226         (IS_REP_CLIENT(dbenv) &&                                        \
00227             F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region),   \
00228             REP_F_RECOVER_PAGE))
00229 
00230 /*
00231  * Macro to figure out if we need to do replication pre/post-amble processing:
00232  * true when REP_ON(E), the region is non-NULL and some region flag is set.
00233  * It does not itself skip DB handles owned by the replication layer (used by
00234  * recovery, or opened by replication to track state); no DB handle is examined.
00235  */
00236 #define IS_ENV_REPLICATED(E) (REP_ON(E) &&                              \
00237         ((DB_REP *)((E)->rep_handle))->region != NULL &&                \
00238         ((DB_REP *)((E)->rep_handle))->region->flags != 0)
00239 
00240 /*
00241  * Gap processing flags.  These provide control over the basic
00242  * gap processing algorithm for some special cases.
00243  */
00244 #define REP_GAP_FORCE           0x001   /* Force a request for a gap. */
00245 #define REP_GAP_REREQUEST       0x002   /* Gap request is a forced rerequest. */
00246                                         /* REREQUEST is a superset of FORCE
00246                                          * (the bit values are distinct). */
00247 
00248 /*
00249  * Basic pre/post-amble processing.  !!! Returns from caller on enter failure.
00250  */
00251 #define REPLICATION_WRAP(dbenv, func_call, ret) do {                    \
00252         int __rep_check, __t_ret;                                       \
00253         __rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0;                 \
00254         if (__rep_check && ((ret) = __env_rep_enter(dbenv, 0)) != 0)    \
00255                 return ((ret));                                         \
00256         (ret) = func_call;                                              \
00257         if (__rep_check &&                                              \
00258             (__t_ret = __env_db_rep_exit(dbenv)) != 0 && (ret) == 0)    \
00259                 (ret) = __t_ret;                                        \
00260 } while (0)
00261 
00262 /*
00263  * Per-process replication structure.
00264  *
00265  * There are 2 mutexes used in replication.
00266  * 1.  mtx_region - This protects the fields of the rep region above.
00267  * 2.  mtx_clientdb - This protects the per-process flags, and bookkeeping
00268  * database and all of the components that maintain it.  Those
00269  * components include the following fields in the log region (see log.h):
00270  *      a. ready_lsn
00271  *      b. waiting_lsn
00272  *      c. verify_lsn
00273  *      d. wait_recs
00274  *      e. rcvd_recs
00275  *      f. max_wait_lsn
00276  * These fields in the log region are NOT protected by the log region lock at
00277  * all.
00278  *
00279  * Note that the per-process flags should truly be protected by a special
00280  * per-process thread mutex, but it is currently set in so isolated a manner
00281  * that it didn't make sense to do so and in most cases we're already holding
00282  * the mtx_clientdb anyway.
00283  *
00284  * The lock ordering protocol is that mtx_clientdb must be acquired first and
00285  * then either REP->mtx_region, or the LOG->mtx_region mutex may be acquired if
00286  * necessary.
00287  */
00288 struct __db_rep {
00289         DB              *rep_db;        /* Bookkeeping database. */
00290 
00291         REP             *region;        /* In memory structure. */
00292         u_int8_t        *bulk;          /* Shared memory bulk area. */
00293 #define DBREP_OPENFILES         0x0001  /* This handle has opened files. */
00294         u_int32_t       flags;          /* Per-process flags (DBREP_*). */
00295 };
00296 
00297 /*
00298  * Control structure for replication communication infrastructure.
00299  *
00300  * Note that the version information should be at the beginning of the
00301  * structure, so that we can rearrange the rest of it while letting the
00302  * version checks continue to work.  DB_REPVERSION should be revved any time
00303  * the rest of the structure changes or the message type numbers change.
00304  */
00305 typedef struct __rep_control {
00306 #define DB_REPVERSION   3
00307         u_int32_t       rep_version;    /* Replication version number. */
00308         u_int32_t       log_version;    /* Log version number. */
00309 
00310         DB_LSN          lsn;            /* Log sequence number. */
00311         u_int32_t       rectype;        /* Message type. */
00312         u_int32_t       gen;            /* Generation number. */
00313         u_int32_t       flags;          /* log_put flag value. */
00314 } REP_CONTROL;
00315 
00316 /* Election vote information. */
00317 typedef struct __rep_vote {
00318         u_int32_t       egen;           /* Election generation. */
00319         int             nsites;         /* Number of sites I've been in
00320                                          * communication with. */
00321         int             nvotes;         /* Number of votes needed to win. */
00322         int             priority;       /* My site's priority. */
00323         u_int32_t       tiebreaker;     /* Tie-breaking quasi-random value. */
00324 } REP_VOTE_INFO;
00325 /* Vote tally entry: a voter's ID and its election generation. */
00326 typedef struct __rep_vtally {
00327         u_int32_t       egen;           /* Voter's election generation. */
00328         int             eid;            /* Voter's ID. */
00329 } REP_VTALLY;
00330 
00331 /*
00332  * The REP_THROTTLE_ONLY flag is used to do throttle processing only.
00333  * If set, it will only allow sending the REP_*_MORE message (REP_LOG_MORE
00334  * or REP_PAGE_MORE), but not the normal, non-throttled message.  It is
00335  * used to support throttling with bulk transfer.
00336  */
00337 /* Flags for __rep_send_throttle. */
00338 #define REP_THROTTLE_ONLY       0x0001  /* Send _MORE message only. */
00339 
00340 /* Throttled message processing information. */
00341 typedef struct __rep_throttle {
00342         DB_LSN          lsn;            /* LSN of this record. */
00343         DBT             *data_dbt;      /* DBT of this record. */
00344         u_int32_t       gbytes;         /* This call's max gbytes sent. */
00345         u_int32_t       bytes;          /* This call's max bytes sent. */
00346         u_int32_t       type;           /* Record type. */
00347 } REP_THROTTLE;
00348 
00349 /* Bulk processing information. */
00350 /*
00351  * !!!
00352  * We use a uintptr_t for the offset.  We'd really like to use a ptrdiff_t
00353  * since that really is what it is.  But ptrdiff_t is not portable and
00354  * doesn't exist everywhere.
00355  */
00356 typedef struct __rep_bulk {
00357         u_int8_t        *addr;          /* Address of bulk buffer. */
00358         uintptr_t       *offp;          /* Ptr to current offset into buffer. */
00359         u_int32_t       len;            /* Bulk buffer length. */
00360         u_int32_t       type;           /* Item type in buffer (log, page). */
00361         DB_LSN          lsn;            /* First LSN in buffer. */
00362         int             eid;            /* ID of potential recipients. */
00363 #define BULK_FORCE      0x001           /* Force buffer after this record. */
00364 #define BULK_XMIT       0x002           /* Buffer in transit. */
00365         u_int32_t       *flagsp;        /* Ptr to buffer flags (BULK_*). */
00366 } REP_BULK;
00367 
00368 /*
00369  * This structure takes care of representing a transaction.
00370  * It holds the LSNs of all the records, sorted by page number so that
00371  * we can obtain locks and apply updates in a deadlock free
00372  * order.
00373  */
00374 typedef struct __lsn_collection {
00375         u_int nlsns;            /* presumably # of LSNs in use -- confirm */
00376         u_int nalloc;           /* presumably # of slots allocated -- confirm */
00377         DB_LSN *array;          /* The LSNs. */
00378 } LSN_COLLECTION;
00379 
00380 /*
00381  * This is used by the page-prep routines to do the lock_vec call to
00382  * apply the updates for a single transaction or a collection of
00383  * transactions.
00384  */
00385 typedef struct _linfo {
00386         int             n;              /* presumably # of reqs/objs -- confirm */
00387         DB_LOCKREQ      *reqs;          /* Lock requests for lock_vec. */
00388         DBT             *objs;          /* presumably the lock objects -- confirm */
00389 } linfo_t;
00390 
00391 #include "dbinc_auto/rep_ext.h"
00392 #endif  /* !_REP_H_ */

Generated on Sun Dec 25 12:14:22 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2