Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

rep_elect.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 2004-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: rep_elect.c,v 12.10 2005/08/23 14:18:19 sue Exp $
00008  */
00009 
00010 #include "db_config.h"
00011 
00012 #ifndef NO_SYSTEM_INCLUDES
00013 #include <stdlib.h>
00014 #include <string.h>
00015 #if TIME_WITH_SYS_TIME
00016 #include <sys/time.h>
00017 #include <time.h>
00018 #else
00019 #if HAVE_SYS_TIME_H
00020 #include <sys/time.h>
00021 #else
00022 #include <time.h>
00023 #endif
00024 #endif
00025 #endif
00026 
00027 #include "db_int.h"
00028 #include "dbinc/db_page.h"
00029 #include "dbinc/db_am.h"
00030 #include "dbinc/log.h"
00031 
00032 static void __rep_cmp_vote __P((DB_ENV *, REP *, int, DB_LSN *,
00033     int, u_int32_t, u_int32_t));
00034 static int __rep_cmp_vote2 __P((DB_ENV *, REP *, int, u_int32_t));
00035 static int __rep_elect_init
00036                __P((DB_ENV *, DB_LSN *, int, int, int, int *, u_int32_t *));
00037 static int __rep_tally __P((DB_ENV *, REP *, int, int *, u_int32_t, roff_t));
00038 static int __rep_wait __P((DB_ENV *, u_int32_t, int *, u_int32_t));
00039 
00040 /*
00041  * __rep_elect --
00042  *      Called after master failure to hold/participate in an election for
00043  *      a new master.
       *
       *      nsites is the number of sites taking part and must be greater
       *      than 0.  nvotes is the number of votes needed to win; it may not
       *      be negative or exceed nsites, and 0 selects a simple majority,
       *      (nsites / 2) + 1.  priority is this site's election priority and
       *      may not be negative.  timeout is the value handed to __rep_wait
       *      for each election phase; it is backed off to 80% when the
       *      election generation changes under us and doubled for phase 2
       *      when we voted for another site.  flags is currently unused.
       *
       *      Returns 0 when the election completed or another thread is
       *      already running one (*eidp is filled in here, or presumably by
       *      __rep_wait/__rep_elect_master), EINVAL for bad arguments or if
       *      our own vote cannot be tallied, DB_REP_UNAVAIL when too few
       *      votes arrive in time, or an error from a helper routine.
00044  *
00045  * PUBLIC:  int __rep_elect __P((DB_ENV *, int, int, int,
00046  * PUBLIC:     u_int32_t, int *, u_int32_t));
00047  */
00048 int
00049 __rep_elect(dbenv, nsites, nvotes, priority, timeout, eidp, flags)
00050         DB_ENV *dbenv;
00051         int nsites, nvotes, priority;
00052         u_int32_t timeout;
00053         int *eidp;
00054         u_int32_t flags;
00055 {
00056         DB_LOG *dblp;
00057         DB_LSN lsn;
00058         DB_REP *db_rep;
00059         REP *rep;
00060         int ack, done, in_progress, ret, send_vote;
00061         u_int32_t egen, orig_tally, tiebreaker, to;
00062 #ifdef DIAGNOSTIC
00063         DB_MSGBUF mb;
00064 #endif
00065
00066         PANIC_CHECK(dbenv);
00067         COMPQUIET(flags, 0);
00068         ENV_REQUIRES_CONFIG(dbenv, dbenv->rep_handle, "rep_elect", DB_INIT_REP);
00069
00070         /* Error checking. */
00071         if (nsites <= 0) {
00072                 __db_err(dbenv,
00073                     "DB_ENV->rep_elect: nsites must be greater than 0");
00074                 return (EINVAL);
00075         }
00076         if (nvotes < 0) {
00077                 __db_err(dbenv,
00078                     "DB_ENV->rep_elect: nvotes may not be negative");
00079                 return (EINVAL);
00080         }
00081         if (priority < 0) {
00082                 __db_err(dbenv,
00083                     "DB_ENV->rep_elect: priority may not be negative");
00084                 return (EINVAL);
00085         }
00086         if (nsites < nvotes) {
00087                 __db_err(dbenv,
00088     "DB_ENV->rep_elect: nvotes (%d) is larger than nsites (%d)",
00089                     nvotes, nsites);
00090                 return (EINVAL);
00091         }
00092
00093         ack = nvotes;
00094         /* If they give us a 0 for nvotes, default to simple majority.  */
00095         if (nvotes == 0)
00096                 ack = (nsites / 2) + 1;
00097
00098         /*
00099          * XXX
00100          * If users give us less than a majority, they run the risk of
00101          * having a network partition.  However, this also allows the
00102          * scenario of master/1 client to elect the client.  Allow
00103          * sub-majority values, but give a warning.
               *
               * Test ack, the effective vote count, rather than nvotes:
               * nvotes == 0 was mapped to a simple majority above and must
               * not trigger the warning.  Whenever the warning does fire,
               * ack == nvotes, so the message still reports the right value.
00104          */
00105         if (ack <= (nsites / 2)) {
00106                 __db_err(dbenv,
00107     "DB_ENV->rep_elect:WARNING: nvotes (%d) is sub-majority with nsites (%d)",
00108                     nvotes, nsites);
00109         }
00110
00111         db_rep = dbenv->rep_handle;
00112         rep = db_rep->region;
00113         dblp = dbenv->lg_handle;
00114
00115         RPRINT(dbenv, rep,
00116             (dbenv, &mb, "Start election nsites %d, ack %d, priority %d",
00117             nsites, ack, priority));
00118
00119         LOG_SYSTEM_LOCK(dbenv);
00120         lsn = ((LOG *)dblp->reginfo.primary)->lsn;
00121         LOG_SYSTEM_UNLOCK(dbenv);
00122
00123         orig_tally = 0;
00124         to = timeout;
00125         if ((ret = __rep_elect_init(dbenv,
00126             &lsn, nsites, ack, priority, &in_progress, &orig_tally)) != 0) {
00127                 if (ret == DB_REP_NEWMASTER) {
00128                         ret = 0;
00129                         *eidp = dbenv->rep_eid;
00130                 }
00131                 goto err;
00132         }
00133         /*
00134          * If another thread is in the middle of an election we
00135          * just quietly return and not interfere.
00136          */
00137         if (in_progress) {
00138                 *eidp = rep->master_id;
00139                 return (0);
00140         }
00141         __os_clock(dbenv, &rep->esec, &rep->eusec);
00142 restart:
00143         /* Generate a randomized tiebreaker value. */
00144         __os_unique_id(dbenv, &tiebreaker);
00145
00146         REP_SYSTEM_LOCK(dbenv);
00147         F_SET(rep, REP_F_EPHASE1 | REP_F_NOARCHIVE);
00148         F_CLR(rep, REP_F_TALLY);
00149
00150         /*
00151          * We are about to participate at this egen.  We must
00152          * write out the next egen before participating in this one
00153          * so that if we crash we can never participate in this egen
00154          * again.
00155          */
00156         if ((ret = __rep_write_egen(dbenv, rep->egen + 1)) != 0)
00157                 goto lockdone;
00158
00159         /* Tally our own vote */
00160         if (__rep_tally(dbenv, rep, rep->eid, &rep->sites, rep->egen,
00161             rep->tally_off) != 0) {
00162                 ret = EINVAL;
00163                 goto lockdone;
00164         }
00165         __rep_cmp_vote(dbenv, rep, rep->eid, &lsn, priority, rep->gen,
00166             tiebreaker);
00167
00168         RPRINT(dbenv, rep, (dbenv, &mb, "Beginning an election"));
00169
00170         /* Now send vote */
00171         send_vote = DB_EID_INVALID;
00172         egen = rep->egen;
00173         REP_SYSTEM_UNLOCK(dbenv);
00174         __rep_send_vote(dbenv, &lsn, nsites, ack, priority, tiebreaker, egen,
00175             DB_EID_BROADCAST, REP_VOTE1);
00176         DB_ENV_TEST_RECOVERY(dbenv, DB_TEST_ELECTVOTE1, ret, NULL);
00177         ret = __rep_wait(dbenv, to, eidp, REP_F_EPHASE1);
00178         switch (ret) {
00179                 case 0:
00180                         /* Check if election complete or phase complete. */
00181                         if (*eidp != DB_EID_INVALID) {
00182                                 RPRINT(dbenv, rep, (dbenv, &mb,
00183                                     "Ended election phase 1 %d", ret));
00184                                 goto edone;
00185                         }
00186                         goto phase2;
00187                 case DB_REP_EGENCHG:
00188                         if (to > timeout)
00189                                 to = timeout;
00190                         to = (to * 8) / 10;
00191                         RPRINT(dbenv, rep, (dbenv, &mb,
00192 "Egen changed while waiting. Now %lu.  New timeout %lu, orig timeout %lu",
00193                             (u_long)rep->egen, (u_long)to, (u_long)timeout));
00194                         /*
00195                          * If the egen changed while we were sleeping, that
00196                          * means we're probably late to the next election,
00197                          * so we'll backoff our timeout so that we don't get
00198                          * into an out-of-phase election scenario.
00199                          *
00200                          * Backoff to 80% of the current timeout.
00201                          */
00202                         goto restart;
00203                 case DB_TIMEOUT:
00204                         break;
00205                 default:
00206                         goto err;
00207         }
00208         /*
00209          * If we got here, we haven't heard from everyone, but we've
00210          * run out of time, so it's time to decide if we have enough
00211          * votes to pick a winner and if so, to send out a vote to
00212          * the winner.
00213          */
00214         REP_SYSTEM_LOCK(dbenv);
00215         /*
00216          * If our egen changed while we were waiting.  We need to
00217          * essentially reinitialize our election.
00218          */
00219         if (egen != rep->egen) {
00220                 REP_SYSTEM_UNLOCK(dbenv);
00221                 RPRINT(dbenv, rep, (dbenv, &mb, "Egen changed from %lu to %lu",
00222                     (u_long)egen, (u_long)rep->egen));
00223                 goto restart;
00224         }
00225         if (rep->sites >= rep->nvotes) {
00226
00227                 /* We think we've seen enough to cast a vote. */
00228                 send_vote = rep->winner;
00229                 /*
00230                  * See if we won.  This will make sure we
00231                  * don't count ourselves twice if we're racing
00232                  * with incoming votes.
00233                  */
00234                 if (rep->winner == rep->eid) {
00235                         (void)__rep_tally(dbenv, rep, rep->eid, &rep->votes,
00236                             egen, rep->v2tally_off);
00237                         RPRINT(dbenv, rep, (dbenv, &mb,
00238                             "Counted my vote %d", rep->votes));
00239                 }
00240                 F_SET(rep, REP_F_EPHASE2);
00241                 F_CLR(rep, REP_F_EPHASE1);
00242         }
00243         REP_SYSTEM_UNLOCK(dbenv);
00244         if (send_vote == DB_EID_INVALID) {
00245                 /* We do not have enough votes to elect. */
00246                 RPRINT(dbenv, rep, (dbenv, &mb,
00247                     "Not enough votes to elect: recvd %d of %d from %d sites",
00248                     rep->sites, rep->nvotes, rep->nsites));
00249                 ret = DB_REP_UNAVAIL;
00250                 goto err;
00251
00252         } else {
00253                 /*
00254                  * We have seen enough vote1's.  Now we need to wait
00255                  * for all the vote2's.
00256                  */
00257                 if (send_vote != rep->eid) {
00258                         RPRINT(dbenv, rep, (dbenv, &mb, "Sending vote"));
00259                         __rep_send_vote(dbenv, NULL, 0, 0, 0, 0, egen,
00260                             send_vote, REP_VOTE2);
00261                         /*
00262                          * If we are NOT the new master we want to send
00263                          * our vote to the winner, and wait longer.  The
00264                          * reason is that the winner may be "behind" us
00265                          * in the election waiting and if the master is
00266                          * down, the winner will wait the full timeout
00267                          * and we want to give the winner enough time to
00268                          * process all the votes.  Otherwise we could
00269                          * incorrectly return DB_REP_UNAVAIL and start a
00270                          * new election before the winner can declare
00271                          * itself.
00272                          */
00273                         to = to * 2;
00274
00275                 }
00276
00277 phase2:         ret = __rep_wait(dbenv, to, eidp, REP_F_EPHASE2);
00278                 RPRINT(dbenv, rep, (dbenv, &mb,
00279                     "Ended election phase 2 %d", ret));
00280                 switch (ret) {
00281                         case 0:
00282                                 goto edone;
00283                         case DB_REP_EGENCHG:
00284                                 if (to > timeout)
00285                                         to = timeout;
00286                                 to = (to * 8) / 10;
00287                                 RPRINT(dbenv, rep, (dbenv, &mb,
00288 "While waiting egen changed to %lu.  Phase 2 New timeout %lu, orig timeout %lu",
00289                                     (u_long)rep->egen,
00290                                     (u_long)to, (u_long)timeout));
00291                                 goto restart;
00292                         case DB_TIMEOUT:
00293                                 ret = DB_REP_UNAVAIL;
00294                                 break;
00295                         default:
00296                                 goto err;
00297                 }
00298                 REP_SYSTEM_LOCK(dbenv);
00299                 if (egen != rep->egen) {
00300                         REP_SYSTEM_UNLOCK(dbenv);
00301                         RPRINT(dbenv, rep, (dbenv, &mb,
00302                             "Egen ph2 changed from %lu to %lu",
00303                             (u_long)egen, (u_long)rep->egen));
00304                         goto restart;
00305                 }
00306                 done = rep->votes >= rep->nvotes;
00307                 RPRINT(dbenv, rep, (dbenv, &mb,
00308                     "After phase 2: done %d, votes %d, nsites %d",
00309                     done, rep->votes, rep->nsites));
00310                 if (send_vote == rep->eid && done) {
00311                         __rep_elect_master(dbenv, rep, eidp);
00312                         ret = 0;
00313                         goto lockdone;
00314                 }
00315                 REP_SYSTEM_UNLOCK(dbenv);
00316         }
00317
00318 err:    REP_SYSTEM_LOCK(dbenv);
00319 lockdone:
00320         /*
00321          * If we get here because of a non-election error, then we
00322          * did not tally our vote.  The only non-election error is
00323          * from elect_init where we were unable to grow_sites.  In
00324          * that case we do not want to discard all known election info.
00325          */
00326         if (ret == 0 || ret == DB_REP_UNAVAIL)
00327                 __rep_elect_done(dbenv, rep);
00328         else if (orig_tally)
00329                 F_SET(rep, orig_tally);
00330
00331         /*
00332          * If the election finished elsewhere, we need to decrement
00333          * the elect_th anyway.
               *
               * The if (0) body is reachable only through the edone label.
               * Paths that jump there do not hold the region lock, so it is
               * taken here; everything else falls past the block with the
               * lock already held.
00334          */
00335         if (0) {
00336 edone:          REP_SYSTEM_LOCK(dbenv);
00337         }
00338         rep->elect_th = 0;
00339
00340         RPRINT(dbenv, rep, (dbenv, &mb,
00341             "Ended election with %d, sites %d, egen %lu, flags 0x%lx",
00342             ret, rep->sites, (u_long)rep->egen, (u_long)rep->flags));
00343         REP_SYSTEM_UNLOCK(dbenv);
00344
00345 DB_TEST_RECOVERY_LABEL
00346         return (ret);
00347 }
00348 
00349 /*
00350  * __rep_vote1 --
00351  *      Handle incoming vote1 message on a client.
       *
       *      rp is the control message (its lsn and gen are used as the
       *      voter's), rec->data is read as a REP_VOTE_INFO, and eid is the
       *      sender.  A master simply rebroadcasts REP_NEWMASTER.  A vote from
       *      an older election generation is answered with REP_ALIVE carrying
       *      our egen; a vote from a newer one resets our election state and
       *      adopts the newer egen before the vote is tallied.
       *
       *      Returns 0, DB_REP_HOLDELECTION if the vote was recorded but we
       *      are not in an election, or the error from __rep_grow_sites.
00352  *
00353  * PUBLIC: int __rep_vote1 __P((DB_ENV *, REP_CONTROL *, DBT *, int));
00354  */
00355 int
00356 __rep_vote1(dbenv, rp, rec, eid)
00357         DB_ENV *dbenv;
00358         REP_CONTROL *rp;
00359         DBT *rec;
00360         int eid;
00361 {
00362         DB_LOG *dblp;
00363         DB_LSN lsn;
00364         DB_REP *db_rep;
00365         DBT data_dbt;
00366         LOG *lp;
00367         REP *rep;
00368         REP_VOTE_INFO *vi;
00369         u_int32_t egen;
00370         int done, master, ret;
00371 #ifdef DIAGNOSTIC
00372         DB_MSGBUF mb;
00373 #endif
00374
00375         ret = 0;
00376         db_rep = dbenv->rep_handle;
00377         rep = db_rep->region;
00378         dblp = dbenv->lg_handle;
00379         lp = dblp->reginfo.primary;
00380
00381         if (F_ISSET(rep, REP_F_MASTER)) {
00382                 RPRINT(dbenv, rep,
00383                     (dbenv, &mb, "Master received vote"));
00384                 LOG_SYSTEM_LOCK(dbenv);
00385                 lsn = lp->lsn;
00386                 LOG_SYSTEM_UNLOCK(dbenv);
00387                 (void)__rep_send_message(dbenv,
00388                     DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
00389                 return (ret);
00390         }
00391
00392         vi = (REP_VOTE_INFO *)rec->data;
00393         REP_SYSTEM_LOCK(dbenv);
00394
00395         /*
00396          * If we get a vote from a later election gen, we
00397          * clear everything from the current one, and we'll
00398          * start over by tallying it.  If we get an old vote,
00399          * send an ALIVE to the old participant.
00400          */
00401         RPRINT(dbenv, rep, (dbenv, &mb,
00402             "Received vote1 egen %lu, egen %lu",
00403             (u_long)vi->egen, (u_long)rep->egen));
00404         if (vi->egen < rep->egen) {
00405                 RPRINT(dbenv, rep, (dbenv, &mb,
00406                     "Received old vote %lu, egen %lu, ignoring vote1",
00407                     (u_long)vi->egen, (u_long)rep->egen));
00408                 egen = rep->egen;
00409                 REP_SYSTEM_UNLOCK(dbenv);
                      /*
                       * Zero the DBT first: only data and size are filled in
                       * below, and the remaining members must not be handed
                       * to the send path uninitialized.
                       */
                      memset(&data_dbt, 0, sizeof(data_dbt));
00410                 data_dbt.data = &egen;
00411                 data_dbt.size = sizeof(egen);
00412                 (void)__rep_send_message(dbenv,
00413                     eid, REP_ALIVE, &rp->lsn, &data_dbt, 0, 0);
00414                 return (ret);
00415         }
00416         if (vi->egen > rep->egen) {
00417                 RPRINT(dbenv, rep, (dbenv, &mb,
00418                     "Received VOTE1 from egen %lu, my egen %lu; reset",
00419                     (u_long)vi->egen, (u_long)rep->egen));
00420                 __rep_elect_done(dbenv, rep);
00421                 rep->egen = vi->egen;
00422         }
00423         if (!IN_ELECTION(rep))
00424                 F_SET(rep, REP_F_TALLY);
00425
00426         /* Check if this site knows about more sites than we do. */
00427         if (vi->nsites > rep->nsites)
00428                 rep->nsites = vi->nsites;
00429
00430         /* Check if this site requires more votes than we do. */
00431         if (vi->nvotes > rep->nvotes)
00432                 rep->nvotes = vi->nvotes;
00433
00434         /*
00435          * We are keeping the vote, let's see if that changes our
00436          * count of the number of sites.
00437          */
00438         if (rep->sites + 1 > rep->nsites)
00439                 rep->nsites = rep->sites + 1;
00440         if (rep->nsites > rep->asites &&
00441             (ret = __rep_grow_sites(dbenv, rep->nsites)) != 0) {
00442                 RPRINT(dbenv, rep, (dbenv, &mb,
00443                     "Grow sites returned error %d", ret));
00444                 goto err;
00445         }
00446
00447         /*
00448          * Ignore vote1's if we're in phase 2.
00449          */
00450         if (F_ISSET(rep, REP_F_EPHASE2)) {
00451                 RPRINT(dbenv, rep, (dbenv, &mb,
00452                     "In phase 2, ignoring vote1"));
00453                 goto err;
00454         }
00455
00456         /*
00457          * Record this vote.  If we get back non-zero, we
00458          * ignore the vote.
00459          */
00460         if ((ret = __rep_tally(dbenv, rep, eid, &rep->sites,
00461             vi->egen, rep->tally_off)) != 0) {
00462                 RPRINT(dbenv, rep, (dbenv, &mb,
00463                     "Tally returned %d, sites %d",
00464                     ret, rep->sites));
00465                 ret = 0;
00466                 goto err;
00467         }
00468         RPRINT(dbenv, rep, (dbenv, &mb,
00469             "Incoming vote: (eid)%d (pri)%d (gen)%lu (egen)%lu [%lu,%lu]",
00470             eid, vi->priority,
00471             (u_long)rp->gen, (u_long)vi->egen,
00472             (u_long)rp->lsn.file, (u_long)rp->lsn.offset));
00473 #ifdef DIAGNOSTIC
00474         if (rep->sites > 1)
00475                 RPRINT(dbenv, rep, (dbenv, &mb,
00476     "Existing vote: (eid)%d (pri)%d (gen)%lu (sites)%d [%lu,%lu]",
00477                     rep->winner, rep->w_priority,
00478                     (u_long)rep->w_gen, rep->sites,
00479                     (u_long)rep->w_lsn.file,
00480                     (u_long)rep->w_lsn.offset));
00481 #endif
00482         __rep_cmp_vote(dbenv, rep, eid, &rp->lsn, vi->priority,
00483             rp->gen, vi->tiebreaker);
00484         /*
00485          * If you get a vote and you're not in an election, we've
00486          * already recorded this vote.  But that is all we need
00487          * to do.
00488          */
00489         if (!IN_ELECTION(rep)) {
00490                 RPRINT(dbenv, rep, (dbenv, &mb,
00491                     "Not in election, but received vote1 0x%x",
00492                     rep->flags));
00493                 ret = DB_REP_HOLDELECTION;
00494                 goto err;
00495         }
00496
00497         master = rep->winner;
00498         lsn = rep->w_lsn;
00499         /*
00500          * We need to check sites == nsites, not more than half
00501          * like we do in __rep_elect and the VOTE2 code below.  The
00502          * reason is that we want to process all the incoming votes
00503          * and not short-circuit once we reach more than half.  The
00504          * real winner's vote may be in the last half.
00505          */
00506         done = rep->sites >= rep->nsites && rep->w_priority != 0;
00507         if (done) {
00508                 RPRINT(dbenv, rep,
00509                     (dbenv, &mb, "Phase1 election done"));
00510                 RPRINT(dbenv, rep, (dbenv, &mb, "Voting for %d%s",
00511                     master, master == rep->eid ? "(self)" : ""));
00512                 egen = rep->egen;
00513                 F_SET(rep, REP_F_EPHASE2);
00514                 F_CLR(rep, REP_F_EPHASE1);
00515                 if (master == rep->eid) {
00516                         (void)__rep_tally(dbenv, rep, rep->eid,
00517                             &rep->votes, egen, rep->v2tally_off);
00518                         goto err;
00519                 }
00520                 REP_SYSTEM_UNLOCK(dbenv);
00521
00522                 /* Vote for someone else. */
00523                 __rep_send_vote(dbenv, NULL, 0, 0, 0, 0, egen,
00524                     master, REP_VOTE2);
00525         } else
              /*
               * Every "goto err" arrives here still holding the region lock.
               * The branch above that votes for another site has already
               * released it, which is why the unlock hangs off the else.
               */
00526 err:            REP_SYSTEM_UNLOCK(dbenv);
00527         return (ret);
00528 }
00529 
00530 /*
00531  * __rep_vote2 --
00532  *      Handle an incoming vote2 message, i.e. another site casting its
00533  *      vote for us.  A master answers with a REP_NEWMASTER broadcast.
00534  *      Returns DB_REP_HOLDELECTION when the vote arrives while we are
00535  *      not in an election, DB_REP_NEWMASTER once enough votes have been
00536  *      counted to elect a master, and 0 otherwise.
00537  *
00538  * PUBLIC: int __rep_vote2 __P((DB_ENV *, DBT *, int *));
00539  */
00540 int
00541 __rep_vote2(dbenv, rec, eidp)
00542         DB_ENV *dbenv;
00543         DBT *rec;
00544         int *eidp;
00545 {
00546         DB_LOG *dblp;
00547         DB_LSN lsn;
00548         DB_REP *db_rep;
00549         LOG *lp;
00550         REP *rep;
00551         REP_VOTE_INFO *vi;
00552         int ret;
00553 #ifdef DIAGNOSTIC
00554         DB_MSGBUF mb;
00555 #endif
00556
00557         db_rep = dbenv->rep_handle;
00558         dblp = dbenv->lg_handle;
00559         rep = db_rep->region;
00560         lp = dblp->reginfo.primary;
00561         ret = 0;
00562
00563         RPRINT(dbenv, rep, (dbenv, &mb, "We received a vote%s",
00564             F_ISSET(rep, REP_F_MASTER) ? " (master)" : ""));
00565
00566         /* A master does not tally the vote; it rebroadcasts NEWMASTER. */
00567         if (F_ISSET(rep, REP_F_MASTER)) {
00568                 LOG_SYSTEM_LOCK(dbenv);
00569                 lsn = lp->lsn;
00570                 LOG_SYSTEM_UNLOCK(dbenv);
00571                 rep->stat.st_elections_won++;
00572                 (void)__rep_send_message(dbenv,
00573                     DB_EID_BROADCAST, REP_NEWMASTER, &lsn, NULL, 0, 0);
00574                 return (0);
00575         }
00576
00577         REP_SYSTEM_LOCK(dbenv);
00578
00579         /* A priority 0 site should never be voted for. */
00580         DB_ASSERT(rep->priority != 0);
00581
00582         /*
00583          * Other sites may have finished tallying vote1's and picked us
00584          * before our own election thread got that far.  As long as we are
00585          * at least tallying sites we hold on to the vote so it is already
00586          * counted when our thread catches up.  If we are doing neither and
00587          * the vote is not from an older egen, ask for an election instead.
00588          */
00589         vi = (REP_VOTE_INFO *)rec->data;
00590         if (!IN_ELECTION_TALLY(rep) && vi->egen >= rep->egen) {
00591                 RPRINT(dbenv, rep, (dbenv, &mb,
00592                     "Not in election gen %lu, at %lu, got vote",
00593                     (u_long)vi->egen, (u_long)rep->egen));
00594                 ret = DB_REP_HOLDELECTION;
00595                 goto unlock;
00596         }
00597
00598         /*
00599          * Only a VOTE2's election generation is meaningful.  Four cases:
00600          * 1. A stale VOTE2 from an earlier election: ignore.
00601          * 2. A VOTE2 from a site that never sent us a VOTE1: ignore.
00602          * 3. A duplicate VOTE2 from the same site in this election: ignore.
00603          * 4. A genuine vote for us in the current election: count it.
00604          *
00605          * __rep_cmp_vote2 screens out cases 1 and 2, __rep_tally handles
00606          * cases 3 and 4.  A rejected vote is not an error, so ret stays 0.
00607          */
00608         if (__rep_cmp_vote2(dbenv, rep, *eidp, vi->egen) != 0)
00609                 goto unlock;
00610         if (__rep_tally(dbenv, rep, *eidp, &rep->votes,
00611             vi->egen, rep->v2tally_off) != 0)
00612                 goto unlock;
00613
00614         RPRINT(dbenv, rep, (dbenv, &mb, "Counted vote %d of %d",
00615             rep->votes, rep->nvotes));
00616         if (rep->votes >= rep->nvotes) {
00617                 __rep_elect_master(dbenv, rep, eidp);
00618                 ret = DB_REP_NEWMASTER;
00619         }
00620
00621 unlock: REP_SYSTEM_UNLOCK(dbenv);
00622         return (ret);
00623 }
00634 
00635 /*
00636  * __rep_tally --
00637  *      Record a vote in one of the region's tally arrays; vtoff selects
00638  *      which one, so this serves both VOTE1 and VOTE2 messages.  Called
00639  *      with the db_rep mutex held.  Returns 0 if the vote was tallied and
00640  *      non-zero if it was ignored.  *countp, the number of entries in
00641  *      use, is bumped only when a new voter is added.
00642  */
00643 static int
00644 __rep_tally(dbenv, rep, eid, countp, egen, vtoff)
00645         DB_ENV *dbenv;
00646         REP *rep;
00647         int eid, *countp;
00648         u_int32_t egen;
00649         roff_t vtoff;
00650 {
00651         REP_VTALLY *slot, *tally;
00652         int i;
00653 #ifdef DIAGNOSTIC
00654         DB_MSGBUF mb;
00655 #else
00656         COMPQUIET(rep, NULL);
00657 #endif
00658
00659         tally = R_ADDR((REGINFO *)dbenv->reginfo, vtoff);
00660
00661         /* Look for an entry this site already owns. */
00662         for (i = 0; i < *countp; i++) {
00663                 slot = &tally[i];
00664                 if (slot->eid != eid)
00665                         continue;
00666                 RPRINT(dbenv, rep, (dbenv, &mb,
00667                     "Tally found[%d] (%d, %lu), this vote (%d, %lu)",
00668                             i, slot->eid, (u_long)slot->egen,
00669                             eid, (u_long)egen));
00670                 /*
00671                  * A vote that is no newer than the one on record is a
00672                  * duplicate or a delayed vote from an earlier election:
00673                  * ignore it.  A newer one replaces the recorded egen in
00674                  * place, which does not change the count.
00675                  */
00676                 if (slot->egen >= egen)
00677                         return (1);
00678                 slot->egen = egen;
00679                 return (0);
00680         }
00681
00682         /* First vote from this site: append it at index *countp. */
00683         slot = &tally[i];
00684 #ifdef DIAGNOSTIC
00685         if (vtoff == rep->tally_off)
00686                 RPRINT(dbenv, rep, (dbenv, &mb, "Tallying VOTE1[%d] (%d, %lu)",
00687                     i, eid, (u_long)egen));
00688         else
00689                 RPRINT(dbenv, rep, (dbenv, &mb, "Tallying VOTE2[%d] (%d, %lu)",
00690                     i, eid, (u_long)egen));
00691 #endif
00692         slot->eid = eid;
00693         slot->egen = egen;
00694         (*countp)++;
00695         return (0);
00696 }
00705 
00706 /*
00707  * __rep_cmp_vote --
00708  *      Weigh a vote1 against the best candidate recorded so far and make
00709  *      it the current winner if it is better.  Called with the db_rep
00710  *      mutex held.
00711  */
00712 static void
00713 __rep_cmp_vote(dbenv, rep, eid, lsnp, priority, gen, tiebreaker)
00714         DB_ENV *dbenv;
00715         REP *rep;
00716         int eid;
00717         DB_LSN *lsnp;
00718         int priority;
00719         u_int32_t gen, tiebreaker;
00720 {
00721         int lsn_cmp;
00722
00723 #ifdef DIAGNOSTIC
00724         DB_MSGBUF mb;
00725 #else
00726         COMPQUIET(dbenv, NULL);
00727 #endif
00728         lsn_cmp = log_compare(lsnp, &rep->w_lsn);
00729
00730         if (rep->sites == 1) {
00731                 /*
00732                  * First vote tallied.  A priority 0 vote is never recorded
00733                  * as the winner, so in that case start with no winner.
00734                  */
00735                 if (priority == 0) {
00736                         rep->winner = DB_EID_INVALID;
00737                         rep->w_priority = 0;
00738                         rep->w_gen = 0;
00739                         ZERO_LSN(rep->w_lsn);
00740                         rep->w_tiebreaker = 0;
00741                         return;
00742                 }
00743         } else if (rep->sites > 1 && priority != 0) {
00744                 /*
00745                  * Compare against the best so far.  The LSN decides; equal
00746                  * LSNs fall back to priority, and equal priorities to the
00747                  * tiebreaker.
00748                  */
00749                 if (lsn_cmp < 0)
00750                         return;
00751                 if (lsn_cmp == 0 && (priority < rep->w_priority ||
00752                     (priority == rep->w_priority &&
00753                     tiebreaker <= rep->w_tiebreaker)))
00754                         return;
00755                 RPRINT(dbenv, rep, (dbenv, &mb, "Accepting new vote"));
00756         } else
00757                 return;
00758
00759         /* This vote is the new front runner. */
00760         rep->winner = eid;
00761         rep->w_priority = priority;
00762         rep->w_lsn = *lsnp;
00763         rep->w_gen = gen;
00764         rep->w_tiebreaker = tiebreaker;
00765 }
00767 
00768 /*
00769  * __rep_cmp_vote2 --
00770  *      Check an incoming vote2 against the vote1's in the tally.  Called
00771  *      with the db_rep mutex held.  Returns 0 when a vote1 from the same
00772  *      site and the same election generation is on record, 1 otherwise.
00773  */
00774 static int
00775 __rep_cmp_vote2(dbenv, rep, eid, egen)
00776         DB_ENV *dbenv;
00777         REP *rep;
00778         int eid;
00779         u_int32_t egen;
00780 {
00781         REP_VTALLY *entry, *tally;
00782         int i;
00783 #ifdef DIAGNOSTIC
00784         DB_MSGBUF mb;
00785 #endif
00786
00787         tally = R_ADDR((REGINFO *)dbenv->reginfo, rep->tally_off);
00788         for (i = 0; i < rep->sites; i++) {
00789                 entry = &tally[i];
00790                 if (entry->eid != eid || entry->egen != egen)
00791                         continue;
00792                 RPRINT(dbenv, rep, (dbenv, &mb,
00793                     "Found matching vote1 (%d, %lu), at %d of %d",
00794                     eid, (u_long)egen, i, rep->sites));
00795                 return (0);
00796         }
00797
00798         RPRINT(dbenv, rep,
00799             (dbenv, &mb, "Didn't find vote1 for eid %d, egen %lu",
00800             eid, (u_long)egen));
00801         return (1);
00802 }
00805 
00806 /*
00807  * __rep_elect_init
00808  *      Initialize an election.  Sets beginp non-zero if the election is
00809  * already in progress; makes it 0 otherwise.
00810  */
00811 static int
00812 __rep_elect_init(dbenv, lsnp, nsites, nvotes, priority, beginp, otally)
00813         DB_ENV *dbenv;
00814         DB_LSN *lsnp;
00815         int nsites, nvotes, priority;
00816         int *beginp;
00817         u_int32_t *otally;
00818 {
00819         DB_REP *db_rep;
00820         REP *rep;
00821         int ret;
00822 
00823         db_rep = dbenv->rep_handle;
00824         rep = db_rep->region;
00825 
00826         ret = 0;
00827 
00828         /* We may miscount, as we don't hold the replication mutex here. */
00829         rep->stat.st_elections++;
00830 
00831         /* If we are already a master; simply broadcast that fact and return. */
00832         if (F_ISSET(rep, REP_F_MASTER)) {
00833                 (void)__rep_send_message(dbenv,
00834                     DB_EID_BROADCAST, REP_NEWMASTER, lsnp, NULL, 0, 0);
00835                 rep->stat.st_elections_won++;
00836                 return (DB_REP_NEWMASTER);
00837         }
00838 
00839         REP_SYSTEM_LOCK(dbenv);
00840         if (otally != NULL)
00841                 *otally = F_ISSET(rep, REP_F_TALLY);
00842         *beginp = IN_ELECTION(rep) || rep->elect_th;
00843         if (!*beginp) {
00844                 /*
00845                  * Make sure that we always initialize all the election fields
00846                  * before putting ourselves in an election state.  That means
00847                  * issuing calls that can fail (allocation) before setting all
00848                  * the variables.
00849                  */
00850                 if (nsites > rep->asites &&
00851                     (ret = __rep_grow_sites(dbenv, nsites)) != 0)
00852                         goto err;
00853                 DB_ENV_TEST_RECOVERY(dbenv, DB_TEST_ELECTINIT, ret, NULL);
00854                 rep->elect_th = 1;
00855                 rep->nsites = nsites;
00856                 rep->nvotes = nvotes;
00857                 rep->priority = priority;
00858                 rep->master_id = DB_EID_INVALID;
00859         }
00860 DB_TEST_RECOVERY_LABEL
00861 err:    REP_SYSTEM_UNLOCK(dbenv);
00862         return (ret);
00863 }
00864 
00865 /*
00866  * __rep_elect_master
00867  *      Set up for new master from election.  Must be called with
00868  *      the replication region mutex held.
00869  *
00870  * PUBLIC: void __rep_elect_master __P((DB_ENV *, REP *, int *));
00871  */
00872 void
00873 __rep_elect_master(dbenv, rep, eidp)
00874         DB_ENV *dbenv;
00875         REP *rep;
00876         int *eidp;
00877 {
00878 #ifdef DIAGNOSTIC
00879         DB_MSGBUF mb;
00880 #else
00881         COMPQUIET(dbenv, NULL);
00882 #endif
00883         rep->master_id = rep->eid;
00884         F_SET(rep, REP_F_MASTERELECT);
00885         if (eidp != NULL)
00886                 *eidp = rep->master_id;
00887         rep->stat.st_elections_won++;
00888         RPRINT(dbenv, rep, (dbenv, &mb,
00889             "Got enough votes to win; election done; winner is %d, gen %lu",
00890             rep->master_id, (u_long)rep->gen));
00891 }
00892 
00893 static int
00894 __rep_wait(dbenv, timeout, eidp, flags)
00895         DB_ENV *dbenv;
00896         u_int32_t timeout;
00897         int *eidp;
00898         u_int32_t flags;
00899 {
00900         DB_REP *db_rep;
00901         REP *rep;
00902         int done, echg;
00903         u_int32_t egen, sleeptime;
00904 
00905         done = echg = 0;
00906         db_rep = dbenv->rep_handle;
00907         rep = db_rep->region;
00908         egen = rep->egen;
00909 
00910         /*
00911          * The user specifies an overall timeout function, but checking
00912          * is cheap and the timeout may be a generous upper bound.
00913          * Sleep repeatedly for the smaller of .5s and timeout/10.
00914          */
00915         sleeptime = (timeout > 5000000) ? 500000 : timeout / 10;
00916         if (sleeptime == 0)
00917                 sleeptime++;
00918         while (timeout > 0) {
00919                 __os_sleep(dbenv, 0, sleeptime);
00920                 REP_SYSTEM_LOCK(dbenv);
00921                 echg = egen != rep->egen;
00922                 done = !F_ISSET(rep, flags) && rep->master_id != DB_EID_INVALID;
00923 
00924                 *eidp = rep->master_id;
00925                 REP_SYSTEM_UNLOCK(dbenv);
00926 
00927                 if (done)
00928                         return (0);
00929 
00930                 if (echg)
00931                         return (DB_REP_EGENCHG);
00932 
00933                 if (timeout > sleeptime)
00934                         timeout -= sleeptime;
00935                 else
00936                         timeout = 0;
00937         }
00938         return (DB_TIMEOUT);
00939 }

Generated on Sun Dec 25 12:14:44 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2