Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

db_dispatch.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  */
00007 /*
00008  * Copyright (c) 1995, 1996
00009  *      The President and Fellows of Harvard University.  All rights reserved.
00010  *
00011  * This code is derived from software contributed to Berkeley by
00012  * Margo Seltzer.
00013  *
00014  * Redistribution and use in source and binary forms, with or without
00015  * modification, are permitted provided that the following conditions
00016  * are met:
00017  * 1. Redistributions of source code must retain the above copyright
00018  *    notice, this list of conditions and the following disclaimer.
00019  * 2. Redistributions in binary form must reproduce the above copyright
00020  *    notice, this list of conditions and the following disclaimer in the
00021  *    documentation and/or other materials provided with the distribution.
00022  * 3. Neither the name of the University nor the names of its contributors
00023  *    may be used to endorse or promote products derived from this software
00024  *    without specific prior written permission.
00025  *
00026  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00027  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00030  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00031  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00032  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00033  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00034  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00035  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00036  * SUCH DAMAGE.
00037  *
00038  * $Id: db_dispatch.c,v 12.12 2005/11/10 21:11:42 bostic Exp $
00039  */
00040 
00041 #include "db_config.h"
00042 
00043 #ifndef NO_SYSTEM_INCLUDES
00044 #include <sys/types.h>
00045 
00046 #include <string.h>
00047 #endif
00048 
00049 #include "db_int.h"
00050 #include "dbinc/db_page.h"
00051 #ifndef HAVE_FTRUNCATE
00052 #include "dbinc/db_shash.h"
00053 #endif
00054 #include "dbinc/hash.h"
00055 #ifndef HAVE_FTRUNCATE
00056 #include "dbinc/lock.h"
00057 #include "dbinc/mp.h"
00058 #endif
00059 #include "dbinc/log.h"
00060 #include "dbinc/fop.h"
00061 #include "dbinc/txn.h"
00062 
/*
 * Forward declarations for the file-local (static) helpers.  The limbo,
 * lock-move and pgnoadd helpers are declared only when HAVE_FTRUNCATE is
 * not defined; __db_txnlist_find_internal is always declared.
 */
00063 #ifndef HAVE_FTRUNCATE
00064 static int __db_limbo_fix __P((DB *, DB_TXN *,
00065                 DB_TXNLIST *, db_pgno_t *, DBMETA *, db_limbo_state));
00066 static int __db_limbo_bucket __P((DB_ENV *,
00067              DB_TXN *, DB_TXNLIST *, db_limbo_state));
00068 static int __db_limbo_move __P((DB_ENV *, DB_TXN *, DB_TXN *, DB_TXNLIST *));
00069 static int __db_limbo_prepare __P(( DB *, DB_TXN *, DB_TXNLIST *));
00070 static int __db_lock_move __P((DB_ENV *,
00071                 u_int8_t *, db_pgno_t, db_lockmode_t, DB_TXN *, DB_TXN *));
00072 static int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *,
00073                 int32_t, u_int8_t *, char *, db_pgno_t));
00074 #endif
/* Shared lookup used by the find, update and remove routines in this file. */
00075 static int __db_txnlist_find_internal __P((DB_ENV *, DB_TXNHEAD *,
00076                 db_txnlist_type, u_int32_t, u_int8_t *, DB_TXNLIST **,
00077                 int, u_int32_t *));
00078 
00079 /*
00080  * __db_dispatch --
00081  *
00082  * This is the transaction dispatch function used by the db access methods.
00083  * It is designed to handle the record format used by all the access
00084  * methods (the one automatically generated by the db_{h,log,read}.sh
00085  * scripts in the tools directory).  An application using a different
00086  * recovery paradigm will supply a different dispatch function to txn_open.
00087  *
00088  * PUBLIC: int __db_dispatch __P((DB_ENV *,
00089  * PUBLIC:     int (**)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)),
00090  * PUBLIC:     size_t, DBT *, DB_LSN *, db_recops, DB_TXNHEAD *));
00091  */
00092 int
00093 __db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info)
00094         DB_ENV *dbenv;          /* The environment. */
00095         int (**dtab)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
00096         size_t dtabsize;        /* Size of the dtab. */
00097         DBT *db;                /* The log record upon which to dispatch. */
00098         DB_LSN *lsnp;           /* The lsn of the record being dispatched. */
00099         db_recops redo;         /* Redo this op (or undo it). */
00100         DB_TXNHEAD *info;       /* Transaction list. */
00101 {
00102         DB_LSN prev_lsn;
00103         u_int32_t rectype, status, txnid;
00104         int make_call, ret;
00105 
00106         memcpy(&rectype, db->data, sizeof(rectype));
00107         memcpy(&txnid, (u_int8_t *)db->data + sizeof(rectype), sizeof(txnid));
00108         make_call = ret = 0;
00109 
00110         /* If we don't have a dispatch table, it's hard to dispatch. */
00111         DB_ASSERT(dtab != NULL);
00112 
00113         /*
00114          * If we find a record that is in the user's number space and they
00115          * have specified a recovery routine, let them handle it.  If they
00116          * didn't specify a recovery routine, then we expect that they've
00117          * followed all our rules and registered new recovery functions.
00118          */
00119         switch (redo) {
00120         case DB_TXN_ABORT:
00121         case DB_TXN_APPLY:
00122         case DB_TXN_PRINT:
00123                 make_call = 1;
00124                 break;
00125         case DB_TXN_OPENFILES:
00126                 /*
00127                  * We collect all the transactions that have
00128                  * "begin" records, those with no previous LSN,
00129                  * so that we do not abort partial transactions.
00130                  * These are known to be undone, otherwise the
00131                  * log would not have been freeable.
00132                  */
00133                 memcpy(&prev_lsn, (u_int8_t *)db->data +
00134                     sizeof(rectype) + sizeof(txnid), sizeof(prev_lsn));
00135                 if (txnid != 0 && prev_lsn.file == 0 && (ret =
00136                     __db_txnlist_add(dbenv, info, txnid, TXN_OK, NULL)) != 0)
00137                         return (ret);
00138 
00139                 /* FALLTHROUGH */
00140         case DB_TXN_POPENFILES:
00141                 if (rectype == DB___dbreg_register ||
00142                     rectype == DB___txn_child ||
00143                     rectype == DB___txn_ckp || rectype == DB___txn_recycle)
00144                         return (dtab[rectype](dbenv, db, lsnp, redo, info));
00145                 break;
00146         case DB_TXN_BACKWARD_ROLL:
00147                 /*
00148                  * Running full recovery in the backward pass. In general,
00149                  * we only process records during this pass that belong
00150                  * to aborted transactions.  Unfortunately, there are several
00151                  * exceptions:
00152                  * 1. If this is a meta-record, one not associated with
00153                  *    a transaction, then we must always process it.
00154                  * 2. If this is a transaction commit/abort, we must
00155                  *    always process it, so that we know the status of
00156                  *    every transaction.
00157                  * 3. If this is a child commit, we need to process it
00158                  *    because the outcome of the child transaction depends
00159                  *    on the outcome of the parent.
00160                  * 4. If this is a dbreg_register record, we must always
00161                  *    process is because they contain non-transactional
00162                  *    closes that must be properly handled.
00163                  * 5. If this is a noop, we must always undo it so that we
00164                  *    properly handle any aborts before a file was closed.
00165                  * 6. If this a file remove, we need to process it to
00166                  *    determine if the on-disk file is the same as the
00167                  *    one being described.
00168                  */
00169                 switch (rectype) {
00170                 /*
00171                  * These either do not belong to a transaction or (regop)
00172                  * must be processed regardless of the status of the
00173                  * transaction.
00174                  */
00175                 case DB___txn_regop:
00176                 case DB___txn_recycle:
00177                 case DB___txn_ckp:
00178                         make_call = 1;
00179                         break;
00180                 /*
00181                  * These belong to a transaction whose status must be
00182                  * checked.
00183                  */
00184                 case DB___txn_child:
00185                 case DB___db_noop:
00186                 case DB___fop_file_remove:
00187                 case DB___dbreg_register:
00188                         make_call = 1;
00189 
00190                         /* FALLTHROUGH */
00191                 default:
00192                         if (txnid == 0)
00193                                 break;
00194 
00195                         ret = __db_txnlist_find(dbenv, info, txnid, &status);
00196 
00197                         /* If not found, this is an incomplete abort.  */
00198                         if (ret == DB_NOTFOUND)
00199                                 return (__db_txnlist_add(dbenv,
00200                                     info, txnid, TXN_IGNORE, lsnp));
00201                         if (ret != 0)
00202                                 return (ret);
00203 
00204                         /*
00205                          * If we ignore the transaction, ignore the operation
00206                          * UNLESS this is a child commit in which case we need
00207                          * to make sure that the child also gets marked as
00208                          * ignore.
00209                          */
00210                         if (status == TXN_IGNORE && rectype != DB___txn_child) {
00211                                 make_call = 0;
00212                                 break;
00213                         }
00214                         if (status == TXN_COMMIT)
00215                                 break;
00216 
00217                         /* Set make_call in case we came through default */
00218                         make_call = 1;
00219                         if (status == TXN_OK &&
00220                             (ret = __db_txnlist_update(dbenv,
00221                             info, txnid, rectype == DB___txn_xa_regop ?
00222                             TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0)
00223                                 return (ret);
00224                 }
00225                 break;
00226         case DB_TXN_FORWARD_ROLL:
00227                 /*
00228                  * In the forward pass, if we haven't seen the transaction,
00229                  * do nothing, else recover it.
00230                  *
00231                  * We need to always redo DB___db_noop records, so that we
00232                  * properly handle any commits after the file was closed.
00233                  */
00234                 switch (rectype) {
00235                 case DB___txn_recycle:
00236                 case DB___txn_ckp:
00237                 case DB___db_noop:
00238                         make_call = 1;
00239                         break;
00240 
00241                 default:
00242                         if (txnid == 0)
00243                                 status = 0;
00244                         else {
00245                                 ret = __db_txnlist_find(dbenv,
00246                                     info, txnid, &status);
00247 
00248                                 if (ret == DB_NOTFOUND)
00249                                         /* Break out out of if clause. */
00250                                         ;
00251                                 else if (ret != 0)
00252                                         return (ret);
00253                                 else if (status == TXN_COMMIT) {
00254                                         make_call = 1;
00255                                         break;
00256                                 }
00257                         }
00258 
00259 #ifndef HAVE_FTRUNCATE
00260                         if (status != TXN_IGNORE &&
00261                             (rectype == DB___ham_metagroup ||
00262                             rectype == DB___ham_groupalloc ||
00263                             rectype == DB___db_pg_alloc)) {
00264                                 /*
00265                                  * Because we do not have truncate
00266                                  * all allocation records must be reprocessed
00267                                  * during rollforward in case the file was
00268                                  * just created.  It may not have been
00269                                  * present during the backward pass.
00270                                  */
00271                                 make_call = 1;
00272                                 redo = DB_TXN_BACKWARD_ALLOC;
00273                         } else
00274 #endif
00275                         if (rectype == DB___dbreg_register) {
00276                                 /*
00277                                  * This may be a transaction dbreg_register.
00278                                  * If it is, we only make the call on a COMMIT,
00279                                  * which we checked above. If it's not, then we
00280                                  * should always make the call, because we need
00281                                  * the file open information.
00282                                  */
00283                                 if (txnid == 0)
00284                                         make_call = 1;
00285                         }
00286                 }
00287                 break;
00288         case DB_TXN_BACKWARD_ALLOC:
00289         default:
00290                 return (__db_unknown_flag(
00291                     dbenv, "__db_dispatch", (u_int32_t)redo));
00292         }
00293 
00294         if (make_call) {
00295                 /*
00296                  * If the debug flag is set then we are logging
00297                  * records for a non-durable update so that they
00298                  * may be examined for diagnostic purposes.
00299                  * So only make the call if we are printing,
00300                  * otherwise we need to extract the previous
00301                  * lsn so undo will work properly.
00302                  */
00303                 if (rectype & DB_debug_FLAG) {
00304                         if (redo == DB_TXN_PRINT)
00305                                 rectype &= ~DB_debug_FLAG;
00306                         else {
00307                                 memcpy(lsnp,
00308                                     (u_int8_t *)db->data +
00309                                     sizeof(rectype) +
00310                                     sizeof(txnid), sizeof(*lsnp));
00311                                 return (0);
00312                         }
00313                 }
00314                 if (rectype >= DB_user_BEGIN && dbenv->app_dispatch != NULL)
00315                         return (dbenv->app_dispatch(dbenv, db, lsnp, redo));
00316                 else {
00317                         /*
00318                          * The size of the dtab table argument is the same as
00319                          * the standard table, use the standard table's size
00320                          * as our sanity check.
00321                          */
00322                         if (rectype > dtabsize || dtab[rectype] == NULL) {
00323                                 __db_err(dbenv,
00324                                     "Illegal record type %lu in log",
00325                                     (u_long)rectype);
00326                                 return (EINVAL);
00327                         }
00328                         return (dtab[rectype](dbenv, db, lsnp, redo, info));
00329                 }
00330         }
00331 
00332         return (0);
00333 }
00334 
00335 /*
00336  * __db_add_recovery --
00337  *
00338  * PUBLIC: int __db_add_recovery __P((DB_ENV *,
00339  * PUBLIC:   int (***)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), size_t *,
00340  * PUBLIC:   int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
00341  */
00342 int
00343 __db_add_recovery(dbenv, dtab, dtabsize, func, ndx)
00344         DB_ENV *dbenv;
00345         int (***dtab) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
00346         size_t *dtabsize;
00347         int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
00348         u_int32_t ndx;
00349 {
00350         size_t i, nsize;
00351         int ret;
00352 
00353         /* Check if we have to grow the table. */
00354         if (ndx >= *dtabsize) {
00355                 nsize = ndx + 40;
00356                 if ((ret =
00357                     __os_realloc(dbenv, nsize * sizeof((*dtab)[0]), dtab)) != 0)
00358                         return (ret);
00359                 for (i = *dtabsize; i < nsize; ++i)
00360                         (*dtab)[i] = NULL;
00361                 *dtabsize = nsize;
00362         }
00363 
00364         (*dtab)[ndx] = func;
00365         return (0);
00366 }
00367 
00368 /*
00369  * __db_txnlist_init --
00370  *      Initialize transaction linked list.
00371  *
00372  * PUBLIC: int __db_txnlist_init __P((DB_ENV *,
00373  * PUBLIC:     u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
00374  */
00375 int
00376 __db_txnlist_init(dbenv, low_txn, hi_txn, trunc_lsn, retp)
00377         DB_ENV *dbenv;
00378         u_int32_t low_txn, hi_txn;
00379         DB_LSN *trunc_lsn;
00380         DB_TXNHEAD **retp;
00381 {
00382         DB_TXNHEAD *headp;
00383         u_int32_t size, tmp;
00384         int ret;
00385 
00386         /*
00387          * Size a hash table.
00388          *      If low is zero then we are being called during rollback
00389          * and we need only one slot.
00390          *      Hi maybe lower than low if we have recycled txnid's.
00391          *      The numbers here are guesses about txn density, we can afford
00392          * to look at a few entries in each slot.
00393          */
00394         if (low_txn == 0)
00395                 size = 1;
00396         else {
00397                 if (hi_txn < low_txn) {
00398                         tmp = hi_txn;
00399                         hi_txn = low_txn;
00400                         low_txn = tmp;
00401                 }
00402                 tmp = hi_txn - low_txn;
00403                 /* See if we wrapped around. */
00404                 if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
00405                         tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
00406                 size = tmp / 5;
00407                 if (size < 100)
00408                         size = 100;
00409         }
00410         if ((ret = __os_malloc(dbenv,
00411             sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
00412                 return (ret);
00413 
00414         memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
00415         headp->maxid = hi_txn;
00416         headp->generation = 0;
00417         headp->nslots = size;
00418         headp->gen_alloc = 8;
00419         if ((ret = __os_malloc(dbenv, headp->gen_alloc *
00420             sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
00421                 __os_free(dbenv, headp);
00422                 return (ret);
00423         }
00424         headp->gen_array[0].generation = 0;
00425         headp->gen_array[0].txn_min = TXN_MINIMUM;
00426         headp->gen_array[0].txn_max = TXN_MAXIMUM;
00427         if (trunc_lsn != NULL) {
00428                 headp->trunc_lsn = *trunc_lsn;
00429                 headp->maxlsn = *trunc_lsn;
00430         } else {
00431                 ZERO_LSN(headp->trunc_lsn);
00432                 ZERO_LSN(headp->maxlsn);
00433         }
00434         ZERO_LSN(headp->ckplsn);
00435 
00436         *retp = headp;
00437         return (0);
00438 }
00439 
00440 /*
00441  * __db_txnlist_add --
00442  *      Add an element to our transaction linked list.
00443  *
00444  * PUBLIC: int __db_txnlist_add __P((DB_ENV *,
00445  * PUBLIC:     DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
00446  */
00447 int
00448 __db_txnlist_add(dbenv, hp, txnid, status, lsn)
00449         DB_ENV *dbenv;
00450         DB_TXNHEAD *hp;
00451         u_int32_t txnid, status;
00452         DB_LSN *lsn;
00453 {
00454         DB_TXNLIST *elp;
00455         int ret;
00456 
00457         if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
00458                 return (ret);
00459 
00460         LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
00461 
00462         elp->type = TXNLIST_TXNID;
00463         elp->u.t.txnid = txnid;
00464         elp->u.t.status = status;
00465         elp->u.t.generation = hp->generation;
00466         if (txnid > hp->maxid)
00467                 hp->maxid = txnid;
00468         if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
00469                 hp->maxlsn = *lsn;
00470 
00471         DB_ASSERT(lsn == NULL ||
00472             status != TXN_COMMIT || log_compare(&hp->maxlsn, lsn) >= 0);
00473 
00474         return (0);
00475 }
00476 
00477 /*
00478  * __db_txnlist_remove --
00479  *      Remove an element from our transaction linked list.
00480  *
00481  * PUBLIC: int __db_txnlist_remove __P((DB_ENV *, DB_TXNHEAD *, u_int32_t));
00482  */
00483 int
00484 __db_txnlist_remove(dbenv, hp, txnid)
00485         DB_ENV *dbenv;
00486         DB_TXNHEAD *hp;
00487         u_int32_t txnid;
00488 {
00489         DB_TXNLIST *entry;
00490         u_int32_t status;
00491 
00492         return (__db_txnlist_find_internal(dbenv,
00493             hp, TXNLIST_TXNID, txnid, NULL, &entry, 1, &status));
00494 }
00495 
00496 /*
00497  * __db_txnlist_ckp --
00498  *      Used to record the maximum checkpoint that will be retained
00499  * after recovery.  Typically this is simply the max checkpoint, but
00500  * if we are doing client replication recovery or timestamp-based
00501  * recovery, we are going to virtually truncate the log and we need
00502  * to retain the last checkpoint before the truncation point.
00503  *
00504  * PUBLIC: void __db_txnlist_ckp __P((DB_ENV *, DB_TXNHEAD *, DB_LSN *));
00505  */
00506 void
00507 __db_txnlist_ckp(dbenv, hp, ckp_lsn)
00508         DB_ENV *dbenv;
00509         DB_TXNHEAD *hp;
00510         DB_LSN *ckp_lsn;
00511 {
00512 
00513         COMPQUIET(dbenv, NULL);
00514 
00515         if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
00516             log_compare(&hp->maxlsn, ckp_lsn) >= 0)
00517                 hp->ckplsn = *ckp_lsn;
00518 }
00519 
00520 /*
00521  * __db_txnlist_end --
00522  *      Discard transaction linked list.
00523  *
00524  * PUBLIC: void __db_txnlist_end __P((DB_ENV *, DB_TXNHEAD *));
00525  */
00526 void
00527 __db_txnlist_end(dbenv, hp)
00528         DB_ENV *dbenv;
00529         DB_TXNHEAD *hp;
00530 {
00531         u_int32_t i;
00532         DB_TXNLIST *p;
00533 
00534         if (hp == NULL)
00535                 return;
00536 
00537         for (i = 0; i < hp->nslots; i++)
00538                 while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
00539                         switch (p->type) {
00540                         case TXNLIST_LSN:
00541                                 __os_free(dbenv, p->u.l.lsn_stack);
00542                                 break;
00543                         case TXNLIST_DELETE:
00544                         case TXNLIST_PGNO:
00545                         case TXNLIST_TXNID:
00546                         default:
00547                                 /*
00548                                  * Possibly an incomplete DB_TXNLIST; just
00549                                  * free it.
00550                                  */
00551                                 break;
00552                         }
00553                         LIST_REMOVE(p, links);
00554                         __os_free(dbenv, p);
00555                 }
00556 
00557         if (hp->gen_array != NULL)
00558                 __os_free(dbenv, hp->gen_array);
00559         __os_free(dbenv, hp);
00560 }
00561 
00562 /*
00563  * __db_txnlist_find --
00564  *      Checks to see if a txnid with the current generation is in the
00565  *      txnid list.  This returns DB_NOTFOUND if the item isn't in the
00566  *      list otherwise it returns (like __db_txnlist_find_internal)
00567  *      the status of the transaction.  A txnid of 0 means the record
00568  *      was generated while not in a transaction.
00569  *
00570  * PUBLIC: int __db_txnlist_find __P((DB_ENV *,
00571  * PUBLIC:     DB_TXNHEAD *, u_int32_t, u_int32_t *));
00572  */
00573 int
00574 __db_txnlist_find(dbenv, hp, txnid, statusp)
00575         DB_ENV *dbenv;
00576         DB_TXNHEAD *hp;
00577         u_int32_t txnid, *statusp;
00578 {
00579         DB_TXNLIST *entry;
00580 
00581         if (txnid == 0)
00582                 return (DB_NOTFOUND);
00583 
00584         return (__db_txnlist_find_internal(dbenv, hp,
00585             TXNLIST_TXNID, txnid, NULL, &entry, 0, statusp));
00586 }
00587 
00588 /*
00589  * __db_txnlist_update --
00590  *      Change the status of an existing transaction entry.
00591  *      Returns DB_NOTFOUND if no such entry exists.
00592  *
00593  * PUBLIC: int __db_txnlist_update __P((DB_ENV *, DB_TXNHEAD *,
00594  * PUBLIC:     u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
00595  */
00596 int
00597 __db_txnlist_update(dbenv, hp, txnid, status, lsn, ret_status, add_ok)
00598         DB_ENV *dbenv;
00599         DB_TXNHEAD *hp;
00600         u_int32_t txnid, status;
00601         DB_LSN *lsn;
00602         u_int32_t *ret_status;
00603         int add_ok;
00604 {
00605         DB_TXNLIST *elp;
00606         int ret;
00607 
00608         if (txnid == 0)
00609                 return (DB_NOTFOUND);
00610 
00611         ret = __db_txnlist_find_internal(dbenv,
00612             hp, TXNLIST_TXNID, txnid, NULL, &elp, 0, ret_status);
00613 
00614         if (ret == DB_NOTFOUND && add_ok) {
00615                 *ret_status = status;
00616                 return (__db_txnlist_add(dbenv, hp, txnid, status, lsn));
00617         }
00618         if (ret != 0)
00619                 return (ret);
00620 
00621         if (*ret_status == TXN_IGNORE)
00622                 return (0);
00623 
00624         elp->u.t.status = status;
00625 
00626         if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
00627                 hp->maxlsn = *lsn;
00628 
00629         return (ret);
00630 }
00631 
00632 /*
00633  * __db_txnlist_find_internal --
00634  *      Find an entry on the transaction list.  If the entry is not there or
00635  *      the list pointer is not initialized we return DB_NOTFOUND.  If the
00636  *      item is found, we return the status.  Currently we always call this
00637  *      with an initialized list pointer but checking for NULL keeps it general.
00638  */
00639 static int
00640 __db_txnlist_find_internal(dbenv,
00641     hp, type, txnid, uid, txnlistp, delete, statusp)
00642         DB_ENV *dbenv;
00643         DB_TXNHEAD *hp;
00644         db_txnlist_type type;
00645         u_int32_t  txnid;
00646         u_int8_t uid[DB_FILE_ID_LEN];
00647         DB_TXNLIST **txnlistp;
00648         int delete;
00649         u_int32_t *statusp;
00650 {
00651         struct __db_headlink *head;
00652         DB_TXNLIST *p;
00653         u_int32_t generation, hash, i;
00654         int ret;
00655 
00656         ret = 0;
00657 
00658         if (hp == NULL)
00659                 return (DB_NOTFOUND);
00660 
00661         switch (type) {
00662         case TXNLIST_TXNID:
00663                 hash = txnid;
00664                 /* Find the most recent generation containing this ID */
00665                 for (i = 0; i <= hp->generation; i++)
00666                         /* The range may wrap around the end. */
00667                         if (hp->gen_array[i].txn_min <
00668                             hp->gen_array[i].txn_max ?
00669                             (txnid >= hp->gen_array[i].txn_min &&
00670                             txnid <= hp->gen_array[i].txn_max) :
00671                             (txnid >= hp->gen_array[i].txn_min ||
00672                             txnid <= hp->gen_array[i].txn_max))
00673                                 break;
00674                 DB_ASSERT(i <= hp->generation);
00675                 generation = hp->gen_array[i].generation;
00676                 break;
00677         case TXNLIST_PGNO:
00678                 memcpy(&hash, uid, sizeof(hash));
00679                 generation = 0;
00680                 break;
00681         case TXNLIST_DELETE:
00682         case TXNLIST_LSN:
00683         default:
00684                 return (__db_panic(dbenv, EINVAL));
00685         }
00686 
00687         head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
00688 
00689         for (p = LIST_FIRST(head); p != NULL; p = LIST_NEXT(p, links)) {
00690                 if (p->type != type)
00691                         continue;
00692                 switch (type) {
00693                 case TXNLIST_TXNID:
00694                         if (p->u.t.txnid != txnid ||
00695                             generation != p->u.t.generation)
00696                                 continue;
00697                         *statusp = p->u.t.status;
00698                         break;
00699 
00700                 case TXNLIST_PGNO:
00701                         if (memcmp(uid, p->u.p.uid, DB_FILE_ID_LEN) != 0)
00702                                 continue;
00703                         break;
00704                 case TXNLIST_DELETE:
00705                 case TXNLIST_LSN:
00706                 default:
00707                         return (__db_panic(dbenv, EINVAL));
00708                 }
00709                 if (delete == 1) {
00710                         LIST_REMOVE(p, links);
00711                         __os_free(dbenv, p);
00712                         *txnlistp = NULL;
00713                 } else if (p != LIST_FIRST(head)) {
00714                         /* Move it to head of list. */
00715                         LIST_REMOVE(p, links);
00716                         LIST_INSERT_HEAD(head, p, links);
00717                         *txnlistp = p;
00718                 } else
00719                         *txnlistp = p;
00720                 return (ret);
00721         }
00722 
00723         return (DB_NOTFOUND);
00724 }
00725 
00726 /*
00727  * __db_txnlist_gen --
00728  *      Change the current generation number.
00729  *
00730  * PUBLIC: int __db_txnlist_gen __P((DB_ENV *,
00731  * PUBLIC:       DB_TXNHEAD *, int, u_int32_t, u_int32_t));
00732  */
00733 int
00734 __db_txnlist_gen(dbenv, hp, incr, min, max)
00735         DB_ENV *dbenv;
00736         DB_TXNHEAD *hp;
00737         int incr;
00738         u_int32_t min, max;
00739 {
00740         int ret;
00741 
00742         /*
00743          * During recovery generation numbers keep track of "restart"
00744          * checkpoints and recycle records.  Restart checkpoints occur
00745          * whenever we take a checkpoint and there are no outstanding
00746          * transactions.  When that happens, we can reset transaction IDs
00747          * back to TXNID_MINIMUM.  Currently we only do the reset
00748          * at then end of recovery.  Recycle records occur when txnids
00749          * are exhausted during runtime.  A free range of ids is identified
00750          * and logged.  This code maintains a stack of ranges.  A txnid
00751          * is given the generation number of the first range it falls into
00752          * in the stack.
00753          */
00754         if (incr < 0) {
00755                 --hp->generation;
00756                 memmove(hp->gen_array, &hp->gen_array[1],
00757                     (hp->generation + 1) * sizeof(hp->gen_array[0]));
00758         } else {
00759                 ++hp->generation;
00760                 if (hp->generation >= hp->gen_alloc) {
00761                         hp->gen_alloc *= 2;
00762                         if ((ret = __os_realloc(dbenv, hp->gen_alloc *
00763                             sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
00764                                 return (ret);
00765                 }
00766                 memmove(&hp->gen_array[1], &hp->gen_array[0],
00767                     hp->generation * sizeof(hp->gen_array[0]));
00768                 hp->gen_array[0].generation = hp->generation;
00769                 hp->gen_array[0].txn_min = min;
00770                 hp->gen_array[0].txn_max = max;
00771         }
00772         return (0);
00773 }
00774 
00775 /*
00776  * __db_txnlist_lsnadd --
00777  *      Save the prev_lsn from a txn_child record.
00778  *
00779  * PUBLIC: int __db_txnlist_lsnadd __P((DB_ENV *, DB_TXNHEAD *, DB_LSN *));
00780  */
00781 int
00782 __db_txnlist_lsnadd(dbenv, hp, lsnp)
00783         DB_ENV *dbenv;
00784         DB_TXNHEAD *hp;
00785         DB_LSN *lsnp;
00786 {
00787         DB_TXNLIST *elp;
00788         int ret;
00789 
00790         if (IS_ZERO_LSN(*lsnp))
00791                 return (0);
00792 
00793         for (elp = LIST_FIRST(&hp->head[0]);
00794             elp != NULL; elp = LIST_NEXT(elp, links))
00795                 if (elp->type == TXNLIST_LSN)
00796                         break;
00797 
00798         if (elp == NULL) {
00799                 if ((ret = __db_txnlist_lsninit(dbenv, hp, lsnp)) != 0)
00800                         return (ret);
00801                 return (DB_SURPRISE_KID);
00802         }
00803 
00804         if (elp->u.l.stack_indx == elp->u.l.stack_size) {
00805                 elp->u.l.stack_size <<= 1;
00806                 if ((ret = __os_realloc(dbenv, sizeof(DB_LSN) *
00807                      elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) {
00808                         __db_txnlist_end(dbenv, hp);
00809                         return (ret);
00810                 }
00811         }
00812         elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp;
00813 
00814         return (0);
00815 }
00816 
00817 /*
00818  * __db_txnlist_lsnget --
00819  *
00820  * PUBLIC: int __db_txnlist_lsnget __P((DB_ENV *,
00821  * PUBLIC:     DB_TXNHEAD *, DB_LSN *, u_int32_t));
00822  *      Get the lsn saved from a txn_child record.
00823  */
00824 int
00825 __db_txnlist_lsnget(dbenv, hp, lsnp, flags)
00826         DB_ENV *dbenv;
00827         DB_TXNHEAD *hp;
00828         DB_LSN *lsnp;
00829         u_int32_t flags;
00830 {
00831         DB_TXNLIST *elp;
00832 
00833         COMPQUIET(dbenv, NULL);
00834         COMPQUIET(flags, 0);
00835 
00836         for (elp = LIST_FIRST(&hp->head[0]);
00837             elp != NULL; elp = LIST_NEXT(elp, links))
00838                 if (elp->type == TXNLIST_LSN)
00839                         break;
00840 
00841         if (elp == NULL || elp->u.l.stack_indx == 0) {
00842                 ZERO_LSN(*lsnp);
00843                 return (0);
00844         }
00845 
00846         *lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx];
00847 
00848         return (0);
00849 }
00850 
00851 /*
00852  * __db_txnlist_lsninit --
00853  *      Initialize a transaction list with an lsn array entry.
00854  *
00855  * PUBLIC: int __db_txnlist_lsninit __P((DB_ENV *, DB_TXNHEAD *, DB_LSN *));
00856  */
00857 int
00858 __db_txnlist_lsninit(dbenv, hp, lsnp)
00859         DB_ENV *dbenv;
00860         DB_TXNHEAD *hp;
00861         DB_LSN *lsnp;
00862 {
00863         DB_TXNLIST *elp;
00864         int ret;
00865 
00866         elp = NULL;
00867 
00868         if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
00869                 goto err;
00870         LIST_INSERT_HEAD(&hp->head[0], elp, links);
00871 
00872         elp->type = TXNLIST_LSN;
00873         if ((ret = __os_malloc(dbenv,
00874             sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0)
00875                 goto err;
00876         elp->u.l.stack_indx = 1;
00877         elp->u.l.stack_size = DB_LSN_STACK_SIZE;
00878         elp->u.l.lsn_stack[0] = *lsnp;
00879 
00880         return (0);
00881 
00882 err:    __db_txnlist_end(dbenv, hp);
00883         return (ret);
00884 }
00885 
00886 #ifndef HAVE_FTRUNCATE
00887 /*
00888  * __db_add_limbo -- add pages to the limbo list.
00889  *      Get the file information and call pgnoadd for each page.
00890  *
00891  * PUBLIC: #ifndef HAVE_FTRUNCATE
00892  * PUBLIC: int __db_add_limbo __P((DB_ENV *,
00893  * PUBLIC:      DB_TXNHEAD *, int32_t, db_pgno_t, int32_t));
00894  * PUBLIC: #endif
00895  */
00896 int
00897 __db_add_limbo(dbenv, hp, fileid, pgno, count)
00898         DB_ENV *dbenv;
00899         DB_TXNHEAD *hp;
00900         int32_t fileid;
00901         db_pgno_t pgno;
00902         int32_t count;
00903 {
00904         DB_LOG *dblp;
00905         FNAME *fnp;
00906         int ret;
00907 
00908         dblp = dbenv->lg_handle;
00909         if ((ret = __dbreg_id_to_fname(dblp, fileid, 0, &fnp)) != 0)
00910                 return (ret);
00911 
00912         do {
00913                 if ((ret =
00914                     __db_txnlist_pgnoadd(dbenv, hp, fileid, fnp->ufid,
00915                     R_ADDR(&dblp->reginfo, fnp->name_off), pgno)) != 0)
00916                         return (ret);
00917                 pgno++;
00918         } while (--count != 0);
00919 
00920         return (0);
00921 }
00922 
00923 /*
00924  * __db_do_the_limbo -- move pages from limbo to free.
00925  *
00926  * Limbo processing is what ensures that we correctly handle and
00927  * recover from page allocations.  During recovery, for each database,
00928  * we process each in-question allocation, link them into the free list
00929  * and then write out the new meta-data page that contains the pointer
00930  * to the new beginning of the free list.  On an abort, we use our
00931  * standard __db_free mechanism in a compensating transaction which logs
00932  * the specific modifications to the free list.
00933  *
00934  * If we run out of log space during an abort, then we can't write the
00935  * compensating transaction, so we abandon the idea of a compensating
00936  * transaction, and go back to processing how we do during recovery.
00937  * The reason that this is not the norm is that it's expensive: it requires
00938  * that we flush any database with an in-question allocation.  Thus if
00939  * a compensating transaction fails, we never try to restart it.
00940  *
00941  * Since files may be open and closed within transactions (in particular,
00942  * the master database for subdatabases), we must be prepared to open
00943  * files during this process.  If there is a compensating transaction, we
00944  * can open the files in that transaction.  If this was an abort and there
00945  * is no compensating transaction, then we've got to perform these opens
00946  * in the context of the aborting transaction so that we do not deadlock.
00947  * During recovery, there's no locking, so this isn't an issue.
00948  *
00949  * What you want to keep in mind when reading this is that there are two
00950  * algorithms going on here:  ctxn == NULL, then we're either in recovery
00951  * or our compensating transaction has failed and we're doing the
00952  * "create list and write meta-data page" algorithm.  Otherwise, we're in
00953  * an abort and doing the "use compensating transaction" algorithm.
00954  *
00955  * PUBLIC: #ifndef HAVE_FTRUNCATE
00956  * PUBLIC: int __db_do_the_limbo __P((DB_ENV *,
00957  * PUBLIC:     DB_TXN *, DB_TXN *, DB_TXNHEAD *, db_limbo_state));
00958  * PUBLIC: #endif
00959  */
00960 int
00961 __db_do_the_limbo(dbenv, ptxn, txn, hp, state)
00962         DB_ENV *dbenv;
00963         DB_TXN *ptxn, *txn;
00964         DB_TXNHEAD *hp;
00965         db_limbo_state state;
00966 {
00967         DB_TXNLIST *elp;
00968         u_int32_t h;
00969         int ret;
00970 
00971         ret = 0;
00972         /*
00973          * The slots correspond to hash buckets.  We've hashed the
00974          * fileids into hash buckets and need to pick up all affected
00975          * files. (There will only be a single slot for an abort.)
00976          */
00977         for (h = 0; h < hp->nslots; h++) {
00978                 if ((elp = LIST_FIRST(&hp->head[h])) == NULL)
00979                         continue;
00980                 if (ptxn != NULL) {
00981                         if ((ret =
00982                             __db_limbo_move(dbenv, ptxn, txn, elp)) != 0)
00983                                 goto err;
00984                 } else if ((ret =
00985                     __db_limbo_bucket(dbenv, txn, elp, state)) != 0)
00986                         goto err;
00987         }
00988 
00989 err:    if (ret != 0) {
00990                 __db_err(dbenv, "Fatal error in abort of an allocation");
00991                 ret = __db_panic(dbenv, ret);
00992         }
00993 
00994         return (ret);
00995 }
00996 
00997 /* Limbo support routines. */
00998 
00999 /*
01000  * __db_lock_move --
01001  *      Move a lock from child to parent.
01002  */
01003 static int
01004 __db_lock_move(dbenv, fileid, pgno, mode, ptxn, txn)
01005         DB_ENV *dbenv;
01006         u_int8_t *fileid;
01007         db_pgno_t pgno;
01008         db_lockmode_t mode;
01009         DB_TXN *ptxn, *txn;
01010 {
01011         DBT lock_dbt;
01012         DB_LOCK lock;
01013         DB_LOCK_ILOCK lock_obj;
01014         DB_LOCKREQ req;
01015         int ret;
01016 
01017         lock_obj.pgno = pgno;
01018         memcpy(lock_obj.fileid, fileid, DB_FILE_ID_LEN);
01019         lock_obj.type = DB_PAGE_LOCK;
01020 
01021         memset(&lock_dbt, 0, sizeof(lock_dbt));
01022         lock_dbt.data = &lock_obj;
01023         lock_dbt.size = sizeof(lock_obj);
01024 
01025         if ((ret = __lock_get(dbenv,
01026             txn->txnid, 0, &lock_dbt, mode, &lock)) == 0) {
01027                 memset(&req, 0, sizeof(req));
01028                 req.lock = lock;
01029                 req.op = DB_LOCK_TRADE;
01030                 ret = __lock_vec(dbenv, ptxn->txnid, 0, &req, 1, NULL);
01031         }
01032         return (ret);
01033 }
01034 
01035 /*
01036  * __db_limbo_move
01037  *      Move just the metapage lock to the parent.
01038  */
01039 static int
01040 __db_limbo_move(dbenv, ptxn, txn, elp)
01041         DB_ENV *dbenv;
01042         DB_TXN *ptxn, *txn;
01043         DB_TXNLIST *elp;
01044 {
01045         int ret;
01046 
01047         for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
01048                 if (elp->type != TXNLIST_PGNO || elp->u.p.locked == 1)
01049                         continue;
01050                 if ((ret = __db_lock_move(dbenv, elp->u.p.uid,
01051                     PGNO_BASE_MD, DB_LOCK_WRITE, ptxn, txn)) != 0)
01052                         return (ret);
01053                 elp->u.p.locked = 1;
01054         }
01055 
01056         return (0);
01057 }
01058 /*
01059  * __db_limbo_bucket
01060  *      Perform limbo processing for a single hash bucket in the txnlist.
01061  * txn is the transaction aborting in the case of an abort and ctxn is the
01062  * compensating transaction.
       *
       * elp is the first element of the bucket; elements whose type is not
       * TXNLIST_PGNO are skipped.  ctxn is a local, not a parameter: it is
       * either txn itself (LIMBO_PREPARE), a transaction started here with
       * __txn_compensate_begin, or NULL.
       *
       * Returns 0 on success or an error code.  On the err exit path a
       * meta-data page that is still pinned is put back first.
01063  */
01064 
      /* True when txn is non-NULL and has TXN_RESTORED set. */
01065 #define T_RESTORED(txn)       ((txn) != NULL && F_ISSET(txn, TXN_RESTORED))
01066 static int
01067 __db_limbo_bucket(dbenv, txn, elp, state)
01068         DB_ENV *dbenv;
01069         DB_TXN *txn;
01070         DB_TXNLIST *elp;
01071         db_limbo_state state;
01072 {
01073         DB *dbp;
01074         DB_MPOOLFILE *mpf;
01075         DBMETA *meta;
01076         DB_TXN *ctxn, *t;
01077         FNAME *fname;
01078         db_pgno_t last_pgno, pgno;
01079         int dbp_created, in_retry, ret, t_ret;
01080 
01081         ctxn = NULL;
01082         in_retry = 0;
01083         meta = NULL;
01084         mpf = NULL;
01085         ret = 0;
01086         for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
01087                 if (elp->type != TXNLIST_PGNO)
01088                         continue;
                      /*
                       * Re-entered for the same element after __db_limbo_fix
                       * fails under a compensating transaction (in_retry is
                       * then set, so no new compensating transaction is
                       * begun) and after a failed __txn_commit of the
                       * compensating transaction (in_retry is not set on
                       * that path).
                       */
01089 retry:          dbp_created = 0;
01090 
01091                 /*
01092                  * Pick the transaction in which to potentially
01093                  * log compensations.
01094                  */
01095                 if (state == LIMBO_PREPARE)
01096                         ctxn = txn;
01097                 else if (!in_retry && state != LIMBO_RECOVER &&
01098                     state != LIMBO_TIMESTAMP && !T_RESTORED(txn) &&
01099                     (ret = __txn_compensate_begin(dbenv, &ctxn)) != 0)
01100                         return (ret);
01101 
01102                 /*
01103                  * Either use the compensating transaction or
01104                  * the one passed in, which will be null if recovering.
01105                  */
01106                 t = ctxn == NULL ? txn : ctxn;
01107 
01108                 /* First try to get a dbp by fileid. */
01109                 ret = __dbreg_id_to_db(dbenv, t, &dbp, elp->u.p.fileid, 0);
01110 
01111                 /*
01112                  * If the file was closed and reopened its id could change.
01113                  * Look it up the hard way.
01114                  */
01115                 if (ret == DB_DELETED || ret == ENOENT ||
01116                     ((ret == 0 &&
01117                     memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) != 0))) {
01118                         if ((ret = __dbreg_fid_to_fname(
01119                             dbenv->lg_handle, elp->u.p.uid, 0, &fname)) == 0)
01120                                 ret = __dbreg_id_to_db(
01121                                      dbenv, t, &dbp, fname->id, 0);
01122                 }
01123                 /*
01124                  * File is being destroyed.  No need to worry about
01125                  * dealing with recovery of allocations.
01126                  */
01127                 if (ret == DB_DELETED ||
01128                     (ret == 0 && F_ISSET(dbp, DB_AM_DISCARD)))
01129                         goto next;
01130 
                      /* No usable handle was found: create and open one here. */
01131                 if (ret != 0) {
01132                         if ((ret = db_create(&dbp, dbenv, 0)) != 0)
01133                                 goto err;
01134 
01135                         /*
01136                          * This tells the system not to lock, which is always
01137                          * OK, whether this is an abort or recovery.
01138                          */
01139                         F_SET(dbp, DB_AM_COMPENSATE);
01140                         dbp_created = 1;
01141 
01142                         /* It is ok if the file is no longer there. */
01143                         ret = __db_open(dbp, t, elp->u.p.fname, NULL,
01144                             DB_UNKNOWN, DB_ODDFILESIZE, __db_omode(OWNER_RW),
01145                             PGNO_BASE_MD);
01146                         if (ret == ENOENT)
01147                                 goto next;
                              /*
                               * NOTE(review): any other open failure falls
                               * through to the fileid check below with ret
                               * still set -- confirm this is intended.
                               */
01148                 }
01149 
01150                 /*
01151                  * Verify that we are opening the same file that we were
01152                  * referring to when we wrote this log record.
01153                  */
01154                 if (memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
01155                         goto next;
01156 
01157                 mpf = dbp->mpf;
01158                 last_pgno = PGNO_INVALID;
01159 
                      /*
                       * Pin the meta-data page and remember the current head
                       * of the free list, unless a page is already pinned.
                       */
01160                 if (meta == NULL &&
01161                     (ctxn == NULL || state == LIMBO_COMPENSATE)) {
01162                         pgno = PGNO_BASE_MD;
01163                         if ((ret = __memp_fget(mpf, &pgno, 0, &meta)) != 0)
01164                                 goto err;
01165                         last_pgno = meta->free;
01166                 }
01167 
01168                 if (state == LIMBO_PREPARE) {
01169                         if ((ret = __db_limbo_prepare(dbp, ctxn, elp)) != 0)
01170                                 goto err;
01171                 } else
01172                         ret = __db_limbo_fix(dbp,
01173                              ctxn, elp, &last_pgno, meta, state);
01174                 /*
01175                  * If we were doing compensating transactions, then we are
01176                  * going to hope this error was due to running out of space.
01177                  * We'll change modes (into the sync the file mode) and keep
01178                  * trying.  If we weren't doing compensating transactions,
01179                  * then this is a real error and we're sunk.
01180                  */
01181                 if (ret != 0) {
01182                         if (ret == DB_RUNRECOVERY || ctxn == NULL)
01183                                 goto err;
01184                         in_retry = 1;
01185                         if ((ret = __txn_abort(ctxn)) != 0)
01186                                 goto err;
01187                         ctxn = NULL;
01188                         goto retry;
01189                 }
01190 
                      /* In prepare mode ctxn was txn itself; just drop the alias. */
01191                 if (state == LIMBO_PREPARE)
01192                         ctxn = NULL;
01193 
01194                 else if (ctxn != NULL) {
01195                         /*
01196                          * We only force compensation at the end of recovery.
01197                          * We want the txn_commit to be logged so turn
01198                          * off the recovery flag briefly.
01199                          */
01200                         if (state == LIMBO_COMPENSATE)
01201                                 F_CLR(
01202                                     (DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
01203                         ret = __txn_commit(ctxn, DB_TXN_NOSYNC);
01204                         ctxn = NULL;
01205                         if (state == LIMBO_COMPENSATE)
01206                                 F_SET(
01207                                     (DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
01208                         if (ret != 0)
01209                                 goto retry;
01210                 }
01211 
01212                 /*
01213                  * This is where we handle the case where we're explicitly
01214                  * putting together a free list.  We need to decide whether
01215                  * we have to write the meta-data page, and if we do, then
01216                  * we need to sync it as well.
01217                  */
01218                 else if (last_pgno == meta->free) {
01219                         /* No change to page; just put the page back. */
01220                         if ((ret = __memp_fput(mpf, meta, 0)) != 0)
01221                                 goto err;
01222                         meta = NULL;
01223                 } else {
01224                         /*
01225                          * These changes are unlogged so we cannot have the
01226                          * metapage pointing at pages that are not on disk.
01227                          * Therefore, we flush the new free list, then update
01228                          * the metapage.  We have to put the meta-data page
01229                          * first so that it isn't pinned when we try to sync.
01230                          */
01231                         if (!IS_RECOVERING(dbenv) && !T_RESTORED(txn))
01232                                 __db_err(dbenv, "Flushing free list to disk");
01233                         if ((ret = __memp_fput(mpf, meta, 0)) != 0)
01234                                 goto err;
01235                         meta = NULL;
01236                         /*
01237                          * If the sync fails then we cannot flush the
01238                          * newly allocated pages.  That is, the file
01239                          * cannot be extended. Don't let the metapage
01240                          * point at them.
01241                          * We may lose these pages from the file if it
01242                          * can be extended later.  If there is never
01243                          * space for the pages, then things will be ok.
01244                          */
01245                         if ((ret = __db_sync(dbp)) == 0) {
01246                                 pgno = PGNO_BASE_MD;
01247                                 if ((ret =
01248                                     __memp_fget(mpf, &pgno, 0, &meta)) != 0)
01249                                         goto err;
01250                                 meta->free = last_pgno;
01251                                 if ((ret = __memp_fput(mpf,
01252                                      meta, DB_MPOOL_DIRTY)) != 0)
01253                                         goto err;
01254                                 meta = NULL;
01255                         } else {
01256                                 __db_err(dbenv,
01257                                     "%s: %s", dbp->fname, db_strerror(ret));
01258                                 __db_err(dbenv, "%s: %s %s", dbp->fname,
01259                                     "allocation flush failed, some free pages",
01260                                     "may not appear in the free list");
01261                                 ret = 0;
01262                         }
01263                 }
01264 
01265 next:
01266                 /*
01267                  * If we get here, either we have processed the list
01268                  * or the db file has been deleted or could not be opened.
                       *
                       * NOTE(review): on the "file is being destroyed" path
                       * ret can still be DB_DELETED here; only ENOENT is
                       * cleared below, so that value leaves through err --
                       * confirm this is intended.
01269                  */
01270                 if (ctxn != NULL &&
01271                     (t_ret = __txn_abort(ctxn)) != 0 && ret == 0)
01272                         ret = t_ret;
01273 
                      /* Only close handles that were created in this pass. */
01274                 if (dbp_created &&
01275                     (t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
01276                         ret = t_ret;
01277                 dbp = NULL;
                      /* Prepare and timestamp passes keep the element's arrays. */
01278                 if (state != LIMBO_PREPARE && state != LIMBO_TIMESTAMP) {
01279                         __os_free(dbenv, elp->u.p.fname);
01280                         __os_free(dbenv, elp->u.p.pgno_array);
01281                 }
01282                 if (ret == ENOENT)
01283                         ret = 0;
01284                 else if (ret != 0)
01285                         goto err;
01286         }
01287 
01288 err:    if (meta != NULL)
01289                 (void)__memp_fput(mpf, meta, 0);
01290         return (ret);
01291 }
01292 
01293 /*
01294  * __db_limbo_fix --
01295  *      Process a single limbo entry which describes all the page allocations
01296  * for a single file.
01297  */
01298 static int
01299 __db_limbo_fix(dbp, ctxn, elp, lastp, meta, state)
01300         DB *dbp;
01301         DB_TXN *ctxn;
01302         DB_TXNLIST *elp;
01303         db_pgno_t *lastp;
01304         DBMETA *meta;
01305         db_limbo_state state;
01306 {
01307         DBC *dbc;
01308         DBT ldbt;
01309         DB_MPOOLFILE *mpf;
01310         PAGE *freep, *pagep;
01311         db_pgno_t next, pgno;
01312         u_int32_t i;
01313         int ret, t_ret;
01314 
01315         /*
01316          * Loop through the entries for this txnlist element and
01317          * either link them into the free list or write a compensating
01318          * record for each.
01319          */
01320         dbc = NULL;
01321         mpf = dbp->mpf;
01322         pagep = NULL;
01323         ret = 0;
01324 
01325         for (i = 0; i < elp->u.p.nentries; i++) {
01326                 pgno = elp->u.p.pgno_array[i];
01327 
01328                 if (pgno == PGNO_INVALID)
01329                         continue;
01330 
01331                 if ((ret =
01332                     __memp_fget(mpf, &pgno, DB_MPOOL_CREATE, &pagep)) != 0) {
01333                         if (ret != ENOSPC)
01334                                 goto err;
01335                         continue;
01336                 }
01337 
01338                 if (state == LIMBO_COMPENSATE || IS_ZERO_LSN(LSN(pagep))) {
01339                         if (ctxn == NULL) {
01340                                 /*
01341                                  * If this is a fatal recovery which
01342                                  * spans a previous crash this page may
01343                                  * be on the free list already.
01344                                  */
01345                                 for (next = *lastp; next != 0; ) {
01346                                         if (next == pgno)
01347                                                 break;
01348                                         if ((ret = __memp_fget(mpf,
01349                                             &next, 0, &freep)) != 0)
01350                                                 goto err;
01351                                         next = NEXT_PGNO(freep);
01352                                         if ((ret =
01353                                             __memp_fput(mpf, freep, 0)) != 0)
01354                                                 goto err;
01355                                 }
01356 
01357                                 if (next != pgno) {
01358                                         P_INIT(pagep, dbp->pgsize, pgno,
01359                                             PGNO_INVALID, *lastp, 0, P_INVALID);
01360                                         /* Make the lsn non-zero but generic. */
01361                                         INIT_LSN(LSN(pagep));
01362                                         *lastp = pgno;
01363                                 }
01364                         } else if (state == LIMBO_COMPENSATE) {
01365                                 /*
01366                                  * Generate a log record for what we did on the
01367                                  * LIMBO_TIMESTAMP pass.  All pages here are
01368                                  * free so P_OVERHEAD is sufficient.
01369                                  */
01370                                 ZERO_LSN(pagep->lsn);
01371                                 memset(&ldbt, 0, sizeof(ldbt));
01372                                 ldbt.data = pagep;
01373                                 ldbt.size = P_OVERHEAD(dbp);
01374                                 if ((ret = __db_pg_new_log(dbp, ctxn,
01375                                      &LSN(meta), 0, pagep->pgno,
01376                                      &LSN(meta), PGNO_BASE_MD,
01377                                      &ldbt, pagep->next_pgno)) != 0)
01378                                         goto err;
01379                         } else {
01380                                 if (dbc == NULL && (ret =
01381                                     __db_cursor(dbp, ctxn, &dbc, 0)) != 0)
01382                                                 goto err;
01383                                 /*
01384                                  * If the dbp is compensating (because we
01385                                  * opened it), the dbc will automatically be
01386                                  * marked compensating, but in case we didn't
01387                                  * do the open, we have to mark it explicitly.
01388                                  */
01389                                 F_SET(dbc, DBC_COMPENSATE);
01390 
01391                                 /*
01392                                  * If aborting a txn for a different process
01393                                  * via XA or failchk, DB_AM_RECOVER will be
01394                                  * set but we need to log the compensating
01395                                  * transactions.
01396                                  */
01397                                 F_CLR(dbc, DBC_RECOVER);
01398 
01399                                 ret = __db_free(dbc, pagep);
01400                                 pagep = NULL;
01401 
01402                                 /*
01403                                  * On any error, we hope that the error was
01404                                  * caused due to running out of space, and we
01405                                  * switch modes, doing the processing where we
01406                                  * sync out files instead of doing compensating
01407                                  * transactions.  If this was a real error and
01408                                  * not out of space, we assume that some other
01409                                  * call will fail real soon.
01410                                  */
01411                                 if (ret != 0) {
01412                                         /* Assume that this is out of space. */
01413                                         (void)__db_c_close(dbc);
01414                                         dbc = NULL;
01415                                         goto err;
01416                                 }
01417                         }
01418                 }
01419                 else
01420                         elp->u.p.pgno_array[i] = PGNO_INVALID;
01421 
01422                 if (pagep != NULL) {
01423                         ret = __memp_fput(mpf, pagep, DB_MPOOL_DIRTY);
01424                         pagep = NULL;
01425                 }
01426                 if (ret != 0)
01427                         goto err;
01428         }
01429 
01430 err:    if (pagep != NULL &&
01431             (t_ret = __memp_fput(mpf, pagep, DB_MPOOL_DIRTY)) != 0 && ret == 0)
01432                 ret = t_ret;
01433         if (dbc != NULL && (t_ret = __db_c_close(dbc)) != 0 && ret == 0)
01434                 ret = t_ret;
01435         return (ret);
01436 }
01437 
01438 static int
01439 __db_limbo_prepare(dbp, txn, elp)
01440         DB *dbp;
01441         DB_TXN *txn;
01442         DB_TXNLIST *elp;
01443 {
01444         DB_LSN lsn;
01445         DB_MPOOLFILE *mpf;
01446         PAGE *pagep;
01447         db_pgno_t pgno;
01448         u_int32_t i;
01449         int ret, t_ret;
01450 
01451         /*
01452          * Loop through the entries for this txnlist element and
01453          * output a prepare record for them.
01454          */
01455         pagep = NULL;
01456         ret = 0;
01457         mpf = dbp->mpf;
01458 
01459         for (i = 0; i < elp->u.p.nentries; i++) {
01460                 pgno = elp->u.p.pgno_array[i];
01461 
01462                 if ((ret =
01463                     __memp_fget(mpf, &pgno, DB_MPOOL_CREATE, &pagep)) != 0) {
01464                         if (ret != ENOSPC)
01465                                 return (ret);
01466                         continue;
01467                 }
01468 
01469                 if (IS_ZERO_LSN(LSN(pagep)))
01470                         ret = __db_pg_prepare_log(dbp, txn, &lsn, 0, pgno);
01471 
01472                 if ((t_ret = __memp_fput(mpf, pagep, 0)) != 0 && ret == 0)
01473                         ret = t_ret;
01474 
01475                 if (ret != 0)
01476                         return (ret);
01477         }
01478 
01479         return (0);
01480 }
01481 
/* Initial maxentry (capacity, in entries) given to a new TXNLIST_PGNO element's pgno_array. */
01482 #define DB_TXNLIST_MAX_PGNO     8               /* A nice even number. */
01483 
01484 /*
01485  * __db_txnlist_pgnoadd --
01486  *      Find the txnlist entry for a file and add this pgno, or add the list
01487  *      entry for the file and then add the pgno.
01488  */
01489 static int
01490 __db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno)
01491         DB_ENV *dbenv;
01492         DB_TXNHEAD *hp;
01493         int32_t fileid;
01494         u_int8_t uid[DB_FILE_ID_LEN];
01495         char *fname;
01496         db_pgno_t pgno;
01497 {
01498         DB_TXNLIST *elp;
01499         size_t len;
01500         u_int32_t hash, status;
01501         int ret;
01502 
01503         elp = NULL;
01504 
01505         if ((ret = __db_txnlist_find_internal(dbenv, hp,
01506             TXNLIST_PGNO, 0, uid, &elp, 0, &status)) != 0 && ret != DB_NOTFOUND)
01507                 goto err;
01508 
01509         if (ret == DB_NOTFOUND || status != TXN_OK) {
01510                 if ((ret =
01511                     __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
01512                         goto err;
01513                 memcpy(&hash, uid, sizeof(hash));
01514                 LIST_INSERT_HEAD(
01515                     &hp->head[DB_TXNLIST_MASK(hp, hash)], elp, links);
01516                 memcpy(elp->u.p.uid, uid, DB_FILE_ID_LEN);
01517 
01518                 len = strlen(fname) + 1;
01519                 if ((ret = __os_malloc(dbenv, len, &elp->u.p.fname)) != 0)
01520                         goto err;
01521                 memcpy(elp->u.p.fname, fname, len);
01522 
01523                 elp->u.p.maxentry = 0;
01524                 elp->u.p.locked = 0;
01525                 elp->type = TXNLIST_PGNO;
01526                 if ((ret = __os_malloc(dbenv,
01527                     8 * sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
01528                         goto err;
01529                 elp->u.p.maxentry = DB_TXNLIST_MAX_PGNO;
01530                 elp->u.p.nentries = 0;
01531         } else if (elp->u.p.nentries == elp->u.p.maxentry) {
01532                 elp->u.p.maxentry <<= 1;
01533                 if ((ret = __os_realloc(dbenv, elp->u.p.maxentry *
01534                     sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
01535                         goto err;
01536         }
01537 
01538         elp->u.p.pgno_array[elp->u.p.nentries++] = pgno;
01539         /* Update to the latest fileid.  Limbo will find it faster. */
01540         elp->u.p.fileid = fileid;
01541 
01542         return (0);
01543 
01544 err:    return (ret);
01545 }
01546 #endif
01547 
01548 #ifdef DEBUG
01549 /*
01550  * __db_txnlist_print --
01551  *      Print out the transaction list.
01552  *
01553  * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *));
01554  */
01555 void
01556 __db_txnlist_print(hp)
01557         DB_TXNHEAD *hp;
01558 {
01559         DB_TXNLIST *p;
01560         u_int32_t i;
01561         char *txntype;
01562 
01563         printf("Maxid: %lu Generation: %lu\n",
01564             (u_long)hp->maxid, (u_long)hp->generation);
01565         for (i = 0; i < hp->nslots; i++)
01566                 for (p = LIST_FIRST(&hp->head[i]);
01567                     p != NULL; p = LIST_NEXT(p, links)) {
01568                         if (p->type != TXNLIST_TXNID) {
01569                                 printf("Unrecognized type: %d\n", p->type);
01570                                 continue;
01571                         }
01572                         switch (p->u.t.status) {
01573                         case TXN_OK:
01574                                 txntype = "OK";
01575                                 break;
01576                         case TXN_COMMIT:
01577                                 txntype = "commit";
01578                                 break;
01579                         case TXN_PREPARE:
01580                                 txntype = "prepare";
01581                                 break;
01582                         case TXN_ABORT:
01583                                 txntype = "abort";
01584                                 break;
01585                         case TXN_IGNORE:
01586                                 txntype = "ignore";
01587                                 break;
01588                         case TXN_EXPECTED:
01589                                 txntype = "expected";
01590                                 break;
01591                         case TXN_UNEXPECTED:
01592                                 txntype = "unexpected";
01593                                 break;
01594                         default:
01595                                 txntype = "UNKNOWN";
01596                                 break;
01597                         }
01598                         printf("TXNID: %lx(%lu): %s\n",
01599                             (u_long)p->u.t.txnid,
01600                             (u_long)p->u.t.generation, txntype);
01601                 }
01602 }
01603 #endif

Generated on Sun Dec 25 12:14:19 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2