Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

bt_compact.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1999-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  *
00007  * $Id: bt_compact.c,v 12.34 2005/11/10 21:07:48 bostic Exp $
00008  */
00009 
00010 #include "db_config.h"
00011 
00012 #ifndef NO_SYSTEM_INCLUDES
00013 #include <sys/types.h>
00014 #include <string.h>
00015 #endif
00016 
00017 #include "db_int.h"
00018 #include "dbinc/db_page.h"
00019 #include "dbinc/db_shash.h"
00020 #include "dbinc/btree.h"
00021 #include "dbinc/lock.h"
00022 #include "dbinc/log.h"
00023 #include "dbinc/mp.h"
00024 #include "dbinc/txn.h"
00025 
00026 static int __bam_compact_dups __P((DBC *,
00027      PAGE *, u_int32_t, int, DB_COMPACT *, int *));
00028 static int __bam_compact_int __P((DBC *,
00029      DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
00030 static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int));
00031 static int __bam_merge __P((DBC *,
00032      DBC *,  u_int32_t, DBT *, DB_COMPACT *,int *));
00033 static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *));
00034 static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *));
00035 static int __bam_merge_records __P((DBC *, DBC*,  u_int32_t, DB_COMPACT *));
00036 static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *));
00037 static int __bam_truncate_overflow __P((DBC *,
00038      db_pgno_t, db_pgno_t, DB_COMPACT *));
00039 static int __bam_truncate_page __P((DBC *, PAGE **, int));
00040 static int __bam_truncate_root_page __P((DBC *,
00041      PAGE *, u_int32_t, DB_COMPACT *));
00042 
00043 #ifdef HAVE_FTRUNCATE
00044 static int __bam_free_freelist __P((DB *, DB_TXN *));
00045 static int __bam_savekey __P((DBC *, int, DBT *));
00046 static int __bam_setup_freelist __P((DB *, struct pglist *, u_int32_t));
00047 static int __bam_truncate_internal __P((DB *, DB_TXN *, DB_COMPACT *));
00048 #endif
00049 
00050 #define SAVE_START                                                      \
00051         do {                                                            \
00052                 save_data = *c_data;                                    \
00053                 ret = __db_retcopy(dbenv,                               \
00054                      &save_start, end->data, end->size,                 \
00055                      &save_start.data, &save_start.ulen);               \
00056         } while (0)
00057 
00058 /*
00059  * Only restore those things that are negated by aborting the
00060  * transaction.  We don't restore the number of deadlocks, for example.
00061  */
00062 
00063 #define RESTORE_START                                                   \
00064         do {                                                            \
00065                 c_data->compact_pages_free =                            \
00066                       save_data.compact_pages_free;                     \
00067                 c_data->compact_levels = save_data.compact_levels;      \
00068                 c_data->compact_truncate = save_data.compact_truncate;  \
00069                 ret = __db_retcopy(dbenv, end,                          \
00070                      save_start.data, save_start.size,                  \
00071                      &end->data, &end->ulen);                           \
00072         } while (0)
00073 /*
00074  * __bam_compact -- compact a btree.
00075  *
00076  * PUBLIC: int __bam_compact __P((DB *, DB_TXN *,
00077  * PUBLIC:     DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
00078  */
00079 int
00080 __bam_compact(dbp, txn, start, stop, c_data, flags, end)
00081         DB *dbp;
00082         DB_TXN *txn;
00083         DBT *start, *stop;
00084         DB_COMPACT *c_data;
00085         u_int32_t flags;
00086         DBT *end;
00087 {
00088         DBT current, save_start;
00089         DBC *dbc;
00090         DB_COMPACT save_data;
00091         DB_ENV *dbenv;
00092         db_pgno_t last_pgno;
00093         struct pglist *list;
00094         u_int32_t factor, nelems, truncated;
00095         int deadlock, done, ret, span, t_ret, txn_local;
00096 
00097         dbenv = dbp->dbenv;
00098 
00099         memset(&current, 0, sizeof(current));
00100         memset(&save_start, 0, sizeof(save_start));
00101         dbc = NULL;
00102         deadlock = 0;
00103         done = 0;
00104         factor = 0;
00105         ret = 0;
00106         span = 0;
00107         truncated = 0;
00108         last_pgno = 0;
00109 
00110         /*
00111          * We pass "end" to the internal routine, indicating where
00112          * that routine should begin its work and expecting that it
00113          * will return to us the last key that it processed.
00114          */
00115         if (end == NULL)
00116                 end = &current;
00117         if (start != NULL && (ret = __db_retcopy(dbenv,
00118              end, start->data, start->size, &end->data, &end->ulen)) != 0)
00119                 return (ret);
00120 
00121         list = NULL;
00122         nelems = 0;
00123 
00124         if (IS_DB_AUTO_COMMIT(dbp, txn))
00125                 txn_local = 1;
00126         else
00127                 txn_local = 0;
00128         if (!LF_ISSET(DB_FREE_SPACE | DB_FREELIST_ONLY))
00129                 goto no_free;
00130         if (LF_ISSET(DB_FREELIST_ONLY))
00131                 LF_SET(DB_FREE_SPACE);
00132 
00133 #ifdef HAVE_FTRUNCATE
00134         /* Sort the freelist and set up the in-memory list representation. */
00135         if (txn_local && (ret = __txn_begin(dbenv, NULL, &txn, 0)) != 0)
00136                 goto err;
00137 
00138         if ((ret = __db_free_truncate(dbp,
00139              txn, flags, c_data, &list, &nelems, &last_pgno)) != 0) {
00140                 LF_CLR(DB_FREE_SPACE);
00141                 goto terr;
00142         }
00143 
00144         /* If the freelist is empty and we are not filling, get out. */
00145         if (nelems == 0 && LF_ISSET(DB_FREELIST_ONLY)) {
00146                 ret = 0;
00147                 LF_CLR(DB_FREE_SPACE);
00148                 goto terr;
00149         }
00150         if ((ret = __bam_setup_freelist(dbp, list, nelems)) != 0) {
00151                 /* Someone else owns the free list. */
00152                 if (ret == EBUSY)
00153                         ret = 0;
00154         }
00155 
00156         /* Commit the txn and release the meta page lock. */
00157 terr:   if (txn_local) {
00158                 if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
00159                         ret = t_ret;
00160                 txn = NULL;
00161         }
00162         if (ret != 0)
00163                 goto err;
00164 
00165         /* Save the number truncated so far, we will add what we get below. */
00166         truncated = c_data->compact_pages_truncated;
00167         if (LF_ISSET(DB_FREELIST_ONLY))
00168                 goto done;
00169 #endif
00170 
00171         /*
00172          * We want factor to be the target number of free bytes on each page,
00173          * so we know when to stop adding items to a page.   Make sure to
00174          * subtract the page overhead when computing this target.  This can
00175          * result in a 1-2% error on the smallest page.
00176          * First figure out how many bytes we should use:
00177          */
00178 no_free:
00179         factor = dbp->pgsize - SIZEOF_PAGE;
00180         if (c_data->compact_fillpercent != 0) {
00181                 factor *= c_data->compact_fillpercent;
00182                 factor /= 100;
00183         }
00184         /* Now convert to the number of free bytes to target. */
00185         factor = (dbp->pgsize - SIZEOF_PAGE) - factor;
00186 
00187         if (c_data->compact_pages == 0)
00188                 c_data->compact_pages = DB_MAX_PAGES;
00189 
00190         do {
00191                 deadlock = 0;
00192 
00193                 SAVE_START;
00194                 if (ret != 0)
00195                         break;
00196 
00197                 if (txn_local) {
00198                         if ((ret = __txn_begin(dbenv, NULL, &txn, 0)) != 0)
00199                                 break;
00200 
00201                         if (c_data->compact_timeout != 0 &&
00202                             (ret = __txn_set_timeout(txn,
00203                             c_data->compact_timeout, DB_SET_LOCK_TIMEOUT)) != 0)
00204                                 goto err;
00205                 }
00206 
00207                 if ((ret = __db_cursor(dbp, txn, &dbc, 0)) != 0)
00208                         goto err;
00209 
00210                 if ((ret = __bam_compact_int(dbc, end, stop, factor,
00211                      &span, c_data, &done)) == DB_LOCK_DEADLOCK && txn_local) {
00212                         /*
00213                          * We retry on deadlock.  Cancel the statistics
00214                          * and reset the start point to before this
00215                          * iteration.
00216                          */
00217                         deadlock = 1;
00218                         c_data->compact_deadlock++;
00219                         RESTORE_START;
00220                 }
00221 
00222                 if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
00223                         ret = t_ret;
00224 
00225 err:            if (txn_local && txn != NULL) {
00226                         if (ret == 0 && deadlock == 0)
00227                                 ret = __txn_commit(txn, DB_TXN_NOSYNC);
00228                         else if ((t_ret = __txn_abort(txn)) != 0 && ret == 0)
00229                                 ret = t_ret;
00230                         txn = NULL;
00231                 }
00232         } while (ret == 0 && !done);
00233 
00234         if (current.data != NULL)
00235                 __os_free(dbenv, current.data);
00236         if (save_start.data != NULL)
00237                 __os_free(dbenv, save_start.data);
00238 
00239 #ifdef HAVE_FTRUNCATE
00240         /*
00241          * Finish up truncation work.  If there are pages left in the free
00242          * list then search the internal nodes of the tree as we may have
00243          * missed some while walking the leaf nodes.  Then calculate how
00244          * many pages we have truncated and release the in-memory free list.
00245          */
00246 done:   if (LF_ISSET(DB_FREE_SPACE)) {
00247                 DBMETA *meta;
00248                 db_pgno_t pgno;
00249 
00250                 pgno = PGNO_BASE_MD;
00251                 done = 1;
00252                 if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) &&
00253                     (t_ret = __memp_fget(dbp->mpf, &pgno, 0, &meta)) == 0) {
00254                         done = meta->free == PGNO_INVALID;
00255                         ret = __memp_fput(dbp->mpf, meta, 0);
00256                 }
00257 
00258                 if (!done)
00259                         ret = __bam_truncate_internal(dbp, txn, c_data);
00260 
00261                 /* Clean up the free list. */
00262                 if (list != NULL)
00263                         __os_free(dbenv, list);
00264 
00265                 if ((t_ret = __memp_fget(dbp->mpf, &pgno, 0, &meta)) == 0) {
00266                         c_data->compact_pages_truncated =
00267                             truncated + last_pgno - meta->last_pgno;
00268                         if ((t_ret =
00269                             __memp_fput(dbp->mpf, meta, 0)) != 0 && ret == 0)
00270                                 ret = t_ret;
00271                 } else if (ret == 0)
00272                         ret = t_ret;
00273 
00274                 if ((t_ret = __bam_free_freelist(dbp, txn)) != 0 && ret == 0)
00275                         t_ret = ret;
00276         }
00277 #endif
00278 
00279         return (ret);
00280 }
00281 
00282 /*
00283  * __bam_csearch -- isolate search code for bam_compact.
00284  * This routine hides the differences between searching
00285  * a BTREE and a RECNO from the rest of the code.
00286  */
00287 #define CS_READ 0       /* We are just reading. */
00288 #define CS_PARENT       1       /* We want the parent too, write lock. */
00289 #define CS_NEXT         2       /* Get the next page. */
00290 #define CS_NEXT_WRITE   3       /* Get the next page and write lock. */
00291 #define CS_DEL          4       /* Get a stack to delete a page. */
00292 #define CS_START        5       /* Starting level for stack, write lock. */
00293 #define CS_GETRECNO     0x80    /* Extract record number from start. */
00294 
00295 static int
00296 __bam_csearch(dbc, start, sflag, level)
00297         DBC *dbc;
00298         DBT *start;
00299         u_int32_t sflag;
00300         int level;
00301 {
00302         BTREE_CURSOR *cp;
00303         int not_used, ret;
00304 
00305         cp = (BTREE_CURSOR *)dbc->internal;
00306 
00307         if (dbc->dbtype == DB_RECNO) {
00308                 /* If GETRECNO is not set the cp->recno is what we want. */
00309                 if (FLD_ISSET(sflag, CS_GETRECNO)) {
00310                         if (start == NULL || start->size == 0)
00311                                 cp->recno = 1;
00312                         else if ((ret =
00313                              __ram_getno(dbc, start, &cp->recno, 0)) != 0)
00314                                 return (ret);
00315                         FLD_CLR(sflag, CS_GETRECNO);
00316                 }
00317                 switch (sflag) {
00318                 case CS_READ:
00319                         sflag = S_READ;
00320                         break;
00321                 case CS_NEXT:
00322                         sflag = S_PARENT | S_READ;
00323                         break;
00324                 case CS_START:
00325                         level = LEAFLEVEL;
00326                         /* FALLTHROUGH */
00327                 case CS_DEL:
00328                 case CS_NEXT_WRITE:
00329                         sflag = S_STACK;
00330                         break;
00331                 case CS_PARENT:
00332                         sflag = S_PARENT | S_WRITE;
00333                         break;
00334                 default:
00335                         return (__db_panic(dbc->dbp->dbenv, EINVAL));
00336                 }
00337                 if ((ret = __bam_rsearch(dbc,
00338                      &cp->recno, sflag, level, &not_used)) != 0)
00339                         return (ret);
00340                 /* Reset the cursor's recno to the beginning of the page. */
00341                 cp->recno -= cp->csp->indx;
00342         } else {
00343                 FLD_CLR(sflag, CS_GETRECNO);
00344                 switch (sflag) {
00345                 case CS_READ:
00346                         sflag = S_READ | S_DUPFIRST;
00347                         break;
00348                 case CS_DEL:
00349                         sflag = S_DEL;
00350                         break;
00351                 case CS_NEXT:
00352                         sflag = S_NEXT;
00353                         break;
00354                 case CS_NEXT_WRITE:
00355                         sflag = S_NEXT | S_WRITE;
00356                         break;
00357                 case CS_START:
00358                         sflag = S_START | S_WRITE;
00359                         break;
00360                 case CS_PARENT:
00361                         sflag = S_PARENT | S_WRITE;
00362                         break;
00363                 default:
00364                         return (__db_panic(dbc->dbp->dbenv, EINVAL));
00365                 }
00366                 if (start == NULL || start->size == 0)
00367                         FLD_SET(sflag, S_MIN);
00368 
00369                 if ((ret = __bam_search(dbc,
00370                      cp->root, start, sflag, level, NULL, &not_used)) != 0)
00371                         return (ret);
00372         }
00373 
00374         return (0);
00375 }
00376 
00377 /*
00378  * __bam_compact_int -- internal compaction routine.
00379  *      Called either with a cursor on the main database
00380  * or a cursor initialized to the root of an off page duplicate
00381  * tree.
00382  */
00383 static int
00384 __bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
00385         DBC *dbc;
00386         DBT *start, *stop;
00387         u_int32_t factor;
00388         int *spanp;
00389         DB_COMPACT *c_data;
00390         int *donep;
00391 {
00392         BTREE_CURSOR *cp, *ncp;
00393         DB *dbp;
00394         DBC *ndbc;
00395         DB_ENV *dbenv;
00396         DB_LOCK nolock;
00397         EPG *epg;
00398         DB_MPOOLFILE *dbmp;
00399         PAGE *pg, *ppg, *npg;
00400         db_pgno_t npgno;
00401         db_recno_t next_recno;
00402         u_int32_t sflag;
00403         int check_dups, check_trunc, done, level;
00404         int merged, nentry, next_page, pgs_done, ret, t_ret, tdone;
00405 #ifdef  DEBUG
00406         DBT trace;
00407         char buf[256];
00408 #define CTRACE(dbc, location, t, start, f) do {                         \
00409                 trace.data = t;                                         \
00410                 trace.size = (u_int32_t)strlen(t);                      \
00411                 DEBUG_LWRITE(dbc, dbc->txn, location, &trace, start, f) \
00412         } while (0)
00413 #define PTRACE(dbc, location, p, start, f) do {                         \
00414                 (void)sprintf(buf, "pgno: %lu", (u_long)p);             \
00415                 CTRACE(dbc, location, buf, start, f);                   \
00416         } while (0)
00417 #else
00418 #define CTRACE(dbc, location, t, start, f)
00419 #define PTRACE(dbc, location, p, start, f)
00420 #endif
00421 
00422         ndbc = NULL;
00423         pg = NULL;
00424         npg = NULL;
00425         done = 0;
00426         tdone = 0;
00427         pgs_done = 0;
00428         next_recno = 0;
00429         next_page = 0;
00430         LOCK_INIT(nolock);
00431         check_trunc = c_data->compact_truncate != PGNO_INVALID;
00432         check_dups = (!F_ISSET(dbc, DBC_OPD) &&
00433              F_ISSET(dbc->dbp, DB_AM_DUP)) || check_trunc;
00434 
00435         dbp = dbc->dbp;
00436         dbenv = dbp->dbenv;
00437         dbmp = dbp->mpf;
00438         cp = (BTREE_CURSOR *)dbc->internal;
00439 
00440         /* Search down the tree for the starting point. */
00441         if ((ret = __bam_csearch(dbc,
00442             start, CS_READ | CS_GETRECNO, LEAFLEVEL)) != 0) {
00443                 /* Its not an error to compact an empty db. */
00444                 if (ret == DB_NOTFOUND)
00445                         ret = 0;
00446                 goto err;
00447         }
00448 
00449         /*
00450          * Get the first leaf page. The loop below will change pg so
00451          * we clear the stack reference so we don't put a a page twice.
00452          */
00453         pg = cp->csp->page;
00454         cp->csp->page = NULL;
00455         next_recno = cp->recno;
00456 next:   /*
00457          * This is the start of the main compaction loop.  There are 3
00458          * parts to the process:
00459          * 1) Walk the leaf pages of the tree looking for a page to
00460          *      process.  We do this with read locks.  Save the
00461          *      key from the page and release it.
00462          * 2) Set up a cursor stack which will write lock the page
00463          *      and enough of its ancestors to get the job done.
00464          *      This could go to the root if we might delete a subtree
00465          *      or we have record numbers to update.
00466          * 3) Loop fetching pages after the above page and move enough
00467          *      data to fill it.
00468          * We exit the loop if we are at the end of the leaf pages, are
00469          * about to lock a new subtree (we span) or on error.
00470          */
00471 
00472         /* Walk the pages looking for something to fill up. */
00473         while ((npgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
00474                 c_data->compact_pages_examine++;
00475                 PTRACE(dbc, "Next", PGNO(pg), start, 0);
00476 
00477                 /* If we have fetched the next page, get the new key. */
00478                 if (next_page == 1 &&
00479                     dbc->dbtype != DB_RECNO && NUM_ENT(pg) != 0) {
00480                         if ((ret = __db_ret(dbp, pg,
00481                              0, start, &start->data, &start->ulen)) != 0)
00482                                 goto err;
00483                 }
00484                 next_recno += NUM_ENT(pg);
00485                 if (P_FREESPACE(dbp, pg) > factor ||
00486                      (check_trunc && PGNO(pg) > c_data->compact_truncate))
00487                         break;
00488                 /*
00489                  * The page does not need more data or to be swapped,
00490                  * check to see if we want to look at possible duplicate
00491                  * trees or overflow records and the move on to the next page.
00492                  */
00493                 cp->recno += NUM_ENT(pg);
00494                 next_page = 1;
00495                 tdone = pgs_done;
00496                 PTRACE(dbc, "Dups", PGNO(pg), start, 0);
00497                 if (check_dups && (ret = __bam_compact_dups(
00498                      dbc, pg, factor, 0, c_data, &pgs_done)) != 0)
00499                         goto err;
00500                 npgno = NEXT_PGNO(pg);
00501                 if ((ret = __memp_fput(dbmp, pg, 0)) != 0)
00502                         goto err;
00503                 pg = NULL;
00504                 /*
00505                  * If we don't do anything we don't need to hold
00506                  * the lock on the previous page, so couple always.
00507                  */
00508                 if ((ret = __db_lget(dbc,
00509                     tdone == pgs_done ? LCK_COUPLE_ALWAYS : LCK_COUPLE,
00510                     npgno, DB_LOCK_READ, 0, &cp->csp->lock)) != 0)
00511                         goto err;
00512                 if ((ret = __memp_fget(dbmp, &npgno, 0, &pg)) != 0)
00513                         goto err;
00514         }
00515 
00516         /*
00517          * When we get here we have 3 cases:
00518          * 1) We've reached the end of the leaf linked list and are done.
00519          * 2) A page whose freespace exceeds our target and therefore needs
00520          *      to have data added to it.
00521          * 3) A page that doesn't have too much free space but needs to be
00522          *      checked for truncation.
00523          * In both cases 2 and 3, we need that page's first key or record
00524          * number.  We may already have it, if not get it here.
00525          */
00526         if ((nentry = NUM_ENT(pg)) != 0) {
00527                 next_page = 0;
00528                 /* Get a copy of the first recno on the page. */
00529                 if (dbc->dbtype == DB_RECNO) {
00530                         if ((ret = __db_retcopy(dbp->dbenv, start,
00531                              &cp->recno, sizeof(cp->recno),
00532                              &start->data, &start->ulen)) != 0)
00533                                 goto err;
00534                 } else if (start->size == 0 &&
00535                      (ret = __db_ret(dbp, pg,
00536                      0, start, &start->data, &start->ulen)) != 0)
00537                         goto err;
00538 
00539                 if (npgno == PGNO_INVALID) {
00540                         /* End of the tree, check its duplicates and exit. */
00541                         PTRACE(dbc, "GoDone", PGNO(pg), start, 0);
00542                         if (check_dups && (ret =
00543                            __bam_compact_dups(dbc,
00544                            pg, factor, 0, c_data, &pgs_done)) != 0)
00545                                 goto err;
00546                         c_data->compact_pages_examine++;
00547                         done = 1;
00548                         goto done;
00549                 }
00550         }
00551 
00552         /* Release the page so we don't deadlock getting its parent. */
00553         BT_STK_CLR(cp);
00554         if ((ret = __LPUT(dbc, cp->csp->lock)) != 0)
00555                 goto err;
00556         if ((ret = __memp_fput(dbmp, pg, 0)) != 0)
00557                 goto err;
00558         pg = NULL;
00559 
00560         /*
00561          * Setup the cursor stack. There are 3 cases:
00562          * 1) the page is empty and will be deleted: nentry == 0.
00563          * 2) the next page has the same parent: *spanp == 0.
00564          * 3) the next page has a different parent: *spanp == 1.
00565          *
00566          * We now need to search the tree again, getting a write lock
00567          * on the page we are going to merge or delete.  We do this by
00568          * searching down the tree and locking as much of the subtree
00569          * above the page as needed.  In the case of a delete we will
00570          * find the maximal subtree that can be deleted. In the case
00571          * of merge if the current page and the next page are siblings
00572          * with the same parent then we only need to lock the parent.
00573          * Otherwise *span will be set and we need to search to find the
00574          * lowest common ancestor.  Dbc will be set to contain the subtree
00575          * containing the page to be merged or deleted. Ndbc will contain
00576          * the minimal subtree containing that page and its next sibling.
00577          * In all cases for DB_RECNO we simplify things and get the whole
00578          * tree if we need more than a single parent.
00579          */
00580 
00581         /* Case 1 -- page is empty. */
00582         if (nentry == 0) {
00583                 CTRACE(dbc, "Empty", "", start, 0);
00584                 if (next_page == 1)
00585                         sflag = CS_NEXT_WRITE;
00586                 else
00587                         sflag = CS_DEL;
00588                 if ((ret = __bam_csearch(dbc, start, sflag, LEAFLEVEL)) != 0)
00589                         goto err;
00590 
00591                 pg = cp->csp->page;
00592                 /* Check to see if the page is still empty. */
00593                 if (NUM_ENT(pg) != 0)
00594                         npgno = PGNO(pg);
00595                 else {
00596                         npgno = NEXT_PGNO(pg);
00597                         /* If this is now the root, we are very done. */
00598                         if (PGNO(pg) == cp->root)
00599                                 done = 1;
00600                         else {
00601                                 if ((ret = __bam_dpages(dbc, 0, 0)) != 0)
00602                                         goto err;
00603                                 c_data->compact_pages_free++;
00604                                 goto next_no_release;
00605                         }
00606                 }
00607                 goto next_page;
00608         }
00609 
00610         /* case 3 -- different parents. */
00611         if (*spanp) {
00612                 CTRACE(dbc, "Span", "", start, 0);
00613                 if (ndbc == NULL && (ret = __db_c_dup(dbc, &ndbc, 0)) != 0)
00614                         goto err;
00615                 ncp = (BTREE_CURSOR *)ndbc->internal;
00616                 ncp->recno = next_recno;
00617                 /*
00618                  * Search the tree looking for the next page after the
00619                  * current key.  For RECNO get the whole stack.
00620                  * For BTREE the return will contain the stack that
00621                  * dominates both the current and next pages.
00622                  */
00623                 if ((ret = __bam_csearch(ndbc, start, CS_NEXT_WRITE, 0)) != 0)
00624                         goto err;
00625 
00626                 if (dbc->dbtype == DB_RECNO) {
00627                         /*
00628                          * The record we are looking for may have moved
00629                          * to the previous page.  This page should
00630                          * be at the beginning of its parent.
00631                          * If not, then start over.
00632                          */
00633                         if (ncp->csp[-1].indx != 0) {
00634                                 *spanp = 0;
00635                                 goto deleted;
00636                         }
00637 
00638                 }
00639                 PTRACE(dbc, "SDups", PGNO(ncp->csp->page), start, 0);
00640                 if (check_dups &&
00641                      (ret = __bam_compact_dups(ndbc,
00642                      ncp->csp->page, factor, 1, c_data, &pgs_done)) != 0)
00643                         goto err;
00644 
00645                 /*
00646                  * We need the stacks to be the same height
00647                  * so that we can merge parents.
00648                  */
00649                 level = LEVEL(ncp->sp->page);
00650                 sflag = CS_START;
00651                 if ((ret = __bam_csearch(dbc, start, sflag, level)) != 0)
00652                         goto err;
00653                 pg = cp->csp->page;
00654                 *spanp = 0;
00655 
00656                 /*
00657                  * The page may have emptied while we waited for the lock.
00658                  * Reset npgno so we re-get this page when we go back to the
00659                  * top.
00660                  */
00661                 if (NUM_ENT(pg) == 0) {
00662                         npgno = PGNO(pg);
00663                         goto next_page;
00664                 }
00665                 if (check_trunc && PGNO(pg) > c_data->compact_truncate) {
00666                         pgs_done++;
00667                         /* Get a fresh low numbered page. */
00668                         if ((ret = __bam_truncate_page(dbc, &pg, 1)) != 0)
00669                                 goto err1;
00670                 }
00671 
00672                 npgno = NEXT_PGNO(pg);
00673                 PTRACE(dbc, "SDups", PGNO(pg), start, 0);
00674                 if (check_dups && (ret =
00675                      __bam_compact_dups(dbc, pg,
00676                      factor, 1, c_data, &pgs_done)) != 0)
00677                         goto err1;
00678 
00679                 /*
00680                  * We may have dropped our locks, check again
00681                  * to see if we still need to fill this page and
00682                  * we are in a spanning situation.
00683                  */
00684 
00685                 if (P_FREESPACE(dbp, pg) <= factor ||
00686                      cp->csp[-1].indx != NUM_ENT(cp->csp[-1].page) - 1)
00687                         goto next_page;
00688 
00689                 /*
00690                  * Try to move things into a single parent.
00691                  */
00692                 merged = 0;
00693                 for (epg = cp->sp; epg != cp->csp; epg++) {
00694                         if (PGNO(epg->page) == cp->root)
00695                                 continue;
00696                         PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0);
00697                         if ((ret = __bam_merge_internal(dbc,
00698                                ndbc, LEVEL(epg->page), c_data, &merged)) != 0)
00699                                 goto err1;
00700                         if (merged)
00701                                 break;
00702                 }
00703 
00704                 /* If we merged the parent, then we nolonger span. */
00705                 if (merged) {
00706                         pgs_done++;
00707                         if (cp->csp->page == NULL)
00708                                 goto deleted;
00709                         npgno = PGNO(pg);
00710                         goto next_page;
00711                 }
00712                 PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0);
00713                 npgno = NEXT_PGNO(ncp->csp->page);
00714                 if ((ret = __bam_merge(dbc,
00715                      ndbc, factor, stop, c_data, &done)) != 0)
00716                         goto err1;
00717                 pgs_done++;
00718                 /*
00719                  * __bam_merge could have freed our stack if it
00720                  * deleted a page possibly collapsing the tree.
00721                  */
00722                 if (cp->csp->page == NULL)
00723                         goto deleted;
00724                 cp->recno += NUM_ENT(pg);
00725 
00726                 /* If we did not bump to the next page something did not fit. */
00727                 if (npgno != NEXT_PGNO(pg)) {
00728                         npgno = NEXT_PGNO(pg);
00729                         goto next_page;
00730                 }
00731         } else {
00732                 /* Case 2 -- same parents. */
00733                 CTRACE(dbc, "Sib", "", start, 0);
00734                 if ((ret =
00735                     __bam_csearch(dbc, start, CS_PARENT, LEAFLEVEL)) != 0)
00736                         goto err;
00737 
00738                 pg = cp->csp->page;
00739                 DB_ASSERT(cp->csp - cp->sp == 1);
00740                 npgno = PGNO(pg);
00741 
00742                 /* We now have a write lock, recheck the page. */
00743                 if ((nentry = NUM_ENT(pg)) == 0)
00744                         goto next_page;
00745 
00746                 npgno = NEXT_PGNO(pg);
00747 
00748                 /* Check duplicate trees, we have a write lock on the page. */
00749                 PTRACE(dbc, "SibDup", PGNO(pg), start, 0);
00750                 if (check_dups && (ret =
00751                      __bam_compact_dups(dbc, pg,
00752                      factor, 1, c_data, &pgs_done)) != 0)
00753                         goto err1;
00754 
00755                 if (check_trunc && PGNO(pg) > c_data->compact_truncate) {
00756                         pgs_done++;
00757                         /* Get a fresh low numbered page. */
00758                         if ((ret = __bam_truncate_page(dbc, &pg, 1)) != 0)
00759                                 goto err1;
00760                 }
00761 
00762                 /* After re-locking check to see if we still need to fill. */
00763                 if (P_FREESPACE(dbp, pg) <= factor)
00764                         goto next_page;
00765 
00766                 /* If they have the same parent, just dup the cursor */
00767                 if (ndbc != NULL && (ret = __db_c_close(ndbc)) != 0)
00768                         goto err1;
00769                 if ((ret = __db_c_dup(dbc, &ndbc, DB_POSITION)) != 0)
00770                         goto err1;
00771                 ncp = (BTREE_CURSOR *)ndbc->internal;
00772 
00773                 /*
00774                  * ncp->recno needs to have the recno of the next page.
00775                  * Bump it by the number of records on the current page.
00776                  */
00777                 ncp->recno += NUM_ENT(pg);
00778         }
00779 
00780         /* Fetch pages until we fill this one. */
00781         while (!done && npgno != PGNO_INVALID &&
00782              P_FREESPACE(dbp, pg) > factor && c_data->compact_pages != 0) {
00783                 /*
00784                  * If our current position is the last one on a parent
00785                  * page, then we are about to merge across different
00786                  * internal nodes.  Thus, we need to lock higher up
00787                  * in the tree.  We will exit the routine and commit
00788                  * what we have done so far.  Set spanp so we know
00789                  * we are in this case when we come back.
00790                  */
00791                 if (cp->csp[-1].indx == NUM_ENT(cp->csp[-1].page) - 1) {
00792                         *spanp = 1;
00793                         npgno = PGNO(pg);
00794                         next_recno = cp->recno;
00795                         goto next_page;
00796                 }
00797 
00798                 /* Lock and get the next page. */
00799                 if ((ret = __db_lget(dbc, LCK_COUPLE,
00800                      npgno, DB_LOCK_WRITE, 0, &ncp->lock)) != 0)
00801                         goto err1;
00802                 if ((ret = __memp_fget(dbmp, &npgno, 0, &npg)) != 0)
00803                         goto err1;
00804 
00805                 /* Fix up the next page cursor with its parent node. */
00806                 if ((ret = __memp_fget(dbmp,
00807                     &PGNO(cp->csp[-1].page), 0, &ppg)) != 0)
00808                         goto err1;
00809                 BT_STK_PUSH(dbenv, ncp, ppg,
00810                      cp->csp[-1].indx + 1, nolock, DB_LOCK_NG, ret);
00811                 if (ret != 0)
00812                         goto err1;
00813 
00814                 /* Put the page on the stack. */
00815                 BT_STK_ENTER(dbenv, ncp, npg, 0, ncp->lock, DB_LOCK_WRITE, ret);
00816 
00817                 LOCK_INIT(ncp->lock);
00818                 npg = NULL;
00819 
00820                 c_data->compact_pages_examine++;
00821 
00822                 PTRACE(dbc, "MDups", PGNO(ncp->csp->page), start, 0);
00823                 if (check_dups && (ret = __bam_compact_dups(ndbc,
00824                      ncp->csp->page, factor, 1, c_data, &pgs_done)) != 0)
00825                         goto err1;
00826 
00827                 npgno = NEXT_PGNO(ncp->csp->page);
00828                 /*
00829                  * Merge the pages.  This will either free the next
00830                  * page or just update its parent pointer.
00831                  */
00832                 PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0);
00833                 if ((ret = __bam_merge(dbc,
00834                      ndbc, factor, stop, c_data, &done)) != 0)
00835                         goto err1;
00836 
00837                 pgs_done++;
00838 
00839                 /*
00840                  * __bam_merge could have freed our stack if it
00841                  * deleted a page possibly collapsing the tree.
00842                  */
00843                 if (cp->csp->page == NULL)
00844                         goto deleted;
00845                 /* If we did not bump to the next page something did not fit. */
00846                 if (npgno != NEXT_PGNO(pg))
00847                         break;
00848         }
00849 
00850         /* Bottom of the main loop.  Move to the next page. */
00851         npgno = NEXT_PGNO(pg);
00852         cp->recno += NUM_ENT(pg);
00853         next_recno = cp->recno;
00854 
00855 next_page:
00856         if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
00857                 goto err1;
00858         if (ndbc != NULL &&
00859              (ret = __bam_stkrel(ndbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
00860                 goto err1;
00861 
00862 next_no_release:
00863         pg = NULL;
00864 
00865         if (npgno == PGNO_INVALID || c_data->compact_pages  == 0)
00866                 done = 1;
00867         if (!done) {
00868                 /*
00869                  * If we are at the end of this parent commit the
00870                  * transaction so we don't tie things up.
00871                  */
00872                 if (pgs_done != 0 && *spanp) {
00873 deleted:                if (((ret = __bam_stkrel(ndbc, 0)) != 0 ||
00874                              (ret = __db_c_close(ndbc)) != 0))
00875                                 goto err;
00876                         *donep = 0;
00877                         return (0);
00878                 }
00879 
00880                 /* Reget the next page to look at. */
00881                 cp->recno = next_recno;
00882                 if ((ret = __memp_fget(dbmp, &npgno, 0, &pg)) != 0)
00883                         goto err;
00884                 next_page = 1;
00885                 goto next;
00886         }
00887 
00888 done:
00889         if (0) {
00890                 /* We come here if pg is the same as cp->csp->page. */
00891 err1:           pg = NULL;
00892         }
00893 err:    if (dbc != NULL &&
00894             (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
00895                 ret = t_ret;
00896         if (ndbc != NULL) {
00897                 if ((t_ret = __bam_stkrel(ndbc, STK_CLRDBC)) != 0 && ret == 0)
00898                         ret = t_ret;
00899                 else if ((t_ret = __db_c_close(ndbc)) != 0 && ret == 0)
00900                         ret = t_ret;
00901         }
00902 
00903         if (pg != NULL && (t_ret = __memp_fput(dbmp, pg, 0) != 0) && ret == 0)
00904                 ret = t_ret;
00905         if (npg != NULL && (t_ret = __memp_fput(dbmp, npg, 0) != 0) && ret == 0)
00906                 ret = t_ret;
00907 
00908         *donep = done;
00909 
00910         return (ret);
00911 }
00912 
00913 /*
00914  * __bam_merge -- do actual merging of leaf pages.
00915  */
00916 static int
00917 __bam_merge(dbc, ndbc, factor, stop, c_data, donep)
00918         DBC *dbc, *ndbc;
00919         u_int32_t factor;
00920         DBT *stop;
00921         DB_COMPACT *c_data;
00922         int *donep;
00923 {
00924         BTREE_CURSOR *cp, *ncp;
00925         BTREE *t;
00926         DB *dbp;
00927         PAGE *pg, *npg;
00928         db_indx_t adj, nent;
00929         db_recno_t recno;
00930         int cmp, ret;
00931         int (*func) __P((DB *, const DBT *, const DBT *));
00932 
00933         dbp = dbc->dbp;
00934         t = dbp->bt_internal;
00935         cp = (BTREE_CURSOR *)dbc->internal;
00936         ncp = (BTREE_CURSOR *)ndbc->internal;
00937         pg = cp->csp->page;
00938         npg = ncp->csp->page;
00939 
00940         nent = NUM_ENT(npg);
00941 
00942         /* If the page is empty just throw it away. */
00943         if (nent == 0)
00944                 goto free;
00945         adj = TYPE(npg) == P_LBTREE ? P_INDX : O_INDX;
00946         /* Find if the stopping point is on this page. */
00947         if (stop != NULL && stop->size != 0) {
00948                 if (dbc->dbtype == DB_RECNO) {
00949                         if ((ret = __ram_getno(dbc, stop, &recno, 0)) != 0)
00950                                 goto err;
00951                         if (ncp->recno > recno) {
00952                                 *donep = 1;
00953                                 if (cp->recno > recno)
00954                                         goto done;
00955                         }
00956                 } else {
00957                         func = TYPE(npg) == P_LBTREE ?
00958                              (dbp->dup_compare == NULL ?
00959                              __bam_defcmp : dbp->dup_compare) : t->bt_compare;
00960 
00961                         if ((ret = __bam_cmp(dbp,
00962                             stop, npg, nent - adj, func, &cmp)) != 0)
00963                                 goto err;
00964 
00965                         /*
00966                          * If the last record is beyond the stopping
00967                          * point we are done after this page.  If the
00968                          * first record is beyond the stopping point
00969                          * don't even bother with this page.
00970                          */
00971                         if (cmp <= 0) {
00972                                 *donep = 1;
00973                                 if ((ret = __bam_cmp(dbp,
00974                                     stop, npg, 0, func, &cmp)) != 0)
00975                                         goto err;
00976                                 if (cmp <= 0)
00977                                         goto done;
00978                         }
00979                 }
00980         }
00981 
00982         /*
00983          * If there is too much data then just move records one at a time.
00984          * Otherwise copy the data space over and fix up the index table.
00985          * If we are on the left most child we will effect our parent's
00986          * index entry so we call merge_records to figure out key sizes.
00987          */
00988         if ((dbc->dbtype == DB_BTREE && 
00989             ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) ||
00990             (int)(P_FREESPACE(dbp, pg) -
00991             ((dbp->pgsize - P_OVERHEAD(dbp)) -
00992             P_FREESPACE(dbp, npg))) < (int)factor)
00993                 ret = __bam_merge_records(dbc, ndbc, factor, c_data);
00994         else
00995 free:           ret = __bam_merge_pages(dbc, ndbc, c_data);
00996 
00997 done:
00998 err:    return (ret);
00999 }
01000 
01001 static int
01002 __bam_merge_records(dbc, ndbc, factor, c_data)
01003         DBC *dbc, *ndbc;
01004         u_int32_t factor;
01005         DB_COMPACT *c_data;
01006 {
01007         BKEYDATA *bk, *tmp_bk;
01008         BINTERNAL *bi;
01009         BTREE *t;
01010         BTREE_CURSOR *cp, *ncp;
01011         DB *dbp;
01012         DBT a, b, data, hdr;
01013         EPG *epg;
01014         PAGE *pg, *npg;
01015         db_indx_t adj, indx, nent, *ninp, pind;
01016         int32_t adjust;
01017         u_int32_t free, nksize, pfree, size;
01018         int first_dup, is_dup, next_dup, n_ok, ret;
01019         size_t (*func) __P((DB *, const DBT *, const DBT *));
01020 
01021         dbp = dbc->dbp;
01022         t = dbp->bt_internal;
01023         cp = (BTREE_CURSOR *)dbc->internal;
01024         ncp = (BTREE_CURSOR *)ndbc->internal;
01025         pg = cp->csp->page;
01026         npg = ncp->csp->page;
01027         memset(&hdr, 0, sizeof(hdr));
01028         pind = NUM_ENT(pg);
01029         n_ok = 0;
01030         adjust = 0;
01031         ret = 0;
01032         nent = NUM_ENT(npg);
01033 
01034         DB_ASSERT (nent != 0);
01035 
01036         /* See if we want to swap out this page. */
01037         if (c_data->compact_truncate != PGNO_INVALID &&
01038              PGNO(npg) > c_data->compact_truncate) {
01039                 /* Get a fresh low numbered page. */
01040                 if ((ret = __bam_truncate_page(ndbc, &npg, 1)) != 0)
01041                         goto err;
01042         }
01043 
01044         ninp = P_INP(dbp, npg);
01045 
01046         /*
01047          * pg is the page that is being filled, it is in the stack in cp.
01048          * npg is the next page, it is in the stack in ncp.
01049          */
01050         free = P_FREESPACE(dbp, pg);
01051 
01052         adj = TYPE(npg) == P_LBTREE ? P_INDX : O_INDX;
01053         /*
01054          * Loop through the records and find the stopping point.
01055          */
01056         for (indx = 0; indx < nent; indx += adj)  {
01057                 bk = GET_BKEYDATA(dbp, npg, indx);
01058 
01059                 /* Size of the key. */
01060                 size = BITEM_PSIZE(bk);
01061 
01062                 /* Size of the data. */
01063                 if (TYPE(pg) == P_LBTREE)
01064                         size += BITEM_PSIZE(GET_BKEYDATA(dbp, npg, indx + 1));
01065                 /*
01066                  * If we are at a duplicate set, skip ahead to see and
01067                  * get the total size for the group.
01068                  */
01069                 n_ok = adj;
01070                 if (TYPE(pg) == P_LBTREE &&
01071                      indx < nent - adj &&
01072                      ninp[indx] == ninp[indx + adj]) {
01073                         do {
01074                                 /* Size of index for key reference. */
01075                                 size += sizeof(db_indx_t);
01076                                 n_ok++;
01077                                 /* Size of data item. */
01078                                 size += BITEM_PSIZE(
01079                                     GET_BKEYDATA(dbp, npg, indx + n_ok));
01080                                 n_ok++;
01081                         } while (indx + n_ok < nent &&
01082                             ninp[indx] == ninp[indx + n_ok]);
01083                 }
01084                 /* if the next set will not fit on the page we are done. */
01085                 if (free < size)
01086                         break;
01087 
01088                 /*
01089                  * Otherwise figure out if we are past the goal and if
01090                  * adding this set will put us closer to the goal than
01091                  * we are now.
01092                  */
01093                 if ((free - size) < factor) {
01094                         if (free - factor > factor - (free - size))
01095                                 indx += n_ok;
01096                         break;
01097                 }
01098                 free -= size;
01099                 indx += n_ok - adj;
01100         }
01101         if (indx == 0)
01102                 goto done;
01103         if (TYPE(pg) != P_LBTREE) {
01104                 if (indx == nent)
01105                         return (__bam_merge_pages(dbc, ndbc, c_data));
01106                 goto no_check;
01107         }
01108         /*
01109          * We need to update npg's parent key.  Avoid creating a new key
01110          * that will be too big. Get what space will be available on the
01111          * parents. Then if there will not be room for this key, see if
01112          * prefix compression will make it work, if not backup till we
01113          * find something that will.  (Needless to say, this is a very
01114          * unlikely event.)  If we are deleting this page then we will
01115          * need to propagate the next key to our grand parents, so we
01116          * see if that will fit.
01117          */
01118         pfree = dbp->pgsize;
01119         for (epg = &ncp->csp[-1]; epg >= ncp->sp; epg--)
01120                 if ((free = P_FREESPACE(dbp, epg->page)) < pfree) {
01121                         bi = GET_BINTERNAL(dbp, epg->page, epg->indx);
01122                         /* Add back in the key we will be deleting. */
01123                         free += BINTERNAL_PSIZE(bi->len);
01124                         if (free < pfree)
01125                                 pfree = free;
01126                         if (epg->indx != 0)
01127                                 break;
01128                 }
01129 
01130         /*
01131          * If we are at the end, we will delete this page.  We need to
01132          * check the next parent key only if we are the leftmost page and
01133          * will therefore have to propagate the key up the tree.
01134          */
01135         if (indx == nent) {
01136                 if (ncp->csp[-1].indx != 0 ||
01137                      BINTERNAL_PSIZE(GET_BINTERNAL(dbp,
01138                      ncp->csp[-1].page, 1)->len) <= pfree)
01139                         return (__bam_merge_pages(dbc, ndbc, c_data));
01140                 indx -= adj;
01141         }
01142         bk = GET_BKEYDATA(dbp, npg, indx);
01143         if (indx != 0 && BINTERNAL_SIZE(bk->len) >= pfree) {
01144                 if (F_ISSET(dbc, DBC_OPD)) {
01145                         if (dbp->dup_compare == __bam_defcmp)
01146                                 func = __bam_defpfx;
01147                         else
01148                                 func = NULL;
01149                 } else
01150                         func = t->bt_prefix;
01151         } else
01152                 func = NULL;
01153 
01154         /* Skip to the beginning of a duplicate set. */
01155         while (indx != 0 &&  ninp[indx] == ninp[indx - adj])
01156                 indx -= adj;
01157 
01158         while (indx != 0 && BINTERNAL_SIZE(bk->len) >= pfree) {
01159                 if (B_TYPE(bk->type) != B_KEYDATA)
01160                         goto noprefix;
01161                 /*
01162                  * Figure out if we can truncate this key.
01163                  * Code borrowed from bt_split.c
01164                  */
01165                 if (func == NULL)
01166                         goto noprefix;
01167                 tmp_bk = GET_BKEYDATA(dbp, npg, indx - adj);
01168                 if (B_TYPE(tmp_bk->type) != B_KEYDATA)
01169                         goto noprefix;
01170                 memset(&a, 0, sizeof(a));
01171                 a.size = tmp_bk->len;
01172                 a.data = tmp_bk->data;
01173                 memset(&b, 0, sizeof(b));
01174                 b.size = bk->len;
01175                 b.data = bk->data;
01176                 nksize = (u_int32_t)func(dbp, &a, &b);
01177                 if (BINTERNAL_PSIZE(nksize) < pfree)
01178                         break;
01179 noprefix:
01180                 /* Skip to the beginning of a duplicate set. */
01181                 do {
01182                         indx -= adj;
01183                 } while (indx != 0 &&  ninp[indx] == ninp[indx - adj]);
01184 
01185                 bk = GET_BKEYDATA(dbp, npg, indx);
01186         }
01187 
01188         if (indx == 0)
01189                 goto done;
01190         DB_ASSERT(indx <= nent);
01191 
01192         /* Loop through the records and move them from npg to pg. */
01193 no_check: is_dup = first_dup = next_dup = 0;
01194         do {
01195                 bk = GET_BKEYDATA(dbp, npg, 0);
01196                 /* Figure out if we are in a duplicate group or not. */
01197                 if ((NUM_ENT(npg) % 2) == 0) {
01198                         if (NUM_ENT(npg) > 2 && ninp[0] == ninp[2]) {
01199                                 if (!is_dup) {
01200                                         first_dup = 1;
01201                                         is_dup = 1;
01202                                 } else
01203                                         first_dup = 0;
01204 
01205                                 next_dup = 1;
01206                         } else if (next_dup) {
01207                                 is_dup = 1;
01208                                 first_dup = 0;
01209                                 next_dup = 0;
01210                         } else
01211                                 is_dup = 0;
01212                 }
01213 
01214                 if (is_dup && !first_dup && (pind % 2) == 0) {
01215                         /* Duplicate key. */
01216                         if ((ret = __bam_adjindx(dbc,
01217                              pg, pind, pind - P_INDX, 1)) != 0)
01218                                 goto err;
01219                         if (!next_dup)
01220                                 is_dup = 0;
01221                 } else switch (B_TYPE(bk->type)) {
01222                 case B_KEYDATA:
01223                         hdr.data = bk;
01224                         hdr.size = SSZA(BKEYDATA, data);
01225                         data.size = bk->len;
01226                         data.data = bk->data;
01227                         if ((ret = __db_pitem(dbc, pg, pind,
01228                              BKEYDATA_SIZE(bk->len), &hdr, &data)) != 0)
01229                                 goto err;
01230                         break;
01231                 case B_OVERFLOW:
01232                 case B_DUPLICATE:
01233                         data.size = BOVERFLOW_SIZE;
01234                         data.data = bk;
01235                         if ((ret = __db_pitem(dbc, pg, pind,
01236                              BOVERFLOW_SIZE, &data, NULL)) != 0)
01237                                 goto err;
01238                         break;
01239                 default:
01240                         __db_err(dbp->dbenv,
01241                             "Unknown record format, page %lu, indx 0",
01242                             (u_long)PGNO(pg));
01243                         ret = EINVAL;
01244                         goto err;
01245                 }
01246                 pind++;
01247                 if (next_dup && (NUM_ENT(npg) % 2) == 0) {
01248                         if ((ret = __bam_adjindx(ndbc,
01249                              npg, 0, O_INDX, 0)) != 0)
01250                                 goto err;
01251                 } else {
01252                         if ((ret = __db_ditem(ndbc,
01253                              npg, 0, BITEM_SIZE(bk))) != 0)
01254                                 goto err;
01255                 }
01256                 adjust++;
01257         } while (--indx != 0);
01258 
01259         DB_ASSERT(NUM_ENT(npg) != 0);
01260         if ((ret = __memp_fset(dbp->mpf, npg, DB_MPOOL_DIRTY)) != 0)
01261                 goto err;
01262 
01263         if (adjust != 0 &&
01264              (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))) {
01265                 DB_ASSERT(cp->csp - cp->sp == ncp->csp - ncp->sp);
01266                 if (TYPE(pg) == P_LBTREE)
01267                         adjust /= P_INDX;
01268                 if ((ret = __bam_adjust(ndbc, -adjust)) != 0)
01269                         goto err;
01270 
01271                 if ((ret = __bam_adjust(dbc, adjust)) != 0)
01272                         goto err;
01273         }
01274 
01275         /* Update parent with new key. */
01276         if (ndbc->dbtype == DB_BTREE &&
01277             (ret = __bam_pupdate(ndbc, pg)) != 0)
01278                 goto err;
01279         if ((ret = __memp_fset(dbp->mpf, pg, DB_MPOOL_DIRTY)) != 0)
01280                 goto err;
01281 
01282 done:   ret = __bam_stkrel(ndbc, STK_CLRDBC);
01283 
01284 err:    return (ret);
01285 }
01286 
01287 static int
01288 __bam_merge_pages(dbc, ndbc, c_data)
01289         DBC *dbc, *ndbc;
01290         DB_COMPACT *c_data;
01291 {
01292         BTREE_CURSOR *cp, *ncp;
01293         DB *dbp;
01294         DB_MPOOLFILE *dbmp;
01295         DBT data, hdr, ind;
01296         PAGE *pg, *npg;
01297         db_indx_t nent, *ninp, *pinp;
01298         db_pgno_t ppgno;
01299         u_int8_t *bp;
01300         u_int32_t len;
01301         int i, level, ret;
01302 
01303         COMPQUIET(ppgno, PGNO_INVALID);
01304         dbp = dbc->dbp;
01305         dbmp = dbp->mpf;
01306         cp = (BTREE_CURSOR *)dbc->internal;
01307         ncp = (BTREE_CURSOR *)ndbc->internal;
01308         pg = cp->csp->page;
01309         npg = ncp->csp->page;
01310         memset(&hdr, 0, sizeof(hdr));
01311         nent = NUM_ENT(npg);
01312 
01313         /* If the page is empty just throw it away. */
01314         if (nent == 0)
01315                 goto free;
01316         /* Bulk copy the data to the new page. */
01317         len = dbp->pgsize - HOFFSET(npg);
01318         if (DBC_LOGGING(dbc)) {
01319                 data.data = (u_int8_t *)npg + HOFFSET(npg);
01320                 data.size = len;
01321                 ind.data = P_INP(dbp, npg);
01322                 ind.size = NUM_ENT(npg) * sizeof(db_indx_t);
01323                 if ((ret = __bam_merge_log(dbp,
01324                      dbc->txn, &LSN(pg), 0, PGNO(pg),
01325                      &LSN(pg), PGNO(npg), &LSN(npg), NULL, &data, &ind)) != 0)
01326                         goto err;
01327         } else
01328                 LSN_NOT_LOGGED(LSN(pg));
01329         LSN(npg) = LSN(pg);
01330         bp = (u_int8_t *)pg + HOFFSET(pg) - len;
01331         memcpy(bp, (u_int8_t *)npg + HOFFSET(npg), len);
01332 
01333         /* Copy index table offset by what was there already. */
01334         pinp = P_INP(dbp, pg) + NUM_ENT(pg);
01335         ninp = P_INP(dbp, npg);
01336         for (i = 0; i < NUM_ENT(npg); i++)
01337                 *pinp++ = *ninp++ - (dbp->pgsize - HOFFSET(pg));
01338         HOFFSET(pg) -= len;
01339         NUM_ENT(pg) += i;
01340 
01341         NUM_ENT(npg) = 0;
01342         HOFFSET(npg) += len;
01343 
01344         if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD)) {
01345                 DB_ASSERT(cp->csp - cp->sp == ncp->csp - ncp->sp);
01346                 if (TYPE(pg) == P_LBTREE)
01347                         i /= P_INDX;
01348                 if ((ret = __bam_adjust(ndbc, -i)) != 0)
01349                         goto err;
01350 
01351                 if ((ret = __bam_adjust(dbc, i)) != 0)
01352                         goto err;
01353         }
01354         ret = __memp_fset(dbp->mpf, pg, DB_MPOOL_DIRTY);
01355 
01356 free:   /*
01357          * __bam_dpages may decide to collapse the tree.
01358          * This can happen if we have the root and there
01359          * are exactly 2 pointers left in it.
01360          * If it can collapse the tree we must free the other
01361          * stack since it will nolonger be valid.  This
01362          * must be done before hand because we cannot
01363          * hold a page pinned if it might be truncated.
01364          */
01365         if (PGNO(ncp->sp->page) == ncp->root &&
01366             NUM_ENT(ncp->sp->page) == 2) {
01367                 if ((ret = __bam_stkrel(dbc, STK_CLRDBC | STK_PGONLY)) != 0)
01368                         goto err;
01369                 level = LEVEL(ncp->sp->page);
01370                 ppgno = PGNO(ncp->csp[-1].page);
01371         } else
01372                 level = 0;
01373         if (c_data->compact_truncate > PGNO(npg))
01374                 c_data->compact_truncate--;
01375         if ((ret = __bam_dpages(ndbc,
01376             0, ndbc->dbtype == DB_RECNO ? 0 : 1)) != 0)
01377                 goto err;
01378         npg = NULL;
01379         c_data->compact_pages_free++;
01380         c_data->compact_pages--;
01381         if (level != 0) {
01382                 if ((ret = __memp_fget(dbmp, &ncp->root, 0, &npg)) != 0)
01383                         goto err;
01384                 if (level == LEVEL(npg))
01385                         level = 0;
01386                 if ((ret = __memp_fput(dbmp, npg, 0)) != 0)
01387                         goto err;
01388                 npg = NULL;
01389                 if (level != 0) {
01390                         c_data->compact_levels++;
01391                         c_data->compact_pages_free++;
01392                         if (c_data->compact_truncate > ppgno)
01393                                 c_data->compact_truncate--;
01394                         if (c_data->compact_pages != 0)
01395                                 c_data->compact_pages--;
01396                 }
01397         }
01398 
01399 err:    return (ret);
01400 }
01401 
01402 /*
01403  * __bam_merge_internal --
01404  *      Merge internal nodes of the tree.
01405  */
01406 static int
01407 __bam_merge_internal(dbc, ndbc, level, c_data, merged)
01408         DBC *dbc, *ndbc;
01409         int level;
01410         DB_COMPACT *c_data;
01411         int *merged;
01412 {
01413         BINTERNAL bi, *bip, *fip;
01414         BTREE_CURSOR *cp, *ncp;
01415         DB_MPOOLFILE *dbmp;
01416         DB *dbp;
01417         DBT data, hdr;
01418         EPG *epg, *save_csp, *nsave_csp;
01419         PAGE *pg, *npg;
01420         RINTERNAL *rk;
01421         db_indx_t indx, pind;
01422         db_pgno_t ppgno;
01423         int32_t trecs;
01424         u_int16_t size;
01425         u_int32_t free, pfree;
01426         int ret;
01427 
01428         COMPQUIET(bip, NULL);
01429         COMPQUIET(ppgno, PGNO_INVALID);
01430 
01431         /*
01432          * ndbc will contain the the dominating parent of the subtree.
01433          * dbc will have the tree containing the left child.
01434          *
01435          * The stacks descend to the leaf level.
01436          * If this is a recno tree then both stacks will start at the root.
01437          */
01438         dbp = dbc->dbp;
01439         dbmp = dbp->mpf;
01440         cp = (BTREE_CURSOR *)dbc->internal;
01441         ncp = (BTREE_CURSOR *)ndbc->internal;
01442         *merged = 0;
01443         ret = 0;
01444 
01445         /*
01446          * Set the stacks to the level requested.
01447          * Save the old value to restore when we exit.
01448          */
01449         save_csp = cp->csp;
01450         epg = &cp->csp[-level + 1];
01451         cp->csp = epg;
01452         pg = epg->page;
01453         pind = NUM_ENT(pg);
01454 
01455         nsave_csp = ncp->csp;
01456         epg = &ncp->csp[-level + 1];
01457         ncp->csp = epg;
01458         npg = epg->page;
01459         indx = NUM_ENT(npg);
01460 
01461         /*
01462          * The caller may have two stacks that include common ancestors, we
01463          * check here for convenience.
01464          */
01465         if (npg == pg)
01466                 goto done;
01467 
01468         if (TYPE(pg) == P_IBTREE) {
01469                 /*
01470                  * Check for overflow keys on both pages while we have
01471                  * them locked.
01472                  */
01473                  if ((ret =
01474                       __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0)
01475                         goto err;
01476                  if ((ret =
01477                       __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0)
01478                         goto err;
01479         }
01480 
01481         /*
01482          * If we are about to move data off the left most page of an
01483          * internal node we will need to update its parents, make sure there
01484          * will be room for the new key on all the parents in the stack.
01485          * If not, move less data.
01486          */
01487         fip = NULL;
01488         if (TYPE(pg) == P_IBTREE) {
01489                 /* See where we run out of space. */
01490                 free = P_FREESPACE(dbp, pg);
01491                 /*
01492                  * The leftmost key of an internal page is not accurate.
01493                  * Go up the tree to find a non-leftmost parent.
01494                  */
01495                 while (--epg >= ncp->sp && epg->indx == 0)
01496                         continue;
01497                 fip = bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
01498                 epg = ncp->csp;
01499 
01500                 for (indx = 0;;) {
01501                         size = BINTERNAL_PSIZE(bip->len);
01502                         if (size > free)
01503                                 break;
01504                         free -= size;
01505                         if (++indx >= NUM_ENT(npg))
01506                                 break;
01507                         bip = GET_BINTERNAL(dbp, npg, indx);
01508                 }
01509 
01510                 /* See if we are deleting the page and we are not left most. */
01511                 if (indx == NUM_ENT(npg) && epg[-1].indx != 0)
01512                         goto fits;
01513 
01514                 pfree = dbp->pgsize;
01515                 for (epg--; epg >= ncp->sp; epg--)
01516                         if ((free = P_FREESPACE(dbp, epg->page)) < pfree) {
01517                                 bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
01518                                 /* Add back in the key we will be deleting. */
01519                                 free += BINTERNAL_PSIZE(bip->len);
01520                                 if (free < pfree)
01521                                         pfree = free;
01522                                 if (epg->indx != 0)
01523                                         break;
01524                         }
01525                 epg = ncp->csp;
01526 
01527                 /* If we are at the end of the page we will delete it. */
01528                 if (indx == NUM_ENT(npg))
01529                         bip =
01530                              GET_BINTERNAL(dbp, epg[-1].page, epg[-1].indx + 1);
01531                 else
01532                         bip = GET_BINTERNAL(dbp, npg, indx);
01533 
01534                 /* Back up until we have a key that fits. */
01535                 while (indx != 0 && BINTERNAL_PSIZE(bip->len) > pfree) {
01536                         indx--;
01537                         bip = GET_BINTERNAL(dbp, npg, indx);
01538                 }
01539                 if (indx == 0)
01540                         goto done;
01541         }
01542 
01543 fits:   memset(&bi, 0, sizeof(bi));
01544         memset(&hdr, 0, sizeof(hdr));
01545         memset(&data, 0, sizeof(data));
01546         trecs = 0;
01547 
01548         /*
01549          * Copy data between internal nodes till one is full
01550          * or the other is empty.
01551          */
01552         do {
01553                 if (dbc->dbtype == DB_BTREE) {
01554                         bip = GET_BINTERNAL(dbp, npg, 0);
01555                         size = fip == NULL ?
01556                              BINTERNAL_SIZE(bip->len) :
01557                              BINTERNAL_SIZE(fip->len);
01558                         if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
01559                                 break;
01560 
01561                         if (fip == NULL) {
01562                                 data.size = bip->len;
01563                                 data.data = bip->data;
01564                         } else {
01565                                 data.size = fip->len;
01566                                 data.data = fip->data;
01567                         }
01568                         bi.len = data.size;
01569                         B_TSET(bi.type, bip->type, 0);
01570                         bi.pgno = bip->pgno;
01571                         bi.nrecs = bip->nrecs;
01572                         hdr.data = &bi;
01573                         hdr.size = SSZA(BINTERNAL, data);
01574                         if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))
01575                                 trecs += (int32_t)bip->nrecs;
01576                 } else {
01577                         rk = GET_RINTERNAL(dbp, npg, 0);
01578                         size = RINTERNAL_SIZE;
01579                         if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
01580                                 break;
01581 
01582                         hdr.data = rk;
01583                         hdr.size = size;
01584                         trecs += (int32_t)rk->nrecs;
01585                 }
01586                 if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0)
01587                         goto err;
01588                 pind++;
01589                 if (fip != NULL) {
01590                         /* reset size to be for the record being deleted. */
01591                         size = BINTERNAL_SIZE(bip->len);
01592                         fip = NULL;
01593                 }
01594                 if ((ret = __db_ditem(ndbc, npg, 0, size)) != 0)
01595                         goto err;
01596                 *merged = 1;
01597         } while (--indx != 0);
01598 
01599         if (c_data->compact_truncate != PGNO_INVALID &&
01600              PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) {
01601                 if ((ret = __bam_truncate_page(dbc, &pg, 1)) != 0)
01602                         goto err;
01603         }
01604 
01605         if (NUM_ENT(npg) != 0 && c_data->compact_truncate != PGNO_INVALID &&
01606             PGNO(npg) > c_data->compact_truncate && ncp->csp != ncp->sp) {
01607                 if ((ret = __bam_truncate_page(ndbc, &npg, 1)) != 0)
01608                         goto err;
01609         }
01610 
01611         if (!*merged)
01612                 goto done;
01613 
01614         if ((ret = __memp_fset(dbmp, pg, DB_MPOOL_DIRTY)) != 0)
01615                 goto err;
01616         if ((ret = __memp_fset(dbmp, npg, DB_MPOOL_DIRTY)) != 0)
01617                 goto err;
01618 
01619         if (trecs != 0) {
01620                 DB_ASSERT(cp->csp - cp->sp == ncp->csp - ncp->sp);
01621                 cp->csp--;
01622                 if ((ret = __bam_adjust(dbc, trecs)) != 0)
01623                         goto err;
01624 
01625                 ncp->csp--;
01626                 if ((ret = __bam_adjust(ndbc, -trecs)) != 0)
01627                         goto err;
01628                 ncp->csp++;
01629         }
01630         cp->csp = save_csp;
01631 
01632         /*
01633          * Either we emptied the page or we need to update its
01634          * parent to reflect the first page we now point to.
01635          * First get rid of the bottom of the stack,
01636          * bam_dpages will clear the stack.  We can drop
01637          * the locks on those pages as we have not done
01638          * anything to them.
01639          */
01640         do {
01641                 if ((ret = __memp_fput(dbmp, nsave_csp->page, 0)) != 0)
01642                         goto err;
01643                 if ((ret = __LPUT(dbc, nsave_csp->lock)) != 0)
01644                         goto err;
01645                 nsave_csp--;
01646         } while (nsave_csp != ncp->csp);
01647 
01648         if (NUM_ENT(npg) == 0)  {
01649                 /*
01650                  * __bam_dpages may decide to collapse the tree
01651                  * so we need to free our other stack.  The tree
01652                  * will change in hight and our stack will nolonger
01653                  * be valid.
01654                  */
01655                 if (PGNO(ncp->sp->page) == ncp->root &&
01656                     NUM_ENT(ncp->sp->page) == 2) {
01657                         if ((ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0)
01658                                 goto err;
01659                         level = LEVEL(ncp->sp->page);
01660                         ppgno = PGNO(ncp->csp[-1].page);
01661                 } else
01662                         level = 0;
01663 
01664                 if (c_data->compact_truncate > PGNO(npg))
01665                         c_data->compact_truncate--;
01666                 ret = __bam_dpages(ndbc,
01667                      0, ndbc->dbtype == DB_RECNO ? 0 : 1);
01668                 c_data->compact_pages_free++;
01669                 if (ret == 0 && level != 0) {
01670                         if ((ret = __memp_fget(dbmp, &ncp->root, 0, &npg)) != 0)
01671                                 goto err;
01672                         if (level == LEVEL(npg))
01673                                 level = 0;
01674                         if ((ret = __memp_fput(dbmp, npg, 0)) != 0)
01675                                 goto err;
01676                         npg = NULL;
01677                         if (level != 0) {
01678                                 c_data->compact_levels++;
01679                                 c_data->compact_pages_free++;
01680                                 if (c_data->compact_truncate > ppgno)
01681                                         c_data->compact_truncate--;
01682                                 if (c_data->compact_pages != 0)
01683                                         c_data->compact_pages--;
01684                         }
01685                 }
01686         } else
01687                 ret = __bam_pupdate(ndbc, npg);
01688         return (ret);
01689 
01690 done:
01691 err:    cp->csp = save_csp;
01692         ncp->csp = nsave_csp;
01693 
01694         return (ret);
01695 }
01696 
01697 /*
01698  * __bam_compact_dups -- try to compress off page dup trees.
01699  * We may or may not have a write lock on this page.
01700  */
01701 static int
01702 __bam_compact_dups(dbc, pg, factor, have_lock, c_data, donep)
01703         DBC *dbc;
01704         PAGE *pg;
01705         u_int32_t factor;
01706         int have_lock;
01707         DB_COMPACT *c_data;
01708         int *donep;
01709 {
01710         BTREE_CURSOR *cp;
01711         BOVERFLOW *bo;
01712         DB *dbp;
01713         DBC *opd;
01714         DBT start;
01715         DB_MPOOLFILE *dbmp;
01716         PAGE *dpg;
01717         db_indx_t i;
01718         int done, level, ret, span, t_ret;
01719 
01720         span = 0;
01721         ret = 0;
01722         opd = NULL;
01723 
01724         dbp = dbc->dbp;
01725         dbmp = dbp->mpf;
01726         cp = (BTREE_CURSOR *)dbc->internal;
01727 
01728         for (i = 0; i <  NUM_ENT(pg); i++) {
01729                 bo = GET_BOVERFLOW(dbp, pg, i);
01730                 if (B_TYPE(bo->type) == B_KEYDATA)
01731                         continue;
01732                 c_data->compact_pages_examine++;
01733                 if (bo->pgno > c_data->compact_truncate) {
01734                         (*donep)++;
01735                         if (!have_lock) {
01736                                 if ((ret = __db_lget(dbc, 0, PGNO(pg),
01737                                      DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
01738                                         goto err;
01739                                 have_lock = 1;
01740                         }
01741                         if ((ret =
01742                              __bam_truncate_root_page(dbc, pg, i, c_data)) != 0)
01743                                 goto err;
01744                         /* Just in case it should move.  Could it? */
01745                         bo = GET_BOVERFLOW(dbp, pg, i);
01746                 }
01747 
01748                 if (B_TYPE(bo->type) == B_OVERFLOW) {
01749                         if ((ret = __bam_truncate_overflow(dbc, bo->pgno,
01750                              have_lock ? PGNO_INVALID : PGNO(pg), c_data)) != 0)
01751                                 goto err;
01752                         (*donep)++;
01753                         continue;
01754                 }
01755                 /*
01756                  * Take a peek at the root.  If it's a leaf then
01757                  * there is no tree here, avoid all the trouble.
01758                  */
01759                 if ((ret = __memp_fget(dbmp, &bo->pgno, 0, &dpg)) != 0)
01760                         goto err;
01761 
01762                 level = dpg->level;
01763                 if ((ret = __memp_fput(dbmp, dpg, 0)) != 0)
01764                         goto err;
01765                 if (level == LEAFLEVEL)
01766                         continue;
01767                 if ((ret = __db_c_newopd(dbc, bo->pgno, NULL, &opd)) != 0)
01768                         return (ret);
01769                 if (!have_lock) {
01770                         if ((ret = __db_lget(dbc, 0,
01771                              PGNO(pg), DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
01772                                 goto err;
01773                         have_lock = 1;
01774                 }
01775                 (*donep)++;
01776                 memset(&start, 0, sizeof(start));
01777                 do {
01778                         if ((ret = __bam_compact_int(opd, &start,
01779                              NULL, factor, &span, c_data, &done)) != 0)
01780                                 break;
01781                 } while (!done);
01782 
01783                 if (start.data != NULL)
01784                         __os_free(dbp->dbenv, start.data);
01785 
01786                 if (ret != 0)
01787                         goto err;
01788 
01789                 ret = __db_c_close(opd);
01790                 opd = NULL;
01791                 if (ret != 0)
01792                         goto err;
01793         }
01794 
01795 err:    if (opd != NULL && (t_ret = __db_c_close(opd)) != 0 && ret == 0)
01796                 ret = t_ret;
01797         return (ret);
01798 }
01799 
01800 /*
01801  * __bam_truncate_page -- swap a page with a lower numbered page.
01802  *      The cusor has a stack which includes at least the
01803  * immediate parent of this page.
01804  */
01805 static int
01806 __bam_truncate_page(dbc, pgp, update_parent)
01807         DBC *dbc;
01808         PAGE **pgp;
01809         int update_parent;
01810 {
01811         BTREE_CURSOR *cp;
01812         DB *dbp;
01813         DBT data, hdr, ind;
01814         DB_LSN lsn;
01815         EPG *epg;
01816         PAGE *newpage;
01817         db_pgno_t newpgno, *pgnop;
01818         int ret;
01819 
01820         dbp = dbc->dbp;
01821 
01822         /*
01823          * We want to free a page that lives in the part of the file that
01824          * can be truncated, so we're going to move it onto a free page
01825          * that is in the part of the file that need not be truncated.
01826          * Since the freelist is ordered now, we can simply call __db_new
01827          * which will grab the first element off the freelist; we know this
01828          * is the lowest numbered free page.
01829          */
01830 
01831         if ((ret = __db_new(dbc, P_DONTEXTEND | TYPE(*pgp), &newpage)) != 0)
01832                 return (ret);
01833 
01834         /*
01835          * If newpage is null then __db_new would have had to allocate
01836          * a new page from the filesystem, so there is no reason
01837          * to continue this action.
01838          */
01839         if (newpage == NULL)
01840                 return (0);
01841 
01842         /*
01843          * It is possible that a higher page is allocated if other threads
01844          * are allocating at the same time, if so, just put it back.
01845          */
01846         if (PGNO(newpage) > PGNO(*pgp)) {
01847                 /* Its unfortunate but you can't just free a new overflow. */
01848                 if (TYPE(newpage) == P_OVERFLOW)
01849                         OV_LEN(newpage) = 0;
01850                 return (__db_free(dbc, newpage));
01851         }
01852 
01853         /* Log if necessary. */
01854         if (DBC_LOGGING(dbc)) {
01855                 hdr.data = *pgp;
01856                 hdr.size = P_OVERHEAD(dbp);
01857                 if (TYPE(*pgp) == P_OVERFLOW) {
01858                         data.data = (u_int8_t *)*pgp + P_OVERHEAD(dbp);
01859                         data.size = OV_LEN(*pgp);
01860                         ind.size = 0;
01861                 } else {
01862                         data.data = (u_int8_t *)*pgp + HOFFSET(*pgp);
01863                         data.size = dbp->pgsize - HOFFSET(*pgp);
01864                         ind.data = P_INP(dbp, *pgp);
01865                         ind.size = NUM_ENT(*pgp) * sizeof(db_indx_t);
01866                 }
01867                 if ((ret = __bam_merge_log(dbp, dbc->txn,
01868                       &LSN(newpage), 0, PGNO(newpage), &LSN(newpage),
01869                       PGNO(*pgp), &LSN(*pgp), &hdr, &data, &ind)) != 0)
01870                         goto err;
01871         } else
01872                 LSN_NOT_LOGGED(LSN(newpage));
01873 
01874         newpgno = PGNO(newpage);
01875         lsn = LSN(newpage);
01876         memcpy(newpage, *pgp, dbp->pgsize);
01877         PGNO(newpage) = newpgno;
01878         LSN(newpage) = lsn;
01879 
01880         /* Empty the old page. */
01881         if (TYPE(*pgp) == P_OVERFLOW)
01882                 OV_LEN(*pgp) = 0;
01883         else {
01884                 HOFFSET(*pgp) = dbp->pgsize;
01885                 NUM_ENT(*pgp) = 0;
01886         }
01887         LSN(*pgp) = lsn;
01888 
01889         if ((ret = __memp_fset(dbp->mpf, newpage, DB_MPOOL_DIRTY)) != 0)
01890                 goto err;
01891 
01892         /* Update siblings. */
01893         switch (TYPE(newpage)) {
01894         case P_OVERFLOW:
01895         case P_LBTREE:
01896         case P_LRECNO:
01897         case P_LDUP:
01898                 if (NEXT_PGNO(newpage) == PGNO_INVALID &&
01899                     PREV_PGNO(newpage) == PGNO_INVALID)
01900                         break;
01901                 if ((ret = __bam_relink(dbc, *pgp, PGNO(newpage))) != 0)
01902                         goto err;
01903                 break;
01904         default:
01905                 break;
01906         }
01907         cp = (BTREE_CURSOR*)dbc->internal;
01908 
01909         /*
01910          * Now, if we free this page, it will get truncated, when we free
01911          * all the pages after it in the file.
01912          */
01913         ret = __db_free(dbc, *pgp);
01914         /* db_free always puts the page. */
01915         *pgp = newpage;
01916 
01917         if (ret != 0)
01918                 return (ret);
01919 
01920         if (!update_parent)
01921                 goto done;
01922 
01923         /* Update the parent. */
01924         epg = &cp->csp[-1];
01925         switch (TYPE(epg->page)) {
01926         case P_IBTREE:
01927                 pgnop = &GET_BINTERNAL(dbp, epg->page, epg->indx)->pgno;
01928                 break;
01929         case P_IRECNO:
01930                 pgnop = &GET_RINTERNAL(dbp, epg->page, epg->indx)->pgno;
01931                 break;
01932         default:
01933                 pgnop = &GET_BOVERFLOW(dbp, epg->page, epg->indx)->pgno;
01934                 break;
01935         }
01936         if (DBC_LOGGING(dbc)) {
01937                 if ((ret = __bam_pgno_log(dbp, dbc->txn, &LSN(epg->page),
01938                     0, PGNO(epg->page), &LSN(epg->page), (u_int32_t)epg->indx,
01939                     *pgnop, PGNO(newpage))) != 0)
01940                         return (ret);
01941         } else
01942                 LSN_NOT_LOGGED(LSN(epg->page));
01943 
01944         *pgnop = PGNO(newpage);
01945         cp->csp->page = newpage;
01946         if ((ret = __memp_fset(dbp->mpf, epg->page, DB_MPOOL_DIRTY)) != 0)
01947                 return (ret);
01948 
01949 done:   return (0);
01950 
01951 err:    (void)__memp_fput(dbp->mpf, newpage, 0);
01952         return (ret);
01953 }
01954 
01955 /*
01956  * __bam_truncate_overflow -- find overflow pages to truncate.
01957  *      Walk the pages of an overflow chain and swap out
01958  * high numbered pages.  We are passed the first page
01959  * but only deal with the second and subsequent pages.
01960  */
01961 
01962 static int
01963 __bam_truncate_overflow(dbc, pgno, pg_lock, c_data)
01964         DBC *dbc;
01965         db_pgno_t pgno;
01966         db_pgno_t pg_lock;
01967         DB_COMPACT *c_data;
01968 {
01969         DB *dbp;
01970         DB_LOCK lock;
01971         PAGE *page;
01972         int ret, t_ret;
01973 
01974         dbp = dbc->dbp;
01975         page = NULL;
01976         LOCK_INIT(lock);
01977 
01978         if ((ret = __memp_fget(dbp->mpf, &pgno, 0, &page)) != 0)
01979                 return (ret);
01980 
01981         while ((pgno = NEXT_PGNO(page)) != PGNO_INVALID) {
01982                 if ((ret = __memp_fput(dbp->mpf, page, 0)) != 0)
01983                         return (ret);
01984                 if ((ret = __memp_fget(dbp->mpf, &pgno, 0, &page)) != 0)
01985                         return (ret);
01986                 if (pgno <= c_data->compact_truncate)
01987                         continue;
01988                 if (pg_lock != PGNO_INVALID) {
01989                         if ((ret = __db_lget(dbc,
01990                              0, pg_lock, DB_LOCK_WRITE, 0, &lock)) != 0)
01991                                 break;
01992                         pg_lock = PGNO_INVALID;
01993                 }
01994                 if ((ret = __bam_truncate_page(dbc, &page, 0)) != 0)
01995                         break;
01996         }
01997 
01998         if (page != NULL &&
01999             (t_ret = __memp_fput(dbp->mpf, page, 0)) != 0 && ret == 0)
02000                 ret = t_ret;
02001         if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
02002                 ret = t_ret;
02003         return (ret);
02004 }
02005 
02006 /*
02007  * __bam_truncate_root_page -- swap a page which is
02008  *    the root of an off page dup tree or the head of an overflow.
02009  * The page is reference by the pg/indx passed in.
02010  */
02011 static int
02012 __bam_truncate_root_page(dbc, pg, indx, c_data)
02013         DBC *dbc;
02014         PAGE *pg;
02015         u_int32_t indx;
02016         DB_COMPACT *c_data;
02017 {
02018         BINTERNAL *bi;
02019         BOVERFLOW *bo;
02020         DB *dbp;
02021         DBT orig;
02022         PAGE *page;
02023         db_pgno_t newpgno, *pgnop;
02024         int ret, t_ret;
02025 
02026         COMPQUIET(c_data, NULL);
02027         COMPQUIET(bo, NULL);
02028         COMPQUIET(newpgno, PGNO_INVALID);
02029         dbp = dbc->dbp;
02030         page = NULL;
02031         if (TYPE(pg) == P_IBTREE) {
02032                 bi = GET_BINTERNAL(dbp, pg, indx);
02033                 if (B_TYPE(bi->type) == B_OVERFLOW) {
02034                         bo = (BOVERFLOW *)(bi->data);
02035                         pgnop = &bo->pgno;
02036                 } else
02037                         pgnop = &bi->pgno;
02038         } else {
02039                 bo = GET_BOVERFLOW(dbp, pg, indx);
02040                 pgnop = &bo->pgno;
02041         }
02042 
02043         if ((ret = __memp_fget(dbp->mpf, pgnop, 0, &page)) != 0)
02044                 goto err;
02045 
02046         /*
02047          * If this is a multiply reference overflow key, then we will just
02048          * copy it and decrement the reference count.  This is part of a
02049          * fix to get rid of multiple references.
02050          */
02051         if (TYPE(page) == P_OVERFLOW && OV_REF(page) > 1) {
02052                 if ((ret = __db_ovref(dbc, bo->pgno, -1)) != 0)
02053                         goto err;
02054                 memset(&orig, 0, sizeof(orig));
02055                 if ((ret = __db_goff(dbp, &orig,
02056                     bo->tlen, bo->pgno, &orig.data, &orig.size)) == 0)
02057                         ret = __db_poff(dbc, &orig, &newpgno);
02058                 if (orig.data != NULL)
02059                         __os_free(dbp->dbenv, orig.data);
02060                 if (ret != 0)
02061                         goto err;
02062         } else {
02063                 if ((ret = __bam_truncate_page(dbc, &page, 0)) != 0)
02064                         goto err;
02065                 newpgno = PGNO(page);
02066                 /* If we could not allocate from the free list, give up.*/
02067                 if (newpgno == *pgnop)
02068                         goto err;
02069         }
02070 
02071         /* Update the reference. */
02072         if (DBC_LOGGING(dbc)) {
02073                 if ((ret = __bam_pgno_log(dbp,
02074                      dbc->txn, &LSN(pg), 0, PGNO(pg),
02075                      &LSN(pg), (u_int32_t)indx, *pgnop, newpgno)) != 0)
02076                         goto err;
02077         } else
02078                 LSN_NOT_LOGGED(LSN(pg));
02079 
02080         *pgnop = newpgno;
02081         if ((ret = __memp_fset(dbp->mpf, pg, DB_MPOOL_DIRTY)) != 0)
02082                 goto err;
02083 
02084 err:    if (page != NULL && (t_ret =
02085               __memp_fput(dbp->mpf, page, DB_MPOOL_DIRTY)) != 0 && ret == 0)
02086                 ret = t_ret;
02087         return (ret);
02088 }
02089 
02090 /*
02091  * -- bam_truncate_internal_overflow -- find overflow keys
02092  *      on internal pages and if they have high page
02093  * numbers swap them with lower pages and truncate them.
02094  * Note that if there are overflow keys in the internal
02095  * nodes they will get copied adding pages to the database.
02096  */
02097 static int
02098 __bam_truncate_internal_overflow(dbc, page, c_data)
02099         DBC *dbc;
02100         PAGE *page;
02101         DB_COMPACT *c_data;
02102 {
02103         BINTERNAL *bi;
02104         BOVERFLOW *bo;
02105         db_indx_t indx;
02106         int ret;
02107 
02108         COMPQUIET(bo, NULL);
02109         ret = 0;
02110         for (indx = 0; indx < NUM_ENT(page); indx++) {
02111                 bi = GET_BINTERNAL(dbc->dbp, page, indx);
02112                 if (B_TYPE(bi->type) != B_OVERFLOW)
02113                         continue;
02114                 bo = (BOVERFLOW *)(bi->data);
02115                 if (bo->pgno > c_data->compact_truncate && (ret =
02116                      __bam_truncate_root_page(dbc, page, indx, c_data)) != 0)
02117                         break;
02118                 if ((ret = __bam_truncate_overflow(
02119                      dbc, bo->pgno, PGNO_INVALID, c_data)) != 0)
02120                         break;
02121         }
02122         return (ret);
02123 }
02124 
02125 #ifdef HAVE_FTRUNCATE
02126 /*
02127  * __bam_savekey -- save the key from an internal page.
02128  *  We need to save information so that we can
02129  * fetch then next internal node of the tree.  This means
02130  * we need the btree key on this current page, or the
02131  * next record number.
02132  */
02133 static int
02134 __bam_savekey(dbc, next, start)
02135         DBC *dbc;
02136         int next;
02137         DBT *start;
02138 {
02139         BINTERNAL *bi;
02140         BOVERFLOW *bo;
02141         BTREE_CURSOR *cp;
02142         DB *dbp;
02143         DB_ENV *dbenv;
02144         PAGE *pg;
02145         RINTERNAL *ri;
02146         db_indx_t indx, top;
02147 
02148         dbp = dbc->dbp;
02149         dbenv = dbp->dbenv;
02150         cp = (BTREE_CURSOR *)dbc->internal;
02151         pg = cp->csp->page;
02152 
02153         if (dbc->dbtype == DB_RECNO) {
02154                 if (next)
02155                         for (indx = 0, top = NUM_ENT(pg); indx != top; indx++) {
02156                                 ri = GET_RINTERNAL(dbp, pg, indx);
02157                                 cp->recno += ri->nrecs;
02158                         }
02159                 return (__db_retcopy(dbenv, start, &cp->recno,
02160                      sizeof(cp->recno), &start->data, &start->ulen));
02161 
02162         }
02163         bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
02164         if (B_TYPE(bi->type) == B_OVERFLOW) {
02165                 bo = (BOVERFLOW *)(bi->data);
02166                 return (__db_goff(dbp, start,
02167                      bo->tlen, bo->pgno, &start->data, &start->ulen));
02168         }
02169         return (__db_retcopy(dbenv,
02170              start, bi->data, bi->len,  &start->data, &start->ulen));
02171 }
02172 
02173 /*
02174  * bam_truncate_internal --
02175  *      Find high numbered pages in the internal nodes of a tree and
02176  *      swap them.
02177  */
02178 static int
02179 __bam_truncate_internal(dbp, txn, c_data)
02180         DB *dbp;
02181         DB_TXN *txn;
02182         DB_COMPACT *c_data;
02183 {
02184         BTREE_CURSOR *cp;
02185         DBC *dbc;
02186         DBT start;
02187         PAGE *pg;
02188         db_pgno_t pgno;
02189         u_int32_t sflag;
02190         int level, local_txn, ret, t_ret;
02191 
02192         dbc = NULL;
02193         memset(&start, 0, sizeof(start));
02194 
02195         if (IS_DB_AUTO_COMMIT(dbp, txn)) {
02196                 local_txn = 1;
02197                 txn = NULL;
02198         } else
02199                 local_txn = 0;
02200 
02201         level = LEAFLEVEL + 1;
02202         sflag = CS_READ | CS_GETRECNO;
02203 
02204 new_txn:
02205         if (local_txn && (ret = __txn_begin(dbp->dbenv, NULL, &txn, 0)) != 0)
02206                 goto err;
02207 
02208         if ((ret = __db_cursor(dbp, txn, &dbc, 0)) != 0)
02209                 goto err;
02210         cp = (BTREE_CURSOR *)dbc->internal;
02211 
02212         pgno = PGNO_INVALID;
02213         do {
02214                 if ((ret = __bam_csearch(dbc, &start, sflag, level)) != 0) {
02215                         /* No more at this level, go up one. */
02216                         if (ret == DB_NOTFOUND) {
02217                                 level++;
02218                                 if (start.data != NULL)
02219                                         __os_free(dbp->dbenv, start.data);
02220                                 memset(&start, 0, sizeof(start));
02221                                 sflag = CS_READ | CS_GETRECNO;
02222                                 continue;
02223                         }
02224                         goto err;
02225                 }
02226                 c_data->compact_pages_examine++;
02227 
02228                 pg = cp->csp->page;
02229                 pgno = PGNO(pg);
02230 
02231                 sflag = CS_NEXT | CS_GETRECNO;
02232                 /* Grab info about the page and drop the stack. */
02233                 if (pgno != cp->root && (ret = __bam_savekey(dbc,
02234                     pgno <= c_data->compact_truncate, &start)) != 0)
02235                         goto err;
02236 
02237                 if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
02238                         goto err;
02239                 if (pgno == cp->root)
02240                         break;
02241 
02242                 if (pgno <= c_data->compact_truncate)
02243                         continue;
02244 
02245                 /* Reget the page with a write lock, and its parent too. */
02246                 if ((ret = __bam_csearch(dbc,
02247                     &start, CS_PARENT | CS_GETRECNO, level)) != 0)
02248                         goto err;
02249                 pg = cp->csp->page;
02250                 pgno = PGNO(pg);
02251 
02252                 if (pgno > c_data->compact_truncate) {
02253                         if ((ret = __bam_truncate_page(dbc, &pg, 1)) != 0)
02254                                 goto err;
02255                 }
02256                 if ((ret = __bam_stkrel(dbc,
02257                      pgno > c_data->compact_truncate ? 0 : STK_NOLOCK)) != 0)
02258                         goto err;
02259 
02260                 /* We are locking subtrees, so drop the write locks asap. */
02261                 if (local_txn && pgno > c_data->compact_truncate)
02262                         break;
02263         } while (pgno != cp->root);
02264 
02265         if ((ret = __db_c_close(dbc)) != 0)
02266                 goto err;
02267         dbc = NULL;
02268         if (local_txn) {
02269                 if ((ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0)
02270                         goto err;
02271                 txn = NULL;
02272         }
02273         if (pgno != ((BTREE *)dbp->bt_internal)->bt_root)
02274                 goto new_txn;
02275 
02276 err:    if (dbc != NULL && (t_ret = __bam_stkrel(dbc, 0)) != 0 && ret == 0)
02277                 ret = t_ret;
02278         if (dbc != NULL && (t_ret = __db_c_close(dbc)) != 0 && ret == 0)
02279                 ret = t_ret;
02280         if (local_txn &&
02281             txn != NULL && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
02282                 ret = t_ret;
02283         if (start.data != NULL)
02284                 __os_free(dbp->dbenv, start.data);
02285         return (ret);
02286 }
02287 
02288 static int
02289 __bam_setup_freelist(dbp, list, nelems)
02290         DB *dbp;
02291         struct pglist *list;
02292         u_int32_t nelems;
02293 {
02294         DB_MPOOLFILE *mpf;
02295         db_pgno_t *plist;
02296         int ret;
02297 
02298         mpf = dbp->mpf;
02299 
02300         if ((ret = __memp_alloc_freelist(mpf, nelems, &plist)) != 0)
02301                 return (ret);
02302 
02303         while (nelems-- != 0)
02304                 *plist++ = list++->pgno;
02305 
02306         return (0);
02307 }
02308 
02309 static int
02310 __bam_free_freelist(dbp, txn)
02311         DB *dbp;
02312         DB_TXN *txn;
02313 {
02314         DBC *dbc;
02315         DB_LOCK lock;
02316         int ret, t_ret;
02317 
02318         LOCK_INIT(lock);
02319         ret = 0;
02320 
02321         /*
02322          * If we are not in a transaction then we need to get
02323          * a lock on the meta page, otherwise we should already
02324          * have the lock.
02325          */
02326 
02327         dbc = NULL;
02328         if (IS_DB_AUTO_COMMIT(dbp, txn)) {
02329                 /* Get a cursor so we can call __db_lget. */
02330                 if ((ret = __db_cursor(dbp, NULL, &dbc, 0)) != 0)
02331                         return (ret);
02332 
02333                 if ((ret = __db_lget(dbc,
02334                      0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &lock)) != 0)
02335                         goto err;
02336         }
02337 
02338         __memp_free_freelist(dbp->mpf);
02339 
02340 err:    if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
02341                 ret = t_ret;
02342 
02343         if (dbc != NULL && (t_ret = __db_c_close(dbc)) != 0 && ret == 0)
02344                 ret = t_ret;
02345 
02346         return (ret);
02347 }
02348 #endif

Generated on Sun Dec 25 12:14:12 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2