Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

db_meta.c

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  */
00007 /*
00008  * Copyright (c) 1990, 1993, 1994, 1995, 1996
00009  *      Keith Bostic.  All rights reserved.
00010  */
00011 /*
00012  * Copyright (c) 1990, 1993, 1994, 1995
00013  *      The Regents of the University of California.  All rights reserved.
00014  *
00015  * This code is derived from software contributed to Berkeley by
00016  * Mike Olson.
00017  *
00018  * Redistribution and use in source and binary forms, with or without
00019  * modification, are permitted provided that the following conditions
00020  * are met:
00021  * 1. Redistributions of source code must retain the above copyright
00022  *    notice, this list of conditions and the following disclaimer.
00023  * 2. Redistributions in binary form must reproduce the above copyright
00024  *    notice, this list of conditions and the following disclaimer in the
00025  *    documentation and/or other materials provided with the distribution.
00026  * 3. Neither the name of the University nor the names of its contributors
00027  *    may be used to endorse or promote products derived from this software
00028  *    without specific prior written permission.
00029  *
00030  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00031  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00032  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00033  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00034  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00035  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00036  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00037  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00038  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00039  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00040  * SUCH DAMAGE.
00041  *
00042  * $Id: db_meta.c,v 12.22 2005/10/27 01:46:34 bostic Exp $
00043  */
00044 
00045 #include "db_config.h"
00046 
00047 #ifndef NO_SYSTEM_INCLUDES
00048 #include <sys/types.h>
00049 
00050 #include <stdlib.h>
00051 #include <string.h>
00052 #endif
00053 
00054 #include "db_int.h"
00055 #include "dbinc/db_page.h"
00056 #include "dbinc/db_shash.h"
00057 #include "dbinc/lock.h"
00058 #include "dbinc/mp.h"
00059 #include "dbinc/db_am.h"
00060 
00061 static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
00062 #ifdef HAVE_FTRUNCATE
00063 static void __db_freelist_sort __P((struct pglist *, u_int32_t));
00064 static int  __db_pglistcmp __P((const void *, const void *));
00065 static int  __db_truncate_freelist __P((DBC *, DBMETA *,
00066       PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
00067 #endif
00068 
00069 /*
00070  * __db_init_meta --
00071  *      Helper function for __db_new that initializes the important fields in
00072  * a meta-data page (used instead of P_INIT).  We need to make sure that we
00073  * retain the page number and LSN of the existing page.
00074  */
00075 static void
00076 __db_init_meta(dbp, p, pgno, pgtype)
00077         DB *dbp;
00078         void *p;
00079         db_pgno_t pgno;
00080         u_int32_t pgtype;
00081 {
00082         DB_LSN save_lsn;
00083         DBMETA *meta;
00084 
00085         meta = (DBMETA *)p;
00086         save_lsn = meta->lsn;
00087         memset(meta, 0, sizeof(DBMETA));
00088         meta->lsn = save_lsn;
00089         meta->pagesize = dbp->pgsize;
00090         if (F_ISSET(dbp, DB_AM_CHKSUM))
00091                 FLD_SET(meta->metaflags, DBMETA_CHKSUM);
00092         meta->pgno = pgno;
00093         meta->type = (u_int8_t)pgtype;
00094 }
00095 
00096 /*
00097  * __db_new --
00098  *      Get a new page, preferably from the freelist.
00099  *
00100  * PUBLIC: int __db_new __P((DBC *, u_int32_t, PAGE **));
00101  */
00102 int
00103 __db_new(dbc, type, pagepp)
00104         DBC *dbc;
00105         u_int32_t type;
00106         PAGE **pagepp;
00107 {
00108         DBMETA *meta;
00109         DB *dbp;
00110         DB_LOCK metalock;
00111         DB_LSN lsn;
00112         DB_MPOOLFILE *mpf;
00113         PAGE *h;
00114         db_pgno_t last, *list, pgno, newnext;
00115         u_int32_t meta_flags;
00116         int extend, ret, t_ret;
00117 
00118         meta = NULL;
00119         meta_flags = 0;
00120         dbp = dbc->dbp;
00121         mpf = dbp->mpf;
00122         h = NULL;
00123         newnext = PGNO_INVALID;
00124 
00125         pgno = PGNO_BASE_MD;
00126         if ((ret = __db_lget(dbc,
00127             LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
00128                 goto err;
00129         if ((ret = __memp_fget(mpf, &pgno, 0, &meta)) != 0)
00130                 goto err;
00131         last = meta->last_pgno;
00132         if (meta->free == PGNO_INVALID) {
00133                 if (FLD_ISSET(type, P_DONTEXTEND)) {
00134                         *pagepp = NULL;
00135                         goto err;
00136                 }
00137                 last = pgno = meta->last_pgno + 1;
00138                 ZERO_LSN(lsn);
00139                 extend = 1;
00140         } else {
00141                 pgno = meta->free;
00142                 if ((ret = __memp_fget(mpf, &pgno, 0, &h)) != 0)
00143                         goto err;
00144 
00145                 /*
00146                  * We want to take the first page off the free list and
00147                  * then set meta->free to the that page's next_pgno, but
00148                  * we need to log the change first.
00149                  */
00150                 newnext = h->next_pgno;
00151                 lsn = h->lsn;
00152                 extend = 0;
00153         }
00154 
00155         FLD_CLR(type, P_DONTEXTEND);
00156 
00157         /*
00158          * Log the allocation before fetching the new page.  If we
00159          * don't have room in the log then we don't want to tell
00160          * mpool to extend the file.
00161          */
00162         if (DBC_LOGGING(dbc)) {
00163                 if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
00164                     &LSN(meta), PGNO_BASE_MD, &lsn,
00165                     pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
00166                         goto err;
00167         } else
00168                 LSN_NOT_LOGGED(LSN(meta));
00169 
00170         meta_flags = DB_MPOOL_DIRTY;
00171         meta->free = newnext;
00172 
00173         if (extend == 1) {
00174                 if ((ret = __memp_fget(mpf, &pgno, DB_MPOOL_NEW, &h)) != 0)
00175                         goto err;
00176                 DB_ASSERT(last == pgno);
00177                 meta->last_pgno = pgno;
00178                 ZERO_LSN(h->lsn);
00179                 h->pgno = pgno;
00180         }
00181         LSN(h) = LSN(meta);
00182 
00183         DB_ASSERT(TYPE(h) == P_INVALID);
00184 
00185         if (TYPE(h) != P_INVALID)
00186                 return (__db_panic(dbp->dbenv, EINVAL));
00187 
00188         ret = __memp_fput(mpf, (PAGE *)meta, DB_MPOOL_DIRTY);
00189         meta = NULL;
00190         if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
00191                 ret = t_ret;
00192         if (ret != 0)
00193                 goto err;
00194 
00195         switch (type) {
00196                 case P_BTREEMETA:
00197                 case P_HASHMETA:
00198                 case P_QAMMETA:
00199                         __db_init_meta(dbp, h, h->pgno, type);
00200                         break;
00201                 default:
00202                         P_INIT(h, dbp->pgsize,
00203                             h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
00204                         break;
00205         }
00206 
00207         /* Fix up the sorted free list if necessary. */
00208 #ifdef HAVE_FTRUNCATE
00209         if (extend == 0) {
00210                 u_int32_t nelems = 0;
00211 
00212                 if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
00213                         goto err;
00214                 if (nelems != 0) {
00215                         DB_ASSERT(h->pgno == list[0]);
00216                         memmove(list, &list[1], (nelems - 1) * sizeof(*list));
00217                         if ((ret = __memp_extend_freelist(
00218                             dbp->mpf, nelems - 1, &list)) != 0)
00219                                 goto err;
00220                 }
00221         }
00222 #else
00223         COMPQUIET(list, NULL);
00224 #endif
00225 
00226         /*
00227          * If dirty reads are enabled and we are in a transaction, we could
00228          * abort this allocation after the page(s) pointing to this
00229          * one have their locks downgraded.  This would permit dirty readers
00230          * to access this page which is ok, but they must be off the
00231          * page when we abort.  We never lock overflow pages or off page
00232          * duplicate trees.
00233          */
00234         if (type != P_OVERFLOW && !F_ISSET(dbc, DBC_OPD) &&
00235              F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) && dbc->txn != NULL) {
00236                 if ((ret = __db_lget(dbc, 0,
00237                     h->pgno, DB_LOCK_WWRITE, 0, &metalock)) != 0)
00238                         goto err;
00239         }
00240 
00241         *pagepp = h;
00242         return (0);
00243 
00244 err:    if (h != NULL)
00245                 (void)__memp_fput(mpf, h, 0);
00246         if (meta != NULL)
00247                 (void)__memp_fput(mpf, meta, meta_flags);
00248         (void)__TLPUT(dbc, metalock);
00249         return (ret);
00250 }
00251 
00252 /*
00253  * __db_free --
00254  *      Add a page to the head of the freelist.
00255  *
00256  * PUBLIC: int __db_free __P((DBC *, PAGE *));
00257  */
00258 int
00259 __db_free(dbc, h)
00260         DBC *dbc;
00261         PAGE *h;
00262 {
00263         DBMETA *meta;
00264         DB *dbp;
00265         DBT ddbt, ldbt;
00266         DB_LOCK metalock;
00267         DB_MPOOLFILE *mpf;
00268         db_pgno_t last_pgno, *lp, next_pgno, pgno, prev_pgno;
00269         u_int32_t dirty_flag, lflag, nelem;
00270         int do_truncate, ret, t_ret;
00271 #ifdef HAVE_FTRUNCATE
00272         db_pgno_t *list;
00273         u_int32_t position, start;
00274 #endif
00275 
00276         dbp = dbc->dbp;
00277         mpf = dbp->mpf;
00278         prev_pgno = PGNO_INVALID;
00279         nelem = 0;
00280         meta = NULL;
00281         do_truncate = 0;
00282         lp = NULL;
00283 
00284         /*
00285          * Retrieve the metadata page.  If we are not keeping a sorted
00286          * free list put the page at the head of the the free list.
00287          * If we are keeping a sorted free list, for truncation,
00288          * then figure out where this page belongs and either
00289          * link it in or truncate the file as much as possible.
00290          * If either the lock get or page get routines
00291          * fail, then we need to put the page with which we were called
00292          * back because our caller assumes we take care of it.
00293          */
00294         dirty_flag = 0;
00295         pgno = PGNO_BASE_MD;
00296         if ((ret = __db_lget(dbc,
00297             LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
00298                 goto err;
00299         if ((ret = __memp_fget(mpf, &pgno, 0, &meta)) != 0)
00300                 goto err1;
00301 
00302         last_pgno = meta->last_pgno;
00303         next_pgno = meta->free;
00304 
00305         DB_ASSERT(h->pgno != next_pgno);
00306 
00307 #ifdef HAVE_FTRUNCATE
00308         /*
00309          * If we are maintaining a sorted free list see if we either have a
00310          * new truncation point or the page goes somewhere in the middle of
00311          * the list.  If it goes in the middle of the list, we will drop the
00312          * meta page and get the previous page.
00313          */
00314         if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
00315                 goto err;
00316         if (list == NULL)
00317                 goto no_sort;
00318 
00319         if (h->pgno != last_pgno) {
00320                 /*
00321                  * Put the page number in the sorted list.
00322                  * Finds its position and the previous page,
00323                  * extend the list, make room and insert.
00324                  */
00325                 position = 0;
00326                 if (nelem != 0) {
00327                         __db_freelist_pos(h->pgno, list, nelem, &position);
00328 
00329                         DB_ASSERT(h->pgno != list[position]);
00330 
00331                         /* Get the previous page if this is not the smallest. */
00332                         if (position != 0 || h->pgno > list[0])
00333                                 prev_pgno = list[position];
00334                 }
00335 
00336                 /* Put the page number into the list. */
00337                 if ((ret = __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
00338                         return (ret);
00339                 if (prev_pgno != PGNO_INVALID)
00340                         lp = &list[position + 1];
00341                 else
00342                         lp = list;
00343                 if (nelem != 0 && position != nelem)
00344                         memmove(lp + 1, lp,
00345                             (size_t)((u_int8_t*)&list[nelem] - (u_int8_t*)lp));
00346                 *lp = h->pgno;
00347         } else if (nelem != 0) {
00348                 /* Find the truncation point. */
00349                 for (lp = &list[nelem - 1]; lp >= list; lp--)
00350                         if (--last_pgno != *lp)
00351                                 break;
00352                 if (lp < list || last_pgno < h->pgno - 1)
00353                         do_truncate = 1;
00354                 last_pgno = meta->last_pgno;
00355         }
00356 
00357 no_sort:
00358         if (prev_pgno != PGNO_INVALID) {
00359                 if ((ret = __memp_fput(mpf, meta, 0)) != 0)
00360                         goto err1;
00361                 meta = NULL;
00362                 pgno = prev_pgno;
00363                 if ((ret = __memp_fget(mpf, &pgno, 0, &meta)) != 0)
00364                         goto err1;
00365                 next_pgno = NEXT_PGNO(meta);
00366         }
00367 #endif
00368 
00369         /* Log the change. */
00370         if (DBC_LOGGING(dbc)) {
00371                 memset(&ldbt, 0, sizeof(ldbt));
00372                 ldbt.data = h;
00373                 ldbt.size = P_OVERHEAD(dbp);
00374                 switch (h->type) {
00375                 case P_HASH:
00376                 case P_IBTREE:
00377                 case P_IRECNO:
00378                 case P_LBTREE:
00379                 case P_LRECNO:
00380                 case P_LDUP:
00381                         if (h->entries > 0) {
00382                                 ldbt.size += h->entries * sizeof(db_indx_t);
00383                                 ddbt.data = (u_int8_t *)h + HOFFSET(h);
00384                                 ddbt.size = dbp->pgsize - HOFFSET(h);
00385                                 if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
00386                                      &LSN(meta), 0, h->pgno, &LSN(meta), pgno,
00387                                      &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
00388                                         goto err1;
00389                                 goto logged;
00390                         }
00391                         break;
00392                 case P_HASHMETA:
00393                         ldbt.size = sizeof(HMETA);
00394                         break;
00395                 case P_BTREEMETA:
00396                         ldbt.size = sizeof(BTMETA);
00397                         break;
00398                 case P_OVERFLOW:
00399                         ldbt.size += OV_LEN(h);
00400                         break;
00401                 default:
00402                         DB_ASSERT(h->type != P_QAMDATA);
00403                 }
00404 
00405                 /*
00406                  * If we are truncating the file, we need to make sure
00407                  * the logging happens before the truncation.  If we
00408                  * are truncating multiple pages we don't need to flush the
00409                  * log here as it will be flushed by __db_truncate_freelist.
00410                  */
00411                 lflag = 0;
00412 #ifdef HAVE_FTRUNCATE
00413                 if (do_truncate == 0 && h->pgno == last_pgno)
00414                         lflag = DB_FLUSH;
00415 #endif
00416                 if ((ret = __db_pg_free_log(dbp,
00417                       dbc->txn, &LSN(meta), lflag, h->pgno,
00418                       &LSN(meta), pgno, &ldbt, next_pgno, last_pgno)) != 0)
00419                         goto err1;
00420         } else
00421                 LSN_NOT_LOGGED(LSN(meta));
00422 logged: LSN(h) = LSN(meta);
00423 
00424 #ifdef HAVE_FTRUNCATE
00425         if (do_truncate) {
00426                 start = (u_int32_t) (lp - list) + 1;
00427                 meta->last_pgno--;
00428                 ret = __db_truncate_freelist(
00429                       dbc, meta, h, list, start, nelem);
00430                 h = NULL;
00431         } else if (h->pgno == last_pgno) {
00432                 if ((ret = __memp_fput(mpf, h, DB_MPOOL_DISCARD)) != 0)
00433                         goto err;
00434                 /* Give the page back to the OS. */
00435                 if ((ret = __memp_ftruncate(mpf, last_pgno, 0)) != 0)
00436                         goto err;
00437                 DB_ASSERT(meta->pgno == PGNO_BASE_MD);
00438                 meta->last_pgno--;
00439                 h = NULL;
00440         } else
00441 #endif
00442 
00443         {
00444                 /*
00445                  * If we are not truncating the page then we
00446                  * reinitialize it and put it at the head of
00447                  * the free list.
00448                  */
00449                 P_INIT(h, dbp->pgsize,
00450                     h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
00451 #ifdef DIAGNOSTIC
00452                 memset((u_int8_t *) h + P_OVERHEAD(dbp),
00453                     CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
00454 #endif
00455                 if (prev_pgno == PGNO_INVALID)
00456                         meta->free = h->pgno;
00457                 else
00458                         NEXT_PGNO(meta) = h->pgno;
00459         }
00460 
00461         /* Discard the metadata or previous page. */
00462 err1:   if (meta != NULL && (t_ret =
00463             __memp_fput(mpf, (PAGE *)meta, DB_MPOOL_DIRTY)) != 0 && ret == 0)
00464                 ret = t_ret;
00465         if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
00466                 ret = t_ret;
00467 
00468         /* Discard the caller's page reference. */
00469         dirty_flag = DB_MPOOL_DIRTY;
00470 err:    if (h != NULL &&
00471             (t_ret = __memp_fput(mpf, h, dirty_flag)) != 0 && ret == 0)
00472                 ret = t_ret;
00473 
00474         /*
00475          * XXX
00476          * We have to unlock the caller's page in the caller!
00477          */
00478         return (ret);
00479 }
00480 
00481 #ifdef HAVE_FTRUNCATE
00482 /*
00483  * __db_freelist_pos -- find the position of a page in the freelist.
00484  *      The list is sorted, we do a binary search.
00485  *
00486  * PUBLIC: #ifdef HAVE_FTRUNCATE
00487  * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
00488  * PUBLIC:       db_pgno_t *, u_int32_t, u_int32_t *));
00489  * PUBLIC: #endif
00490  */
00491 void
00492 __db_freelist_pos(pgno, list, nelem, posp)
00493         db_pgno_t pgno;
00494         db_pgno_t *list;
00495         u_int32_t nelem;
00496         u_int32_t *posp;
00497 {
00498         u_int32_t base, indx, lim;
00499 
00500         indx = 0;
00501         for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
00502                 indx = base + (lim >> 1);
00503                 if (pgno == list[indx]) {
00504                         *posp = indx;
00505                         return;
00506                 }
00507                 if (pgno > list[indx]) {
00508                         base = indx + 1;
00509                         --lim;
00510                 }
00511         }
00512         if (base != 0)
00513                 base--;
00514         *posp = base;
00515         return;
00516 }
00517 
00518 static int
00519 __db_pglistcmp(a, b)
00520         const void *a, *b;
00521 {
00522         struct pglist *ap, *bp;
00523 
00524         ap = (struct pglist *)a;
00525         bp = (struct pglist *)b;
00526 
00527         return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
00528 }
00529 
00530 /*
00531  * __db_freelist_sort -- sort a list of free pages.
00532  */
00533 static void
00534 __db_freelist_sort(list, nelems)
00535         struct pglist *list;
00536         u_int32_t nelems;
00537 {
00538         qsort(list, (size_t)nelems, sizeof(struct pglist), __db_pglistcmp);
00539 }
00540 
00541 /*
00542  * __db_pg_truncate -- sort the freelist and find the truncation point.
00543  *
00544  * PUBLIC: #ifdef HAVE_FTRUNCATE
00545  * PUBLIC: int __db_pg_truncate __P((DB_MPOOLFILE *, struct pglist *list,
00546  * PUBLIC:    DB_COMPACT *, u_int32_t *, db_pgno_t *, DB_LSN *, int));
00547  * PUBLIC: #endif
00548  */
00549 int
00550 __db_pg_truncate(mpf, list, c_data, nelemp, last_pgno, lsnp, in_recovery)
00551         DB_MPOOLFILE *mpf;
00552         struct pglist *list;
00553         DB_COMPACT *c_data;
00554         u_int32_t *nelemp;
00555         db_pgno_t *last_pgno;
00556         DB_LSN *lsnp;
00557         int in_recovery;
00558 {
00559         PAGE *h;
00560         struct pglist *lp;
00561         db_pgno_t pgno;
00562         u_int32_t nelems;
00563         int modified, ret;
00564 
00565         ret = 0;
00566 
00567         nelems = *nelemp;
00568         /* Sort the list */
00569         __db_freelist_sort(list, nelems);
00570 
00571         /* Find the truncation point. */
00572         pgno = *last_pgno;
00573         lp = &list[nelems - 1];
00574         while (nelems != 0) {
00575                 if (lp->pgno != pgno)
00576                         break;
00577                 pgno--;
00578                 nelems--;
00579                 lp--;
00580         }
00581 
00582         /*
00583          * Figure out what (if any) pages can be truncated immediately and
00584          * record the place from which we can truncate, so we can do the
00585          * memp_ftruncate below.  We also use this to avoid ever putting
00586          * these pages on the freelist, which we are about to relink.
00587          */
00588         for (lp = list; lp < &list[nelems]; lp++) {
00589                 if ((ret = __memp_fget(mpf, &lp->pgno, 0, &h)) != 0) {
00590                         /* Page may have been truncated later. */
00591                         if (in_recovery && ret == DB_PAGE_NOTFOUND) {
00592                                 ret = 0;
00593                                 continue;
00594                         }
00595                         goto err;
00596                 }
00597                 modified = 0;
00598                 if (!in_recovery || log_compare(&LSN(h), &lp->lsn) == 0) {
00599                         if (lp == &list[nelems - 1])
00600                                 NEXT_PGNO(h) = PGNO_INVALID;
00601                         else
00602                                 NEXT_PGNO(h) = lp[1].pgno;
00603                         DB_ASSERT(NEXT_PGNO(h) < *last_pgno);
00604 
00605                         LSN(h) = *lsnp;
00606                         modified = 1;
00607                 }
00608                 if ((ret = __memp_fput(mpf, h,
00609                     modified ? DB_MPOOL_DIRTY : 0)) != 0)
00610                         goto err;
00611         }
00612 
00613         if (pgno != *last_pgno) {
00614                 if ((ret = __memp_ftruncate(mpf,
00615                     pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
00616                         goto err;
00617                 if (c_data)
00618                         c_data->compact_pages_truncated += *last_pgno - pgno;
00619                 *last_pgno = pgno;
00620         }
00621         *nelemp = nelems;
00622 
00623 err:    return (ret);
00624 }
00625 
00626 /*
00627  * __db_free_truncate --
00628  *      Truncate free pages at the end of the file.
00629  *
00630  * PUBLIC: #ifdef HAVE_FTRUNCATE
00631  * PUBLIC: int __db_free_truncate __P((DB *, DB_TXN *, u_int32_t,
00632  * PUBLIC:    DB_COMPACT *, struct pglist **, u_int32_t *, db_pgno_t *));
00633  * PUBLIC: #endif
00634  */
00635 int
00636 __db_free_truncate(dbp, txn, flags, c_data, listp, nelemp, last_pgnop)
00637         DB *dbp;
00638         DB_TXN *txn;
00639         u_int32_t flags;
00640         DB_COMPACT *c_data;
00641         struct pglist **listp;
00642         u_int32_t *nelemp;
00643         db_pgno_t *last_pgnop;
00644 {
00645         DBC *dbc;
00646         DB_ENV *dbenv;
00647         DBMETA *meta;
00648         DBT ddbt;
00649         DB_LOCK metalock;
00650         DB_LSN null_lsn;
00651         DB_MPOOLFILE *mpf;
00652         PAGE *h;
00653         db_pgno_t pgno;
00654         u_int32_t nelems;
00655         struct pglist *list, *lp;
00656         int ret, t_ret;
00657         size_t size;
00658 
00659         COMPQUIET(flags, 0);
00660         list = NULL;
00661         meta = NULL;
00662         dbenv = dbp->dbenv;
00663         mpf = dbp->mpf;
00664         h = NULL;
00665         nelems = 0;
00666         if (listp != NULL) {
00667                 *listp = NULL;
00668                 DB_ASSERT(nelemp != NULL);
00669                 *nelemp = 0;
00670         }
00671 
00672         if ((ret = __db_cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
00673                 return (ret);
00674 
00675         pgno = PGNO_BASE_MD;
00676         if ((ret = __db_lget(dbc,
00677             LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
00678                 goto err;
00679         if ((ret = __memp_fget(mpf, &pgno, 0, &meta)) != 0)
00680                 goto err;
00681 
00682         if (last_pgnop != NULL)
00683                 *last_pgnop = meta->last_pgno;
00684         if ((pgno = meta->free) == PGNO_INVALID)
00685                 goto done;
00686 
00687         size = 128;
00688         if ((ret = __os_malloc(dbenv, size * sizeof(*list), &list)) != 0)
00689                 goto err;
00690         lp = list;
00691 
00692         do {
00693                 if (lp == &list[size]) {
00694                         size *= 2;
00695                         if ((ret = __os_realloc(dbenv,
00696                             size * sizeof(*list), &list)) != 0)
00697                                 goto err;
00698                         lp = &list[size / 2];
00699                 }
00700                 if ((ret = __memp_fget(mpf, &pgno, 0, &h)) != 0)
00701                         goto err;
00702 
00703                 lp->pgno = pgno;
00704                 lp->lsn = LSN(h);
00705                 pgno = NEXT_PGNO(h);
00706                 if ((ret = __memp_fput(mpf, h, 0)) != 0)
00707                         goto err;
00708                 lp++;
00709         } while (pgno != PGNO_INVALID);
00710         nelems = (u_int32_t)(lp - list);
00711 
00712         /* Log the current state of the free list */
00713         if (DBC_LOGGING(dbc)) {
00714                 ddbt.data = list;
00715                 ddbt.size = nelems * sizeof(*lp);
00716                 ZERO_LSN(null_lsn);
00717                 if ((ret = __db_pg_sort_log(dbp,
00718                      dbc->txn, &LSN(meta), DB_FLUSH, PGNO_BASE_MD, &LSN(meta),
00719                      PGNO_INVALID, &null_lsn, meta->last_pgno, &ddbt)) != 0)
00720                         goto err;
00721         } else
00722                 LSN_NOT_LOGGED(LSN(meta));
00723 
00724         if ((ret = __db_pg_truncate(mpf, list, c_data,
00725             &nelems, &meta->last_pgno, &LSN(meta), 0)) != 0)
00726                 goto err;
00727 
00728         if (nelems == 0)
00729                 meta->free = PGNO_INVALID;
00730         else
00731                 meta->free = list[0].pgno;
00732 
00733 done:   if (last_pgnop != NULL)
00734                 *last_pgnop = meta->last_pgno;
00735 
00736         /*
00737          * The truncate point is the number of pages in the free
00738          * list back from the last page.  The number of pages
00739          * in the free list are the number that we can swap in.
00740          */
00741         if (c_data)
00742                 c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
00743 
00744         if (nelems != 0 && listp != NULL) {
00745                 *listp = list;
00746                 *nelemp = nelems;
00747                 list = NULL;
00748         }
00749 
00750 err:    if (list != NULL)
00751                 __os_free(dbenv, list);
00752         if (meta != NULL && (t_ret =
00753              __memp_fput(mpf, (PAGE *)meta, DB_MPOOL_DIRTY)) != 0 && ret == 0)
00754                 ret = t_ret;
00755         if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
00756                 ret = t_ret;
00757         if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
00758                 ret = t_ret;
00759         return (ret);
00760 }
00761 
00762 static int
00763 __db_truncate_freelist(dbc, meta, h, list, start, nelem)
00764         DBC *dbc;
00765         DBMETA *meta;
00766         PAGE *h;
00767         db_pgno_t *list;
00768         u_int32_t start, nelem;
00769 {
00770         DB *dbp;
00771         DB_LSN null_lsn;
00772         DB_MPOOLFILE *mpf;
00773         DBT ddbt;
00774         PAGE *last_free, *pg;
00775         db_pgno_t *lp;
00776         struct pglist *plist, *pp;
00777         int ret;
00778 
00779         dbp = dbc->dbp;
00780         mpf = dbp->mpf;
00781         plist = NULL;
00782         last_free = NULL;
00783 
00784         if (start != 0 &&
00785             (ret = __memp_fget(mpf, &list[start - 1], 0, &last_free)) != 0)
00786                 goto err;
00787 
00788         if (DBC_LOGGING(dbc)) {
00789                 if ((ret = __os_malloc(dbp->dbenv,
00790                      (nelem - start) * sizeof(*pp), &plist)) != 0)
00791                         goto err;
00792 
00793                 pp = plist;
00794                 for (lp = &list[start]; lp < &list[nelem]; lp++) {
00795                         pp->pgno = *lp;
00796                         if ((ret = __memp_fget(mpf, lp, 0, &pg)) != 0)
00797                                 goto err;
00798                         pp->lsn = LSN(pg);
00799                         if ((ret = __memp_fput(mpf, pg, DB_MPOOL_DISCARD)) != 0)
00800                                 goto err;
00801                         pp++;
00802                 }
00803                 ddbt.data = plist;
00804                 ddbt.size = (nelem - start) * sizeof(*pp);
00805                 ZERO_LSN(null_lsn);
00806                 if (last_free != NULL) {
00807                         if ((ret = __db_pg_sort_log(dbp, dbc->txn, &LSN(meta),
00808                              DB_FLUSH, PGNO(meta), &LSN(meta), PGNO(last_free),
00809                              &LSN(last_free), meta->last_pgno, &ddbt)) != 0)
00810                                 goto err;
00811                 } else if ((ret = __db_pg_sort_log(dbp, dbc->txn,
00812                      &LSN(meta), DB_FLUSH, PGNO(meta), &LSN(meta),
00813                      PGNO_INVALID, &null_lsn, meta->last_pgno, &ddbt)) != 0)
00814                         goto err;
00815         } else
00816                 LSN_NOT_LOGGED(LSN(meta));
00817         if (last_free != NULL)
00818                 LSN(last_free) = LSN(meta);
00819 
00820         if ((ret = __memp_fput(mpf, h, DB_MPOOL_DISCARD)) != 0)
00821                 goto err;
00822         h = NULL;
00823         if ((ret = __memp_ftruncate(mpf, list[start], 0)) != 0)
00824                 goto err;
00825         meta->last_pgno = list[start] - 1;
00826 
00827         if (start == 0)
00828                 meta->free = PGNO_INVALID;
00829         else {
00830                 NEXT_PGNO(last_free) = PGNO_INVALID;
00831                 if ((ret = __memp_fput(mpf, last_free, DB_MPOOL_DIRTY)) != 0)
00832                         goto err;
00833                 last_free = NULL;
00834         }
00835 
00836         /* Shrink the number of elements in the list. */
00837         ret = __memp_extend_freelist(mpf, start, &list);
00838 
00839 err:    if (plist != NULL)
00840                 __os_free(dbp->dbenv, plist);
00841 
00842         /* We need to put the page on error. */
00843         if (h != NULL)
00844                 (void)__memp_fput(mpf, h, 0);
00845         if (last_free != NULL)
00846                 (void)__memp_fput(mpf, last_free, 0);
00847 
00848         return (ret);
00849 }
00850 #endif
00851 
00852 #ifdef DEBUG
00853 /*
00854  * __db_lprint --
00855  *      Print out the list of locks currently held by a cursor.
00856  *
00857  * PUBLIC: int __db_lprint __P((DBC *));
00858  */
00859 int
00860 __db_lprint(dbc)
00861         DBC *dbc;
00862 {
00863         DB_ENV *dbenv;
00864         DB *dbp;
00865         DB_LOCKREQ req;
00866 
00867         dbp = dbc->dbp;
00868         dbenv = dbp->dbenv;
00869 
00870         if (LOCKING_ON(dbenv)) {
00871                 req.op = DB_LOCK_DUMP;
00872                 (void)__lock_vec(dbenv, dbc->locker, 0, &req, 1, NULL);
00873         }
00874         return (0);
00875 }
00876 #endif
00877 
00878 /*
00879  * __db_lget --
00880  *      The standard lock get call.
00881  *
00882  * PUBLIC: int __db_lget __P((DBC *,
00883  * PUBLIC:     int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
00884  */
00885 int
00886 __db_lget(dbc, action, pgno, mode, lkflags, lockp)
00887         DBC *dbc;
00888         int action;
00889         db_pgno_t pgno;
00890         db_lockmode_t mode;
00891         u_int32_t lkflags;
00892         DB_LOCK *lockp;
00893 {
00894         DB *dbp;
00895         DB_ENV *dbenv;
00896         DB_LOCKREQ couple[3], *reqp;
00897         DB_TXN *txn;
00898         int has_timeout, i, ret;
00899 
00900         dbp = dbc->dbp;
00901         dbenv = dbp->dbenv;
00902         txn = dbc->txn;
00903 
00904         /*
00905          * We do not always check if we're configured for locking before
00906          * calling __db_lget to acquire the lock.
00907          */
00908         if (CDB_LOCKING(dbenv) ||
00909             !LOCKING_ON(dbenv) || F_ISSET(dbc, DBC_COMPENSATE) ||
00910             (F_ISSET(dbc, DBC_RECOVER) &&
00911             (action != LCK_ROLLBACK || IS_REP_CLIENT(dbenv))) ||
00912             (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
00913                 LOCK_INIT(*lockp);
00914                 return (0);
00915         }
00916 
00917         dbc->lock.pgno = pgno;
00918         if (lkflags & DB_LOCK_RECORD)
00919                 dbc->lock.type = DB_RECORD_LOCK;
00920         else
00921                 dbc->lock.type = DB_PAGE_LOCK;
00922         lkflags &= ~DB_LOCK_RECORD;
00923         if (action == LCK_ROLLBACK)
00924                 lkflags |= DB_LOCK_ABORT;
00925 
00926         /*
00927          * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
00928          * pass that along to the lock call.
00929          */
00930         if (DB_NONBLOCK(dbc))
00931                 lkflags |= DB_LOCK_NOWAIT;
00932 
00933         if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
00934                 mode = DB_LOCK_READ_UNCOMMITTED;
00935 
00936         has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
00937             (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));
00938 
00939         /*
00940          * Transactional locking.
00941          * Hold on to the previous read lock only if we are in full isolation.
00942          * COUPLE_ALWAYS indicates we are holding an interior node which need
00943          *      not be isolated.
00944          * Downgrade write locks if we are supporting dirty readers.
00945          */
00946         if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
00947             !LOCK_ISSET(*lockp))
00948                 action = 0;
00949         else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
00950                 action = LCK_COUPLE;
00951         else if (F_ISSET(dbc,
00952             DBC_READ_COMMITTED) && lockp->mode == DB_LOCK_READ)
00953                 action = LCK_COUPLE;
00954         else if (F_ISSET(dbc,
00955             DBC_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_READ_UNCOMMITTED)
00956                 action = LCK_COUPLE;
00957         else if (F_ISSET(dbc->dbp,
00958             DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
00959                 action = LCK_DOWNGRADE;
00960         else
00961                 action = 0;
00962 
00963         i = 0;
00964         switch (action) {
00965         default:
00966                 if (has_timeout)
00967                         goto couple;
00968                 ret = __lock_get(dbenv,
00969                     dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
00970                 break;
00971 
00972         case LCK_DOWNGRADE:
00973                 couple[0].op = DB_LOCK_GET;
00974                 couple[0].obj = NULL;
00975                 couple[0].lock = *lockp;
00976                 couple[0].mode = DB_LOCK_WWRITE;
00977                 UMRW_SET(couple[0].timeout);
00978                 i++;
00979                 /* FALLTHROUGH */
00980         case LCK_COUPLE:
00981 couple:         couple[i].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
00982                 couple[i].obj = &dbc->lock_dbt;
00983                 couple[i].mode = mode;
00984                 UMRW_SET(couple[i].timeout);
00985                 i++;
00986                 if (has_timeout)
00987                         couple[0].timeout =
00988                              F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
00989                 if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
00990                         couple[i].op = DB_LOCK_PUT;
00991                         couple[i].lock = *lockp;
00992                         i++;
00993                 }
00994 
00995                 ret = __lock_vec(dbenv,
00996                     dbc->locker, lkflags, couple, i, &reqp);
00997                 if (ret == 0 || reqp == &couple[i - 1])
00998                         *lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
00999                 break;
01000         }
01001 
01002         if (txn != NULL && ret == DB_LOCK_DEADLOCK)
01003                 F_SET(txn, TXN_DEADLOCK);
01004         return ((ret == DB_LOCK_NOTGRANTED &&
01005              !F_ISSET(dbenv, DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
01006 }
01007 
01008 /*
01009  * __db_lput --
01010  *      The standard lock put call.
01011  *
01012  * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
01013  */
01014 int
01015 __db_lput(dbc, lockp)
01016         DBC *dbc;
01017         DB_LOCK *lockp;
01018 {
01019         DB_ENV *dbenv;
01020         DB_LOCKREQ couple[2], *reqp;
01021         int action, ret;
01022 
01023         /*
01024          * Transactional locking.
01025          * Hold on to the read locks only if we are in full isolation.
01026          * Downgrade write locks if we are supporting dirty readers.
01027          */
01028         if (F_ISSET(dbc->dbp,
01029             DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
01030                 action = LCK_DOWNGRADE;
01031         else if (dbc->txn == NULL)
01032                 action = LCK_COUPLE;
01033         else if (F_ISSET(dbc,
01034             DBC_READ_COMMITTED) && lockp->mode == DB_LOCK_READ)
01035                 action = LCK_COUPLE;
01036         else if (F_ISSET(dbc,
01037             DBC_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_READ_UNCOMMITTED)
01038                 action = LCK_COUPLE;
01039         else
01040                 action = 0;
01041 
01042         dbenv = dbc->dbp->dbenv;
01043         switch (action) {
01044         case LCK_COUPLE:
01045                 ret = __lock_put(dbenv, lockp);
01046                 break;
01047         case LCK_DOWNGRADE:
01048                 couple[0].op = DB_LOCK_GET;
01049                 couple[0].obj = NULL;
01050                 couple[0].mode = DB_LOCK_WWRITE;
01051                 couple[0].lock = *lockp;
01052                 UMRW_SET(couple[0].timeout);
01053                 couple[1].op = DB_LOCK_PUT;
01054                 couple[1].lock = *lockp;
01055                 ret = __lock_vec(dbenv, dbc->locker, 0, couple, 2, &reqp);
01056                 if (ret == 0 || reqp == &couple[1])
01057                         *lockp = couple[0].lock;
01058                 break;
01059         default:
01060                 ret = 0;
01061                 break;
01062         }
01063 
01064         return (ret);
01065 }

Generated on Sun Dec 25 12:14:20 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2