Main Page | Class Hierarchy | Data Structures | Directories | File List | Data Fields | Related Pages

btree.h

00001 /*-
00002  * See the file LICENSE for redistribution information.
00003  *
00004  * Copyright (c) 1996-2005
00005  *      Sleepycat Software.  All rights reserved.
00006  */
00007 /*
00008  * Copyright (c) 1990, 1993, 1994, 1995, 1996
00009  *      Keith Bostic.  All rights reserved.
00010  */
00011 /*
00012  * Copyright (c) 1990, 1993, 1994, 1995
00013  *      The Regents of the University of California.  All rights reserved.
00014  *
00015  * This code is derived from software contributed to Berkeley by
00016  * Mike Olson.
00017  *
00018  * Redistribution and use in source and binary forms, with or without
00019  * modification, are permitted provided that the following conditions
00020  * are met:
00021  * 1. Redistributions of source code must retain the above copyright
00022  *    notice, this list of conditions and the following disclaimer.
00023  * 2. Redistributions in binary form must reproduce the above copyright
00024  *    notice, this list of conditions and the following disclaimer in the
00025  *    documentation and/or other materials provided with the distribution.
00026  * 3. Neither the name of the University nor the names of its contributors
00027  *    may be used to endorse or promote products derived from this software
00028  *    without specific prior written permission.
00029  *
00030  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
00031  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00032  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00033  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
00034  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
00035  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
00036  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
00037  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00038  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
00039  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00040  * SUCH DAMAGE.
00041  *
00042  * $Id: btree.h,v 12.8 2005/08/08 14:52:30 bostic Exp $
00043  */
00044 #ifndef _DB_BTREE_H_
00045 #define _DB_BTREE_H_
00046 
00047 /* Forward structure declarations. */
00048 struct __btree;         typedef struct __btree BTREE;
00049 struct __cursor;        typedef struct __cursor BTREE_CURSOR;
00050 struct __epg;           typedef struct __epg EPG;
00051 struct __recno;         typedef struct __recno RECNO;
00052 
00053 #define DEFMINKEYPAGE    (2)
00054 
00055 /*
00056  * A recno order of 0 indicates that we don't have an order, not that we've
00057  * an order less than 1.
00058  */
00059 #define INVALID_ORDER   0
00060 
00061 #define ISINTERNAL(p)   (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO)
00062 #define ISLEAF(p)       (TYPE(p) == P_LBTREE ||                         \
00063                             TYPE(p) == P_LRECNO || TYPE(p) == P_LDUP)
00064 
00065 /* Flags for __bam_cadjust_log(). */
00066 #define CAD_UPDATEROOT  0x01            /* Root page count was updated. */
00067 
00068 /* Flags for __bam_split_log(). */
00069 #define SPL_NRECS       0x01            /* Split tree has record count. */
00070 
00071 /* Flags for __bam_iitem(). */
00072 #define BI_DELETED      0x01            /* Key/data pair only placeholder. */
00073 
00074 /* Flags for __bam_stkrel(). */
00075 #define STK_CLRDBC      0x01            /* Clear dbc->page reference. */
00076 #define STK_NOLOCK      0x02            /* Don't retain locks. */
00077 #define STK_PGONLY      0x04
00078 
00079 /* Flags for __ram_ca(). These get logged, so make the values explicit. */
00080 typedef enum {
00081         CA_DELETE = 0,                  /* Delete the current record. */
00082         CA_IAFTER = 1,                  /* Insert before the current record. */
00083         CA_IBEFORE = 2,                 /* Insert after the current record. */
00084         CA_ICURRENT = 3                 /* Overwrite the current record. */
00085 } ca_recno_arg;
00086 
00087 /*
00088  * Flags for __bam_search() and __bam_rsearch().
00089  *
00090  * Note, internal page searches must find the largest record less than key in
00091  * the tree so that descents work.  Leaf page searches must find the smallest
00092  * record greater than key so that the returned index is the record's correct
00093  * position for insertion.
00094  *
00095  * The flags parameter to the search routines describes three aspects of the
00096  * search: the type of locking required (including if we're locking a pair of
00097  * pages), the item to return in the presence of duplicates and whether or not
00098  * to return deleted entries.  To simplify both the mnemonic representation
00099  * and the code that checks for various cases, we construct a set of bitmasks.
00100  */
00101 #define S_READ          0x00001         /* Read locks. */
00102 #define S_WRITE         0x00002         /* Write locks. */
00103 
00104 #define S_APPEND        0x00040         /* Append to the tree. */
00105 #define S_DELNO         0x00080         /* Don't return deleted items. */
00106 #define S_DUPFIRST      0x00100         /* Return first duplicate. */
00107 #define S_DUPLAST       0x00200         /* Return last duplicate. */
00108 #define S_EXACT         0x00400         /* Exact items only. */
00109 #define S_PARENT        0x00800         /* Lock page pair. */
00110 #define S_STACK         0x01000         /* Need a complete stack. */
00111 #define S_PAST_EOF      0x02000         /* If doing insert search (or keyfirst
00112                                          * or keylast operations), or a split
00113                                          * on behalf of an insert, it's okay to
00114                                          * return an entry one past end-of-page.
00115                                          */
00116 #define S_STK_ONLY      0x04000         /* Just return info in the stack */
00117 #define S_MAX           0x08000         /* Get the right most key */
00118 #define S_MIN           0x10000         /* Get the left most key */
00119 #define S_NEXT          0x20000         /* Get the page after this key */
00120 #define S_DEL           0x40000         /* Get the tree to delete this key. */
00121 #define S_START         0x80000         /* Level to start stack. */
00122 
00123 #define S_DELETE        (S_WRITE | S_DUPFIRST | S_DELNO | S_EXACT | S_STACK)
00124 #define S_FIND          (S_READ | S_DUPFIRST | S_DELNO)
00125 #define S_FIND_WR       (S_WRITE | S_DUPFIRST | S_DELNO)
00126 #define S_INSERT        (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK)
00127 #define S_KEYFIRST      (S_WRITE | S_DUPFIRST | S_PAST_EOF | S_STACK)
00128 #define S_KEYLAST       (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK)
00129 #define S_WRPAIR        (S_WRITE | S_DUPLAST | S_PAST_EOF | S_PARENT)
00130 
00131 /*
00132  * Various routines pass around page references.  A page reference is
00133  * a pointer to the page, and the indx indicates an item on the page.
00134  * Each page reference may include a lock.
00135  */
00136 struct __epg {
00137         PAGE         *page;             /* The page. */
00138         db_indx_t     indx;             /* The index on the page. */
00139         db_indx_t     entries;          /* The number of entries on page */
00140         DB_LOCK       lock;             /* The page's lock. */
00141         db_lockmode_t lock_mode;        /* The lock mode. */
00142 };
00143 
00144 /*
00145  * We maintain a stack of the pages that we're locking in the tree.  Grow
00146  * the stack as necessary.
00147  *
00148  * XXX
00149  * Temporary fix for #3243 -- clear the page and lock from the stack entry.
00150  * The correct fix is to never release a stack that doesn't hold items.
00151  */
00152 #define BT_STK_CLR(c) do {                                              \
00153         (c)->csp = (c)->sp;                                             \
00154         (c)->csp->page = NULL;                                          \
00155         LOCK_INIT((c)->csp->lock);                                      \
00156 } while (0)
00157 
00158 #define BT_STK_ENTER(dbenv, c, pagep, page_indx, l, mode, ret) do {     \
00159         if ((ret = ((c)->csp == (c)->esp ?                              \
00160             __bam_stkgrow(dbenv, c) : 0)) == 0) {                       \
00161                 (c)->csp->page = pagep;                                 \
00162                 (c)->csp->indx = (page_indx);                           \
00163                 (c)->csp->entries = NUM_ENT(pagep);                     \
00164                 (c)->csp->lock = l;                                     \
00165                 (c)->csp->lock_mode = mode;                             \
00166         }                                                               \
00167 } while (0)
00168 
00169 #define BT_STK_PUSH(dbenv, c, pagep, page_indx, lock, mode, ret) do {   \
00170         BT_STK_ENTER(dbenv, c, pagep, page_indx, lock, mode, ret);      \
00171         ++(c)->csp;                                                     \
00172 } while (0)
00173 
00174 #define BT_STK_NUM(dbenv, c, pagep, page_indx, ret) do {                \
00175         if ((ret = ((c)->csp ==                                         \
00176             (c)->esp ? __bam_stkgrow(dbenv, c) : 0)) == 0) {            \
00177                 (c)->csp->page = NULL;                                  \
00178                 (c)->csp->indx = (page_indx);                           \
00179                 (c)->csp->entries = NUM_ENT(pagep);                     \
00180                 LOCK_INIT((c)->csp->lock);                              \
00181                 (c)->csp->lock_mode = DB_LOCK_NG;                       \
00182         }                                                               \
00183 } while (0)
00184 
00185 #define BT_STK_NUMPUSH(dbenv, c, pagep, page_indx, ret) do {            \
00186         BT_STK_NUM(dbenv, cp, pagep, page_indx, ret);                   \
00187         ++(c)->csp;                                                     \
00188 } while (0)
00189 
00190 #define BT_STK_POP(c)                                                   \
00191         ((c)->csp == (c)->sp ? NULL : --(c)->csp)
00192 
00193 /* Btree/Recno cursor. */
00194 struct __cursor {
00195         /* struct __dbc_internal */
00196         __DBC_INTERNAL
00197 
00198         /* btree private part */
00199         EPG             *sp;            /* Stack pointer. */
00200         EPG             *csp;           /* Current stack entry. */
00201         EPG             *esp;           /* End stack pointer. */
00202         EPG              stack[5];
00203 
00204         db_indx_t        ovflsize;      /* Maximum key/data on-page size. */
00205 
00206         db_recno_t       recno;         /* Current record number. */
00207         u_int32_t        order;         /* Relative order among deleted curs. */
00208 
00209         /*
00210          * Btree:
00211          * We set a flag in the cursor structure if the underlying object has
00212          * been deleted.  It's not strictly necessary, we could get the same
00213          * information by looking at the page itself, but this method doesn't
00214          * require us to retrieve the page on cursor delete.
00215          *
00216          * Recno:
00217          * When renumbering recno databases during deletes, cursors referencing
00218          * "deleted" records end up positioned between two records, and so must
00219          * be specially adjusted on the next operation.
00220          */
00221 #define C_DELETED       0x0001          /* Record was deleted. */
00222         /*
00223          * There are three tree types that require maintaining record numbers.
00224          * Recno AM trees, Btree AM trees for which the DB_RECNUM flag was set,
00225          * and Btree off-page duplicate trees.
00226          */
00227 #define C_RECNUM        0x0002          /* Tree requires record counts. */
00228         /*
00229          * Recno trees have immutable record numbers by default, but optionally
00230          * support mutable record numbers.  Off-page duplicate Recno trees have
00231          * mutable record numbers.  All Btrees with record numbers (including
00232          * off-page duplicate trees) are mutable by design, no flag is needed.
00233          */
00234 #define C_RENUMBER      0x0004          /* Tree records are mutable. */
00235         u_int32_t        flags;
00236 };
00237 
00238 /*
00239  * Threshhold value, as a function of bt_minkey, of the number of
00240  * bytes a key/data pair can use before being placed on an overflow
00241  * page.  Assume every item requires the maximum alignment for
00242  * padding, out of sheer paranoia.
00243  */
00244 #define B_MINKEY_TO_OVFLSIZE(dbp, minkey, pgsize)                       \
00245         ((u_int16_t)(((pgsize) - P_OVERHEAD(dbp)) / ((minkey) * P_INDX) -\
00246             (BKEYDATA_PSIZE(0) + DB_ALIGN(1, sizeof(int32_t)))))
00247 
00248 /*
00249  * The maximum space that a single item can ever take up on one page.
00250  * Used by __bam_split to determine whether a split is still necessary.
00251  */
00252 #define B_MAX(a,b)      (((a) > (b)) ? (a) : (b))
00253 #define B_MAXSIZEONPAGE(ovflsize)                                       \
00254         (B_MAX(BOVERFLOW_PSIZE, BKEYDATA_PSIZE(ovflsize)))
00255 
00256 /*
00257  * The in-memory, per-tree btree/recno data structure.
00258  */
00259 struct __btree {                        /* Btree access method. */
00260         /*
00261          * !!!
00262          * These fields are write-once (when the structure is created) and
00263          * so are ignored as far as multi-threading is concerned.
00264          */
00265         db_pgno_t bt_meta;              /* Database meta-data page. */
00266         db_pgno_t bt_root;              /* Database root page. */
00267 
00268         u_int32_t bt_minkey;            /* Minimum keys per page. */
00269 
00270                                         /* Btree comparison function. */
00271         int (*bt_compare) __P((DB *, const DBT *, const DBT *));
00272                                         /* Btree prefix function. */
00273         size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *));
00274 
00275                                         /* Recno access method. */
00276         int       re_pad;               /* Fixed-length padding byte. */
00277         int       re_delim;             /* Variable-length delimiting byte. */
00278         u_int32_t re_len;               /* Length for fixed-length records. */
00279         char     *re_source;            /* Source file name. */
00280 
00281         /*
00282          * !!!
00283          * The bt_lpgno field is NOT protected by any mutex, and for this
00284          * reason must be advisory only, so, while it is read/written by
00285          * multiple threads, DB is completely indifferent to the quality
00286          * of its information.
00287          */
00288         db_pgno_t bt_lpgno;             /* Last insert location. */
00289         DB_LSN    bt_llsn;              /* Last insert LSN. */
00290 
00291         /*
00292          * !!!
00293          * The re_modified field is NOT protected by any mutex, and for this
00294          * reason cannot be anything more complicated than a zero/non-zero
00295          * value.  The actual writing of the backing source file cannot be
00296          * threaded, so clearing the flag isn't a problem.
00297          */
00298         int       re_modified;          /* If the tree was modified. */
00299 
00300         /*
00301          * !!!
00302          * These fields are ignored as far as multi-threading is concerned.
00303          * There are no transaction semantics associated with backing files,
00304          * nor is there any thread protection.
00305          */
00306         FILE            *re_fp;         /* Source file handle. */
00307         int              re_eof;        /* Backing source file EOF reached. */
00308         db_recno_t       re_last;       /* Last record number read. */
00309 
00310 };
00311 
00312 /*
00313  * Modes for the __bam_curadj recovery records (btree_curadj).
00314  * These appear in log records, so we wire the values and
00315  * do not leave it up to the compiler.
00316  */
00317 typedef enum {
00318         DB_CA_DI        = 1,
00319         DB_CA_DUP       = 2,
00320         DB_CA_RSPLIT    = 3,
00321         DB_CA_SPLIT     = 4
00322 } db_ca_mode;
00323 
00324 /*
00325  * Flags for __bam_pinsert.
00326  */
00327 #define BPI_SPACEONLY   0x01            /* Only check for space to update. */
00328 #define BPI_NORECNUM    0x02            /* Not update the recnum on the left. */
00329 
00330 #include "dbinc_auto/btree_auto.h"
00331 #include "dbinc_auto/btree_ext.h"
00332 #include "dbinc/db_am.h"
00333 #endif /* !_DB_BTREE_H_ */

Generated on Sun Dec 25 12:14:22 2005 for Berkeley DB 4.4.16 by  doxygen 1.4.2