00001 /*- 00002 * See the file LICENSE for redistribution information. 00003 * 00004 * Copyright (c) 1996-2005 00005 * Sleepycat Software. All rights reserved. 00006 */ 00007 /* 00008 * Copyright (c) 1990, 1993, 1994, 1995, 1996 00009 * Keith Bostic. All rights reserved. 00010 */ 00011 /* 00012 * Copyright (c) 1990, 1993, 1994, 1995 00013 * The Regents of the University of California. All rights reserved. 00014 * 00015 * This code is derived from software contributed to Berkeley by 00016 * Mike Olson. 00017 * 00018 * Redistribution and use in source and binary forms, with or without 00019 * modification, are permitted provided that the following conditions 00020 * are met: 00021 * 1. Redistributions of source code must retain the above copyright 00022 * notice, this list of conditions and the following disclaimer. 00023 * 2. Redistributions in binary form must reproduce the above copyright 00024 * notice, this list of conditions and the following disclaimer in the 00025 * documentation and/or other materials provided with the distribution. 00026 * 3. Neither the name of the University nor the names of its contributors 00027 * may be used to endorse or promote products derived from this software 00028 * without specific prior written permission. 00029 * 00030 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 00031 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00032 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00033 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 00034 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 00035 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 00036 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 00037 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00038 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 00039 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 00040 * SUCH DAMAGE. 00041 * 00042 * $Id: btree.h,v 12.8 2005/08/08 14:52:30 bostic Exp $ 00043 */ 00044 #ifndef _DB_BTREE_H_ 00045 #define _DB_BTREE_H_ 00046 00047 /* Forward structure declarations. */ 00048 struct __btree; typedef struct __btree BTREE; 00049 struct __cursor; typedef struct __cursor BTREE_CURSOR; 00050 struct __epg; typedef struct __epg EPG; 00051 struct __recno; typedef struct __recno RECNO; 00052 00053 #define DEFMINKEYPAGE (2) 00054 00055 /* 00056 * A recno order of 0 indicates that we don't have an order, not that we've 00057 * an order less than 1. 00058 */ 00059 #define INVALID_ORDER 0 00060 00061 #define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) 00062 #define ISLEAF(p) (TYPE(p) == P_LBTREE || \ 00063 TYPE(p) == P_LRECNO || TYPE(p) == P_LDUP) 00064 00065 /* Flags for __bam_cadjust_log(). */ 00066 #define CAD_UPDATEROOT 0x01 /* Root page count was updated. */ 00067 00068 /* Flags for __bam_split_log(). */ 00069 #define SPL_NRECS 0x01 /* Split tree has record count. */ 00070 00071 /* Flags for __bam_iitem(). */ 00072 #define BI_DELETED 0x01 /* Key/data pair only placeholder. */ 00073 00074 /* Flags for __bam_stkrel(). */ 00075 #define STK_CLRDBC 0x01 /* Clear dbc->page reference. */ 00076 #define STK_NOLOCK 0x02 /* Don't retain locks. */ 00077 #define STK_PGONLY 0x04 00078 00079 /* Flags for __ram_ca(). These get logged, so make the values explicit. */ 00080 typedef enum { 00081 CA_DELETE = 0, /* Delete the current record. */ 00082 CA_IAFTER = 1, /* Insert before the current record. */ 00083 CA_IBEFORE = 2, /* Insert after the current record. */ 00084 CA_ICURRENT = 3 /* Overwrite the current record. */ 00085 } ca_recno_arg; 00086 00087 /* 00088 * Flags for __bam_search() and __bam_rsearch(). 00089 * 00090 * Note, internal page searches must find the largest record less than key in 00091 * the tree so that descents work. Leaf page searches must find the smallest 00092 * record greater than key so that the returned index is the record's correct 00093 * position for insertion. 00094 * 00095 * The flags parameter to the search routines describes three aspects of the 00096 * search: the type of locking required (including if we're locking a pair of 00097 * pages), the item to return in the presence of duplicates and whether or not 00098 * to return deleted entries. To simplify both the mnemonic representation 00099 * and the code that checks for various cases, we construct a set of bitmasks. 00100 */ 00101 #define S_READ 0x00001 /* Read locks. */ 00102 #define S_WRITE 0x00002 /* Write locks. */ 00103 00104 #define S_APPEND 0x00040 /* Append to the tree. */ 00105 #define S_DELNO 0x00080 /* Don't return deleted items. */ 00106 #define S_DUPFIRST 0x00100 /* Return first duplicate. */ 00107 #define S_DUPLAST 0x00200 /* Return last duplicate. */ 00108 #define S_EXACT 0x00400 /* Exact items only. */ 00109 #define S_PARENT 0x00800 /* Lock page pair. */ 00110 #define S_STACK 0x01000 /* Need a complete stack. */ 00111 #define S_PAST_EOF 0x02000 /* If doing insert search (or keyfirst 00112 * or keylast operations), or a split 00113 * on behalf of an insert, it's okay to 00114 * return an entry one past end-of-page. 00115 */ 00116 #define S_STK_ONLY 0x04000 /* Just return info in the stack */ 00117 #define S_MAX 0x08000 /* Get the right most key */ 00118 #define S_MIN 0x10000 /* Get the left most key */ 00119 #define S_NEXT 0x20000 /* Get the page after this key */ 00120 #define S_DEL 0x40000 /* Get the tree to delete this key. */ 00121 #define S_START 0x80000 /* Level to start stack. */ 00122 00123 #define S_DELETE (S_WRITE | S_DUPFIRST | S_DELNO | S_EXACT | S_STACK) 00124 #define S_FIND (S_READ | S_DUPFIRST | S_DELNO) 00125 #define S_FIND_WR (S_WRITE | S_DUPFIRST | S_DELNO) 00126 #define S_INSERT (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK) 00127 #define S_KEYFIRST (S_WRITE | S_DUPFIRST | S_PAST_EOF | S_STACK) 00128 #define S_KEYLAST (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK) 00129 #define S_WRPAIR (S_WRITE | S_DUPLAST | S_PAST_EOF | S_PARENT) 00130 00131 /* 00132 * Various routines pass around page references. A page reference is 00133 * a pointer to the page, and the indx indicates an item on the page. 00134 * Each page reference may include a lock. 00135 */ 00136 struct __epg { 00137 PAGE *page; /* The page. */ 00138 db_indx_t indx; /* The index on the page. */ 00139 db_indx_t entries; /* The number of entries on page */ 00140 DB_LOCK lock; /* The page's lock. */ 00141 db_lockmode_t lock_mode; /* The lock mode. */ 00142 }; 00143 00144 /* 00145 * We maintain a stack of the pages that we're locking in the tree. Grow 00146 * the stack as necessary. 00147 * 00148 * XXX 00149 * Temporary fix for #3243 -- clear the page and lock from the stack entry. 00150 * The correct fix is to never release a stack that doesn't hold items. 00151 */ 00152 #define BT_STK_CLR(c) do { \ 00153 (c)->csp = (c)->sp; \ 00154 (c)->csp->page = NULL; \ 00155 LOCK_INIT((c)->csp->lock); \ 00156 } while (0) 00157 00158 #define BT_STK_ENTER(dbenv, c, pagep, page_indx, l, mode, ret) do { \ 00159 if ((ret = ((c)->csp == (c)->esp ? \ 00160 __bam_stkgrow(dbenv, c) : 0)) == 0) { \ 00161 (c)->csp->page = pagep; \ 00162 (c)->csp->indx = (page_indx); \ 00163 (c)->csp->entries = NUM_ENT(pagep); \ 00164 (c)->csp->lock = l; \ 00165 (c)->csp->lock_mode = mode; \ 00166 } \ 00167 } while (0) 00168 00169 #define BT_STK_PUSH(dbenv, c, pagep, page_indx, lock, mode, ret) do { \ 00170 BT_STK_ENTER(dbenv, c, pagep, page_indx, lock, mode, ret); \ 00171 ++(c)->csp; \ 00172 } while (0) 00173 00174 #define BT_STK_NUM(dbenv, c, pagep, page_indx, ret) do { \ 00175 if ((ret = ((c)->csp == \ 00176 (c)->esp ? __bam_stkgrow(dbenv, c) : 0)) == 0) { \ 00177 (c)->csp->page = NULL; \ 00178 (c)->csp->indx = (page_indx); \ 00179 (c)->csp->entries = NUM_ENT(pagep); \ 00180 LOCK_INIT((c)->csp->lock); \ 00181 (c)->csp->lock_mode = DB_LOCK_NG; \ 00182 } \ 00183 } while (0) 00184 00185 #define BT_STK_NUMPUSH(dbenv, c, pagep, page_indx, ret) do { \ 00186 BT_STK_NUM(dbenv, cp, pagep, page_indx, ret); \ 00187 ++(c)->csp; \ 00188 } while (0) 00189 00190 #define BT_STK_POP(c) \ 00191 ((c)->csp == (c)->sp ? NULL : --(c)->csp) 00192 00193 /* Btree/Recno cursor. */ 00194 struct __cursor { 00195 /* struct __dbc_internal */ 00196 __DBC_INTERNAL 00197 00198 /* btree private part */ 00199 EPG *sp; /* Stack pointer. */ 00200 EPG *csp; /* Current stack entry. */ 00201 EPG *esp; /* End stack pointer. */ 00202 EPG stack[5]; 00203 00204 db_indx_t ovflsize; /* Maximum key/data on-page size. */ 00205 00206 db_recno_t recno; /* Current record number. */ 00207 u_int32_t order; /* Relative order among deleted curs. */ 00208 00209 /* 00210 * Btree: 00211 * We set a flag in the cursor structure if the underlying object has 00212 * been deleted. It's not strictly necessary, we could get the same 00213 * information by looking at the page itself, but this method doesn't 00214 * require us to retrieve the page on cursor delete. 00215 * 00216 * Recno: 00217 * When renumbering recno databases during deletes, cursors referencing 00218 * "deleted" records end up positioned between two records, and so must 00219 * be specially adjusted on the next operation. 00220 */ 00221 #define C_DELETED 0x0001 /* Record was deleted. */ 00222 /* 00223 * There are three tree types that require maintaining record numbers. 00224 * Recno AM trees, Btree AM trees for which the DB_RECNUM flag was set, 00225 * and Btree off-page duplicate trees. 00226 */ 00227 #define C_RECNUM 0x0002 /* Tree requires record counts. */ 00228 /* 00229 * Recno trees have immutable record numbers by default, but optionally 00230 * support mutable record numbers. Off-page duplicate Recno trees have 00231 * mutable record numbers. All Btrees with record numbers (including 00232 * off-page duplicate trees) are mutable by design, no flag is needed. 00233 */ 00234 #define C_RENUMBER 0x0004 /* Tree records are mutable. */ 00235 u_int32_t flags; 00236 }; 00237 00238 /* 00239 * Threshhold value, as a function of bt_minkey, of the number of 00240 * bytes a key/data pair can use before being placed on an overflow 00241 * page. Assume every item requires the maximum alignment for 00242 * padding, out of sheer paranoia. 00243 */ 00244 #define B_MINKEY_TO_OVFLSIZE(dbp, minkey, pgsize) \ 00245 ((u_int16_t)(((pgsize) - P_OVERHEAD(dbp)) / ((minkey) * P_INDX) -\ 00246 (BKEYDATA_PSIZE(0) + DB_ALIGN(1, sizeof(int32_t))))) 00247 00248 /* 00249 * The maximum space that a single item can ever take up on one page. 00250 * Used by __bam_split to determine whether a split is still necessary. 00251 */ 00252 #define B_MAX(a,b) (((a) > (b)) ? (a) : (b)) 00253 #define B_MAXSIZEONPAGE(ovflsize) \ 00254 (B_MAX(BOVERFLOW_PSIZE, BKEYDATA_PSIZE(ovflsize))) 00255 00256 /* 00257 * The in-memory, per-tree btree/recno data structure. 00258 */ 00259 struct __btree { /* Btree access method. */ 00260 /* 00261 * !!! 00262 * These fields are write-once (when the structure is created) and 00263 * so are ignored as far as multi-threading is concerned. 00264 */ 00265 db_pgno_t bt_meta; /* Database meta-data page. */ 00266 db_pgno_t bt_root; /* Database root page. */ 00267 00268 u_int32_t bt_minkey; /* Minimum keys per page. */ 00269 00270 /* Btree comparison function. */ 00271 int (*bt_compare) __P((DB *, const DBT *, const DBT *)); 00272 /* Btree prefix function. */ 00273 size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *)); 00274 00275 /* Recno access method. */ 00276 int re_pad; /* Fixed-length padding byte. */ 00277 int re_delim; /* Variable-length delimiting byte. */ 00278 u_int32_t re_len; /* Length for fixed-length records. */ 00279 char *re_source; /* Source file name. */ 00280 00281 /* 00282 * !!! 00283 * The bt_lpgno field is NOT protected by any mutex, and for this 00284 * reason must be advisory only, so, while it is read/written by 00285 * multiple threads, DB is completely indifferent to the quality 00286 * of its information. 00287 */ 00288 db_pgno_t bt_lpgno; /* Last insert location. */ 00289 DB_LSN bt_llsn; /* Last insert LSN. */ 00290 00291 /* 00292 * !!! 00293 * The re_modified field is NOT protected by any mutex, and for this 00294 * reason cannot be anything more complicated than a zero/non-zero 00295 * value. The actual writing of the backing source file cannot be 00296 * threaded, so clearing the flag isn't a problem. 00297 */ 00298 int re_modified; /* If the tree was modified. */ 00299 00300 /* 00301 * !!! 00302 * These fields are ignored as far as multi-threading is concerned. 00303 * There are no transaction semantics associated with backing files, 00304 * nor is there any thread protection. 00305 */ 00306 FILE *re_fp; /* Source file handle. */ 00307 int re_eof; /* Backing source file EOF reached. */ 00308 db_recno_t re_last; /* Last record number read. */ 00309 00310 }; 00311 00312 /* 00313 * Modes for the __bam_curadj recovery records (btree_curadj). 00314 * These appear in log records, so we wire the values and 00315 * do not leave it up to the compiler. 00316 */ 00317 typedef enum { 00318 DB_CA_DI = 1, 00319 DB_CA_DUP = 2, 00320 DB_CA_RSPLIT = 3, 00321 DB_CA_SPLIT = 4 00322 } db_ca_mode; 00323 00324 /* 00325 * Flags for __bam_pinsert. 00326 */ 00327 #define BPI_SPACEONLY 0x01 /* Only check for space to update. */ 00328 #define BPI_NORECNUM 0x02 /* Not update the recnum on the left. */ 00329 00330 #include "dbinc_auto/btree_auto.h" 00331 #include "dbinc_auto/btree_ext.h" 00332 #include "dbinc/db_am.h" 00333 #endif /* !_DB_BTREE_H_ */