/*-------------------------------------------------------------------------
 *
 * nbtree.h
 *	  header file for postgres btree access method implementation.
 *
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/nbtree.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef NBTREE_H
#define NBTREE_H

#include "access/genam.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "catalog/pg_index.h"

/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
typedef uint16 BTCycleId;

/*
 *	BTPageOpaqueData -- At the end of every page, we store pointers to
 *	both siblings in the tree.  This is used to do forward/backward
 *	index scans.  The next-page link is also critical for recovery when
 *	a search has navigated to the wrong page due to concurrent page splits
 *	or deletions; see src/backend/access/nbtree/README for more info.
 *
 *	In addition, we store the page's btree level (counting upwards from
 *	zero at a leaf page) as well as some flag bits indicating the page type
 *	and status.  If the page is deleted, we replace the level with the
 *	next-transaction-ID value indicating when it is safe to reclaim the page.
 *
 *	We also store a "vacuum cycle ID".  When a page is split while VACUUM is
 *	processing the index, a nonzero value associated with the VACUUM run is
 *	stored into both halves of the split page.  (If VACUUM is not running,
 *	both pages receive zero cycleids.)  This allows VACUUM to detect whether
 *	a page was split since it started, with a small probability of false match
 *	if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
 *	ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
 *	(original) page, and set in the right page, but only if the next page
 *	to its right has a different cycleid.
 *
 *	NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
 *	instead.
 */

typedef struct BTPageOpaqueData
{
	BlockNumber btpo_prev;		/* left sibling, or P_NONE if leftmost */
	BlockNumber btpo_next;		/* right sibling, or P_NONE if rightmost */
	union
	{
		uint32		level;		/* tree level --- zero for leaf pages */
		TransactionId xact;		/* next transaction ID, if deleted */
	}			btpo;
	uint16		btpo_flags;		/* flag bits, see below */
	BTCycleId	btpo_cycleid;	/* vacuum cycle ID of latest split */
} BTPageOpaqueData;

typedef BTPageOpaqueData *BTPageOpaque;

/* Bits defined in btpo_flags */
#define BTP_LEAF		(1 << 0)	/* leaf page, i.e. not internal page */
#define BTP_ROOT		(1 << 1)	/* root page (has no parent) */
#define BTP_DELETED		(1 << 2)	/* page has been deleted from tree */
#define BTP_META		(1 << 3)	/* meta-page */
#define BTP_HALF_DEAD	(1 << 4)	/* empty, but still in tree */
#define BTP_SPLIT_END	(1 << 5)	/* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples */
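/*
 * A minimal illustrative sketch (the helper name is just an example, not
 * part of the btree API) of how callers reach the opaque data and read the
 * level stored in the union; it assumes the page comes from a pinned and
 * suitably locked buffer, as in nbtpage.c:
 */
static inline uint32
nbtree_example_page_level(Page page)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* for a deleted page the union holds btpo.xact, not a level */
	Assert(!(opaque->btpo_flags & BTP_DELETED));
	return opaque->btpo.level;
}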
/*
 * The max allowed value of a cycle ID is a bit less than 64K.  This is
 * for convenience of pg_filedump and similar utilities: we want to use
 * the last 2 bytes of special space as an index type indicator, and
 * restricting cycle ID lets btree use that space for vacuum cycle IDs
 * while still allowing index type to be identified.
 */
#define MAX_BT_CYCLE_ID		0xFF7F


/*
 * The Meta page is always the first page in the btree index.
 * Its primary purpose is to point to the location of the btree root page.
 * We also point to the "fast" root, which is the current effective root;
 * see README for discussion.
 */

typedef struct BTMetaPageData
{
	uint32		btm_magic;		/* should contain BTREE_MAGIC */
	uint32		btm_version;	/* should contain BTREE_VERSION */
	BlockNumber btm_root;		/* current root location */
	uint32		btm_level;		/* tree level of the root page */
	BlockNumber btm_fastroot;	/* current "fast" root location */
	uint32		btm_fastlevel;	/* tree level of the "fast" root page */
} BTMetaPageData;

#define BTPageGetMeta(p) \
	((BTMetaPageData *) PageGetContents(p))

#define BTREE_METAPAGE	0		/* first page is meta */
#define BTREE_MAGIC		0x053162	/* magic number of btree pages */
#define BTREE_VERSION	2		/* current version number */
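/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * reading the metapage once it is in a buffer; real callers such as
 * _bt_getroot also check btm_magic and btm_version, and treat a mismatch
 * as a corrupt or wrong-type index:
 */
static inline BlockNumber
nbtree_example_meta_root(Page metapg)
{
	BTMetaPageData *metad = BTPageGetMeta(metapg);

	Assert(metad->btm_magic == BTREE_MAGIC);
	Assert(metad->btm_version == BTREE_VERSION);
	return metad->btm_root;
}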
/*
 *	Maximum size of a btree index entry, including its tuple header.
 *
 *	We actually need to be able to fit three items on every page,
 *	so restrict any one item to 1/3 the per-page available space.
 */
#define BTMaxItemSize(page) \
	MAXALIGN_DOWN((PageGetPageSize(page) - \
				   MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

/*
 * The leaf-page fillfactor defaults to 90% but is user-adjustable.
 * For pages above the leaf level, we use a fixed 70% fillfactor.
 * The fillfactor is applied during index build and when splitting
 * a rightmost page; when splitting non-rightmost pages we try to
 * divide the data equally.
 */
#define BTREE_MIN_FILLFACTOR		10
#define BTREE_DEFAULT_FILLFACTOR	90
#define BTREE_NONLEAF_FILLFACTOR	70

/*
 *	Test whether two btree entries are "the same".
 *
 *	Old comments:
 *	In addition, we must guarantee that all tuples in the index are unique,
 *	in order to satisfy some assumptions in Lehman and Yao.  The way that we
 *	do this is by generating a new OID for every insertion that we do in the
 *	tree.  This adds eight bytes to the size of btree index tuples.  Note
 *	that we do not use the OID as part of a composite key; the OID only
 *	serves as a unique identifier for a given index tuple (logical position
 *	within a page).
 *
 *	New comments:
 *	actually, we must guarantee that all tuples in A LEVEL
 *	are unique, not in ALL INDEX.  So, we can use the t_tid
 *	as unique identifier for a given index tuple (logical position
 *	within a level). - vadim 04/09/97
 */
#define BTTidSame(i1, i2)	\
	( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
	  (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
	  (i1).ip_posid == (i2).ip_posid )
#define BTEntrySame(i1, i2) \
	BTTidSame((i1)->t_tid, (i2)->t_tid)


/*
 *	In general, the btree code tries to localize its knowledge about
 *	page layout to a couple of routines.  However, we need a special
 *	value to indicate "no page number" in those places where we expect
 *	page numbers.  We can use zero for this because we never need to
 *	make a pointer to the metadata page.
 */

#define P_NONE			0

/*
 * Macros to test whether a page is leftmost or rightmost on its tree level,
 * as well as other state info kept in the opaque data.
 */
#define P_LEFTMOST(opaque)		((opaque)->btpo_prev == P_NONE)
#define P_RIGHTMOST(opaque)		((opaque)->btpo_next == P_NONE)
#define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
#define P_ISDELETED(opaque)		((opaque)->btpo_flags & BTP_DELETED)
#define P_ISHALFDEAD(opaque)	((opaque)->btpo_flags & BTP_HALF_DEAD)
#define P_IGNORE(opaque)		((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
#define P_HAS_GARBAGE(opaque)	((opaque)->btpo_flags & BTP_HAS_GARBAGE)

/*
 *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
 *	page.  The high key is not a data key, but gives info about what range of
 *	keys is supposed to be on this page.  The high key on a page is required
 *	to be greater than or equal to any data key that appears on the page.
 *	If we find ourselves trying to insert a key > high key, we know we need
 *	to move right (this should only happen if the page was split since we
 *	examined the parent page).
 *
 *	Our insertion algorithm guarantees that we can use the initial least key
 *	on our right sibling as the high key.  Once a page is created, its high
 *	key changes only if the page is split.
 *
 *	On a non-rightmost page, the high key lives in item 1 and data items
 *	start in item 2.  Rightmost pages have no high key, so we store data
 *	items beginning in item 1.
 */

#define P_HIKEY				((OffsetNumber) 1)
#define P_FIRSTKEY			((OffsetNumber) 2)
#define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
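/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * visiting the data items on a page while skipping the high key on
 * non-rightmost pages; the page-reading loops in nbtsearch.c follow this
 * same pattern:
 */
static inline int
nbtree_example_count_data_items(Page page)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	OffsetNumber off;
	int			nitems = 0;

	for (off = P_FIRSTDATAKEY(opaque); off <= maxoff; off = OffsetNumberNext(off))
		nitems++;
	return nitems;
}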
/*
 * XLOG records for btree operations
 *
 * XLOG allows us to store some information in the high 4 bits of the log
 * record xl_info field.
 */
#define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */
#define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */
#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
#define XLOG_BTREE_SPLIT_L_ROOT 0x50	/* add tuple with split of root */
#define XLOG_BTREE_SPLIT_R_ROOT 0x60	/* as above, new item on right */
#define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
#define XLOG_BTREE_DELETE_PAGE	0x80	/* delete an entire page */
#define XLOG_BTREE_DELETE_PAGE_META 0x90	/* same, and update metapage */
#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
#define XLOG_BTREE_DELETE_PAGE_HALF 0xB0	/* page deletion that makes
											 * parent half-dead */
#define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
										 * vacuum */
#define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
										 * FSM */

/*
 * All that we need to find a changed index tuple
 */
typedef struct xl_btreetid
{
	RelFileNode node;
	ItemPointerData tid;		/* changed tuple id */
} xl_btreetid;

/*
 * All that we need to regenerate the meta-data page
 */
typedef struct xl_btree_metadata
{
	BlockNumber root;
	uint32		level;
	BlockNumber fastroot;
	uint32		fastlevel;
} xl_btree_metadata;

/*
 * This is what we need to know about a simple (without split) insert.
 *
 * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
 * Note that INSERT_META implies it's not a leaf page.
 */
typedef struct xl_btree_insert
{
	xl_btreetid target;			/* inserted tuple id */
	/* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
	/* INDEX TUPLE FOLLOWS AT END OF STRUCT */
} xl_btree_insert;

#define SizeOfBtreeInsert	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
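/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * how redo code locates the data trailing the fixed-size part of an insert
 * record; since target is the first member, SizeOfBtreeInsert is the size
 * of that fixed part (compare btree_xlog_insert in nbtxlog.c):
 */
static inline char *
nbtree_example_insert_payload(xl_btree_insert *xlrec)
{
	/* what follows here depends on the xl_info flags, per the comments above */
	return ((char *) xlrec) + SizeOfBtreeInsert;
}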
/*
 * On insert with split, we save all the items going into the right sibling
 * so that we can restore it completely from the log record.  This way takes
 * less xlog space than the normal approach, because if we did it the
 * standard way, XLogInsert would almost always think the right page is new
 * and store its whole page image.  The left page, however, is handled in
 * the normal incremental-update fashion.
 *
 * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
 * The _L and _R variants indicate whether the inserted tuple went into the
 * left or right split page (and thus, whether newitemoff and the new item
 * are stored or not).  The _ROOT variants indicate that we are splitting
 * the root page, and thus that a newroot record rather than an insert or
 * split record should follow.  Note that a split record never carries a
 * metapage update --- we'll do that in the parent-level update.
 */
typedef struct xl_btree_split
{
	RelFileNode node;
	BlockNumber leftsib;		/* orig page / new left page */
	BlockNumber rightsib;		/* new right page */
	BlockNumber rnext;			/* next block (orig page's rightlink) */
	uint32		level;			/* tree level of page being split */
	OffsetNumber firstright;	/* first item moved to right page */

	/*
	 * If level > 0, BlockIdData downlink follows.  (We use BlockIdData rather
	 * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
	 * aligned.)
	 *
	 * If level > 0, an IndexTuple representing the HIKEY of the left page
	 * follows.  We don't need this on leaf pages, because it's the same as
	 * the leftmost key in the new right page.  Also, it's suppressed if
	 * XLogInsert chooses to store the left page's whole page image.
	 *
	 * In the _L variants, next are OffsetNumber newitemoff and the new item.
	 * (In the _R variants, the new item is one of the right page's tuples.)
	 * The new item, but not newitemoff, is suppressed if XLogInsert chooses
	 * to store the left page's whole page image.
	 *
	 * Last are the right page's tuples in the form used by _bt_restore_page.
	 */
} xl_btree_split;

#define SizeOfBtreeSplit	(offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))

/*
 * This is what we need to know about delete of individual leaf index tuples.
 * The WAL record can represent deletion of any number of index tuples on a
 * single index page when *not* executed by VACUUM.
 */
typedef struct xl_btree_delete
{
	RelFileNode node;			/* RelFileNode of the index */
	BlockNumber block;
	RelFileNode hnode;			/* RelFileNode of the heap the index currently
								 * points at */
	int			nitems;

	/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;

#define SizeOfBtreeDelete	(offsetof(xl_btree_delete, nitems) + sizeof(int))

/*
 * This is what we need to know about page reuse within btree.
 */
typedef struct xl_btree_reuse_page
{
	RelFileNode node;
	BlockNumber block;
	TransactionId latestRemovedXid;
} xl_btree_reuse_page;

#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
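/*
 * A minimal illustrative sketch (example helper, not part of the API): the
 * offset numbers being deleted trail the fixed-size record, so redo code
 * can address them as an array of nitems entries (compare btree_xlog_delete
 * in nbtxlog.c):
 */
static inline OffsetNumber *
nbtree_example_delete_offsets(xl_btree_delete *xlrec)
{
	return (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
}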
/*
 * This is what we need to know about vacuum of individual leaf index tuples.
 * The WAL record can represent deletion of any number of index tuples on a
 * single index page when executed by VACUUM.
 *
 * The correctness requirement for applying these changes during recovery is
 * that we must do one of these two things for every block in the index:
 *		* lock the block for cleanup and apply any required changes
 *		* EnsureBlockUnpinned()
 * The purpose of this is to ensure that no index scans started before we
 * finish scanning the index are still running by the time we begin to remove
 * heap tuples.
 *
 * Any changes to any one block are registered on just one WAL record.  All
 * blocks that we need to run EnsureBlockUnpinned() on are listed as a block
 * range running from the last block vacuumed up to and including this one.
 * Individual block numbers aren't given.
 *
 * Note that the *last* WAL record in any vacuum of an index is allowed to
 * have a zero-length array of offsets.  Earlier records must have at least
 * one.
 */
typedef struct xl_btree_vacuum
{
	RelFileNode node;
	BlockNumber block;
	BlockNumber lastBlockVacuumed;

	/* TARGET OFFSET NUMBERS FOLLOW */
} xl_btree_vacuum;

#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))

/*
 * This is what we need to know about deletion of a btree page.  The target
 * identifies the tuple removed from the parent page (note that we remove
 * this tuple's downlink and the *following* tuple's key).  Note we do not
 * store any content for the deleted page --- it is just rewritten as empty
 * during recovery, apart from resetting the btpo.xact.
 */
typedef struct xl_btree_delete_page
{
	xl_btreetid target;			/* deleted tuple id in parent page */
	BlockNumber deadblk;		/* child block being deleted */
	BlockNumber leftblk;		/* child block's left sibling, if any */
	BlockNumber rightblk;		/* child block's right sibling */
	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
} xl_btree_delete_page;

#define SizeOfBtreeDeletePage	(offsetof(xl_btree_delete_page, btpo_xact) + sizeof(TransactionId))

/*
 * New root log record.  There are zero tuples if this is to establish an
 * empty root, or two if it is the result of splitting an old root.
 *
 * Note that although this implies rewriting the metadata page, we don't need
 * an xl_btree_metadata record --- the rootblk and level are sufficient.
 */
typedef struct xl_btree_newroot
{
	RelFileNode node;
	BlockNumber rootblk;		/* location of new root */
	uint32		level;			/* its tree level */
	/* 0 or 2 INDEX TUPLES FOLLOW AT END OF STRUCT */
} xl_btree_newroot;

#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))


/*
 *	Operator strategy numbers for B-tree have been moved to access/skey.h,
 *	because many places need to use them in ScanKeyInit() calls.
 *
 *	The strategy numbers are chosen so that we can commute them by
 *	subtraction, thus:
 */
#define BTCommuteStrategyNumber(strat)	(BTMaxStrategyNumber + 1 - (strat))
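/*
 * A worked example (illustrative helper, not part of the API), using the
 * strategy numbers from access/skey.h (1 = "<" through 5 = ">", with
 * BTMaxStrategyNumber = 5):
 */
static inline void
nbtree_example_commute(void)
{
	/* "<" commutes to ">": 5 + 1 - 1 == 5 */
	Assert(BTCommuteStrategyNumber(BTLessStrategyNumber) == BTGreaterStrategyNumber);
	/* "=" is its own commutation: 5 + 1 - 3 == 3 */
	Assert(BTCommuteStrategyNumber(BTEqualStrategyNumber) == BTEqualStrategyNumber);
}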
/*
 *	When a new operator class is declared, we require that the user
 *	supply us with an amproc procedure (BTORDER_PROC) for determining
 *	whether, for two keys a and b, a < b, a = b, or a > b.  This routine
 *	must return < 0, 0, > 0, respectively, in these three cases.  (It must
 *	not return INT_MIN, since we may negate the result before using it.)
 *
 *	To facilitate accelerated sorting, an operator class may choose to
 *	offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
 *	src/include/utils/sortsupport.h.
 */

#define BTORDER_PROC		1
#define BTSORTSUPPORT_PROC	2
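/*
 * A minimal illustrative sketch of the comparison contract (example helper,
 * not part of the API); a real BTORDER_PROC is a SQL-callable function,
 * such as btint4cmp, registered as support procedure 1 of the opclass:
 */
static inline int
nbtree_example_int32_order(int32 a, int32 b)
{
	/* return <0, 0, >0 --- and never INT_MIN, so callers may negate it */
	return (a < b) ? -1 : ((a > b) ? 1 : 0);
}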
/*
 *	We need to be able to tell the difference between read and write
 *	requests for pages, in order to do locking correctly.
 */

#define BT_READ			BUFFER_LOCK_SHARE
#define BT_WRITE		BUFFER_LOCK_EXCLUSIVE

/*
 *	BTStackData -- As we descend a tree, we push the (location, downlink)
 *	pairs from internal pages onto a private stack.  If we split a
 *	leaf, we use this stack to walk back up the tree and insert data
 *	into parent pages (and possibly to split them, too).  Lehman and
 *	Yao's update algorithm guarantees that under no circumstances can
 *	our private stack give us an irredeemably bad picture up the tree.
 *	Again, see the paper for details.
 */

typedef struct BTStackData
{
	BlockNumber bts_blkno;
	OffsetNumber bts_offset;
	IndexTupleData bts_btentry;
	struct BTStackData *bts_parent;
} BTStackData;

typedef BTStackData *BTStack;

/*
 * BTScanOpaqueData is the btree-private state needed for an indexscan.
 * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
 * details of the preprocessing), information about the current location
 * of the scan, and information about the marked location, if any.  (We use
 * BTScanPosData to represent the data needed for each of current and marked
 * locations.)  In addition we can remember some known-killed index entries
 * that must be marked before we can move off the current page.
 *
 * Index scans work a page at a time: we pin and read-lock the page, identify
 * all the matching items on the page and save them in BTScanPosData, then
 * release the read-lock while returning the items to the caller for
 * processing.  This approach minimizes lock/unlock traffic.  Note that we
 * keep the pin on the index page until the caller is done with all the items
 * (this is needed for VACUUM synchronization, see nbtree/README).  When we
 * are ready to step to the next page, if the caller has told us any of the
 * items were killed, we re-lock the page to mark them killed, then unlock.
 * Finally we drop the pin and step to the next page in the appropriate
 * direction.
 *
 * If we are doing an index-only scan, we save the entire IndexTuple for each
 * matched item, otherwise only its heap TID and offset.  The IndexTuples go
 * into a separate workspace array; each BTScanPosItem stores its tuple's
 * offset within that array.
 */

typedef struct BTScanPosItem	/* what we remember about each match */
{
	ItemPointerData heapTid;	/* TID of referenced heap item */
	OffsetNumber indexOffset;	/* index item's location within page */
	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
} BTScanPosItem;

typedef struct BTScanPosData
{
	Buffer		buf;			/* if valid, the buffer is pinned */

	BlockNumber nextPage;		/* page's right link when we scanned it */

	/*
	 * moreLeft and moreRight track whether we think there may be matching
	 * index entries to the left and right of the current page, respectively.
	 * We can clear the appropriate one of these flags when _bt_checkkeys()
	 * returns continuescan = false.
	 */
	bool		moreLeft;
	bool		moreRight;

	/*
	 * If we are doing an index-only scan, nextTupleOffset is the first free
	 * location in the associated tuple storage workspace.
	 */
	int			nextTupleOffset;

	/*
	 * The items array is always ordered in index order (ie, increasing
	 * indexoffset).  When scanning backwards it is convenient to fill the
	 * array back-to-front, so we start at the last slot and fill downwards.
	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
	 * itemIndex is a cursor showing which entry was last returned to caller.
	 */
	int			firstItem;		/* first valid index in items[] */
	int			lastItem;		/* last valid index in items[] */
	int			itemIndex;		/* current index in items[] */

	BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;

typedef BTScanPosData *BTScanPos;

#define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)

/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
typedef struct BTArrayKeyInfo
{
	int			scan_key;		/* index of associated key in arrayKeyData */
	int			cur_elem;		/* index of current element in elem_values */
	int			mark_elem;		/* index of marked element in elem_values */
	int			num_elems;		/* number of elems in current array value */
	Datum	   *elem_values;	/* array of num_elems Datums */
} BTArrayKeyInfo;

typedef struct BTScanOpaqueData
{
	/* these fields are set by _bt_preprocess_keys(): */
	bool		qual_ok;		/* false if qual can never be satisfied */
	int			numberOfKeys;	/* number of preprocessed scan keys */
	ScanKey		keyData;		/* array of preprocessed scan keys */

	/* workspace for SK_SEARCHARRAY support */
	ScanKey		arrayKeyData;	/* modified copy of scan->keyData */
	int			numArrayKeys;	/* number of equality-type array keys (-1 if
								 * there are any unsatisfiable array keys) */
	BTArrayKeyInfo *arrayKeys;	/* info about each equality-type array key */
	MemoryContext arrayContext; /* scan-lifespan context for array data */

	/* info about killed items if any (killedItems is NULL if never used) */
	int		   *killedItems;	/* currPos.items indexes of killed items */
	int			numKilled;		/* number of currently stored items */

	/*
	 * If we are doing an index-only scan, these are the tuple storage
	 * workspaces for the currPos and markPos respectively.  Each is of size
	 * BLCKSZ, so it can hold as much as a full page's worth of tuples.
	 */
	char	   *currTuples;		/* tuple storage for currPos */
	char	   *markTuples;		/* tuple storage for markPos */

	/*
	 * If the marked position is on the same page as current position, we
	 * don't use markPos, but just keep the marked itemIndex in markItemIndex
	 * (all the rest of currPos is valid for the mark position).  Hence, to
	 * determine if there is a mark, first look at markItemIndex, then at
	 * markPos.
	 */
	int			markItemIndex;	/* itemIndex, or -1 if not valid */

	/* keep these last in struct for efficiency */
	BTScanPosData currPos;		/* current position data */
	BTScanPosData markPos;		/* marked position, if any */
} BTScanOpaqueData;

typedef BTScanOpaqueData *BTScanOpaque;
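/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * the two-step mark test described above; compare btrestrpos in nbtree.c.
 * Like users of BTScanPosIsValid, it assumes storage/bufmgr.h is included:
 */
static inline bool
nbtree_example_has_mark(BTScanOpaque so)
{
	if (so->markItemIndex >= 0)
		return true;			/* mark is on the current page */
	return BTScanPosIsValid(so->markPos);	/* mark is on some other page */
}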
/*
 * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
 * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
 * index's indoption[] array entry for the index attribute.
 */
#define SK_BT_REQFWD	0x00010000		/* required to continue forward scan */
#define SK_BT_REQBKWD	0x00020000		/* required to continue backward scan */
#define SK_BT_INDOPTION_SHIFT  24		/* must clear the above bits */
#define SK_BT_DESC			(INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST	(INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
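/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * how a per-attribute indoption word becomes scan-key flag bits;
 * _bt_mkscankey does this when it builds scan keys:
 */
static inline int
nbtree_example_indoption_flags(int16 indoption)
{
	/* shift the option bits above the private SK_BT_* bits */
	return ((int) indoption) << SK_BT_INDOPTION_SHIFT;
}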
/*
 * prototypes for functions in nbtree.c (external entry points for btree)
 */
extern Datum btbuild(PG_FUNCTION_ARGS);
extern Datum btbuildempty(PG_FUNCTION_ARGS);
extern Datum btinsert(PG_FUNCTION_ARGS);
extern Datum btbeginscan(PG_FUNCTION_ARGS);
extern Datum btgettuple(PG_FUNCTION_ARGS);
extern Datum btgetbitmap(PG_FUNCTION_ARGS);
extern Datum btrescan(PG_FUNCTION_ARGS);
extern Datum btendscan(PG_FUNCTION_ARGS);
extern Datum btmarkpos(PG_FUNCTION_ARGS);
extern Datum btrestrpos(PG_FUNCTION_ARGS);
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
extern Datum btcanreturn(PG_FUNCTION_ARGS);
extern Datum btoptions(PG_FUNCTION_ARGS);

/*
 * prototypes for functions in nbtinsert.c
 */
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
			 IndexUniqueCheck checkUnique, Relation heapRel);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
				  BTStack stack, bool is_root, bool is_only);

/*
 * prototypes for functions in nbtpage.c
 */
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int	_bt_getrootheight(Relation rel);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
				 BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_delete(Relation rel, Buffer buf,
					OffsetNumber *itemnos, int nitems, Relation heapRel);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
					OffsetNumber *itemnos, int nitems,
					BlockNumber lastBlockVacuumed);
extern int	_bt_pagedel(Relation rel, Buffer buf, BTStack stack);

/*
 * prototypes for functions in nbtsearch.c
 */
extern BTStack _bt_search(Relation rel,
		   int keysz, ScanKey scankey, bool nextkey,
		   Buffer *bufP, int access);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
			  ScanKey scankey, bool nextkey, int access);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
			ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
			Page page, OffsetNumber offnum);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);

/*
 * prototypes for functions in nbtutils.c
 */
extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
extern ScanKey _bt_mkscankey_nodata(Relation rel);
extern void _bt_freeskey(ScanKey skey);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_array_keys(IndexScanDesc scan);
extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
extern void _bt_mark_array_keys(IndexScanDesc scan);
extern void _bt_restore_array_keys(IndexScanDesc scan);
extern void _bt_preprocess_keys(IndexScanDesc scan);
extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
			  Page page, OffsetNumber offnum,
			  ScanDirection dir, bool *continuescan);
extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
extern BTCycleId _bt_start_vacuum(Relation rel);
extern void _bt_end_vacuum(Relation rel);
extern void _bt_end_vacuum_callback(int code, Datum arg);
extern Size BTreeShmemSize(void);
extern void BTreeShmemInit(void);

/*
 * prototypes for functions in nbtsort.c
 */
typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */

extern BTSpool *_bt_spoolinit(Relation heap, Relation index,
			  bool isunique, bool isdead);
extern void _bt_spooldestroy(BTSpool *btspool);
extern void _bt_spool(IndexTuple itup, BTSpool *btspool);
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);

/*
 * prototypes for functions in nbtxlog.c
 */
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
extern void btree_xlog_startup(void);
extern void btree_xlog_cleanup(void);
extern bool btree_safe_restartpoint(void);

#endif   /* NBTREE_H */