/*-------------------------------------------------------------------------
 *
 * nbtree.h
 *	  header file for postgres btree access method implementation.
 *
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/nbtree.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef NBTREE_H
#define NBTREE_H

#include "access/genam.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "catalog/pg_index.h"

/* There's room for a 16-bit vacuum cycle ID in BTPageOpaqueData */
typedef uint16 BTCycleId;

/*
 *	BTPageOpaqueData -- At the end of every page, we store pointers to
 *	both siblings in the tree.  This is used to do forward/backward
 *	index scans.  The next-page link is also critical for recovery when
 *	a search has navigated to the wrong page due to concurrent page splits
 *	or deletions; see src/backend/access/nbtree/README for more info.
 *
 *	In addition, we store the page's btree level (counting upwards from
 *	zero at a leaf page) as well as some flag bits indicating the page type
 *	and status.  If the page is deleted, we replace the level with the
 *	next-transaction-ID value indicating when it is safe to reclaim the page.
 *
 *	We also store a "vacuum cycle ID".  When a page is split while VACUUM is
 *	processing the index, a nonzero value associated with the VACUUM run is
 *	stored into both halves of the split page.  (If VACUUM is not running,
 *	both pages receive zero cycleids.)  This allows VACUUM to detect whether
 *	a page was split since it started, with a small probability of false match
 *	if the page was last split some exact multiple of MAX_BT_CYCLE_ID VACUUMs
 *	ago.  Also, during a split, the BTP_SPLIT_END flag is cleared in the left
 *	(original) page, and set in the right page, but only if the next page
 *	to its right has a different cycleid.
 *
 *	NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
 *	instead.
 */

typedef struct BTPageOpaqueData
{
	BlockNumber btpo_prev;		/* left sibling, or P_NONE if leftmost */
	BlockNumber btpo_next;		/* right sibling, or P_NONE if rightmost */
	union
	{
		uint32		level;		/* tree level --- zero for leaf pages */
		TransactionId xact;		/* next transaction ID, if deleted */
	}			btpo;
	uint16		btpo_flags;		/* flag bits, see below */
	BTCycleId	btpo_cycleid;	/* vacuum cycle ID of latest split */
} BTPageOpaqueData;

typedef BTPageOpaqueData *BTPageOpaque;

/* Bits defined in btpo_flags */
#define BTP_LEAF		(1 << 0)	/* leaf page, i.e. not internal page */
#define BTP_ROOT		(1 << 1)	/* root page (has no parent) */
#define BTP_DELETED		(1 << 2)	/* page has been deleted from tree */
#define BTP_META		(1 << 3)	/* meta-page */
#define BTP_HALF_DEAD	(1 << 4)	/* empty, but still in tree */
#define BTP_SPLIT_END	(1 << 5)	/* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples */
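/*
 * A minimal illustrative sketch (the helper name is just an example, not
 * part of the btree API) of how callers reach the opaque data and read the
 * level stored in the union; it assumes the page comes from a pinned and
 * suitably locked buffer, as in nbtpage.c:
 */
static inline uint32
nbtree_example_page_level(Page page)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/* for a deleted page the union holds btpo.xact, not a level */
	Assert(!(opaque->btpo_flags & BTP_DELETED));
	return opaque->btpo.level;
}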
/*
 * The max allowed value of a cycle ID is a bit less than 64K.  This is
 * for convenience of pg_filedump and similar utilities: we want to use
 * the last 2 bytes of special space as an index type indicator, and
 * restricting cycle ID lets btree use that space for vacuum cycle IDs
 * while still allowing index type to be identified.
 */
#define MAX_BT_CYCLE_ID		0xFF7F


/*
 * The Meta page is always the first page in the btree index.
 * Its primary purpose is to point to the location of the btree root page.
 * We also point to the "fast" root, which is the current effective root;
 * see README for discussion.
 */

typedef struct BTMetaPageData
{
	uint32		btm_magic;		/* should contain BTREE_MAGIC */
	uint32		btm_version;	/* should contain BTREE_VERSION */
	BlockNumber btm_root;		/* current root location */
	uint32		btm_level;		/* tree level of the root page */
	BlockNumber btm_fastroot;	/* current "fast" root location */
	uint32		btm_fastlevel;	/* tree level of the "fast" root page */
} BTMetaPageData;

#define BTPageGetMeta(p) \
	((BTMetaPageData *) PageGetContents(p))

#define BTREE_METAPAGE	0		/* first page is meta */
#define BTREE_MAGIC		0x053162	/* magic number of btree pages */
#define BTREE_VERSION	2		/* current version number */
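/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * reading the metapage once it is in a buffer; real callers such as
 * _bt_getroot also check btm_magic and btm_version, and treat a mismatch
 * as a corrupt or wrong-type index:
 */
static inline BlockNumber
nbtree_example_meta_root(Page metapg)
{
	BTMetaPageData *metad = BTPageGetMeta(metapg);

	Assert(metad->btm_magic == BTREE_MAGIC);
	Assert(metad->btm_version == BTREE_VERSION);
	return metad->btm_root;
}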
/*
 *	Maximum size of a btree index entry, including its tuple header.
 *
 *	We actually need to be able to fit three items on every page,
 *	so restrict any one item to 1/3 the per-page available space.
 */
#define BTMaxItemSize(page) \
	MAXALIGN_DOWN((PageGetPageSize(page) - \
				   MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
				   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

/*
 * The leaf-page fillfactor defaults to 90% but is user-adjustable.
 * For pages above the leaf level, we use a fixed 70% fillfactor.
 * The fillfactor is applied during index build and when splitting
 * a rightmost page; when splitting non-rightmost pages we try to
 * divide the data equally.
 */
#define BTREE_MIN_FILLFACTOR		10
#define BTREE_DEFAULT_FILLFACTOR	90
#define BTREE_NONLEAF_FILLFACTOR	70

/*
 *	Test whether two btree entries are "the same".
 *
 *	Old comments:
 *	In addition, we must guarantee that all tuples in the index are unique,
 *	in order to satisfy some assumptions in Lehman and Yao.  The way that we
 *	do this is by generating a new OID for every insertion that we do in the
 *	tree.  This adds eight bytes to the size of btree index tuples.  Note
 *	that we do not use the OID as part of a composite key; the OID only
 *	serves as a unique identifier for a given index tuple (logical position
 *	within a page).
 *
 *	New comments:
 *	actually, we must guarantee that all tuples in A LEVEL
 *	are unique, not in ALL INDEX.  So, we can use the t_tid
 *	as unique identifier for a given index tuple (logical position
 *	within a level). - vadim 04/09/97
 */
#define BTTidSame(i1, i2)	\
	( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
	  (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
	  (i1).ip_posid == (i2).ip_posid )
#define BTEntrySame(i1, i2) \
	BTTidSame((i1)->t_tid, (i2)->t_tid)


/*
 *	In general, the btree code tries to localize its knowledge about
 *	page layout to a couple of routines.  However, we need a special
 *	value to indicate "no page number" in those places where we expect
 *	page numbers.  We can use zero for this because we never need to
 *	make a pointer to the metadata page.
 */

#define P_NONE			0

/*
 * Macros to test whether a page is leftmost or rightmost on its tree level,
 * as well as other state info kept in the opaque data.
 */
#define P_LEFTMOST(opaque)		((opaque)->btpo_prev == P_NONE)
#define P_RIGHTMOST(opaque)		((opaque)->btpo_next == P_NONE)
#define P_ISLEAF(opaque)		((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT(opaque)		((opaque)->btpo_flags & BTP_ROOT)
#define P_ISDELETED(opaque)		((opaque)->btpo_flags & BTP_DELETED)
#define P_ISHALFDEAD(opaque)	((opaque)->btpo_flags & BTP_HALF_DEAD)
#define P_IGNORE(opaque)		((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
#define P_HAS_GARBAGE(opaque)	((opaque)->btpo_flags & BTP_HAS_GARBAGE)

/*
 *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
 *	page.  The high key is not a data key, but gives info about what range of
 *	keys is supposed to be on this page.  The high key on a page is required
 *	to be greater than or equal to any data key that appears on the page.
 *	If we find ourselves trying to insert a key > high key, we know we need
 *	to move right (this should only happen if the page was split since we
 *	examined the parent page).
 *
 *	Our insertion algorithm guarantees that we can use the initial least key
 *	on our right sibling as the high key.  Once a page is created, its high
 *	key changes only if the page is split.
 *
 *	On a non-rightmost page, the high key lives in item 1 and data items
 *	start in item 2.  Rightmost pages have no high key, so we store data
 *	items beginning in item 1.
 */

#define P_HIKEY				((OffsetNumber) 1)
#define P_FIRSTKEY			((OffsetNumber) 2)
#define P_FIRSTDATAKEY(opaque)	(P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
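/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * visiting the data items on a page while skipping the high key on
 * non-rightmost pages; the page-reading loops in nbtsearch.c follow this
 * same pattern:
 */
static inline int
nbtree_example_count_data_items(Page page)
{
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	OffsetNumber off;
	int			nitems = 0;

	for (off = P_FIRSTDATAKEY(opaque); off <= maxoff; off = OffsetNumberNext(off))
		nitems++;
	return nitems;
}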
/*
 * XLOG records for btree operations
 *
 * XLOG allows us to store some information in the high 4 bits of the log
 * record xl_info field.
 */
#define XLOG_BTREE_INSERT_LEAF	0x00	/* add index tuple without split */
#define XLOG_BTREE_INSERT_UPPER 0x10	/* same, on a non-leaf page */
#define XLOG_BTREE_INSERT_META	0x20	/* same, plus update metapage */
#define XLOG_BTREE_SPLIT_L		0x30	/* add index tuple with split */
#define XLOG_BTREE_SPLIT_R		0x40	/* as above, new item on right */
#define XLOG_BTREE_SPLIT_L_ROOT 0x50	/* add tuple with split of root */
#define XLOG_BTREE_SPLIT_R_ROOT 0x60	/* as above, new item on right */
#define XLOG_BTREE_DELETE		0x70	/* delete leaf index tuples for a page */
#define XLOG_BTREE_DELETE_PAGE	0x80	/* delete an entire page */
#define XLOG_BTREE_DELETE_PAGE_META 0x90	/* same, and update metapage */
#define XLOG_BTREE_NEWROOT		0xA0	/* new root page */
#define XLOG_BTREE_DELETE_PAGE_HALF 0xB0	/* page deletion that makes
											 * parent half-dead */
#define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during
										 * vacuum */
#define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from
										 * FSM */

/*
 * All that we need to find a changed index tuple
 */
typedef struct xl_btreetid
{
	RelFileNode node;
	ItemPointerData tid;		/* changed tuple id */
} xl_btreetid;

/*
 * All that we need to regenerate the meta-data page
 */
typedef struct xl_btree_metadata
{
	BlockNumber root;
	uint32		level;
	BlockNumber fastroot;
	uint32		fastlevel;
} xl_btree_metadata;

/*
 * This is what we need to know about a simple (without split) insert.
 *
 * This data record is used for INSERT_LEAF, INSERT_UPPER, INSERT_META.
 * Note that INSERT_META implies it's not a leaf page.
 */
typedef struct xl_btree_insert
{
	xl_btreetid target;			/* inserted tuple id */
	/* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
	/* INDEX TUPLE FOLLOWS AT END OF STRUCT */
} xl_btree_insert;

#define SizeOfBtreeInsert	(offsetof(xl_btreetid, tid) + SizeOfIptrData)
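/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * how redo code locates the data trailing the fixed-size part of an insert
 * record; since target is the first member, SizeOfBtreeInsert is the size
 * of that fixed part (compare btree_xlog_insert in nbtxlog.c):
 */
static inline char *
nbtree_example_insert_payload(xl_btree_insert *xlrec)
{
	/* what follows here depends on the xl_info flags, per the comments above */
	return ((char *) xlrec) + SizeOfBtreeInsert;
}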
/*
 * On insert with split, we save all the items going into the right sibling
 * so that we can restore it completely from the log record.  This way takes
 * less xlog space than the normal approach, because if we did it the
 * standard way, XLogInsert would almost always think the right page is new
 * and store its whole page image.  The left page, however, is handled in
 * the normal incremental-update fashion.
 *
 * Note: the four XLOG_BTREE_SPLIT xl_info codes all use this data record.
 * The _L and _R variants indicate whether the inserted tuple went into the
 * left or right split page (and thus, whether newitemoff and the new item
 * are stored or not).  The _ROOT variants indicate that we are splitting
 * the root page, and thus that a newroot record rather than an insert or
 * split record should follow.  Note that a split record never carries a
 * metapage update --- we'll do that in the parent-level update.
 */
typedef struct xl_btree_split
{
	RelFileNode node;
	BlockNumber leftsib;		/* orig page / new left page */
	BlockNumber rightsib;		/* new right page */
	BlockNumber rnext;			/* next block (orig page's rightlink) */
	uint32		level;			/* tree level of page being split */
	OffsetNumber firstright;	/* first item moved to right page */

	/*
	 * If level > 0, BlockIdData downlink follows.  (We use BlockIdData rather
	 * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
	 * aligned.)
	 *
	 * If level > 0, an IndexTuple representing the HIKEY of the left page
	 * follows.  We don't need this on leaf pages, because it's the same as
	 * the leftmost key in the new right page.  Also, it's suppressed if
	 * XLogInsert chooses to store the left page's whole page image.
	 *
	 * In the _L variants, next are OffsetNumber newitemoff and the new item.
	 * (In the _R variants, the new item is one of the right page's tuples.)
	 * The new item, but not newitemoff, is suppressed if XLogInsert chooses
	 * to store the left page's whole page image.
	 *
	 * Last are the right page's tuples in the form used by _bt_restore_page.
	 */
} xl_btree_split;

#define SizeOfBtreeSplit	(offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))

/*
 * This is what we need to know about delete of individual leaf index tuples.
 * The WAL record can represent deletion of any number of index tuples on a
 * single index page when *not* executed by VACUUM.
 */
typedef struct xl_btree_delete
{
	RelFileNode node;			/* RelFileNode of the index */
	BlockNumber block;
	RelFileNode hnode;			/* RelFileNode of the heap the index currently
								 * points at */
	int			nitems;

	/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;

#define SizeOfBtreeDelete	(offsetof(xl_btree_delete, nitems) + sizeof(int))

/*
 * This is what we need to know about page reuse within btree.
 */
typedef struct xl_btree_reuse_page
{
	RelFileNode node;
	BlockNumber block;
	TransactionId latestRemovedXid;
} xl_btree_reuse_page;

#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
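/*
 * A minimal illustrative sketch (example helper, not part of the API): the
 * offset numbers being deleted trail the fixed-size record, so redo code
 * can address them as an array of nitems entries (compare btree_xlog_delete
 * in nbtxlog.c):
 */
static inline OffsetNumber *
nbtree_example_delete_offsets(xl_btree_delete *xlrec)
{
	return (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
}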
/*
 * This is what we need to know about vacuum of individual leaf index tuples.
 * The WAL record can represent deletion of any number of index tuples on a
 * single index page when executed by VACUUM.
 *
 * The correctness requirement for applying these changes during recovery is
 * that we must do one of these two things for every block in the index:
 *		* lock the block for cleanup and apply any required changes
 *		* EnsureBlockUnpinned()
 * The purpose of this is to ensure that no index scans started before we
 * finish scanning the index are still running by the time we begin to remove
 * heap tuples.
 *
 * Any changes to any one block are registered on just one WAL record.  All
 * blocks that we need to run EnsureBlockUnpinned() on are listed as a block
 * range running from the last block vacuumed up to and including this one.
 * Individual block numbers aren't given.
 *
 * Note that the *last* WAL record in any vacuum of an index is allowed to
 * have a zero-length array of offsets.  Earlier records must have at least
 * one.
 */
typedef struct xl_btree_vacuum
{
	RelFileNode node;
	BlockNumber block;
	BlockNumber lastBlockVacuumed;

	/* TARGET OFFSET NUMBERS FOLLOW */
} xl_btree_vacuum;

#define SizeOfBtreeVacuum	(offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))

/*
 * This is what we need to know about deletion of a btree page.  The target
 * identifies the tuple removed from the parent page (note that we remove
 * this tuple's downlink and the *following* tuple's key).  Note we do not
 * store any content for the deleted page --- it is just rewritten as empty
 * during recovery, apart from resetting the btpo.xact.
 */
typedef struct xl_btree_delete_page
{
	xl_btreetid target;			/* deleted tuple id in parent page */
	BlockNumber deadblk;		/* child block being deleted */
	BlockNumber leftblk;		/* child block's left sibling, if any */
	BlockNumber rightblk;		/* child block's right sibling */
	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_DELETE_PAGE_META */
} xl_btree_delete_page;

#define SizeOfBtreeDeletePage	(offsetof(xl_btree_delete_page, btpo_xact) + sizeof(TransactionId))

/*
 * New root log record.  There are zero tuples if this is to establish an
 * empty root, or two if it is the result of splitting an old root.
 *
 * Note that although this implies rewriting the metadata page, we don't need
 * an xl_btree_metadata record --- the rootblk and level are sufficient.
 */
typedef struct xl_btree_newroot
{
	RelFileNode node;
	BlockNumber rootblk;		/* location of new root */
	uint32		level;			/* its tree level */
	/* 0 or 2 INDEX TUPLES FOLLOW AT END OF STRUCT */
} xl_btree_newroot;

#define SizeOfBtreeNewroot	(offsetof(xl_btree_newroot, level) + sizeof(uint32))


/*
 *	Operator strategy numbers for B-tree have been moved to access/skey.h,
 *	because many places need to use them in ScanKeyInit() calls.
 *
 *	The strategy numbers are chosen so that we can commute them by
 *	subtraction, thus:
 */
#define BTCommuteStrategyNumber(strat)	(BTMaxStrategyNumber + 1 - (strat))
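/*
 * A worked example (illustrative helper, not part of the API), using the
 * strategy numbers from access/skey.h (1 = "<" through 5 = ">", with
 * BTMaxStrategyNumber = 5):
 */
static inline void
nbtree_example_commute(void)
{
	/* "<" commutes to ">": 5 + 1 - 1 == 5 */
	Assert(BTCommuteStrategyNumber(BTLessStrategyNumber) == BTGreaterStrategyNumber);
	/* "=" is its own commutation: 5 + 1 - 3 == 3 */
	Assert(BTCommuteStrategyNumber(BTEqualStrategyNumber) == BTEqualStrategyNumber);
}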
/*
 *	When a new operator class is declared, we require that the user
 *	supply us with an amproc procedure (BTORDER_PROC) for determining
 *	whether, for two keys a and b, a < b, a = b, or a > b.  This routine
 *	must return < 0, 0, > 0, respectively, in these three cases.  (It must
 *	not return INT_MIN, since we may negate the result before using it.)
 *
 *	To facilitate accelerated sorting, an operator class may choose to
 *	offer a second procedure (BTSORTSUPPORT_PROC).  For full details, see
 *	src/include/utils/sortsupport.h.
 */

#define BTORDER_PROC		1
#define BTSORTSUPPORT_PROC	2
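/*
 * A minimal illustrative sketch of the comparison contract (example helper,
 * not part of the API); a real BTORDER_PROC is a SQL-callable function,
 * such as btint4cmp, registered as support procedure 1 of the opclass:
 */
static inline int
nbtree_example_int32_order(int32 a, int32 b)
{
	/* return <0, 0, >0 --- and never INT_MIN, so callers may negate it */
	return (a < b) ? -1 : ((a > b) ? 1 : 0);
}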
/*
 *	We need to be able to tell the difference between read and write
 *	requests for pages, in order to do locking correctly.
 */

#define BT_READ			BUFFER_LOCK_SHARE
#define BT_WRITE		BUFFER_LOCK_EXCLUSIVE

/*
 *	BTStackData -- As we descend a tree, we push the (location, downlink)
 *	pairs from internal pages onto a private stack.  If we split a
 *	leaf, we use this stack to walk back up the tree and insert data
 *	into parent pages (and possibly to split them, too).  Lehman and
 *	Yao's update algorithm guarantees that under no circumstances can
 *	our private stack give us an irredeemably bad picture up the tree.
 *	Again, see the paper for details.
 */

typedef struct BTStackData
{
	BlockNumber bts_blkno;
	OffsetNumber bts_offset;
	IndexTupleData bts_btentry;
	struct BTStackData *bts_parent;
} BTStackData;

typedef BTStackData *BTStack;

/*
 * BTScanOpaqueData is the btree-private state needed for an indexscan.
 * This consists of preprocessed scan keys (see _bt_preprocess_keys() for
 * details of the preprocessing), information about the current location
 * of the scan, and information about the marked location, if any.  (We use
 * BTScanPosData to represent the data needed for each of current and marked
 * locations.)  In addition we can remember some known-killed index entries
 * that must be marked before we can move off the current page.
 *
 * Index scans work a page at a time: we pin and read-lock the page, identify
 * all the matching items on the page and save them in BTScanPosData, then
 * release the read-lock while returning the items to the caller for
 * processing.  This approach minimizes lock/unlock traffic.  Note that we
 * keep the pin on the index page until the caller is done with all the items
 * (this is needed for VACUUM synchronization, see nbtree/README).  When we
 * are ready to step to the next page, if the caller has told us any of the
 * items were killed, we re-lock the page to mark them killed, then unlock.
 * Finally we drop the pin and step to the next page in the appropriate
 * direction.
 *
 * If we are doing an index-only scan, we save the entire IndexTuple for each
 * matched item, otherwise only its heap TID and offset.  The IndexTuples go
 * into a separate workspace array; each BTScanPosItem stores its tuple's
 * offset within that array.
 */

typedef struct BTScanPosItem	/* what we remember about each match */
{
	ItemPointerData heapTid;	/* TID of referenced heap item */
	OffsetNumber indexOffset;	/* index item's location within page */
	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
} BTScanPosItem;

typedef struct BTScanPosData
{
	Buffer		buf;			/* if valid, the buffer is pinned */

	BlockNumber nextPage;		/* page's right link when we scanned it */

	/*
	 * moreLeft and moreRight track whether we think there may be matching
	 * index entries to the left and right of the current page, respectively.
	 * We can clear the appropriate one of these flags when _bt_checkkeys()
	 * returns continuescan = false.
	 */
	bool		moreLeft;
	bool		moreRight;

	/*
	 * If we are doing an index-only scan, nextTupleOffset is the first free
	 * location in the associated tuple storage workspace.
	 */
	int			nextTupleOffset;

	/*
	 * The items array is always ordered in index order (ie, increasing
	 * indexoffset).  When scanning backwards it is convenient to fill the
	 * array back-to-front, so we start at the last slot and fill downwards.
	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
	 * itemIndex is a cursor showing which entry was last returned to caller.
	 */
	int			firstItem;		/* first valid index in items[] */
	int			lastItem;		/* last valid index in items[] */
	int			itemIndex;		/* current index in items[] */

	BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;

typedef BTScanPosData *BTScanPos;

#define BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)

/* We need one of these for each equality-type SK_SEARCHARRAY scan key */
typedef struct BTArrayKeyInfo
{
	int			scan_key;		/* index of associated key in arrayKeyData */
	int			cur_elem;		/* index of current element in elem_values */
	int			mark_elem;		/* index of marked element in elem_values */
	int			num_elems;		/* number of elems in current array value */
	Datum	   *elem_values;	/* array of num_elems Datums */
} BTArrayKeyInfo;

typedef struct BTScanOpaqueData
{
	/* these fields are set by _bt_preprocess_keys(): */
	bool		qual_ok;		/* false if qual can never be satisfied */
	int			numberOfKeys;	/* number of preprocessed scan keys */
	ScanKey		keyData;		/* array of preprocessed scan keys */

	/* workspace for SK_SEARCHARRAY support */
	ScanKey		arrayKeyData;	/* modified copy of scan->keyData */
	int			numArrayKeys;	/* number of equality-type array keys (-1 if
								 * there are any unsatisfiable array keys) */
	BTArrayKeyInfo *arrayKeys;	/* info about each equality-type array key */
	MemoryContext arrayContext; /* scan-lifespan context for array data */

	/* info about killed items if any (killedItems is NULL if never used) */
	int		   *killedItems;	/* currPos.items indexes of killed items */
	int			numKilled;		/* number of currently stored items */

	/*
	 * If we are doing an index-only scan, these are the tuple storage
	 * workspaces for the currPos and markPos respectively.  Each is of size
	 * BLCKSZ, so it can hold as much as a full page's worth of tuples.
	 */
	char	   *currTuples;		/* tuple storage for currPos */
	char	   *markTuples;		/* tuple storage for markPos */

	/*
	 * If the marked position is on the same page as current position, we
	 * don't use markPos, but just keep the marked itemIndex in markItemIndex
	 * (all the rest of currPos is valid for the mark position).  Hence, to
	 * determine if there is a mark, first look at markItemIndex, then at
	 * markPos.
	 */
	int			markItemIndex;	/* itemIndex, or -1 if not valid */

	/* keep these last in struct for efficiency */
	BTScanPosData currPos;		/* current position data */
	BTScanPosData markPos;		/* marked position, if any */
} BTScanOpaqueData;

typedef BTScanOpaqueData *BTScanOpaque;
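/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * the two-step mark test described above; compare btrestrpos in nbtree.c.
 * Like users of BTScanPosIsValid, it assumes storage/bufmgr.h is included:
 */
static inline bool
nbtree_example_has_mark(BTScanOpaque so)
{
	if (so->markItemIndex >= 0)
		return true;			/* mark is on the current page */
	return BTScanPosIsValid(so->markPos);	/* mark is on some other page */
}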
/*
 * We use some private sk_flags bits in preprocessed scan keys.  We're allowed
 * to use bits 16-31 (see skey.h).  The uppermost bits are copied from the
 * index's indoption[] array entry for the index attribute.
 */
#define SK_BT_REQFWD	0x00010000		/* required to continue forward scan */
#define SK_BT_REQBKWD	0x00020000		/* required to continue backward scan */
#define SK_BT_INDOPTION_SHIFT  24		/* must clear the above bits */
#define SK_BT_DESC			(INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST	(INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
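/*
 * A minimal illustrative sketch (example helper, not part of the API) of
 * how a per-attribute indoption word becomes scan-key flag bits;
 * _bt_mkscankey does this when it builds scan keys:
 */
static inline int
nbtree_example_indoption_flags(int16 indoption)
{
	/* shift the option bits above the private SK_BT_* bits */
	return ((int) indoption) << SK_BT_INDOPTION_SHIFT;
}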
/*
 * prototypes for functions in nbtree.c (external entry points for btree)
 */
extern Datum btbuild(PG_FUNCTION_ARGS);
extern Datum btbuildempty(PG_FUNCTION_ARGS);
extern Datum btinsert(PG_FUNCTION_ARGS);
extern Datum btbeginscan(PG_FUNCTION_ARGS);
extern Datum btgettuple(PG_FUNCTION_ARGS);
extern Datum btgetbitmap(PG_FUNCTION_ARGS);
extern Datum btrescan(PG_FUNCTION_ARGS);
extern Datum btendscan(PG_FUNCTION_ARGS);
extern Datum btmarkpos(PG_FUNCTION_ARGS);
extern Datum btrestrpos(PG_FUNCTION_ARGS);
extern Datum btbulkdelete(PG_FUNCTION_ARGS);
extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
extern Datum btcanreturn(PG_FUNCTION_ARGS);
extern Datum btoptions(PG_FUNCTION_ARGS);

/*
 * prototypes for functions in nbtinsert.c
 */
extern bool _bt_doinsert(Relation rel, IndexTuple itup,
			 IndexUniqueCheck checkUnique, Relation heapRel);
extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
				  BTStack stack, bool is_root, bool is_only);

/*
 * prototypes for functions in nbtpage.c
 */
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern int	_bt_getrootheight(Relation rel);
extern void _bt_checkpage(Relation rel, Buffer buf);
extern Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access);
extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
				 BlockNumber blkno, int access);
extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_delete(Relation rel, Buffer buf,
					OffsetNumber *itemnos, int nitems, Relation heapRel);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
					OffsetNumber *itemnos, int nitems,
					BlockNumber lastBlockVacuumed);
extern int	_bt_pagedel(Relation rel, Buffer buf, BTStack stack);

/*
 * prototypes for functions in nbtsearch.c
 */
extern BTStack _bt_search(Relation rel,
		   int keysz, ScanKey scankey, bool nextkey,
		   Buffer *bufP, int access);
extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
			  ScanKey scankey, bool nextkey, int access);
extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
			ScanKey scankey, bool nextkey);
extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
			Page page, OffsetNumber offnum);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);

/*
 * prototypes for functions in nbtutils.c
 */
extern ScanKey _bt_mkscankey(Relation rel, IndexTuple itup);
extern ScanKey _bt_mkscankey_nodata(Relation rel);
extern void _bt_freeskey(ScanKey skey);
extern void _bt_freestack(BTStack stack);
extern void _bt_preprocess_array_keys(IndexScanDesc scan);
extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
extern void _bt_mark_array_keys(IndexScanDesc scan);
extern void _bt_restore_array_keys(IndexScanDesc scan);
extern void _bt_preprocess_keys(IndexScanDesc scan);
extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
			  Page page, OffsetNumber offnum,
			  ScanDirection dir, bool *continuescan);
extern void _bt_killitems(IndexScanDesc scan, bool haveLock);
extern BTCycleId _bt_vacuum_cycleid(Relation rel);
extern BTCycleId _bt_start_vacuum(Relation rel);
extern void _bt_end_vacuum(Relation rel);
extern void _bt_end_vacuum_callback(int code, Datum arg);
extern Size BTreeShmemSize(void);
extern void BTreeShmemInit(void);

/*
 * prototypes for functions in nbtsort.c
 */
typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */

extern BTSpool *_bt_spoolinit(Relation heap, Relation index,
			  bool isunique, bool isdead);
extern void _bt_spooldestroy(BTSpool *btspool);
extern void _bt_spool(IndexTuple itup, BTSpool *btspool);
extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);

/*
 * prototypes for functions in nbtxlog.c
 */
extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
extern void btree_xlog_startup(void);
extern void btree_xlog_cleanup(void);
extern bool btree_safe_restartpoint(void);

#endif   /* NBTREE_H */