Header And Logo

PostgreSQL
| The world's most advanced open source database.

nbtsearch.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * nbtsearch.c
00004  *    Search code for postgres btrees.
00005  *
00006  *
00007  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00008  * Portions Copyright (c) 1994, Regents of the University of California
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/access/nbtree/nbtsearch.c
00012  *
00013  *-------------------------------------------------------------------------
00014  */
00015 
00016 #include "postgres.h"
00017 
00018 #include "access/nbtree.h"
00019 #include "access/relscan.h"
00020 #include "miscadmin.h"
00021 #include "pgstat.h"
00022 #include "storage/predicate.h"
00023 #include "utils/lsyscache.h"
00024 #include "utils/rel.h"
00025 
00026 
00027 static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
00028              OffsetNumber offnum);
00029 static void _bt_saveitem(BTScanOpaque so, int itemIndex,
00030              OffsetNumber offnum, IndexTuple itup);
00031 static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
00032 static Buffer _bt_walk_left(Relation rel, Buffer buf);
00033 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
00034 
00035 
00036 /*
00037  *  _bt_search() -- Search the tree for a particular scankey,
00038  *      or more precisely for the first leaf page it could be on.
00039  *
00040  * The passed scankey must be an insertion-type scankey (see nbtree/README),
00041  * but it can omit the rightmost column(s) of the index.
00042  *
00043  * When nextkey is false (the usual case), we are looking for the first
00044  * item >= scankey.  When nextkey is true, we are looking for the first
00045  * item strictly greater than scankey.
00046  *
00047  * Return value is a stack of parent-page pointers.  *bufP is set to the
00048  * address of the leaf-page buffer, which is read-locked and pinned.
00049  * No locks are held on the parent pages, however!
00050  *
00051  * NOTE that the returned buffer is read-locked regardless of the access
00052  * parameter.  However, access = BT_WRITE will allow an empty root page
00053  * to be created and returned.  When access = BT_READ, an empty index
00054  * will result in *bufP being set to InvalidBuffer.
00055  */
00056 BTStack
00057 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
00058            Buffer *bufP, int access)
00059 {
00060     BTStack     stack_in = NULL;
00061 
00062     /* Get the root page to start with */
00063     *bufP = _bt_getroot(rel, access);
00064 
00065     /* If index is empty and access = BT_READ, no root page is created. */
00066     if (!BufferIsValid(*bufP))
00067         return (BTStack) NULL;
00068 
00069     /* Loop iterates once per level descended in the tree */
00070     for (;;)
00071     {
00072         Page        page;
00073         BTPageOpaque opaque;
00074         OffsetNumber offnum;
00075         ItemId      itemid;
00076         IndexTuple  itup;
00077         BlockNumber blkno;
00078         BlockNumber par_blkno;
00079         BTStack     new_stack;
00080 
00081         /*
00082          * Race -- the page we just grabbed may have split since we read its
00083          * pointer in the parent (or metapage).  If it has, we may need to
00084          * move right to its new sibling.  Do that.
00085          */
00086         *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
00087 
00088         /* if this is a leaf page, we're done */
00089         page = BufferGetPage(*bufP);
00090         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00091         if (P_ISLEAF(opaque))
00092             break;
00093 
00094         /*
00095          * Find the appropriate item on the internal page, and get the child
00096          * page that it points to.
00097          */
00098         offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
00099         itemid = PageGetItemId(page, offnum);
00100         itup = (IndexTuple) PageGetItem(page, itemid);
00101         blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
00102         par_blkno = BufferGetBlockNumber(*bufP);
00103 
00104         /*
00105          * We need to save the location of the index entry we chose in the
00106          * parent page on a stack. In case we split the tree, we'll use the
00107          * stack to work back up to the parent page.  We also save the actual
00108          * downlink (TID) to uniquely identify the index entry, in case it
00109          * moves right while we're working lower in the tree.  See the paper
00110          * by Lehman and Yao for how this is detected and handled. (We use the
00111          * child link to disambiguate duplicate keys in the index -- Lehman
00112          * and Yao disallow duplicate keys.)
00113          */
00114         new_stack = (BTStack) palloc(sizeof(BTStackData));
00115         new_stack->bts_blkno = par_blkno;
00116         new_stack->bts_offset = offnum;
00117         memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
00118         new_stack->bts_parent = stack_in;
00119 
00120         /* drop the read lock on the parent page, acquire one on the child */
00121         *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);
00122 
00123         /* okay, all set to move down a level */
00124         stack_in = new_stack;
00125     }
00126 
00127     return stack_in;
00128 }
00129 
00130 /*
00131  *  _bt_moveright() -- move right in the btree if necessary.
00132  *
00133  * When we follow a pointer to reach a page, it is possible that
00134  * the page has changed in the meanwhile.  If this happens, we're
00135  * guaranteed that the page has "split right" -- that is, that any
00136  * data that appeared on the page originally is either on the page
00137  * or strictly to the right of it.
00138  *
00139  * This routine decides whether or not we need to move right in the
00140  * tree by examining the high key entry on the page.  If that entry
00141  * is strictly less than the scankey, or <= the scankey in the nextkey=true
00142  * case, then we followed the wrong link and we need to move right.
00143  *
00144  * The passed scankey must be an insertion-type scankey (see nbtree/README),
00145  * but it can omit the rightmost column(s) of the index.
00146  *
00147  * When nextkey is false (the usual case), we are looking for the first
00148  * item >= scankey.  When nextkey is true, we are looking for the first
00149  * item strictly greater than scankey.
00150  *
00151  * On entry, we have the buffer pinned and a lock of the type specified by
00152  * 'access'.  If we move right, we release the buffer and lock and acquire
00153  * the same on the right sibling.  Return value is the buffer we stop at.
00154  */
00155 Buffer
00156 _bt_moveright(Relation rel,
00157               Buffer buf,
00158               int keysz,
00159               ScanKey scankey,
00160               bool nextkey,
00161               int access)
00162 {
00163     Page        page;
00164     BTPageOpaque opaque;
00165     int32       cmpval;
00166 
00167     page = BufferGetPage(buf);
00168     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00169 
00170     /*
00171      * When nextkey = false (normal case): if the scan key that brought us to
00172      * this page is > the high key stored on the page, then the page has split
00173      * and we need to move right.  (If the scan key is equal to the high key,
00174      * we might or might not need to move right; have to scan the page first
00175      * anyway.)
00176      *
00177      * When nextkey = true: move right if the scan key is >= page's high key.
00178      *
00179      * The page could even have split more than once, so scan as far as
00180      * needed.
00181      *
00182      * We also have to move right if we followed a link that brought us to a
00183      * dead page.
00184      */
00185     cmpval = nextkey ? 0 : 1;
00186 
00187     while (!P_RIGHTMOST(opaque) &&
00188            (P_IGNORE(opaque) ||
00189             _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
00190     {
00191         /* step right one page */
00192         BlockNumber rblkno = opaque->btpo_next;
00193 
00194         buf = _bt_relandgetbuf(rel, buf, rblkno, access);
00195         page = BufferGetPage(buf);
00196         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00197     }
00198 
00199     if (P_IGNORE(opaque))
00200         elog(ERROR, "fell off the end of index \"%s\"",
00201              RelationGetRelationName(rel));
00202 
00203     return buf;
00204 }
00205 
00206 /*
00207  *  _bt_binsrch() -- Do a binary search for a key on a particular page.
00208  *
00209  * The passed scankey must be an insertion-type scankey (see nbtree/README),
00210  * but it can omit the rightmost column(s) of the index.
00211  *
00212  * When nextkey is false (the usual case), we are looking for the first
00213  * item >= scankey.  When nextkey is true, we are looking for the first
00214  * item strictly greater than scankey.
00215  *
00216  * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
00217  * key >= given scankey, or > scankey if nextkey is true.  (NOTE: in
00218  * particular, this means it is possible to return a value 1 greater than the
00219  * number of keys on the page, if the scankey is > all keys on the page.)
00220  *
00221  * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
00222  * of the last key < given scankey, or last key <= given scankey if nextkey
00223  * is true.  (Since _bt_compare treats the first data key of such a page as
00224  * minus infinity, there will be at least one key < scankey, so the result
00225  * always points at one of the keys on the page.)  This key indicates the
00226  * right place to descend to be sure we find all leaf keys >= given scankey
00227  * (or leaf keys > given scankey when nextkey is true).
00228  *
00229  * This procedure is not responsible for walking right, it just examines
00230  * the given page.  _bt_binsrch() has no lock or refcount side effects
00231  * on the buffer.
00232  */
00233 OffsetNumber
00234 _bt_binsrch(Relation rel,
00235             Buffer buf,
00236             int keysz,
00237             ScanKey scankey,
00238             bool nextkey)
00239 {
00240     Page        page;
00241     BTPageOpaque opaque;
00242     OffsetNumber low,
00243                 high;
00244     int32       result,
00245                 cmpval;
00246 
00247     page = BufferGetPage(buf);
00248     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00249 
00250     low = P_FIRSTDATAKEY(opaque);
00251     high = PageGetMaxOffsetNumber(page);
00252 
00253     /*
00254      * If there are no keys on the page, return the first available slot. Note
00255      * this covers two cases: the page is really empty (no keys), or it
00256      * contains only a high key.  The latter case is possible after vacuuming.
00257      * This can never happen on an internal page, however, since they are
00258      * never empty (an internal page must have children).
00259      */
00260     if (high < low)
00261         return low;
00262 
00263     /*
00264      * Binary search to find the first key on the page >= scan key, or first
00265      * key > scankey when nextkey is true.
00266      *
00267      * For nextkey=false (cmpval=1), the loop invariant is: all slots before
00268      * 'low' are < scan key, all slots at or after 'high' are >= scan key.
00269      *
00270      * For nextkey=true (cmpval=0), the loop invariant is: all slots before
00271      * 'low' are <= scan key, all slots at or after 'high' are > scan key.
00272      *
00273      * We can fall out when high == low.
00274      */
00275     high++;                     /* establish the loop invariant for high */
00276 
00277     cmpval = nextkey ? 0 : 1;   /* select comparison value */
00278 
00279     while (high > low)
00280     {
00281         OffsetNumber mid = low + ((high - low) / 2);
00282 
00283         /* We have low <= mid < high, so mid points at a real slot */
00284 
00285         result = _bt_compare(rel, keysz, scankey, page, mid);
00286 
00287         if (result >= cmpval)
00288             low = mid + 1;
00289         else
00290             high = mid;
00291     }
00292 
00293     /*
00294      * At this point we have high == low, but be careful: they could point
00295      * past the last slot on the page.
00296      *
00297      * On a leaf page, we always return the first key >= scan key (resp. >
00298      * scan key), which could be the last slot + 1.
00299      */
00300     if (P_ISLEAF(opaque))
00301         return low;
00302 
00303     /*
00304      * On a non-leaf page, return the last key < scan key (resp. <= scan key).
00305      * There must be one if _bt_compare() is playing by the rules.
00306      */
00307     Assert(low > P_FIRSTDATAKEY(opaque));
00308 
00309     return OffsetNumberPrev(low);
00310 }
00311 
00312 /*----------
00313  *  _bt_compare() -- Compare scankey to a particular tuple on the page.
00314  *
00315  * The passed scankey must be an insertion-type scankey (see nbtree/README),
00316  * but it can omit the rightmost column(s) of the index.
00317  *
00318  *  keysz: number of key conditions to be checked (might be less than the
00319  *      number of index columns!)
00320  *  page/offnum: location of btree item to be compared to.
00321  *
00322  *      This routine returns:
00323  *          <0 if scankey < tuple at offnum;
00324  *           0 if scankey == tuple at offnum;
00325  *          >0 if scankey > tuple at offnum.
00326  *      NULLs in the keys are treated as sortable values.  Therefore
00327  *      "equality" does not necessarily mean that the item should be
00328  *      returned to the caller as a matching key!
00329  *
00330  * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be
00331  * "minus infinity": this routine will always claim it is less than the
00332  * scankey.  The actual key value stored (if any, which there probably isn't)
00333  * does not matter.  This convention allows us to implement the Lehman and
00334  * Yao convention that the first down-link pointer is before the first key.
00335  * See backend/access/nbtree/README for details.
00336  *----------
00337  */
00338 int32
00339 _bt_compare(Relation rel,
00340             int keysz,
00341             ScanKey scankey,
00342             Page page,
00343             OffsetNumber offnum)
00344 {
00345     TupleDesc   itupdesc = RelationGetDescr(rel);
00346     BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
00347     IndexTuple  itup;
00348     int         i;
00349 
00350     /*
00351      * Force result ">" if target item is first data item on an internal page
00352      * --- see NOTE above.
00353      */
00354     if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
00355         return 1;
00356 
00357     itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
00358 
00359     /*
00360      * The scan key is set up with the attribute number associated with each
00361      * term in the key.  It is important that, if the index is multi-key, the
00362      * scan contain the first k key attributes, and that they be in order.  If
00363      * you think about how multi-key ordering works, you'll understand why
00364      * this is.
00365      *
00366      * We don't test for violation of this condition here, however.  The
00367      * initial setup for the index scan had better have gotten it right (see
00368      * _bt_first).
00369      */
00370 
00371     for (i = 1; i <= keysz; i++)
00372     {
00373         Datum       datum;
00374         bool        isNull;
00375         int32       result;
00376 
00377         datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);
00378 
00379         /* see comments about NULLs handling in btbuild */
00380         if (scankey->sk_flags & SK_ISNULL)      /* key is NULL */
00381         {
00382             if (isNull)
00383                 result = 0;     /* NULL "=" NULL */
00384             else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
00385                 result = -1;    /* NULL "<" NOT_NULL */
00386             else
00387                 result = 1;     /* NULL ">" NOT_NULL */
00388         }
00389         else if (isNull)        /* key is NOT_NULL and item is NULL */
00390         {
00391             if (scankey->sk_flags & SK_BT_NULLS_FIRST)
00392                 result = 1;     /* NOT_NULL ">" NULL */
00393             else
00394                 result = -1;    /* NOT_NULL "<" NULL */
00395         }
00396         else
00397         {
00398             /*
00399              * The sk_func needs to be passed the index value as left arg and
00400              * the sk_argument as right arg (they might be of different
00401              * types).  Since it is convenient for callers to think of
00402              * _bt_compare as comparing the scankey to the index item, we have
00403              * to flip the sign of the comparison result.  (Unless it's a DESC
00404              * column, in which case we *don't* flip the sign.)
00405              */
00406             result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
00407                                                      scankey->sk_collation,
00408                                                      datum,
00409                                                      scankey->sk_argument));
00410 
00411             if (!(scankey->sk_flags & SK_BT_DESC))
00412                 result = -result;
00413         }
00414 
00415         /* if the keys are unequal, return the difference */
00416         if (result != 0)
00417             return result;
00418 
00419         scankey++;
00420     }
00421 
00422     /* if we get here, the keys are equal */
00423     return 0;
00424 }
00425 
00426 /*
00427  *  _bt_first() -- Find the first item in a scan.
00428  *
00429  *      We need to be clever about the direction of scan, the search
00430  *      conditions, and the tree ordering.  We find the first item (or,
00431  *      if backwards scan, the last item) in the tree that satisfies the
00432  *      qualifications in the scan key.  On success exit, the page containing
00433  *      the current index tuple is pinned but not locked, and data about
00434  *      the matching tuple(s) on the page has been loaded into so->currPos.
00435  *      scan->xs_ctup.t_self is set to the heap TID of the current tuple,
00436  *      and if requested, scan->xs_itup points to a copy of the index tuple.
00437  *
00438  * If there are no matching items in the index, we return FALSE, with no
00439  * pins or locks held.
00440  *
00441  * Note that scan->keyData[], and the so->keyData[] scankey built from it,
00442  * are both search-type scankeys (see nbtree/README for more about this).
00443  * Within this routine, we build a temporary insertion-type scankey to use
00444  * in locating the scan start position.
00445  */
00446 bool
00447 _bt_first(IndexScanDesc scan, ScanDirection dir)
00448 {
00449     Relation    rel = scan->indexRelation;
00450     BTScanOpaque so = (BTScanOpaque) scan->opaque;
00451     Buffer      buf;
00452     BTStack     stack;
00453     OffsetNumber offnum;
00454     StrategyNumber strat;
00455     bool        nextkey;
00456     bool        goback;
00457     ScanKey     startKeys[INDEX_MAX_KEYS];
00458     ScanKeyData scankeys[INDEX_MAX_KEYS];
00459     ScanKeyData notnullkeys[INDEX_MAX_KEYS];
00460     int         keysCount = 0;
00461     int         i;
00462     StrategyNumber strat_total;
00463     BTScanPosItem *currItem;
00464 
00465     pgstat_count_index_scan(rel);
00466 
00467     /*
00468      * Examine the scan keys and eliminate any redundant keys; also mark the
00469      * keys that must be matched to continue the scan.
00470      */
00471     _bt_preprocess_keys(scan);
00472 
00473     /*
00474      * Quit now if _bt_preprocess_keys() discovered that the scan keys can
00475      * never be satisfied (eg, x == 1 AND x > 2).
00476      */
00477     if (!so->qual_ok)
00478         return false;
00479 
00480     /*----------
00481      * Examine the scan keys to discover where we need to start the scan.
00482      *
00483      * We want to identify the keys that can be used as starting boundaries;
00484      * these are =, >, or >= keys for a forward scan or =, <, <= keys for
00485      * a backwards scan.  We can use keys for multiple attributes so long as
00486      * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
00487      * a > or < boundary or find an attribute with no boundary (which can be
00488      * thought of as the same as "> -infinity"), we can't use keys for any
00489      * attributes to its right, because it would break our simplistic notion
00490      * of what initial positioning strategy to use.
00491      *
00492      * When the scan keys include cross-type operators, _bt_preprocess_keys
00493      * may not be able to eliminate redundant keys; in such cases we will
00494      * arbitrarily pick a usable one for each attribute.  This is correct
00495      * but possibly not optimal behavior.  (For example, with keys like
00496      * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
00497      * x=5 would be more efficient.)  Since the situation only arises given
00498      * a poorly-worded query plus an incomplete opfamily, live with it.
00499      *
00500      * When both equality and inequality keys appear for a single attribute
00501      * (again, only possible when cross-type operators appear), we *must*
00502      * select one of the equality keys for the starting point, because
00503      * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
00504      * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
00505      * start at x=4, we will fail and stop before reaching x=10.  If multiple
00506      * equality quals survive preprocessing, however, it doesn't matter which
00507      * one we use --- by definition, they are either redundant or
00508      * contradictory.
00509      *
00510      * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
00511      * If the index stores nulls at the end of the index we'll be starting
00512      * from, and we have no boundary key for the column (which means the key
00513      * we deduced NOT NULL from is an inequality key that constrains the other
00514      * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
00515      * use as a boundary key.  If we didn't do this, we might find ourselves
00516      * traversing a lot of null entries at the start of the scan.
00517      *
00518      * In this loop, row-comparison keys are treated the same as keys on their
00519      * first (leftmost) columns.  We'll add on lower-order columns of the row
00520      * comparison below, if possible.
00521      *
00522      * The selected scan keys (at most one per index column) are remembered by
00523      * storing their addresses into the local startKeys[] array.
00524      *----------
00525      */
00526     strat_total = BTEqualStrategyNumber;
00527     if (so->numberOfKeys > 0)
00528     {
00529         AttrNumber  curattr;
00530         ScanKey     chosen;
00531         ScanKey     impliesNN;
00532         ScanKey     cur;
00533 
00534         /*
00535          * chosen is the so-far-chosen key for the current attribute, if any.
00536          * We don't cast the decision in stone until we reach keys for the
00537          * next attribute.
00538          */
00539         curattr = 1;
00540         chosen = NULL;
00541         /* Also remember any scankey that implies a NOT NULL constraint */
00542         impliesNN = NULL;
00543 
00544         /*
00545          * Loop iterates from 0 to numberOfKeys inclusive; we use the last
00546          * pass to handle after-last-key processing.  Actual exit from the
00547          * loop is at one of the "break" statements below.
00548          */
00549         for (cur = so->keyData, i = 0;; cur++, i++)
00550         {
00551             if (i >= so->numberOfKeys || cur->sk_attno != curattr)
00552             {
00553                 /*
00554                  * Done looking at keys for curattr.  If we didn't find a
00555                  * usable boundary key, see if we can deduce a NOT NULL key.
00556                  */
00557                 if (chosen == NULL && impliesNN != NULL &&
00558                     ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
00559                      ScanDirectionIsForward(dir) :
00560                      ScanDirectionIsBackward(dir)))
00561                 {
00562                     /* Yes, so build the key in notnullkeys[keysCount] */
00563                     chosen = &notnullkeys[keysCount];
00564                     ScanKeyEntryInitialize(chosen,
00565                                            (SK_SEARCHNOTNULL | SK_ISNULL |
00566                                             (impliesNN->sk_flags &
00567                                           (SK_BT_DESC | SK_BT_NULLS_FIRST))),
00568                                            curattr,
00569                                  ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
00570                                   BTGreaterStrategyNumber :
00571                                   BTLessStrategyNumber),
00572                                            InvalidOid,
00573                                            InvalidOid,
00574                                            InvalidOid,
00575                                            (Datum) 0);
00576                 }
00577 
00578                 /*
00579                  * If we still didn't find a usable boundary key, quit; else
00580                  * save the boundary key pointer in startKeys.
00581                  */
00582                 if (chosen == NULL)
00583                     break;
00584                 startKeys[keysCount++] = chosen;
00585 
00586                 /*
00587                  * Adjust strat_total, and quit if we have stored a > or <
00588                  * key.
00589                  */
00590                 strat = chosen->sk_strategy;
00591                 if (strat != BTEqualStrategyNumber)
00592                 {
00593                     strat_total = strat;
00594                     if (strat == BTGreaterStrategyNumber ||
00595                         strat == BTLessStrategyNumber)
00596                         break;
00597                 }
00598 
00599                 /*
00600                  * Done if that was the last attribute, or if next key is not
00601                  * in sequence (implying no boundary key is available for the
00602                  * next attribute).
00603                  */
00604                 if (i >= so->numberOfKeys ||
00605                     cur->sk_attno != curattr + 1)
00606                     break;
00607 
00608                 /*
00609                  * Reset for next attr.
00610                  */
00611                 curattr = cur->sk_attno;
00612                 chosen = NULL;
00613                 impliesNN = NULL;
00614             }
00615 
00616             /*
00617              * Can we use this key as a starting boundary for this attr?
00618              *
00619              * If not, does it imply a NOT NULL constraint?  (Because
00620              * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
00621              * *any* inequality key works for that; we need not test.)
00622              */
00623             switch (cur->sk_strategy)
00624             {
00625                 case BTLessStrategyNumber:
00626                 case BTLessEqualStrategyNumber:
00627                     if (chosen == NULL)
00628                     {
00629                         if (ScanDirectionIsBackward(dir))
00630                             chosen = cur;
00631                         else
00632                             impliesNN = cur;
00633                     }
00634                     break;
00635                 case BTEqualStrategyNumber:
00636                     /* override any non-equality choice */
00637                     chosen = cur;
00638                     break;
00639                 case BTGreaterEqualStrategyNumber:
00640                 case BTGreaterStrategyNumber:
00641                     if (chosen == NULL)
00642                     {
00643                         if (ScanDirectionIsForward(dir))
00644                             chosen = cur;
00645                         else
00646                             impliesNN = cur;
00647                     }
00648                     break;
00649             }
00650         }
00651     }
00652 
00653     /*
00654      * If we found no usable boundary keys, we have to start from one end of
00655      * the tree.  Walk down that edge to the first or last key, and scan from
00656      * there.
00657      */
00658     if (keysCount == 0)
00659         return _bt_endpoint(scan, dir);
00660 
00661     /*
00662      * We want to start the scan somewhere within the index.  Set up an
00663      * insertion scankey we can use to search for the boundary point we
00664      * identified above.  The insertion scankey is built in the local
00665      * scankeys[] array, using the keys identified by startKeys[].
00666      */
00667     Assert(keysCount <= INDEX_MAX_KEYS);
00668     for (i = 0; i < keysCount; i++)
00669     {
00670         ScanKey     cur = startKeys[i];
00671 
00672         Assert(cur->sk_attno == i + 1);
00673 
00674         if (cur->sk_flags & SK_ROW_HEADER)
00675         {
00676             /*
00677              * Row comparison header: look to the first row member instead.
00678              *
00679              * The member scankeys are already in insertion format (ie, they
00680              * have sk_func = 3-way-comparison function), but we have to watch
00681              * out for nulls, which _bt_preprocess_keys didn't check. A null
00682              * in the first row member makes the condition unmatchable, just
00683              * like qual_ok = false.
00684              */
00685             ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);
00686 
00687             Assert(subkey->sk_flags & SK_ROW_MEMBER);
00688             if (subkey->sk_flags & SK_ISNULL)
00689                 return false;
00690             memcpy(scankeys + i, subkey, sizeof(ScanKeyData));
00691 
00692             /*
00693              * If the row comparison is the last positioning key we accepted,
00694              * try to add additional keys from the lower-order row members.
00695              * (If we accepted independent conditions on additional index
00696              * columns, we use those instead --- doesn't seem worth trying to
00697              * determine which is more restrictive.)  Note that this is OK
00698              * even if the row comparison is of ">" or "<" type, because the
00699              * condition applied to all but the last row member is effectively
00700              * ">=" or "<=", and so the extra keys don't break the positioning
00701              * scheme.  But, by the same token, if we aren't able to use all
00702              * the row members, then the part of the row comparison that we
00703              * did use has to be treated as just a ">=" or "<=" condition, and
00704              * so we'd better adjust strat_total accordingly.
00705              */
00706             if (i == keysCount - 1)
00707             {
00708                 bool        used_all_subkeys = false;
00709 
00710                 Assert(!(subkey->sk_flags & SK_ROW_END));
00711                 for (;;)
00712                 {
00713                     subkey++;
00714                     Assert(subkey->sk_flags & SK_ROW_MEMBER);
00715                     if (subkey->sk_attno != keysCount + 1)
00716                         break;  /* out-of-sequence, can't use it */
00717                     if (subkey->sk_strategy != cur->sk_strategy)
00718                         break;  /* wrong direction, can't use it */
00719                     if (subkey->sk_flags & SK_ISNULL)
00720                         break;  /* can't use null keys */
00721                     Assert(keysCount < INDEX_MAX_KEYS);
00722                     memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
00723                     keysCount++;
00724                     if (subkey->sk_flags & SK_ROW_END)
00725                     {
00726                         used_all_subkeys = true;
00727                         break;
00728                     }
00729                 }
00730                 if (!used_all_subkeys)
00731                 {
00732                     switch (strat_total)
00733                     {
00734                         case BTLessStrategyNumber:
00735                             strat_total = BTLessEqualStrategyNumber;
00736                             break;
00737                         case BTGreaterStrategyNumber:
00738                             strat_total = BTGreaterEqualStrategyNumber;
00739                             break;
00740                     }
00741                 }
00742                 break;          /* done with outer loop */
00743             }
00744         }
00745         else
00746         {
00747             /*
00748              * Ordinary comparison key.  Transform the search-style scan key
00749              * to an insertion scan key by replacing the sk_func with the
00750              * appropriate btree comparison function.
00751              *
00752              * If scankey operator is not a cross-type comparison, we can use
00753              * the cached comparison function; otherwise gotta look it up in
00754              * the catalogs.  (That can't lead to infinite recursion, since no
00755              * indexscan initiated by syscache lookup will use cross-data-type
00756              * operators.)
00757              *
00758              * We support the convention that sk_subtype == InvalidOid means
00759              * the opclass input type; this is a hack to simplify life for
00760              * ScanKeyInit().
00761              */
00762             if (cur->sk_subtype == rel->rd_opcintype[i] ||
00763                 cur->sk_subtype == InvalidOid)
00764             {
00765                 FmgrInfo   *procinfo;
00766 
00767                 procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
00768                 ScanKeyEntryInitializeWithInfo(scankeys + i,
00769                                                cur->sk_flags,
00770                                                cur->sk_attno,
00771                                                InvalidStrategy,
00772                                                cur->sk_subtype,
00773                                                cur->sk_collation,
00774                                                procinfo,
00775                                                cur->sk_argument);
00776             }
00777             else
00778             {
00779                 RegProcedure cmp_proc;
00780 
00781                 cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
00782                                              rel->rd_opcintype[i],
00783                                              cur->sk_subtype,
00784                                              BTORDER_PROC);
00785                 if (!RegProcedureIsValid(cmp_proc))
00786                     elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
00787                          BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
00788                          cur->sk_attno, RelationGetRelationName(rel));
00789                 ScanKeyEntryInitialize(scankeys + i,
00790                                        cur->sk_flags,
00791                                        cur->sk_attno,
00792                                        InvalidStrategy,
00793                                        cur->sk_subtype,
00794                                        cur->sk_collation,
00795                                        cmp_proc,
00796                                        cur->sk_argument);
00797             }
00798         }
00799     }
00800 
00801     /*----------
00802      * Examine the selected initial-positioning strategy to determine exactly
00803      * where we need to start the scan, and set flag variables to control the
00804      * code below.
00805      *
00806      * If nextkey = false, _bt_search and _bt_binsrch will locate the first
00807      * item >= scan key.  If nextkey = true, they will locate the first
00808      * item > scan key.
00809      *
00810      * If goback = true, we will then step back one item, while if
00811      * goback = false, we will start the scan on the located item.
00812      *----------
00813      */
00814     switch (strat_total)
00815     {
00816         case BTLessStrategyNumber:
00817 
00818             /*
00819              * Find first item >= scankey, then back up one to arrive at last
00820              * item < scankey.  (Note: this positioning strategy is only used
00821              * for a backward scan, so that is always the correct starting
00822              * position.)
00823              */
00824             nextkey = false;
00825             goback = true;
00826             break;
00827 
00828         case BTLessEqualStrategyNumber:
00829 
00830             /*
00831              * Find first item > scankey, then back up one to arrive at last
00832              * item <= scankey.  (Note: this positioning strategy is only used
00833              * for a backward scan, so that is always the correct starting
00834              * position.)
00835              */
00836             nextkey = true;
00837             goback = true;
00838             break;
00839 
00840         case BTEqualStrategyNumber:
00841 
00842             /*
00843              * If a backward scan was specified, need to start with last equal
00844              * item not first one.
00845              */
00846             if (ScanDirectionIsBackward(dir))
00847             {
00848                 /*
00849                  * This is the same as the <= strategy.  We will check at the
00850                  * end whether the found item is actually =.
00851                  */
00852                 nextkey = true;
00853                 goback = true;
00854             }
00855             else
00856             {
00857                 /*
00858                  * This is the same as the >= strategy.  We will check at the
00859                  * end whether the found item is actually =.
00860                  */
00861                 nextkey = false;
00862                 goback = false;
00863             }
00864             break;
00865 
00866         case BTGreaterEqualStrategyNumber:
00867 
00868             /*
00869              * Find first item >= scankey.  (This is only used for forward
00870              * scans.)
00871              */
00872             nextkey = false;
00873             goback = false;
00874             break;
00875 
00876         case BTGreaterStrategyNumber:
00877 
00878             /*
00879              * Find first item > scankey.  (This is only used for forward
00880              * scans.)
00881              */
00882             nextkey = true;
00883             goback = false;
00884             break;
00885 
00886         default:
00887             /* can't get here, but keep compiler quiet */
00888             elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
00889             return false;
00890     }
00891 
00892     /*
00893      * Use the manufactured insertion scan key to descend the tree and
00894      * position ourselves on the target leaf page.
00895      */
00896     stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);
00897 
00898     /* don't need to keep the stack around... */
00899     _bt_freestack(stack);
00900 
00901     /* remember which buffer we have pinned, if any */
00902     so->currPos.buf = buf;
00903 
00904     if (!BufferIsValid(buf))
00905     {
00906         /*
00907          * We only get here if the index is completely empty. Lock relation
00908          * because nothing finer to lock exists.
00909          */
00910         PredicateLockRelation(rel, scan->xs_snapshot);
00911         return false;
00912     }
00913     else
00914         PredicateLockPage(rel, BufferGetBlockNumber(buf),
00915                           scan->xs_snapshot);
00916 
00917     /* initialize moreLeft/moreRight appropriately for scan direction */
00918     if (ScanDirectionIsForward(dir))
00919     {
00920         so->currPos.moreLeft = false;
00921         so->currPos.moreRight = true;
00922     }
00923     else
00924     {
00925         so->currPos.moreLeft = true;
00926         so->currPos.moreRight = false;
00927     }
00928     so->numKilled = 0;          /* just paranoia */
00929     so->markItemIndex = -1;     /* ditto */
00930 
00931     /* position to the precise item on the page */
00932     offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);
00933 
00934     /*
00935      * If nextkey = false, we are positioned at the first item >= scan key, or
00936      * possibly at the end of a page on which all the existing items are less
00937      * than the scan key and we know that everything on later pages is greater
00938      * than or equal to scan key.
00939      *
00940      * If nextkey = true, we are positioned at the first item > scan key, or
00941      * possibly at the end of a page on which all the existing items are less
00942      * than or equal to the scan key and we know that everything on later
00943      * pages is greater than scan key.
00944      *
00945      * The actually desired starting point is either this item or the prior
00946      * one, or in the end-of-page case it's the first item on the next page or
00947      * the last item on this page.  Adjust the starting offset if needed. (If
00948      * this results in an offset before the first item or after the last one,
00949      * _bt_readpage will report no items found, and then we'll step to the
00950      * next page as needed.)
00951      */
00952     if (goback)
00953         offnum = OffsetNumberPrev(offnum);
00954 
00955     /*
00956      * Now load data from the first page of the scan.
00957      */
00958     if (!_bt_readpage(scan, dir, offnum))
00959     {
00960         /*
00961          * There's no actually-matching data on this page.  Try to advance to
00962          * the next page.  Return false if there's no matching data at all.
00963          */
00964         if (!_bt_steppage(scan, dir))
00965             return false;
00966     }
00967 
00968     /* Drop the lock, but not pin, on the current page */
00969     LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
00970 
00971     /* OK, itemIndex says what to return */
00972     currItem = &so->currPos.items[so->currPos.itemIndex];
00973     scan->xs_ctup.t_self = currItem->heapTid;
00974     if (scan->xs_want_itup)
00975         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
00976 
00977     return true;
00978 }
00979 
00980 /*
00981  *  _bt_next() -- Get the next item in a scan.
00982  *
00983  *      On entry, so->currPos describes the current page, which is pinned
00984  *      but not locked, and so->currPos.itemIndex identifies which item was
00985  *      previously returned.
00986  *
00987  *      On successful exit, scan->xs_ctup.t_self is set to the TID of the
00988  *      next heap tuple, and if requested, scan->xs_itup points to a copy of
00989  *      the index tuple.  so->currPos is updated as needed.
00990  *
00991  *      On failure exit (no more tuples), we release pin and set
00992  *      so->currPos.buf to InvalidBuffer.
00993  */
00994 bool
00995 _bt_next(IndexScanDesc scan, ScanDirection dir)
00996 {
00997     BTScanOpaque so = (BTScanOpaque) scan->opaque;
00998     BTScanPosItem *currItem;
00999 
01000     /*
01001      * Advance to next tuple on current page; or if there's no more, try to
01002      * step to the next page with data.
01003      */
01004     if (ScanDirectionIsForward(dir))
01005     {
01006         if (++so->currPos.itemIndex > so->currPos.lastItem)
01007         {
01008             /* We must acquire lock before applying _bt_steppage */
01009             Assert(BufferIsValid(so->currPos.buf));
01010             LockBuffer(so->currPos.buf, BT_READ);
01011             if (!_bt_steppage(scan, dir))
01012                 return false;
01013             /* Drop the lock, but not pin, on the new page */
01014             LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
01015         }
01016     }
01017     else
01018     {
01019         if (--so->currPos.itemIndex < so->currPos.firstItem)
01020         {
01021             /* We must acquire lock before applying _bt_steppage */
01022             Assert(BufferIsValid(so->currPos.buf));
01023             LockBuffer(so->currPos.buf, BT_READ);
01024             if (!_bt_steppage(scan, dir))
01025                 return false;
01026             /* Drop the lock, but not pin, on the new page */
01027             LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
01028         }
01029     }
01030 
01031     /* OK, itemIndex says what to return */
01032     currItem = &so->currPos.items[so->currPos.itemIndex];
01033     scan->xs_ctup.t_self = currItem->heapTid;
01034     if (scan->xs_want_itup)
01035         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
01036 
01037     return true;
01038 }
01039 
01040 /*
01041  *  _bt_readpage() -- Load data from current index page into so->currPos
01042  *
01043  * Caller must have pinned and read-locked so->currPos.buf; the buffer's state
01044  * is not changed here.  Also, currPos.moreLeft and moreRight must be valid;
01045  * they are updated as appropriate.  All other fields of so->currPos are
01046  * initialized from scratch here.
01047  *
01048  * We scan the current page starting at offnum and moving in the indicated
01049  * direction.  All items matching the scan keys are loaded into currPos.items.
01050  * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
01051  * that there can be no more matching tuples in the current scan direction.
01052  *
01053  * Returns true if any matching items found on the page, false if none.
01054  */
01055 static bool
01056 _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
01057 {
01058     BTScanOpaque so = (BTScanOpaque) scan->opaque;
01059     Page        page;
01060     BTPageOpaque opaque;
01061     OffsetNumber minoff;
01062     OffsetNumber maxoff;
01063     int         itemIndex;
01064     IndexTuple  itup;
01065     bool        continuescan;
01066 
01067     /* we must have the buffer pinned and locked */
01068     Assert(BufferIsValid(so->currPos.buf));
01069 
01070     page = BufferGetPage(so->currPos.buf);
01071     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01072     minoff = P_FIRSTDATAKEY(opaque);
01073     maxoff = PageGetMaxOffsetNumber(page);
01074 
01075     /*
01076      * we must save the page's right-link while scanning it; this tells us
01077      * where to step right to after we're done with these items.  There is no
01078      * corresponding need for the left-link, since splits always go right.
01079      */
01080     so->currPos.nextPage = opaque->btpo_next;
01081 
01082     /* initialize tuple workspace to empty */
01083     so->currPos.nextTupleOffset = 0;
01084 
01085     if (ScanDirectionIsForward(dir))
01086     {
01087         /* load items[] in ascending order */
01088         itemIndex = 0;
01089 
01090         offnum = Max(offnum, minoff);
01091 
01092         while (offnum <= maxoff)
01093         {
01094             itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
01095             if (itup != NULL)
01096             {
01097                 /* tuple passes all scan key conditions, so remember it */
01098                 _bt_saveitem(so, itemIndex, offnum, itup);
01099                 itemIndex++;
01100             }
01101             if (!continuescan)
01102             {
01103                 /* there can't be any more matches, so stop */
01104                 so->currPos.moreRight = false;
01105                 break;
01106             }
01107 
01108             offnum = OffsetNumberNext(offnum);
01109         }
01110 
01111         Assert(itemIndex <= MaxIndexTuplesPerPage);
01112         so->currPos.firstItem = 0;
01113         so->currPos.lastItem = itemIndex - 1;
01114         so->currPos.itemIndex = 0;
01115     }
01116     else
01117     {
01118         /* load items[] in descending order */
01119         itemIndex = MaxIndexTuplesPerPage;
01120 
01121         offnum = Min(offnum, maxoff);
01122 
01123         while (offnum >= minoff)
01124         {
01125             itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
01126             if (itup != NULL)
01127             {
01128                 /* tuple passes all scan key conditions, so remember it */
01129                 itemIndex--;
01130                 _bt_saveitem(so, itemIndex, offnum, itup);
01131             }
01132             if (!continuescan)
01133             {
01134                 /* there can't be any more matches, so stop */
01135                 so->currPos.moreLeft = false;
01136                 break;
01137             }
01138 
01139             offnum = OffsetNumberPrev(offnum);
01140         }
01141 
01142         Assert(itemIndex >= 0);
01143         so->currPos.firstItem = itemIndex;
01144         so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
01145         so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
01146     }
01147 
01148     return (so->currPos.firstItem <= so->currPos.lastItem);
01149 }
01150 
01151 /* Save an index item into so->currPos.items[itemIndex] */
01152 static void
01153 _bt_saveitem(BTScanOpaque so, int itemIndex,
01154              OffsetNumber offnum, IndexTuple itup)
01155 {
01156     BTScanPosItem *currItem = &so->currPos.items[itemIndex];
01157 
01158     currItem->heapTid = itup->t_tid;
01159     currItem->indexOffset = offnum;
01160     if (so->currTuples)
01161     {
01162         Size        itupsz = IndexTupleSize(itup);
01163 
01164         currItem->tupleOffset = so->currPos.nextTupleOffset;
01165         memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
01166         so->currPos.nextTupleOffset += MAXALIGN(itupsz);
01167     }
01168 }
01169 
01170 /*
01171  *  _bt_steppage() -- Step to next page containing valid data for scan
01172  *
01173  * On entry, so->currPos.buf must be pinned and read-locked.  We'll drop
01174  * the lock and pin before moving to next page.
01175  *
01176  * On success exit, we hold pin and read-lock on the next interesting page,
01177  * and so->currPos is updated to contain data from that page.
01178  *
01179  * If there are no more matching records in the given direction, we drop all
01180  * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE.
01181  */
01182 static bool
01183 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
01184 {
01185     BTScanOpaque so = (BTScanOpaque) scan->opaque;
01186     Relation    rel;
01187     Page        page;
01188     BTPageOpaque opaque;
01189 
01190     /* we must have the buffer pinned and locked */
01191     Assert(BufferIsValid(so->currPos.buf));
01192 
01193     /* Before leaving current page, deal with any killed items */
01194     if (so->numKilled > 0)
01195         _bt_killitems(scan, true);
01196 
01197     /*
01198      * Before we modify currPos, make a copy of the page data if there was a
01199      * mark position that needs it.
01200      */
01201     if (so->markItemIndex >= 0)
01202     {
01203         /* bump pin on current buffer for assignment to mark buffer */
01204         IncrBufferRefCount(so->currPos.buf);
01205         memcpy(&so->markPos, &so->currPos,
01206                offsetof(BTScanPosData, items[1]) +
01207                so->currPos.lastItem * sizeof(BTScanPosItem));
01208         if (so->markTuples)
01209             memcpy(so->markTuples, so->currTuples,
01210                    so->currPos.nextTupleOffset);
01211         so->markPos.itemIndex = so->markItemIndex;
01212         so->markItemIndex = -1;
01213     }
01214 
01215     rel = scan->indexRelation;
01216 
01217     if (ScanDirectionIsForward(dir))
01218     {
01219         /* Walk right to the next page with data */
01220         /* We must rely on the previously saved nextPage link! */
01221         BlockNumber blkno = so->currPos.nextPage;
01222 
01223         /* Remember we left a page with data */
01224         so->currPos.moreLeft = true;
01225 
01226         for (;;)
01227         {
01228             /* release the previous buffer */
01229             _bt_relbuf(rel, so->currPos.buf);
01230             so->currPos.buf = InvalidBuffer;
01231             /* if we're at end of scan, give up */
01232             if (blkno == P_NONE || !so->currPos.moreRight)
01233                 return false;
01234             /* check for interrupts while we're not holding any buffer lock */
01235             CHECK_FOR_INTERRUPTS();
01236             /* step right one page */
01237             so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
01238             /* check for deleted page */
01239             page = BufferGetPage(so->currPos.buf);
01240             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01241             if (!P_IGNORE(opaque))
01242             {
01243                 PredicateLockPage(rel, blkno, scan->xs_snapshot);
01244                 /* see if there are any matches on this page */
01245                 /* note that this will clear moreRight if we can stop */
01246                 if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
01247                     break;
01248             }
01249             /* nope, keep going */
01250             blkno = opaque->btpo_next;
01251         }
01252     }
01253     else
01254     {
01255         /* Remember we left a page with data */
01256         so->currPos.moreRight = true;
01257 
01258         /*
01259          * Walk left to the next page with data.  This is much more complex
01260          * than the walk-right case because of the possibility that the page
01261          * to our left splits while we are in flight to it, plus the
01262          * possibility that the page we were on gets deleted after we leave
01263          * it.  See nbtree/README for details.
01264          */
01265         for (;;)
01266         {
01267             /* Done if we know there are no matching keys to the left */
01268             if (!so->currPos.moreLeft)
01269             {
01270                 _bt_relbuf(rel, so->currPos.buf);
01271                 so->currPos.buf = InvalidBuffer;
01272                 return false;
01273             }
01274 
01275             /* Step to next physical page */
01276             so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);
01277 
01278             /* if we're physically at end of index, return failure */
01279             if (so->currPos.buf == InvalidBuffer)
01280                 return false;
01281 
01282             /*
01283              * Okay, we managed to move left to a non-deleted page. Done if
01284              * it's not half-dead and contains matching tuples. Else loop back
01285              * and do it all again.
01286              */
01287             page = BufferGetPage(so->currPos.buf);
01288             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01289             if (!P_IGNORE(opaque))
01290             {
01291                 PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
01292                 /* see if there are any matches on this page */
01293                 /* note that this will clear moreLeft if we can stop */
01294                 if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
01295                     break;
01296             }
01297         }
01298     }
01299 
01300     return true;
01301 }
01302 
01303 /*
01304  * _bt_walk_left() -- step left one page, if possible
01305  *
01306  * The given buffer must be pinned and read-locked.  This will be dropped
01307  * before stepping left.  On return, we have pin and read lock on the
01308  * returned page, instead.
01309  *
01310  * Returns InvalidBuffer if there is no page to the left (no lock is held
01311  * in that case).
01312  *
01313  * When working on a non-leaf level, it is possible for the returned page
01314  * to be half-dead; the caller should check that condition and step left
01315  * again if it's important.
01316  */
01317 static Buffer
01318 _bt_walk_left(Relation rel, Buffer buf)
01319 {
01320     Page        page;
01321     BTPageOpaque opaque;
01322 
01323     page = BufferGetPage(buf);
01324     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01325 
01326     for (;;)
01327     {
01328         BlockNumber obknum;
01329         BlockNumber lblkno;
01330         BlockNumber blkno;
01331         int         tries;
01332 
01333         /* if we're at end of tree, release buf and return failure */
01334         if (P_LEFTMOST(opaque))
01335         {
01336             _bt_relbuf(rel, buf);
01337             break;
01338         }
01339         /* remember original page we are stepping left from */
01340         obknum = BufferGetBlockNumber(buf);
01341         /* step left */
01342         blkno = lblkno = opaque->btpo_prev;
01343         _bt_relbuf(rel, buf);
01344         /* check for interrupts while we're not holding any buffer lock */
01345         CHECK_FOR_INTERRUPTS();
01346         buf = _bt_getbuf(rel, blkno, BT_READ);
01347         page = BufferGetPage(buf);
01348         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01349 
01350         /*
01351          * If this isn't the page we want, walk right till we find what we
01352          * want --- but go no more than four hops (an arbitrary limit). If we
01353          * don't find the correct page by then, the most likely bet is that
01354          * the original page got deleted and isn't in the sibling chain at all
01355          * anymore, not that its left sibling got split more than four times.
01356          *
01357          * Note that it is correct to test P_ISDELETED not P_IGNORE here,
01358          * because half-dead pages are still in the sibling chain.  Caller
01359          * must reject half-dead pages if wanted.
01360          */
01361         tries = 0;
01362         for (;;)
01363         {
01364             if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
01365             {
01366                 /* Found desired page, return it */
01367                 return buf;
01368             }
01369             if (P_RIGHTMOST(opaque) || ++tries > 4)
01370                 break;
01371             blkno = opaque->btpo_next;
01372             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
01373             page = BufferGetPage(buf);
01374             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01375         }
01376 
01377         /* Return to the original page to see what's up */
01378         buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
01379         page = BufferGetPage(buf);
01380         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01381         if (P_ISDELETED(opaque))
01382         {
01383             /*
01384              * It was deleted.  Move right to first nondeleted page (there
01385              * must be one); that is the page that has acquired the deleted
01386              * one's keyspace, so stepping left from it will take us where we
01387              * want to be.
01388              */
01389             for (;;)
01390             {
01391                 if (P_RIGHTMOST(opaque))
01392                     elog(ERROR, "fell off the end of index \"%s\"",
01393                          RelationGetRelationName(rel));
01394                 blkno = opaque->btpo_next;
01395                 buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
01396                 page = BufferGetPage(buf);
01397                 opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01398                 if (!P_ISDELETED(opaque))
01399                     break;
01400             }
01401 
01402             /*
01403              * Now return to top of loop, resetting obknum to point to this
01404              * nondeleted page, and try again.
01405              */
01406         }
01407         else
01408         {
01409             /*
01410              * It wasn't deleted; the explanation had better be that the page
01411              * to the left got split or deleted. Without this check, we'd go
01412              * into an infinite loop if there's anything wrong.
01413              */
01414             if (opaque->btpo_prev == lblkno)
01415                 elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
01416                      obknum, RelationGetRelationName(rel));
01417             /* Okay to try again with new lblkno value */
01418         }
01419     }
01420 
01421     return InvalidBuffer;
01422 }
01423 
01424 /*
01425  * _bt_get_endpoint() -- Find the first or last page on a given tree level
01426  *
01427  * If the index is empty, we will return InvalidBuffer; any other failure
01428  * condition causes ereport().  We will not return a dead page.
01429  *
01430  * The returned buffer is pinned and read-locked.
01431  */
01432 Buffer
01433 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
01434 {
01435     Buffer      buf;
01436     Page        page;
01437     BTPageOpaque opaque;
01438     OffsetNumber offnum;
01439     BlockNumber blkno;
01440     IndexTuple  itup;
01441 
01442     /*
01443      * If we are looking for a leaf page, okay to descend from fast root;
01444      * otherwise better descend from true root.  (There is no point in being
01445      * smarter about intermediate levels.)
01446      */
01447     if (level == 0)
01448         buf = _bt_getroot(rel, BT_READ);
01449     else
01450         buf = _bt_gettrueroot(rel);
01451 
01452     if (!BufferIsValid(buf))
01453         return InvalidBuffer;
01454 
01455     page = BufferGetPage(buf);
01456     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01457 
01458     for (;;)
01459     {
01460         /*
01461          * If we landed on a deleted page, step right to find a live page
01462          * (there must be one).  Also, if we want the rightmost page, step
01463          * right if needed to get to it (this could happen if the page split
01464          * since we obtained a pointer to it).
01465          */
01466         while (P_IGNORE(opaque) ||
01467                (rightmost && !P_RIGHTMOST(opaque)))
01468         {
01469             blkno = opaque->btpo_next;
01470             if (blkno == P_NONE)
01471                 elog(ERROR, "fell off the end of index \"%s\"",
01472                      RelationGetRelationName(rel));
01473             buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
01474             page = BufferGetPage(buf);
01475             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01476         }
01477 
01478         /* Done? */
01479         if (opaque->btpo.level == level)
01480             break;
01481         if (opaque->btpo.level < level)
01482             elog(ERROR, "btree level %u not found in index \"%s\"",
01483                  level, RelationGetRelationName(rel));
01484 
01485         /* Descend to leftmost or rightmost child page */
01486         if (rightmost)
01487             offnum = PageGetMaxOffsetNumber(page);
01488         else
01489             offnum = P_FIRSTDATAKEY(opaque);
01490 
01491         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
01492         blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
01493 
01494         buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
01495         page = BufferGetPage(buf);
01496         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01497     }
01498 
01499     return buf;
01500 }
01501 
01502 /*
01503  *  _bt_endpoint() -- Find the first or last page in the index, and scan
01504  * from there to the first key satisfying all the quals.
01505  *
01506  * This is used by _bt_first() to set up a scan when we've determined
01507  * that the scan must start at the beginning or end of the index (for
01508  * a forward or backward scan respectively).  Exit conditions are the
01509  * same as for _bt_first().
01510  */
01511 static bool
01512 _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
01513 {
01514     Relation    rel = scan->indexRelation;
01515     BTScanOpaque so = (BTScanOpaque) scan->opaque;
01516     Buffer      buf;
01517     Page        page;
01518     BTPageOpaque opaque;
01519     OffsetNumber start;
01520     BTScanPosItem *currItem;
01521 
01522     /*
01523      * Scan down to the leftmost or rightmost leaf page.  This is a simplified
01524      * version of _bt_search().  We don't maintain a stack since we know we
01525      * won't need it.
01526      */
01527     buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));
01528 
01529     if (!BufferIsValid(buf))
01530     {
01531         /*
01532          * Empty index. Lock the whole relation, as nothing finer to lock
01533          * exists.
01534          */
01535         PredicateLockRelation(rel, scan->xs_snapshot);
01536         so->currPos.buf = InvalidBuffer;
01537         return false;
01538     }
01539 
01540     PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
01541     page = BufferGetPage(buf);
01542     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01543     Assert(P_ISLEAF(opaque));
01544 
01545     if (ScanDirectionIsForward(dir))
01546     {
01547         /* There could be dead pages to the left, so not this: */
01548         /* Assert(P_LEFTMOST(opaque)); */
01549 
01550         start = P_FIRSTDATAKEY(opaque);
01551     }
01552     else if (ScanDirectionIsBackward(dir))
01553     {
01554         Assert(P_RIGHTMOST(opaque));
01555 
01556         start = PageGetMaxOffsetNumber(page);
01557     }
01558     else
01559     {
01560         elog(ERROR, "invalid scan direction: %d", (int) dir);
01561         start = 0;              /* keep compiler quiet */
01562     }
01563 
01564     /* remember which buffer we have pinned */
01565     so->currPos.buf = buf;
01566 
01567     /* initialize moreLeft/moreRight appropriately for scan direction */
01568     if (ScanDirectionIsForward(dir))
01569     {
01570         so->currPos.moreLeft = false;
01571         so->currPos.moreRight = true;
01572     }
01573     else
01574     {
01575         so->currPos.moreLeft = true;
01576         so->currPos.moreRight = false;
01577     }
01578     so->numKilled = 0;          /* just paranoia */
01579     so->markItemIndex = -1;     /* ditto */
01580 
01581     /*
01582      * Now load data from the first page of the scan.
01583      */
01584     if (!_bt_readpage(scan, dir, start))
01585     {
01586         /*
01587          * There's no actually-matching data on this page.  Try to advance to
01588          * the next page.  Return false if there's no matching data at all.
01589          */
01590         if (!_bt_steppage(scan, dir))
01591             return false;
01592     }
01593 
01594     /* Drop the lock, but not pin, on the current page */
01595     LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
01596 
01597     /* OK, itemIndex says what to return */
01598     currItem = &so->currPos.items[so->currPos.itemIndex];
01599     scan->xs_ctup.t_self = currItem->heapTid;
01600     if (scan->xs_want_itup)
01601         scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
01602 
01603     return true;
01604 }