#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/predicate.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"

Include dependency graph for nbtsearch.c:

Functions
static bool	_bt_readpage (IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
static void	_bt_saveitem (BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup)
static bool	_bt_steppage (IndexScanDesc scan, ScanDirection dir)
static Buffer	_bt_walk_left (Relation rel, Buffer buf)
static bool	_bt_endpoint (IndexScanDesc scan, ScanDirection dir)
BTStack	_bt_search (Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access)
Buffer	_bt_moveright (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, int access)
OffsetNumber	_bt_binsrch (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
int32	_bt_compare (Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
bool	_bt_first (IndexScanDesc scan, ScanDirection dir)
bool	_bt_next (IndexScanDesc scan, ScanDirection dir)
Buffer	_bt_get_endpoint (Relation rel, uint32 level, bool rightmost)

Function Documentation

OffsetNumber _bt_binsrch	(	Relation	rel,
		Buffer	buf,
		int	keysz,
		ScanKey	scankey,
		bool	nextkey
	)

Definition at line 234 of file nbtsearch.c.

References _bt_compare(), Assert, BufferGetPage, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_doinsert(), _bt_findinsertloc(), _bt_first(), and _bt_search().

{
    Page        pg = BufferGetPage(buf);
    BTPageOpaque special = (BTPageOpaque) PageGetSpecialPointer(pg);
    OffsetNumber lo = P_FIRSTDATAKEY(special);
    OffsetNumber hi = PageGetMaxOffsetNumber(pg);
    int32       threshold;

    /*
     * Binary-search the page in 'buf' for the position of the scan key.
     *
     * On a leaf page the result is the offset of the first item >= scan key
     * (nextkey = false) or > scan key (nextkey = true); that offset can be
     * one past the last item on the page.  On a non-leaf page the result is
     * the offset of the last item < scan key (resp. <= scan key).
     */

    /*
     * A page with no data items (truly empty, or holding nothing but a high
     * key) has nothing to search: hand back the first usable slot.  Internal
     * pages are never in this state, since they always have children.
     */
    if (hi < lo)
        return lo;

    /*
     * _bt_compare() results at or above 'threshold' mean "keep looking to
     * the right".  With nextkey = false the threshold is 1, so the search
     * settles on the first item >= scan key; with nextkey = true it is 0, so
     * the search settles on the first item > scan key.
     */
    threshold = nextkey ? 0 : 1;

    /*
     * Make 'hi' an exclusive upper bound.  From here on, every slot before
     * 'lo' is known to sort before the target position and every slot at or
     * after 'hi' is known to sort at or after it.
     */
    hi++;

    while (lo < hi)
    {
        /* lo <= probe < hi, hence probe always names an existing slot */
        OffsetNumber probe = lo + ((hi - lo) / 2);

        if (_bt_compare(rel, keysz, scankey, pg, probe) >= threshold)
            lo = probe + 1;
        else
            hi = probe;
    }

    /*
     * The bounds have met; note that they may both point just past the last
     * slot on the page.
     *
     * For a non-leaf page the caller wants the last item that precedes the
     * target position.  One must exist provided _bt_compare() follows the
     * rules for the first data item of an internal page.
     */
    if (!P_ISLEAF(special))
    {
        Assert(lo > P_FIRSTDATAKEY(special));

        return OffsetNumberPrev(lo);
    }

    /* Leaf page: the target position itself is the answer. */
    return lo;
}

int32 _bt_compare	(	Relation	rel,
		int	keysz,
		ScanKey	scankey,
		Page	page,
		OffsetNumber	offnum
	)

Definition at line 339 of file nbtsearch.c.

References DatumGetInt32, FunctionCall2Coll(), i, index_getattr, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_findinsertloc(), and _bt_moveright().

{
    TupleDesc   itupdesc = RelationGetDescr(rel);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    IndexTuple  itup;
    int         i;

    /*
     * Compare the first keysz entries of scankey with the index tuple stored
     * at offnum on page.
     *
     * The result is <0, 0, or >0 according to whether the scan key is less
     * than, equal to, or greater than the tuple.  Callers must rely only on
     * the sign of the result, not on its magnitude.
     */

    /*
     * Force result ">" if target item is first data item on an internal page
     * --- see NOTE above.  The tuple at that position is not examined at
     * all; the scan key always compares greater than it.
     */
    if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
        return 1;

    itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

    /*
     * The scan key is set up with the attribute number associated with each
     * term in the key.  It is important that, if the index is multi-key, the
     * scan contain the first k key attributes, and that they be in order.  If
     * you think about how multi-key ordering works, you'll understand why
     * this is.
     *
     * We don't test for violation of this condition here, however.  The
     * initial setup for the index scan had better have gotten it right (see
     * _bt_first).
     */

    for (i = 1; i <= keysz; i++)
    {
        Datum       datum;
        bool        isNull;
        int32       result;

        datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

        /* see comments about NULLs handling in btbuild */
        if (scankey->sk_flags & SK_ISNULL)      /* key is NULL */
        {
            if (isNull)
                result = 0;     /* NULL "=" NULL */
            else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = -1;    /* NULL "<" NOT_NULL */
            else
                result = 1;     /* NULL ">" NOT_NULL */
        }
        else if (isNull)        /* key is NOT_NULL and item is NULL */
        {
            if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = 1;     /* NOT_NULL ">" NULL */
            else
                result = -1;    /* NOT_NULL "<" NULL */
        }
        else
        {
            /*
             * The sk_func needs to be passed the index value as left arg and
             * the sk_argument as right arg (they might be of different
             * types).  Since it is convenient for callers to think of
             * _bt_compare as comparing the scankey to the index item, we have
             * to flip the sign of the comparison result.  (Unless it's a DESC
             * column, in which case we *don't* flip the sign.)
             */
            result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
                                                     scankey->sk_collation,
                                                     datum,
                                                     scankey->sk_argument));

            if (!(scankey->sk_flags & SK_BT_DESC))
            {
                /*
                 * Don't just negate the value: a comparison function that
                 * passes through the result of memcmp()/strcmp() or similar
                 * may return INT_MIN, and negating that overflows, leaving
                 * the sign unchanged in practice.  Map every negative result
                 * to 1 instead; only the sign matters to our callers.
                 */
                result = (result < 0) ? 1 : -result;
            }
        }

        /* if the keys are unequal, report which way they differ */
        if (result != 0)
            return result;

        scankey++;
    }

    /* if we get here, the keys are equal */
    return 0;
}

static bool _bt_endpoint	(	IndexScanDesc	scan,
		ScanDirection	dir
	)			`[static]`

Definition at line 1512 of file nbtsearch.c.

References _bt_get_endpoint(), _bt_readpage(), _bt_steppage(), Assert, BTScanPosData::buf, buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, elog, ERROR, IndexScanDescData::indexRelation, BTScanPosData::itemIndex, BTScanPosData::items, LockBuffer(), BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_ISLEAF, P_RIGHTMOST, PageGetMaxOffsetNumber, PageGetSpecialPointer, PredicateLockPage(), PredicateLockRelation(), ScanDirectionIsBackward, ScanDirectionIsForward, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by _bt_first().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Relation    irel = scan->indexRelation;
    bool        forward = ScanDirectionIsForward(dir);
    Buffer      leafbuf;
    Page        leafpage;
    BTPageOpaque leafopaque;
    OffsetNumber firstoff;
    BTScanPosItem *item;

    /*
     * Start a scan at one end of the index: the first item of the leftmost
     * leaf page for a forward scan, or the last item of the rightmost leaf
     * page for a backward scan.  Returns true, with the scan descriptor set
     * up to return the first item, or false if there is nothing to return.
     *
     * Descend to the appropriate edge leaf page.  This is a cut-down
     * _bt_search(); no stack is kept because none will be needed.
     */
    leafbuf = _bt_get_endpoint(irel, 0, ScanDirectionIsBackward(dir));

    if (!BufferIsValid(leafbuf))
    {
        /*
         * The index is empty.  With no page to lock, take the predicate lock
         * on the relation as a whole.
         */
        PredicateLockRelation(irel, scan->xs_snapshot);
        so->currPos.buf = InvalidBuffer;
        return false;
    }

    PredicateLockPage(irel, BufferGetBlockNumber(leafbuf), scan->xs_snapshot);
    leafpage = BufferGetPage(leafbuf);
    leafopaque = (BTPageOpaque) PageGetSpecialPointer(leafpage);
    Assert(P_ISLEAF(leafopaque));

    /* Pick the offset at which to begin reading the page. */
    if (forward)
    {
        /*
         * We deliberately don't Assert(P_LEFTMOST(...)) here, because there
         * could be dead pages to the left of this one.
         */
        firstoff = P_FIRSTDATAKEY(leafopaque);
    }
    else if (ScanDirectionIsBackward(dir))
    {
        Assert(P_RIGHTMOST(leafopaque));

        firstoff = PageGetMaxOffsetNumber(leafpage);
    }
    else
    {
        elog(ERROR, "invalid scan direction: %d", (int) dir);
        firstoff = 0;           /* keep compiler quiet */
    }

    /* record the pinned buffer in the scan position */
    so->currPos.buf = leafbuf;

    /* more data can only lie in the direction we are scanning */
    so->currPos.moreLeft = !forward;
    so->currPos.moreRight = forward;

    so->numKilled = 0;          /* just paranoia */
    so->markItemIndex = -1;     /* ditto */

    /*
     * Load items from this first page.  If nothing on it actually matches,
     * try stepping to the next page; give up if that finds nothing either.
     */
    if (!_bt_readpage(scan, dir, firstoff) && !_bt_steppage(scan, dir))
        return false;

    /* Release the lock on the current page, but keep the pin */
    LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /* itemIndex identifies the item to hand back */
    item = so->currPos.items + so->currPos.itemIndex;
    scan->xs_ctup.t_self = item->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + item->tupleOffset);

    return true;
}

bool _bt_first	(	IndexScanDesc	scan,
		ScanDirection	dir
	)

Definition at line 447 of file nbtsearch.c.

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosData::buf, buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, NULL, BTScanOpaqueData::numberOfKeys, BTScanOpaqueData::numKilled, OffsetNumberPrev, IndexScanDescData::opaque, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

{
    /*
     * Find the first item of an index scan in direction 'dir'.
     *
     * Outline: preprocess the scan keys; choose at most one boundary key per
     * leading index column (startKeys[]); convert those into an insertion
     * scankey (scankeys[]); descend the tree with _bt_search(); locate the
     * exact offset with _bt_binsrch(); then load the page with
     * _bt_readpage(), stepping to a neighboring page via _bt_steppage() if
     * this page yields nothing.  When no boundary key is usable the work is
     * delegated to _bt_endpoint().
     *
     * Returns true after storing the first item's heap TID in
     * scan->xs_ctup.t_self (and the index tuple in scan->xs_itup when
     * scan->xs_want_itup is set); on that path the lock on so->currPos.buf
     * has been dropped with LockBuffer(..., BUFFER_LOCK_UNLOCK).  Returns
     * false when no item can be returned.
     */
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    BTStack     stack;
    OffsetNumber offnum;
    StrategyNumber strat;
    bool        nextkey;
    bool        goback;
    ScanKey     startKeys[INDEX_MAX_KEYS];
    ScanKeyData scankeys[INDEX_MAX_KEYS];
    ScanKeyData notnullkeys[INDEX_MAX_KEYS];
    int         keysCount = 0;
    int         i;
    StrategyNumber strat_total;
    BTScanPosItem *currItem;

    pgstat_count_index_scan(rel);

    /*
     * Examine the scan keys and eliminate any redundant keys; also mark the
     * keys that must be matched to continue the scan.
     */
    _bt_preprocess_keys(scan);

    /*
     * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     * never be satisfied (eg, x == 1 AND x > 2).
     */
    if (!so->qual_ok)
        return false;

    /*----------
     * Examine the scan keys to discover where we need to start the scan.
     *
     * We want to identify the keys that can be used as starting boundaries;
     * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     * a backwards scan.  We can use keys for multiple attributes so long as
     * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     * a > or < boundary or find an attribute with no boundary (which can be
     * thought of as the same as "> -infinity"), we can't use keys for any
     * attributes to its right, because it would break our simplistic notion
     * of what initial positioning strategy to use.
     *
     * When the scan keys include cross-type operators, _bt_preprocess_keys
     * may not be able to eliminate redundant keys; in such cases we will
     * arbitrarily pick a usable one for each attribute.  This is correct
     * but possibly not optimal behavior.  (For example, with keys like
     * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     * x=5 would be more efficient.)  Since the situation only arises given
     * a poorly-worded query plus an incomplete opfamily, live with it.
     *
     * When both equality and inequality keys appear for a single attribute
     * (again, only possible when cross-type operators appear), we *must*
     * select one of the equality keys for the starting point, because
     * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     * start at x=4, we will fail and stop before reaching x=10.  If multiple
     * equality quals survive preprocessing, however, it doesn't matter which
     * one we use --- by definition, they are either redundant or
     * contradictory.
     *
     * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     * If the index stores nulls at the end of the index we'll be starting
     * from, and we have no boundary key for the column (which means the key
     * we deduced NOT NULL from is an inequality key that constrains the other
     * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     * use as a boundary key.  If we didn't do this, we might find ourselves
     * traversing a lot of null entries at the start of the scan.
     *
     * In this loop, row-comparison keys are treated the same as keys on their
     * first (leftmost) columns.  We'll add on lower-order columns of the row
     * comparison below, if possible.
     *
     * The selected scan keys (at most one per index column) are remembered by
     * storing their addresses into the local startKeys[] array.
     *----------
     */

    /*
     * strat_total starts out as "=" and is overwritten with the strategy of
     * any non-equality boundary key accepted below; it selects the initial
     * positioning rule further down.
     */
    strat_total = BTEqualStrategyNumber;
    if (so->numberOfKeys > 0)
    {
        AttrNumber  curattr;
        ScanKey     chosen;
        ScanKey     impliesNN;
        ScanKey     cur;

        /*
         * chosen is the so-far-chosen key for the current attribute, if any.
         * We don't cast the decision in stone until we reach keys for the
         * next attribute.
         */
        curattr = 1;
        chosen = NULL;
        /* Also remember any scankey that implies a NOT NULL constraint */
        impliesNN = NULL;

        /*
         * Loop iterates from 0 to numberOfKeys inclusive; we use the last
         * pass to handle after-last-key processing.  Actual exit from the
         * loop is at one of the "break" statements below.
         */
        for (cur = so->keyData, i = 0;; cur++, i++)
        {
            /*
             * The bounds test on i comes first, so on the extra final pass
             * cur is never dereferenced here.
             */
            if (i >= so->numberOfKeys || cur->sk_attno != curattr)
            {
                /*
                 * Done looking at keys for curattr.  If we didn't find a
                 * usable boundary key, see if we can deduce a NOT NULL key.
                 */
                if (chosen == NULL && impliesNN != NULL &&
                    ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                     ScanDirectionIsForward(dir) :
                     ScanDirectionIsBackward(dir)))
                {
                    /* Yes, so build the key in notnullkeys[keysCount] */
                    chosen = &notnullkeys[keysCount];
                    ScanKeyEntryInitialize(chosen,
                                           (SK_SEARCHNOTNULL | SK_ISNULL |
                                            (impliesNN->sk_flags &
                                          (SK_BT_DESC | SK_BT_NULLS_FIRST))),
                                           curattr,
                                 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                                  BTGreaterStrategyNumber :
                                  BTLessStrategyNumber),
                                           InvalidOid,
                                           InvalidOid,
                                           InvalidOid,
                                           (Datum) 0);
                }

                /*
                 * If we still didn't find a usable boundary key, quit; else
                 * save the boundary key pointer in startKeys.
                 */
                if (chosen == NULL)
                    break;
                startKeys[keysCount++] = chosen;

                /*
                 * Adjust strat_total, and quit if we have stored a > or <
                 * key.
                 */
                strat = chosen->sk_strategy;
                if (strat != BTEqualStrategyNumber)
                {
                    strat_total = strat;
                    if (strat == BTGreaterStrategyNumber ||
                        strat == BTLessStrategyNumber)
                        break;
                }

                /*
                 * Done if that was the last attribute, or if next key is not
                 * in sequence (implying no boundary key is available for the
                 * next attribute).
                 */
                if (i >= so->numberOfKeys ||
                    cur->sk_attno != curattr + 1)
                    break;

                /*
                 * Reset for next attr.
                 */
                curattr = cur->sk_attno;
                chosen = NULL;
                impliesNN = NULL;
            }

            /*
             * Can we use this key as a starting boundary for this attr?
             *
             * If not, does it imply a NOT NULL constraint?  (Because
             * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
             * *any* inequality key works for that; we need not test.)
             */
            switch (cur->sk_strategy)
            {
                case BTLessStrategyNumber:
                case BTLessEqualStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsBackward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
                case BTEqualStrategyNumber:
                    /* override any non-equality choice */
                    chosen = cur;
                    break;
                case BTGreaterEqualStrategyNumber:
                case BTGreaterStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsForward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
            }
        }
    }

    /*
     * If we found no usable boundary keys, we have to start from one end of
     * the tree.  Walk down that edge to the first or last key, and scan from
     * there.
     */
    if (keysCount == 0)
        return _bt_endpoint(scan, dir);

    /*
     * We want to start the scan somewhere within the index.  Set up an
     * insertion scankey we can use to search for the boundary point we
     * identified above.  The insertion scankey is built in the local
     * scankeys[] array, using the keys identified by startKeys[].
     */
    Assert(keysCount <= INDEX_MAX_KEYS);
    for (i = 0; i < keysCount; i++)
    {
        ScanKey     cur = startKeys[i];

        Assert(cur->sk_attno == i + 1);

        if (cur->sk_flags & SK_ROW_HEADER)
        {
            /*
             * Row comparison header: look to the first row member instead.
             *
             * The member scankeys are already in insertion format (ie, they
             * have sk_func = 3-way-comparison function), but we have to watch
             * out for nulls, which _bt_preprocess_keys didn't check. A null
             * in the first row member makes the condition unmatchable, just
             * like qual_ok = false.
             */
            ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_flags & SK_ISNULL)
                return false;
            memcpy(scankeys + i, subkey, sizeof(ScanKeyData));

            /*
             * If the row comparison is the last positioning key we accepted,
             * try to add additional keys from the lower-order row members.
             * (If we accepted independent conditions on additional index
             * columns, we use those instead --- doesn't seem worth trying to
             * determine which is more restrictive.)  Note that this is OK
             * even if the row comparison is of ">" or "<" type, because the
             * condition applied to all but the last row member is effectively
             * ">=" or "<=", and so the extra keys don't break the positioning
             * scheme.  But, by the same token, if we aren't able to use all
             * the row members, then the part of the row comparison that we
             * did use has to be treated as just a ">=" or "<=" condition, and
             * so we'd better adjust strat_total accordingly.
             */
            if (i == keysCount - 1)
            {
                bool        used_all_subkeys = false;

                Assert(!(subkey->sk_flags & SK_ROW_END));

                /*
                 * Each accepted member is appended at scankeys[keysCount]
                 * and keysCount is advanced, so keysCount + 1 is always the
                 * attribute number the next member must carry.
                 */
                for (;;)
                {
                    subkey++;
                    Assert(subkey->sk_flags & SK_ROW_MEMBER);
                    if (subkey->sk_attno != keysCount + 1)
                        break;  /* out-of-sequence, can't use it */
                    if (subkey->sk_strategy != cur->sk_strategy)
                        break;  /* wrong direction, can't use it */
                    if (subkey->sk_flags & SK_ISNULL)
                        break;  /* can't use null keys */
                    Assert(keysCount < INDEX_MAX_KEYS);
                    memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
                    keysCount++;
                    if (subkey->sk_flags & SK_ROW_END)
                    {
                        used_all_subkeys = true;
                        break;
                    }
                }
                if (!used_all_subkeys)
                {
                    switch (strat_total)
                    {
                        case BTLessStrategyNumber:
                            strat_total = BTLessEqualStrategyNumber;
                            break;
                        case BTGreaterStrategyNumber:
                            strat_total = BTGreaterEqualStrategyNumber;
                            break;
                    }
                }
                break;          /* done with outer loop */
            }
        }
        else
        {
            /*
             * Ordinary comparison key.  Transform the search-style scan key
             * to an insertion scan key by replacing the sk_func with the
             * appropriate btree comparison function.
             *
             * If scankey operator is not a cross-type comparison, we can use
             * the cached comparison function; otherwise gotta look it up in
             * the catalogs.  (That can't lead to infinite recursion, since no
             * indexscan initiated by syscache lookup will use cross-data-type
             * operators.)
             *
             * We support the convention that sk_subtype == InvalidOid means
             * the opclass input type; this is a hack to simplify life for
             * ScanKeyInit().
             */
            if (cur->sk_subtype == rel->rd_opcintype[i] ||
                cur->sk_subtype == InvalidOid)
            {
                FmgrInfo   *procinfo;

                procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
                ScanKeyEntryInitializeWithInfo(scankeys + i,
                                               cur->sk_flags,
                                               cur->sk_attno,
                                               InvalidStrategy,
                                               cur->sk_subtype,
                                               cur->sk_collation,
                                               procinfo,
                                               cur->sk_argument);
            }
            else
            {
                RegProcedure cmp_proc;

                cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
                                             rel->rd_opcintype[i],
                                             cur->sk_subtype,
                                             BTORDER_PROC);
                if (!RegProcedureIsValid(cmp_proc))
                    elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
                         BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
                         cur->sk_attno, RelationGetRelationName(rel));
                ScanKeyEntryInitialize(scankeys + i,
                                       cur->sk_flags,
                                       cur->sk_attno,
                                       InvalidStrategy,
                                       cur->sk_subtype,
                                       cur->sk_collation,
                                       cmp_proc,
                                       cur->sk_argument);
            }
        }
    }

    /*----------
     * Examine the selected initial-positioning strategy to determine exactly
     * where we need to start the scan, and set flag variables to control the
     * code below.
     *
     * If nextkey = false, _bt_search and _bt_binsrch will locate the first
     * item >= scan key.  If nextkey = true, they will locate the first
     * item > scan key.
     *
     * If goback = true, we will then step back one item, while if
     * goback = false, we will start the scan on the located item.
     *----------
     */
    switch (strat_total)
    {
        case BTLessStrategyNumber:

            /*
             * Find first item >= scankey, then back up one to arrive at last
             * item < scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = false;
            goback = true;
            break;

        case BTLessEqualStrategyNumber:

            /*
             * Find first item > scankey, then back up one to arrive at last
             * item <= scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = true;
            goback = true;
            break;

        case BTEqualStrategyNumber:

            /*
             * If a backward scan was specified, need to start with last equal
             * item not first one.
             */
            if (ScanDirectionIsBackward(dir))
            {
                /*
                 * This is the same as the <= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = true;
                goback = true;
            }
            else
            {
                /*
                 * This is the same as the >= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = false;
                goback = false;
            }
            break;

        case BTGreaterEqualStrategyNumber:

            /*
             * Find first item >= scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = false;
            goback = false;
            break;

        case BTGreaterStrategyNumber:

            /*
             * Find first item > scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = true;
            goback = false;
            break;

        default:
            /* can't get here, but keep compiler quiet */
            elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
            return false;
    }

    /*
     * Use the manufactured insertion scan key to descend the tree and
     * position ourselves on the target leaf page.
     */
    stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);

    /* don't need to keep the stack around... */
    _bt_freestack(stack);

    /* remember which buffer we have pinned, if any */
    so->currPos.buf = buf;

    if (!BufferIsValid(buf))
    {
        /*
         * We only get here if the index is completely empty. Lock relation
         * because nothing finer to lock exists.
         */
        PredicateLockRelation(rel, scan->xs_snapshot);
        return false;
    }
    else
        PredicateLockPage(rel, BufferGetBlockNumber(buf),
                          scan->xs_snapshot);

    /* initialize moreLeft/moreRight appropriately for scan direction */
    if (ScanDirectionIsForward(dir))
    {
        so->currPos.moreLeft = false;
        so->currPos.moreRight = true;
    }
    else
    {
        so->currPos.moreLeft = true;
        so->currPos.moreRight = false;
    }
    so->numKilled = 0;          /* just paranoia */
    so->markItemIndex = -1;     /* ditto */

    /* position to the precise item on the page */
    offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);

    /*
     * If nextkey = false, we are positioned at the first item >= scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than the scan key and we know that everything on later pages is greater
     * than or equal to scan key.
     *
     * If nextkey = true, we are positioned at the first item > scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than or equal to the scan key and we know that everything on later
     * pages is greater than scan key.
     *
     * The actually desired starting point is either this item or the prior
     * one, or in the end-of-page case it's the first item on the next page or
     * the last item on this page.  Adjust the starting offset if needed. (If
     * this results in an offset before the first item or after the last one,
     * _bt_readpage will report no items found, and then we'll step to the
     * next page as needed.)
     */
    if (goback)
        offnum = OffsetNumberPrev(offnum);

    /*
     * Now load data from the first page of the scan.
     */
    if (!_bt_readpage(scan, dir, offnum))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        if (!_bt_steppage(scan, dir))
            return false;
    }

    /*
     * Drop the lock, but not pin, on the current page.  We go through
     * so->currPos.buf rather than the local 'buf' here; NOTE(review):
     * presumably because _bt_steppage() can move the scan to a different
     * buffer --- confirm against _bt_steppage().
     */
    LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_ctup.t_self = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}

Buffer _bt_get_endpoint	(	Relation	rel,
		uint32	level,
		bool	rightmost
	)

Definition at line 1433 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, buf, BufferGetPage, BufferIsValid, elog, ERROR, ItemPointerGetBlockNumber, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and IndexTupleData::t_tid.

Referenced by _bt_endpoint(), _bt_insert_parent(), and _bt_pagedel().

{
    /*
     * Locate the leftmost page (or, when "rightmost" is true, the rightmost
     * page) on the given tree level and return its buffer.  InvalidBuffer is
     * returned when no root buffer could be obtained.  Pages reached while
     * walking are fetched with BT_READ access.
     */
    Buffer      curbuf;

    /*
     * A leaf-level request (level 0) may start from the fast root; any
     * higher level starts from the true root.
     */
    curbuf = (level == 0) ? _bt_getroot(rel, BT_READ) : _bt_gettrueroot(rel);
    if (!BufferIsValid(curbuf))
        return InvalidBuffer;

    /*
     * Every pass re-derives the page pointers for whatever buffer we hold
     * now, then either steps right, stops, or descends one level.
     */
    for (;;)
    {
        Page        curpage = BufferGetPage(curbuf);
        BTPageOpaque special = (BTPageOpaque) PageGetSpecialPointer(curpage);
        OffsetNumber downoff;
        IndexTuple  downlink;

        /*
         * A page flagged P_IGNORE must be skipped by moving right.  When
         * hunting for the rightmost page we also move right whenever the
         * current page is not the last one on its level (it may have split
         * after we picked up the pointer to it).
         */
        if (P_IGNORE(special) || (rightmost && !P_RIGHTMOST(special)))
        {
            BlockNumber rightblk = special->btpo_next;

            if (rightblk == P_NONE)
                elog(ERROR, "fell off the end of index \"%s\"",
                     RelationGetRelationName(rel));
            curbuf = _bt_relandgetbuf(rel, curbuf, rightblk, BT_READ);
            continue;
        }

        /* Reached the requested level?  Then this is the answer. */
        if (special->btpo.level == level)
            break;

        /* Already below the requested level: it cannot be found. */
        if (special->btpo.level < level)
            elog(ERROR, "btree level %u not found in index \"%s\"",
                 level, RelationGetRelationName(rel));

        /* Follow the last or first downlink, depending on direction. */
        downoff = rightmost ? PageGetMaxOffsetNumber(curpage)
            : P_FIRSTDATAKEY(special);
        downlink = (IndexTuple) PageGetItem(curpage,
                                            PageGetItemId(curpage, downoff));
        curbuf = _bt_relandgetbuf(rel, curbuf,
                                  ItemPointerGetBlockNumber(&(downlink->t_tid)),
                                  BT_READ);
    }

    return curbuf;
}

Buffer _bt_moveright	(	Relation	rel,
		Buffer	buf,
		int	keysz,
		ScanKey	scankey,
		bool	nextkey,
		int	access
	)

Definition at line 156 of file nbtsearch.c.

References _bt_compare(), _bt_relandgetbuf(), BTPageOpaqueData::btpo_next, BufferGetPage, elog, ERROR, P_HIKEY, P_IGNORE, P_RIGHTMOST, PageGetSpecialPointer, and RelationGetRelationName.

Referenced by _bt_doinsert(), and _bt_search().

{
    /*
     * Follow right-links from "buf" until we stand on the page that should
     * hold the scan key, and return that page's buffer.  Each page stepped
     * onto is obtained with the caller-supplied "access" mode.
     */
    Page        curpage = BufferGetPage(buf);
    BTPageOpaque special = (BTPageOpaque) PageGetSpecialPointer(curpage);
    int32       threshold;

    /*
     * The comparison against the page's high key decides whether we must
     * move right:
     *
     * nextkey = false: move right only if scan key > high key (an equal
     * high key means the page still has to be examined).
     *
     * nextkey = true: move right if scan key >= high key.
     *
     * A page may have split several times, so keep going as long as the
     * test holds.  Pages flagged P_IGNORE are always stepped over.
     */
    if (nextkey)
        threshold = 0;
    else
        threshold = 1;

    for (;;)
    {
        /* Nothing lies to the right of the rightmost page. */
        if (P_RIGHTMOST(special))
            break;

        /* A usable page whose high key bounds the scan key: stop here. */
        if (!P_IGNORE(special) &&
            _bt_compare(rel, keysz, scankey, curpage, P_HIKEY) < threshold)
            break;

        /* Otherwise advance to the right sibling. */
        buf = _bt_relandgetbuf(rel, buf, special->btpo_next, access);
        curpage = BufferGetPage(buf);
        special = (BTPageOpaque) PageGetSpecialPointer(curpage);
    }

    /* Ending on an ignorable page means we ran out of siblings. */
    if (P_IGNORE(special))
        elog(ERROR, "fell off the end of index \"%s\"",
             RelationGetRelationName(rel));

    return buf;
}

bool _bt_next	(	IndexScanDesc	scan,
		ScanDirection	dir
	)

Definition at line 995 of file nbtsearch.c.

References _bt_steppage(), Assert, BT_READ, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, LockBuffer(), IndexScanDescData::opaque, ScanDirectionIsForward, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

{
    /*
     * Move the scan to the next matching item in direction "dir".  Returns
     * true with scan->xs_ctup.t_self (and scan->xs_itup, if requested) set
     * from the new current item, or false when _bt_steppage finds no further
     * page with data.
     */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    BTScanPosItem *item;
    bool        pageExhausted;

    /* Bump the per-page cursor and see whether it ran off the saved items. */
    if (ScanDirectionIsForward(dir))
        pageExhausted = (++so->currPos.itemIndex > so->currPos.lastItem);
    else
        pageExhausted = (--so->currPos.itemIndex < so->currPos.firstItem);

    if (pageExhausted)
    {
        /* _bt_steppage requires the current buffer to be locked on entry */
        Assert(BufferIsValid(so->currPos.buf));
        LockBuffer(so->currPos.buf, BT_READ);

        if (!_bt_steppage(scan, dir))
            return false;

        /* Keep the pin on the new page, but release its lock */
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
    }

    /* itemIndex now identifies the item to hand back */
    item = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_ctup.t_self = item->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + item->tupleOffset);

    return true;
}

static bool _bt_readpage	(	IndexScanDesc	scan,
		ScanDirection	dir,
		OffsetNumber	offnum
	)			`[static]`

Definition at line 1056 of file nbtsearch.c.

References _bt_checkkeys(), _bt_saveitem(), Assert, BTScanPosData::buf, BufferGetPage, BufferIsValid, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::lastItem, Max, MaxIndexTuplesPerPage, Min, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, NULL, OffsetNumberNext, OffsetNumberPrev, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetMaxOffsetNumber, PageGetSpecialPointer, and ScanDirectionIsForward.

Referenced by _bt_endpoint(), _bt_first(), and _bt_steppage().

{
    /*
     * Collect every tuple on the current page that satisfies the scan keys,
     * starting at "offnum" and moving in direction "dir", into
     * so->currPos.items[].  Returns true if at least one item was saved.
     * moreRight/moreLeft is cleared when _bt_checkkeys reports that the scan
     * cannot continue in that direction.
     */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber firstoff;
    OffsetNumber lastoff;
    int         slot;
    IndexTuple  match;
    bool        keepgoing;

    /* caller must hand us a valid (pinned and locked) buffer */
    Assert(BufferIsValid(so->currPos.buf));

    page = BufferGetPage(so->currPos.buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    firstoff = P_FIRSTDATAKEY(opaque);
    lastoff = PageGetMaxOffsetNumber(page);

    /*
     * Capture the right-link now, while we are looking at the page; it is
     * what a later step to the right will follow.  The left-link needs no
     * such treatment because splits always go right.
     */
    so->currPos.nextPage = opaque->btpo_next;

    /* start with an empty tuple workspace */
    so->currPos.nextTupleOffset = 0;

    if (ScanDirectionIsForward(dir))
    {
        /* fill items[] upward from slot 0 */
        slot = 0;

        /* never start before the first data item */
        if (offnum < firstoff)
            offnum = firstoff;

        for (; offnum <= lastoff; offnum = OffsetNumberNext(offnum))
        {
            match = _bt_checkkeys(scan, page, offnum, dir, &keepgoing);

            /* remember the tuple if it passed every scan key */
            if (match != NULL)
                _bt_saveitem(so, slot++, offnum, match);

            if (!keepgoing)
            {
                /* no further matches are possible to the right */
                so->currPos.moreRight = false;
                break;
            }
        }

        Assert(slot <= MaxIndexTuplesPerPage);
        so->currPos.firstItem = 0;
        so->currPos.lastItem = slot - 1;
        so->currPos.itemIndex = 0;
    }
    else
    {
        /* fill items[] downward from the top of the array */
        slot = MaxIndexTuplesPerPage;

        /* never start past the last item */
        if (offnum > lastoff)
            offnum = lastoff;

        for (; offnum >= firstoff; offnum = OffsetNumberPrev(offnum))
        {
            match = _bt_checkkeys(scan, page, offnum, dir, &keepgoing);

            /* remember the tuple if it passed every scan key */
            if (match != NULL)
                _bt_saveitem(so, --slot, offnum, match);

            if (!keepgoing)
            {
                /* no further matches are possible to the left */
                so->currPos.moreLeft = false;
                break;
            }
        }

        Assert(slot >= 0);
        so->currPos.firstItem = slot;
        so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
        so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
    }

    /* an empty range (firstItem > lastItem) means nothing matched */
    return (so->currPos.firstItem <= so->currPos.lastItem);
}

static void _bt_saveitem	(	BTScanOpaque	so,
		int	itemIndex,
		OffsetNumber	offnum,
		IndexTuple	itup
	)			`[static]`

Definition at line 1153 of file nbtsearch.c.

References BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosItem::heapTid, BTScanPosItem::indexOffset, IndexTupleSize, BTScanPosData::items, MAXALIGN, BTScanPosData::nextTupleOffset, IndexTupleData::t_tid, and BTScanPosItem::tupleOffset.

Referenced by _bt_readpage().

{
    /*
     * Record one matching index tuple in so->currPos.items[itemIndex]: its
     * heap TID and page offset always, plus a copy of the tuple itself when
     * the scan has a tuple workspace (so->currTuples is set).
     */
    BTScanPosItem *slot = so->currPos.items + itemIndex;
    Size        tuplen;

    slot->heapTid = itup->t_tid;
    slot->indexOffset = offnum;

    /* nothing more to do unless tuple copies are being kept */
    if (so->currTuples == NULL)
        return;

    /* append the tuple to the workspace and advance the aligned fill point */
    tuplen = IndexTupleSize(itup);
    slot->tupleOffset = so->currPos.nextTupleOffset;
    memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, tuplen);
    so->currPos.nextTupleOffset += MAXALIGN(tuplen);
}

BTStack _bt_search	(	Relation	rel,
		int	keysz,
		ScanKey	scankey,
		bool	nextkey,
		Buffer *	bufP,
		int	access
	)

Definition at line 57 of file nbtsearch.c.

References _bt_binsrch(), _bt_getroot(), _bt_moveright(), _bt_relandgetbuf(), BT_READ, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, ItemPointerGetBlockNumber, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, palloc(), and IndexTupleData::t_tid.

Referenced by _bt_doinsert(), _bt_first(), and _bt_pagedel().

{
    /*
     * Descend from the root to the leaf page that should contain the scan
     * key.  On return *bufP holds that leaf's buffer (or an invalid buffer
     * if _bt_getroot produced none), and the result is a stack of the parent
     * positions visited, innermost first (NULL if no descent happened).
     *
     * "access" is passed only to _bt_getroot; every page visited during the
     * descent is obtained with BT_READ.
     */
    BTStack     parents = NULL;

    /* Begin at the root */
    *bufP = _bt_getroot(rel, access);

    /* No root page (empty index read with BT_READ): nothing to search */
    if (!BufferIsValid(*bufP))
        return (BTStack) NULL;

    /* One pass per tree level */
    for (;;)
    {
        Page        page;
        BTPageOpaque opaque;
        OffsetNumber downoff;
        IndexTuple  downlink;
        BTStack     frame;

        /*
         * The page may have split after we read the pointer to it in its
         * parent (or the metapage); if so, slide right to the sibling that
         * now covers our key.
         */
        *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);

        /* stop as soon as we are on a leaf */
        page = BufferGetPage(*bufP);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (P_ISLEAF(opaque))
            break;

        /* Pick the downlink on this internal page that covers the key */
        downoff = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
        downlink = (IndexTuple) PageGetItem(page,
                                            PageGetItemId(page, downoff));

        /*
         * Push the chosen parent entry onto the stack so a later page split
         * can find its way back up.  The downlink TID is stored too: it
         * uniquely identifies the entry should it move right meanwhile, and
         * disambiguates duplicate keys (which Lehman and Yao do not allow).
         */
        frame = (BTStack) palloc(sizeof(BTStackData));
        frame->bts_blkno = BufferGetBlockNumber(*bufP);
        frame->bts_offset = downoff;
        memcpy(&frame->bts_btentry, downlink, sizeof(IndexTupleData));
        frame->bts_parent = parents;
        parents = frame;

        /* trade the parent's read lock for one on the child */
        *bufP = _bt_relandgetbuf(rel, *bufP,
                                 ItemPointerGetBlockNumber(&(downlink->t_tid)),
                                 BT_READ);
    }

    return parents;
}

static bool _bt_steppage	(	IndexScanDesc	scan,
		ScanDirection	dir
	)			`[static]`

Definition at line 1183 of file nbtsearch.c.

References _bt_getbuf(), _bt_killitems(), _bt_readpage(), _bt_relbuf(), _bt_walk_left(), Assert, BT_READ, BTScanPosData::buf, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, CHECK_FOR_INTERRUPTS, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), IndexScanDescData::indexRelation, InvalidBuffer, BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::moreLeft, BTScanPosData::moreRight, BTScanPosData::nextPage, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, P_FIRSTDATAKEY, P_IGNORE, P_NONE, PageGetMaxOffsetNumber, PageGetSpecialPointer, PredicateLockPage(), ScanDirectionIsForward, and IndexScanDescData::xs_snapshot.

Referenced by _bt_endpoint(), _bt_first(), and _bt_next().

{
    /*
     * Leave the current page and advance, in direction "dir", to the next
     * page on which _bt_readpage finds at least one matching item.
     *
     * Returns true when such a page was found; so->currPos then describes
     * it.  The callers visible in this file unlock that buffer themselves
     * afterwards while keeping the pin.  Returns false when the scan is
     * finished in this direction; on every false path so->currPos.buf ends
     * up as InvalidBuffer.
     */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Relation    rel;
    Page        page;
    BTPageOpaque opaque;

    /* we must have the buffer pinned and locked */
    Assert(BufferIsValid(so->currPos.buf));

    /* Before leaving current page, deal with any killed items */
    if (so->numKilled > 0)
        _bt_killitems(scan, true);

    /*
     * Before we modify currPos, make a copy of the page data if there was a
     * mark position that needs it.
     */
    if (so->markItemIndex >= 0)
    {
        /* bump pin on current buffer for assignment to mark buffer */
        IncrBufferRefCount(so->currPos.buf);

        /*
         * offsetof(..., items[1]) spans everything up to and including
         * items[0]; adding lastItem more entries copies items[0..lastItem].
         */
        memcpy(&so->markPos, &so->currPos,
               offsetof(BTScanPosData, items[1]) +
               so->currPos.lastItem * sizeof(BTScanPosItem));
        if (so->markTuples)
            memcpy(so->markTuples, so->currTuples,
                   so->currPos.nextTupleOffset);
        so->markPos.itemIndex = so->markItemIndex;
        /* the pending mark has now been materialized in markPos */
        so->markItemIndex = -1;
    }

    rel = scan->indexRelation;

    if (ScanDirectionIsForward(dir))
    {
        /* Walk right to the next page with data */
        /* We must rely on the previously saved nextPage link! */
        BlockNumber blkno = so->currPos.nextPage;

        /* Remember we left a page with data */
        so->currPos.moreLeft = true;

        for (;;)
        {
            /*
             * The release sits at the top of the loop so that it covers both
             * the page we entered with and any page rejected by the previous
             * iteration.
             */
            /* release the previous buffer */
            _bt_relbuf(rel, so->currPos.buf);
            so->currPos.buf = InvalidBuffer;
            /* if we're at end of scan, give up */
            if (blkno == P_NONE || !so->currPos.moreRight)
                return false;
            /* check for interrupts while we're not holding any buffer lock */
            CHECK_FOR_INTERRUPTS();
            /* step right one page */
            so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ);
            /* check for deleted page */
            page = BufferGetPage(so->currPos.buf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
            if (!P_IGNORE(opaque))
            {
                PredicateLockPage(rel, blkno, scan->xs_snapshot);
                /* see if there are any matches on this page */
                /* note that this will clear moreRight if we can stop */
                if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque)))
                    break;
            }
            /* nope, keep going */
            /* (an ignorable page is passed over by following its right-link) */
            blkno = opaque->btpo_next;
        }
    }
    else
    {
        /* Remember we left a page with data */
        so->currPos.moreRight = true;

        /*
         * Walk left to the next page with data.  This is much more complex
         * than the walk-right case because of the possibility that the page
         * to our left splits while we are in flight to it, plus the
         * possibility that the page we were on gets deleted after we leave
         * it.  See nbtree/README for details.
         */
        for (;;)
        {
            /* Done if we know there are no matching keys to the left */
            if (!so->currPos.moreLeft)
            {
                _bt_relbuf(rel, so->currPos.buf);
                so->currPos.buf = InvalidBuffer;
                return false;
            }

            /* Step to next physical page */
            /* (_bt_walk_left releases the buffer passed to it) */
            so->currPos.buf = _bt_walk_left(rel, so->currPos.buf);

            /* if we're physically at end of index, return failure */
            if (so->currPos.buf == InvalidBuffer)
                return false;

            /*
             * Okay, we managed to move left to a non-deleted page. Done if
             * it's not half-dead and contains matching tuples. Else loop back
             * and do it all again.
             */
            page = BufferGetPage(so->currPos.buf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
            if (!P_IGNORE(opaque))
            {
                PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot);
                /* see if there are any matches on this page */
                /* note that this will clear moreLeft if we can stop */
                if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page)))
                    break;
            }
        }
    }

    return true;
}

static Buffer _bt_walk_left	(	Relation	rel,
		Buffer	buf
	)			`[static]`

Definition at line 1318 of file nbtsearch.c.

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetBlockNumber(), BufferGetPage, CHECK_FOR_INTERRUPTS, elog, ERROR, P_ISDELETED, P_LEFTMOST, P_RIGHTMOST, PageGetSpecialPointer, and RelationGetRelationName.

Referenced by _bt_steppage().

{
    /*
     * Step from the page in "buf" to its left sibling, coping with
     * concurrent splits and deletions.
     *
     * Returns the buffer (obtained with BT_READ) of a non-deleted page whose
     * right-link points at the page we stepped left from.  Returns
     * InvalidBuffer, after releasing the buffer, when the page we are on is
     * the leftmost one.  The buffer passed in is always given up, either by
     * _bt_relbuf or by _bt_relandgetbuf.
     *
     * NOTE(review): presumably the caller holds a pin and read lock on "buf"
     * on entry, as _bt_steppage's comments say -- confirm.
     */
    Page        page;
    BTPageOpaque opaque;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* Each pass attempts one step left from the page currently in "buf" */
    for (;;)
    {
        BlockNumber obknum;
        BlockNumber lblkno;
        BlockNumber blkno;
        int         tries;

        /* if we're at end of tree, release buf and return failure */
        if (P_LEFTMOST(opaque))
        {
            _bt_relbuf(rel, buf);
            break;
        }
        /* remember original page we are stepping left from */
        obknum = BufferGetBlockNumber(buf);
        /* step left */
        /* lblkno keeps the left-link we followed, for the sanity check below */
        blkno = lblkno = opaque->btpo_prev;
        _bt_relbuf(rel, buf);
        /* check for interrupts while we're not holding any buffer lock */
        CHECK_FOR_INTERRUPTS();
        buf = _bt_getbuf(rel, blkno, BT_READ);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        /*
         * If this isn't the page we want, walk right till we find what we
         * want --- but go no more than four hops (an arbitrary limit). If we
         * don't find the correct page by then, the most likely bet is that
         * the original page got deleted and isn't in the sibling chain at all
         * anymore, not that its left sibling got split more than four times.
         *
         * Note that it is correct to test P_ISDELETED not P_IGNORE here,
         * because half-dead pages are still in the sibling chain.  Caller
         * must reject half-dead pages if wanted.
         */
        tries = 0;
        for (;;)
        {
            if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum)
            {
                /* Found desired page, return it */
                return buf;
            }
            /* give up at the end of the level or once the hop budget is used */
            if (P_RIGHTMOST(opaque) || ++tries > 4)
                break;
            blkno = opaque->btpo_next;
            buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
            page = BufferGetPage(buf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }

        /* Return to the original page to see what's up */
        buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (P_ISDELETED(opaque))
        {
            /*
             * It was deleted.  Move right to first nondeleted page (there
             * must be one); that is the page that has acquired the deleted
             * one's keyspace, so stepping left from it will take us where we
             * want to be.
             */
            for (;;)
            {
                if (P_RIGHTMOST(opaque))
                    elog(ERROR, "fell off the end of index \"%s\"",
                         RelationGetRelationName(rel));
                blkno = opaque->btpo_next;
                buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
                page = BufferGetPage(buf);
                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
                if (!P_ISDELETED(opaque))
                    break;
            }

            /*
             * Now return to top of loop, resetting obknum to point to this
             * nondeleted page, and try again.
             */
        }
        else
        {
            /*
             * It wasn't deleted; the explanation had better be that the page
             * to the left got split or deleted. Without this check, we'd go
             * into an infinite loop if there's anything wrong.
             */
            /* an unchanged left-link means a retry would repeat the same walk */
            if (opaque->btpo_prev == lblkno)
                elog(ERROR, "could not find left sibling of block %u in index \"%s\"",
                     obknum, RelationGetRelationName(rel));
            /* Okay to try again with new lblkno value */
        }
    }

    /* reached only via the leftmost-page break above */
    return InvalidBuffer;
}

Header And Logo

nbtsearch.c File Reference

Functions

Function Documentation