Header And Logo

PostgreSQL
| The world's most advanced open source database.

Data Structures | Defines | Typedefs | Functions

nbtree.h File Reference

#include "access/genam.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "catalog/pg_index.h"
Include dependency graph for nbtree.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  BTPageOpaqueData
struct  BTMetaPageData
struct  xl_btreetid
struct  xl_btree_metadata
struct  xl_btree_insert
struct  xl_btree_split
struct  xl_btree_delete
struct  xl_btree_reuse_page
struct  xl_btree_vacuum
struct  xl_btree_delete_page
struct  xl_btree_newroot
struct  BTStackData
struct  BTScanPosItem
struct  BTScanPosData
struct  BTArrayKeyInfo
struct  BTScanOpaqueData

Defines

#define BTP_LEAF   (1 << 0)
#define BTP_ROOT   (1 << 1)
#define BTP_DELETED   (1 << 2)
#define BTP_META   (1 << 3)
#define BTP_HALF_DEAD   (1 << 4)
#define BTP_SPLIT_END   (1 << 5)
#define BTP_HAS_GARBAGE   (1 << 6)
#define MAX_BT_CYCLE_ID   0xFF7F
#define BTPageGetMeta(p)   ((BTMetaPageData *) PageGetContents(p))
#define BTREE_METAPAGE   0
#define BTREE_MAGIC   0x053162
#define BTREE_VERSION   2
#define BTMaxItemSize(page)
#define BTREE_MIN_FILLFACTOR   10
#define BTREE_DEFAULT_FILLFACTOR   90
#define BTREE_NONLEAF_FILLFACTOR   70
#define BTTidSame(i1, i2)
#define BTEntrySame(i1, i2)   BTTidSame((i1)->t_tid, (i2)->t_tid)
#define P_NONE   0
#define P_LEFTMOST(opaque)   ((opaque)->btpo_prev == P_NONE)
#define P_RIGHTMOST(opaque)   ((opaque)->btpo_next == P_NONE)
#define P_ISLEAF(opaque)   ((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT(opaque)   ((opaque)->btpo_flags & BTP_ROOT)
#define P_ISDELETED(opaque)   ((opaque)->btpo_flags & BTP_DELETED)
#define P_ISHALFDEAD(opaque)   ((opaque)->btpo_flags & BTP_HALF_DEAD)
#define P_IGNORE(opaque)   ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
#define P_HAS_GARBAGE(opaque)   ((opaque)->btpo_flags & BTP_HAS_GARBAGE)
#define P_HIKEY   ((OffsetNumber) 1)
#define P_FIRSTKEY   ((OffsetNumber) 2)
#define P_FIRSTDATAKEY(opaque)   (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
#define XLOG_BTREE_INSERT_LEAF   0x00
#define XLOG_BTREE_INSERT_UPPER   0x10
#define XLOG_BTREE_INSERT_META   0x20
#define XLOG_BTREE_SPLIT_L   0x30
#define XLOG_BTREE_SPLIT_R   0x40
#define XLOG_BTREE_SPLIT_L_ROOT   0x50
#define XLOG_BTREE_SPLIT_R_ROOT   0x60
#define XLOG_BTREE_DELETE   0x70
#define XLOG_BTREE_DELETE_PAGE   0x80
#define XLOG_BTREE_DELETE_PAGE_META   0x90
#define XLOG_BTREE_NEWROOT   0xA0
#define XLOG_BTREE_DELETE_PAGE_HALF   0xB0
#define XLOG_BTREE_VACUUM   0xC0
#define XLOG_BTREE_REUSE_PAGE   0xD0
#define SizeOfBtreeInsert   (offsetof(xl_btreetid, tid) + SizeOfIptrData)
#define SizeOfBtreeSplit   (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))
#define SizeOfBtreeDelete   (offsetof(xl_btree_delete, nitems) + sizeof(int))
#define SizeOfBtreeReusePage   (sizeof(xl_btree_reuse_page))
#define SizeOfBtreeVacuum   (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
#define SizeOfBtreeDeletePage   (offsetof(xl_btree_delete_page, btpo_xact) + sizeof(TransactionId))
#define SizeOfBtreeNewroot   (offsetof(xl_btree_newroot, level) + sizeof(uint32))
#define BTCommuteStrategyNumber(strat)   (BTMaxStrategyNumber + 1 - (strat))
#define BTORDER_PROC   1
#define BTSORTSUPPORT_PROC   2
#define BT_READ   BUFFER_LOCK_SHARE
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
#define BTScanPosIsValid(scanpos)   BufferIsValid((scanpos).buf)
#define SK_BT_REQFWD   0x00010000
#define SK_BT_REQBKWD   0x00020000
#define SK_BT_INDOPTION_SHIFT   24
#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

Typedefs

typedef uint16 BTCycleId
typedef struct BTPageOpaqueData BTPageOpaqueData
typedef BTPageOpaqueData *BTPageOpaque
typedef struct BTMetaPageData BTMetaPageData
typedef struct xl_btreetid xl_btreetid
typedef struct xl_btree_metadata xl_btree_metadata
typedef struct xl_btree_insert xl_btree_insert
typedef struct xl_btree_split xl_btree_split
typedef struct xl_btree_delete xl_btree_delete
typedef struct xl_btree_reuse_page xl_btree_reuse_page
typedef struct xl_btree_vacuum xl_btree_vacuum
typedef struct xl_btree_delete_page xl_btree_delete_page
typedef struct xl_btree_newroot xl_btree_newroot
typedef struct BTStackData BTStackData
typedef BTStackData *BTStack
typedef struct BTScanPosItem BTScanPosItem
typedef struct BTScanPosData BTScanPosData
typedef BTScanPosData *BTScanPos
typedef struct BTArrayKeyInfo BTArrayKeyInfo
typedef struct BTScanOpaqueData BTScanOpaqueData
typedef BTScanOpaqueData *BTScanOpaque
typedef struct BTSpool BTSpool

Functions

Datum btbuild (PG_FUNCTION_ARGS)
Datum btbuildempty (PG_FUNCTION_ARGS)
Datum btinsert (PG_FUNCTION_ARGS)
Datum btbeginscan (PG_FUNCTION_ARGS)
Datum btgettuple (PG_FUNCTION_ARGS)
Datum btgetbitmap (PG_FUNCTION_ARGS)
Datum btrescan (PG_FUNCTION_ARGS)
Datum btendscan (PG_FUNCTION_ARGS)
Datum btmarkpos (PG_FUNCTION_ARGS)
Datum btrestrpos (PG_FUNCTION_ARGS)
Datum btbulkdelete (PG_FUNCTION_ARGS)
Datum btvacuumcleanup (PG_FUNCTION_ARGS)
Datum btcanreturn (PG_FUNCTION_ARGS)
Datum btoptions (PG_FUNCTION_ARGS)
bool _bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
Buffer _bt_getstackbuf (Relation rel, BTStack stack, int access)
void _bt_insert_parent (Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
void _bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level)
Buffer _bt_getroot (Relation rel, int access)
Buffer _bt_gettrueroot (Relation rel)
int _bt_getrootheight (Relation rel)
void _bt_checkpage (Relation rel, Buffer buf)
Buffer _bt_getbuf (Relation rel, BlockNumber blkno, int access)
Buffer _bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
void _bt_relbuf (Relation rel, Buffer buf)
void _bt_pageinit (Page page, Size size)
bool _bt_page_recyclable (Page page)
void _bt_delitems_delete (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel)
void _bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed)
int _bt_pagedel (Relation rel, Buffer buf, BTStack stack)
BTStack _bt_search (Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access)
Buffer _bt_moveright (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, int access)
OffsetNumber _bt_binsrch (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
int32 _bt_compare (Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
bool _bt_first (IndexScanDesc scan, ScanDirection dir)
bool _bt_next (IndexScanDesc scan, ScanDirection dir)
Buffer _bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
ScanKey _bt_mkscankey (Relation rel, IndexTuple itup)
ScanKey _bt_mkscankey_nodata (Relation rel)
void _bt_freeskey (ScanKey skey)
void _bt_freestack (BTStack stack)
void _bt_preprocess_array_keys (IndexScanDesc scan)
void _bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
bool _bt_advance_array_keys (IndexScanDesc scan, ScanDirection dir)
void _bt_mark_array_keys (IndexScanDesc scan)
void _bt_restore_array_keys (IndexScanDesc scan)
void _bt_preprocess_keys (IndexScanDesc scan)
IndexTuple _bt_checkkeys (IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan)
void _bt_killitems (IndexScanDesc scan, bool haveLock)
BTCycleId _bt_vacuum_cycleid (Relation rel)
BTCycleId _bt_start_vacuum (Relation rel)
void _bt_end_vacuum (Relation rel)
void _bt_end_vacuum_callback (int code, Datum arg)
Size BTreeShmemSize (void)
void BTreeShmemInit (void)
BTSpool * _bt_spoolinit (Relation heap, Relation index, bool isunique, bool isdead)
void _bt_spooldestroy (BTSpool *btspool)
void _bt_spool (IndexTuple itup, BTSpool *btspool)
void _bt_leafbuild (BTSpool *btspool, BTSpool *spool2)
void btree_redo (XLogRecPtr lsn, XLogRecord *record)
void btree_desc (StringInfo buf, uint8 xl_info, char *rec)
void btree_xlog_startup (void)
void btree_xlog_cleanup (void)
bool btree_safe_restartpoint (void)

Define Documentation

#define BT_READ   BUFFER_LOCK_SHARE
#define BT_WRITE   BUFFER_LOCK_EXCLUSIVE
#define BTCommuteStrategyNumber (   strat  )     (BTMaxStrategyNumber + 1 - (strat))

Definition at line 416 of file nbtree.h.

Referenced by _bt_compare_scankey_args(), and _bt_fix_scankey_strategy().

#define BTEntrySame (   i1,
  i2 
)    BTTidSame((i1)->t_tid, (i2)->t_tid)

Definition at line 155 of file nbtree.h.

Referenced by _bt_getstackbuf().

#define BTMaxItemSize (   page  ) 
Value:

Definition at line 117 of file nbtree.h.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

#define BTORDER_PROC   1
#define BTP_DELETED   (1 << 2)

Definition at line 71 of file nbtree.h.

Referenced by pgstat_btree_page().

#define BTP_HALF_DEAD   (1 << 4)

Definition at line 73 of file nbtree.h.

Referenced by pgstat_btree_page().

#define BTP_HAS_GARBAGE   (1 << 6)

Definition at line 75 of file nbtree.h.

#define BTP_LEAF   (1 << 0)

Definition at line 69 of file nbtree.h.

Referenced by _bt_blnewpage(), _bt_getroot(), and btree_xlog_split().

#define BTP_META   (1 << 3)

Definition at line 72 of file nbtree.h.

Referenced by _bt_getroot(), _bt_getrootheight(), and _bt_gettrueroot().

#define BTP_ROOT   (1 << 1)

Definition at line 70 of file nbtree.h.

Referenced by _bt_split().

#define BTP_SPLIT_END   (1 << 5)

Definition at line 74 of file nbtree.h.

Referenced by _bt_split(), and btvacuumpage().

#define BTPageGetMeta (   p  )     ((BTMetaPageData *) PageGetContents(p))
#define BTREE_DEFAULT_FILLFACTOR   90

Definition at line 130 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

#define BTREE_MAGIC   0x053162

Definition at line 108 of file nbtree.h.

Referenced by _bt_getroot(), _bt_getrootheight(), and _bt_gettrueroot().

#define BTREE_METAPAGE   0
#define BTREE_MIN_FILLFACTOR   10

Definition at line 129 of file nbtree.h.

#define BTREE_NONLEAF_FILLFACTOR   70

Definition at line 131 of file nbtree.h.

#define BTREE_VERSION   2

Definition at line 109 of file nbtree.h.

Referenced by _bt_getroot(), _bt_getrootheight(), and _bt_gettrueroot().

#define BTScanPosIsValid (   scanpos  )     BufferIsValid((scanpos).buf)

Definition at line 531 of file nbtree.h.

Referenced by btendscan(), btgettuple(), btmarkpos(), btrescan(), and btrestrpos().

#define BTSORTSUPPORT_PROC   2

Definition at line 431 of file nbtree.h.

Referenced by assignProcTypes(), get_sort_function_for_ordering_op(), and MJExamineQuals().

#define BTTidSame (   i1,
  i2 
)
Value:
( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
      (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
      (i1).ip_posid == (i2).ip_posid )

Definition at line 151 of file nbtree.h.

#define MAX_BT_CYCLE_ID   0xFF7F

Definition at line 84 of file nbtree.h.

Referenced by _bt_start_vacuum().

#define P_FIRSTDATAKEY (   opaque  )     (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
#define P_FIRSTKEY   ((OffsetNumber) 2)

Definition at line 201 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_newroot(), _bt_slideleft(), and btree_xlog_newroot().

#define P_HAS_GARBAGE (   opaque  )     ((opaque)->btpo_flags & BTP_HAS_GARBAGE)

Definition at line 180 of file nbtree.h.

Referenced by _bt_findinsertloc().

#define P_HIKEY   ((OffsetNumber) 1)
#define P_IGNORE (   opaque  )     ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
#define P_ISDELETED (   opaque  )     ((opaque)->btpo_flags & BTP_DELETED)
#define P_ISHALFDEAD (   opaque  )     ((opaque)->btpo_flags & BTP_HALF_DEAD)

Definition at line 178 of file nbtree.h.

Referenced by _bt_pagedel(), and btvacuumpage().

#define P_ISLEAF (   opaque  )     ((opaque)->btpo_flags & BTP_LEAF)
#define P_ISROOT (   opaque  )     ((opaque)->btpo_flags & BTP_ROOT)
#define P_LEFTMOST (   opaque  )     ((opaque)->btpo_prev == P_NONE)

Definition at line 173 of file nbtree.h.

Referenced by _bt_getroot(), _bt_insertonpg(), _bt_walk_left(), and btree_xlog_cleanup().

#define P_NONE   0
#define P_RIGHTMOST (   opaque  )     ((opaque)->btpo_next == P_NONE)
#define SizeOfBtreeDelete   (offsetof(xl_btree_delete, nitems) + sizeof(int))

Definition at line 326 of file nbtree.h.

Referenced by btree_xlog_delete().

#define SizeOfBtreeDeletePage   (offsetof(xl_btree_delete_page, btpo_xact) + sizeof(TransactionId))

Definition at line 389 of file nbtree.h.

Referenced by btree_xlog_delete_page().

#define SizeOfBtreeInsert   (offsetof(xl_btreetid, tid) + SizeOfIptrData)

Definition at line 262 of file nbtree.h.

Referenced by btree_xlog_insert().

#define SizeOfBtreeNewroot   (offsetof(xl_btree_newroot, level) + sizeof(uint32))

Definition at line 406 of file nbtree.h.

Referenced by btree_xlog_newroot().

#define SizeOfBtreeReusePage   (sizeof(xl_btree_reuse_page))

Definition at line 338 of file nbtree.h.

#define SizeOfBtreeSplit   (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))

Definition at line 308 of file nbtree.h.

Referenced by btree_xlog_split().

#define SizeOfBtreeVacuum   (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))

Definition at line 370 of file nbtree.h.

Referenced by btree_xlog_vacuum().

#define SK_BT_DESC   (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_INDOPTION_SHIFT   24

Definition at line 592 of file nbtree.h.

Referenced by _bt_mkscankey().

#define SK_BT_NULLS_FIRST   (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
#define SK_BT_REQBKWD   0x00020000

Definition at line 591 of file nbtree.h.

Referenced by _bt_check_rowcompare(), and _bt_checkkeys().

#define SK_BT_REQFWD   0x00010000

Definition at line 590 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

#define XLOG_BTREE_DELETE   0x70

Definition at line 217 of file nbtree.h.

Referenced by _bt_delitems_delete(), btree_desc(), and btree_redo().

#define XLOG_BTREE_DELETE_PAGE   0x80

Definition at line 218 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_DELETE_PAGE_HALF   0xB0

Definition at line 221 of file nbtree.h.

Referenced by btree_desc(), btree_redo(), and btree_xlog_delete_page().

#define XLOG_BTREE_DELETE_PAGE_META   0x90

Definition at line 219 of file nbtree.h.

Referenced by btree_desc(), btree_redo(), and btree_xlog_delete_page().

#define XLOG_BTREE_INSERT_LEAF   0x00

Definition at line 210 of file nbtree.h.

Referenced by _bt_insertonpg(), btree_desc(), and btree_redo().

#define XLOG_BTREE_INSERT_META   0x20

Definition at line 212 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_INSERT_UPPER   0x10

Definition at line 211 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_NEWROOT   0xA0

Definition at line 220 of file nbtree.h.

Referenced by _bt_getroot(), _bt_newroot(), btree_desc(), and btree_redo().

#define XLOG_BTREE_REUSE_PAGE   0xD0

Definition at line 225 of file nbtree.h.

Referenced by _bt_log_reuse_page(), btree_desc(), and btree_redo().

#define XLOG_BTREE_SPLIT_L   0x30

Definition at line 213 of file nbtree.h.

Referenced by _bt_split(), btree_desc(), and btree_redo().

#define XLOG_BTREE_SPLIT_L_ROOT   0x50

Definition at line 215 of file nbtree.h.

Referenced by _bt_split(), btree_desc(), and btree_redo().

#define XLOG_BTREE_SPLIT_R   0x40

Definition at line 214 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_SPLIT_R_ROOT   0x60

Definition at line 216 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_VACUUM   0xC0

Definition at line 223 of file nbtree.h.

Referenced by _bt_delitems_vacuum(), btree_desc(), and btree_redo().


Typedef Documentation

typedef uint16 BTCycleId

Definition at line 25 of file nbtree.h.

Definition at line 66 of file nbtree.h.

Definition at line 583 of file nbtree.h.

Definition at line 529 of file nbtree.h.

typedef struct BTScanPosData BTScanPosData
typedef struct BTScanPosItem BTScanPosItem
typedef struct BTSpool BTSpool

Definition at line 687 of file nbtree.h.

typedef BTStackData* BTStack

Definition at line 459 of file nbtree.h.

typedef struct BTStackData BTStackData
typedef struct xl_btreetid xl_btreetid

Function Documentation

bool _bt_advance_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 549 of file nbtutils.c.

References BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        advanced = false;
    int         keyidx;

    /*
     * Advance the rightmost array key fastest: it corresponds to the
     * lowest-order index column among the quals, and rolling the keys over
     * like odometer digits keeps the scan output correctly ordered when
     * several array keys are in play.
     */
    for (keyidx = so->numArrayKeys - 1; keyidx >= 0; keyidx--)
    {
        BTArrayKeyInfo *curArrayKey = &so->arrayKeys[keyidx];
        ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
        int         next_elem = curArrayKey->cur_elem;
        int         num_elems = curArrayKey->num_elems;

        /* Step to the adjacent element, wrapping around at either end */
        if (ScanDirectionIsBackward(dir))
        {
            next_elem--;
            advanced = (next_elem >= 0);
            if (!advanced)
                next_elem = num_elems - 1;      /* wrapped: carry to next key */
        }
        else
        {
            next_elem++;
            advanced = (next_elem < num_elems);
            if (!advanced)
                next_elem = 0;  /* wrapped: carry to next key */
        }

        /* Install the chosen element as this key's current comparison value */
        curArrayKey->cur_elem = next_elem;
        skey->sk_argument = curArrayKey->elem_values[next_elem];

        /* If this key advanced without wrapping, no carry is needed */
        if (advanced)
            break;
    }

    return advanced;
}

OffsetNumber _bt_binsrch ( Relation  rel,
Buffer  buf,
int  keysz,
ScanKey  scankey,
bool  nextkey 
)

Definition at line 234 of file nbtsearch.c.

References _bt_compare(), Assert, BufferGetPage, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_doinsert(), _bt_findinsertloc(), _bt_first(), and _bt_search().

{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    OffsetNumber lo,
                hi;
    int32       cmpval;

    lo = P_FIRSTDATAKEY(opaque);
    hi = PageGetMaxOffsetNumber(page);

    /*
     * A page with no keys -- either truly empty, or holding only a high key
     * (possible on a leaf after vacuuming) -- yields the first open slot.
     * Internal pages never hit this case: they always have children.
     */
    if (hi < lo)
        return lo;

    /*
     * Standard binary search for the first key >= scan key (nextkey=false,
     * cmpval=1) or the first key > scan key (nextkey=true, cmpval=0).
     *
     * Invariant: every slot before 'lo' compares below the threshold and
     * every slot at or beyond 'hi' compares at or above it.  Bumping 'hi'
     * one past the last slot makes the invariant hold trivially at entry;
     * the loop terminates when lo == hi.
     */
    hi++;
    cmpval = nextkey ? 0 : 1;

    while (lo < hi)
    {
        OffsetNumber mid = lo + ((hi - lo) / 2);
        int32       result;

        /* lo <= mid < hi, so mid always names a real slot */
        result = _bt_compare(rel, keysz, scankey, page, mid);

        if (result >= cmpval)
            lo = mid + 1;
        else
            hi = mid;
    }

    /*
     * Now lo == hi, possibly one past the last slot on the page.
     *
     * A leaf page returns that boundary slot directly: the first key >=
     * scan key (resp. > scan key), which may be last slot + 1.
     */
    if (P_ISLEAF(opaque))
        return lo;

    /*
     * An internal page instead returns the last key < scan key (resp. <=
     * scan key), to pick the downlink to follow.  One must exist provided
     * _bt_compare() honors its contract for the first data key.
     */
    Assert(lo > P_FIRSTDATAKEY(opaque));

    return OffsetNumberPrev(lo);
}

IndexTuple _bt_checkkeys ( IndexScanDesc  scan,
Page  page,
OffsetNumber  offnum,
ScanDirection  dir,
bool *continuescan 
)

Definition at line 1372 of file nbtutils.c.

References _bt_check_rowcompare(), Assert, DatumGetBool, FunctionCall2Coll(), IndexScanDescData::ignore_killed_tuples, index_getattr, IndexScanDescData::indexRelation, ItemIdIsDead, BTScanOpaqueData::keyData, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetDescr, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_NULLS_FIRST, SK_BT_REQBKWD, SK_BT_REQFWD, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHNOTNULL, and SK_SEARCHNULL.

Referenced by _bt_readpage().

{
    ItemId      iid = PageGetItemId(page, offnum);
    bool        tuple_alive;
    IndexTuple  tuple;
    TupleDesc   tupdesc;
    BTScanOpaque so;
    int         keysz;
    int         ikey;
    ScanKey     key;

    *continuescan = true;       /* default assumption */

    /*
     * If the scan specifies not to return killed tuples, then we treat a
     * killed tuple as not passing the qual.  Most of the time, it's a win to
     * not bother examining the tuple's index keys, but just return
     * immediately with continuescan = true to proceed to the next tuple.
     * However, if this is the last tuple on the page, we should check the
     * index keys to prevent uselessly advancing to the next page.
     */
    if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    {
        /* return immediately if there are more tuples on the page */
        if (ScanDirectionIsForward(dir))
        {
            if (offnum < PageGetMaxOffsetNumber(page))
                return NULL;
        }
        else
        {
            BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

            if (offnum > P_FIRSTDATAKEY(opaque))
                return NULL;
        }

        /*
         * OK, we want to check the keys so we can set continuescan correctly,
         * but we'll return NULL even if the tuple passes the key tests.
         */
        tuple_alive = false;
    }
    else
        tuple_alive = true;

    /* Fetch the tuple itself so its key columns can be tested below */
    tuple = (IndexTuple) PageGetItem(page, iid);

    tupdesc = RelationGetDescr(scan->indexRelation);
    so = (BTScanOpaque) scan->opaque;
    keysz = so->numberOfKeys;

    /* Evaluate each scan key against the tuple; all must pass. */
    for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
    {
        Datum       datum;
        bool        isNull;
        Datum       test;

        /* row-comparison keys need special processing */
        if (key->sk_flags & SK_ROW_HEADER)
        {
            if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
                continue;
            return NULL;
        }

        datum = index_getattr(tuple,
                              key->sk_attno,
                              tupdesc,
                              &isNull);

        if (key->sk_flags & SK_ISNULL)
        {
            /* Handle IS NULL/NOT NULL tests */
            if (key->sk_flags & SK_SEARCHNULL)
            {
                if (isNull)
                    continue;   /* tuple satisfies this qual */
            }
            else
            {
                Assert(key->sk_flags & SK_SEARCHNOTNULL);
                if (!isNull)
                    continue;   /* tuple satisfies this qual */
            }

            /*
             * Tuple fails this qual.  If it's a required qual for the current
             * scan direction, then we can conclude no further tuples will
             * pass, either.
             */
            if ((key->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((key->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return NULL;
        }

        if (isNull)
        {
            if (key->sk_flags & SK_BT_NULLS_FIRST)
            {
                /*
                 * Since NULLs are sorted before non-NULLs, we know we have
                 * reached the lower limit of the range of values for this
                 * index attr.  On a backward scan, we can stop if this qual
                 * is one of the "must match" subset.  We can stop regardless
                 * of whether the qual is > or <, so long as it's required,
                 * because it's not possible for any future tuples to pass. On
                 * a forward scan, however, we must keep going, because we may
                 * have initially positioned to the start of the index.
                 */
                if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
                    ScanDirectionIsBackward(dir))
                    *continuescan = false;
            }
            else
            {
                /*
                 * Since NULLs are sorted after non-NULLs, we know we have
                 * reached the upper limit of the range of values for this
                 * index attr.  On a forward scan, we can stop if this qual is
                 * one of the "must match" subset.  We can stop regardless of
                 * whether the qual is > or <, so long as it's required,
                 * because it's not possible for any future tuples to pass. On
                 * a backward scan, however, we must keep going, because we
                 * may have initially positioned to the end of the index.
                 */
                if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
                    ScanDirectionIsForward(dir))
                    *continuescan = false;
            }

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return NULL;
        }

        /* Non-null value: apply the key's comparison operator */
        test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
                                 datum, key->sk_argument);

        if (!DatumGetBool(test))
        {
            /*
             * Tuple fails this qual.  If it's a required qual for the current
             * scan direction, then we can conclude no further tuples will
             * pass, either.
             *
             * Note: because we stop the scan as soon as any required equality
             * qual fails, it is critical that equality quals be used for the
             * initial positioning in _bt_first() when they are available. See
             * comments in _bt_first().
             */
            if ((key->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((key->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return NULL;
        }
    }

    /* Check for failure due to it being a killed tuple. */
    if (!tuple_alive)
        return NULL;

    /* If we get here, the tuple passes all index quals. */
    return tuple;
}

void _bt_checkpage ( Relation  rel,
Buffer  buf 
)

Definition at line 492 of file nbtpage.c.

References BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), and btvacuumpage().

{
    Page        page = BufferGetPage(buf);

    /*
     * Every page ReadBuffer hands back has already passed PageHeaderIsValid,
     * so it is either a plausibly-formed page or entirely zeroes.  The
     * all-zero page is the case we still have to reject here.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
             errmsg("index \"%s\" contains unexpected zero page at block %u",
                    RelationGetRelationName(rel),
                    BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Also insist that the special space be sized exactly for a btree page
     * opaque area; anything else means the page is not a btree page.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

int32 _bt_compare ( Relation  rel,
int  keysz,
ScanKey  scankey,
Page  page,
OffsetNumber  offnum 
)

Definition at line 339 of file nbtsearch.c.

References DatumGetInt32, FunctionCall2Coll(), i, index_getattr, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_findinsertloc(), and _bt_moveright().

{
    TupleDesc   itupdesc = RelationGetDescr(rel);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    IndexTuple  itup;
    int         i;

    /*
     * Force result ">" if target item is first data item on an internal page
     * --- see NOTE above.
     */
    if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
        return 1;

    itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

    /*
     * The scan key is set up with the attribute number associated with each
     * term in the key.  It is important that, if the index is multi-key, the
     * scan contain the first k key attributes, and that they be in order.  If
     * you think about how multi-key ordering works, you'll understand why
     * this is.
     *
     * We don't test for violation of this condition here, however.  The
     * initial setup for the index scan had better have gotten it right (see
     * _bt_first).
     */

    /* Compare key columns left to right; the first inequality decides. */
    for (i = 1; i <= keysz; i++)
    {
        Datum       datum;
        bool        isNull;
        int32       result;

        datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

        /* see comments about NULLs handling in btbuild */
        if (scankey->sk_flags & SK_ISNULL)      /* key is NULL */
        {
            if (isNull)
                result = 0;     /* NULL "=" NULL */
            else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = -1;    /* NULL "<" NOT_NULL */
            else
                result = 1;     /* NULL ">" NOT_NULL */
        }
        else if (isNull)        /* key is NOT_NULL and item is NULL */
        {
            if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = 1;     /* NOT_NULL ">" NULL */
            else
                result = -1;    /* NOT_NULL "<" NULL */
        }
        else
        {
            /*
             * The sk_func needs to be passed the index value as left arg and
             * the sk_argument as right arg (they might be of different
             * types).  Since it is convenient for callers to think of
             * _bt_compare as comparing the scankey to the index item, we have
             * to flip the sign of the comparison result.  (Unless it's a DESC
             * column, in which case we *don't* flip the sign.)
             */
            result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
                                                     scankey->sk_collation,
                                                     datum,
                                                     scankey->sk_argument));

            if (!(scankey->sk_flags & SK_BT_DESC))
                result = -result;
        }

        /* if the keys are unequal, return the difference */
        if (result != 0)
            return result;

        /* this column was equal; move on to the scan key for the next one */
        scankey++;
    }

    /* if we get here, the keys are equal */
    return 0;
}

void _bt_delitems_delete ( Relation  rel,
Buffer  buf,
OffsetNumber itemnos,
int  nitems,
Relation  heapRel 
)

Definition at line 882 of file nbtpage.c.

References Assert, xl_btree_delete::block, BTPageOpaqueData::btpo_flags, XLogRecData::buffer, XLogRecData::buffer_std, BufferGetBlockNumber(), BufferGetPage, XLogRecData::data, END_CRIT_SECTION, xl_btree_delete::hnode, XLogRecData::len, MarkBufferDirty(), XLogRecData::next, xl_btree_delete::nitems, xl_btree_delete::node, PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, RelationData::rd_node, RelationNeedsWAL, START_CRIT_SECTION, XLOG_BTREE_DELETE, and XLogInsert().

Referenced by _bt_vacuum_one_page().

{
    /*
     * Physically delete the index tuples at the given offsets from the page
     * in buf.  This path is the opportunistic LP_DEAD cleanup done at insert
     * time (see _bt_vacuum_one_page); it is *not* used by VACUUM, which goes
     * through _bt_delitems_vacuum instead.
     */
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;

    /* Shouldn't be called unless there's something to do */
    Assert(nitems > 0);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    PageIndexMultiDelete(page, itemnos, nitems);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
     * because this is not called by VACUUM.
     */

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't included in our target-item list), but it will almost always be
     * true and it doesn't seem worth an additional page scan to check it.
     * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        XLogRecData rdata[3];
        xl_btree_delete xlrec_delete;

        /*
         * hnode identifies the heap relation, letting a standby compute
         * latestRemovedXid for recovery-conflict resolution.
         */
        xlrec_delete.node = rel->rd_node;
        xlrec_delete.hnode = heapRel->rd_node;
        xlrec_delete.block = BufferGetBlockNumber(buf);
        xlrec_delete.nitems = nitems;

        rdata[0].data = (char *) &xlrec_delete;
        rdata[0].len = SizeOfBtreeDelete;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);

        /*
         * We need the target-offsets array whether or not we store the whole
         * buffer, to allow us to find the latestRemovedXid on a standby
         * server.
         */
        rdata[1].data = (char *) itemnos;
        rdata[1].len = nitems * sizeof(OffsetNumber);
        rdata[1].buffer = InvalidBuffer;
        rdata[1].next = &(rdata[2]);

        /* Register the page buffer itself (standard layout, no extra data) */
        rdata[2].data = NULL;
        rdata[2].len = 0;
        rdata[2].buffer = buf;
        rdata[2].buffer_std = true;
        rdata[2].next = NULL;

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

void _bt_delitems_vacuum ( Relation  rel,
Buffer  buf,
OffsetNumber itemnos,
int  nitems,
BlockNumber  lastBlockVacuumed 
)

Definition at line 794 of file nbtpage.c.

References xl_btree_vacuum::block, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, XLogRecData::buffer, XLogRecData::buffer_std, BufferGetBlockNumber(), BufferGetPage, XLogRecData::data, END_CRIT_SECTION, xl_btree_vacuum::lastBlockVacuumed, XLogRecData::len, MarkBufferDirty(), XLogRecData::next, xl_btree_vacuum::node, PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, RelationData::rd_node, RelationNeedsWAL, START_CRIT_SECTION, XLOG_BTREE_VACUUM, and XLogInsert().

Referenced by btvacuumpage(), and btvacuumscan().

{
    /*
     * Delete the given index tuples from the page on behalf of VACUUM.
     * Unlike _bt_delitems_delete, this may be called with nitems == 0 just
     * to reset the vacuum cycle ID and emit a WAL record carrying
     * lastBlockVacuumed for standby replay.
     */
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    if (nitems > 0)
        PageIndexMultiDelete(page, itemnos, nitems);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't included in our target-item list), but it will almost always be
     * true and it doesn't seem worth an additional page scan to check it.
     * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        XLogRecData rdata[2];
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.node = rel->rd_node;
        xlrec_vacuum.block = BufferGetBlockNumber(buf);

        xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
        rdata[0].data = (char *) &xlrec_vacuum;
        rdata[0].len = SizeOfBtreeVacuum;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);

        /*
         * The target-offsets array is not in the buffer, but pretend that it
         * is.  When XLogInsert stores the whole buffer, the offsets array
         * need not be stored too.
         */
        if (nitems > 0)
        {
            rdata[1].data = (char *) itemnos;
            rdata[1].len = nitems * sizeof(OffsetNumber);
        }
        else
        {
            rdata[1].data = NULL;
            rdata[1].len = 0;
        }
        rdata[1].buffer = buf;
        rdata[1].buffer_std = true;
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

bool _bt_doinsert ( Relation  rel,
IndexTuple  itup,
IndexUniqueCheck  checkUnique,
Relation  heapRel 
)

Definition at line 103 of file nbtinsert.c.

References _bt_binsrch(), _bt_check_unique(), _bt_findinsertloc(), _bt_freeskey(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_moveright(), _bt_relbuf(), _bt_search(), BT_WRITE, buf, BUFFER_LOCK_UNLOCK, CheckForSerializableConflictIn(), LockBuffer(), NULL, RelationData::rd_rel, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, and XactLockTableWait().

Referenced by btinsert().

{
    /*
     * Insert itup into the btree rel, enforcing uniqueness per checkUnique.
     * Returns true if the tuple is known unique (only meaningful for a
     * partial, UNIQUE_CHECK_PARTIAL-style check; see _bt_check_unique).
     */
    bool        is_unique = false;
    int         natts = rel->rd_rel->relnatts;
    ScanKey     itup_scankey;
    BTStack     stack;
    Buffer      buf;
    OffsetNumber offset;

    /* we need an insertion scan key to do our search, so build one */
    itup_scankey = _bt_mkscankey(rel, itup);

top:
    /* find the first page containing this key */
    stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);

    offset = InvalidOffsetNumber;

    /* trade in our read lock for a write lock */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    LockBuffer(buf, BT_WRITE);

    /*
     * If the page was split between the time that we surrendered our read
     * lock and acquired our write lock, then this page may no longer be the
     * right place for the key we want to insert.  In this case, we need to
     * move right in the tree.  See Lehman and Yao for an excruciatingly
     * precise description.
     */
    buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);

    /*
     * If we're not allowing duplicates, make sure the key isn't already in
     * the index.
     *
     * NOTE: obviously, _bt_check_unique can only detect keys that are already
     * in the index; so it cannot defend against concurrent insertions of the
     * same key.  We protect against that by means of holding a write lock on
     * the target page.  Any other would-be inserter of the same key must
     * acquire a write lock on the same target page, so only one would-be
     * inserter can be making the check at one time.  Furthermore, once we are
     * past the check we hold write locks continuously until we have performed
     * our insertion, so no later inserter can fail to see our insertion.
     * (This requires some care in _bt_insertonpg.)
     *
     * If we must wait for another xact, we release the lock while waiting,
     * and then must start over completely.
     *
     * For a partial uniqueness check, we don't wait for the other xact. Just
     * let the tuple in and return false for possibly non-unique, or true for
     * definitely unique.
     */
    if (checkUnique != UNIQUE_CHECK_NO)
    {
        TransactionId xwait;

        offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
        xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
                                 checkUnique, &is_unique);

        if (TransactionIdIsValid(xwait))
        {
            /* Have to wait for the other guy ... */
            _bt_relbuf(rel, buf);
            XactLockTableWait(xwait);
            /* start over... (the stack is stale, so free it before retrying) */
            _bt_freestack(stack);
            goto top;
        }
    }

    if (checkUnique != UNIQUE_CHECK_EXISTING)
    {
        /*
         * The only conflict predicate locking cares about for indexes is when
         * an index tuple insert conflicts with an existing lock.  Since the
         * actual location of the insert is hard to predict because of the
         * random search used to prevent O(N^2) performance when there are
         * many duplicate entries, we can just use the "first valid" page.
         */
        CheckForSerializableConflictIn(rel, NULL, buf);
        /* do the insertion */
        _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel);
        _bt_insertonpg(rel, buf, stack, itup, offset, false);
    }
    else
    {
        /* just release the buffer; UNIQUE_CHECK_EXISTING only re-verifies */
        _bt_relbuf(rel, buf);
    }

    /* be tidy */
    _bt_freestack(stack);
    _bt_freeskey(itup_scankey);

    return is_unique;
}

void _bt_end_vacuum ( Relation  rel  ) 
void _bt_end_vacuum_callback ( int  code,
Datum  arg 
)

Definition at line 1967 of file nbtutils.c.

References _bt_end_vacuum(), and DatumGetPointer.

Referenced by btbulkdelete().

bool _bt_first ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 447 of file nbtsearch.c.

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosData::buf, buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, NULL, BTScanOpaqueData::numberOfKeys, BTScanOpaqueData::numKilled, OffsetNumberPrev, IndexScanDescData::opaque, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

{
    /*
     * Position the scan at the first item satisfying the scan keys in the
     * given direction, load that page's matching items into so->currPos,
     * and set scan->xs_ctup.t_self to the first matching heap TID.
     * Returns false if no matching items exist at all.
     */
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    BTStack     stack;
    OffsetNumber offnum;
    StrategyNumber strat;
    bool        nextkey;
    bool        goback;
    ScanKey     startKeys[INDEX_MAX_KEYS];
    ScanKeyData scankeys[INDEX_MAX_KEYS];
    /* storage for any SK_SEARCHNOTNULL keys we manufacture below */
    ScanKeyData notnullkeys[INDEX_MAX_KEYS];
    int         keysCount = 0;
    int         i;
    StrategyNumber strat_total;
    BTScanPosItem *currItem;

    pgstat_count_index_scan(rel);

    /*
     * Examine the scan keys and eliminate any redundant keys; also mark the
     * keys that must be matched to continue the scan.
     */
    _bt_preprocess_keys(scan);

    /*
     * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     * never be satisfied (eg, x == 1 AND x > 2).
     */
    if (!so->qual_ok)
        return false;

    /*----------
     * Examine the scan keys to discover where we need to start the scan.
     *
     * We want to identify the keys that can be used as starting boundaries;
     * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     * a backwards scan.  We can use keys for multiple attributes so long as
     * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     * a > or < boundary or find an attribute with no boundary (which can be
     * thought of as the same as "> -infinity"), we can't use keys for any
     * attributes to its right, because it would break our simplistic notion
     * of what initial positioning strategy to use.
     *
     * When the scan keys include cross-type operators, _bt_preprocess_keys
     * may not be able to eliminate redundant keys; in such cases we will
     * arbitrarily pick a usable one for each attribute.  This is correct
     * but possibly not optimal behavior.  (For example, with keys like
     * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     * x=5 would be more efficient.)  Since the situation only arises given
     * a poorly-worded query plus an incomplete opfamily, live with it.
     *
     * When both equality and inequality keys appear for a single attribute
     * (again, only possible when cross-type operators appear), we *must*
     * select one of the equality keys for the starting point, because
     * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     * start at x=4, we will fail and stop before reaching x=10.  If multiple
     * equality quals survive preprocessing, however, it doesn't matter which
     * one we use --- by definition, they are either redundant or
     * contradictory.
     *
     * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     * If the index stores nulls at the end of the index we'll be starting
     * from, and we have no boundary key for the column (which means the key
     * we deduced NOT NULL from is an inequality key that constrains the other
     * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     * use as a boundary key.  If we didn't do this, we might find ourselves
     * traversing a lot of null entries at the start of the scan.
     *
     * In this loop, row-comparison keys are treated the same as keys on their
     * first (leftmost) columns.  We'll add on lower-order columns of the row
     * comparison below, if possible.
     *
     * The selected scan keys (at most one per index column) are remembered by
     * storing their addresses into the local startKeys[] array.
     *----------
     */
    strat_total = BTEqualStrategyNumber;
    if (so->numberOfKeys > 0)
    {
        AttrNumber  curattr;
        ScanKey     chosen;
        ScanKey     impliesNN;
        ScanKey     cur;

        /*
         * chosen is the so-far-chosen key for the current attribute, if any.
         * We don't cast the decision in stone until we reach keys for the
         * next attribute.
         */
        curattr = 1;
        chosen = NULL;
        /* Also remember any scankey that implies a NOT NULL constraint */
        impliesNN = NULL;

        /*
         * Loop iterates from 0 to numberOfKeys inclusive; we use the last
         * pass to handle after-last-key processing.  Actual exit from the
         * loop is at one of the "break" statements below.
         */
        for (cur = so->keyData, i = 0;; cur++, i++)
        {
            if (i >= so->numberOfKeys || cur->sk_attno != curattr)
            {
                /*
                 * Done looking at keys for curattr.  If we didn't find a
                 * usable boundary key, see if we can deduce a NOT NULL key.
                 */
                if (chosen == NULL && impliesNN != NULL &&
                    ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                     ScanDirectionIsForward(dir) :
                     ScanDirectionIsBackward(dir)))
                {
                    /* Yes, so build the key in notnullkeys[keysCount] */
                    chosen = &notnullkeys[keysCount];
                    ScanKeyEntryInitialize(chosen,
                                           (SK_SEARCHNOTNULL | SK_ISNULL |
                                            (impliesNN->sk_flags &
                                          (SK_BT_DESC | SK_BT_NULLS_FIRST))),
                                           curattr,
                                 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                                  BTGreaterStrategyNumber :
                                  BTLessStrategyNumber),
                                           InvalidOid,
                                           InvalidOid,
                                           InvalidOid,
                                           (Datum) 0);
                }

                /*
                 * If we still didn't find a usable boundary key, quit; else
                 * save the boundary key pointer in startKeys.
                 */
                if (chosen == NULL)
                    break;
                startKeys[keysCount++] = chosen;

                /*
                 * Adjust strat_total, and quit if we have stored a > or <
                 * key.
                 */
                strat = chosen->sk_strategy;
                if (strat != BTEqualStrategyNumber)
                {
                    strat_total = strat;
                    if (strat == BTGreaterStrategyNumber ||
                        strat == BTLessStrategyNumber)
                        break;
                }

                /*
                 * Done if that was the last attribute, or if next key is not
                 * in sequence (implying no boundary key is available for the
                 * next attribute).
                 */
                if (i >= so->numberOfKeys ||
                    cur->sk_attno != curattr + 1)
                    break;

                /*
                 * Reset for next attr.
                 */
                curattr = cur->sk_attno;
                chosen = NULL;
                impliesNN = NULL;
            }

            /*
             * Can we use this key as a starting boundary for this attr?
             *
             * If not, does it imply a NOT NULL constraint?  (Because
             * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
             * *any* inequality key works for that; we need not test.)
             */
            switch (cur->sk_strategy)
            {
                case BTLessStrategyNumber:
                case BTLessEqualStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsBackward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
                case BTEqualStrategyNumber:
                    /* override any non-equality choice */
                    chosen = cur;
                    break;
                case BTGreaterEqualStrategyNumber:
                case BTGreaterStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsForward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
            }
        }
    }

    /*
     * If we found no usable boundary keys, we have to start from one end of
     * the tree.  Walk down that edge to the first or last key, and scan from
     * there.
     */
    if (keysCount == 0)
        return _bt_endpoint(scan, dir);

    /*
     * We want to start the scan somewhere within the index.  Set up an
     * insertion scankey we can use to search for the boundary point we
     * identified above.  The insertion scankey is built in the local
     * scankeys[] array, using the keys identified by startKeys[].
     */
    Assert(keysCount <= INDEX_MAX_KEYS);
    for (i = 0; i < keysCount; i++)
    {
        ScanKey     cur = startKeys[i];

        Assert(cur->sk_attno == i + 1);

        if (cur->sk_flags & SK_ROW_HEADER)
        {
            /*
             * Row comparison header: look to the first row member instead.
             *
             * The member scankeys are already in insertion format (ie, they
             * have sk_func = 3-way-comparison function), but we have to watch
             * out for nulls, which _bt_preprocess_keys didn't check. A null
             * in the first row member makes the condition unmatchable, just
             * like qual_ok = false.
             */
            ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_flags & SK_ISNULL)
                return false;
            memcpy(scankeys + i, subkey, sizeof(ScanKeyData));

            /*
             * If the row comparison is the last positioning key we accepted,
             * try to add additional keys from the lower-order row members.
             * (If we accepted independent conditions on additional index
             * columns, we use those instead --- doesn't seem worth trying to
             * determine which is more restrictive.)  Note that this is OK
             * even if the row comparison is of ">" or "<" type, because the
             * condition applied to all but the last row member is effectively
             * ">=" or "<=", and so the extra keys don't break the positioning
             * scheme.  But, by the same token, if we aren't able to use all
             * the row members, then the part of the row comparison that we
             * did use has to be treated as just a ">=" or "<=" condition, and
             * so we'd better adjust strat_total accordingly.
             */
            if (i == keysCount - 1)
            {
                bool        used_all_subkeys = false;

                Assert(!(subkey->sk_flags & SK_ROW_END));
                for (;;)
                {
                    subkey++;
                    Assert(subkey->sk_flags & SK_ROW_MEMBER);
                    if (subkey->sk_attno != keysCount + 1)
                        break;  /* out-of-sequence, can't use it */
                    if (subkey->sk_strategy != cur->sk_strategy)
                        break;  /* wrong direction, can't use it */
                    if (subkey->sk_flags & SK_ISNULL)
                        break;  /* can't use null keys */
                    Assert(keysCount < INDEX_MAX_KEYS);
                    memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
                    keysCount++;
                    if (subkey->sk_flags & SK_ROW_END)
                    {
                        used_all_subkeys = true;
                        break;
                    }
                }
                if (!used_all_subkeys)
                {
                    /* demote strict inequality to its non-strict form */
                    switch (strat_total)
                    {
                        case BTLessStrategyNumber:
                            strat_total = BTLessEqualStrategyNumber;
                            break;
                        case BTGreaterStrategyNumber:
                            strat_total = BTGreaterEqualStrategyNumber;
                            break;
                    }
                }
                break;          /* done with outer loop */
            }
        }
        else
        {
            /*
             * Ordinary comparison key.  Transform the search-style scan key
             * to an insertion scan key by replacing the sk_func with the
             * appropriate btree comparison function.
             *
             * If scankey operator is not a cross-type comparison, we can use
             * the cached comparison function; otherwise gotta look it up in
             * the catalogs.  (That can't lead to infinite recursion, since no
             * indexscan initiated by syscache lookup will use cross-data-type
             * operators.)
             *
             * We support the convention that sk_subtype == InvalidOid means
             * the opclass input type; this is a hack to simplify life for
             * ScanKeyInit().
             */
            if (cur->sk_subtype == rel->rd_opcintype[i] ||
                cur->sk_subtype == InvalidOid)
            {
                FmgrInfo   *procinfo;

                procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
                ScanKeyEntryInitializeWithInfo(scankeys + i,
                                               cur->sk_flags,
                                               cur->sk_attno,
                                               InvalidStrategy,
                                               cur->sk_subtype,
                                               cur->sk_collation,
                                               procinfo,
                                               cur->sk_argument);
            }
            else
            {
                RegProcedure cmp_proc;

                cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
                                             rel->rd_opcintype[i],
                                             cur->sk_subtype,
                                             BTORDER_PROC);
                if (!RegProcedureIsValid(cmp_proc))
                    elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
                         BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
                         cur->sk_attno, RelationGetRelationName(rel));
                ScanKeyEntryInitialize(scankeys + i,
                                       cur->sk_flags,
                                       cur->sk_attno,
                                       InvalidStrategy,
                                       cur->sk_subtype,
                                       cur->sk_collation,
                                       cmp_proc,
                                       cur->sk_argument);
            }
        }
    }

    /*----------
     * Examine the selected initial-positioning strategy to determine exactly
     * where we need to start the scan, and set flag variables to control the
     * code below.
     *
     * If nextkey = false, _bt_search and _bt_binsrch will locate the first
     * item >= scan key.  If nextkey = true, they will locate the first
     * item > scan key.
     *
     * If goback = true, we will then step back one item, while if
     * goback = false, we will start the scan on the located item.
     *----------
     */
    switch (strat_total)
    {
        case BTLessStrategyNumber:

            /*
             * Find first item >= scankey, then back up one to arrive at last
             * item < scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = false;
            goback = true;
            break;

        case BTLessEqualStrategyNumber:

            /*
             * Find first item > scankey, then back up one to arrive at last
             * item <= scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = true;
            goback = true;
            break;

        case BTEqualStrategyNumber:

            /*
             * If a backward scan was specified, need to start with last equal
             * item not first one.
             */
            if (ScanDirectionIsBackward(dir))
            {
                /*
                 * This is the same as the <= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = true;
                goback = true;
            }
            else
            {
                /*
                 * This is the same as the >= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = false;
                goback = false;
            }
            break;

        case BTGreaterEqualStrategyNumber:

            /*
             * Find first item >= scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = false;
            goback = false;
            break;

        case BTGreaterStrategyNumber:

            /*
             * Find first item > scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = true;
            goback = false;
            break;

        default:
            /* can't get here, but keep compiler quiet */
            elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
            return false;
    }

    /*
     * Use the manufactured insertion scan key to descend the tree and
     * position ourselves on the target leaf page.
     */
    stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);

    /* don't need to keep the stack around... */
    _bt_freestack(stack);

    /* remember which buffer we have pinned, if any */
    so->currPos.buf = buf;

    if (!BufferIsValid(buf))
    {
        /*
         * We only get here if the index is completely empty. Lock relation
         * because nothing finer to lock exists.
         */
        PredicateLockRelation(rel, scan->xs_snapshot);
        return false;
    }
    else
        PredicateLockPage(rel, BufferGetBlockNumber(buf),
                          scan->xs_snapshot);

    /* initialize moreLeft/moreRight appropriately for scan direction */
    if (ScanDirectionIsForward(dir))
    {
        so->currPos.moreLeft = false;
        so->currPos.moreRight = true;
    }
    else
    {
        so->currPos.moreLeft = true;
        so->currPos.moreRight = false;
    }
    so->numKilled = 0;          /* just paranoia */
    so->markItemIndex = -1;     /* ditto */

    /* position to the precise item on the page */
    offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);

    /*
     * If nextkey = false, we are positioned at the first item >= scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than the scan key and we know that everything on later pages is greater
     * than or equal to scan key.
     *
     * If nextkey = true, we are positioned at the first item > scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than or equal to the scan key and we know that everything on later
     * pages is greater than scan key.
     *
     * The actually desired starting point is either this item or the prior
     * one, or in the end-of-page case it's the first item on the next page or
     * the last item on this page.  Adjust the starting offset if needed. (If
     * this results in an offset before the first item or after the last one,
     * _bt_readpage will report no items found, and then we'll step to the
     * next page as needed.)
     */
    if (goback)
        offnum = OffsetNumberPrev(offnum);

    /*
     * Now load data from the first page of the scan.
     */
    if (!_bt_readpage(scan, dir, offnum))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        if (!_bt_steppage(scan, dir))
            return false;
    }

    /* Drop the lock, but not pin, on the current page */
    LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_ctup.t_self = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}

void _bt_freeskey ( ScanKey  skey  ) 

Definition at line 155 of file nbtutils.c.

References pfree().

Referenced by _bt_doinsert(), and _bt_load().

{
    /* Free a scan key array as a single palloc'd chunk. */
    pfree(skey);
}

void _bt_freestack ( BTStack  stack  ) 

Definition at line 164 of file nbtutils.c.

References BTStackData::bts_parent, NULL, and pfree().

Referenced by _bt_doinsert(), and _bt_first().

{
    /*
     * Walk the singly-linked BTStack chain (via bts_parent) and free each
     * entry.  We must grab the parent link before freeing the node.
     */
    while (stack != NULL)
    {
        BTStack     parent = stack->bts_parent;

        pfree(stack);
        stack = parent;
    }
}

Buffer _bt_get_endpoint ( Relation  rel,
uint32  level,
bool  rightmost 
)

Definition at line 1433 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, buf, BufferGetPage, BufferIsValid, elog, ERROR, ItemPointerGetBlockNumber, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and IndexTupleData::t_tid.

Referenced by _bt_endpoint(), _bt_insert_parent(), and _bt_pagedel().

{
    /*
     * Descend from the root to the leftmost (or, if "rightmost", the
     * rightmost) page at the requested tree level, returning it read-locked.
     * Returns InvalidBuffer if the index has no root page at all.
     */
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber offnum;
    BlockNumber blkno;
    IndexTuple  itup;

    /*
     * If we are looking for a leaf page, okay to descend from fast root;
     * otherwise better descend from true root.  (There is no point in being
     * smarter about intermediate levels.)
     */
    if (level == 0)
        buf = _bt_getroot(rel, BT_READ);
    else
        buf = _bt_gettrueroot(rel);

    if (!BufferIsValid(buf))
        return InvalidBuffer;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /* One loop iteration per level descended (plus lateral steps). */
    for (;;)
    {
        /*
         * If we landed on a deleted page, step right to find a live page
         * (there must be one).  Also, if we want the rightmost page, step
         * right if needed to get to it (this could happen if the page split
         * since we obtained a pointer to it).
         */
        while (P_IGNORE(opaque) ||
               (rightmost && !P_RIGHTMOST(opaque)))
        {
            blkno = opaque->btpo_next;
            if (blkno == P_NONE)
                elog(ERROR, "fell off the end of index \"%s\"",
                     RelationGetRelationName(rel));
            buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
            page = BufferGetPage(buf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }

        /* Done? */
        if (opaque->btpo.level == level)
            break;
        if (opaque->btpo.level < level)
            elog(ERROR, "btree level %u not found in index \"%s\"",
                 level, RelationGetRelationName(rel));

        /* Descend to leftmost or rightmost child page */
        if (rightmost)
            offnum = PageGetMaxOffsetNumber(page);
        else
            offnum = P_FIRSTDATAKEY(opaque);

        /* Follow the chosen downlink; _bt_relandgetbuf swaps lock+pin. */
        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
        blkno = ItemPointerGetBlockNumber(&(itup->t_tid));

        buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }

    return buf;
}

Buffer _bt_getbuf ( Relation  rel,
BlockNumber  blkno,
int  access 
)

Definition at line 575 of file nbtpage.c.

References _bt_checkpage(), _bt_log_reuse_page(), _bt_page_recyclable(), _bt_pageinit(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, buf, BufferGetPage, BufferGetPageSize, ConditionalLockBuffer(), DEBUG2, elog, ExclusiveLock, GetFreeIndexPage(), InvalidBlockNumber, LockBuffer(), LockRelationForExtension(), P_NEW, PageGetSpecialPointer, PageIsNew, ReadBuffer(), RELATION_IS_LOCAL, ReleaseBuffer(), UnlockRelationForExtension(), BTPageOpaqueData::xact, and XLogStandbyInfoActive.

Referenced by _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_newroot(), _bt_pagedel(), _bt_split(), _bt_steppage(), and _bt_walk_left().

{
    /*
     * Return a buffer for block "blkno" locked in the requested mode.  If
     * blkno == P_NEW, allocate a new page instead: first try pages reported
     * free by the FSM, and failing that, physically extend the relation.
     * A new page is returned write-locked and already passed through
     * _bt_pageinit().
     */
    Buffer      buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        LockBuffer(buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool        needLock;
        Page        page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free.  (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page.  If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock.  (The
         * own-caller scenario is actually not improbable. Consider an index
         * on a serial or timestamp column.  Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM. A
         * VACUUM running concurrently with the previous split could well have
         * placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (ConditionalLockBuffer(buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby.
                     */
                    if (XLogStandbyInfoActive())
                    {
                        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                        _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
                    }

                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page.  We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        LockBuffer(buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}

Buffer _bt_getroot ( Relation  rel,
int  access 
)

Definition at line 94 of file nbtpage.c.

References _bt_getbuf(), _bt_getroot(), _bt_relandgetbuf(), _bt_relbuf(), Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_VERSION, XLogRecData::buffer, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, CacheInvalidateRelcache(), XLogRecData::data, elog, END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, XLogRecData::len, xl_btree_newroot::level, BTPageOpaqueData::level, LockBuffer(), MarkBufferDirty(), MemoryContextAlloc(), XLogRecData::next, xl_btree_newroot::node, NULL, P_IGNORE, P_LEFTMOST, P_NEW, P_NONE, P_RIGHTMOST, PageGetSpecialPointer, PageSetLSN, pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, RelationData::rd_node, RelationGetRelationName, RelationNeedsWAL, xl_btree_newroot::rootblk, START_CRIT_SECTION, XLOG_BTREE_NEWROOT, and XLogInsert().

Referenced by _bt_get_endpoint(), _bt_getroot(), and _bt_search().

{
    /*
     * Return a read-locked buffer for the index's (fast) root page.  If the
     * root hasn't been created yet: with access == BT_READ we return
     * InvalidBuffer; with access == BT_WRITE we create the root page here.
     * On success the metapage lock is always released before returning.
     */
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version == BTREE_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted. If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    /* Cache miss (or stale cache): consult the metapage itself. */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!(metaopaque->btpo_flags & BTP_META) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version != BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION)));

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);

        /*
         * Race condition:  if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee no
             * deadlocks, we have to release the metadata page and start all
             * over again.  (Is that really true? But it's hardly worth trying
             * to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page.  Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            XLogRecData rdata;

            xlrec.node = rel->rd_node;
            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            rdata.data = (char *) &xlrec;
            rdata.len = SizeOfBtreeNewroot;
            rdata.buffer = InvalidBuffer;
            rdata.next = NULL;

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * Send out relcache inval for metapage change (probably unnecessary
         * here, but let's be safe).
         */
        CacheInvalidateRelcache(rel);

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}

int _bt_getrootheight ( Relation  rel  ) 

Definition at line 424 of file nbtpage.c.

References _bt_getbuf(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, BTREE_METAPAGE, BTREE_VERSION, BufferGetPage, ereport, errcode(), errmsg(), ERROR, MemoryContextAlloc(), NULL, P_NONE, PageGetSpecialPointer, RelationData::rd_amcache, RelationData::rd_indexcxt, and RelationGetRelationName.

Referenced by get_relation_info().

{
    /*
     * Report the height of the btree, i.e. the cached btm_fastlevel of the
     * metapage.  Returns 0 for an index whose root hasn't been created yet.
     * Populates rel->rd_amcache as a side effect when it wasn't cached.
     */
    BTMetaPageData *metad;

    /*
     * We can get what we need from the cached metapage data.  If it's not
     * cached yet, load it.  Sanity checks here must match _bt_getroot().
     */
    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;
        Page        metapg;
        BTPageOpaque metaopaque;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metapg = BufferGetPage(metabuf);
        metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
        metad = BTPageGetMeta(metapg);

        /* sanity-check the metapage */
        if (!(metaopaque->btpo_flags & BTP_META) ||
            metad->btm_magic != BTREE_MAGIC)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("index \"%s\" is not a btree",
                            RelationGetRelationName(rel))));

        if (metad->btm_version != BTREE_VERSION)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                            RelationGetRelationName(rel),
                            metad->btm_version, BTREE_VERSION)));

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        _bt_relbuf(rel, metabuf);
    }

    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version == BTREE_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}

Buffer _bt_getstackbuf ( Relation  rel,
BTStack  stack,
int  access 
)

Definition at line 1724 of file nbtinsert.c.

References _bt_getbuf(), _bt_relbuf(), BTEntrySame, BTPageOpaqueData::btpo_next, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, buf, BufferGetPage, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_insert_parent(), _bt_pagedel(), and _bt_parent_deletion_safe().

{
    /*
     * Re-locate the parent-page item matching stack->bts_btentry, starting
     * from the remembered position and moving right across any intervening
     * page splits.  On success, updates the stack's blkno/offset to the
     * item's current location and returns the page's buffer locked per
     * "access"; returns InvalidBuffer if the entry cannot be found.
     */
    BlockNumber blkno;
    OffsetNumber start;

    blkno = stack->bts_blkno;
    start = stack->bts_offset;

    for (;;)
    {
        Buffer      buf;
        Page        page;
        BTPageOpaque opaque;

        buf = _bt_getbuf(rel, blkno, access);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        if (!P_IGNORE(opaque))
        {
            OffsetNumber offnum,
                        minoff,
                        maxoff;
            ItemId      itemid;
            IndexTuple  item;

            minoff = P_FIRSTDATAKEY(opaque);
            maxoff = PageGetMaxOffsetNumber(page);

            /*
             * start = InvalidOffsetNumber means "search the whole page". We
             * need this test anyway due to possibility that page has a high
             * key now when it didn't before.
             */
            if (start < minoff)
                start = minoff;

            /*
             * Need this check too, to guard against possibility that page
             * split since we visited it originally.
             */
            if (start > maxoff)
                start = OffsetNumberNext(maxoff);

            /*
             * These loops will check every item on the page --- but in an
             * order that's attuned to the probability of where it actually
             * is.  Scan to the right first, then to the left.
             */
            for (offnum = start;
                 offnum <= maxoff;
                 offnum = OffsetNumberNext(offnum))
            {
                itemid = PageGetItemId(page, offnum);
                item = (IndexTuple) PageGetItem(page, itemid);
                if (BTEntrySame(item, &stack->bts_btentry))
                {
                    /* Return accurate pointer to where link is now */
                    stack->bts_blkno = blkno;
                    stack->bts_offset = offnum;
                    return buf;
                }
            }

            for (offnum = OffsetNumberPrev(start);
                 offnum >= minoff;
                 offnum = OffsetNumberPrev(offnum))
            {
                itemid = PageGetItemId(page, offnum);
                item = (IndexTuple) PageGetItem(page, itemid);
                if (BTEntrySame(item, &stack->bts_btentry))
                {
                    /* Return accurate pointer to where link is now */
                    stack->bts_blkno = blkno;
                    stack->bts_offset = offnum;
                    return buf;
                }
            }
        }

        /*
         * The item we're looking for moved right at least one page.
         */
        if (P_RIGHTMOST(opaque))
        {
            _bt_relbuf(rel, buf);
            return InvalidBuffer;
        }
        blkno = opaque->btpo_next;
        start = InvalidOffsetNumber;
        _bt_relbuf(rel, buf);
    }
}

Buffer _bt_gettrueroot ( Relation  rel  ) 

Definition at line 330 of file nbtpage.c.

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_VERSION, BufferGetPage, elog, ereport, errcode(), errmsg(), ERROR, BTPageOpaqueData::level, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetSpecialPointer, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

{
    /*
     * Return a read-locked buffer for the *true* root page (btm_root), as
     * opposed to the fast root used by _bt_getroot().  Returns InvalidBuffer
     * if no root page has been created yet.  Also discards any cached
     * metapage data as a side effect.
     */
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!(metaopaque->btpo_flags & BTP_META) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version != BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

void _bt_initmetapage ( Page  page,
BlockNumber  rootbknum,
uint32  level 
)

Definition at line 39 of file nbtpage.c.

References _bt_pageinit(), BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageOpaqueData::btpo_flags, and PageGetSpecialPointer.

Referenced by _bt_uppershutdown(), and btbuildempty().

{
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    /* Zero the page and set up the btree special space. */
    _bt_pageinit(page, BLCKSZ);

    /* Mark the page as a metapage in its opaque area. */
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /* Fill in the metadata proper. */
    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_fastroot = rootbknum;
    metad->btm_level = level;
    metad->btm_fastlevel = level;

    /*
     * Set pd_lower just past the end of the metadata.  This is not essential
     * but it makes the page look compressible to xlog.c.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

void _bt_insert_parent ( Relation  rel,
Buffer  buf,
Buffer  rbuf,
BTStack  stack,
bool  is_root,
bool  is_only 
)

Definition at line 1610 of file nbtinsert.c.

References _bt_get_endpoint(), _bt_getstackbuf(), _bt_insertonpg(), _bt_newroot(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), DEBUG2, elog, ERROR, InRecovery, InvalidBuffer, ItemPointerSet, BTPageOpaqueData::level, NULL, P_HIKEY, PageGetItem, PageGetItemId, PageGetSpecialPointer, pfree(), RelationGetRelationName, and IndexTupleData::t_tid.

Referenced by _bt_insertonpg(), and btree_xlog_cleanup().

{
    /*
     * After splitting buf/rbuf, insert the downlink for the new right page
     * into the parent level (recursively splitting upward as needed via
     * _bt_insertonpg), or build a new root if we just split the root.
     * Releases both child buffers before returning.
     */

    /*
     * Here we have to do something Lehman and Yao don't talk about: deal with
     * a root split and construction of a new root.  If our stack is empty
     * then we have just split a node on what had been the root level when we
     * descended the tree.  If it was still the root then we perform a
     * new-root construction.  If it *wasn't* the root anymore, search to find
     * the next higher level that someone constructed meanwhile, and find the
     * right place to insert as for the normal case.
     *
     * If we have to search for the parent level, we do so by re-descending
     * from the root.  This is not super-efficient, but it's rare enough not
     * to matter.  (This path is also taken when called from WAL recovery ---
     * we have no stack in that case.)
     */
    if (is_root)
    {
        Buffer      rootbuf;

        Assert(stack == NULL);
        Assert(is_only);
        /* create a new root node and update the metapage */
        rootbuf = _bt_newroot(rel, buf, rbuf);
        /* release the split buffers */
        _bt_relbuf(rel, rootbuf);
        _bt_relbuf(rel, rbuf);
        _bt_relbuf(rel, buf);
    }
    else
    {
        BlockNumber bknum = BufferGetBlockNumber(buf);
        BlockNumber rbknum = BufferGetBlockNumber(rbuf);
        Page        page = BufferGetPage(buf);
        IndexTuple  new_item;
        BTStackData fakestack;
        IndexTuple  ritem;
        Buffer      pbuf;

        if (stack == NULL)
        {
            BTPageOpaque lpageop;

            if (!InRecovery)
                elog(DEBUG2, "concurrent ROOT page split");
            lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
            /* Find the leftmost page at the next level up */
            pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false);
            /* Set up a phony stack entry pointing there */
            stack = &fakestack;
            stack->bts_blkno = BufferGetBlockNumber(pbuf);
            stack->bts_offset = InvalidOffsetNumber;
            /* bts_btentry will be initialized below */
            stack->bts_parent = NULL;
            _bt_relbuf(rel, pbuf);
        }

        /* get high key from left page == lowest key on new right page */
        ritem = (IndexTuple) PageGetItem(page,
                                         PageGetItemId(page, P_HIKEY));

        /* form an index tuple that points at the new right page */
        new_item = CopyIndexTuple(ritem);
        ItemPointerSet(&(new_item->t_tid), rbknum, P_HIKEY);

        /*
         * Find the parent buffer and get the parent page.
         *
         * Oops - if we were moved right then we need to change stack item! We
         * want to find parent pointing to where we are, right ?    - vadim
         * 05/27/97
         */
        ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY);

        pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);

        /* Now we can unlock the children */
        _bt_relbuf(rel, rbuf);
        _bt_relbuf(rel, buf);

        /* Check for error only after writing children */
        if (pbuf == InvalidBuffer)
            elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
                 RelationGetRelationName(rel), bknum, rbknum);

        /* Recursively update the parent */
        _bt_insertonpg(rel, pbuf, stack->bts_parent,
                       new_item, stack->bts_offset + 1,
                       is_only);

        /* be tidy */
        pfree(new_item);
    }
}

void _bt_killitems ( IndexScanDesc  scan,
bool  haveLock 
)

Definition at line 1737 of file nbtutils.c.

References Assert, BT_READ, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetPage, BufferIsValid, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosItem::heapTid, i, BTScanPosItem::indexOffset, ItemIdMarkDead, ItemPointerEquals(), BTScanPosData::items, BTScanOpaqueData::killedItems, LockBuffer(), MarkBufferDirtyHint(), BTScanOpaqueData::numKilled, OffsetNumberNext, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, and IndexTupleData::t_tid.

Referenced by _bt_steppage(), btendscan(), btrescan(), and btrestrpos().

{
    /*
     * Mark as LP_DEAD any index entries the scan reported as pointing to
     * dead heap tuples (so->killedItems), so they can be reclaimed later.
     * If haveLock is false, a read lock on the scan's current buffer is
     * taken (and released) here; otherwise the caller already holds one.
     * Both the LP_DEAD bits and BTP_HAS_GARBAGE are only hints, so the
     * buffer is marked dirty with MarkBufferDirtyHint, not MarkBufferDirty.
     */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber minoff;
    OffsetNumber maxoff;
    int         i;
    bool        killedsomething = false;

    Assert(BufferIsValid(so->currPos.buf));

    if (!haveLock)
        LockBuffer(so->currPos.buf, BT_READ);

    page = BufferGetPage(so->currPos.buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);

    for (i = 0; i < so->numKilled; i++)
    {
        int         itemIndex = so->killedItems[i];
        BTScanPosItem *kitem = &so->currPos.items[itemIndex];
        OffsetNumber offnum = kitem->indexOffset;

        Assert(itemIndex >= so->currPos.firstItem &&
               itemIndex <= so->currPos.lastItem);
        if (offnum < minoff)
            continue;           /* pure paranoia */
        /*
         * The item may have moved right of its remembered offset (e.g. if
         * the page was compacted), so search forward by heap TID from there.
         */
        while (offnum <= maxoff)
        {
            ItemId      iid = PageGetItemId(page, offnum);
            IndexTuple  ituple = (IndexTuple) PageGetItem(page, iid);

            if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
            {
                /* found the item */
                ItemIdMarkDead(iid);
                killedsomething = true;
                break;          /* out of inner search loop */
            }
            offnum = OffsetNumberNext(offnum);
        }
    }

    /*
     * Since this can be redone later if needed, mark as dirty hint.
     *
     * Whenever we mark anything LP_DEAD, we also set the page's
     * BTP_HAS_GARBAGE flag, which is likewise just a hint.
     */
    if (killedsomething)
    {
        opaque->btpo_flags |= BTP_HAS_GARBAGE;
        MarkBufferDirtyHint(so->currPos.buf);
    }

    if (!haveLock)
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /*
     * Always reset the scan state, so we don't look for same items on other
     * pages.
     */
    so->numKilled = 0;
}

void _bt_leafbuild ( BTSpool *  btspool,
BTSpool *  btspool2 
)

Definition at line 198 of file nbtsort.c.

References _bt_load(), BTREE_METAPAGE, BTWriteState::btws_pages_alloced, BTWriteState::btws_pages_written, BTWriteState::btws_use_wal, BTWriteState::btws_zeropage, BTSpool::heap, BTWriteState::heap, BTSpool::index, BTWriteState::index, log_btree_build_stats, RelationNeedsWAL, ResetUsage(), ShowUsage(), BTSpool::sortstate, tuplesort_performsort(), and XLogIsNeeded.

Referenced by btbuild().

{
    /*
     * Finish sorting the spooled index tuples and stream them into the new
     * index via _bt_load.  btspool2, if given, holds a second spool (used
     * for dead tuples during CREATE INDEX on a table with HOT chains).
     */
    BTWriteState wstate;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Spool) STATISTICS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    /* Complete the sort(s) before any tuples are fetched */
    tuplesort_performsort(btspool->sortstate);
    if (btspool2)
        tuplesort_performsort(btspool2->sortstate);

    /* Set up the write state for _bt_load */
    wstate.heap = btspool->heap;
    wstate.index = btspool->index;

    /*
     * WAL-log index creation iff WAL archiving/streaming is enabled, unless
     * the index isn't WAL-logged anyway.
     */
    wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);

    wstate.btws_pages_alloced = BTREE_METAPAGE + 1; /* reserve the metapage */
    wstate.btws_pages_written = 0;
    wstate.btws_zeropage = NULL;    /* allocated only when needed */

    _bt_load(&wstate, btspool, btspool2);
}

void _bt_mark_array_keys ( IndexScanDesc  scan  ) 

Definition at line 604 of file nbtutils.c.

References BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, i, BTArrayKeyInfo::mark_elem, BTScanOpaqueData::numArrayKeys, and IndexScanDescData::opaque.

Referenced by btmarkpos().

{
    /*
     * Remember the current element of each array scan key, so a later
     * restore (btrestrpos) can return the arrays to this position.
     */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         keyno;

    for (keyno = 0; keyno < so->numArrayKeys; keyno++)
        so->arrayKeys[keyno].mark_elem = so->arrayKeys[keyno].cur_elem;
}

ScanKey _bt_mkscankey ( Relation  rel,
IndexTuple  itup 
)

Definition at line 62 of file nbtutils.c.

References arg, BTORDER_PROC, i, index_getattr, index_getprocinfo(), InvalidOid, InvalidStrategy, palloc(), RelationData::rd_indcollation, RelationData::rd_indoption, RelationGetDescr, RelationGetNumberOfAttributes, ScanKeyEntryInitializeWithInfo(), SK_BT_INDOPTION_SHIFT, and SK_ISNULL.

Referenced by _bt_doinsert(), and _bt_pagedel().

{
    /*
     * Build an insertion scan key from the given index tuple: one scan key
     * entry per index attribute, carrying the attribute's value (or NULL),
     * its collation, and the btree comparison support function.  The result
     * is palloc'd; caller is responsible for freeing it.
     */
    ScanKey     skey;
    TupleDesc   itupdesc;
    int         natts;
    int16      *indoption;
    int         attoff;

    itupdesc = RelationGetDescr(rel);
    natts = RelationGetNumberOfAttributes(rel);
    indoption = rel->rd_indoption;

    skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

    for (attoff = 0; attoff < natts; attoff++)
    {
        FmgrInfo   *cmpproc;
        Datum       attval;
        bool        isnull;
        int         keyflags;

        /*
         * The cached (default) support proc suffices, since no cross-type
         * comparison can be needed here.
         */
        cmpproc = index_getprocinfo(rel, attoff + 1, BTORDER_PROC);
        attval = index_getattr(itup, attoff + 1, itupdesc, &isnull);
        keyflags = (isnull ? SK_ISNULL : 0) |
            (indoption[attoff] << SK_BT_INDOPTION_SHIFT);
        ScanKeyEntryInitializeWithInfo(&skey[attoff],
                                       keyflags,
                                       (AttrNumber) (attoff + 1),
                                       InvalidStrategy,
                                       InvalidOid,
                                       rel->rd_indcollation[attoff],
                                       cmpproc,
                                       attval);
    }

    return skey;
}

ScanKey _bt_mkscankey_nodata ( Relation  rel  ) 

Definition at line 115 of file nbtutils.c.

References BTORDER_PROC, i, index_getprocinfo(), InvalidOid, InvalidStrategy, palloc(), RelationData::rd_indcollation, RelationData::rd_indoption, RelationGetNumberOfAttributes, ScanKeyEntryInitializeWithInfo(), and SK_ISNULL.

Referenced by _bt_load(), tuplesort_begin_cluster(), and tuplesort_begin_index_btree().

{
    /*
     * Like _bt_mkscankey, but build a "no data" insertion scan key: every
     * entry is marked SK_ISNULL with a zero Datum.  Useful when only the
     * comparison machinery (support procs, collations, options) is needed,
     * e.g. for tuplesort comparisons.  Result is palloc'd.
     */
    ScanKey     skey;
    int         natts;
    int16      *indoption;
    int         attoff;

    natts = RelationGetNumberOfAttributes(rel);
    indoption = rel->rd_indoption;

    skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

    for (attoff = 0; attoff < natts; attoff++)
    {
        FmgrInfo   *cmpproc;
        int         keyflags;

        /*
         * The cached (default) support proc suffices, since no cross-type
         * comparison can be needed here.
         */
        cmpproc = index_getprocinfo(rel, attoff + 1, BTORDER_PROC);
        keyflags = SK_ISNULL | (indoption[attoff] << SK_BT_INDOPTION_SHIFT);
        ScanKeyEntryInitializeWithInfo(&skey[attoff],
                                       keyflags,
                                       (AttrNumber) (attoff + 1),
                                       InvalidStrategy,
                                       InvalidOid,
                                       rel->rd_indcollation[attoff],
                                       cmpproc,
                                       (Datum) 0);
    }

    return skey;
}

Buffer _bt_moveright ( Relation  rel,
Buffer  buf,
int  keysz,
ScanKey  scankey,
bool  nextkey,
int  access 
)

Definition at line 156 of file nbtsearch.c.

References _bt_compare(), _bt_relandgetbuf(), BTPageOpaqueData::btpo_next, BufferGetPage, elog, ERROR, P_HIKEY, P_IGNORE, P_RIGHTMOST, PageGetSpecialPointer, and RelationGetRelationName.

Referenced by _bt_doinsert(), and _bt_search().

{
    /*
     * Step right from "buf" for as long as the scan key might belong on a
     * page further right (the page may have split, possibly repeatedly,
     * since we fetched its link) or the current page is dead.  Returns a
     * buffer locked in the requested access mode.
     */
    Page        curpage;
    BTPageOpaque pageop;
    int32       cmpbound;

    curpage = BufferGetPage(buf);
    pageop = (BTPageOpaque) PageGetSpecialPointer(curpage);

    /*
     * With nextkey = false we move right only when the scan key is strictly
     * greater than the page's high key (equal keys may still be found on
     * this page, which is scanned anyway).  With nextkey = true we also move
     * right on equality.  Encode that as the comparison bound: _bt_compare
     * result must be >= 1 (false) or >= 0 (true) to step right.
     */
    cmpbound = nextkey ? 0 : 1;

    while (!P_RIGHTMOST(pageop) &&
           (P_IGNORE(pageop) ||
            _bt_compare(rel, keysz, scankey, curpage, P_HIKEY) >= cmpbound))
    {
        /* step right one page */
        BlockNumber nextblk = pageop->btpo_next;

        buf = _bt_relandgetbuf(rel, buf, nextblk, access);
        curpage = BufferGetPage(buf);
        pageop = (BTPageOpaque) PageGetSpecialPointer(curpage);
    }

    /* A rightmost page should never be ignorable */
    if (P_IGNORE(pageop))
        elog(ERROR, "fell off the end of index \"%s\"",
             RelationGetRelationName(rel));

    return buf;
}

bool _bt_next ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 995 of file nbtsearch.c.

References _bt_steppage(), Assert, BT_READ, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, LockBuffer(), IndexScanDescData::opaque, ScanDirectionIsForward, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

{
    /*
     * Advance the scan one tuple in direction "dir".  If the current page's
     * cached items are exhausted, step to the next page with data (taking a
     * read lock across the step).  Returns false when no more tuples exist.
     */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    BTScanPosItem *currItem;
    bool        pageExhausted;

    /* Bump the item cursor in the requested direction */
    if (ScanDirectionIsForward(dir))
        pageExhausted = (++so->currPos.itemIndex > so->currPos.lastItem);
    else
        pageExhausted = (--so->currPos.itemIndex < so->currPos.firstItem);

    if (pageExhausted)
    {
        /* We must acquire the lock before applying _bt_steppage */
        Assert(BufferIsValid(so->currPos.buf));
        LockBuffer(so->currPos.buf, BT_READ);
        if (!_bt_steppage(scan, dir))
            return false;
        /* Drop the lock, but not pin, on the new page */
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
    }

    /* itemIndex now says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_ctup.t_self = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}

bool _bt_page_recyclable ( Page  page  ) 

Definition at line 749 of file nbtpage.c.

References BTPageOpaqueData::btpo, P_ISDELETED, PageGetSpecialPointer, PageIsNew, RecentGlobalXmin, TransactionIdPrecedes(), and BTPageOpaqueData::xact.

Referenced by _bt_getbuf(), and btvacuumpage().

{
    /*
     * Report whether a btree page can be reused for new data.
     */
    BTPageOpaque opaque;

    /*
     * An all-zeroes page can occur in an index --- e.g. a backend extended
     * the relation by one page and crashed before WAL-logging the addition.
     * Such pages are always reclaimable.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recyclable only if the page is deleted and old enough that
     * no running process can still be interested in it.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    return P_ISDELETED(opaque) &&
        TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin);
}

int _bt_pagedel ( Relation  rel,
Buffer  buf,
BTStack  stack 
)

Definition at line 1067 of file nbtpage.c.

References _bt_get_endpoint(), _bt_getbuf(), _bt_getstackbuf(), _bt_mkscankey(), _bt_pagedel(), _bt_parent_deletion_safe(), _bt_relbuf(), _bt_search(), Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, xl_btree_delete_page::btpo_xact, BTREE_METAPAGE, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, XLogRecData::buffer, XLogRecData::buffer_std, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, CacheInvalidateRelcache(), CopyIndexTuple(), XLogRecData::data, xl_btree_delete_page::deadblk, elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InRecovery, InvalidBuffer, ItemPointerGetBlockNumber, ItemPointerSet, xl_btree_delete_page::leftblk, XLogRecData::len, xl_btree_metadata::level, BTPageOpaqueData::level, LOG, MarkBufferDirty(), XLogRecData::next, xl_btreetid::node, NULL, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISDELETED, P_ISHALFDEAD, P_ISROOT, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageIndexTupleDelete(), PageSetLSN, palloc(), PredicateLockPageCombine(), RelationData::rd_node, RelationData::rd_rel, ReadNewTransactionId(), RelationGetRelationName, RelationNeedsWAL, xl_btree_delete_page::rightblk, xl_btree_metadata::root, START_CRIT_SECTION, IndexTupleData::t_tid, xl_btree_delete_page::target, xl_btreetid::tid, BTPageOpaqueData::xact, and XLogInsert().

Referenced by _bt_pagedel(), btree_xlog_cleanup(), and btvacuumpage().

{
    /*
     * Delete the empty page "buf" from the tree: remove its downlink from
     * the parent, unlink it from its siblings, and mark it BTP_DELETED so it
     * can be recycled later.  May recurse to delete a parent that becomes
     * half-dead, or an empty right sibling that becomes the parent's only
     * child.  Returns the number of pages deleted (0 if deletion had to be
     * abandoned).  Lock ordering throughout is right-then-up to avoid
     * deadlocks against other writers.
     */
    int         result;
    BlockNumber target,
                leftsib,
                rightsib,
                parent;
    OffsetNumber poffset,
                maxoff;
    uint32      targetlevel,
                ilevel;
    ItemId      itemid;
    IndexTuple  targetkey,
                itup;
    ScanKey     itup_scankey;
    Buffer      lbuf,
                rbuf,
                pbuf;
    bool        parent_half_dead;
    bool        parent_one_child;
    bool        rightsib_empty;
    Buffer      metabuf = InvalidBuffer;
    Page        metapg = NULL;
    BTMetaPageData *metad = NULL;
    Page        page;
    BTPageOpaque opaque;

    /*
     * We can never delete rightmost pages nor root pages.  While at it, check
     * that page is not already deleted and is empty.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        /* Should never fail to delete a half-dead page */
        Assert(!P_ISHALFDEAD(opaque));

        _bt_relbuf(rel, buf);
        return 0;
    }

    /*
     * Save info about page, including a copy of its high key (it must have
     * one, being non-rightmost).
     */
    target = BufferGetBlockNumber(buf);
    targetlevel = opaque->btpo.level;
    leftsib = opaque->btpo_prev;
    itemid = PageGetItemId(page, P_HIKEY);
    targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

    /*
     * To avoid deadlocks, we'd better drop the target page lock before going
     * further.
     */
    _bt_relbuf(rel, buf);

    /*
     * We need an approximate pointer to the page's parent page.  We use the
     * standard search mechanism to search for the page's high key; this will
     * give us a link to either the current parent or someplace to its left
     * (if there are multiple equal high keys).  In recursion cases, the
     * caller already generated a search stack and we can just re-use that
     * work.
     */
    if (stack == NULL)
    {
        if (!InRecovery)
        {
            /* we need an insertion scan key to do our search, so build one */
            itup_scankey = _bt_mkscankey(rel, targetkey);
            /* find the leftmost leaf page containing this key */
            stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
                               &lbuf, BT_READ);
            /* don't need a pin on that either */
            _bt_relbuf(rel, lbuf);

            /*
             * If we are trying to delete an interior page, _bt_search did
             * more than we needed.  Locate the stack item pointing to our
             * parent level.
             */
            ilevel = 0;
            for (;;)
            {
                if (stack == NULL)
                    elog(ERROR, "not enough stack items");
                if (ilevel == targetlevel)
                    break;
                stack = stack->bts_parent;
                ilevel++;
            }
        }
        else
        {
            /*
             * During WAL recovery, we can't use _bt_search (for one reason,
             * it might invoke user-defined comparison functions that expect
             * facilities not available in recovery mode).  Instead, just set
             * up a dummy stack pointing to the left end of the parent tree
             * level, from which _bt_getstackbuf will walk right to the parent
             * page.  Painful, but we don't care too much about performance in
             * this scenario.
             */
            pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
            stack = (BTStack) palloc(sizeof(BTStackData));
            stack->bts_blkno = BufferGetBlockNumber(pbuf);
            stack->bts_offset = InvalidOffsetNumber;
            /* bts_btentry will be initialized below */
            stack->bts_parent = NULL;
            _bt_relbuf(rel, pbuf);
        }
    }

    /*
     * We cannot delete a page that is the rightmost child of its immediate
     * parent, unless it is the only child --- in which case the parent has to
     * be deleted too, and the same condition applies recursively to it. We
     * have to check this condition all the way up before trying to delete. We
     * don't need to re-test when deleting a non-leaf page, though.
     */
    if (targetlevel == 0 &&
        !_bt_parent_deletion_safe(rel, target, stack))
        return 0;

    /*
     * We have to lock the pages we need to modify in the standard order:
     * moving right, then up.  Else we will deadlock against other writers.
     *
     * So, we need to find and write-lock the current left sibling of the
     * target page.  The sibling that was current a moment ago could have
     * split, so we may have to move right.  This search could fail if either
     * the sibling or the target page was deleted by someone else meanwhile;
     * if so, give up.  (Right now, that should never happen, since page
     * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
     * concurrently on the same table.)
     */
    if (leftsib != P_NONE)
    {
        lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        while (P_ISDELETED(opaque) || opaque->btpo_next != target)
        {
            /* step right one page */
            leftsib = opaque->btpo_next;
            _bt_relbuf(rel, lbuf);
            if (leftsib == P_NONE)
            {
                elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
                     RelationGetRelationName(rel));
                return 0;
            }
            lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
            page = BufferGetPage(lbuf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }
    }
    else
        lbuf = InvalidBuffer;

    /*
     * Next write-lock the target page itself.  It should be okay to take just
     * a write lock not a superexclusive lock, since no scans would stop on an
     * empty page.
     */
    buf = _bt_getbuf(rel, target, BT_WRITE);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * Check page is still empty etc, else abandon deletion.  The empty check
     * is necessary since someone else might have inserted into it while we
     * didn't have it locked; the others are just for paranoia's sake.
     */
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        _bt_relbuf(rel, buf);
        if (BufferIsValid(lbuf))
            _bt_relbuf(rel, lbuf);
        return 0;
    }
    if (opaque->btpo_prev != leftsib)
        elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
             target, RelationGetRelationName(rel));

    /*
     * And next write-lock the (current) right sibling.
     */
    rightsib = opaque->btpo_next;
    rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    page = BufferGetPage(rbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (opaque->btpo_prev != target)
        elog(ERROR, "right sibling's left-link doesn't match: "
             "block %u links to %u instead of expected %u in index \"%s\"",
             rightsib, opaque->btpo_prev, target,
             RelationGetRelationName(rel));

    /*
     * Any insert which would have gone on the target block will now go to the
     * right sibling block.
     */
    PredicateLockPageCombine(rel, target, rightsib);

    /*
     * Next find and write-lock the current parent of the target page. This is
     * essentially the same as the corresponding step of splitting.
     */
    ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
    pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
    if (pbuf == InvalidBuffer)
        elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
             RelationGetRelationName(rel), target);
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child --- in which case the parent
     * changes to half-dead status.  The "can't delete" case should have been
     * detected by _bt_parent_deletion_safe, so complain if we see it now.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);
    parent_half_dead = false;
    parent_one_child = false;
    if (poffset >= maxoff)
    {
        if (poffset == P_FIRSTDATAKEY(opaque))
            parent_half_dead = true;
        else
            elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
                 target, parent, RelationGetRelationName(rel));
    }
    else
    {
        /* Will there be exactly one child left in this parent? */
        if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
            parent_one_child = true;
    }

    /*
     * If we are deleting the next-to-last page on the target's level, then
     * the rightsib is a candidate to become the new fast root. (In theory, it
     * might be possible to push the fast root even further down, but the odds
     * of doing so are slim, and the locking considerations daunting.)
     *
     * We don't support handling this in the case where the parent is becoming
     * half-dead, even though it theoretically could occur.
     *
     * We can safely acquire a lock on the metapage here --- see comments for
     * _bt_newroot().
     */
    if (leftsib == P_NONE && !parent_half_dead)
    {
        page = BufferGetPage(rbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo.level == targetlevel);
        if (P_RIGHTMOST(opaque))
        {
            /* rightsib will be the only one left on the level */
            metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
            metapg = BufferGetPage(metabuf);
            metad = BTPageGetMeta(metapg);

            /*
             * The expected case here is btm_fastlevel == targetlevel+1; if
             * the fastlevel is <= targetlevel, something is wrong, and we
             * choose to overwrite it to fix it.
             */
            if (metad->btm_fastlevel > targetlevel + 1)
            {
                /* no update wanted */
                _bt_relbuf(rel, metabuf);
                metabuf = InvalidBuffer;
            }
        }
    }

    /*
     * Check that the parent-page index items we're about to delete/overwrite
     * contain what we expect.  This can fail if the index has become corrupt
     * for some reason.  We want to throw any error before entering the
     * critical section --- otherwise it'd be a PANIC.
     *
     * The test on the target item is just an Assert because _bt_getstackbuf
     * should have guaranteed it has the expected contents.  The test on the
     * next-child downlink is known to sometimes fail in the field, though.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

#ifdef USE_ASSERT_CHECKING
    itemid = PageGetItemId(page, poffset);
    itup = (IndexTuple) PageGetItem(page, itemid);
    Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
#endif

    if (!parent_half_dead)
    {
        OffsetNumber nextoffset;

        nextoffset = OffsetNumberNext(poffset);
        itemid = PageGetItemId(page, nextoffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
            elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
                 rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
                 parent, RelationGetRelationName(rel));
    }

    /*
     * Here we begin doing the deletion.
     */

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Update parent.  The normal case is a tad tricky because we want to
     * delete the target's downlink and the *following* key.  Easiest way is
     * to copy the right sibling's downlink over the target downlink, and then
     * delete the following item.
     */
    if (parent_half_dead)
    {
        PageIndexTupleDelete(page, poffset);
        opaque->btpo_flags |= BTP_HALF_DEAD;
    }
    else
    {
        OffsetNumber nextoffset;

        itemid = PageGetItemId(page, poffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);

        nextoffset = OffsetNumberNext(poffset);
        PageIndexTupleDelete(page, nextoffset);
    }

    /*
     * Update siblings' side-links.  Note the target page's side-links will
     * continue to point to the siblings.  Asserts here are just rechecking
     * things we already verified above.
     */
    if (BufferIsValid(lbuf))
    {
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo_next == target);
        opaque->btpo_next = rightsib;
    }
    page = BufferGetPage(rbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    Assert(opaque->btpo_prev == target);
    opaque->btpo_prev = leftsib;
    rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

    /*
     * Mark the page itself deleted.  It can be recycled when all current
     * transactions are gone.  Storing GetTopTransactionId() would work, but
     * we're in VACUUM and would not otherwise have an XID.  Having already
     * updated links to the target, ReadNewTransactionId() suffices as an
     * upper bound.  Any scan having retained a now-stale link is advertising
     * in its PGXACT an xmin less than or equal to the value we read here.  It
     * will continue to do so, holding back RecentGlobalXmin, for the duration
     * of that scan.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HALF_DEAD;
    opaque->btpo_flags |= BTP_DELETED;
    opaque->btpo.xact = ReadNewTransactionId();

    /* And update the metapage, if needed */
    if (BufferIsValid(metabuf))
    {
        metad->btm_fastroot = rightsib;
        metad->btm_fastlevel = targetlevel;
        MarkBufferDirty(metabuf);
    }

    /* Must mark buffers dirty before XLogInsert */
    MarkBufferDirty(pbuf);
    MarkBufferDirty(rbuf);
    MarkBufferDirty(buf);
    if (BufferIsValid(lbuf))
        MarkBufferDirty(lbuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_btree_delete_page xlrec;
        xl_btree_metadata xlmeta;
        uint8       xlinfo;
        XLogRecPtr  recptr;
        XLogRecData rdata[5];
        XLogRecData *nextrdata;

        xlrec.target.node = rel->rd_node;
        ItemPointerSet(&(xlrec.target.tid), parent, poffset);
        xlrec.deadblk = target;
        xlrec.leftblk = leftsib;
        xlrec.rightblk = rightsib;
        xlrec.btpo_xact = opaque->btpo.xact;

        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfBtreeDeletePage;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = nextrdata = &(rdata[1]);

        /* metapage image, only when the fast root moved */
        if (BufferIsValid(metabuf))
        {
            xlmeta.root = metad->btm_root;
            xlmeta.level = metad->btm_level;
            xlmeta.fastroot = metad->btm_fastroot;
            xlmeta.fastlevel = metad->btm_fastlevel;

            nextrdata->data = (char *) &xlmeta;
            nextrdata->len = sizeof(xl_btree_metadata);
            nextrdata->buffer = InvalidBuffer;
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            xlinfo = XLOG_BTREE_DELETE_PAGE_META;
        }
        else if (parent_half_dead)
            xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
        else
            xlinfo = XLOG_BTREE_DELETE_PAGE;

        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->next = nextrdata + 1;
        nextrdata->buffer = pbuf;
        nextrdata->buffer_std = true;
        nextrdata++;

        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->buffer = rbuf;
        nextrdata->buffer_std = true;
        nextrdata->next = NULL;

        /* left sibling is registered only when it exists */
        if (BufferIsValid(lbuf))
        {
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            nextrdata->data = NULL;
            nextrdata->len = 0;
            nextrdata->buffer = lbuf;
            nextrdata->buffer_std = true;
            nextrdata->next = NULL;
        }

        recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

        if (BufferIsValid(metabuf))
        {
            PageSetLSN(metapg, recptr);
        }
        page = BufferGetPage(pbuf);
        PageSetLSN(page, recptr);
        page = BufferGetPage(rbuf);
        PageSetLSN(page, recptr);
        page = BufferGetPage(buf);
        PageSetLSN(page, recptr);
        if (BufferIsValid(lbuf))
        {
            page = BufferGetPage(lbuf);
            PageSetLSN(page, recptr);
        }
    }

    END_CRIT_SECTION();

    /* release metapage; send out relcache inval if metapage changed */
    if (BufferIsValid(metabuf))
    {
        CacheInvalidateRelcache(rel);
        _bt_relbuf(rel, metabuf);
    }
    /* can always release leftsib immediately */
    if (BufferIsValid(lbuf))
        _bt_relbuf(rel, lbuf);

    /*
     * If parent became half dead, recurse to delete it. Otherwise, if right
     * sibling is empty and is now the last child of the parent, recurse to
     * try to delete it.  (These cases cannot apply at the same time, though
     * the second case might itself recurse to the first.)
     *
     * When recursing to parent, we hold the lock on the target page until
     * done.  This delays any insertions into the keyspace that was just
     * effectively reassigned to the parent's right sibling.  If we allowed
     * that, and there were enough such insertions before we finish deleting
     * the parent, page splits within that keyspace could lead to inserting
     * out-of-order keys into the grandparent level.  It is thought that that
     * wouldn't have any serious consequences, but it still seems like a
     * pretty bad idea.
     */
    if (parent_half_dead)
    {
        /* recursive call will release pbuf */
        _bt_relbuf(rel, rbuf);
        result = _bt_pagedel(rel, pbuf, stack->bts_parent) + 1;
        _bt_relbuf(rel, buf);
    }
    else if (parent_one_child && rightsib_empty)
    {
        _bt_relbuf(rel, pbuf);
        _bt_relbuf(rel, buf);
        /* recursive call will release rbuf */
        result = _bt_pagedel(rel, rbuf, stack) + 1;
    }
    else
    {
        _bt_relbuf(rel, pbuf);
        _bt_relbuf(rel, buf);
        _bt_relbuf(rel, rbuf);
        result = 1;
    }

    return result;
}

void _bt_pageinit ( Page  page,
Size  size 
)
void _bt_preprocess_array_keys ( IndexScanDesc  scan  ) 

Definition at line 192 of file nbtutils.c.

References _bt_find_extreme_element(), _bt_sort_array_elements(), ALLOCSET_SMALL_INITSIZE, ALLOCSET_SMALL_MAXSIZE, ALLOCSET_SMALL_MINSIZE, AllocSetContextCreate(), ARR_ELEMTYPE, BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, cur, CurrentMemoryContext, DatumGetArrayTypeP, deconstruct_array(), BTArrayKeyInfo::elem_values, elog, ERROR, get_typlenbyvalalign(), i, IndexScanDescData::indexRelation, INDOPTION_DESC, IndexScanDescData::keyData, MemoryContextReset(), MemoryContextSwitchTo(), NULL, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::numberOfKeys, IndexScanDescData::opaque, palloc(), palloc0(), RelationData::rd_indoption, BTArrayKeyInfo::scan_key, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHARRAY, SK_SEARCHNOTNULL, and SK_SEARCHNULL.

Referenced by btrescan().

{
    /*
     * Preprocess SK_SEARCHARRAY scan keys for a btree scan: copy
     * scan->keyData into so->arrayKeyData (in a scan-lifespan memory
     * context), deconstruct each array argument, drop null elements,
     * reduce non-equality array quals to a single extreme element, and
     * sort/dedup equality arrays into so->arrayKeys.  Sets
     * so->numArrayKeys to -1 if the qual is provably unsatisfiable.
     */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         numberOfKeys = scan->numberOfKeys;
    int16      *indoption = scan->indexRelation->rd_indoption;
    int         numArrayKeys;
    ScanKey     cur;
    int         i;
    MemoryContext oldContext;

    /* Quick check to see if there are any array keys */
    numArrayKeys = 0;
    for (i = 0; i < numberOfKeys; i++)
    {
        cur = &scan->keyData[i];
        if (cur->sk_flags & SK_SEARCHARRAY)
        {
            numArrayKeys++;
            Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
            /* If any arrays are null as a whole, we can quit right now. */
            if (cur->sk_flags & SK_ISNULL)
            {
                /* numArrayKeys = -1 signals "qual is unsatisfiable" */
                so->numArrayKeys = -1;
                so->arrayKeyData = NULL;
                return;
            }
        }
    }

    /* Quit if nothing to do. */
    if (numArrayKeys == 0)
    {
        so->numArrayKeys = 0;
        so->arrayKeyData = NULL;
        return;
    }

    /*
     * Make a scan-lifespan context to hold array-associated data, or reset it
     * if we already have one from a previous rescan cycle.
     */
    if (so->arrayContext == NULL)
        so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
                                                 "BTree Array Context",
                                                 ALLOCSET_SMALL_MINSIZE,
                                                 ALLOCSET_SMALL_INITSIZE,
                                                 ALLOCSET_SMALL_MAXSIZE);
    else
        MemoryContextReset(so->arrayContext);

    oldContext = MemoryContextSwitchTo(so->arrayContext);

    /* Create modifiable copy of scan->keyData in the workspace context */
    so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
    memcpy(so->arrayKeyData,
           scan->keyData,
           scan->numberOfKeys * sizeof(ScanKeyData));

    /* Allocate space for per-array data in the workspace context */
    so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));

    /* Now process each array key; recount them since some may degenerate */
    numArrayKeys = 0;
    for (i = 0; i < numberOfKeys; i++)
    {
        ArrayType  *arrayval;
        int16       elmlen;
        bool        elmbyval;
        char        elmalign;
        int         num_elems;
        Datum      *elem_values;
        bool       *elem_nulls;
        int         num_nonnulls;
        int         j;

        cur = &so->arrayKeyData[i];
        if (!(cur->sk_flags & SK_SEARCHARRAY))
            continue;

        /*
         * First, deconstruct the array into elements.  Anything allocated
         * here (including a possibly detoasted array value) is in the
         * workspace context.
         */
        arrayval = DatumGetArrayTypeP(cur->sk_argument);
        /* We could cache this data, but not clear it's worth it */
        get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
                             &elmlen, &elmbyval, &elmalign);
        deconstruct_array(arrayval,
                          ARR_ELEMTYPE(arrayval),
                          elmlen, elmbyval, elmalign,
                          &elem_values, &elem_nulls, &num_elems);

        /*
         * Compress out any null elements.  We can ignore them since we assume
         * all btree operators are strict.
         */
        num_nonnulls = 0;
        for (j = 0; j < num_elems; j++)
        {
            if (!elem_nulls[j])
                elem_values[num_nonnulls++] = elem_values[j];
        }

        /* We could pfree(elem_nulls) now, but not worth the cycles */

        /* If there are no non-null elements, the scan qual is unsatisfiable */
        if (num_nonnulls == 0)
        {
            numArrayKeys = -1;
            break;
        }

        /*
         * If the comparison operator is not equality, then the array qual
         * degenerates to a simple comparison against the smallest or largest
         * non-null array element, as appropriate.
         */
        switch (cur->sk_strategy)
        {
            case BTLessStrategyNumber:
            case BTLessEqualStrategyNumber:
                cur->sk_argument =
                    _bt_find_extreme_element(scan, cur,
                                             BTGreaterStrategyNumber,
                                             elem_values, num_nonnulls);
                continue;
            case BTEqualStrategyNumber:
                /* proceed with rest of loop */
                break;
            case BTGreaterEqualStrategyNumber:
            case BTGreaterStrategyNumber:
                cur->sk_argument =
                    _bt_find_extreme_element(scan, cur,
                                             BTLessStrategyNumber,
                                             elem_values, num_nonnulls);
                continue;
            default:
                elog(ERROR, "unrecognized StrategyNumber: %d",
                     (int) cur->sk_strategy);
                break;
        }

        /*
         * Sort the non-null elements and eliminate any duplicates.  We must
         * sort in the same ordering used by the index column, so that the
         * successive primitive indexscans produce data in index order.
         */
        num_elems = _bt_sort_array_elements(scan, cur,
                        (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
                                            elem_values, num_nonnulls);

        /*
         * And set up the BTArrayKeyInfo data.
         */
        so->arrayKeys[numArrayKeys].scan_key = i;
        so->arrayKeys[numArrayKeys].num_elems = num_elems;
        so->arrayKeys[numArrayKeys].elem_values = elem_values;
        numArrayKeys++;
    }

    so->numArrayKeys = numArrayKeys;

    MemoryContextSwitchTo(oldContext);
}

void _bt_preprocess_keys ( IndexScanDesc  scan  ) 

Definition at line 743 of file nbtutils.c.

References _bt_compare_scankey_args(), _bt_fix_scankey_strategy(), _bt_mark_scankey_required(), BTScanOpaqueData::arrayKeyData, Assert, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, cur, elog, ERROR, i, IndexScanDescData::indexRelation, BTScanOpaqueData::keyData, IndexScanDescData::keyData, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, IndexScanDescData::opaque, BTScanOpaqueData::qual_ok, RelationData::rd_indoption, ScanKeyData::sk_flags, SK_ROW_HEADER, and SK_SEARCHNULL.

Referenced by _bt_first(), and _bt_restore_array_keys().

{
    /*
     * Transform the input scan keys (so->arrayKeyData if present, else
     * scan->keyData) into so->keyData: apply per-column index options via
     * _bt_fix_scankey_strategy, eliminate keys proven redundant per
     * attribute, detect contradictory quals (so->qual_ok = false), and mark
     * keys "required" when all earlier index columns are constrained by
     * equality.  Input keys must arrive ordered by attribute number.
     */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         numberOfKeys = scan->numberOfKeys;
    int16      *indoption = scan->indexRelation->rd_indoption;
    int         new_numberOfKeys;
    int         numberOfEqualCols;
    ScanKey     inkeys;
    ScanKey     outkeys;
    ScanKey     cur;
    ScanKey     xform[BTMaxStrategyNumber];
    bool        test_result;
    int         i,
                j;
    AttrNumber  attno;

    /* initialize result variables */
    so->qual_ok = true;
    so->numberOfKeys = 0;

    if (numberOfKeys < 1)
        return;                 /* done if qual-less scan */

    /*
     * Read so->arrayKeyData if array keys are present, else scan->keyData
     */
    if (so->arrayKeyData != NULL)
        inkeys = so->arrayKeyData;
    else
        inkeys = scan->keyData;

    outkeys = so->keyData;
    cur = &inkeys[0];
    /* we check that input keys are correctly ordered */
    if (cur->sk_attno < 1)
        elog(ERROR, "btree index keys must be ordered by attribute");

    /* We can short-circuit most of the work if there's just one key */
    if (numberOfKeys == 1)
    {
        /* Apply indoption to scankey (might change sk_strategy!) */
        if (!_bt_fix_scankey_strategy(cur, indoption))
            so->qual_ok = false;
        memcpy(outkeys, cur, sizeof(ScanKeyData));
        so->numberOfKeys = 1;
        /* We can mark the qual as required if it's for first index col */
        if (cur->sk_attno == 1)
            _bt_mark_scankey_required(outkeys);
        return;
    }

    /*
     * Otherwise, do the full set of pushups.
     */
    new_numberOfKeys = 0;
    numberOfEqualCols = 0;

    /*
     * Initialize for processing of keys for attr 1.
     *
     * xform[i] points to the currently best scan key of strategy type i+1; it
     * is NULL if we haven't yet found such a key for this attr.
     */
    attno = 1;
    memset(xform, 0, sizeof(xform));

    /*
     * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
     * handle after-last-key processing.  Actual exit from the loop is at the
     * "break" statement below.
     */
    for (i = 0;; cur++, i++)
    {
        if (i < numberOfKeys)
        {
            /* Apply indoption to scankey (might change sk_strategy!) */
            if (!_bt_fix_scankey_strategy(cur, indoption))
            {
                /* NULL can't be matched, so give up */
                so->qual_ok = false;
                return;
            }
        }

        /*
         * If we are at the end of the keys for a particular attr, finish up
         * processing and emit the cleaned-up keys.
         */
        if (i == numberOfKeys || cur->sk_attno != attno)
        {
            int         priorNumberOfEqualCols = numberOfEqualCols;

            /* check input keys are correctly ordered */
            if (i < numberOfKeys && cur->sk_attno < attno)
                elog(ERROR, "btree index keys must be ordered by attribute");

            /*
             * If = has been specified, all other keys can be eliminated as
             * redundant.  If we have a case like key = 1 AND key > 2, we can
             * set qual_ok to false and abandon further processing.
             *
             * We also have to deal with the case of "key IS NULL", which is
             * unsatisfiable in combination with any other index condition. By
             * the time we get here, that's been classified as an equality
             * check, and we've rejected any combination of it with a regular
             * equality condition; but not with other types of conditions.
             */
            if (xform[BTEqualStrategyNumber - 1])
            {
                ScanKey     eq = xform[BTEqualStrategyNumber - 1];

                for (j = BTMaxStrategyNumber; --j >= 0;)
                {
                    ScanKey     chk = xform[j];

                    if (!chk || j == (BTEqualStrategyNumber - 1))
                        continue;

                    if (eq->sk_flags & SK_SEARCHNULL)
                    {
                        /* IS NULL is contradictory to anything else */
                        so->qual_ok = false;
                        return;
                    }

                    if (_bt_compare_scankey_args(scan, chk, eq, chk,
                                                 &test_result))
                    {
                        if (!test_result)
                        {
                            /* keys proven mutually contradictory */
                            so->qual_ok = false;
                            return;
                        }
                        /* else discard the redundant non-equality key */
                        xform[j] = NULL;
                    }
                    /* else, cannot determine redundancy, keep both keys */
                }
                /* track number of attrs for which we have "=" keys */
                numberOfEqualCols++;
            }

            /* try to keep only one of <, <= */
            if (xform[BTLessStrategyNumber - 1]
                && xform[BTLessEqualStrategyNumber - 1])
            {
                ScanKey     lt = xform[BTLessStrategyNumber - 1];
                ScanKey     le = xform[BTLessEqualStrategyNumber - 1];

                if (_bt_compare_scankey_args(scan, le, lt, le,
                                             &test_result))
                {
                    if (test_result)
                        xform[BTLessEqualStrategyNumber - 1] = NULL;
                    else
                        xform[BTLessStrategyNumber - 1] = NULL;
                }
            }

            /* try to keep only one of >, >= */
            if (xform[BTGreaterStrategyNumber - 1]
                && xform[BTGreaterEqualStrategyNumber - 1])
            {
                ScanKey     gt = xform[BTGreaterStrategyNumber - 1];
                ScanKey     ge = xform[BTGreaterEqualStrategyNumber - 1];

                if (_bt_compare_scankey_args(scan, ge, gt, ge,
                                             &test_result))
                {
                    if (test_result)
                        xform[BTGreaterEqualStrategyNumber - 1] = NULL;
                    else
                        xform[BTGreaterStrategyNumber - 1] = NULL;
                }
            }

            /*
             * Emit the cleaned-up keys into the outkeys[] array, and then
             * mark them if they are required.  They are required (possibly
             * only in one direction) if all attrs before this one had "=".
             */
            for (j = BTMaxStrategyNumber; --j >= 0;)
            {
                if (xform[j])
                {
                    ScanKey     outkey = &outkeys[new_numberOfKeys++];

                    memcpy(outkey, xform[j], sizeof(ScanKeyData));
                    if (priorNumberOfEqualCols == attno - 1)
                        _bt_mark_scankey_required(outkey);
                }
            }

            /*
             * Exit loop here if done.
             */
            if (i == numberOfKeys)
                break;

            /* Re-initialize for new attno */
            attno = cur->sk_attno;
            memset(xform, 0, sizeof(xform));
        }

        /* check strategy this key's operator corresponds to */
        j = cur->sk_strategy - 1;

        /* if row comparison, push it directly to the output array */
        if (cur->sk_flags & SK_ROW_HEADER)
        {
            ScanKey     outkey = &outkeys[new_numberOfKeys++];

            memcpy(outkey, cur, sizeof(ScanKeyData));
            if (numberOfEqualCols == attno - 1)
                _bt_mark_scankey_required(outkey);

            /*
             * We don't support RowCompare using equality; such a qual would
             * mess up the numberOfEqualCols tracking.
             */
            Assert(j != (BTEqualStrategyNumber - 1));
            continue;
        }

        /* have we seen one of these before? */
        if (xform[j] == NULL)
        {
            /* nope, so remember this scankey */
            xform[j] = cur;
        }
        else
        {
            /* yup, keep only the more restrictive key */
            if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
                                         &test_result))
            {
                if (test_result)
                    xform[j] = cur;
                else if (j == (BTEqualStrategyNumber - 1))
                {
                    /* key == a && key == b, but a != b */
                    so->qual_ok = false;
                    return;
                }
                /* else old key is more restrictive, keep it */
            }
            else
            {
                /*
                 * We can't determine which key is more restrictive.  Keep the
                 * previous one in xform[j] and push this one directly to the
                 * output array.
                 */
                ScanKey     outkey = &outkeys[new_numberOfKeys++];

                memcpy(outkey, cur, sizeof(ScanKeyData));
                if (numberOfEqualCols == attno - 1)
                    _bt_mark_scankey_required(outkey);
            }
        }
    }

    so->numberOfKeys = new_numberOfKeys;
}

Buffer _bt_relandgetbuf ( Relation  rel,
Buffer  obuf,
BlockNumber  blkno,
int  access 
)
void _bt_relbuf ( Relation  rel,
Buffer  buf 
)
void _bt_restore_array_keys ( IndexScanDesc  scan  ) 

Definition at line 623 of file nbtutils.c.

References _bt_preprocess_keys(), BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::mark_elem, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::qual_ok, BTArrayKeyInfo::scan_key, and ScanKeyData::sk_argument.

Referenced by btrestrpos().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        anyChanged = false;
    int         keyNum;

    /* Reset each array key to the element it pointed at when the mark was set */
    for (keyNum = 0; keyNum < so->numArrayKeys; keyNum++)
    {
        BTArrayKeyInfo *arrayKey = &so->arrayKeys[keyNum];
        int         markedElem = arrayKey->mark_elem;

        if (arrayKey->cur_elem == markedElem)
            continue;

        arrayKey->cur_elem = markedElem;
        so->arrayKeyData[arrayKey->scan_key].sk_argument =
            arrayKey->elem_values[markedElem];
        anyChanged = true;
    }

    /*
     * Any change to the key arguments forces a rerun of
     * _bt_preprocess_keys.  That might sound like overkill, but with
     * multiple keys per index column the full set of pushups is needed.
     */
    if (anyChanged)
    {
        _bt_preprocess_keys(scan);
        /* The mark should have been set on a consistent set of keys... */
        Assert(so->qual_ok);
    }
}

BTStack _bt_search ( Relation  rel,
int  keysz,
ScanKey  scankey,
bool  nextkey,
Buffer bufP,
int  access 
)

Definition at line 57 of file nbtsearch.c.

References _bt_binsrch(), _bt_getroot(), _bt_moveright(), _bt_relandgetbuf(), BT_READ, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, ItemPointerGetBlockNumber, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, palloc(), and IndexTupleData::t_tid.

Referenced by _bt_doinsert(), _bt_first(), and _bt_pagedel().

{
    /*
     * Descend from the root to the leaf level for the given scan key,
     * building a stack of parent-page positions along the way.  On return,
     * *bufP holds the read-locked leaf page and the stack (innermost entry
     * first) describes the path taken; the stack is NULL if the root was
     * already a leaf.  If the index is empty and access is BT_READ, returns
     * NULL with *bufP invalid.
     */
    BTStack     stack_in = NULL;

    /* Get the root page to start with */
    *bufP = _bt_getroot(rel, access);

    /* If index is empty and access = BT_READ, no root page is created. */
    if (!BufferIsValid(*bufP))
        return (BTStack) NULL;

    /* Loop iterates once per level descended in the tree */
    for (;;)
    {
        Page        page;
        BTPageOpaque opaque;
        OffsetNumber offnum;
        ItemId      itemid;
        IndexTuple  itup;
        BlockNumber blkno;
        BlockNumber par_blkno;
        BTStack     new_stack;

        /*
         * Race -- the page we just grabbed may have split since we read its
         * pointer in the parent (or metapage).  If it has, we may need to
         * move right to its new sibling.  Do that.
         */
        *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);

        /* if this is a leaf page, we're done */
        page = BufferGetPage(*bufP);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (P_ISLEAF(opaque))
            break;

        /*
         * Find the appropriate item on the internal page, and get the child
         * page that it points to.
         */
        offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
        itemid = PageGetItemId(page, offnum);
        itup = (IndexTuple) PageGetItem(page, itemid);
        blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
        par_blkno = BufferGetBlockNumber(*bufP);

        /*
         * We need to save the location of the index entry we chose in the
         * parent page on a stack. In case we split the tree, we'll use the
         * stack to work back up to the parent page.  We also save the actual
         * downlink (TID) to uniquely identify the index entry, in case it
         * moves right while we're working lower in the tree.  See the paper
         * by Lehman and Yao for how this is detected and handled. (We use the
         * child link to disambiguate duplicate keys in the index -- Lehman
         * and Yao disallow duplicate keys.)
         */
        new_stack = (BTStack) palloc(sizeof(BTStackData));
        new_stack->bts_blkno = par_blkno;
        new_stack->bts_offset = offnum;
        memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
        new_stack->bts_parent = stack_in;

        /* drop the read lock on the parent page, acquire one on the child */
        *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);

        /* okay, all set to move down a level */
        stack_in = new_stack;
    }

    return stack_in;
}

void _bt_spool ( IndexTuple  itup,
BTSpool btspool 
)

Definition at line 188 of file nbtsort.c.

References BTSpool::sortstate, and tuplesort_putindextuple().

Referenced by btbuildCallback().

{
    /* Add the index tuple to the spool's tuplesort state. */
    tuplesort_putindextuple(btspool->sortstate, itup);
}

void _bt_spooldestroy ( BTSpool btspool  ) 

Definition at line 178 of file nbtsort.c.

References pfree(), BTSpool::sortstate, and tuplesort_end().

Referenced by btbuild().

{
    /* Shut down the tuplesort first, then free the spool struct itself. */
    tuplesort_end(btspool->sortstate);
    pfree(btspool);
}

BTSpool* _bt_spoolinit ( Relation  heap,
Relation  index,
bool  isunique,
bool  isdead 
)

Definition at line 150 of file nbtsort.c.

References BTSpool::heap, BTSpool::index, BTSpool::isunique, maintenance_work_mem, palloc0(), BTSpool::sortstate, tuplesort_begin_index_btree(), and work_mem.

Referenced by btbuild().

{
    BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    int         sortmem;

    btspool->heap = heap;
    btspool->index = index;
    btspool->isunique = isunique;

    /*
     * The sort area is sized as maintenance_work_mem rather than work_mem to
     * speed index creation, which is safe because one backend never runs
     * multiple index creations in parallel.  A unique index build uses two
     * BTSpool objects; the second (dead-tuple) spool is not expected to get
     * very full, so it only gets work_mem.
     */
    if (isdead)
        sortmem = work_mem;
    else
        sortmem = maintenance_work_mem;

    btspool->sortstate = tuplesort_begin_index_btree(heap, index, isunique,
                                                     sortmem, false);

    return btspool;
}

void _bt_start_array_keys ( IndexScanDesc  scan,
ScanDirection  dir 
)

Definition at line 523 of file nbtutils.c.

References BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         keyNum;

    /* Position every array key at its first element for the scan direction */
    for (keyNum = 0; keyNum < so->numArrayKeys; keyNum++)
    {
        BTArrayKeyInfo *arrayKey = &so->arrayKeys[keyNum];
        int         startElem;

        Assert(arrayKey->num_elems > 0);
        /* backward scans walk the sorted elements from the far end */
        startElem = ScanDirectionIsBackward(dir) ? arrayKey->num_elems - 1 : 0;
        arrayKey->cur_elem = startElem;
        so->arrayKeyData[arrayKey->scan_key].sk_argument =
            arrayKey->elem_values[startElem];
    }
}

BTCycleId _bt_start_vacuum ( Relation  rel  ) 

Definition at line 1882 of file nbtutils.c.

References BtreeVacuumLock, BTVacInfo::cycle_ctr, BTOneVacInfo::cycleid, LockRelId::dbId, elog, ERROR, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MAX_BT_CYCLE_ID, BTVacInfo::max_vacuums, BTVacInfo::num_vacuums, RelationData::rd_lockInfo, RelationGetRelationName, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by btbulkdelete().

{
    BTCycleId   nextid;
    BTOneVacInfo *entry;
    int         slot;

    LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);

    /*
     * Advance the cycle counter, wrapping so that we never hand out zero or
     * one of the reserved high values.
     */
    nextid = ++(btvacinfo->cycle_ctr);
    if (nextid == 0 || nextid > MAX_BT_CYCLE_ID)
        nextid = btvacinfo->cycle_ctr = 1;

    /* There must not already be an active vacuum entry for this index */
    for (slot = 0; slot < btvacinfo->num_vacuums; slot++)
    {
        entry = &btvacinfo->vacuums[slot];
        if (entry->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            entry->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            /*
             * Unlike most places in the backend, the LWLock must be released
             * explicitly before throwing the error, because _bt_end_vacuum()
             * is expected to run before transaction abort cleanup gets a
             * chance to release LWLocks.
             */
            LWLockRelease(BtreeVacuumLock);
            elog(ERROR, "multiple active vacuums for index \"%s\"",
                 RelationGetRelationName(rel));
        }
    }

    /* Claim a slot and record this vacuum */
    if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
    {
        LWLockRelease(BtreeVacuumLock);
        elog(ERROR, "out of btvacinfo slots");
    }
    entry = &btvacinfo->vacuums[btvacinfo->num_vacuums];
    entry->relid = rel->rd_lockInfo.lockRelId;
    entry->cycleid = nextid;
    btvacinfo->num_vacuums++;

    LWLockRelease(BtreeVacuumLock);
    return nextid;
}

BTCycleId _bt_vacuum_cycleid ( Relation  rel  ) 

Definition at line 1848 of file nbtutils.c.

References BtreeVacuumLock, BTOneVacInfo::cycleid, LockRelId::dbId, i, LockInfoData::lockRelId, LW_SHARED, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by _bt_split().

{
    BTCycleId   cycleid = 0;
    int         slot;

    /* Share lock is enough since this is a read-only operation */
    LWLockAcquire(BtreeVacuumLock, LW_SHARED);

    /* Look for an active vacuum entry matching this index's lock tag */
    for (slot = 0; slot < btvacinfo->num_vacuums; slot++)
    {
        BTOneVacInfo *entry = &btvacinfo->vacuums[slot];

        if (entry->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            entry->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            cycleid = entry->cycleid;
            break;
        }
    }

    LWLockRelease(BtreeVacuumLock);
    /* zero means no vacuum is active for this index */
    return cycleid;
}

Datum btbeginscan ( PG_FUNCTION_ARGS   ) 

Definition at line 406 of file nbtree.c.

References BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_POINTER, RelationGetDescr, RelationGetIndexScan(), and IndexScanDescData::xs_itupdesc.

{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    int         nkeys = PG_GETARG_INT32(1);
    int         norderbys = PG_GETARG_INT32(2);
    IndexScanDesc scan;
    BTScanOpaque so;

    /* btree has no ordering operators */
    Assert(norderbys == 0);

    /* build the generic scan descriptor */
    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    /* allocate and initialize btree-private scan state */
    so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
    so->currPos.buf = InvalidBuffer;
    so->markPos.buf = InvalidBuffer;
    if (scan->numberOfKeys <= 0)
        so->keyData = NULL;
    else
        so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));

    /* no array keys until preprocessing says otherwise */
    so->numArrayKeys = 0;
    so->arrayKeyData = NULL;
    so->arrayKeys = NULL;
    so->arrayContext = NULL;

    /* killed-item list is allocated only when first needed */
    so->numKilled = 0;
    so->killedItems = NULL;

    /*
     * We don't know yet whether the scan will be index-only, so we do not
     * allocate the tuple workspace arrays until btrescan.  However, we set up
     * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     */
    so->currTuples = NULL;
    so->markTuples = NULL;
    so->currPos.nextTupleOffset = 0;
    so->markPos.nextTupleOffset = 0;

    scan->xs_itupdesc = RelationGetDescr(rel);
    scan->opaque = so;

    PG_RETURN_POINTER(scan);
}

Datum btbuild ( PG_FUNCTION_ARGS   ) 

Definition at line 83 of file nbtree.c.

References _bt_leafbuild(), _bt_spooldestroy(), _bt_spoolinit(), btbuildCallback(), elog, ERROR, BTBuildState::haveDead, IndexBuildResult::heap_tuples, BTBuildState::heapRel, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, IndexBuildHeapScan(), BTBuildState::indtuples, BTBuildState::isUnique, log_btree_build_stats, palloc(), PG_GETARG_POINTER, PG_RETURN_POINTER, RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.

{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    double      reltuples;
    BTBuildState state;

    /* Set up the state handed to btbuildCallback during the heap scan */
    state.isUnique = indexInfo->ii_Unique;
    state.haveDead = false;
    state.heapRel = heap;
    state.spool = NULL;
    state.spool2 = NULL;
    state.indtuples = 0;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
        ResetUsage();
#endif   /* BTREE_BUILD_STATS */

    /*
     * btbuild is expected to be called exactly once for any index relation;
     * a non-empty index at this point means big trouble.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    state.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);

    /*
     * When building a unique index, dead tuples go into a second spool so
     * they stay out of the uniqueness check.
     */
    if (indexInfo->ii_Unique)
        state.spool2 = _bt_spoolinit(heap, index, false, true);

    /* scan the heap, spooling every tuple via the callback */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   btbuildCallback, (void *) &state);

    /* drop the dead-tuple spool if it turned out to be unnecessary */
    if (state.spool2 != NULL && !state.haveDead)
    {
        _bt_spooldestroy(state.spool2);
        state.spool2 = NULL;
    }

    /*
     * Finish the build by (1) completing the sort of the spool file, (2)
     * inserting the sorted tuples into btree pages and (3) building the
     * upper levels.
     */
    _bt_leafbuild(state.spool, state.spool2);
    _bt_spooldestroy(state.spool);
    if (state.spool2)
        _bt_spooldestroy(state.spool2);

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD STATS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    /*
     * If we are reindexing a pre-existing index, it is critical to send out a
     * relcache invalidation SI message to ensure all backends re-read the
     * index metapage.  We expect that the caller will ensure that happens
     * (typically as a side effect of updating index stats, but it must happen
     * even if the stats don't change!)
     */

    /* hand back the tuple counts */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
    result->heap_tuples = reltuples;
    result->index_tuples = state.indtuples;

    PG_RETURN_POINTER(result);
}

Datum btbuildempty ( PG_FUNCTION_ARGS   ) 

Definition at line 210 of file nbtree.c.

References _bt_initmetapage(), BTREE_METAPAGE, INIT_FORKNUM, log_newpage(), RelFileNodeBackend::node, P_NONE, PageSetChecksumInplace(), palloc(), PG_GETARG_POINTER, PG_RETURN_VOID, RelationData::rd_smgr, SMgrRelationData::smgr_rnode, smgrimmedsync(), smgrwrite(), and XLogIsNeeded.

{
    /*
     * Build an empty btree index (just a metapage) in the INIT fork.
     * The page is written directly through smgr, bypassing shared buffers.
     */
    Relation    index = (Relation) PG_GETARG_POINTER(0);
    Page        metapage;

    /* Construct metapage. */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, P_NONE, 0);

    /* Write the page.  If archiving/streaming, XLOG it. */
    PageSetChecksumInplace(metapage, BTREE_METAPAGE);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
              (char *) metapage, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    BTREE_METAPAGE, metapage);

    /*
     * An immediate sync is required even if we xlog'd the page, because the
     * write did not go through shared_buffers and therefore a concurrent
     * checkpoint may have moved the redo pointer past our xlog record.
     */
    smgrimmedsync(index->rd_smgr, INIT_FORKNUM);

    PG_RETURN_VOID();
}

Datum btbulkdelete ( PG_FUNCTION_ARGS   ) 

Definition at line 653 of file nbtree.c.

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, NULL, palloc0(), PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, PG_GETARG_POINTER, PG_RETURN_POINTER, and PointerGetDatum.

{
    /*
     * Bulk deletion of all index entries pointing to a set of heap tuples;
     * the set is identified via the callback routine.  Runs btvacuumscan()
     * under a vacuum cycle ID, with ENSURE cleanup so the shared-memory
     * vacuum state is released even if the scan errors out.
     */
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
    void       *callback_state = (void *) PG_GETARG_POINTER(3);
    Relation    rel = info->index;
    BTCycleId   cycleid;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

    /* Establish the vacuum cycle ID to use for this scan */
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);

    PG_RETURN_POINTER(stats);
}

Datum btcanreturn ( PG_FUNCTION_ARGS   ) 

Definition at line 1112 of file nbtree.c.

References PG_RETURN_BOOL.

{
    /* btree always reports it can return indexed data (unconditional true) */
    PG_RETURN_BOOL(true);
}

Datum btendscan ( PG_FUNCTION_ARGS   ) 

Definition at line 523 of file nbtree.c.

References _bt_killitems(), BTScanOpaqueData::arrayContext, BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, MemoryContextDelete(), NULL, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, pfree(), PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

{
    /*
     * End an index scan: flush any pending killed items, drop buffer pins
     * for the current and marked positions, and free all per-scan storage.
     */
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    /* Release storage */
    if (so->keyData != NULL)
        pfree(so->keyData);
    /* so->arrayKeyData and so->arrayKeys are in arrayContext */
    if (so->arrayContext != NULL)
        MemoryContextDelete(so->arrayContext);
    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->currTuples != NULL)
        pfree(so->currTuples);
    /* so->markTuples should not be pfree'd, see btrescan */
    pfree(so);

    PG_RETURN_VOID();
}

Datum btgetbitmap ( PG_FUNCTION_ARGS   ) 

Definition at line 346 of file nbtree.c.

References _bt_advance_array_keys(), _bt_first(), _bt_next(), _bt_start_array_keys(), BTScanOpaqueData::currPos, ForwardScanDirection, BTScanPosItem::heapTid, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_INT64, HeapTupleData::t_self, tbm_add_tuples(), and IndexScanDescData::xs_ctup.

{
    /*
     * Fetch all matching tuple IDs into a TIDBitmap.  Always scans forward;
     * the outer do-loop re-runs the scan for each combination of array keys.
     * Returns the number of TIDs added.
     */
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    /*
     * If we have any array keys, initialize them.
     */
    if (so->numArrayKeys)
    {
        /* punt if we have any unsatisfiable array keys */
        if (so->numArrayKeys < 0)
            PG_RETURN_INT64(ntids);

        _bt_start_array_keys(scan, ForwardScanDirection);
    }

    /* This loop handles advancing to the next array elements, if any */
    do
    {
        /* Fetch the first page & tuple */
        if (_bt_first(scan, ForwardScanDirection))
        {
            /* Save tuple ID, and continue scanning */
            heapTid = &scan->xs_ctup.t_self;
            tbm_add_tuples(tbm, heapTid, 1, false);
            ntids++;

            for (;;)
            {
                /*
                 * Advance to next tuple within page.  This is the same as the
                 * easy case in _bt_next().
                 */
                if (++so->currPos.itemIndex > so->currPos.lastItem)
                {
                    /* let _bt_next do the heavy lifting */
                    if (!_bt_next(scan, ForwardScanDirection))
                        break;
                }

                /* Save tuple ID, and continue scanning */
                heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
                tbm_add_tuples(tbm, heapTid, 1, false);
                ntids++;
            }
        }
        /* Now see if we have more array keys to deal with */
    } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

    PG_RETURN_INT64(ntids);
}

Datum btgettuple ( PG_FUNCTION_ARGS   ) 

Definition at line 270 of file nbtree.c.

References _bt_advance_array_keys(), _bt_first(), _bt_next(), _bt_start_array_keys(), BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxIndexTuplesPerPage, NULL, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, and IndexScanDescData::xs_recheck.

{
    /*
     * Get the next tuple in the scan, in the given direction.  Returns true
     * and sets scan->xs_ctup.t_self if a match is found.  The outer do-loop
     * advances through array-key combinations when the scan for one
     * combination is exhausted.
     */
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /*
     * If we have any array keys, initialize them during first call for a
     * scan.  We can't do this in btrescan because we don't know the scan
     * direction at that time.
     */
    if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
    {
        /* punt if we have any unsatisfiable array keys */
        if (so->numArrayKeys < 0)
            PG_RETURN_BOOL(false);

        _bt_start_array_keys(scan, dir);
    }

    /* This loop handles advancing to the next array elements, if any */
    do
    {
        /*
         * If we've already initialized this scan, we can just advance it in
         * the appropriate direction.  If we haven't done so yet, we call
         * _bt_first() to get the first item in the scan.
         */
        if (!BTScanPosIsValid(so->currPos))
            res = _bt_first(scan, dir);
        else
        {
            /*
             * Check to see if we should kill the previously-fetched tuple.
             */
            if (scan->kill_prior_tuple)
            {
                /*
                 * Yes, remember it for later. (We'll deal with all such
                 * tuples at once right before leaving the index page.)  The
                 * test for numKilled overrun is not just paranoia: if the
                 * caller reverses direction in the indexscan then the same
                 * item might get entered multiple times. It's not worth
                 * trying to optimize that, so we don't detect it, but instead
                 * just forget any excess entries.
                 */
                if (so->killedItems == NULL)
                    so->killedItems = (int *)
                        palloc(MaxIndexTuplesPerPage * sizeof(int));
                if (so->numKilled < MaxIndexTuplesPerPage)
                    so->killedItems[so->numKilled++] = so->currPos.itemIndex;
            }

            /*
             * Now continue the scan.
             */
            res = _bt_next(scan, dir);
        }

        /* If we have a tuple, return it ... */
        if (res)
            break;
        /* ... otherwise see if we have more array keys to deal with */
    } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

    PG_RETURN_BOOL(res);
}

Datum btinsert ( PG_FUNCTION_ARGS   ) 

Definition at line 244 of file nbtree.c.

References _bt_doinsert(), index_form_tuple(), pfree(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, RelationGetDescr, IndexTupleData::t_tid, and values.

{
    /*
     * Insert one index entry for the given heap TID.  Forms an index tuple
     * from the datum/null arrays, hands it to _bt_doinsert() (which applies
     * the requested uniqueness checking), then frees the tuple.
     */
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    Datum      *values = (Datum *) PG_GETARG_POINTER(1);
    bool       *isnull = (bool *) PG_GETARG_POINTER(2);
    ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
    Relation    heapRel = (Relation) PG_GETARG_POINTER(4);
    IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5);
    IndexTuple  newtuple;
    bool        inserted;

    /* Build the index tuple and stamp it with the heap TID it points to. */
    newtuple = index_form_tuple(RelationGetDescr(rel), values, isnull);
    newtuple->t_tid = *ht_ctid;

    inserted = _bt_doinsert(rel, newtuple, checkUnique, heapRel);

    pfree(newtuple);

    PG_RETURN_BOOL(inserted);
}

Datum btmarkpos ( PG_FUNCTION_ARGS   ) 

Definition at line 565 of file nbtree.c.

References _bt_mark_array_keys(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

{
    /*
     * Remember the scan's current position so btrestrpos() can return to it.
     */
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* No read locks are held, but any pin on an old marked page is dropped. */
    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }

    /*
     * Only the current itemIndex is recorded here.  If the scan later steps
     * to the next page while the mark is live, _bt_steppage copies the whole
     * currPos struct into markPos; since (as often happens) the mark moves
     * before we leave the page, that copy is usually avoided.
     */
    so->markItemIndex = BTScanPosIsValid(so->currPos) ? so->currPos.itemIndex : -1;

    /* Also record the current positions of any array keys */
    if (so->numArrayKeys)
        _bt_mark_array_keys(scan);

    PG_RETURN_VOID();
}

Datum btoptions ( PG_FUNCTION_ARGS   ) 

Definition at line 2017 of file nbtutils.c.

References default_reloptions(), PG_GETARG_BOOL, PG_GETARG_DATUM, PG_RETURN_BYTEA_P, PG_RETURN_NULL, and RELOPT_KIND_BTREE.

{
    /*
     * Parse and validate the reloptions for a btree index, delegating to
     * the generic reloption machinery with the btree option kind.
     */
    Datum       reloptions = PG_GETARG_DATUM(0);
    bool        validate = PG_GETARG_BOOL(1);
    bytea      *parsed;

    parsed = default_reloptions(reloptions, validate, RELOPT_KIND_BTREE);
    if (parsed == NULL)
        PG_RETURN_NULL();
    PG_RETURN_BYTEA_P(parsed);
}

void btree_desc ( StringInfo  buf,
uint8  xl_info,
char *  rec 
)

Definition at line 29 of file nbtdesc.c.

References appendStringInfo(), xl_btree_delete::block, xl_btree_vacuum::block, RelFileNode::dbNode, xl_btree_delete_page::deadblk, xl_btree_split::firstright, xl_btree_delete::hnode, xl_btree_vacuum::lastBlockVacuumed, xl_btree_reuse_page::latestRemovedXid, xl_btree_delete_page::leftblk, xl_btree_split::leftsib, xl_btree_newroot::level, xl_btree_split::level, xl_btree_reuse_page::node, xl_btree_newroot::node, xl_btree_delete::node, xl_btree_vacuum::node, xl_btree_split::node, out_target(), RelFileNode::relNode, xl_btree_delete_page::rightblk, xl_btree_split::rightsib, xl_btree_split::rnext, xl_btree_newroot::rootblk, RelFileNode::spcNode, xl_btree_delete_page::target, xl_btree_insert::target, XLOG_BTREE_DELETE, XLOG_BTREE_DELETE_PAGE, XLOG_BTREE_DELETE_PAGE_HALF, XLOG_BTREE_DELETE_PAGE_META, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_UPPER, XLOG_BTREE_NEWROOT, XLOG_BTREE_REUSE_PAGE, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_L_ROOT, XLOG_BTREE_SPLIT_R, XLOG_BTREE_SPLIT_R_ROOT, and XLOG_BTREE_VACUUM.

{
    /*
     * Produce a human-readable description of a btree WAL record.
     *
     * The three insert variants and the four split variants print identical
     * information apart from their label, so they share one case body each
     * (the original repeated the code verbatim seven times); the emitted
     * text is unchanged.
     */
    uint8       info = xl_info & ~XLR_INFO_MASK;

    switch (info)
    {
        case XLOG_BTREE_INSERT_LEAF:
        case XLOG_BTREE_INSERT_UPPER:
        case XLOG_BTREE_INSERT_META:
            {
                xl_btree_insert *xlrec = (xl_btree_insert *) rec;
                const char *label;

                if (info == XLOG_BTREE_INSERT_LEAF)
                    label = "insert";
                else if (info == XLOG_BTREE_INSERT_UPPER)
                    label = "insert_upper";
                else
                    label = "insert_meta";
                appendStringInfo(buf, "%s: ", label);
                out_target(buf, &(xlrec->target));
                break;
            }
        case XLOG_BTREE_SPLIT_L:
        case XLOG_BTREE_SPLIT_R:
        case XLOG_BTREE_SPLIT_L_ROOT:
        case XLOG_BTREE_SPLIT_R_ROOT:
            {
                xl_btree_split *xlrec = (xl_btree_split *) rec;
                const char *label;

                switch (info)
                {
                    case XLOG_BTREE_SPLIT_L:
                        label = "split_l";
                        break;
                    case XLOG_BTREE_SPLIT_R:
                        label = "split_r";
                        break;
                    case XLOG_BTREE_SPLIT_L_ROOT:
                        label = "split_l_root";
                        break;
                    default:
                        label = "split_r_root";
                        break;
                }
                appendStringInfo(buf, "%s: rel %u/%u/%u ", label,
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                 xlrec->node.relNode);
                appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
                               xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
                                 xlrec->level, xlrec->firstright);
                break;
            }
        case XLOG_BTREE_VACUUM:
            {
                xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;

                appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                 xlrec->node.relNode, xlrec->block,
                                 xlrec->lastBlockVacuumed);
                break;
            }
        case XLOG_BTREE_DELETE:
            {
                xl_btree_delete *xlrec = (xl_btree_delete *) rec;

                appendStringInfo(buf, "delete: index %u/%u/%u; iblk %u, heap %u/%u/%u;",
                xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode,
                                 xlrec->block,
                                 xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode);
                break;
            }
        case XLOG_BTREE_DELETE_PAGE:
        case XLOG_BTREE_DELETE_PAGE_META:
        case XLOG_BTREE_DELETE_PAGE_HALF:
            {
                xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec;

                appendStringInfo(buf, "delete_page: ");
                out_target(buf, &(xlrec->target));
                appendStringInfo(buf, "; dead %u; left %u; right %u",
                            xlrec->deadblk, xlrec->leftblk, xlrec->rightblk);
                break;
            }
        case XLOG_BTREE_NEWROOT:
            {
                xl_btree_newroot *xlrec = (xl_btree_newroot *) rec;

                appendStringInfo(buf, "newroot: rel %u/%u/%u; root %u lev %u",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                 xlrec->node.relNode,
                                 xlrec->rootblk, xlrec->level);
                break;
            }
        case XLOG_BTREE_REUSE_PAGE:
            {
                xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;

                appendStringInfo(buf, "reuse_page: rel %u/%u/%u; latestRemovedXid %u",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                               xlrec->node.relNode, xlrec->latestRemovedXid);
                break;
            }
        default:
            appendStringInfo(buf, "UNKNOWN");
            break;
    }
}

void btree_redo ( XLogRecPtr  lsn,
XLogRecord record 
)

Definition at line 1023 of file nbtxlog.c.

References btree_xlog_delete(), btree_xlog_delete_page(), btree_xlog_insert(), btree_xlog_newroot(), btree_xlog_reuse_page(), btree_xlog_split(), btree_xlog_vacuum(), elog, PANIC, XLogRecord::xl_info, XLOG_BTREE_DELETE, XLOG_BTREE_DELETE_PAGE, XLOG_BTREE_DELETE_PAGE_HALF, XLOG_BTREE_DELETE_PAGE_META, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_UPPER, XLOG_BTREE_NEWROOT, XLOG_BTREE_REUSE_PAGE, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_L_ROOT, XLOG_BTREE_SPLIT_R, XLOG_BTREE_SPLIT_R_ROOT, and XLOG_BTREE_VACUUM.

{
    /*
     * Redo dispatcher for btree WAL records: decode the opcode and hand the
     * record to the matching replay routine.  The bool arguments to
     * btree_xlog_insert are (isleaf, ismeta); to btree_xlog_split they are
     * (onleft, isroot), per the case labels below.
     */
    uint8       info = record->xl_info & ~XLR_INFO_MASK;

    switch (info)
    {
        case XLOG_BTREE_INSERT_LEAF:
            btree_xlog_insert(true, false, lsn, record);
            break;
        case XLOG_BTREE_INSERT_UPPER:
            btree_xlog_insert(false, false, lsn, record);
            break;
        case XLOG_BTREE_INSERT_META:
            btree_xlog_insert(false, true, lsn, record);
            break;
        case XLOG_BTREE_SPLIT_L:
            btree_xlog_split(true, false, lsn, record);
            break;
        case XLOG_BTREE_SPLIT_R:
            btree_xlog_split(false, false, lsn, record);
            break;
        case XLOG_BTREE_SPLIT_L_ROOT:
            btree_xlog_split(true, true, lsn, record);
            break;
        case XLOG_BTREE_SPLIT_R_ROOT:
            btree_xlog_split(false, true, lsn, record);
            break;
        case XLOG_BTREE_VACUUM:
            btree_xlog_vacuum(lsn, record);
            break;
        case XLOG_BTREE_DELETE:
            btree_xlog_delete(lsn, record);
            break;
        case XLOG_BTREE_DELETE_PAGE:
        case XLOG_BTREE_DELETE_PAGE_META:
        case XLOG_BTREE_DELETE_PAGE_HALF:
            btree_xlog_delete_page(info, lsn, record);
            break;
        case XLOG_BTREE_NEWROOT:
            btree_xlog_newroot(lsn, record);
            break;
        case XLOG_BTREE_REUSE_PAGE:
            btree_xlog_reuse_page(lsn, record);
            break;
        default:
            elog(PANIC, "btree_redo: unknown op code %u", info);
    }
}

bool btree_safe_restartpoint ( void   ) 

Definition at line 1141 of file nbtxlog.c.

{
    /*
     * A restartpoint is safe only when no incomplete multi-record btree
     * actions (splits/deletions tracked in incomplete_actions) are pending.
     * Collapsed from if-return-false/return-true to a single boolean return.
     */
    return incomplete_actions == NIL;
}

void btree_xlog_cleanup ( void   ) 

Definition at line 1079 of file nbtxlog.c.

References _bt_insert_parent(), _bt_pagedel(), buf, BufferGetPage, BufferIsValid, CreateFakeRelcacheEntry(), bt_incomplete_action::delblk, elog, FreeFakeRelcacheEntry(), bt_incomplete_action::is_root, bt_incomplete_action::is_split, bt_incomplete_action::leftblk, lfirst, bt_incomplete_action::node, NULL, P_LEFTMOST, P_RIGHTMOST, PageGetSpecialPointer, PANIC, bt_incomplete_action::rightblk, and XLogReadBuffer().

{
    /*
     * Finish any multi-record btree actions (page splits and half-dead page
     * deletions) left incomplete in the incomplete_actions list, then clear
     * the list.
     */
    ListCell   *l;

    foreach(l, incomplete_actions)
    {
        bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);

        if (action->is_split)
        {
            /* finish an incomplete split */
            Buffer      lbuf,
                        rbuf;
            Page        lpage,
                        rpage;
            BTPageOpaque lpageop,
                        rpageop;
            bool        is_only;
            Relation    reln;

            lbuf = XLogReadBuffer(action->node, action->leftblk, false);
            /* failure is impossible because we wrote this page earlier */
            if (!BufferIsValid(lbuf))
                elog(PANIC, "btree_xlog_cleanup: left block unfound");
            lpage = (Page) BufferGetPage(lbuf);
            lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
            rbuf = XLogReadBuffer(action->node, action->rightblk, false);
            /* failure is impossible because we wrote this page earlier */
            if (!BufferIsValid(rbuf))
                elog(PANIC, "btree_xlog_cleanup: right block unfound");
            rpage = (Page) BufferGetPage(rbuf);
            rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);

            /* if the two pages are all of their level, it's an only-page split */
            is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);

            /* insert the downlink for the new right page into the parent */
            reln = CreateFakeRelcacheEntry(action->node);
            _bt_insert_parent(reln, lbuf, rbuf, NULL,
                              action->is_root, is_only);
            FreeFakeRelcacheEntry(reln);
        }
        else
        {
            /* finish an incomplete deletion (of a half-dead page) */
            Buffer      buf;

            buf = XLogReadBuffer(action->node, action->delblk, false);
            if (BufferIsValid(buf))
            {
                Relation    reln;

                reln = CreateFakeRelcacheEntry(action->node);
                if (_bt_pagedel(reln, buf, NULL) == 0)
                    elog(PANIC, "btree_xlog_cleanup: _bt_pagedel failed");
                FreeFakeRelcacheEntry(reln);
            }
        }
    }
    incomplete_actions = NIL;
}

void btree_xlog_startup ( void   ) 

Definition at line 1073 of file nbtxlog.c.

{
    /* Start with an empty list of incomplete split/delete actions. */
    incomplete_actions = NIL;
}

void BTreeShmemInit ( void   ) 

Definition at line 1989 of file nbtutils.c.

References Assert, BTreeShmemSize(), BTVacInfo::cycle_ctr, IsUnderPostmaster, BTVacInfo::max_vacuums, MaxBackends, NULL, BTVacInfo::num_vacuums, and ShmemInitStruct().

Referenced by CreateSharedMemoryAndSemaphores().

{
    /*
     * Attach to (and, in the postmaster, initialize) the shared-memory area
     * used to track concurrent btree vacuum cycle IDs.
     */
    bool        found;

    btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
                                              BTreeShmemSize(),
                                              &found);

    if (!IsUnderPostmaster)
    {
        /* Initialize shared memory area */
        Assert(!found);

        /*
         * It doesn't really matter what the cycle counter starts at, but
         * having it always start the same doesn't seem good.  Seed with
         * low-order bits of time() instead.
         */
        btvacinfo->cycle_ctr = (BTCycleId) time(NULL);

        btvacinfo->num_vacuums = 0;
        btvacinfo->max_vacuums = MaxBackends;
    }
    else
        Assert(found);      /* child processes just attach to the existing area */
}

Size BTreeShmemSize ( void   ) 

Definition at line 1976 of file nbtutils.c.

References add_size(), MaxBackends, mul_size(), and offsetof.

Referenced by BTreeShmemInit(), and CreateSharedMemoryAndSemaphores().

{
    /*
     * Shared memory needed for btree vacuum tracking: the BTVacInfo header
     * plus one BTOneVacInfo slot per backend, combined with the checked
     * size-arithmetic helpers.
     */
    return add_size(offsetof(BTVacInfo, vacuums[0]),
                    mul_size(MaxBackends, sizeof(BTOneVacInfo)));
}

Datum btrescan ( PG_FUNCTION_ARGS   ) 

Definition at line 456 of file nbtree.c.

References _bt_killitems(), _bt_preprocess_array_keys(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IndexScanDescData::keyData, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, memmove, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), PG_GETARG_POINTER, PG_RETURN_VOID, ReleaseBuffer(), and IndexScanDescData::xs_want_itup.

{
    /*
     * Restart the scan, optionally installing new scan keys.  Drops pins on
     * any pages held from the previous scan (flushing killed items first),
     * allocates tuple workspaces for index-only scans, and copies the caller's
     * scan keys for preprocessing by _bt_first.
     */
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(1);

    /* remaining arguments are ignored */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    /*
     * Allocate tuple workspace arrays, if needed for an index-only scan and
     * not already done in a previous rescan call.  To save on palloc
     * overhead, both workspaces are allocated as one palloc block; only this
     * function and btendscan know that.
     *
     * NOTE: this data structure also makes it safe to return data from a
     * "name" column, even though btree name_ops uses an underlying storage
     * datatype of cstring.  The risk there is that "name" is supposed to be
     * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
     * However, since we only return data out of tuples sitting in the
     * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
     * data out of the markTuples array --- running off the end of memory for
     * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
     * adding special-case treatment for name_ops elsewhere.
     */
    if (scan->xs_want_itup && so->currTuples == NULL)
    {
        so->currTuples = (char *) palloc(BLCKSZ * 2);
        so->markTuples = so->currTuples + BLCKSZ;
    }

    /*
     * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
     * - vadim 05/05/97
     */
    if (scankey && scan->numberOfKeys > 0)
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */

    /* If any keys are SK_SEARCHARRAY type, set up array-key info */
    _bt_preprocess_array_keys(scan);

    PG_RETURN_VOID();
}

Datum btrestrpos ( PG_FUNCTION_ARGS   ) 

Definition at line 599 of file nbtree.c.

References _bt_killitems(), _bt_restore_array_keys(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

{
    /*
     * Restore the scan position saved by btmarkpos().  The cheap case is a
     * mark on the current page (just reset itemIndex); otherwise the full
     * markPos struct (and, for index-only scans, the markTuples workspace)
     * is copied back into currPos.
     */
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* Restore the marked positions of any array keys */
    if (so->numArrayKeys)
        _bt_restore_array_keys(scan);

    if (so->markItemIndex >= 0)
    {
        /*
         * The mark position is on the same page we are currently on. Just
         * restore the itemIndex.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /* we aren't holding any read locks, but gotta drop the pin */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0 &&
                so->currPos.buf != so->markPos.buf)
                _bt_killitems(scan, false);
            ReleaseBuffer(so->currPos.buf);
            so->currPos.buf = InvalidBuffer;
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            IncrBufferRefCount(so->markPos.buf);
            /* copy only the valid prefix of the items array (lastItem+1 entries) */
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
            if (so->currTuples)
                memcpy(so->currTuples, so->markTuples,
                       so->markPos.nextTupleOffset);
        }
    }

    PG_RETURN_VOID();
}

Datum btvacuumcleanup ( PG_FUNCTION_ARGS   ) 

Definition at line 686 of file nbtree.c.

References IndexVacuumInfo::analyze_only, btvacuumscan(), IndexVacuumInfo::estimated_count, IndexVacuumInfo::index, IndexFreeSpaceMapVacuum(), NULL, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, palloc0(), PG_GETARG_POINTER, and PG_RETURN_POINTER.

{
    /*
     * Post-VACUUM cleanup: if btbulkdelete was never called, make one pass
     * over the index to recycle pages and gather statistics; then vacuum the
     * free space map and sanity-limit the index tuple count.
     */
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        PG_RETURN_POINTER(stats);

    /*
     * If btbulkdelete was called, we need not do anything, just return the
     * stats from the latest btbulkdelete call.  If it wasn't called, we must
     * still do a pass over the index, to recycle any newly-recyclable pages
     * and to obtain index statistics.
     *
     * Since we aren't going to actually delete any leaf items, there's no
     * need to go through all the vacuum-cycle-ID pushups.
     */
    if (stats == NULL)
    {
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        btvacuumscan(info, stats, NULL, NULL, 0);
    }

    /* Finally, vacuum the FSM */
    IndexFreeSpaceMapVacuum(info->index);

    /*
     * It's quite possible for us to be fooled by concurrent page splits into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
    if (!info->estimated_count)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    PG_RETURN_POINTER(stats);
}