#include "access/genam.h"
#include "access/itup.h"
#include "access/sdir.h"
#include "access/xlog.h"
#include "access/xlogutils.h"
#include "catalog/pg_index.h"

Include dependency graph for nbtree.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures
struct	BTPageOpaqueData
struct	BTMetaPageData
struct	xl_btreetid
struct	xl_btree_metadata
struct	xl_btree_insert
struct	xl_btree_split
struct	xl_btree_delete
struct	xl_btree_reuse_page
struct	xl_btree_vacuum
struct	xl_btree_delete_page
struct	xl_btree_newroot
struct	BTStackData
struct	BTScanPosItem
struct	BTScanPosData
struct	BTArrayKeyInfo
struct	BTScanOpaqueData
Defines
#define	BTP_LEAF (1 << 0)
#define	BTP_ROOT (1 << 1)
#define	BTP_DELETED (1 << 2)
#define	BTP_META (1 << 3)
#define	BTP_HALF_DEAD (1 << 4)
#define	BTP_SPLIT_END (1 << 5)
#define	BTP_HAS_GARBAGE (1 << 6)
#define	MAX_BT_CYCLE_ID 0xFF7F
#define	BTPageGetMeta(p) ((BTMetaPageData *) PageGetContents(p))
#define	BTREE_METAPAGE 0
#define	BTREE_MAGIC 0x053162
#define	BTREE_VERSION 2
#define	BTMaxItemSize(page)
#define	BTREE_MIN_FILLFACTOR 10
#define	BTREE_DEFAULT_FILLFACTOR 90
#define	BTREE_NONLEAF_FILLFACTOR 70
#define	BTTidSame(i1, i2)
#define	BTEntrySame(i1, i2) BTTidSame((i1)->t_tid, (i2)->t_tid)
#define	P_NONE 0
#define	P_LEFTMOST(opaque) ((opaque)->btpo_prev == P_NONE)
#define	P_RIGHTMOST(opaque) ((opaque)->btpo_next == P_NONE)
#define	P_ISLEAF(opaque) ((opaque)->btpo_flags & BTP_LEAF)
#define	P_ISROOT(opaque) ((opaque)->btpo_flags & BTP_ROOT)
#define	P_ISDELETED(opaque) ((opaque)->btpo_flags & BTP_DELETED)
#define	P_ISHALFDEAD(opaque) ((opaque)->btpo_flags & BTP_HALF_DEAD)
#define	P_IGNORE(opaque) ((opaque)->btpo_flags & (BTP_DELETED\|BTP_HALF_DEAD))
#define	P_HAS_GARBAGE(opaque) ((opaque)->btpo_flags & BTP_HAS_GARBAGE)
#define	P_HIKEY ((OffsetNumber) 1)
#define	P_FIRSTKEY ((OffsetNumber) 2)
#define	P_FIRSTDATAKEY(opaque) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)
#define	XLOG_BTREE_INSERT_LEAF 0x00
#define	XLOG_BTREE_INSERT_UPPER 0x10
#define	XLOG_BTREE_INSERT_META 0x20
#define	XLOG_BTREE_SPLIT_L 0x30
#define	XLOG_BTREE_SPLIT_R 0x40
#define	XLOG_BTREE_SPLIT_L_ROOT 0x50
#define	XLOG_BTREE_SPLIT_R_ROOT 0x60
#define	XLOG_BTREE_DELETE 0x70
#define	XLOG_BTREE_DELETE_PAGE 0x80
#define	XLOG_BTREE_DELETE_PAGE_META 0x90
#define	XLOG_BTREE_NEWROOT 0xA0
#define	XLOG_BTREE_DELETE_PAGE_HALF 0xB0
#define	XLOG_BTREE_VACUUM 0xC0
#define	XLOG_BTREE_REUSE_PAGE 0xD0
#define	SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)
#define	SizeOfBtreeSplit (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))
#define	SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int))
#define	SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
#define	SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
#define	SizeOfBtreeDeletePage (offsetof(xl_btree_delete_page, btpo_xact) + sizeof(TransactionId))
#define	SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))
#define	BTCommuteStrategyNumber(strat) (BTMaxStrategyNumber + 1 - (strat))
#define	BTORDER_PROC 1
#define	BTSORTSUPPORT_PROC 2
#define	BT_READ BUFFER_LOCK_SHARE
#define	BT_WRITE BUFFER_LOCK_EXCLUSIVE
#define	BTScanPosIsValid(scanpos) BufferIsValid((scanpos).buf)
#define	SK_BT_REQFWD 0x00010000
#define	SK_BT_REQBKWD 0x00020000
#define	SK_BT_INDOPTION_SHIFT 24
#define	SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define	SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
Typedefs
typedef uint16	BTCycleId
typedef struct BTPageOpaqueData	BTPageOpaqueData
typedef BTPageOpaqueData *	BTPageOpaque
typedef struct BTMetaPageData	BTMetaPageData
typedef struct xl_btreetid	xl_btreetid
typedef struct xl_btree_metadata	xl_btree_metadata
typedef struct xl_btree_insert	xl_btree_insert
typedef struct xl_btree_split	xl_btree_split
typedef struct xl_btree_delete	xl_btree_delete
typedef struct xl_btree_reuse_page	xl_btree_reuse_page
typedef struct xl_btree_vacuum	xl_btree_vacuum
typedef struct xl_btree_delete_page	xl_btree_delete_page
typedef struct xl_btree_newroot	xl_btree_newroot
typedef struct BTStackData	BTStackData
typedef BTStackData *	BTStack
typedef struct BTScanPosItem	BTScanPosItem
typedef struct BTScanPosData	BTScanPosData
typedef BTScanPosData *	BTScanPos
typedef struct BTArrayKeyInfo	BTArrayKeyInfo
typedef struct BTScanOpaqueData	BTScanOpaqueData
typedef BTScanOpaqueData *	BTScanOpaque
typedef struct BTSpool	BTSpool
Functions
Datum	btbuild (PG_FUNCTION_ARGS)
Datum	btbuildempty (PG_FUNCTION_ARGS)
Datum	btinsert (PG_FUNCTION_ARGS)
Datum	btbeginscan (PG_FUNCTION_ARGS)
Datum	btgettuple (PG_FUNCTION_ARGS)
Datum	btgetbitmap (PG_FUNCTION_ARGS)
Datum	btrescan (PG_FUNCTION_ARGS)
Datum	btendscan (PG_FUNCTION_ARGS)
Datum	btmarkpos (PG_FUNCTION_ARGS)
Datum	btrestrpos (PG_FUNCTION_ARGS)
Datum	btbulkdelete (PG_FUNCTION_ARGS)
Datum	btvacuumcleanup (PG_FUNCTION_ARGS)
Datum	btcanreturn (PG_FUNCTION_ARGS)
Datum	btoptions (PG_FUNCTION_ARGS)
bool	_bt_doinsert (Relation rel, IndexTuple itup, IndexUniqueCheck checkUnique, Relation heapRel)
Buffer	_bt_getstackbuf (Relation rel, BTStack stack, int access)
void	_bt_insert_parent (Relation rel, Buffer buf, Buffer rbuf, BTStack stack, bool is_root, bool is_only)
void	_bt_initmetapage (Page page, BlockNumber rootbknum, uint32 level)
Buffer	_bt_getroot (Relation rel, int access)
Buffer	_bt_gettrueroot (Relation rel)
int	_bt_getrootheight (Relation rel)
void	_bt_checkpage (Relation rel, Buffer buf)
Buffer	_bt_getbuf (Relation rel, BlockNumber blkno, int access)
Buffer	_bt_relandgetbuf (Relation rel, Buffer obuf, BlockNumber blkno, int access)
void	_bt_relbuf (Relation rel, Buffer buf)
void	_bt_pageinit (Page page, Size size)
bool	_bt_page_recyclable (Page page)
void	_bt_delitems_delete (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, Relation heapRel)
void	_bt_delitems_vacuum (Relation rel, Buffer buf, OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed)
int	_bt_pagedel (Relation rel, Buffer buf, BTStack stack)
BTStack	_bt_search (Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access)
Buffer	_bt_moveright (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey, int access)
OffsetNumber	_bt_binsrch (Relation rel, Buffer buf, int keysz, ScanKey scankey, bool nextkey)
int32	_bt_compare (Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum)
bool	_bt_first (IndexScanDesc scan, ScanDirection dir)
bool	_bt_next (IndexScanDesc scan, ScanDirection dir)
Buffer	_bt_get_endpoint (Relation rel, uint32 level, bool rightmost)
ScanKey	_bt_mkscankey (Relation rel, IndexTuple itup)
ScanKey	_bt_mkscankey_nodata (Relation rel)
void	_bt_freeskey (ScanKey skey)
void	_bt_freestack (BTStack stack)
void	_bt_preprocess_array_keys (IndexScanDesc scan)
void	_bt_start_array_keys (IndexScanDesc scan, ScanDirection dir)
bool	_bt_advance_array_keys (IndexScanDesc scan, ScanDirection dir)
void	_bt_mark_array_keys (IndexScanDesc scan)
void	_bt_restore_array_keys (IndexScanDesc scan)
void	_bt_preprocess_keys (IndexScanDesc scan)
IndexTuple	_bt_checkkeys (IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan)
void	_bt_killitems (IndexScanDesc scan, bool haveLock)
BTCycleId	_bt_vacuum_cycleid (Relation rel)
BTCycleId	_bt_start_vacuum (Relation rel)
void	_bt_end_vacuum (Relation rel)
void	_bt_end_vacuum_callback (int code, Datum arg)
Size	BTreeShmemSize (void)
void	BTreeShmemInit (void)
BTSpool *	_bt_spoolinit (Relation heap, Relation index, bool isunique, bool isdead)
void	_bt_spooldestroy (BTSpool *btspool)
void	_bt_spool (IndexTuple itup, BTSpool *btspool)
void	_bt_leafbuild (BTSpool btspool, BTSpool spool2)
void	btree_redo (XLogRecPtr lsn, XLogRecord *record)
void	btree_desc (StringInfo buf, uint8 xl_info, char *rec)
void	btree_xlog_startup (void)
void	btree_xlog_cleanup (void)
bool	btree_safe_restartpoint (void)

Define Documentation

#define BT_READ BUFFER_LOCK_SHARE

Definition at line 438 of file nbtree.h.

Referenced by _bt_check_unique(), _bt_first(), _bt_get_endpoint(), _bt_getroot(), _bt_getrootheight(), _bt_gettrueroot(), _bt_killitems(), _bt_next(), _bt_pagedel(), _bt_parent_deletion_safe(), _bt_search(), _bt_steppage(), _bt_walk_left(), btvacuumpage(), and pgstat_btree_page().

#define BT_WRITE BUFFER_LOCK_EXCLUSIVE

Definition at line 439 of file nbtree.h.

Referenced by _bt_doinsert(), _bt_findinsertloc(), _bt_getbuf(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_newroot(), _bt_pagedel(), and _bt_split().

#define BTCommuteStrategyNumber ( strat ) (BTMaxStrategyNumber + 1 - (strat))

Definition at line 416 of file nbtree.h.

Referenced by _bt_compare_scankey_args(), and _bt_fix_scankey_strategy().

#define BTEntrySame	(	i1,
		i2
	)		BTTidSame((i1)->t_tid, (i2)->t_tid)

Definition at line 155 of file nbtree.h.

Referenced by _bt_getstackbuf().

#define BTMaxItemSize ( page )

Value:

MAXALIGN_DOWN((PageGetPageSize(page) - \
                   MAXALIGN(SizeOfPageHeaderData + 3*sizeof(ItemIdData)) - \
                   MAXALIGN(sizeof(BTPageOpaqueData))) / 3)

Definition at line 117 of file nbtree.h.

Referenced by _bt_buildadd(), and _bt_findinsertloc().

#define BTORDER_PROC 1

Definition at line 430 of file nbtree.h.

Referenced by _bt_first(), _bt_mkscankey(), _bt_mkscankey_nodata(), _bt_sort_array_elements(), assignProcTypes(), ExecIndexBuildScanKeys(), ExecInitExpr(), get_sort_function_for_ordering_op(), load_rangetype_info(), lookup_type_cache(), and MJExamineQuals().

#define BTP_DELETED (1 << 2)

Definition at line 71 of file nbtree.h.

Referenced by pgstat_btree_page().

#define BTP_HALF_DEAD (1 << 4)

Definition at line 73 of file nbtree.h.

Referenced by pgstat_btree_page().

#define BTP_HAS_GARBAGE (1 << 6)

Definition at line 75 of file nbtree.h.

#define BTP_LEAF (1 << 0)

Definition at line 69 of file nbtree.h.

Referenced by _bt_blnewpage(), _bt_getroot(), and btree_xlog_split().

#define BTP_META (1 << 3)

Definition at line 72 of file nbtree.h.

Referenced by _bt_getroot(), _bt_getrootheight(), and _bt_gettrueroot().

#define BTP_ROOT (1 << 1)

Definition at line 70 of file nbtree.h.

Referenced by _bt_split().

#define BTP_SPLIT_END (1 << 5)

Definition at line 74 of file nbtree.h.

Referenced by _bt_split(), and btvacuumpage().

#define BTPageGetMeta ( p ) ((BTMetaPageData *) PageGetContents(p))

Definition at line 104 of file nbtree.h.

Referenced by _bt_getroot(), _bt_getrootheight(), _bt_gettrueroot(), _bt_initmetapage(), _bt_insertonpg(), _bt_newroot(), _bt_pagedel(), _bt_restore_meta(), bt_metap(), and pgstatindex().

#define BTREE_DEFAULT_FILLFACTOR 90

Definition at line 130 of file nbtree.h.

Referenced by _bt_findsplitloc(), and _bt_pagestate().

#define BTREE_MAGIC 0x053162

Definition at line 108 of file nbtree.h.

Referenced by _bt_getroot(), _bt_getrootheight(), and _bt_gettrueroot().

#define BTREE_METAPAGE 0

Definition at line 107 of file nbtree.h.

Referenced by _bt_getroot(), _bt_getrootheight(), _bt_gettrueroot(), _bt_insertonpg(), _bt_leafbuild(), _bt_newroot(), _bt_pagedel(), _bt_restore_meta(), _bt_uppershutdown(), btbuildempty(), btvacuumscan(), and pgstat_relation().

#define BTREE_MIN_FILLFACTOR 10

Definition at line 129 of file nbtree.h.

#define BTREE_NONLEAF_FILLFACTOR 70

Definition at line 131 of file nbtree.h.

#define BTREE_VERSION 2

Definition at line 109 of file nbtree.h.

Referenced by _bt_getroot(), _bt_getrootheight(), and _bt_gettrueroot().

#define BTScanPosIsValid ( scanpos ) BufferIsValid((scanpos).buf)

Definition at line 531 of file nbtree.h.

Referenced by btendscan(), btgettuple(), btmarkpos(), btrescan(), and btrestrpos().

#define BTSORTSUPPORT_PROC 2

Definition at line 431 of file nbtree.h.

Referenced by assignProcTypes(), get_sort_function_for_ordering_op(), and MJExamineQuals().

#define BTTidSame	(	i1,
		i2
	)

Value:

( (i1).ip_blkid.bi_hi == (i2).ip_blkid.bi_hi && \
      (i1).ip_blkid.bi_lo == (i2).ip_blkid.bi_lo && \
      (i1).ip_posid == (i2).ip_posid )

Definition at line 151 of file nbtree.h.

#define MAX_BT_CYCLE_ID 0xFF7F

Definition at line 84 of file nbtree.h.

Referenced by _bt_start_vacuum().

#define P_FIRSTDATAKEY ( opaque ) (P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY)

#define P_FIRSTKEY ((OffsetNumber) 2)

Definition at line 201 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_newroot(), _bt_slideleft(), and btree_xlog_newroot().

#define P_HAS_GARBAGE ( opaque ) ((opaque)->btpo_flags & BTP_HAS_GARBAGE)

Definition at line 180 of file nbtree.h.

Referenced by _bt_findinsertloc().

#define P_HIKEY ((OffsetNumber) 1)

Definition at line 200 of file nbtree.h.

Referenced by _bt_buildadd(), _bt_check_unique(), _bt_findinsertloc(), _bt_findsplitloc(), _bt_insert_parent(), _bt_insertonpg(), _bt_moveright(), _bt_newroot(), _bt_pagedel(), _bt_parent_deletion_safe(), _bt_slideleft(), _bt_split(), _bt_uppershutdown(), btree_xlog_delete_page(), btree_xlog_newroot(), and btree_xlog_split().

#define P_IGNORE ( opaque ) ((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))

Definition at line 179 of file nbtree.h.

Referenced by _bt_check_unique(), _bt_findinsertloc(), _bt_get_endpoint(), _bt_getroot(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_moveright(), _bt_steppage(), btvacuumpage(), GetBTPageStatistics(), and pgstatindex().

#define P_ISDELETED ( opaque ) ((opaque)->btpo_flags & BTP_DELETED)

Definition at line 177 of file nbtree.h.

Referenced by _bt_page_recyclable(), _bt_pagedel(), _bt_walk_left(), bt_page_items(), btvacuumpage(), GetBTPageStatistics(), and pgstatindex().

#define P_ISHALFDEAD ( opaque ) ((opaque)->btpo_flags & BTP_HALF_DEAD)

Definition at line 178 of file nbtree.h.

Referenced by _bt_pagedel(), and btvacuumpage().

#define P_ISLEAF ( opaque ) ((opaque)->btpo_flags & BTP_LEAF)

Definition at line 175 of file nbtree.h.

Referenced by _bt_binsrch(), _bt_compare(), _bt_endpoint(), _bt_findinsertloc(), _bt_findsplitloc(), _bt_insertonpg(), _bt_isequal(), _bt_pgaddtup(), _bt_search(), _bt_sortaddtup(), btvacuumpage(), GetBTPageStatistics(), pgstat_btree_page(), and pgstatindex().

#define P_ISROOT ( opaque ) ((opaque)->btpo_flags & BTP_ROOT)

Definition at line 176 of file nbtree.h.

Referenced by _bt_insertonpg(), _bt_pagedel(), _bt_parent_deletion_safe(), _bt_split(), GetBTPageStatistics(), and pgstatindex().

#define P_LEFTMOST ( opaque ) ((opaque)->btpo_prev == P_NONE)

Definition at line 173 of file nbtree.h.

Referenced by _bt_getroot(), _bt_insertonpg(), _bt_walk_left(), and btree_xlog_cleanup().

#define P_NONE 0

Definition at line 167 of file nbtree.h.

Referenced by _bt_get_endpoint(), _bt_getroot(), _bt_getrootheight(), _bt_gettrueroot(), _bt_pagedel(), _bt_steppage(), btbuildempty(), btree_xlog_delete_page(), btree_xlog_split(), btvacuumpage(), and pgstatindex().

#define P_RIGHTMOST ( opaque ) ((opaque)->btpo_next == P_NONE)

Definition at line 174 of file nbtree.h.

Referenced by _bt_check_unique(), _bt_endpoint(), _bt_findinsertloc(), _bt_findsplitloc(), _bt_get_endpoint(), _bt_getroot(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_moveright(), _bt_pagedel(), _bt_parent_deletion_safe(), _bt_split(), _bt_walk_left(), btree_xlog_cleanup(), btree_xlog_split(), and btvacuumpage().

#define SizeOfBtreeDelete (offsetof(xl_btree_delete, nitems) + sizeof(int))

Definition at line 326 of file nbtree.h.

Referenced by btree_xlog_delete().

#define SizeOfBtreeDeletePage (offsetof(xl_btree_delete_page, btpo_xact) + sizeof(TransactionId))

Definition at line 389 of file nbtree.h.

Referenced by btree_xlog_delete_page().

#define SizeOfBtreeInsert (offsetof(xl_btreetid, tid) + SizeOfIptrData)

Definition at line 262 of file nbtree.h.

Referenced by btree_xlog_insert().

#define SizeOfBtreeNewroot (offsetof(xl_btree_newroot, level) + sizeof(uint32))

Definition at line 406 of file nbtree.h.

Referenced by btree_xlog_newroot().

#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))

Definition at line 338 of file nbtree.h.

#define SizeOfBtreeSplit (offsetof(xl_btree_split, firstright) + sizeof(OffsetNumber))

Definition at line 308 of file nbtree.h.

Referenced by btree_xlog_split().

#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))

Definition at line 370 of file nbtree.h.

Referenced by btree_xlog_vacuum().

#define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)

Definition at line 593 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_compare(), _bt_compare_scankey_args(), _bt_first(), _bt_fix_scankey_strategy(), _bt_load(), inlineApplySortFunction(), and reversedirection_index_btree().

#define SK_BT_INDOPTION_SHIFT 24

Definition at line 592 of file nbtree.h.

Referenced by _bt_mkscankey().

#define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)

Definition at line 594 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), _bt_compare(), _bt_compare_scankey_args(), _bt_first(), _bt_fix_scankey_strategy(), _bt_load(), and inlineApplySortFunction().

#define SK_BT_REQBKWD 0x00020000

Definition at line 591 of file nbtree.h.

Referenced by _bt_check_rowcompare(), and _bt_checkkeys().

#define SK_BT_REQFWD 0x00010000

Definition at line 590 of file nbtree.h.

Referenced by _bt_check_rowcompare(), _bt_checkkeys(), and _bt_mark_scankey_required().

#define XLOG_BTREE_DELETE 0x70

Definition at line 217 of file nbtree.h.

Referenced by _bt_delitems_delete(), btree_desc(), and btree_redo().

#define XLOG_BTREE_DELETE_PAGE 0x80

Definition at line 218 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_DELETE_PAGE_HALF 0xB0

Definition at line 221 of file nbtree.h.

Referenced by btree_desc(), btree_redo(), and btree_xlog_delete_page().

#define XLOG_BTREE_DELETE_PAGE_META 0x90

Definition at line 219 of file nbtree.h.

Referenced by btree_desc(), btree_redo(), and btree_xlog_delete_page().

#define XLOG_BTREE_INSERT_LEAF 0x00

Definition at line 210 of file nbtree.h.

Referenced by _bt_insertonpg(), btree_desc(), and btree_redo().

#define XLOG_BTREE_INSERT_META 0x20

Definition at line 212 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_INSERT_UPPER 0x10

Definition at line 211 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_NEWROOT 0xA0

Definition at line 220 of file nbtree.h.

Referenced by _bt_getroot(), _bt_newroot(), btree_desc(), and btree_redo().

#define XLOG_BTREE_REUSE_PAGE 0xD0

Definition at line 225 of file nbtree.h.

Referenced by _bt_log_reuse_page(), btree_desc(), and btree_redo().

#define XLOG_BTREE_SPLIT_L 0x30

Definition at line 213 of file nbtree.h.

Referenced by _bt_split(), btree_desc(), and btree_redo().

#define XLOG_BTREE_SPLIT_L_ROOT 0x50

Definition at line 215 of file nbtree.h.

Referenced by _bt_split(), btree_desc(), and btree_redo().

#define XLOG_BTREE_SPLIT_R 0x40

Definition at line 214 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_SPLIT_R_ROOT 0x60

Definition at line 216 of file nbtree.h.

Referenced by btree_desc(), and btree_redo().

#define XLOG_BTREE_VACUUM 0xC0

Definition at line 223 of file nbtree.h.

Referenced by _bt_delitems_vacuum(), btree_desc(), and btree_redo().

Typedef Documentation

typedef struct BTArrayKeyInfo BTArrayKeyInfo

typedef uint16 BTCycleId

Definition at line 25 of file nbtree.h.

typedef struct BTMetaPageData BTMetaPageData

typedef BTPageOpaqueData* BTPageOpaque

Definition at line 66 of file nbtree.h.

typedef struct BTPageOpaqueData BTPageOpaqueData

typedef BTScanOpaqueData* BTScanOpaque

Definition at line 583 of file nbtree.h.

typedef struct BTScanOpaqueData BTScanOpaqueData

typedef BTScanPosData* BTScanPos

Definition at line 529 of file nbtree.h.

typedef struct BTScanPosData BTScanPosData

typedef struct BTScanPosItem BTScanPosItem

typedef struct BTSpool BTSpool

Definition at line 687 of file nbtree.h.

typedef BTStackData* BTStack

Definition at line 459 of file nbtree.h.

typedef struct BTStackData BTStackData

typedef struct xl_btree_delete xl_btree_delete

typedef struct xl_btree_delete_page xl_btree_delete_page

typedef struct xl_btree_insert xl_btree_insert

typedef struct xl_btree_metadata xl_btree_metadata

typedef struct xl_btree_newroot xl_btree_newroot

typedef struct xl_btree_reuse_page xl_btree_reuse_page

typedef struct xl_btree_split xl_btree_split

typedef struct xl_btree_vacuum xl_btree_vacuum

typedef struct xl_btreetid xl_btreetid

Function Documentation

bool _bt_advance_array_keys	(	IndexScanDesc	scan,
		ScanDirection	dir
	)

Definition at line 549 of file nbtutils.c.

References BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        found = false;
    int         i;

    /*
     * We must advance the last array key most quickly, since it will
     * correspond to the lowest-order index column among the available
     * qualifications. This is necessary to ensure correct ordering of output
     * when there are multiple array keys.
     */
    for (i = so->numArrayKeys - 1; i >= 0; i--)
    {
        BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
        ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
        int         cur_elem = curArrayKey->cur_elem;
        int         num_elems = curArrayKey->num_elems;

        if (ScanDirectionIsBackward(dir))
        {
            if (--cur_elem < 0)
            {
                cur_elem = num_elems - 1;
                found = false;  /* need to advance next array key */
            }
            else
                found = true;
        }
        else
        {
            if (++cur_elem >= num_elems)
            {
                cur_elem = 0;
                found = false;  /* need to advance next array key */
            }
            else
                found = true;
        }

        curArrayKey->cur_elem = cur_elem;
        skey->sk_argument = curArrayKey->elem_values[cur_elem];
        if (found)
            break;
    }

    return found;
}

OffsetNumber _bt_binsrch	(	Relation	rel,
		Buffer	buf,
		int	keysz,
		ScanKey	scankey,
		bool	nextkey
	)

Definition at line 234 of file nbtsearch.c.

References _bt_compare(), Assert, BufferGetPage, OffsetNumberPrev, P_FIRSTDATAKEY, P_ISLEAF, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_doinsert(), _bt_findinsertloc(), _bt_first(), and _bt_search().

{
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber low,
                high;
    int32       result,
                cmpval;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    low = P_FIRSTDATAKEY(opaque);
    high = PageGetMaxOffsetNumber(page);

    /*
     * If there are no keys on the page, return the first available slot. Note
     * this covers two cases: the page is really empty (no keys), or it
     * contains only a high key.  The latter case is possible after vacuuming.
     * This can never happen on an internal page, however, since they are
     * never empty (an internal page must have children).
     */
    if (high < low)
        return low;

    /*
     * Binary search to find the first key on the page >= scan key, or first
     * key > scankey when nextkey is true.
     *
     * For nextkey=false (cmpval=1), the loop invariant is: all slots before
     * 'low' are < scan key, all slots at or after 'high' are >= scan key.
     *
     * For nextkey=true (cmpval=0), the loop invariant is: all slots before
     * 'low' are <= scan key, all slots at or after 'high' are > scan key.
     *
     * We can fall out when high == low.
     */
    high++;                     /* establish the loop invariant for high */

    cmpval = nextkey ? 0 : 1;   /* select comparison value */

    while (high > low)
    {
        OffsetNumber mid = low + ((high - low) / 2);

        /* We have low <= mid < high, so mid points at a real slot */

        result = _bt_compare(rel, keysz, scankey, page, mid);

        if (result >= cmpval)
            low = mid + 1;
        else
            high = mid;
    }

    /*
     * At this point we have high == low, but be careful: they could point
     * past the last slot on the page.
     *
     * On a leaf page, we always return the first key >= scan key (resp. >
     * scan key), which could be the last slot + 1.
     */
    if (P_ISLEAF(opaque))
        return low;

    /*
     * On a non-leaf page, return the last key < scan key (resp. <= scan key).
     * There must be one if _bt_compare() is playing by the rules.
     */
    Assert(low > P_FIRSTDATAKEY(opaque));

    return OffsetNumberPrev(low);
}

IndexTuple _bt_checkkeys	(	IndexScanDesc	scan,
		Page	page,
		OffsetNumber	offnum,
		ScanDirection	dir,
		bool *	continuescan
	)

Definition at line 1372 of file nbtutils.c.

References _bt_check_rowcompare(), Assert, DatumGetBool, FunctionCall2Coll(), IndexScanDescData::ignore_killed_tuples, index_getattr, IndexScanDescData::indexRelation, ItemIdIsDead, BTScanOpaqueData::keyData, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetDescr, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_NULLS_FIRST, SK_BT_REQBKWD, SK_BT_REQFWD, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHNOTNULL, and SK_SEARCHNULL.

Referenced by _bt_readpage().

{
    ItemId      iid = PageGetItemId(page, offnum);
    bool        tuple_alive;
    IndexTuple  tuple;
    TupleDesc   tupdesc;
    BTScanOpaque so;
    int         keysz;
    int         ikey;
    ScanKey     key;

    *continuescan = true;       /* default assumption */

    /*
     * If the scan specifies not to return killed tuples, then we treat a
     * killed tuple as not passing the qual.  Most of the time, it's a win to
     * not bother examining the tuple's index keys, but just return
     * immediately with continuescan = true to proceed to the next tuple.
     * However, if this is the last tuple on the page, we should check the
     * index keys to prevent uselessly advancing to the next page.
     */
    if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
    {
        /* return immediately if there are more tuples on the page */
        if (ScanDirectionIsForward(dir))
        {
            if (offnum < PageGetMaxOffsetNumber(page))
                return NULL;
        }
        else
        {
            BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

            if (offnum > P_FIRSTDATAKEY(opaque))
                return NULL;
        }

        /*
         * OK, we want to check the keys so we can set continuescan correctly,
         * but we'll return NULL even if the tuple passes the key tests.
         */
        tuple_alive = false;
    }
    else
        tuple_alive = true;

    tuple = (IndexTuple) PageGetItem(page, iid);

    tupdesc = RelationGetDescr(scan->indexRelation);
    so = (BTScanOpaque) scan->opaque;
    keysz = so->numberOfKeys;

    for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++)
    {
        Datum       datum;
        bool        isNull;
        Datum       test;

        /* row-comparison keys need special processing */
        if (key->sk_flags & SK_ROW_HEADER)
        {
            if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
                continue;
            return NULL;
        }

        datum = index_getattr(tuple,
                              key->sk_attno,
                              tupdesc,
                              &isNull);

        if (key->sk_flags & SK_ISNULL)
        {
            /* Handle IS NULL/NOT NULL tests */
            if (key->sk_flags & SK_SEARCHNULL)
            {
                if (isNull)
                    continue;   /* tuple satisfies this qual */
            }
            else
            {
                Assert(key->sk_flags & SK_SEARCHNOTNULL);
                if (!isNull)
                    continue;   /* tuple satisfies this qual */
            }

            /*
             * Tuple fails this qual.  If it's a required qual for the current
             * scan direction, then we can conclude no further tuples will
             * pass, either.
             */
            if ((key->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((key->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return NULL;
        }

        if (isNull)
        {
            if (key->sk_flags & SK_BT_NULLS_FIRST)
            {
                /*
                 * Since NULLs are sorted before non-NULLs, we know we have
                 * reached the lower limit of the range of values for this
                 * index attr.  On a backward scan, we can stop if this qual
                 * is one of the "must match" subset.  We can stop regardless
                 * of whether the qual is > or <, so long as it's required,
                 * because it's not possible for any future tuples to pass. On
                 * a forward scan, however, we must keep going, because we may
                 * have initially positioned to the start of the index.
                 */
                if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
                    ScanDirectionIsBackward(dir))
                    *continuescan = false;
            }
            else
            {
                /*
                 * Since NULLs are sorted after non-NULLs, we know we have
                 * reached the upper limit of the range of values for this
                 * index attr.  On a forward scan, we can stop if this qual is
                 * one of the "must match" subset.  We can stop regardless of
                 * whether the qual is > or <, so long as it's required,
                 * because it's not possible for any future tuples to pass. On
                 * a backward scan, however, we must keep going, because we
                 * may have initially positioned to the end of the index.
                 */
                if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) &&
                    ScanDirectionIsForward(dir))
                    *continuescan = false;
            }

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return NULL;
        }

        test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
                                 datum, key->sk_argument);

        if (!DatumGetBool(test))
        {
            /*
             * Tuple fails this qual.  If it's a required qual for the current
             * scan direction, then we can conclude no further tuples will
             * pass, either.
             *
             * Note: because we stop the scan as soon as any required equality
             * qual fails, it is critical that equality quals be used for the
             * initial positioning in _bt_first() when they are available. See
             * comments in _bt_first().
             */
            if ((key->sk_flags & SK_BT_REQFWD) &&
                ScanDirectionIsForward(dir))
                *continuescan = false;
            else if ((key->sk_flags & SK_BT_REQBKWD) &&
                     ScanDirectionIsBackward(dir))
                *continuescan = false;

            /*
             * In any case, this indextuple doesn't match the qual.
             */
            return NULL;
        }
    }

    /* Check for failure due to it being a killed tuple. */
    if (!tuple_alive)
        return NULL;

    /* If we get here, the tuple passes all index quals. */
    return tuple;
}

void _bt_checkpage	(	Relation	rel,
		Buffer	buf
	)

Definition at line 492 of file nbtpage.c.

References BufferGetBlockNumber(), BufferGetPage, ereport, errcode(), errhint(), errmsg(), ERROR, MAXALIGN, PageGetSpecialSize, PageIsNew, and RelationGetRelationName.

Referenced by _bt_getbuf(), _bt_relandgetbuf(), and btvacuumpage().

{
    Page        page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
             errmsg("index \"%s\" contains unexpected zero page at block %u",
                    RelationGetRelationName(rel),
                    BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

int32 _bt_compare	(	Relation	rel,
		int	keysz,
		ScanKey	scankey,
		Page	page,
		OffsetNumber	offnum
	)

Definition at line 339 of file nbtsearch.c.

References DatumGetInt32, FunctionCall2Coll(), i, index_getattr, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, RelationGetDescr, ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, ScanKeyData::sk_func, and SK_ISNULL.

Referenced by _bt_binsrch(), _bt_findinsertloc(), and _bt_moveright().

{
    TupleDesc   itupdesc = RelationGetDescr(rel);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    IndexTuple  itup;
    int         i;

    /*
     * Force result ">" if target item is first data item on an internal page
     * --- see NOTE above.
     */
    if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
        return 1;

    itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

    /*
     * The scan key is set up with the attribute number associated with each
     * term in the key.  It is important that, if the index is multi-key, the
     * scan contain the first k key attributes, and that they be in order.  If
     * you think about how multi-key ordering works, you'll understand why
     * this is.
     *
     * We don't test for violation of this condition here, however.  The
     * initial setup for the index scan had better have gotten it right (see
     * _bt_first).
     */

    for (i = 1; i <= keysz; i++)
    {
        Datum       datum;
        bool        isNull;
        int32       result;

        datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull);

        /* see comments about NULLs handling in btbuild */
        if (scankey->sk_flags & SK_ISNULL)      /* key is NULL */
        {
            if (isNull)
                result = 0;     /* NULL "=" NULL */
            else if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = -1;    /* NULL "<" NOT_NULL */
            else
                result = 1;     /* NULL ">" NOT_NULL */
        }
        else if (isNull)        /* key is NOT_NULL and item is NULL */
        {
            if (scankey->sk_flags & SK_BT_NULLS_FIRST)
                result = 1;     /* NOT_NULL ">" NULL */
            else
                result = -1;    /* NOT_NULL "<" NULL */
        }
        else
        {
            /*
             * The sk_func needs to be passed the index value as left arg and
             * the sk_argument as right arg (they might be of different
             * types).  Since it is convenient for callers to think of
             * _bt_compare as comparing the scankey to the index item, we have
             * to flip the sign of the comparison result.  (Unless it's a DESC
             * column, in which case we *don't* flip the sign.)
             */
            result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func,
                                                     scankey->sk_collation,
                                                     datum,
                                                     scankey->sk_argument));

            if (!(scankey->sk_flags & SK_BT_DESC))
                result = -result;
        }

        /* if the keys are unequal, return the difference */
        if (result != 0)
            return result;

        scankey++;
    }

    /* if we get here, the keys are equal */
    return 0;
}

void _bt_delitems_delete	(	Relation	rel,
		Buffer	buf,
		OffsetNumber *	itemnos,
		int	nitems,
		Relation	heapRel
	)

Definition at line 882 of file nbtpage.c.

References Assert, xl_btree_delete::block, BTPageOpaqueData::btpo_flags, XLogRecData::buffer, XLogRecData::buffer_std, BufferGetBlockNumber(), BufferGetPage, XLogRecData::data, END_CRIT_SECTION, xl_btree_delete::hnode, XLogRecData::len, MarkBufferDirty(), XLogRecData::next, xl_btree_delete::nitems, xl_btree_delete::node, PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, RelationData::rd_node, RelationNeedsWAL, START_CRIT_SECTION, XLOG_BTREE_DELETE, and XLogInsert().

Referenced by _bt_vacuum_one_page().

{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;

    /* Shouldn't be called unless there's something to do */
    Assert(nitems > 0);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    PageIndexMultiDelete(page, itemnos, nitems);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
     * because this is not called by VACUUM.
     */

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't included in our target-item list), but it will almost always be
     * true and it doesn't seem worth an additional page scan to check it.
     * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        XLogRecData rdata[3];
        xl_btree_delete xlrec_delete;

        xlrec_delete.node = rel->rd_node;
        xlrec_delete.hnode = heapRel->rd_node;
        xlrec_delete.block = BufferGetBlockNumber(buf);
        xlrec_delete.nitems = nitems;

        rdata[0].data = (char *) &xlrec_delete;
        rdata[0].len = SizeOfBtreeDelete;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);

        /*
         * We need the target-offsets array whether or not we store the whole
         * buffer, to allow us to find the latestRemovedXid on a standby
         * server.
         */
        rdata[1].data = (char *) itemnos;
        rdata[1].len = nitems * sizeof(OffsetNumber);
        rdata[1].buffer = InvalidBuffer;
        rdata[1].next = &(rdata[2]);

        rdata[2].data = NULL;
        rdata[2].len = 0;
        rdata[2].buffer = buf;
        rdata[2].buffer_std = true;
        rdata[2].next = NULL;

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

void _bt_delitems_vacuum	(	Relation	rel,
		Buffer	buf,
		OffsetNumber *	itemnos,
		int	nitems,
		BlockNumber	lastBlockVacuumed
	)

Definition at line 794 of file nbtpage.c.

References xl_btree_vacuum::block, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, XLogRecData::buffer, XLogRecData::buffer_std, BufferGetBlockNumber(), BufferGetPage, XLogRecData::data, END_CRIT_SECTION, xl_btree_vacuum::lastBlockVacuumed, XLogRecData::len, MarkBufferDirty(), XLogRecData::next, xl_btree_vacuum::node, PageGetSpecialPointer, PageIndexMultiDelete(), PageSetLSN, RelationData::rd_node, RelationNeedsWAL, START_CRIT_SECTION, XLOG_BTREE_VACUUM, and XLogInsert().

Referenced by btvacuumpage(), and btvacuumscan().

{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    if (nitems > 0)
        PageIndexMultiDelete(page, itemnos, nitems);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't included in our target-item list), but it will almost always be
     * true and it doesn't seem worth an additional page scan to check it.
     * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        XLogRecData rdata[2];
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.node = rel->rd_node;
        xlrec_vacuum.block = BufferGetBlockNumber(buf);

        xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
        rdata[0].data = (char *) &xlrec_vacuum;
        rdata[0].len = SizeOfBtreeVacuum;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);

        /*
         * The target-offsets array is not in the buffer, but pretend that it
         * is.  When XLogInsert stores the whole buffer, the offsets array
         * need not be stored too.
         */
        if (nitems > 0)
        {
            rdata[1].data = (char *) itemnos;
            rdata[1].len = nitems * sizeof(OffsetNumber);
        }
        else
        {
            rdata[1].data = NULL;
            rdata[1].len = 0;
        }
        rdata[1].buffer = buf;
        rdata[1].buffer_std = true;
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}

bool _bt_doinsert	(	Relation	rel,
		IndexTuple	itup,
		IndexUniqueCheck	checkUnique,
		Relation	heapRel
	)

Definition at line 103 of file nbtinsert.c.

References _bt_binsrch(), _bt_check_unique(), _bt_findinsertloc(), _bt_freeskey(), _bt_freestack(), _bt_insertonpg(), _bt_mkscankey(), _bt_moveright(), _bt_relbuf(), _bt_search(), BT_WRITE, buf, BUFFER_LOCK_UNLOCK, CheckForSerializableConflictIn(), LockBuffer(), NULL, RelationData::rd_rel, TransactionIdIsValid, UNIQUE_CHECK_EXISTING, UNIQUE_CHECK_NO, and XactLockTableWait().

Referenced by btinsert().

{
    bool        is_unique = false;
    int         natts = rel->rd_rel->relnatts;
    ScanKey     itup_scankey;
    BTStack     stack;
    Buffer      buf;
    OffsetNumber offset;

    /* we need an insertion scan key to do our search, so build one */
    itup_scankey = _bt_mkscankey(rel, itup);

top:
    /* find the first page containing this key */
    stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE);

    offset = InvalidOffsetNumber;

    /* trade in our read lock for a write lock */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    LockBuffer(buf, BT_WRITE);

    /*
     * If the page was split between the time that we surrendered our read
     * lock and acquired our write lock, then this page may no longer be the
     * right place for the key we want to insert.  In this case, we need to
     * move right in the tree.  See Lehman and Yao for an excruciatingly
     * precise description.
     */
    buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);

    /*
     * If we're not allowing duplicates, make sure the key isn't already in
     * the index.
     *
     * NOTE: obviously, _bt_check_unique can only detect keys that are already
     * in the index; so it cannot defend against concurrent insertions of the
     * same key.  We protect against that by means of holding a write lock on
     * the target page.  Any other would-be inserter of the same key must
     * acquire a write lock on the same target page, so only one would-be
     * inserter can be making the check at one time.  Furthermore, once we are
     * past the check we hold write locks continuously until we have performed
     * our insertion, so no later inserter can fail to see our insertion.
     * (This requires some care in _bt_insertonpg.)
     *
     * If we must wait for another xact, we release the lock while waiting,
     * and then must start over completely.
     *
     * For a partial uniqueness check, we don't wait for the other xact. Just
     * let the tuple in and return false for possibly non-unique, or true for
     * definitely unique.
     */
    if (checkUnique != UNIQUE_CHECK_NO)
    {
        TransactionId xwait;

        offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
        xwait = _bt_check_unique(rel, itup, heapRel, buf, offset, itup_scankey,
                                 checkUnique, &is_unique);

        if (TransactionIdIsValid(xwait))
        {
            /* Have to wait for the other guy ... */
            _bt_relbuf(rel, buf);
            XactLockTableWait(xwait);
            /* start over... */
            _bt_freestack(stack);
            goto top;
        }
    }

    if (checkUnique != UNIQUE_CHECK_EXISTING)
    {
        /*
         * The only conflict predicate locking cares about for indexes is when
         * an index tuple insert conflicts with an existing lock.  Since the
         * actual location of the insert is hard to predict because of the
         * random search used to prevent O(N^2) performance when there are
         * many duplicate entries, we can just use the "first valid" page.
         */
        CheckForSerializableConflictIn(rel, NULL, buf);
        /* do the insertion */
        _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel);
        _bt_insertonpg(rel, buf, stack, itup, offset, false);
    }
    else
    {
        /* just release the buffer */
        _bt_relbuf(rel, buf);
    }

    /* be tidy */
    _bt_freestack(stack);
    _bt_freeskey(itup_scankey);

    return is_unique;
}

void _bt_end_vacuum ( Relation rel )

Definition at line 1939 of file nbtutils.c.

References BtreeVacuumLock, LockRelId::dbId, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by _bt_end_vacuum_callback(), and btbulkdelete().

{
    int         i;

    LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);

    /* Find the array entry */
    for (i = 0; i < btvacinfo->num_vacuums; i++)
    {
        BTOneVacInfo *vac = &btvacinfo->vacuums[i];

        if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            /* Remove it by shifting down the last entry */
            *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1];
            btvacinfo->num_vacuums--;
            break;
        }
    }

    LWLockRelease(BtreeVacuumLock);
}

void _bt_end_vacuum_callback	(	int	code,
		Datum	arg
	)

Definition at line 1967 of file nbtutils.c.

References _bt_end_vacuum(), and DatumGetPointer.

Referenced by btbulkdelete().

{
    _bt_end_vacuum((Relation) DatumGetPointer(arg));
}

bool _bt_first	(	IndexScanDesc	scan,
		ScanDirection	dir
	)

Definition at line 447 of file nbtsearch.c.

References _bt_binsrch(), _bt_endpoint(), _bt_freestack(), _bt_preprocess_keys(), _bt_readpage(), _bt_search(), _bt_steppage(), Assert, BT_READ, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTORDER_PROC, BTScanPosData::buf, buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid, cur, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, DatumGetPointer, elog, ERROR, get_opfamily_proc(), i, index_getprocinfo(), INDEX_MAX_KEYS, IndexScanDescData::indexRelation, InvalidOid, InvalidStrategy, BTScanPosData::itemIndex, BTScanPosData::items, BTScanOpaqueData::keyData, LockBuffer(), BTScanOpaqueData::markItemIndex, BTScanPosData::moreLeft, BTScanPosData::moreRight, NULL, BTScanOpaqueData::numberOfKeys, BTScanOpaqueData::numKilled, OffsetNumberPrev, IndexScanDescData::opaque, pgstat_count_index_scan, PredicateLockPage(), PredicateLockRelation(), BTScanOpaqueData::qual_ok, RelationData::rd_opcintype, RelationData::rd_opfamily, RegProcedureIsValid, RelationGetRelationName, ScanDirectionIsBackward, ScanDirectionIsForward, ScanKeyEntryInitialize(), ScanKeyEntryInitializeWithInfo(), ScanKeyData::sk_argument, ScanKeyData::sk_attno, SK_BT_DESC, SK_BT_NULLS_FIRST, ScanKeyData::sk_collation, ScanKeyData::sk_flags, SK_ISNULL, SK_ROW_END, SK_ROW_HEADER, SK_ROW_MEMBER, SK_SEARCHNOTNULL, ScanKeyData::sk_strategy, ScanKeyData::sk_subtype, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, IndexScanDescData::xs_snapshot, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

{
    Relation    rel = scan->indexRelation;
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Buffer      buf;
    BTStack     stack;
    OffsetNumber offnum;
    StrategyNumber strat;
    bool        nextkey;
    bool        goback;
    ScanKey     startKeys[INDEX_MAX_KEYS];
    ScanKeyData scankeys[INDEX_MAX_KEYS];
    ScanKeyData notnullkeys[INDEX_MAX_KEYS];
    int         keysCount = 0;
    int         i;
    StrategyNumber strat_total;
    BTScanPosItem *currItem;

    pgstat_count_index_scan(rel);

    /*
     * Examine the scan keys and eliminate any redundant keys; also mark the
     * keys that must be matched to continue the scan.
     */
    _bt_preprocess_keys(scan);

    /*
     * Quit now if _bt_preprocess_keys() discovered that the scan keys can
     * never be satisfied (eg, x == 1 AND x > 2).
     */
    if (!so->qual_ok)
        return false;

    /*----------
     * Examine the scan keys to discover where we need to start the scan.
     *
     * We want to identify the keys that can be used as starting boundaries;
     * these are =, >, or >= keys for a forward scan or =, <, <= keys for
     * a backwards scan.  We can use keys for multiple attributes so long as
     * the prior attributes had only =, >= (resp. =, <=) keys.  Once we accept
     * a > or < boundary or find an attribute with no boundary (which can be
     * thought of as the same as "> -infinity"), we can't use keys for any
     * attributes to its right, because it would break our simplistic notion
     * of what initial positioning strategy to use.
     *
     * When the scan keys include cross-type operators, _bt_preprocess_keys
     * may not be able to eliminate redundant keys; in such cases we will
     * arbitrarily pick a usable one for each attribute.  This is correct
     * but possibly not optimal behavior.  (For example, with keys like
     * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when
     * x=5 would be more efficient.)  Since the situation only arises given
     * a poorly-worded query plus an incomplete opfamily, live with it.
     *
     * When both equality and inequality keys appear for a single attribute
     * (again, only possible when cross-type operators appear), we *must*
     * select one of the equality keys for the starting point, because
     * _bt_checkkeys() will stop the scan as soon as an equality qual fails.
     * For example, if we have keys like "x >= 4 AND x = 10" and we elect to
     * start at x=4, we will fail and stop before reaching x=10.  If multiple
     * equality quals survive preprocessing, however, it doesn't matter which
     * one we use --- by definition, they are either redundant or
     * contradictory.
     *
     * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier.
     * If the index stores nulls at the end of the index we'll be starting
     * from, and we have no boundary key for the column (which means the key
     * we deduced NOT NULL from is an inequality key that constrains the other
     * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to
     * use as a boundary key.  If we didn't do this, we might find ourselves
     * traversing a lot of null entries at the start of the scan.
     *
     * In this loop, row-comparison keys are treated the same as keys on their
     * first (leftmost) columns.  We'll add on lower-order columns of the row
     * comparison below, if possible.
     *
     * The selected scan keys (at most one per index column) are remembered by
     * storing their addresses into the local startKeys[] array.
     *----------
     */
    strat_total = BTEqualStrategyNumber;
    if (so->numberOfKeys > 0)
    {
        AttrNumber  curattr;
        ScanKey     chosen;
        ScanKey     impliesNN;
        ScanKey     cur;

        /*
         * chosen is the so-far-chosen key for the current attribute, if any.
         * We don't cast the decision in stone until we reach keys for the
         * next attribute.
         */
        curattr = 1;
        chosen = NULL;
        /* Also remember any scankey that implies a NOT NULL constraint */
        impliesNN = NULL;

        /*
         * Loop iterates from 0 to numberOfKeys inclusive; we use the last
         * pass to handle after-last-key processing.  Actual exit from the
         * loop is at one of the "break" statements below.
         */
        for (cur = so->keyData, i = 0;; cur++, i++)
        {
            if (i >= so->numberOfKeys || cur->sk_attno != curattr)
            {
                /*
                 * Done looking at keys for curattr.  If we didn't find a
                 * usable boundary key, see if we can deduce a NOT NULL key.
                 */
                if (chosen == NULL && impliesNN != NULL &&
                    ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                     ScanDirectionIsForward(dir) :
                     ScanDirectionIsBackward(dir)))
                {
                    /* Yes, so build the key in notnullkeys[keysCount] */
                    chosen = &notnullkeys[keysCount];
                    ScanKeyEntryInitialize(chosen,
                                           (SK_SEARCHNOTNULL | SK_ISNULL |
                                            (impliesNN->sk_flags &
                                          (SK_BT_DESC | SK_BT_NULLS_FIRST))),
                                           curattr,
                                 ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ?
                                  BTGreaterStrategyNumber :
                                  BTLessStrategyNumber),
                                           InvalidOid,
                                           InvalidOid,
                                           InvalidOid,
                                           (Datum) 0);
                }

                /*
                 * If we still didn't find a usable boundary key, quit; else
                 * save the boundary key pointer in startKeys.
                 */
                if (chosen == NULL)
                    break;
                startKeys[keysCount++] = chosen;

                /*
                 * Adjust strat_total, and quit if we have stored a > or <
                 * key.
                 */
                strat = chosen->sk_strategy;
                if (strat != BTEqualStrategyNumber)
                {
                    strat_total = strat;
                    if (strat == BTGreaterStrategyNumber ||
                        strat == BTLessStrategyNumber)
                        break;
                }

                /*
                 * Done if that was the last attribute, or if next key is not
                 * in sequence (implying no boundary key is available for the
                 * next attribute).
                 */
                if (i >= so->numberOfKeys ||
                    cur->sk_attno != curattr + 1)
                    break;

                /*
                 * Reset for next attr.
                 */
                curattr = cur->sk_attno;
                chosen = NULL;
                impliesNN = NULL;
            }

            /*
             * Can we use this key as a starting boundary for this attr?
             *
             * If not, does it imply a NOT NULL constraint?  (Because
             * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber,
             * *any* inequality key works for that; we need not test.)
             */
            switch (cur->sk_strategy)
            {
                case BTLessStrategyNumber:
                case BTLessEqualStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsBackward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
                case BTEqualStrategyNumber:
                    /* override any non-equality choice */
                    chosen = cur;
                    break;
                case BTGreaterEqualStrategyNumber:
                case BTGreaterStrategyNumber:
                    if (chosen == NULL)
                    {
                        if (ScanDirectionIsForward(dir))
                            chosen = cur;
                        else
                            impliesNN = cur;
                    }
                    break;
            }
        }
    }

    /*
     * If we found no usable boundary keys, we have to start from one end of
     * the tree.  Walk down that edge to the first or last key, and scan from
     * there.
     */
    if (keysCount == 0)
        return _bt_endpoint(scan, dir);

    /*
     * We want to start the scan somewhere within the index.  Set up an
     * insertion scankey we can use to search for the boundary point we
     * identified above.  The insertion scankey is built in the local
     * scankeys[] array, using the keys identified by startKeys[].
     */
    Assert(keysCount <= INDEX_MAX_KEYS);
    for (i = 0; i < keysCount; i++)
    {
        ScanKey     cur = startKeys[i];

        Assert(cur->sk_attno == i + 1);

        if (cur->sk_flags & SK_ROW_HEADER)
        {
            /*
             * Row comparison header: look to the first row member instead.
             *
             * The member scankeys are already in insertion format (ie, they
             * have sk_func = 3-way-comparison function), but we have to watch
             * out for nulls, which _bt_preprocess_keys didn't check. A null
             * in the first row member makes the condition unmatchable, just
             * like qual_ok = false.
             */
            ScanKey     subkey = (ScanKey) DatumGetPointer(cur->sk_argument);

            Assert(subkey->sk_flags & SK_ROW_MEMBER);
            if (subkey->sk_flags & SK_ISNULL)
                return false;
            memcpy(scankeys + i, subkey, sizeof(ScanKeyData));

            /*
             * If the row comparison is the last positioning key we accepted,
             * try to add additional keys from the lower-order row members.
             * (If we accepted independent conditions on additional index
             * columns, we use those instead --- doesn't seem worth trying to
             * determine which is more restrictive.)  Note that this is OK
             * even if the row comparison is of ">" or "<" type, because the
             * condition applied to all but the last row member is effectively
             * ">=" or "<=", and so the extra keys don't break the positioning
             * scheme.  But, by the same token, if we aren't able to use all
             * the row members, then the part of the row comparison that we
             * did use has to be treated as just a ">=" or "<=" condition, and
             * so we'd better adjust strat_total accordingly.
             */
            if (i == keysCount - 1)
            {
                bool        used_all_subkeys = false;

                Assert(!(subkey->sk_flags & SK_ROW_END));
                for (;;)
                {
                    subkey++;
                    Assert(subkey->sk_flags & SK_ROW_MEMBER);
                    if (subkey->sk_attno != keysCount + 1)
                        break;  /* out-of-sequence, can't use it */
                    if (subkey->sk_strategy != cur->sk_strategy)
                        break;  /* wrong direction, can't use it */
                    if (subkey->sk_flags & SK_ISNULL)
                        break;  /* can't use null keys */
                    Assert(keysCount < INDEX_MAX_KEYS);
                    memcpy(scankeys + keysCount, subkey, sizeof(ScanKeyData));
                    keysCount++;
                    if (subkey->sk_flags & SK_ROW_END)
                    {
                        used_all_subkeys = true;
                        break;
                    }
                }
                if (!used_all_subkeys)
                {
                    switch (strat_total)
                    {
                        case BTLessStrategyNumber:
                            strat_total = BTLessEqualStrategyNumber;
                            break;
                        case BTGreaterStrategyNumber:
                            strat_total = BTGreaterEqualStrategyNumber;
                            break;
                    }
                }
                break;          /* done with outer loop */
            }
        }
        else
        {
            /*
             * Ordinary comparison key.  Transform the search-style scan key
             * to an insertion scan key by replacing the sk_func with the
             * appropriate btree comparison function.
             *
             * If scankey operator is not a cross-type comparison, we can use
             * the cached comparison function; otherwise gotta look it up in
             * the catalogs.  (That can't lead to infinite recursion, since no
             * indexscan initiated by syscache lookup will use cross-data-type
             * operators.)
             *
             * We support the convention that sk_subtype == InvalidOid means
             * the opclass input type; this is a hack to simplify life for
             * ScanKeyInit().
             */
            if (cur->sk_subtype == rel->rd_opcintype[i] ||
                cur->sk_subtype == InvalidOid)
            {
                FmgrInfo   *procinfo;

                procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC);
                ScanKeyEntryInitializeWithInfo(scankeys + i,
                                               cur->sk_flags,
                                               cur->sk_attno,
                                               InvalidStrategy,
                                               cur->sk_subtype,
                                               cur->sk_collation,
                                               procinfo,
                                               cur->sk_argument);
            }
            else
            {
                RegProcedure cmp_proc;

                cmp_proc = get_opfamily_proc(rel->rd_opfamily[i],
                                             rel->rd_opcintype[i],
                                             cur->sk_subtype,
                                             BTORDER_PROC);
                if (!RegProcedureIsValid(cmp_proc))
                    elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"",
                         BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype,
                         cur->sk_attno, RelationGetRelationName(rel));
                ScanKeyEntryInitialize(scankeys + i,
                                       cur->sk_flags,
                                       cur->sk_attno,
                                       InvalidStrategy,
                                       cur->sk_subtype,
                                       cur->sk_collation,
                                       cmp_proc,
                                       cur->sk_argument);
            }
        }
    }

    /*----------
     * Examine the selected initial-positioning strategy to determine exactly
     * where we need to start the scan, and set flag variables to control the
     * code below.
     *
     * If nextkey = false, _bt_search and _bt_binsrch will locate the first
     * item >= scan key.  If nextkey = true, they will locate the first
     * item > scan key.
     *
     * If goback = true, we will then step back one item, while if
     * goback = false, we will start the scan on the located item.
     *----------
     */
    switch (strat_total)
    {
        case BTLessStrategyNumber:

            /*
             * Find first item >= scankey, then back up one to arrive at last
             * item < scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = false;
            goback = true;
            break;

        case BTLessEqualStrategyNumber:

            /*
             * Find first item > scankey, then back up one to arrive at last
             * item <= scankey.  (Note: this positioning strategy is only used
             * for a backward scan, so that is always the correct starting
             * position.)
             */
            nextkey = true;
            goback = true;
            break;

        case BTEqualStrategyNumber:

            /*
             * If a backward scan was specified, need to start with last equal
             * item not first one.
             */
            if (ScanDirectionIsBackward(dir))
            {
                /*
                 * This is the same as the <= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = true;
                goback = true;
            }
            else
            {
                /*
                 * This is the same as the >= strategy.  We will check at the
                 * end whether the found item is actually =.
                 */
                nextkey = false;
                goback = false;
            }
            break;

        case BTGreaterEqualStrategyNumber:

            /*
             * Find first item >= scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = false;
            goback = false;
            break;

        case BTGreaterStrategyNumber:

            /*
             * Find first item > scankey.  (This is only used for forward
             * scans.)
             */
            nextkey = true;
            goback = false;
            break;

        default:
            /* can't get here, but keep compiler quiet */
            elog(ERROR, "unrecognized strat_total: %d", (int) strat_total);
            return false;
    }

    /*
     * Use the manufactured insertion scan key to descend the tree and
     * position ourselves on the target leaf page.
     */
    stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ);

    /* don't need to keep the stack around... */
    _bt_freestack(stack);

    /* remember which buffer we have pinned, if any */
    so->currPos.buf = buf;

    if (!BufferIsValid(buf))
    {
        /*
         * We only get here if the index is completely empty. Lock relation
         * because nothing finer to lock exists.
         */
        PredicateLockRelation(rel, scan->xs_snapshot);
        return false;
    }
    else
        PredicateLockPage(rel, BufferGetBlockNumber(buf),
                          scan->xs_snapshot);

    /* initialize moreLeft/moreRight appropriately for scan direction */
    if (ScanDirectionIsForward(dir))
    {
        so->currPos.moreLeft = false;
        so->currPos.moreRight = true;
    }
    else
    {
        so->currPos.moreLeft = true;
        so->currPos.moreRight = false;
    }
    so->numKilled = 0;          /* just paranoia */
    so->markItemIndex = -1;     /* ditto */

    /* position to the precise item on the page */
    offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey);

    /*
     * If nextkey = false, we are positioned at the first item >= scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than the scan key and we know that everything on later pages is greater
     * than or equal to scan key.
     *
     * If nextkey = true, we are positioned at the first item > scan key, or
     * possibly at the end of a page on which all the existing items are less
     * than or equal to the scan key and we know that everything on later
     * pages is greater than scan key.
     *
     * The actually desired starting point is either this item or the prior
     * one, or in the end-of-page case it's the first item on the next page or
     * the last item on this page.  Adjust the starting offset if needed. (If
     * this results in an offset before the first item or after the last one,
     * _bt_readpage will report no items found, and then we'll step to the
     * next page as needed.)
     */
    if (goback)
        offnum = OffsetNumberPrev(offnum);

    /*
     * Now load data from the first page of the scan.
     */
    if (!_bt_readpage(scan, dir, offnum))
    {
        /*
         * There's no actually-matching data on this page.  Try to advance to
         * the next page.  Return false if there's no matching data at all.
         */
        if (!_bt_steppage(scan, dir))
            return false;
    }

    /* Drop the lock, but not pin, on the current page */
    LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_ctup.t_self = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}

void _bt_freeskey ( ScanKey skey )

Definition at line 155 of file nbtutils.c.

References pfree().

Referenced by _bt_doinsert(), and _bt_load().

{
    pfree(skey);
}

void _bt_freestack ( BTStack stack )

Definition at line 164 of file nbtutils.c.

References BTStackData::bts_parent, NULL, and pfree().

Referenced by _bt_doinsert(), and _bt_first().

{
    BTStack     ostack;

    while (stack != NULL)
    {
        ostack = stack;
        stack = stack->bts_parent;
        pfree(ostack);
    }
}

Buffer _bt_get_endpoint	(	Relation	rel,
		uint32	level,
		bool	rightmost
	)

Definition at line 1433 of file nbtsearch.c.

References _bt_getroot(), _bt_gettrueroot(), _bt_relandgetbuf(), BT_READ, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_next, buf, BufferGetPage, BufferIsValid, elog, ERROR, ItemPointerGetBlockNumber, BTPageOpaqueData::level, P_FIRSTDATAKEY, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, RelationGetRelationName, and IndexTupleData::t_tid.

Referenced by _bt_endpoint(), _bt_insert_parent(), and _bt_pagedel().

{
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber offnum;
    BlockNumber blkno;
    IndexTuple  itup;

    /*
     * If we are looking for a leaf page, okay to descend from fast root;
     * otherwise better descend from true root.  (There is no point in being
     * smarter about intermediate levels.)
     */
    if (level == 0)
        buf = _bt_getroot(rel, BT_READ);
    else
        buf = _bt_gettrueroot(rel);

    if (!BufferIsValid(buf))
        return InvalidBuffer;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    for (;;)
    {
        /*
         * If we landed on a deleted page, step right to find a live page
         * (there must be one).  Also, if we want the rightmost page, step
         * right if needed to get to it (this could happen if the page split
         * since we obtained a pointer to it).
         */
        while (P_IGNORE(opaque) ||
               (rightmost && !P_RIGHTMOST(opaque)))
        {
            blkno = opaque->btpo_next;
            if (blkno == P_NONE)
                elog(ERROR, "fell off the end of index \"%s\"",
                     RelationGetRelationName(rel));
            buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
            page = BufferGetPage(buf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }

        /* Done? */
        if (opaque->btpo.level == level)
            break;
        if (opaque->btpo.level < level)
            elog(ERROR, "btree level %u not found in index \"%s\"",
                 level, RelationGetRelationName(rel));

        /* Descend to leftmost or rightmost child page */
        if (rightmost)
            offnum = PageGetMaxOffsetNumber(page);
        else
            offnum = P_FIRSTDATAKEY(opaque);

        itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
        blkno = ItemPointerGetBlockNumber(&(itup->t_tid));

        buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }

    return buf;
}

Buffer _bt_getbuf	(	Relation	rel,
		BlockNumber	blkno,
		int	access
	)

Definition at line 575 of file nbtpage.c.

References _bt_checkpage(), _bt_log_reuse_page(), _bt_page_recyclable(), _bt_pageinit(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, buf, BufferGetPage, BufferGetPageSize, ConditionalLockBuffer(), DEBUG2, elog, ExclusiveLock, GetFreeIndexPage(), InvalidBlockNumber, LockBuffer(), LockRelationForExtension(), P_NEW, PageGetSpecialPointer, PageIsNew, ReadBuffer(), RELATION_IS_LOCAL, ReleaseBuffer(), UnlockRelationForExtension(), BTPageOpaqueData::xact, and XLogStandbyInfoActive.

Referenced by _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insertonpg(), _bt_newroot(), _bt_pagedel(), _bt_split(), _bt_steppage(), and _bt_walk_left().

{
    Buffer      buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        LockBuffer(buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool        needLock;
        Page        page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free.  (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page.  If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock.  (The
         * own-caller scenario is actually not improbable. Consider an index
         * on a serial or timestamp column.  Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM. A
         * VACUUM running concurrently with the previous split could well have
         * placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (ConditionalLockBuffer(buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby.
                     */
                    if (XLogStandbyInfoActive())
                    {
                        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                        _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
                    }

                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page.  We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        LockBuffer(buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}

Buffer _bt_getroot	(	Relation	rel,
		int	access
	)

Definition at line 94 of file nbtpage.c.

References _bt_getbuf(), _bt_getroot(), _bt_relandgetbuf(), _bt_relbuf(), Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_LEAF, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTREE_MAGIC, BTREE_METAPAGE, BTREE_VERSION, XLogRecData::buffer, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, CacheInvalidateRelcache(), XLogRecData::data, elog, END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, XLogRecData::len, xl_btree_newroot::level, BTPageOpaqueData::level, LockBuffer(), MarkBufferDirty(), MemoryContextAlloc(), XLogRecData::next, xl_btree_newroot::node, NULL, P_IGNORE, P_LEFTMOST, P_NEW, P_NONE, P_RIGHTMOST, PageGetSpecialPointer, PageSetLSN, pfree(), RelationData::rd_amcache, RelationData::rd_indexcxt, RelationData::rd_node, RelationGetRelationName, RelationNeedsWAL, xl_btree_newroot::rootblk, START_CRIT_SECTION, XLOG_BTREE_NEWROOT, and XLogInsert().

Referenced by _bt_get_endpoint(), _bt_getroot(), and _bt_search().

{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version == BTREE_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted. If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!(metaopaque->btpo_flags & BTP_META) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version != BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION)));

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);

        /*
         * Race condition:  if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee no
             * deadlocks, we have to release the metadata page and start all
             * over again.  (Is that really true? But it's hardly worth trying
             * to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page.  Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            XLogRecData rdata;

            xlrec.node = rel->rd_node;
            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            rdata.data = (char *) &xlrec;
            rdata.len = SizeOfBtreeNewroot;
            rdata.buffer = InvalidBuffer;
            rdata.next = NULL;

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * Send out relcache inval for metapage change (probably unnecessary
         * here, but let's be safe).
         */
        CacheInvalidateRelcache(rel);

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}

int _bt_getrootheight ( Relation rel )

Definition at line 424 of file nbtpage.c.

References _bt_getbuf(), _bt_relbuf(), Assert, BT_READ, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo_flags, BTREE_MAGIC, BTREE_METAPAGE, BTREE_VERSION, BufferGetPage, ereport, errcode(), errmsg(), ERROR, MemoryContextAlloc(), NULL, P_NONE, PageGetSpecialPointer, RelationData::rd_amcache, RelationData::rd_indexcxt, and RelationGetRelationName.

Referenced by get_relation_info().

{
    BTMetaPageData *metad;

    /*
     * We can get what we need from the cached metapage data.  If it's not
     * cached yet, load it.  Sanity checks here must match _bt_getroot().
     */
    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;
        Page        metapg;
        BTPageOpaque metaopaque;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metapg = BufferGetPage(metabuf);
        metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
        metad = BTPageGetMeta(metapg);

        /* sanity-check the metapage */
        if (!(metaopaque->btpo_flags & BTP_META) ||
            metad->btm_magic != BTREE_MAGIC)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("index \"%s\" is not a btree",
                            RelationGetRelationName(rel))));

        if (metad->btm_version != BTREE_VERSION)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                            RelationGetRelationName(rel),
                            metad->btm_version, BTREE_VERSION)));

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        _bt_relbuf(rel, metabuf);
    }

    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version == BTREE_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}

Buffer _bt_getstackbuf	(	Relation	rel,
		BTStack	stack,
		int	access
	)

Definition at line 1724 of file nbtinsert.c.

References _bt_getbuf(), _bt_relbuf(), BTEntrySame, BTPageOpaqueData::btpo_next, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, buf, BufferGetPage, OffsetNumberNext, OffsetNumberPrev, P_FIRSTDATAKEY, P_IGNORE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, and PageGetSpecialPointer.

Referenced by _bt_insert_parent(), _bt_pagedel(), and _bt_parent_deletion_safe().

{
    BlockNumber blkno;
    OffsetNumber start;

    blkno = stack->bts_blkno;
    start = stack->bts_offset;

    for (;;)
    {
        Buffer      buf;
        Page        page;
        BTPageOpaque opaque;

        buf = _bt_getbuf(rel, blkno, access);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);

        if (!P_IGNORE(opaque))
        {
            OffsetNumber offnum,
                        minoff,
                        maxoff;
            ItemId      itemid;
            IndexTuple  item;

            minoff = P_FIRSTDATAKEY(opaque);
            maxoff = PageGetMaxOffsetNumber(page);

            /*
             * start = InvalidOffsetNumber means "search the whole page". We
             * need this test anyway due to possibility that page has a high
             * key now when it didn't before.
             */
            if (start < minoff)
                start = minoff;

            /*
             * Need this check too, to guard against possibility that page
             * split since we visited it originally.
             */
            if (start > maxoff)
                start = OffsetNumberNext(maxoff);

            /*
             * These loops will check every item on the page --- but in an
             * order that's attuned to the probability of where it actually
             * is.  Scan to the right first, then to the left.
             */
            for (offnum = start;
                 offnum <= maxoff;
                 offnum = OffsetNumberNext(offnum))
            {
                itemid = PageGetItemId(page, offnum);
                item = (IndexTuple) PageGetItem(page, itemid);
                if (BTEntrySame(item, &stack->bts_btentry))
                {
                    /* Return accurate pointer to where link is now */
                    stack->bts_blkno = blkno;
                    stack->bts_offset = offnum;
                    return buf;
                }
            }

            for (offnum = OffsetNumberPrev(start);
                 offnum >= minoff;
                 offnum = OffsetNumberPrev(offnum))
            {
                itemid = PageGetItemId(page, offnum);
                item = (IndexTuple) PageGetItem(page, itemid);
                if (BTEntrySame(item, &stack->bts_btentry))
                {
                    /* Return accurate pointer to where link is now */
                    stack->bts_blkno = blkno;
                    stack->bts_offset = offnum;
                    return buf;
                }
            }
        }

        /*
         * The item we're looking for moved right at least one page.
         */
        if (P_RIGHTMOST(opaque))
        {
            _bt_relbuf(rel, buf);
            return InvalidBuffer;
        }
        blkno = opaque->btpo_next;
        start = InvalidOffsetNumber;
        _bt_relbuf(rel, buf);
    }
}

Buffer _bt_gettrueroot ( Relation rel )

Definition at line 330 of file nbtpage.c.

References _bt_getbuf(), _bt_relandgetbuf(), _bt_relbuf(), BT_READ, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTP_META, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTREE_MAGIC, BTREE_METAPAGE, BTREE_VERSION, BufferGetPage, elog, ereport, errcode(), errmsg(), ERROR, BTPageOpaqueData::level, P_IGNORE, P_NONE, P_RIGHTMOST, PageGetSpecialPointer, pfree(), RelationData::rd_amcache, and RelationGetRelationName.

Referenced by _bt_get_endpoint().

{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!(metaopaque->btpo_flags & BTP_META) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version != BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

void _bt_initmetapage	(	Page	page,
		BlockNumber	rootbknum,
		uint32	level
	)

Definition at line 39 of file nbtpage.c.

References _bt_pageinit(), BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_magic, BTMetaPageData::btm_root, BTMetaPageData::btm_version, BTPageGetMeta, BTPageOpaqueData::btpo_flags, and PageGetSpecialPointer.

Referenced by _bt_uppershutdown(), and btbuildempty().

{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is not essential
     * but it makes the page look compressible to xlog.c.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}

void _bt_insert_parent	(	Relation	rel,
		Buffer	buf,
		Buffer	rbuf,
		BTStack	stack,
		bool	is_root,
		bool	is_only
	)

Definition at line 1610 of file nbtinsert.c.

References _bt_get_endpoint(), _bt_getstackbuf(), _bt_insertonpg(), _bt_newroot(), _bt_relbuf(), Assert, BT_WRITE, BTPageOpaqueData::btpo, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, CopyIndexTuple(), DEBUG2, elog, ERROR, InRecovery, InvalidBuffer, ItemPointerSet, BTPageOpaqueData::level, NULL, P_HIKEY, PageGetItem, PageGetItemId, PageGetSpecialPointer, pfree(), RelationGetRelationName, and IndexTupleData::t_tid.

Referenced by _bt_insertonpg(), and btree_xlog_cleanup().

{
    /*
     * Here we have to do something Lehman and Yao don't talk about: deal with
     * a root split and construction of a new root.  If our stack is empty
     * then we have just split a node on what had been the root level when we
     * descended the tree.  If it was still the root then we perform a
     * new-root construction.  If it *wasn't* the root anymore, search to find
     * the next higher level that someone constructed meanwhile, and find the
     * right place to insert as for the normal case.
     *
     * If we have to search for the parent level, we do so by re-descending
     * from the root.  This is not super-efficient, but it's rare enough not
     * to matter.  (This path is also taken when called from WAL recovery ---
     * we have no stack in that case.)
     */
    if (is_root)
    {
        Buffer      rootbuf;

        Assert(stack == NULL);
        Assert(is_only);
        /* create a new root node and update the metapage */
        rootbuf = _bt_newroot(rel, buf, rbuf);
        /* release the split buffers */
        _bt_relbuf(rel, rootbuf);
        _bt_relbuf(rel, rbuf);
        _bt_relbuf(rel, buf);
    }
    else
    {
        BlockNumber bknum = BufferGetBlockNumber(buf);
        BlockNumber rbknum = BufferGetBlockNumber(rbuf);
        Page        page = BufferGetPage(buf);
        IndexTuple  new_item;
        BTStackData fakestack;
        IndexTuple  ritem;
        Buffer      pbuf;

        if (stack == NULL)
        {
            BTPageOpaque lpageop;

            if (!InRecovery)
                elog(DEBUG2, "concurrent ROOT page split");
            lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
            /* Find the leftmost page at the next level up */
            pbuf = _bt_get_endpoint(rel, lpageop->btpo.level + 1, false);
            /* Set up a phony stack entry pointing there */
            stack = &fakestack;
            stack->bts_blkno = BufferGetBlockNumber(pbuf);
            stack->bts_offset = InvalidOffsetNumber;
            /* bts_btentry will be initialized below */
            stack->bts_parent = NULL;
            _bt_relbuf(rel, pbuf);
        }

        /* get high key from left page == lowest key on new right page */
        ritem = (IndexTuple) PageGetItem(page,
                                         PageGetItemId(page, P_HIKEY));

        /* form an index tuple that points at the new right page */
        new_item = CopyIndexTuple(ritem);
        ItemPointerSet(&(new_item->t_tid), rbknum, P_HIKEY);

        /*
         * Find the parent buffer and get the parent page.
         *
         * Oops - if we were moved right then we need to change stack item! We
         * want to find parent pointing to where we are, right ?    - vadim
         * 05/27/97
         */
        ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY);

        pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);

        /* Now we can unlock the children */
        _bt_relbuf(rel, rbuf);
        _bt_relbuf(rel, buf);

        /* Check for error only after writing children */
        if (pbuf == InvalidBuffer)
            elog(ERROR, "failed to re-find parent key in index \"%s\" for split pages %u/%u",
                 RelationGetRelationName(rel), bknum, rbknum);

        /* Recursively update the parent */
        _bt_insertonpg(rel, pbuf, stack->bts_parent,
                       new_item, stack->bts_offset + 1,
                       is_only);

        /* be tidy */
        pfree(new_item);
    }
}

void _bt_killitems	(	IndexScanDesc	scan,
		bool	haveLock
	)

Definition at line 1737 of file nbtutils.c.

References Assert, BT_READ, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferGetPage, BufferIsValid, BTScanOpaqueData::currPos, BTScanPosData::firstItem, BTScanPosItem::heapTid, i, BTScanPosItem::indexOffset, ItemIdMarkDead, ItemPointerEquals(), BTScanPosData::items, BTScanOpaqueData::killedItems, LockBuffer(), MarkBufferDirtyHint(), BTScanOpaqueData::numKilled, OffsetNumberNext, IndexScanDescData::opaque, P_FIRSTDATAKEY, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, and IndexTupleData::t_tid.

Referenced by _bt_steppage(), btendscan(), btrescan(), and btrestrpos().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    Page        page;
    BTPageOpaque opaque;
    OffsetNumber minoff;
    OffsetNumber maxoff;
    int         i;
    bool        killedsomething = false;

    Assert(BufferIsValid(so->currPos.buf));

    if (!haveLock)
        LockBuffer(so->currPos.buf, BT_READ);

    page = BufferGetPage(so->currPos.buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    minoff = P_FIRSTDATAKEY(opaque);
    maxoff = PageGetMaxOffsetNumber(page);

    for (i = 0; i < so->numKilled; i++)
    {
        int         itemIndex = so->killedItems[i];
        BTScanPosItem *kitem = &so->currPos.items[itemIndex];
        OffsetNumber offnum = kitem->indexOffset;

        Assert(itemIndex >= so->currPos.firstItem &&
               itemIndex <= so->currPos.lastItem);
        if (offnum < minoff)
            continue;           /* pure paranoia */
        while (offnum <= maxoff)
        {
            ItemId      iid = PageGetItemId(page, offnum);
            IndexTuple  ituple = (IndexTuple) PageGetItem(page, iid);

            if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
            {
                /* found the item */
                ItemIdMarkDead(iid);
                killedsomething = true;
                break;          /* out of inner search loop */
            }
            offnum = OffsetNumberNext(offnum);
        }
    }

    /*
     * Since this can be redone later if needed, mark as dirty hint.
     *
     * Whenever we mark anything LP_DEAD, we also set the page's
     * BTP_HAS_GARBAGE flag, which is likewise just a hint.
     */
    if (killedsomething)
    {
        opaque->btpo_flags |= BTP_HAS_GARBAGE;
        MarkBufferDirtyHint(so->currPos.buf);
    }

    if (!haveLock)
        LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);

    /*
     * Always reset the scan state, so we don't look for same items on other
     * pages.
     */
    so->numKilled = 0;
}

void _bt_leafbuild	(	BTSpool *	btspool,
		BTSpool *	spool2
	)

Definition at line 198 of file nbtsort.c.

References _bt_load(), BTREE_METAPAGE, BTWriteState::btws_pages_alloced, BTWriteState::btws_pages_written, BTWriteState::btws_use_wal, BTWriteState::btws_zeropage, BTSpool::heap, BTWriteState::heap, BTSpool::index, BTWriteState::index, log_btree_build_stats, RelationNeedsWAL, ResetUsage(), ShowUsage(), BTSpool::sortstate, tuplesort_performsort(), and XLogIsNeeded.

Referenced by btbuild().

{
    BTWriteState wstate;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Spool) STATISTICS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    tuplesort_performsort(btspool->sortstate);
    if (btspool2)
        tuplesort_performsort(btspool2->sortstate);

    wstate.heap = btspool->heap;
    wstate.index = btspool->index;

    /*
     * We need to log index creation in WAL iff WAL archiving/streaming is
     * enabled UNLESS the index isn't WAL-logged anyway.
     */
    wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);

    /* reserve the metapage */
    wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
    wstate.btws_pages_written = 0;
    wstate.btws_zeropage = NULL;    /* until needed */

    _bt_load(&wstate, btspool, btspool2);
}

void _bt_mark_array_keys ( IndexScanDesc scan )

Definition at line 604 of file nbtutils.c.

References BTScanOpaqueData::arrayKeys, BTArrayKeyInfo::cur_elem, i, BTArrayKeyInfo::mark_elem, BTScanOpaqueData::numArrayKeys, and IndexScanDescData::opaque.

Referenced by btmarkpos().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         i;

    for (i = 0; i < so->numArrayKeys; i++)
    {
        BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];

        curArrayKey->mark_elem = curArrayKey->cur_elem;
    }
}

ScanKey _bt_mkscankey	(	Relation	rel,
		IndexTuple	itup
	)

Definition at line 62 of file nbtutils.c.

References arg, BTORDER_PROC, i, index_getattr, index_getprocinfo(), InvalidOid, InvalidStrategy, palloc(), RelationData::rd_indcollation, RelationData::rd_indoption, RelationGetDescr, RelationGetNumberOfAttributes, ScanKeyEntryInitializeWithInfo(), SK_BT_INDOPTION_SHIFT, and SK_ISNULL.

Referenced by _bt_doinsert(), and _bt_pagedel().

{
    ScanKey     skey;
    TupleDesc   itupdesc;
    int         natts;
    int16      *indoption;
    int         i;

    itupdesc = RelationGetDescr(rel);
    natts = RelationGetNumberOfAttributes(rel);
    indoption = rel->rd_indoption;

    skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

    for (i = 0; i < natts; i++)
    {
        FmgrInfo   *procinfo;
        Datum       arg;
        bool        null;
        int         flags;

        /*
         * We can use the cached (default) support procs since no cross-type
         * comparison can be needed.
         */
        procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
        arg = index_getattr(itup, i + 1, itupdesc, &null);
        flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT);
        ScanKeyEntryInitializeWithInfo(&skey[i],
                                       flags,
                                       (AttrNumber) (i + 1),
                                       InvalidStrategy,
                                       InvalidOid,
                                       rel->rd_indcollation[i],
                                       procinfo,
                                       arg);
    }

    return skey;
}

ScanKey _bt_mkscankey_nodata ( Relation rel )

Definition at line 115 of file nbtutils.c.

References BTORDER_PROC, i, index_getprocinfo(), InvalidOid, InvalidStrategy, palloc(), RelationData::rd_indcollation, RelationData::rd_indoption, RelationGetNumberOfAttributes, ScanKeyEntryInitializeWithInfo(), and SK_ISNULL.

Referenced by _bt_load(), tuplesort_begin_cluster(), and tuplesort_begin_index_btree().

{
    ScanKey     skey;
    int         natts;
    int16      *indoption;
    int         i;

    natts = RelationGetNumberOfAttributes(rel);
    indoption = rel->rd_indoption;

    skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));

    for (i = 0; i < natts; i++)
    {
        FmgrInfo   *procinfo;
        int         flags;

        /*
         * We can use the cached (default) support procs since no cross-type
         * comparison can be needed.
         */
        procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
        flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT);
        ScanKeyEntryInitializeWithInfo(&skey[i],
                                       flags,
                                       (AttrNumber) (i + 1),
                                       InvalidStrategy,
                                       InvalidOid,
                                       rel->rd_indcollation[i],
                                       procinfo,
                                       (Datum) 0);
    }

    return skey;
}

Buffer _bt_moveright	(	Relation	rel,
		Buffer	buf,
		int	keysz,
		ScanKey	scankey,
		bool	nextkey,
		int	access
	)

Definition at line 156 of file nbtsearch.c.

References _bt_compare(), _bt_relandgetbuf(), BTPageOpaqueData::btpo_next, BufferGetPage, elog, ERROR, P_HIKEY, P_IGNORE, P_RIGHTMOST, PageGetSpecialPointer, and RelationGetRelationName.

Referenced by _bt_doinsert(), and _bt_search().

{
    Page        page;
    BTPageOpaque opaque;
    int32       cmpval;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * When nextkey = false (normal case): if the scan key that brought us to
     * this page is > the high key stored on the page, then the page has split
     * and we need to move right.  (If the scan key is equal to the high key,
     * we might or might not need to move right; have to scan the page first
     * anyway.)
     *
     * When nextkey = true: move right if the scan key is >= page's high key.
     *
     * The page could even have split more than once, so scan as far as
     * needed.
     *
     * We also have to move right if we followed a link that brought us to a
     * dead page.
     */
    cmpval = nextkey ? 0 : 1;

    while (!P_RIGHTMOST(opaque) &&
           (P_IGNORE(opaque) ||
            _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
    {
        /* step right one page */
        BlockNumber rblkno = opaque->btpo_next;

        buf = _bt_relandgetbuf(rel, buf, rblkno, access);
        page = BufferGetPage(buf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    }

    if (P_IGNORE(opaque))
        elog(ERROR, "fell off the end of index \"%s\"",
             RelationGetRelationName(rel));

    return buf;
}

bool _bt_next	(	IndexScanDesc	scan,
		ScanDirection	dir
	)

Definition at line 995 of file nbtsearch.c.

References _bt_steppage(), Assert, BT_READ, BTScanPosData::buf, BUFFER_LOCK_UNLOCK, BufferIsValid, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanPosData::firstItem, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, LockBuffer(), IndexScanDescData::opaque, ScanDirectionIsForward, HeapTupleData::t_self, IndexScanDescData::xs_ctup, IndexScanDescData::xs_itup, and IndexScanDescData::xs_want_itup.

Referenced by btgetbitmap(), and btgettuple().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    BTScanPosItem *currItem;

    /*
     * Advance to next tuple on current page; or if there's no more, try to
     * step to the next page with data.
     */
    if (ScanDirectionIsForward(dir))
    {
        if (++so->currPos.itemIndex > so->currPos.lastItem)
        {
            /* We must acquire lock before applying _bt_steppage */
            Assert(BufferIsValid(so->currPos.buf));
            LockBuffer(so->currPos.buf, BT_READ);
            if (!_bt_steppage(scan, dir))
                return false;
            /* Drop the lock, but not pin, on the new page */
            LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
        }
    }
    else
    {
        if (--so->currPos.itemIndex < so->currPos.firstItem)
        {
            /* We must acquire lock before applying _bt_steppage */
            Assert(BufferIsValid(so->currPos.buf));
            LockBuffer(so->currPos.buf, BT_READ);
            if (!_bt_steppage(scan, dir))
                return false;
            /* Drop the lock, but not pin, on the new page */
            LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
        }
    }

    /* OK, itemIndex says what to return */
    currItem = &so->currPos.items[so->currPos.itemIndex];
    scan->xs_ctup.t_self = currItem->heapTid;
    if (scan->xs_want_itup)
        scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);

    return true;
}

bool _bt_page_recyclable ( Page page )

Definition at line 749 of file nbtpage.c.

References BTPageOpaqueData::btpo, P_ISDELETED, PageGetSpecialPointer, PageIsNew, RecentGlobalXmin, TransactionIdPrecedes(), and BTPageOpaqueData::xact.

Referenced by _bt_getbuf(), and btvacuumpage().

{
    BTPageOpaque opaque;

    /*
     * It's possible to find an all-zeroes page in an index --- for example, a
     * backend might successfully extend the relation one page and then crash
     * before it is able to make a WAL entry for adding the page. If we find a
     * zeroed page then reclaim it.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recycle if deleted and too old to have any processes
     * interested in it.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_ISDELETED(opaque) &&
        TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
        return true;
    return false;
}

int _bt_pagedel	(	Relation	rel,
		Buffer	buf,
		BTStack	stack
	)

Definition at line 1067 of file nbtpage.c.

References _bt_get_endpoint(), _bt_getbuf(), _bt_getstackbuf(), _bt_mkscankey(), _bt_pagedel(), _bt_parent_deletion_safe(), _bt_relbuf(), _bt_search(), Assert, BT_READ, BT_WRITE, BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTPageGetMeta, BTPageOpaqueData::btpo, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, xl_btree_delete_page::btpo_xact, BTREE_METAPAGE, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, XLogRecData::buffer, XLogRecData::buffer_std, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, CacheInvalidateRelcache(), CopyIndexTuple(), XLogRecData::data, xl_btree_delete_page::deadblk, elog, END_CRIT_SECTION, ERROR, xl_btree_metadata::fastlevel, xl_btree_metadata::fastroot, InRecovery, InvalidBuffer, ItemPointerGetBlockNumber, ItemPointerSet, xl_btree_delete_page::leftblk, XLogRecData::len, xl_btree_metadata::level, BTPageOpaqueData::level, LOG, MarkBufferDirty(), XLogRecData::next, xl_btreetid::node, NULL, OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_ISDELETED, P_ISHALFDEAD, P_ISROOT, P_NONE, P_RIGHTMOST, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageIndexTupleDelete(), PageSetLSN, palloc(), PredicateLockPageCombine(), RelationData::rd_node, RelationData::rd_rel, ReadNewTransactionId(), RelationGetRelationName, RelationNeedsWAL, xl_btree_delete_page::rightblk, xl_btree_metadata::root, START_CRIT_SECTION, IndexTupleData::t_tid, xl_btree_delete_page::target, xl_btreetid::tid, BTPageOpaqueData::xact, and XLogInsert().

Referenced by _bt_pagedel(), btree_xlog_cleanup(), and btvacuumpage().

{
    int         result;
    BlockNumber target,
                leftsib,
                rightsib,
                parent;
    OffsetNumber poffset,
                maxoff;
    uint32      targetlevel,
                ilevel;
    ItemId      itemid;
    IndexTuple  targetkey,
                itup;
    ScanKey     itup_scankey;
    Buffer      lbuf,
                rbuf,
                pbuf;
    bool        parent_half_dead;
    bool        parent_one_child;
    bool        rightsib_empty;
    Buffer      metabuf = InvalidBuffer;
    Page        metapg = NULL;
    BTMetaPageData *metad = NULL;
    Page        page;
    BTPageOpaque opaque;

    /*
     * We can never delete rightmost pages nor root pages.  While at it, check
     * that page is not already deleted and is empty.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        /* Should never fail to delete a half-dead page */
        Assert(!P_ISHALFDEAD(opaque));

        _bt_relbuf(rel, buf);
        return 0;
    }

    /*
     * Save info about page, including a copy of its high key (it must have
     * one, being non-rightmost).
     */
    target = BufferGetBlockNumber(buf);
    targetlevel = opaque->btpo.level;
    leftsib = opaque->btpo_prev;
    itemid = PageGetItemId(page, P_HIKEY);
    targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

    /*
     * To avoid deadlocks, we'd better drop the target page lock before going
     * further.
     */
    _bt_relbuf(rel, buf);

    /*
     * We need an approximate pointer to the page's parent page.  We use the
     * standard search mechanism to search for the page's high key; this will
     * give us a link to either the current parent or someplace to its left
     * (if there are multiple equal high keys).  In recursion cases, the
     * caller already generated a search stack and we can just re-use that
     * work.
     */
    if (stack == NULL)
    {
        if (!InRecovery)
        {
            /* we need an insertion scan key to do our search, so build one */
            itup_scankey = _bt_mkscankey(rel, targetkey);
            /* find the leftmost leaf page containing this key */
            stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
                               &lbuf, BT_READ);
            /* don't need a pin on that either */
            _bt_relbuf(rel, lbuf);

            /*
             * If we are trying to delete an interior page, _bt_search did
             * more than we needed.  Locate the stack item pointing to our
             * parent level.
             */
            ilevel = 0;
            for (;;)
            {
                if (stack == NULL)
                    elog(ERROR, "not enough stack items");
                if (ilevel == targetlevel)
                    break;
                stack = stack->bts_parent;
                ilevel++;
            }
        }
        else
        {
            /*
             * During WAL recovery, we can't use _bt_search (for one reason,
             * it might invoke user-defined comparison functions that expect
             * facilities not available in recovery mode).  Instead, just set
             * up a dummy stack pointing to the left end of the parent tree
             * level, from which _bt_getstackbuf will walk right to the parent
             * page.  Painful, but we don't care too much about performance in
             * this scenario.
             */
            pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
            stack = (BTStack) palloc(sizeof(BTStackData));
            stack->bts_blkno = BufferGetBlockNumber(pbuf);
            stack->bts_offset = InvalidOffsetNumber;
            /* bts_btentry will be initialized below */
            stack->bts_parent = NULL;
            _bt_relbuf(rel, pbuf);
        }
    }

    /*
     * We cannot delete a page that is the rightmost child of its immediate
     * parent, unless it is the only child --- in which case the parent has to
     * be deleted too, and the same condition applies recursively to it. We
     * have to check this condition all the way up before trying to delete. We
     * don't need to re-test when deleting a non-leaf page, though.
     */
    if (targetlevel == 0 &&
        !_bt_parent_deletion_safe(rel, target, stack))
        return 0;

    /*
     * We have to lock the pages we need to modify in the standard order:
     * moving right, then up.  Else we will deadlock against other writers.
     *
     * So, we need to find and write-lock the current left sibling of the
     * target page.  The sibling that was current a moment ago could have
     * split, so we may have to move right.  This search could fail if either
     * the sibling or the target page was deleted by someone else meanwhile;
     * if so, give up.  (Right now, that should never happen, since page
     * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
     * concurrently on the same table.)
     */
    if (leftsib != P_NONE)
    {
        lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        while (P_ISDELETED(opaque) || opaque->btpo_next != target)
        {
            /* step right one page */
            leftsib = opaque->btpo_next;
            _bt_relbuf(rel, lbuf);
            if (leftsib == P_NONE)
            {
                elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
                     RelationGetRelationName(rel));
                return 0;
            }
            lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
            page = BufferGetPage(lbuf);
            opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        }
    }
    else
        lbuf = InvalidBuffer;

    /*
     * Next write-lock the target page itself.  It should be okay to take just
     * a write lock not a superexclusive lock, since no scans would stop on an
     * empty page.
     */
    buf = _bt_getbuf(rel, target, BT_WRITE);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    /*
     * Check page is still empty etc, else abandon deletion.  The empty check
     * is necessary since someone else might have inserted into it while we
     * didn't have it locked; the others are just for paranoia's sake.
     */
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
        P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
    {
        _bt_relbuf(rel, buf);
        if (BufferIsValid(lbuf))
            _bt_relbuf(rel, lbuf);
        return 0;
    }
    if (opaque->btpo_prev != leftsib)
        elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
             target, RelationGetRelationName(rel));

    /*
     * And next write-lock the (current) right sibling.
     */
    rightsib = opaque->btpo_next;
    rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
    page = BufferGetPage(rbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (opaque->btpo_prev != target)
        elog(ERROR, "right sibling's left-link doesn't match: "
             "block %u links to %u instead of expected %u in index \"%s\"",
             rightsib, opaque->btpo_prev, target,
             RelationGetRelationName(rel));

    /*
     * Any insert which would have gone on the target block will now go to the
     * right sibling block.
     */
    PredicateLockPageCombine(rel, target, rightsib);

    /*
     * Next find and write-lock the current parent of the target page. This is
     * essentially the same as the corresponding step of splitting.
     */
    ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
    pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
    if (pbuf == InvalidBuffer)
        elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
             RelationGetRelationName(rel), target);
    parent = stack->bts_blkno;
    poffset = stack->bts_offset;

    /*
     * If the target is the rightmost child of its parent, then we can't
     * delete, unless it's also the only child --- in which case the parent
     * changes to half-dead status.  The "can't delete" case should have been
     * detected by _bt_parent_deletion_safe, so complain if we see it now.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);
    parent_half_dead = false;
    parent_one_child = false;
    if (poffset >= maxoff)
    {
        if (poffset == P_FIRSTDATAKEY(opaque))
            parent_half_dead = true;
        else
            elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
                 target, parent, RelationGetRelationName(rel));
    }
    else
    {
        /* Will there be exactly one child left in this parent? */
        if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
            parent_one_child = true;
    }

    /*
     * If we are deleting the next-to-last page on the target's level, then
     * the rightsib is a candidate to become the new fast root. (In theory, it
     * might be possible to push the fast root even further down, but the odds
     * of doing so are slim, and the locking considerations daunting.)
     *
     * We don't support handling this in the case where the parent is becoming
     * half-dead, even though it theoretically could occur.
     *
     * We can safely acquire a lock on the metapage here --- see comments for
     * _bt_newroot().
     */
    if (leftsib == P_NONE && !parent_half_dead)
    {
        page = BufferGetPage(rbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo.level == targetlevel);
        if (P_RIGHTMOST(opaque))
        {
            /* rightsib will be the only one left on the level */
            metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
            metapg = BufferGetPage(metabuf);
            metad = BTPageGetMeta(metapg);

            /*
             * The expected case here is btm_fastlevel == targetlevel+1; if
             * the fastlevel is <= targetlevel, something is wrong, and we
             * choose to overwrite it to fix it.
             */
            if (metad->btm_fastlevel > targetlevel + 1)
            {
                /* no update wanted */
                _bt_relbuf(rel, metabuf);
                metabuf = InvalidBuffer;
            }
        }
    }

    /*
     * Check that the parent-page index items we're about to delete/overwrite
     * contain what we expect.  This can fail if the index has become corrupt
     * for some reason.  We want to throw any error before entering the
     * critical section --- otherwise it'd be a PANIC.
     *
     * The test on the target item is just an Assert because _bt_getstackbuf
     * should have guaranteed it has the expected contents.  The test on the
     * next-child downlink is known to sometimes fail in the field, though.
     */
    page = BufferGetPage(pbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);

#ifdef USE_ASSERT_CHECKING
    itemid = PageGetItemId(page, poffset);
    itup = (IndexTuple) PageGetItem(page, itemid);
    Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
#endif

    if (!parent_half_dead)
    {
        OffsetNumber nextoffset;

        nextoffset = OffsetNumberNext(poffset);
        itemid = PageGetItemId(page, nextoffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
            elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
                 rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
                 parent, RelationGetRelationName(rel));
    }

    /*
     * Here we begin doing the deletion.
     */

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * Update parent.  The normal case is a tad tricky because we want to
     * delete the target's downlink and the *following* key.  Easiest way is
     * to copy the right sibling's downlink over the target downlink, and then
     * delete the following item.
     */
    if (parent_half_dead)
    {
        PageIndexTupleDelete(page, poffset);
        opaque->btpo_flags |= BTP_HALF_DEAD;
    }
    else
    {
        OffsetNumber nextoffset;

        itemid = PageGetItemId(page, poffset);
        itup = (IndexTuple) PageGetItem(page, itemid);
        ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);

        nextoffset = OffsetNumberNext(poffset);
        PageIndexTupleDelete(page, nextoffset);
    }

    /*
     * Update siblings' side-links.  Note the target page's side-links will
     * continue to point to the siblings.  Asserts here are just rechecking
     * things we already verified above.
     */
    if (BufferIsValid(lbuf))
    {
        page = BufferGetPage(lbuf);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        Assert(opaque->btpo_next == target);
        opaque->btpo_next = rightsib;
    }
    page = BufferGetPage(rbuf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    Assert(opaque->btpo_prev == target);
    opaque->btpo_prev = leftsib;
    rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

    /*
     * Mark the page itself deleted.  It can be recycled when all current
     * transactions are gone.  Storing GetTopTransactionId() would work, but
     * we're in VACUUM and would not otherwise have an XID.  Having already
     * updated links to the target, ReadNewTransactionId() suffices as an
     * upper bound.  Any scan having retained a now-stale link is advertising
     * in its PGXACT an xmin less than or equal to the value we read here.  It
     * will continue to do so, holding back RecentGlobalXmin, for the duration
     * of that scan.
     */
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HALF_DEAD;
    opaque->btpo_flags |= BTP_DELETED;
    opaque->btpo.xact = ReadNewTransactionId();

    /* And update the metapage, if needed */
    if (BufferIsValid(metabuf))
    {
        metad->btm_fastroot = rightsib;
        metad->btm_fastlevel = targetlevel;
        MarkBufferDirty(metabuf);
    }

    /* Must mark buffers dirty before XLogInsert */
    MarkBufferDirty(pbuf);
    MarkBufferDirty(rbuf);
    MarkBufferDirty(buf);
    if (BufferIsValid(lbuf))
        MarkBufferDirty(lbuf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        xl_btree_delete_page xlrec;
        xl_btree_metadata xlmeta;
        uint8       xlinfo;
        XLogRecPtr  recptr;
        XLogRecData rdata[5];
        XLogRecData *nextrdata;

        xlrec.target.node = rel->rd_node;
        ItemPointerSet(&(xlrec.target.tid), parent, poffset);
        xlrec.deadblk = target;
        xlrec.leftblk = leftsib;
        xlrec.rightblk = rightsib;
        xlrec.btpo_xact = opaque->btpo.xact;

        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfBtreeDeletePage;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = nextrdata = &(rdata[1]);

        if (BufferIsValid(metabuf))
        {
            xlmeta.root = metad->btm_root;
            xlmeta.level = metad->btm_level;
            xlmeta.fastroot = metad->btm_fastroot;
            xlmeta.fastlevel = metad->btm_fastlevel;

            nextrdata->data = (char *) &xlmeta;
            nextrdata->len = sizeof(xl_btree_metadata);
            nextrdata->buffer = InvalidBuffer;
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            xlinfo = XLOG_BTREE_DELETE_PAGE_META;
        }
        else if (parent_half_dead)
            xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
        else
            xlinfo = XLOG_BTREE_DELETE_PAGE;

        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->next = nextrdata + 1;
        nextrdata->buffer = pbuf;
        nextrdata->buffer_std = true;
        nextrdata++;

        nextrdata->data = NULL;
        nextrdata->len = 0;
        nextrdata->buffer = rbuf;
        nextrdata->buffer_std = true;
        nextrdata->next = NULL;

        if (BufferIsValid(lbuf))
        {
            nextrdata->next = nextrdata + 1;
            nextrdata++;
            nextrdata->data = NULL;
            nextrdata->len = 0;
            nextrdata->buffer = lbuf;
            nextrdata->buffer_std = true;
            nextrdata->next = NULL;
        }

        recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);

        if (BufferIsValid(metabuf))
        {
            PageSetLSN(metapg, recptr);
        }
        page = BufferGetPage(pbuf);
        PageSetLSN(page, recptr);
        page = BufferGetPage(rbuf);
        PageSetLSN(page, recptr);
        page = BufferGetPage(buf);
        PageSetLSN(page, recptr);
        if (BufferIsValid(lbuf))
        {
            page = BufferGetPage(lbuf);
            PageSetLSN(page, recptr);
        }
    }

    END_CRIT_SECTION();

    /* release metapage; send out relcache inval if metapage changed */
    if (BufferIsValid(metabuf))
    {
        CacheInvalidateRelcache(rel);
        _bt_relbuf(rel, metabuf);
    }
    /* can always release leftsib immediately */
    if (BufferIsValid(lbuf))
        _bt_relbuf(rel, lbuf);

    /*
     * If parent became half dead, recurse to delete it. Otherwise, if right
     * sibling is empty and is now the last child of the parent, recurse to
     * try to delete it.  (These cases cannot apply at the same time, though
     * the second case might itself recurse to the first.)
     *
     * When recursing to parent, we hold the lock on the target page until
     * done.  This delays any insertions into the keyspace that was just
     * effectively reassigned to the parent's right sibling.  If we allowed
     * that, and there were enough such insertions before we finish deleting
     * the parent, page splits within that keyspace could lead to inserting
     * out-of-order keys into the grandparent level.  It is thought that that
     * wouldn't have any serious consequences, but it still seems like a
     * pretty bad idea.
     */
    if (parent_half_dead)
    {
        /* recursive call will release pbuf */
        _bt_relbuf(rel, rbuf);
        result = _bt_pagedel(rel, pbuf, stack->bts_parent) + 1;
        _bt_relbuf(rel, buf);
    }
    else if (parent_one_child && rightsib_empty)
    {
        _bt_relbuf(rel, pbuf);
        _bt_relbuf(rel, buf);
        /* recursive call will release rbuf */
        result = _bt_pagedel(rel, rbuf, stack) + 1;
    }
    else
    {
        _bt_relbuf(rel, pbuf);
        _bt_relbuf(rel, buf);
        _bt_relbuf(rel, rbuf);
        result = 1;
    }

    return result;
}

void _bt_pageinit	(	Page	page,
		Size	size
	)

Definition at line 737 of file nbtpage.c.

References PageInit().

Referenced by _bt_blnewpage(), _bt_getbuf(), _bt_initmetapage(), _bt_restore_meta(), _bt_split(), btree_xlog_delete_page(), btree_xlog_newroot(), and btree_xlog_split().

{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

void _bt_preprocess_array_keys ( IndexScanDesc scan )

Definition at line 192 of file nbtutils.c.

References _bt_find_extreme_element(), _bt_sort_array_elements(), ALLOCSET_SMALL_INITSIZE, ALLOCSET_SMALL_MAXSIZE, ALLOCSET_SMALL_MINSIZE, AllocSetContextCreate(), ARR_ELEMTYPE, BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, cur, CurrentMemoryContext, DatumGetArrayTypeP, deconstruct_array(), BTArrayKeyInfo::elem_values, elog, ERROR, get_typlenbyvalalign(), i, IndexScanDescData::indexRelation, INDOPTION_DESC, IndexScanDescData::keyData, MemoryContextReset(), MemoryContextSwitchTo(), NULL, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::numberOfKeys, IndexScanDescData::opaque, palloc(), palloc0(), RelationData::rd_indoption, BTArrayKeyInfo::scan_key, SK_ISNULL, SK_ROW_HEADER, SK_SEARCHARRAY, SK_SEARCHNOTNULL, and SK_SEARCHNULL.

Referenced by btrescan().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         numberOfKeys = scan->numberOfKeys;
    int16      *indoption = scan->indexRelation->rd_indoption;
    int         numArrayKeys;
    ScanKey     cur;
    int         i;
    MemoryContext oldContext;

    /* Quick check to see if there are any array keys */
    numArrayKeys = 0;
    for (i = 0; i < numberOfKeys; i++)
    {
        cur = &scan->keyData[i];
        if (cur->sk_flags & SK_SEARCHARRAY)
        {
            numArrayKeys++;
            Assert(!(cur->sk_flags & (SK_ROW_HEADER | SK_SEARCHNULL | SK_SEARCHNOTNULL)));
            /* If any arrays are null as a whole, we can quit right now. */
            if (cur->sk_flags & SK_ISNULL)
            {
                so->numArrayKeys = -1;
                so->arrayKeyData = NULL;
                return;
            }
        }
    }

    /* Quit if nothing to do. */
    if (numArrayKeys == 0)
    {
        so->numArrayKeys = 0;
        so->arrayKeyData = NULL;
        return;
    }

    /*
     * Make a scan-lifespan context to hold array-associated data, or reset it
     * if we already have one from a previous rescan cycle.
     */
    if (so->arrayContext == NULL)
        so->arrayContext = AllocSetContextCreate(CurrentMemoryContext,
                                                 "BTree Array Context",
                                                 ALLOCSET_SMALL_MINSIZE,
                                                 ALLOCSET_SMALL_INITSIZE,
                                                 ALLOCSET_SMALL_MAXSIZE);
    else
        MemoryContextReset(so->arrayContext);

    oldContext = MemoryContextSwitchTo(so->arrayContext);

    /* Create modifiable copy of scan->keyData in the workspace context */
    so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
    memcpy(so->arrayKeyData,
           scan->keyData,
           scan->numberOfKeys * sizeof(ScanKeyData));

    /* Allocate space for per-array data in the workspace context */
    so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));

    /* Now process each array key */
    numArrayKeys = 0;
    for (i = 0; i < numberOfKeys; i++)
    {
        ArrayType  *arrayval;
        int16       elmlen;
        bool        elmbyval;
        char        elmalign;
        int         num_elems;
        Datum      *elem_values;
        bool       *elem_nulls;
        int         num_nonnulls;
        int         j;

        cur = &so->arrayKeyData[i];
        if (!(cur->sk_flags & SK_SEARCHARRAY))
            continue;

        /*
         * First, deconstruct the array into elements.  Anything allocated
         * here (including a possibly detoasted array value) is in the
         * workspace context.
         */
        arrayval = DatumGetArrayTypeP(cur->sk_argument);
        /* We could cache this data, but not clear it's worth it */
        get_typlenbyvalalign(ARR_ELEMTYPE(arrayval),
                             &elmlen, &elmbyval, &elmalign);
        deconstruct_array(arrayval,
                          ARR_ELEMTYPE(arrayval),
                          elmlen, elmbyval, elmalign,
                          &elem_values, &elem_nulls, &num_elems);

        /*
         * Compress out any null elements.  We can ignore them since we assume
         * all btree operators are strict.
         */
        num_nonnulls = 0;
        for (j = 0; j < num_elems; j++)
        {
            if (!elem_nulls[j])
                elem_values[num_nonnulls++] = elem_values[j];
        }

        /* We could pfree(elem_nulls) now, but not worth the cycles */

        /* If there's no non-nulls, the scan qual is unsatisfiable */
        if (num_nonnulls == 0)
        {
            numArrayKeys = -1;
            break;
        }

        /*
         * If the comparison operator is not equality, then the array qual
         * degenerates to a simple comparison against the smallest or largest
         * non-null array element, as appropriate.
         */
        switch (cur->sk_strategy)
        {
            case BTLessStrategyNumber:
            case BTLessEqualStrategyNumber:
                cur->sk_argument =
                    _bt_find_extreme_element(scan, cur,
                                             BTGreaterStrategyNumber,
                                             elem_values, num_nonnulls);
                continue;
            case BTEqualStrategyNumber:
                /* proceed with rest of loop */
                break;
            case BTGreaterEqualStrategyNumber:
            case BTGreaterStrategyNumber:
                cur->sk_argument =
                    _bt_find_extreme_element(scan, cur,
                                             BTLessStrategyNumber,
                                             elem_values, num_nonnulls);
                continue;
            default:
                elog(ERROR, "unrecognized StrategyNumber: %d",
                     (int) cur->sk_strategy);
                break;
        }

        /*
         * Sort the non-null elements and eliminate any duplicates.  We must
         * sort in the same ordering used by the index column, so that the
         * successive primitive indexscans produce data in index order.
         */
        num_elems = _bt_sort_array_elements(scan, cur,
                        (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
                                            elem_values, num_nonnulls);

        /*
         * And set up the BTArrayKeyInfo data.
         */
        so->arrayKeys[numArrayKeys].scan_key = i;
        so->arrayKeys[numArrayKeys].num_elems = num_elems;
        so->arrayKeys[numArrayKeys].elem_values = elem_values;
        numArrayKeys++;
    }

    so->numArrayKeys = numArrayKeys;

    MemoryContextSwitchTo(oldContext);
}

void _bt_preprocess_keys ( IndexScanDesc scan )

Definition at line 743 of file nbtutils.c.

References _bt_compare_scankey_args(), _bt_fix_scankey_strategy(), _bt_mark_scankey_required(), BTScanOpaqueData::arrayKeyData, Assert, BTEqualStrategyNumber, BTGreaterEqualStrategyNumber, BTGreaterStrategyNumber, BTLessEqualStrategyNumber, BTLessStrategyNumber, BTMaxStrategyNumber, cur, elog, ERROR, i, IndexScanDescData::indexRelation, BTScanOpaqueData::keyData, IndexScanDescData::keyData, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, IndexScanDescData::opaque, BTScanOpaqueData::qual_ok, RelationData::rd_indoption, ScanKeyData::sk_flags, SK_ROW_HEADER, and SK_SEARCHNULL.

Referenced by _bt_first(), and _bt_restore_array_keys().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         numberOfKeys = scan->numberOfKeys;
    int16      *indoption = scan->indexRelation->rd_indoption;
    int         new_numberOfKeys;
    int         numberOfEqualCols;
    ScanKey     inkeys;
    ScanKey     outkeys;
    ScanKey     cur;
    ScanKey     xform[BTMaxStrategyNumber];
    bool        test_result;
    int         i,
                j;
    AttrNumber  attno;

    /* initialize result variables */
    so->qual_ok = true;
    so->numberOfKeys = 0;

    if (numberOfKeys < 1)
        return;                 /* done if qual-less scan */

    /*
     * Read so->arrayKeyData if array keys are present, else scan->keyData
     */
    if (so->arrayKeyData != NULL)
        inkeys = so->arrayKeyData;
    else
        inkeys = scan->keyData;

    outkeys = so->keyData;
    cur = &inkeys[0];
    /* we check that input keys are correctly ordered */
    if (cur->sk_attno < 1)
        elog(ERROR, "btree index keys must be ordered by attribute");

    /* We can short-circuit most of the work if there's just one key */
    if (numberOfKeys == 1)
    {
        /* Apply indoption to scankey (might change sk_strategy!) */
        if (!_bt_fix_scankey_strategy(cur, indoption))
            so->qual_ok = false;
        memcpy(outkeys, cur, sizeof(ScanKeyData));
        so->numberOfKeys = 1;
        /* We can mark the qual as required if it's for first index col */
        if (cur->sk_attno == 1)
            _bt_mark_scankey_required(outkeys);
        return;
    }

    /*
     * Otherwise, do the full set of pushups.
     */
    new_numberOfKeys = 0;
    numberOfEqualCols = 0;

    /*
     * Initialize for processing of keys for attr 1.
     *
     * xform[i] points to the currently best scan key of strategy type i+1; it
     * is NULL if we haven't yet found such a key for this attr.
     */
    attno = 1;
    memset(xform, 0, sizeof(xform));

    /*
     * Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
     * handle after-last-key processing.  Actual exit from the loop is at the
     * "break" statement below.
     */
    for (i = 0;; cur++, i++)
    {
        if (i < numberOfKeys)
        {
            /* Apply indoption to scankey (might change sk_strategy!) */
            if (!_bt_fix_scankey_strategy(cur, indoption))
            {
                /* NULL can't be matched, so give up */
                so->qual_ok = false;
                return;
            }
        }

        /*
         * If we are at the end of the keys for a particular attr, finish up
         * processing and emit the cleaned-up keys.
         */
        if (i == numberOfKeys || cur->sk_attno != attno)
        {
            int         priorNumberOfEqualCols = numberOfEqualCols;

            /* check input keys are correctly ordered */
            if (i < numberOfKeys && cur->sk_attno < attno)
                elog(ERROR, "btree index keys must be ordered by attribute");

            /*
             * If = has been specified, all other keys can be eliminated as
             * redundant.  If we have a case like key = 1 AND key > 2, we can
             * set qual_ok to false and abandon further processing.
             *
             * We also have to deal with the case of "key IS NULL", which is
             * unsatisfiable in combination with any other index condition. By
             * the time we get here, that's been classified as an equality
             * check, and we've rejected any combination of it with a regular
             * equality condition; but not with other types of conditions.
             */
            if (xform[BTEqualStrategyNumber - 1])
            {
                ScanKey     eq = xform[BTEqualStrategyNumber - 1];

                for (j = BTMaxStrategyNumber; --j >= 0;)
                {
                    ScanKey     chk = xform[j];

                    if (!chk || j == (BTEqualStrategyNumber - 1))
                        continue;

                    if (eq->sk_flags & SK_SEARCHNULL)
                    {
                        /* IS NULL is contradictory to anything else */
                        so->qual_ok = false;
                        return;
                    }

                    if (_bt_compare_scankey_args(scan, chk, eq, chk,
                                                 &test_result))
                    {
                        if (!test_result)
                        {
                            /* keys proven mutually contradictory */
                            so->qual_ok = false;
                            return;
                        }
                        /* else discard the redundant non-equality key */
                        xform[j] = NULL;
                    }
                    /* else, cannot determine redundancy, keep both keys */
                }
                /* track number of attrs for which we have "=" keys */
                numberOfEqualCols++;
            }

            /* try to keep only one of <, <= */
            if (xform[BTLessStrategyNumber - 1]
                && xform[BTLessEqualStrategyNumber - 1])
            {
                ScanKey     lt = xform[BTLessStrategyNumber - 1];
                ScanKey     le = xform[BTLessEqualStrategyNumber - 1];

                if (_bt_compare_scankey_args(scan, le, lt, le,
                                             &test_result))
                {
                    if (test_result)
                        xform[BTLessEqualStrategyNumber - 1] = NULL;
                    else
                        xform[BTLessStrategyNumber - 1] = NULL;
                }
            }

            /* try to keep only one of >, >= */
            if (xform[BTGreaterStrategyNumber - 1]
                && xform[BTGreaterEqualStrategyNumber - 1])
            {
                ScanKey     gt = xform[BTGreaterStrategyNumber - 1];
                ScanKey     ge = xform[BTGreaterEqualStrategyNumber - 1];

                if (_bt_compare_scankey_args(scan, ge, gt, ge,
                                             &test_result))
                {
                    if (test_result)
                        xform[BTGreaterEqualStrategyNumber - 1] = NULL;
                    else
                        xform[BTGreaterStrategyNumber - 1] = NULL;
                }
            }

            /*
             * Emit the cleaned-up keys into the outkeys[] array, and then
             * mark them if they are required.  They are required (possibly
             * only in one direction) if all attrs before this one had "=".
             */
            for (j = BTMaxStrategyNumber; --j >= 0;)
            {
                if (xform[j])
                {
                    ScanKey     outkey = &outkeys[new_numberOfKeys++];

                    memcpy(outkey, xform[j], sizeof(ScanKeyData));
                    if (priorNumberOfEqualCols == attno - 1)
                        _bt_mark_scankey_required(outkey);
                }
            }

            /*
             * Exit loop here if done.
             */
            if (i == numberOfKeys)
                break;

            /* Re-initialize for new attno */
            attno = cur->sk_attno;
            memset(xform, 0, sizeof(xform));
        }

        /* check strategy this key's operator corresponds to */
        j = cur->sk_strategy - 1;

        /* if row comparison, push it directly to the output array */
        if (cur->sk_flags & SK_ROW_HEADER)
        {
            ScanKey     outkey = &outkeys[new_numberOfKeys++];

            memcpy(outkey, cur, sizeof(ScanKeyData));
            if (numberOfEqualCols == attno - 1)
                _bt_mark_scankey_required(outkey);

            /*
             * We don't support RowCompare using equality; such a qual would
             * mess up the numberOfEqualCols tracking.
             */
            Assert(j != (BTEqualStrategyNumber - 1));
            continue;
        }

        /* have we seen one of these before? */
        if (xform[j] == NULL)
        {
            /* nope, so remember this scankey */
            xform[j] = cur;
        }
        else
        {
            /* yup, keep only the more restrictive key */
            if (_bt_compare_scankey_args(scan, cur, cur, xform[j],
                                         &test_result))
            {
                if (test_result)
                    xform[j] = cur;
                else if (j == (BTEqualStrategyNumber - 1))
                {
                    /* key == a && key == b, but a != b */
                    so->qual_ok = false;
                    return;
                }
                /* else old key is more restrictive, keep it */
            }
            else
            {
                /*
                 * We can't determine which key is more restrictive.  Keep the
                 * previous one in xform[j] and push this one directly to the
                 * output array.
                 */
                ScanKey     outkey = &outkeys[new_numberOfKeys++];

                memcpy(outkey, cur, sizeof(ScanKeyData));
                if (numberOfEqualCols == attno - 1)
                    _bt_mark_scankey_required(outkey);
            }
        }
    }

    so->numberOfKeys = new_numberOfKeys;
}

Buffer _bt_relandgetbuf	(	Relation	rel,
		Buffer	obuf,
		BlockNumber	blkno,
		int	access
	)

Definition at line 706 of file nbtpage.c.

References _bt_checkpage(), Assert, buf, BUFFER_LOCK_UNLOCK, BufferIsValid, LockBuffer(), P_NEW, and ReleaseAndReadBuffer().

Referenced by _bt_check_unique(), _bt_findinsertloc(), _bt_get_endpoint(), _bt_getroot(), _bt_gettrueroot(), _bt_moveright(), _bt_search(), and _bt_walk_left().

{
    Buffer      buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    LockBuffer(buf, access);
    _bt_checkpage(rel, buf);
    return buf;
}

void _bt_relbuf	(	Relation	rel,
		Buffer	buf
	)

Definition at line 725 of file nbtpage.c.

References UnlockReleaseBuffer().

Referenced by _bt_check_unique(), _bt_doinsert(), _bt_findinsertloc(), _bt_getbuf(), _bt_getroot(), _bt_getrootheight(), _bt_getstackbuf(), _bt_gettrueroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_newroot(), _bt_pagedel(), _bt_parent_deletion_safe(), _bt_split(), _bt_steppage(), _bt_walk_left(), btvacuumpage(), btvacuumscan(), and pgstat_btree_page().

{
    UnlockReleaseBuffer(buf);
}

void _bt_restore_array_keys ( IndexScanDesc scan )

Definition at line 623 of file nbtutils.c.

References _bt_preprocess_keys(), BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::mark_elem, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTScanOpaqueData::qual_ok, BTArrayKeyInfo::scan_key, and ScanKeyData::sk_argument.

Referenced by btrestrpos().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        changed = false;
    int         i;

    /* Restore each array key to its position when the mark was set */
    for (i = 0; i < so->numArrayKeys; i++)
    {
        BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
        ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];
        int         mark_elem = curArrayKey->mark_elem;

        if (curArrayKey->cur_elem != mark_elem)
        {
            curArrayKey->cur_elem = mark_elem;
            skey->sk_argument = curArrayKey->elem_values[mark_elem];
            changed = true;
        }
    }

    /*
     * If we changed any keys, we must redo _bt_preprocess_keys.  That might
     * sound like overkill, but in cases with multiple keys per index column
     * it seems necessary to do the full set of pushups.
     */
    if (changed)
    {
        _bt_preprocess_keys(scan);
        /* The mark should have been set on a consistent set of keys... */
        Assert(so->qual_ok);
    }
}

BTStack _bt_search	(	Relation	rel,
		int	keysz,
		ScanKey	scankey,
		bool	nextkey,
		Buffer *	bufP,
		int	access
	)

Definition at line 57 of file nbtsearch.c.

References _bt_binsrch(), _bt_getroot(), _bt_moveright(), _bt_relandgetbuf(), BT_READ, BTStackData::bts_blkno, BTStackData::bts_btentry, BTStackData::bts_offset, BTStackData::bts_parent, BufferGetBlockNumber(), BufferGetPage, BufferIsValid, ItemPointerGetBlockNumber, P_ISLEAF, PageGetItem, PageGetItemId, PageGetSpecialPointer, palloc(), and IndexTupleData::t_tid.

Referenced by _bt_doinsert(), _bt_first(), and _bt_pagedel().

{
    BTStack     stack_in = NULL;

    /* Get the root page to start with */
    *bufP = _bt_getroot(rel, access);

    /* If index is empty and access = BT_READ, no root page is created. */
    if (!BufferIsValid(*bufP))
        return (BTStack) NULL;

    /* Loop iterates once per level descended in the tree */
    for (;;)
    {
        Page        page;
        BTPageOpaque opaque;
        OffsetNumber offnum;
        ItemId      itemid;
        IndexTuple  itup;
        BlockNumber blkno;
        BlockNumber par_blkno;
        BTStack     new_stack;

        /*
         * Race -- the page we just grabbed may have split since we read its
         * pointer in the parent (or metapage).  If it has, we may need to
         * move right to its new sibling.  Do that.
         */
        *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);

        /* if this is a leaf page, we're done */
        page = BufferGetPage(*bufP);
        opaque = (BTPageOpaque) PageGetSpecialPointer(page);
        if (P_ISLEAF(opaque))
            break;

        /*
         * Find the appropriate item on the internal page, and get the child
         * page that it points to.
         */
        offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
        itemid = PageGetItemId(page, offnum);
        itup = (IndexTuple) PageGetItem(page, itemid);
        blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
        par_blkno = BufferGetBlockNumber(*bufP);

        /*
         * We need to save the location of the index entry we chose in the
         * parent page on a stack. In case we split the tree, we'll use the
         * stack to work back up to the parent page.  We also save the actual
         * downlink (TID) to uniquely identify the index entry, in case it
         * moves right while we're working lower in the tree.  See the paper
         * by Lehman and Yao for how this is detected and handled. (We use the
         * child link to disambiguate duplicate keys in the index -- Lehman
         * and Yao disallow duplicate keys.)
         */
        new_stack = (BTStack) palloc(sizeof(BTStackData));
        new_stack->bts_blkno = par_blkno;
        new_stack->bts_offset = offnum;
        memcpy(&new_stack->bts_btentry, itup, sizeof(IndexTupleData));
        new_stack->bts_parent = stack_in;

        /* drop the read lock on the parent page, acquire one on the child */
        *bufP = _bt_relandgetbuf(rel, *bufP, blkno, BT_READ);

        /* okay, all set to move down a level */
        stack_in = new_stack;
    }

    return stack_in;
}

void _bt_spool	(	IndexTuple	itup,
		BTSpool *	btspool
	)

Definition at line 188 of file nbtsort.c.

References BTSpool::sortstate, and tuplesort_putindextuple().

Referenced by btbuildCallback().

{
    tuplesort_putindextuple(btspool->sortstate, itup);
}

void _bt_spooldestroy ( BTSpool * btspool )

Definition at line 178 of file nbtsort.c.

References pfree(), BTSpool::sortstate, and tuplesort_end().

Referenced by btbuild().

{
    tuplesort_end(btspool->sortstate);
    pfree(btspool);
}

BTSpool* _bt_spoolinit	(	Relation	heap,
		Relation	index,
		bool	isunique,
		bool	isdead
	)

Definition at line 150 of file nbtsort.c.

References BTSpool::heap, BTSpool::index, BTSpool::isunique, maintenance_work_mem, palloc0(), BTSpool::sortstate, tuplesort_begin_index_btree(), and work_mem.

Referenced by btbuild().

{
    BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    int         btKbytes;

    btspool->heap = heap;
    btspool->index = index;
    btspool->isunique = isunique;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation.  This should be OK since a single backend can't
     * run multiple index creations in parallel.  Note that creation of a
     * unique index actually requires two BTSpool objects.  We expect that the
     * second one (for dead tuples) won't get very full, so we give it only
     * work_mem.
     */
    btKbytes = isdead ? work_mem : maintenance_work_mem;
    btspool->sortstate = tuplesort_begin_index_btree(heap, index, isunique,
                                                     btKbytes, false);

    return btspool;
}

void _bt_start_array_keys	(	IndexScanDesc	scan,
		ScanDirection	dir
	)

Definition at line 523 of file nbtutils.c.

References BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTArrayKeyInfo::cur_elem, BTArrayKeyInfo::elem_values, i, BTArrayKeyInfo::num_elems, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, BTArrayKeyInfo::scan_key, ScanDirectionIsBackward, and ScanKeyData::sk_argument.

Referenced by btgetbitmap(), and btgettuple().

{
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int         i;

    for (i = 0; i < so->numArrayKeys; i++)
    {
        BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i];
        ScanKey     skey = &so->arrayKeyData[curArrayKey->scan_key];

        Assert(curArrayKey->num_elems > 0);
        if (ScanDirectionIsBackward(dir))
            curArrayKey->cur_elem = curArrayKey->num_elems - 1;
        else
            curArrayKey->cur_elem = 0;
        skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem];
    }
}

BTCycleId _bt_start_vacuum ( Relation rel )

Definition at line 1882 of file nbtutils.c.

References BtreeVacuumLock, BTVacInfo::cycle_ctr, BTOneVacInfo::cycleid, LockRelId::dbId, elog, ERROR, i, LockInfoData::lockRelId, LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MAX_BT_CYCLE_ID, BTVacInfo::max_vacuums, BTVacInfo::num_vacuums, RelationData::rd_lockInfo, RelationGetRelationName, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by btbulkdelete().

{
    BTCycleId   result;
    int         i;
    BTOneVacInfo *vac;

    LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE);

    /*
     * Assign the next cycle ID, being careful to avoid zero as well as the
     * reserved high values.
     */
    result = ++(btvacinfo->cycle_ctr);
    if (result == 0 || result > MAX_BT_CYCLE_ID)
        result = btvacinfo->cycle_ctr = 1;

    /* Let's just make sure there's no entry already for this index */
    for (i = 0; i < btvacinfo->num_vacuums; i++)
    {
        vac = &btvacinfo->vacuums[i];
        if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            /*
             * Unlike most places in the backend, we have to explicitly
             * release our LWLock before throwing an error.  This is because
             * we expect _bt_end_vacuum() to be called before transaction
             * abort cleanup can run to release LWLocks.
             */
            LWLockRelease(BtreeVacuumLock);
            elog(ERROR, "multiple active vacuums for index \"%s\"",
                 RelationGetRelationName(rel));
        }
    }

    /* OK, add an entry */
    if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums)
    {
        LWLockRelease(BtreeVacuumLock);
        elog(ERROR, "out of btvacinfo slots");
    }
    vac = &btvacinfo->vacuums[btvacinfo->num_vacuums];
    vac->relid = rel->rd_lockInfo.lockRelId;
    vac->cycleid = result;
    btvacinfo->num_vacuums++;

    LWLockRelease(BtreeVacuumLock);
    return result;
}

BTCycleId _bt_vacuum_cycleid ( Relation rel )

Definition at line 1848 of file nbtutils.c.

References BtreeVacuumLock, BTOneVacInfo::cycleid, LockRelId::dbId, i, LockInfoData::lockRelId, LW_SHARED, LWLockAcquire(), LWLockRelease(), BTVacInfo::num_vacuums, RelationData::rd_lockInfo, LockRelId::relId, BTOneVacInfo::relid, and BTVacInfo::vacuums.

Referenced by _bt_split().

{
    BTCycleId   result = 0;
    int         i;

    /* Share lock is enough since this is a read-only operation */
    LWLockAcquire(BtreeVacuumLock, LW_SHARED);

    for (i = 0; i < btvacinfo->num_vacuums; i++)
    {
        BTOneVacInfo *vac = &btvacinfo->vacuums[i];

        if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId &&
            vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId)
        {
            result = vac->cycleid;
            break;
        }
    }

    LWLockRelease(BtreeVacuumLock);
    return result;
}

Datum btbeginscan ( PG_FUNCTION_ARGS )

Definition at line 406 of file nbtree.c.

References BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_POINTER, RelationGetDescr, RelationGetIndexScan(), and IndexScanDescData::xs_itupdesc.

{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    int         nkeys = PG_GETARG_INT32(1);
    int         norderbys = PG_GETARG_INT32(2);
    IndexScanDesc scan;
    BTScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    /* get the scan */
    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    /* allocate private workspace */
    so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
    so->currPos.buf = so->markPos.buf = InvalidBuffer;
    if (scan->numberOfKeys > 0)
        so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
    else
        so->keyData = NULL;

    so->arrayKeyData = NULL;    /* assume no array keys for now */
    so->numArrayKeys = 0;
    so->arrayKeys = NULL;
    so->arrayContext = NULL;

    so->killedItems = NULL;     /* until needed */
    so->numKilled = 0;

    /*
     * We don't know yet whether the scan will be index-only, so we do not
     * allocate the tuple workspace arrays until btrescan.  However, we set up
     * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     */
    so->currTuples = so->markTuples = NULL;
    so->currPos.nextTupleOffset = 0;
    so->markPos.nextTupleOffset = 0;

    scan->xs_itupdesc = RelationGetDescr(rel);

    scan->opaque = so;

    PG_RETURN_POINTER(scan);
}

Datum btbuild ( PG_FUNCTION_ARGS )

Definition at line 83 of file nbtree.c.

References _bt_leafbuild(), _bt_spooldestroy(), _bt_spoolinit(), btbuildCallback(), elog, ERROR, BTBuildState::haveDead, IndexBuildResult::heap_tuples, BTBuildState::heapRel, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, IndexBuildHeapScan(), BTBuildState::indtuples, BTBuildState::isUnique, log_btree_build_stats, palloc(), PG_GETARG_POINTER, PG_RETURN_POINTER, RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.

{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    double      reltuples;
    BTBuildState buildstate;

    buildstate.isUnique = indexInfo->ii_Unique;
    buildstate.haveDead = false;
    buildstate.heapRel = heap;
    buildstate.spool = NULL;
    buildstate.spool2 = NULL;
    buildstate.indtuples = 0;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
        ResetUsage();
#endif   /* BTREE_BUILD_STATS */

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);

    /*
     * If building a unique index, put dead tuples in a second spool to keep
     * them out of the uniqueness check.
     */
    if (indexInfo->ii_Unique)
        buildstate.spool2 = _bt_spoolinit(heap, index, false, true);

    /* do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   btbuildCallback, (void *) &buildstate);

    /* okay, all heap tuples are indexed */
    if (buildstate.spool2 && !buildstate.haveDead)
    {
        /* spool2 turns out to be unnecessary */
        _bt_spooldestroy(buildstate.spool2);
        buildstate.spool2 = NULL;
    }

    /*
     * Finish the build by (1) completing the sort of the spool file, (2)
     * inserting the sorted tuples into btree pages and (3) building the upper
     * levels.
     */
    _bt_leafbuild(buildstate.spool, buildstate.spool2);
    _bt_spooldestroy(buildstate.spool);
    if (buildstate.spool2)
        _bt_spooldestroy(buildstate.spool2);

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD STATS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    /*
     * If we are reindexing a pre-existing index, it is critical to send out a
     * relcache invalidation SI message to ensure all backends re-read the
     * index metapage.  We expect that the caller will ensure that happens
     * (typically as a side effect of updating index stats, but it must happen
     * even if the stats don't change!)
     */

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    PG_RETURN_POINTER(result);
}

Datum btbuildempty ( PG_FUNCTION_ARGS )

Definition at line 210 of file nbtree.c.

References _bt_initmetapage(), BTREE_METAPAGE, INIT_FORKNUM, log_newpage(), RelFileNodeBackend::node, P_NONE, PageSetChecksumInplace(), palloc(), PG_GETARG_POINTER, PG_RETURN_VOID, RelationData::rd_smgr, SMgrRelationData::smgr_rnode, smgrimmedsync(), smgrwrite(), and XLogIsNeeded.

{
    Relation    index = (Relation) PG_GETARG_POINTER(0);
    Page        metapage;

    /* Construct metapage. */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, P_NONE, 0);

    /* Write the page.  If archiving/streaming, XLOG it. */
    PageSetChecksumInplace(metapage, BTREE_METAPAGE);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
              (char *) metapage, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    BTREE_METAPAGE, metapage);

    /*
     * An immediate sync is require even if we xlog'd the page, because the
     * write did not go through shared_buffers and therefore a concurrent
     * checkpoint may have move the redo pointer past our xlog record.
     */
    smgrimmedsync(index->rd_smgr, INIT_FORKNUM);

    PG_RETURN_VOID();
}

Datum btbulkdelete ( PG_FUNCTION_ARGS )

Definition at line 653 of file nbtree.c.

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, NULL, palloc0(), PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, PG_GETARG_POINTER, PG_RETURN_POINTER, and PointerGetDatum.

{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
    void       *callback_state = (void *) PG_GETARG_POINTER(3);
    Relation    rel = info->index;
    BTCycleId   cycleid;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

    /* Establish the vacuum cycle ID to use for this scan */
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);

    PG_RETURN_POINTER(stats);
}

Datum btcanreturn ( PG_FUNCTION_ARGS )

Definition at line 1112 of file nbtree.c.

References PG_RETURN_BOOL.

{
    PG_RETURN_BOOL(true);
}

Datum btendscan ( PG_FUNCTION_ARGS )

Definition at line 523 of file nbtree.c.

References _bt_killitems(), BTScanOpaqueData::arrayContext, BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, MemoryContextDelete(), NULL, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, pfree(), PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    /* Release storage */
    if (so->keyData != NULL)
        pfree(so->keyData);
    /* so->arrayKeyData and so->arrayKeys are in arrayContext */
    if (so->arrayContext != NULL)
        MemoryContextDelete(so->arrayContext);
    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->currTuples != NULL)
        pfree(so->currTuples);
    /* so->markTuples should not be pfree'd, see btrescan */
    pfree(so);

    PG_RETURN_VOID();
}

Datum btgetbitmap ( PG_FUNCTION_ARGS )

Definition at line 346 of file nbtree.c.

References _bt_advance_array_keys(), _bt_first(), _bt_next(), _bt_start_array_keys(), BTScanOpaqueData::currPos, ForwardScanDirection, BTScanPosItem::heapTid, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_INT64, HeapTupleData::t_self, tbm_add_tuples(), and IndexScanDescData::xs_ctup.

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    /*
     * If we have any array keys, initialize them.
     */
    if (so->numArrayKeys)
    {
        /* punt if we have any unsatisfiable array keys */
        if (so->numArrayKeys < 0)
            PG_RETURN_INT64(ntids);

        _bt_start_array_keys(scan, ForwardScanDirection);
    }

    /* This loop handles advancing to the next array elements, if any */
    do
    {
        /* Fetch the first page & tuple */
        if (_bt_first(scan, ForwardScanDirection))
        {
            /* Save tuple ID, and continue scanning */
            heapTid = &scan->xs_ctup.t_self;
            tbm_add_tuples(tbm, heapTid, 1, false);
            ntids++;

            for (;;)
            {
                /*
                 * Advance to next tuple within page.  This is the same as the
                 * easy case in _bt_next().
                 */
                if (++so->currPos.itemIndex > so->currPos.lastItem)
                {
                    /* let _bt_next do the heavy lifting */
                    if (!_bt_next(scan, ForwardScanDirection))
                        break;
                }

                /* Save tuple ID, and continue scanning */
                heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
                tbm_add_tuples(tbm, heapTid, 1, false);
                ntids++;
            }
        }
        /* Now see if we have more array keys to deal with */
    } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

    PG_RETURN_INT64(ntids);
}

Datum btgettuple ( PG_FUNCTION_ARGS )

Definition at line 270 of file nbtree.c.

References _bt_advance_array_keys(), _bt_first(), _bt_next(), _bt_start_array_keys(), BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxIndexTuplesPerPage, NULL, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, and IndexScanDescData::xs_recheck.

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /*
     * If we have any array keys, initialize them during first call for a
     * scan.  We can't do this in btrescan because we don't know the scan
     * direction at that time.
     */
    if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
    {
        /* punt if we have any unsatisfiable array keys */
        if (so->numArrayKeys < 0)
            PG_RETURN_BOOL(false);

        _bt_start_array_keys(scan, dir);
    }

    /* This loop handles advancing to the next array elements, if any */
    do
    {
        /*
         * If we've already initialized this scan, we can just advance it in
         * the appropriate direction.  If we haven't done so yet, we call
         * _bt_first() to get the first item in the scan.
         */
        if (!BTScanPosIsValid(so->currPos))
            res = _bt_first(scan, dir);
        else
        {
            /*
             * Check to see if we should kill the previously-fetched tuple.
             */
            if (scan->kill_prior_tuple)
            {
                /*
                 * Yes, remember it for later. (We'll deal with all such
                 * tuples at once right before leaving the index page.)  The
                 * test for numKilled overrun is not just paranoia: if the
                 * caller reverses direction in the indexscan then the same
                 * item might get entered multiple times. It's not worth
                 * trying to optimize that, so we don't detect it, but instead
                 * just forget any excess entries.
                 */
                if (so->killedItems == NULL)
                    so->killedItems = (int *)
                        palloc(MaxIndexTuplesPerPage * sizeof(int));
                if (so->numKilled < MaxIndexTuplesPerPage)
                    so->killedItems[so->numKilled++] = so->currPos.itemIndex;
            }

            /*
             * Now continue the scan.
             */
            res = _bt_next(scan, dir);
        }

        /* If we have a tuple, return it ... */
        if (res)
            break;
        /* ... otherwise see if we have more array keys to deal with */
    } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

    PG_RETURN_BOOL(res);
}

Datum btinsert ( PG_FUNCTION_ARGS )

Definition at line 244 of file nbtree.c.

References _bt_doinsert(), index_form_tuple(), pfree(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, RelationGetDescr, IndexTupleData::t_tid, and values.

{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    Datum      *values = (Datum *) PG_GETARG_POINTER(1);
    bool       *isnull = (bool *) PG_GETARG_POINTER(2);
    ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
    Relation    heapRel = (Relation) PG_GETARG_POINTER(4);
    IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5);
    bool        result;
    IndexTuple  itup;

    /* generate an index tuple */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, heapRel);

    pfree(itup);

    PG_RETURN_BOOL(result);
}

Datum btmarkpos ( PG_FUNCTION_ARGS )

Definition at line 565 of file nbtree.c.

References _bt_mark_array_keys(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pin */
    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }

    /*
     * Just record the current itemIndex.  If we later step to next page
     * before releasing the marked position, _bt_steppage makes a full copy of
     * the currPos struct in markPos.  If (as often happens) the mark is moved
     * before we leave the page, we don't have to do that work.
     */
    if (BTScanPosIsValid(so->currPos))
        so->markItemIndex = so->currPos.itemIndex;
    else
        so->markItemIndex = -1;

    /* Also record the current positions of any array keys */
    if (so->numArrayKeys)
        _bt_mark_array_keys(scan);

    PG_RETURN_VOID();
}

Datum btoptions ( PG_FUNCTION_ARGS )

Definition at line 2017 of file nbtutils.c.

References default_reloptions(), PG_GETARG_BOOL, PG_GETARG_DATUM, PG_RETURN_BYTEA_P, PG_RETURN_NULL, and RELOPT_KIND_BTREE.

{
    Datum       reloptions = PG_GETARG_DATUM(0);
    bool        validate = PG_GETARG_BOOL(1);
    bytea      *result;

    result = default_reloptions(reloptions, validate, RELOPT_KIND_BTREE);
    if (result)
        PG_RETURN_BYTEA_P(result);
    PG_RETURN_NULL();
}

void btree_desc	(	StringInfo	buf,
		uint8	xl_info,
		char *	rec
	)

Definition at line 29 of file nbtdesc.c.

References appendStringInfo(), xl_btree_delete::block, xl_btree_vacuum::block, RelFileNode::dbNode, xl_btree_delete_page::deadblk, xl_btree_split::firstright, xl_btree_delete::hnode, xl_btree_vacuum::lastBlockVacuumed, xl_btree_reuse_page::latestRemovedXid, xl_btree_delete_page::leftblk, xl_btree_split::leftsib, xl_btree_newroot::level, xl_btree_split::level, xl_btree_reuse_page::node, xl_btree_newroot::node, xl_btree_delete::node, xl_btree_vacuum::node, xl_btree_split::node, out_target(), RelFileNode::relNode, xl_btree_delete_page::rightblk, xl_btree_split::rightsib, xl_btree_split::rnext, xl_btree_newroot::rootblk, RelFileNode::spcNode, xl_btree_delete_page::target, xl_btree_insert::target, XLOG_BTREE_DELETE, XLOG_BTREE_DELETE_PAGE, XLOG_BTREE_DELETE_PAGE_HALF, XLOG_BTREE_DELETE_PAGE_META, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_UPPER, XLOG_BTREE_NEWROOT, XLOG_BTREE_REUSE_PAGE, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_L_ROOT, XLOG_BTREE_SPLIT_R, XLOG_BTREE_SPLIT_R_ROOT, and XLOG_BTREE_VACUUM.

{
    uint8       info = xl_info & ~XLR_INFO_MASK;

    switch (info)
    {
        case XLOG_BTREE_INSERT_LEAF:
            {
                xl_btree_insert *xlrec = (xl_btree_insert *) rec;

                appendStringInfo(buf, "insert: ");
                out_target(buf, &(xlrec->target));
                break;
            }
        case XLOG_BTREE_INSERT_UPPER:
            {
                xl_btree_insert *xlrec = (xl_btree_insert *) rec;

                appendStringInfo(buf, "insert_upper: ");
                out_target(buf, &(xlrec->target));
                break;
            }
        case XLOG_BTREE_INSERT_META:
            {
                xl_btree_insert *xlrec = (xl_btree_insert *) rec;

                appendStringInfo(buf, "insert_meta: ");
                out_target(buf, &(xlrec->target));
                break;
            }
        case XLOG_BTREE_SPLIT_L:
            {
                xl_btree_split *xlrec = (xl_btree_split *) rec;

                appendStringInfo(buf, "split_l: rel %u/%u/%u ",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                 xlrec->node.relNode);
                appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
                               xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
                                 xlrec->level, xlrec->firstright);
                break;
            }
        case XLOG_BTREE_SPLIT_R:
            {
                xl_btree_split *xlrec = (xl_btree_split *) rec;

                appendStringInfo(buf, "split_r: rel %u/%u/%u ",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                 xlrec->node.relNode);
                appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
                               xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
                                 xlrec->level, xlrec->firstright);
                break;
            }
        case XLOG_BTREE_SPLIT_L_ROOT:
            {
                xl_btree_split *xlrec = (xl_btree_split *) rec;

                appendStringInfo(buf, "split_l_root: rel %u/%u/%u ",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                 xlrec->node.relNode);
                appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
                               xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
                                 xlrec->level, xlrec->firstright);
                break;
            }
        case XLOG_BTREE_SPLIT_R_ROOT:
            {
                xl_btree_split *xlrec = (xl_btree_split *) rec;

                appendStringInfo(buf, "split_r_root: rel %u/%u/%u ",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                 xlrec->node.relNode);
                appendStringInfo(buf, "left %u, right %u, next %u, level %u, firstright %d",
                               xlrec->leftsib, xlrec->rightsib, xlrec->rnext,
                                 xlrec->level, xlrec->firstright);
                break;
            }
        case XLOG_BTREE_VACUUM:
            {
                xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;

                appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                 xlrec->node.relNode, xlrec->block,
                                 xlrec->lastBlockVacuumed);
                break;
            }
        case XLOG_BTREE_DELETE:
            {
                xl_btree_delete *xlrec = (xl_btree_delete *) rec;

                appendStringInfo(buf, "delete: index %u/%u/%u; iblk %u, heap %u/%u/%u;",
                xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode,
                                 xlrec->block,
                                 xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode);
                break;
            }
        case XLOG_BTREE_DELETE_PAGE:
        case XLOG_BTREE_DELETE_PAGE_META:
        case XLOG_BTREE_DELETE_PAGE_HALF:
            {
                xl_btree_delete_page *xlrec = (xl_btree_delete_page *) rec;

                appendStringInfo(buf, "delete_page: ");
                out_target(buf, &(xlrec->target));
                appendStringInfo(buf, "; dead %u; left %u; right %u",
                            xlrec->deadblk, xlrec->leftblk, xlrec->rightblk);
                break;
            }
        case XLOG_BTREE_NEWROOT:
            {
                xl_btree_newroot *xlrec = (xl_btree_newroot *) rec;

                appendStringInfo(buf, "newroot: rel %u/%u/%u; root %u lev %u",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                                 xlrec->node.relNode,
                                 xlrec->rootblk, xlrec->level);
                break;
            }
        case XLOG_BTREE_REUSE_PAGE:
            {
                xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;

                appendStringInfo(buf, "reuse_page: rel %u/%u/%u; latestRemovedXid %u",
                                 xlrec->node.spcNode, xlrec->node.dbNode,
                               xlrec->node.relNode, xlrec->latestRemovedXid);
                break;
            }
        default:
            appendStringInfo(buf, "UNKNOWN");
            break;
    }
}

void btree_redo	(	XLogRecPtr	lsn,
		XLogRecord *	record
	)

Definition at line 1023 of file nbtxlog.c.

References btree_xlog_delete(), btree_xlog_delete_page(), btree_xlog_insert(), btree_xlog_newroot(), btree_xlog_reuse_page(), btree_xlog_split(), btree_xlog_vacuum(), elog, PANIC, XLogRecord::xl_info, XLOG_BTREE_DELETE, XLOG_BTREE_DELETE_PAGE, XLOG_BTREE_DELETE_PAGE_HALF, XLOG_BTREE_DELETE_PAGE_META, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_INSERT_META, XLOG_BTREE_INSERT_UPPER, XLOG_BTREE_NEWROOT, XLOG_BTREE_REUSE_PAGE, XLOG_BTREE_SPLIT_L, XLOG_BTREE_SPLIT_L_ROOT, XLOG_BTREE_SPLIT_R, XLOG_BTREE_SPLIT_R_ROOT, and XLOG_BTREE_VACUUM.

{
    uint8       info = record->xl_info & ~XLR_INFO_MASK;

    switch (info)
    {
        case XLOG_BTREE_INSERT_LEAF:
            btree_xlog_insert(true, false, lsn, record);
            break;
        case XLOG_BTREE_INSERT_UPPER:
            btree_xlog_insert(false, false, lsn, record);
            break;
        case XLOG_BTREE_INSERT_META:
            btree_xlog_insert(false, true, lsn, record);
            break;
        case XLOG_BTREE_SPLIT_L:
            btree_xlog_split(true, false, lsn, record);
            break;
        case XLOG_BTREE_SPLIT_R:
            btree_xlog_split(false, false, lsn, record);
            break;
        case XLOG_BTREE_SPLIT_L_ROOT:
            btree_xlog_split(true, true, lsn, record);
            break;
        case XLOG_BTREE_SPLIT_R_ROOT:
            btree_xlog_split(false, true, lsn, record);
            break;
        case XLOG_BTREE_VACUUM:
            btree_xlog_vacuum(lsn, record);
            break;
        case XLOG_BTREE_DELETE:
            btree_xlog_delete(lsn, record);
            break;
        case XLOG_BTREE_DELETE_PAGE:
        case XLOG_BTREE_DELETE_PAGE_META:
        case XLOG_BTREE_DELETE_PAGE_HALF:
            btree_xlog_delete_page(info, lsn, record);
            break;
        case XLOG_BTREE_NEWROOT:
            btree_xlog_newroot(lsn, record);
            break;
        case XLOG_BTREE_REUSE_PAGE:
            btree_xlog_reuse_page(lsn, record);
            break;
        default:
            elog(PANIC, "btree_redo: unknown op code %u", info);
    }
}

bool btree_safe_restartpoint ( void )

Definition at line 1141 of file nbtxlog.c.

{
    if (incomplete_actions)
        return false;
    return true;
}

void btree_xlog_cleanup ( void )

Definition at line 1079 of file nbtxlog.c.

References _bt_insert_parent(), _bt_pagedel(), buf, BufferGetPage, BufferIsValid, CreateFakeRelcacheEntry(), bt_incomplete_action::delblk, elog, FreeFakeRelcacheEntry(), bt_incomplete_action::is_root, bt_incomplete_action::is_split, bt_incomplete_action::leftblk, lfirst, bt_incomplete_action::node, NULL, P_LEFTMOST, P_RIGHTMOST, PageGetSpecialPointer, PANIC, bt_incomplete_action::rightblk, and XLogReadBuffer().

{
    ListCell   *l;

    foreach(l, incomplete_actions)
    {
        bt_incomplete_action *action = (bt_incomplete_action *) lfirst(l);

        if (action->is_split)
        {
            /* finish an incomplete split */
            Buffer      lbuf,
                        rbuf;
            Page        lpage,
                        rpage;
            BTPageOpaque lpageop,
                        rpageop;
            bool        is_only;
            Relation    reln;

            lbuf = XLogReadBuffer(action->node, action->leftblk, false);
            /* failure is impossible because we wrote this page earlier */
            if (!BufferIsValid(lbuf))
                elog(PANIC, "btree_xlog_cleanup: left block unfound");
            lpage = (Page) BufferGetPage(lbuf);
            lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
            rbuf = XLogReadBuffer(action->node, action->rightblk, false);
            /* failure is impossible because we wrote this page earlier */
            if (!BufferIsValid(rbuf))
                elog(PANIC, "btree_xlog_cleanup: right block unfound");
            rpage = (Page) BufferGetPage(rbuf);
            rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);

            /* if the pages are all of their level, it's a only-page split */
            is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);

            reln = CreateFakeRelcacheEntry(action->node);
            _bt_insert_parent(reln, lbuf, rbuf, NULL,
                              action->is_root, is_only);
            FreeFakeRelcacheEntry(reln);
        }
        else
        {
            /* finish an incomplete deletion (of a half-dead page) */
            Buffer      buf;

            buf = XLogReadBuffer(action->node, action->delblk, false);
            if (BufferIsValid(buf))
            {
                Relation    reln;

                reln = CreateFakeRelcacheEntry(action->node);
                if (_bt_pagedel(reln, buf, NULL) == 0)
                    elog(PANIC, "btree_xlog_cleanup: _bt_pagedel failed");
                FreeFakeRelcacheEntry(reln);
            }
        }
    }
    incomplete_actions = NIL;
}

void btree_xlog_startup ( void )

Definition at line 1073 of file nbtxlog.c.

{
    incomplete_actions = NIL;
}

void BTreeShmemInit ( void )

Definition at line 1989 of file nbtutils.c.

References Assert, BTreeShmemSize(), BTVacInfo::cycle_ctr, IsUnderPostmaster, BTVacInfo::max_vacuums, MaxBackends, NULL, BTVacInfo::num_vacuums, and ShmemInitStruct().

Referenced by CreateSharedMemoryAndSemaphores().

{
    bool        found;

    btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State",
                                              BTreeShmemSize(),
                                              &found);

    if (!IsUnderPostmaster)
    {
        /* Initialize shared memory area */
        Assert(!found);

        /*
         * It doesn't really matter what the cycle counter starts at, but
         * having it always start the same doesn't seem good.  Seed with
         * low-order bits of time() instead.
         */
        btvacinfo->cycle_ctr = (BTCycleId) time(NULL);

        btvacinfo->num_vacuums = 0;
        btvacinfo->max_vacuums = MaxBackends;
    }
    else
        Assert(found);
}

Size BTreeShmemSize ( void )

Definition at line 1976 of file nbtutils.c.

References add_size(), MaxBackends, mul_size(), and offsetof.

Referenced by BTreeShmemInit(), and CreateSharedMemoryAndSemaphores().

{
    Size        size;

    size = offsetof(BTVacInfo, vacuums[0]);
    size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo)));
    return size;
}

Datum btrescan ( PG_FUNCTION_ARGS )

Definition at line 456 of file nbtree.c.

References _bt_killitems(), _bt_preprocess_array_keys(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IndexScanDescData::keyData, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, memmove, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), PG_GETARG_POINTER, PG_RETURN_VOID, ReleaseBuffer(), and IndexScanDescData::xs_want_itup.

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(1);

    /* remaining arguments are ignored */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    /*
     * Allocate tuple workspace arrays, if needed for an index-only scan and
     * not already done in a previous rescan call.  To save on palloc
     * overhead, both workspaces are allocated as one palloc block; only this
     * function and btendscan know that.
     *
     * NOTE: this data structure also makes it safe to return data from a
     * "name" column, even though btree name_ops uses an underlying storage
     * datatype of cstring.  The risk there is that "name" is supposed to be
     * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
     * However, since we only return data out of tuples sitting in the
     * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
     * data out of the markTuples array --- running off the end of memory for
     * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
     * adding special-case treatment for name_ops elsewhere.
     */
    if (scan->xs_want_itup && so->currTuples == NULL)
    {
        so->currTuples = (char *) palloc(BLCKSZ * 2);
        so->markTuples = so->currTuples + BLCKSZ;
    }

    /*
     * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
     * - vadim 05/05/97
     */
    if (scankey && scan->numberOfKeys > 0)
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */

    /* If any keys are SK_SEARCHARRAY type, set up array-key info */
    _bt_preprocess_array_keys(scan);

    PG_RETURN_VOID();
}

Datum btrestrpos ( PG_FUNCTION_ARGS )

Definition at line 599 of file nbtree.c.

References _bt_killitems(), _bt_restore_array_keys(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* Restore the marked positions of any array keys */
    if (so->numArrayKeys)
        _bt_restore_array_keys(scan);

    if (so->markItemIndex >= 0)
    {
        /*
         * The mark position is on the same page we are currently on. Just
         * restore the itemIndex.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /* we aren't holding any read locks, but gotta drop the pin */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0 &&
                so->currPos.buf != so->markPos.buf)
                _bt_killitems(scan, false);
            ReleaseBuffer(so->currPos.buf);
            so->currPos.buf = InvalidBuffer;
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            IncrBufferRefCount(so->markPos.buf);
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
            if (so->currTuples)
                memcpy(so->currTuples, so->markTuples,
                       so->markPos.nextTupleOffset);
        }
    }

    PG_RETURN_VOID();
}

Datum btvacuumcleanup ( PG_FUNCTION_ARGS )

Definition at line 686 of file nbtree.c.

References IndexVacuumInfo::analyze_only, btvacuumscan(), IndexVacuumInfo::estimated_count, IndexVacuumInfo::index, IndexFreeSpaceMapVacuum(), NULL, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, palloc0(), PG_GETARG_POINTER, and PG_RETURN_POINTER.

{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        PG_RETURN_POINTER(stats);

    /*
     * If btbulkdelete was called, we need not do anything, just return the
     * stats from the latest btbulkdelete call.  If it wasn't called, we must
     * still do a pass over the index, to recycle any newly-recyclable pages
     * and to obtain index statistics.
     *
     * Since we aren't going to actually delete any leaf items, there's no
     * need to go through all the vacuum-cycle-ID pushups.
     */
    if (stats == NULL)
    {
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        btvacuumscan(info, stats, NULL, NULL, 0);
    }

    /* Finally, vacuum the FSM */
    IndexFreeSpaceMapVacuum(info->index);

    /*
     * It's quite possible for us to be fooled by concurrent page splits into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
    if (!info->estimated_count)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    PG_RETURN_POINTER(stats);
}

Header And Logo

nbtree.h File Reference

Data Structures

Defines

Typedefs

Functions

Define Documentation

Typedef Documentation

Function Documentation