nbtree.c File Reference

#include "postgres.h"
#include "access/heapam_xlog.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "storage/indexfsm.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/memutils.h"

Data Structures

struct  BTBuildState
struct  BTVacState

Functions

static void btbuildCallback (Relation index, HeapTuple htup, Datum *values, bool *isnull, bool tupleIsAlive, void *state)
static void btvacuumscan (IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state, BTCycleId cycleid)
static void btvacuumpage (BTVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
Datum btbuild (PG_FUNCTION_ARGS)
Datum btbuildempty (PG_FUNCTION_ARGS)
Datum btinsert (PG_FUNCTION_ARGS)
Datum btgettuple (PG_FUNCTION_ARGS)
Datum btgetbitmap (PG_FUNCTION_ARGS)
Datum btbeginscan (PG_FUNCTION_ARGS)
Datum btrescan (PG_FUNCTION_ARGS)
Datum btendscan (PG_FUNCTION_ARGS)
Datum btmarkpos (PG_FUNCTION_ARGS)
Datum btrestrpos (PG_FUNCTION_ARGS)
Datum btbulkdelete (PG_FUNCTION_ARGS)
Datum btvacuumcleanup (PG_FUNCTION_ARGS)
Datum btcanreturn (PG_FUNCTION_ARGS)

Function Documentation

Datum btbeginscan ( PG_FUNCTION_ARGS   ) 

Definition at line 406 of file nbtree.c.

References BTScanOpaqueData::arrayContext, BTScanOpaqueData::arrayKeyData, BTScanOpaqueData::arrayKeys, Assert, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_POINTER, RelationGetDescr, RelationGetIndexScan(), and IndexScanDescData::xs_itupdesc.

{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    int         nkeys = PG_GETARG_INT32(1);
    int         norderbys = PG_GETARG_INT32(2);
    IndexScanDesc scan;
    BTScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    /* get the scan */
    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    /* allocate private workspace */
    so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
    so->currPos.buf = so->markPos.buf = InvalidBuffer;
    if (scan->numberOfKeys > 0)
        so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
    else
        so->keyData = NULL;

    so->arrayKeyData = NULL;    /* assume no array keys for now */
    so->numArrayKeys = 0;
    so->arrayKeys = NULL;
    so->arrayContext = NULL;

    so->killedItems = NULL;     /* until needed */
    so->numKilled = 0;

    /*
     * We don't know yet whether the scan will be index-only, so we do not
     * allocate the tuple workspace arrays until btrescan.  However, we set up
     * scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
     */
    so->currTuples = so->markTuples = NULL;
    so->currPos.nextTupleOffset = 0;
    so->markPos.nextTupleOffset = 0;

    scan->xs_itupdesc = RelationGetDescr(rel);

    scan->opaque = so;

    PG_RETURN_POINTER(scan);
}
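
As a point of reference, the sketch below shows roughly how a caller (the generic indexam layer) would reach btbeginscan through the fmgr interface. The wrapper name and the pre-looked-up FmgrInfo are assumptions for illustration; the real caller caches procedure information obtained from the index's pg_am entry.

#include "postgres.h"
#include "fmgr.h"
#include "access/relscan.h"
#include "utils/rel.h"

/*
 * Hypothetical wrapper: call ambeginscan (here, btbeginscan) through the
 * fmgr calling convention, the way the generic indexam layer does.
 */
static IndexScanDesc
example_beginscan(Relation indexRelation, FmgrInfo *ambeginscan,
                  int nkeys, int norderbys)
{
    Datum       result;

    result = FunctionCall3(ambeginscan,
                           PointerGetDatum(indexRelation),
                           Int32GetDatum(nkeys),
                           Int32GetDatum(norderbys));
    return (IndexScanDesc) DatumGetPointer(result);
}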

Datum btbuild ( PG_FUNCTION_ARGS   ) 

Definition at line 83 of file nbtree.c.

References _bt_leafbuild(), _bt_spooldestroy(), _bt_spoolinit(), btbuildCallback(), elog, ERROR, BTBuildState::haveDead, IndexBuildResult::heap_tuples, BTBuildState::heapRel, IndexInfo::ii_Unique, IndexBuildResult::index_tuples, IndexBuildHeapScan(), BTBuildState::indtuples, BTBuildState::isUnique, log_btree_build_stats, palloc(), PG_GETARG_POINTER, PG_RETURN_POINTER, RelationGetNumberOfBlocks, RelationGetRelationName, ResetUsage(), ShowUsage(), BTBuildState::spool, and BTBuildState::spool2.

{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    double      reltuples;
    BTBuildState buildstate;

    buildstate.isUnique = indexInfo->ii_Unique;
    buildstate.haveDead = false;
    buildstate.heapRel = heap;
    buildstate.spool = NULL;
    buildstate.spool2 = NULL;
    buildstate.indtuples = 0;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
        ResetUsage();
#endif   /* BTREE_BUILD_STATS */

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    buildstate.spool = _bt_spoolinit(heap, index, indexInfo->ii_Unique, false);

    /*
     * If building a unique index, put dead tuples in a second spool to keep
     * them out of the uniqueness check.
     */
    if (indexInfo->ii_Unique)
        buildstate.spool2 = _bt_spoolinit(heap, index, false, true);

    /* do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   btbuildCallback, (void *) &buildstate);

    /* okay, all heap tuples are indexed */
    if (buildstate.spool2 && !buildstate.haveDead)
    {
        /* spool2 turns out to be unnecessary */
        _bt_spooldestroy(buildstate.spool2);
        buildstate.spool2 = NULL;
    }

    /*
     * Finish the build by (1) completing the sort of the spool file, (2)
     * inserting the sorted tuples into btree pages and (3) building the upper
     * levels.
     */
    _bt_leafbuild(buildstate.spool, buildstate.spool2);
    _bt_spooldestroy(buildstate.spool);
    if (buildstate.spool2)
        _bt_spooldestroy(buildstate.spool2);

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD STATS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    /*
     * If we are reindexing a pre-existing index, it is critical to send out a
     * relcache invalidation SI message to ensure all backends re-read the
     * index metapage.  We expect that the caller will ensure that happens
     * (typically as a side effect of updating index stats, but it must happen
     * even if the stats don't change!)
     */

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    PG_RETURN_POINTER(result);
}
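
For orientation, a hedged sketch of the caller's side: dispatching ambuild (here, btbuild) through fmgr, roughly as index_build does, and reading the tuple counts out of the returned IndexBuildResult. The wrapper name and the already-initialized FmgrInfo are assumptions.

#include "postgres.h"
#include "fmgr.h"
#include "access/genam.h"
#include "nodes/execnodes.h"
#include "utils/rel.h"

/* Hypothetical wrapper: call ambuild and report the counts it returns. */
static IndexBuildResult *
example_call_ambuild(FmgrInfo *ambuild, Relation heap, Relation index,
                     IndexInfo *indexInfo)
{
    IndexBuildResult *result;

    result = (IndexBuildResult *)
        DatumGetPointer(FunctionCall3(ambuild,
                                      PointerGetDatum(heap),
                                      PointerGetDatum(index),
                                      PointerGetDatum(indexInfo)));

    elog(DEBUG1, "built index \"%s\": %.0f heap tuples, %.0f index tuples",
         RelationGetRelationName(index),
         result->heap_tuples, result->index_tuples);

    return result;
}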

static void btbuildCallback ( Relation  index,
HeapTuple  htup,
Datum *  values,
bool *  isnull,
bool  tupleIsAlive,
void *  state 
) [static]

Definition at line 174 of file nbtree.c.

References _bt_spool(), BTBuildState::haveDead, index_form_tuple(), BTBuildState::indtuples, NULL, pfree(), RelationGetDescr, BTBuildState::spool, BTBuildState::spool2, HeapTupleData::t_self, and IndexTupleData::t_tid.

Referenced by btbuild().

{
    BTBuildState *buildstate = (BTBuildState *) state;
    IndexTuple  itup;

    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(index), values, isnull);
    itup->t_tid = htup->t_self;

    /*
     * insert the index tuple into the appropriate spool file for subsequent
     * processing
     */
    if (tupleIsAlive || buildstate->spool2 == NULL)
        _bt_spool(itup, buildstate->spool);
    else
    {
        /* dead tuples are put into spool2 */
        buildstate->haveDead = true;
        _bt_spool(itup, buildstate->spool2);
    }

    buildstate->indtuples += 1;

    pfree(itup);
}
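
This callback must match the IndexBuildCallback type expected by IndexBuildHeapScan; for reference, the typedef declared in catalog/index.h (included above) is:

typedef void (*IndexBuildCallback) (Relation index,
                                    HeapTuple htup,
                                    Datum *values,
                                    bool *isnull,
                                    bool tupleIsAlive,
                                    void *state);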

Datum btbuildempty ( PG_FUNCTION_ARGS   ) 

Definition at line 210 of file nbtree.c.

References _bt_initmetapage(), BTREE_METAPAGE, INIT_FORKNUM, log_newpage(), RelFileNodeBackend::node, P_NONE, PageSetChecksumInplace(), palloc(), PG_GETARG_POINTER, PG_RETURN_VOID, RelationData::rd_smgr, SMgrRelationData::smgr_rnode, smgrimmedsync(), smgrwrite(), and XLogIsNeeded.

{
    Relation    index = (Relation) PG_GETARG_POINTER(0);
    Page        metapage;

    /* Construct metapage. */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, P_NONE, 0);

    /* Write the page.  If archiving/streaming, XLOG it. */
    PageSetChecksumInplace(metapage, BTREE_METAPAGE);
    smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
              (char *) metapage, true);
    if (XLogIsNeeded())
        log_newpage(&index->rd_smgr->smgr_rnode.node, INIT_FORKNUM,
                    BTREE_METAPAGE, metapage);

    /*
     * An immediate sync is required even if we xlog'd the page, because the
     * write did not go through shared_buffers and therefore a concurrent
     * checkpoint may have moved the redo pointer past our xlog record.
     */
    smgrimmedsync(index->rd_smgr, INIT_FORKNUM);

    PG_RETURN_VOID();
}

Datum btbulkdelete ( PG_FUNCTION_ARGS   ) 

Definition at line 653 of file nbtree.c.

References _bt_end_vacuum(), _bt_end_vacuum_callback(), _bt_start_vacuum(), btvacuumscan(), callback(), IndexVacuumInfo::index, NULL, palloc0(), PG_END_ENSURE_ERROR_CLEANUP, PG_ENSURE_ERROR_CLEANUP, PG_GETARG_POINTER, PG_RETURN_POINTER, and PointerGetDatum.

{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *volatile stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
    void       *callback_state = (void *) PG_GETARG_POINTER(3);
    Relation    rel = info->index;
    BTCycleId   cycleid;

    /* allocate stats if first time through, else re-use existing struct */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));

    /* Establish the vacuum cycle ID to use for this scan */
    /* The ENSURE stuff ensures we clean up shared memory on failure */
    PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    {
        cycleid = _bt_start_vacuum(rel);

        btvacuumscan(info, stats, callback, callback_state, cycleid);
    }
    PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
    _bt_end_vacuum(rel);

    PG_RETURN_POINTER(stats);
}
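
The callback handed to btbulkdelete must have the IndexBulkDeleteCallback shape declared in access/genam.h, i.e. bool (*)(ItemPointer itemptr, void *state). The sketch below is a purely hypothetical callback that flags for deletion every index entry whose heap TID appears in a caller-maintained list; the ExampleDeadList type and the linear search are assumptions, not part of nbtree.

#include "postgres.h"
#include "access/genam.h"
#include "storage/itemptr.h"

/* Hypothetical state passed through as callback_state. */
typedef struct ExampleDeadList
{
    int         ndead;
    ItemPointerData *dead_tids;
} ExampleDeadList;

/* Matches IndexBulkDeleteCallback: return true to delete this index entry. */
static bool
example_tid_reaped(ItemPointer itemptr, void *state)
{
    ExampleDeadList *list = (ExampleDeadList *) state;
    int         i;

    for (i = 0; i < list->ndead; i++)
    {
        if (ItemPointerEquals(itemptr, &list->dead_tids[i]))
            return true;        /* btvacuumpage will mark it deletable */
    }
    return false;
}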

Datum btcanreturn ( PG_FUNCTION_ARGS   ) 

Definition at line 1112 of file nbtree.c.

References PG_RETURN_BOOL.

{
    PG_RETURN_BOOL(true);
}

Datum btendscan ( PG_FUNCTION_ARGS   ) 

Definition at line 523 of file nbtree.c.

References _bt_killitems(), BTScanOpaqueData::arrayContext, BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, BTScanOpaqueData::keyData, BTScanOpaqueData::killedItems, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, MemoryContextDelete(), NULL, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, pfree(), PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    /* Release storage */
    if (so->keyData != NULL)
        pfree(so->keyData);
    /* so->arrayKeyData and so->arrayKeys are in arrayContext */
    if (so->arrayContext != NULL)
        MemoryContextDelete(so->arrayContext);
    if (so->killedItems != NULL)
        pfree(so->killedItems);
    if (so->currTuples != NULL)
        pfree(so->currTuples);
    /* so->markTuples should not be pfree'd, see btrescan */
    pfree(so);

    PG_RETURN_VOID();
}

Datum btgetbitmap ( PG_FUNCTION_ARGS   ) 

Definition at line 346 of file nbtree.c.

References _bt_advance_array_keys(), _bt_first(), _bt_next(), _bt_start_array_keys(), BTScanOpaqueData::currPos, ForwardScanDirection, BTScanPosItem::heapTid, BTScanPosData::itemIndex, BTScanPosData::items, BTScanPosData::lastItem, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_INT64, HeapTupleData::t_self, tbm_add_tuples(), and IndexScanDescData::xs_ctup.

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    int64       ntids = 0;
    ItemPointer heapTid;

    /*
     * If we have any array keys, initialize them.
     */
    if (so->numArrayKeys)
    {
        /* punt if we have any unsatisfiable array keys */
        if (so->numArrayKeys < 0)
            PG_RETURN_INT64(ntids);

        _bt_start_array_keys(scan, ForwardScanDirection);
    }

    /* This loop handles advancing to the next array elements, if any */
    do
    {
        /* Fetch the first page & tuple */
        if (_bt_first(scan, ForwardScanDirection))
        {
            /* Save tuple ID, and continue scanning */
            heapTid = &scan->xs_ctup.t_self;
            tbm_add_tuples(tbm, heapTid, 1, false);
            ntids++;

            for (;;)
            {
                /*
                 * Advance to next tuple within page.  This is the same as the
                 * easy case in _bt_next().
                 */
                if (++so->currPos.itemIndex > so->currPos.lastItem)
                {
                    /* let _bt_next do the heavy lifting */
                    if (!_bt_next(scan, ForwardScanDirection))
                        break;
                }

                /* Save tuple ID, and continue scanning */
                heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid;
                tbm_add_tuples(tbm, heapTid, 1, false);
                ntids++;
            }
        }
        /* Now see if we have more array keys to deal with */
    } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));

    PG_RETURN_INT64(ntids);
}
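
A minimal caller-side sketch, assuming an already-begun scan and a looked-up FmgrInfo: create a TIDBitmap and let btgetbitmap fill it, roughly what index_getbitmap() in indexam.c does. The wrapper name and the maxbytes parameter are illustrative.

#include "postgres.h"
#include "fmgr.h"
#include "access/relscan.h"
#include "nodes/tidbitmap.h"

/* Hypothetical wrapper: fill a fresh TIDBitmap from an already-begun scan. */
static TIDBitmap *
example_build_bitmap(IndexScanDesc scan, FmgrInfo *amgetbitmap, long maxbytes)
{
    TIDBitmap  *tbm = tbm_create(maxbytes);
    int64       ntids;

    ntids = DatumGetInt64(FunctionCall2(amgetbitmap,
                                        PointerGetDatum(scan),
                                        PointerGetDatum(tbm)));

    elog(DEBUG1, "bitmap scan matched " INT64_FORMAT " index entries", ntids);
    return tbm;
}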

Datum btgettuple ( PG_FUNCTION_ARGS   ) 

Definition at line 270 of file nbtree.c.

References _bt_advance_array_keys(), _bt_first(), _bt_next(), _bt_start_array_keys(), BTScanPosIsValid, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, IndexScanDescData::kill_prior_tuple, BTScanOpaqueData::killedItems, MaxIndexTuplesPerPage, NULL, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, and IndexScanDescData::xs_recheck.

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;
    bool        res;

    /* btree indexes are never lossy */
    scan->xs_recheck = false;

    /*
     * If we have any array keys, initialize them during first call for a
     * scan.  We can't do this in btrescan because we don't know the scan
     * direction at that time.
     */
    if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
    {
        /* punt if we have any unsatisfiable array keys */
        if (so->numArrayKeys < 0)
            PG_RETURN_BOOL(false);

        _bt_start_array_keys(scan, dir);
    }

    /* This loop handles advancing to the next array elements, if any */
    do
    {
        /*
         * If we've already initialized this scan, we can just advance it in
         * the appropriate direction.  If we haven't done so yet, we call
         * _bt_first() to get the first item in the scan.
         */
        if (!BTScanPosIsValid(so->currPos))
            res = _bt_first(scan, dir);
        else
        {
            /*
             * Check to see if we should kill the previously-fetched tuple.
             */
            if (scan->kill_prior_tuple)
            {
                /*
                 * Yes, remember it for later. (We'll deal with all such
                 * tuples at once right before leaving the index page.)  The
                 * test for numKilled overrun is not just paranoia: if the
                 * caller reverses direction in the indexscan then the same
                 * item might get entered multiple times. It's not worth
                 * trying to optimize that, so we don't detect it, but instead
                 * just forget any excess entries.
                 */
                if (so->killedItems == NULL)
                    so->killedItems = (int *)
                        palloc(MaxIndexTuplesPerPage * sizeof(int));
                if (so->numKilled < MaxIndexTuplesPerPage)
                    so->killedItems[so->numKilled++] = so->currPos.itemIndex;
            }

            /*
             * Now continue the scan.
             */
            res = _bt_next(scan, dir);
        }

        /* If we have a tuple, return it ... */
        if (res)
            break;
        /* ... otherwise see if we have more array keys to deal with */
    } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));

    PG_RETURN_BOOL(res);
}
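
For context, a minimal sketch of the fetch loop a caller would drive against amgettuple (roughly the job of indexam.c); the wrapper name and the pre-looked-up FmgrInfo are assumptions.

#include "postgres.h"
#include "fmgr.h"
#include "access/relscan.h"
#include "access/sdir.h"

/* Hypothetical wrapper: count how many index entries a forward scan returns. */
static int64
example_count_matches(IndexScanDesc scan, FmgrInfo *amgettuple)
{
    int64       ntuples = 0;

    for (;;)
    {
        bool        found;

        found = DatumGetBool(FunctionCall2(amgettuple,
                                           PointerGetDatum(scan),
                                           Int32GetDatum(ForwardScanDirection)));
        if (!found)
            break;

        /* scan->xs_ctup.t_self now holds the heap TID of the current match */
        ntuples++;
    }

    return ntuples;
}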

Datum btinsert ( PG_FUNCTION_ARGS   ) 

Definition at line 244 of file nbtree.c.

References _bt_doinsert(), index_form_tuple(), pfree(), PG_GETARG_INT32, PG_GETARG_POINTER, PG_RETURN_BOOL, RelationGetDescr, IndexTupleData::t_tid, and values.

{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    Datum      *values = (Datum *) PG_GETARG_POINTER(1);
    bool       *isnull = (bool *) PG_GETARG_POINTER(2);
    ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
    Relation    heapRel = (Relation) PG_GETARG_POINTER(4);
    IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5);
    bool        result;
    IndexTuple  itup;

    /* generate an index tuple */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, heapRel);

    pfree(itup);

    PG_RETURN_BOOL(result);
}
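
The matching caller-side dispatch, roughly what index_insert does, is sketched below; the six fmgr arguments mirror the PG_GETARG_* unpacking above, and the wrapper name is an assumption.

#include "postgres.h"
#include "fmgr.h"
#include "access/genam.h"
#include "storage/itemptr.h"
#include "utils/rel.h"

/* Hypothetical wrapper: insert one already-formed index entry via aminsert. */
static bool
example_call_aminsert(FmgrInfo *aminsert, Relation indexRelation,
                      Datum *values, bool *isnull, ItemPointer heap_t_ctid,
                      Relation heapRelation, IndexUniqueCheck checkUnique)
{
    return DatumGetBool(FunctionCall6(aminsert,
                                      PointerGetDatum(indexRelation),
                                      PointerGetDatum(values),
                                      PointerGetDatum(isnull),
                                      PointerGetDatum(heap_t_ctid),
                                      PointerGetDatum(heapRelation),
                                      Int32GetDatum((int32) checkUnique)));
}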

Datum btmarkpos ( PG_FUNCTION_ARGS   ) 

Definition at line 565 of file nbtree.c.

References _bt_mark_array_keys(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanPosData::itemIndex, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::numArrayKeys, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pin */
    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }

    /*
     * Just record the current itemIndex.  If we later step to the next page
     * before releasing the marked position, _bt_steppage makes a full copy of
     * the currPos struct in markPos.  If (as often happens) the mark is moved
     * before we leave the page, we don't have to do that work.
     */
    if (BTScanPosIsValid(so->currPos))
        so->markItemIndex = so->currPos.itemIndex;
    else
        so->markItemIndex = -1;

    /* Also record the current positions of any array keys */
    if (so->numArrayKeys)
        _bt_mark_array_keys(scan);

    PG_RETURN_VOID();
}

Datum btrescan ( PG_FUNCTION_ARGS   ) 

Definition at line 456 of file nbtree.c.

References _bt_killitems(), _bt_preprocess_array_keys(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IndexScanDescData::keyData, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, memmove, NULL, BTScanOpaqueData::numberOfKeys, IndexScanDescData::numberOfKeys, BTScanOpaqueData::numKilled, IndexScanDescData::opaque, palloc(), PG_GETARG_POINTER, PG_RETURN_VOID, ReleaseBuffer(), and IndexScanDescData::xs_want_itup.

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(1);

    /* remaining arguments are ignored */
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* we aren't holding any read locks, but gotta drop the pins */
    if (BTScanPosIsValid(so->currPos))
    {
        /* Before leaving current page, deal with any killed items */
        if (so->numKilled > 0)
            _bt_killitems(scan, false);
        ReleaseBuffer(so->currPos.buf);
        so->currPos.buf = InvalidBuffer;
    }

    if (BTScanPosIsValid(so->markPos))
    {
        ReleaseBuffer(so->markPos.buf);
        so->markPos.buf = InvalidBuffer;
    }
    so->markItemIndex = -1;

    /*
     * Allocate tuple workspace arrays, if needed for an index-only scan and
     * not already done in a previous rescan call.  To save on palloc
     * overhead, both workspaces are allocated as one palloc block; only this
     * function and btendscan know that.
     *
     * NOTE: this data structure also makes it safe to return data from a
     * "name" column, even though btree name_ops uses an underlying storage
     * datatype of cstring.  The risk there is that "name" is supposed to be
     * padded to NAMEDATALEN, but the actual index tuple is probably shorter.
     * However, since we only return data out of tuples sitting in the
     * currTuples array, a fetch of NAMEDATALEN bytes can at worst pull some
     * data out of the markTuples array --- running off the end of memory for
     * a SIGSEGV is not possible.  Yeah, this is ugly as sin, but it beats
     * adding special-case treatment for name_ops elsewhere.
     */
    if (scan->xs_want_itup && so->currTuples == NULL)
    {
        so->currTuples = (char *) palloc(BLCKSZ * 2);
        so->markTuples = so->currTuples + BLCKSZ;
    }

    /*
     * Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
     * - vadim 05/05/97
     */
    if (scankey && scan->numberOfKeys > 0)
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
    so->numberOfKeys = 0;       /* until _bt_preprocess_keys sets it */

    /* If any keys are SK_SEARCHARRAY type, set up array-key info */
    _bt_preprocess_array_keys(scan);

    PG_RETURN_VOID();
}

Datum btrestrpos ( PG_FUNCTION_ARGS   ) 

Definition at line 599 of file nbtree.c.

References _bt_killitems(), _bt_restore_array_keys(), BTScanPosIsValid, BTScanPosData::buf, BTScanOpaqueData::currPos, BTScanOpaqueData::currTuples, IncrBufferRefCount(), BTScanPosData::itemIndex, BTScanPosData::lastItem, BTScanOpaqueData::markItemIndex, BTScanOpaqueData::markPos, BTScanOpaqueData::markTuples, BTScanPosData::nextTupleOffset, BTScanOpaqueData::numArrayKeys, BTScanOpaqueData::numKilled, offsetof, IndexScanDescData::opaque, PG_GETARG_POINTER, PG_RETURN_VOID, and ReleaseBuffer().

{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    BTScanOpaque so = (BTScanOpaque) scan->opaque;

    /* Restore the marked positions of any array keys */
    if (so->numArrayKeys)
        _bt_restore_array_keys(scan);

    if (so->markItemIndex >= 0)
    {
        /*
         * The mark position is on the same page we are currently on. Just
         * restore the itemIndex.
         */
        so->currPos.itemIndex = so->markItemIndex;
    }
    else
    {
        /* we aren't holding any read locks, but gotta drop the pin */
        if (BTScanPosIsValid(so->currPos))
        {
            /* Before leaving current page, deal with any killed items */
            if (so->numKilled > 0 &&
                so->currPos.buf != so->markPos.buf)
                _bt_killitems(scan, false);
            ReleaseBuffer(so->currPos.buf);
            so->currPos.buf = InvalidBuffer;
        }

        if (BTScanPosIsValid(so->markPos))
        {
            /* bump pin on mark buffer for assignment to current buffer */
            IncrBufferRefCount(so->markPos.buf);
            memcpy(&so->currPos, &so->markPos,
                   offsetof(BTScanPosData, items[1]) +
                   so->markPos.lastItem * sizeof(BTScanPosItem));
            if (so->currTuples)
                memcpy(so->currTuples, so->markTuples,
                       so->markPos.nextTupleOffset);
        }
    }

    PG_RETURN_VOID();
}
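
A small hedged sketch of how mark and restore are driven from outside, for example by a merge join via index_markpos/index_restrpos; the helper names and FmgrInfo arguments are illustrative only.

#include "postgres.h"
#include "fmgr.h"
#include "access/relscan.h"

/* Hypothetical helper: remember the current scan position. */
static void
example_mark_position(IndexScanDesc scan, FmgrInfo *ammarkpos)
{
    (void) FunctionCall1(ammarkpos, PointerGetDatum(scan));
}

/* Hypothetical helper: jump back to the previously marked position. */
static void
example_restore_position(IndexScanDesc scan, FmgrInfo *amrestrpos)
{
    (void) FunctionCall1(amrestrpos, PointerGetDatum(scan));
}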

Datum btvacuumcleanup ( PG_FUNCTION_ARGS   ) 

Definition at line 686 of file nbtree.c.

References IndexVacuumInfo::analyze_only, btvacuumscan(), IndexVacuumInfo::estimated_count, IndexVacuumInfo::index, IndexFreeSpaceMapVacuum(), NULL, IndexVacuumInfo::num_heap_tuples, IndexBulkDeleteResult::num_index_tuples, palloc0(), PG_GETARG_POINTER, and PG_RETURN_POINTER.

{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);

    /* No-op in ANALYZE ONLY mode */
    if (info->analyze_only)
        PG_RETURN_POINTER(stats);

    /*
     * If btbulkdelete was called, we need not do anything, just return the
     * stats from the latest btbulkdelete call.  If it wasn't called, we must
     * still do a pass over the index, to recycle any newly-recyclable pages
     * and to obtain index statistics.
     *
     * Since we aren't going to actually delete any leaf items, there's no
     * need to go through all the vacuum-cycle-ID pushups.
     */
    if (stats == NULL)
    {
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
        btvacuumscan(info, stats, NULL, NULL, 0);
    }

    /* Finally, vacuum the FSM */
    IndexFreeSpaceMapVacuum(info->index);

    /*
     * It's quite possible for us to be fooled by concurrent page splits into
     * double-counting some index tuples, so disbelieve any total that exceeds
     * the underlying heap's count ... if we know that accurately.  Otherwise
     * this might just make matters worse.
     */
    if (!info->estimated_count)
    {
        if (stats->num_index_tuples > info->num_heap_tuples)
            stats->num_index_tuples = info->num_heap_tuples;
    }

    PG_RETURN_POINTER(stats);
}

static void btvacuumpage ( BTVacState *  vstate,
BlockNumber  blkno,
BlockNumber  orig_blkno 
) [static]

Definition at line 866 of file nbtree.c.

References _bt_checkpage(), _bt_delitems_vacuum(), _bt_page_recyclable(), _bt_pagedel(), _bt_relbuf(), BT_READ, BTP_SPLIT_END, BTPageOpaqueData::btpo_cycleid, BTPageOpaqueData::btpo_flags, BTPageOpaqueData::btpo_next, buf, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage, BTVacState::callback, callback(), BTVacState::callback_state, BTVacState::cycleid, IndexVacuumInfo::index, BTVacState::info, BTVacState::lastBlockVacuumed, BTVacState::lastUsedPage, LockBuffer(), LockBufferForCleanup(), MAIN_FORKNUM, MarkBufferDirtyHint(), MemoryContextReset(), MemoryContextSwitchTo(), NULL, IndexBulkDeleteResult::num_index_tuples, OffsetNumberNext, P_FIRSTDATAKEY, P_IGNORE, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_NONE, P_RIGHTMOST, BTVacState::pagedelcontext, PageGetItem, PageGetItemId, PageGetMaxOffsetNumber, PageGetSpecialPointer, PageIsNew, IndexBulkDeleteResult::pages_deleted, RBM_NORMAL, ReadBufferExtended(), RecordFreeIndexPage(), BTVacState::stats, IndexVacuumInfo::strategy, IndexTupleData::t_tid, BTVacState::totFreePages, IndexBulkDeleteResult::tuples_removed, and vacuum_delay_point().

Referenced by btvacuumscan().

{
    IndexVacuumInfo *info = vstate->info;
    IndexBulkDeleteResult *stats = vstate->stats;
    IndexBulkDeleteCallback callback = vstate->callback;
    void       *callback_state = vstate->callback_state;
    Relation    rel = info->index;
    bool        delete_now;
    BlockNumber recurse_to;
    Buffer      buf;
    Page        page;
    BTPageOpaque opaque;

restart:
    delete_now = false;
    recurse_to = P_NONE;

    /* call vacuum_delay_point while not holding any buffer lock */
    vacuum_delay_point();

    /*
     * We can't use _bt_getbuf() here because it always applies
     * _bt_checkpage(), which will barf on an all-zero page. We want to
     * recycle all-zero pages, not fail.  Also, we want to use a nondefault
     * buffer access strategy.
     */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
                             info->strategy);
    LockBuffer(buf, BT_READ);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (!PageIsNew(page))
        _bt_checkpage(rel, buf);

    /*
     * If we are recursing, the only case we want to do anything with is a
     * live leaf page having the current vacuum cycle ID.  Any other state
     * implies we already saw the page (eg, deleted it as being empty).
     */
    if (blkno != orig_blkno)
    {
        if (_bt_page_recyclable(page) ||
            P_IGNORE(opaque) ||
            !P_ISLEAF(opaque) ||
            opaque->btpo_cycleid != vstate->cycleid)
        {
            _bt_relbuf(rel, buf);
            return;
        }
    }

    /* If the page is in use, update lastUsedPage */
    if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
        vstate->lastUsedPage = blkno;

    /* Page is valid, see what to do with it */
    if (_bt_page_recyclable(page))
    {
        /* Okay to recycle this page */
        RecordFreeIndexPage(rel, blkno);
        vstate->totFreePages++;
        stats->pages_deleted++;
    }
    else if (P_ISDELETED(opaque))
    {
        /* Already deleted, but can't recycle yet */
        stats->pages_deleted++;
    }
    else if (P_ISHALFDEAD(opaque))
    {
        /* Half-dead, try to delete */
        delete_now = true;
    }
    else if (P_ISLEAF(opaque))
    {
        OffsetNumber deletable[MaxOffsetNumber];
        int         ndeletable;
        OffsetNumber offnum,
                    minoff,
                    maxoff;

        /*
         * Trade in the initial read lock for a super-exclusive write lock on
         * this page.  We must get such a lock on every leaf page over the
         * course of the vacuum scan, whether or not it actually contains any
         * deletable tuples --- see nbtree/README.
         */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        LockBufferForCleanup(buf);

        /*
         * Check whether we need to recurse back to earlier pages.  What we
         * are concerned about is a page split that happened since we started
         * the vacuum scan.  If the split moved some tuples to a lower page
         * then we might have missed 'em.  If so, set up for tail recursion.
         * (Must do this before possibly clearing btpo_cycleid below!)
         */
        if (vstate->cycleid != 0 &&
            opaque->btpo_cycleid == vstate->cycleid &&
            !(opaque->btpo_flags & BTP_SPLIT_END) &&
            !P_RIGHTMOST(opaque) &&
            opaque->btpo_next < orig_blkno)
            recurse_to = opaque->btpo_next;

        /*
         * Scan over all items to see which ones need to be deleted according
         * to the callback function.
         */
        ndeletable = 0;
        minoff = P_FIRSTDATAKEY(opaque);
        maxoff = PageGetMaxOffsetNumber(page);
        if (callback)
        {
            for (offnum = minoff;
                 offnum <= maxoff;
                 offnum = OffsetNumberNext(offnum))
            {
                IndexTuple  itup;
                ItemPointer htup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offnum));
                htup = &(itup->t_tid);

                /*
                 * During Hot Standby we currently assume that
                 * XLOG_BTREE_VACUUM records do not produce conflicts. That is
                 * only true as long as the callback function depends only
                 * upon whether the index tuple refers to heap tuples removed
                 * in the initial heap scan. When vacuum starts it derives a
                 * value of OldestXmin. Backends taking later snapshots could
                 * have a RecentGlobalXmin with a later xid than the vacuum's
                 * OldestXmin, so it is possible that row versions deleted
                 * after OldestXmin could be marked as killed by other
                 * backends. The callback function *could* look at the index
                 * tuple state in isolation and decide to delete the index
                 * tuple, though currently it does not. If it ever did, we
                 * would need to reconsider whether XLOG_BTREE_VACUUM records
                 * should cause conflicts. If they did cause conflicts they
                 * would be fairly harsh conflicts, since we haven't yet
                 * worked out a way to pass a useful value for
                 * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
                 * applies to *any* type of index that marks index tuples as
                 * killed.
                 */
                if (callback(htup, callback_state))
                    deletable[ndeletable++] = offnum;
            }
        }

        /*
         * Apply any needed deletes.  We issue just one _bt_delitems_vacuum()
         * call per page, so as to minimize WAL traffic.
         */
        if (ndeletable > 0)
        {
            BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);

            _bt_delitems_vacuum(rel, buf, deletable, ndeletable,
                                vstate->lastBlockVacuumed);

            /*
             * Keep track of the block number of the lastBlockVacuumed, so we
             * can scan those blocks as well during WAL replay. This then
             * provides concurrency protection and allows btrees to be used
             * while in recovery.
             */
            if (lastBlockVacuumed > vstate->lastBlockVacuumed)
                vstate->lastBlockVacuumed = lastBlockVacuumed;

            stats->tuples_removed += ndeletable;
            /* must recompute maxoff */
            maxoff = PageGetMaxOffsetNumber(page);
        }
        else
        {
            /*
             * If the page has been split during this vacuum cycle, it seems
             * worth expending a write to clear btpo_cycleid even if we don't
             * have any deletions to do.  (If we do, _bt_delitems_vacuum takes
             * care of this.)  This ensures we won't process the page again.
             *
             * We treat this like a hint-bit update because there's no need to
             * WAL-log it.
             */
            if (vstate->cycleid != 0 &&
                opaque->btpo_cycleid == vstate->cycleid)
            {
                opaque->btpo_cycleid = 0;
                MarkBufferDirtyHint(buf);
            }
        }

        /*
         * If it's now empty, try to delete; else count the live tuples. We
         * don't delete when recursing, though, to avoid putting entries into
         * freePages out-of-order (doesn't seem worth any extra code to handle
         * the case).
         */
        if (minoff > maxoff)
            delete_now = (blkno == orig_blkno);
        else
            stats->num_index_tuples += maxoff - minoff + 1;
    }

    if (delete_now)
    {
        MemoryContext oldcontext;
        int         ndel;

        /* Run pagedel in a temp context to avoid memory leakage */
        MemoryContextReset(vstate->pagedelcontext);
        oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);

        ndel = _bt_pagedel(rel, buf, NULL);

        /* count only this page, else may double-count parent */
        if (ndel)
            stats->pages_deleted++;

        MemoryContextSwitchTo(oldcontext);
        /* pagedel released buffer, so we shouldn't */
    }
    else
        _bt_relbuf(rel, buf);

    /*
     * This is really tail recursion, but if the compiler is too stupid to
     * optimize it as such, we'd eat an uncomfortably large amount of stack
     * space per recursion level (due to the deletable[] array). A failure is
     * improbable since the number of levels isn't likely to be large ... but
     * just in case, let's hand-optimize into a loop.
     */
    if (recurse_to != P_NONE)
    {
        blkno = recurse_to;
        goto restart;
    }
}

static void btvacuumscan ( IndexVacuumInfo *  info,
IndexBulkDeleteResult *  stats,
IndexBulkDeleteCallback  callback,
void *  callback_state,
BTCycleId  cycleid 
) [static]

Definition at line 741 of file nbtree.c.

References _bt_delitems_vacuum(), _bt_relbuf(), ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, ALLOCSET_DEFAULT_MINSIZE, AllocSetContextCreate(), BTREE_METAPAGE, btvacuumpage(), buf, BTVacState::callback, BTVacState::callback_state, CurrentMemoryContext, BTVacState::cycleid, IndexBulkDeleteResult::estimated_count, ExclusiveLock, IndexVacuumInfo::index, BTVacState::info, BTVacState::lastBlockVacuumed, BTVacState::lastUsedPage, LockBufferForCleanup(), LockRelationForExtension(), MAIN_FORKNUM, MemoryContextDelete(), NULL, IndexBulkDeleteResult::num_index_tuples, IndexBulkDeleteResult::num_pages, BTVacState::pagedelcontext, IndexBulkDeleteResult::pages_deleted, IndexBulkDeleteResult::pages_free, RBM_NORMAL, ReadBufferExtended(), RELATION_IS_LOCAL, RelationGetNumberOfBlocks, BTVacState::stats, IndexVacuumInfo::strategy, BTVacState::totFreePages, UnlockRelationForExtension(), and XLogStandbyInfoActive.

Referenced by btbulkdelete(), and btvacuumcleanup().

{
    Relation    rel = info->index;
    BTVacState  vstate;
    BlockNumber num_pages;
    BlockNumber blkno;
    bool        needLock;

    /*
     * Reset counts that will be incremented during the scan; needed in case
     * of multiple scans during a single VACUUM command
     */
    stats->estimated_count = false;
    stats->num_index_tuples = 0;
    stats->pages_deleted = 0;

    /* Set up info to pass down to btvacuumpage */
    vstate.info = info;
    vstate.stats = stats;
    vstate.callback = callback;
    vstate.callback_state = callback_state;
    vstate.cycleid = cycleid;
    vstate.lastBlockVacuumed = BTREE_METAPAGE;  /* Initialise at first block */
    vstate.lastUsedPage = BTREE_METAPAGE;
    vstate.totFreePages = 0;

    /* Create a temporary memory context to run _bt_pagedel in */
    vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
                                                  "_bt_pagedel",
                                                  ALLOCSET_DEFAULT_MINSIZE,
                                                  ALLOCSET_DEFAULT_INITSIZE,
                                                  ALLOCSET_DEFAULT_MAXSIZE);

    /*
     * The outer loop iterates over all index pages except the metapage, in
     * physical order (we hope the kernel will cooperate in providing
     * read-ahead for speed).  It is critical that we visit all leaf pages,
     * including ones added after we start the scan, else we might fail to
     * delete some deletable tuples.  Hence, we must repeatedly check the
     * relation length.  We must acquire the relation-extension lock while
     * doing so to avoid a race condition: if someone else is extending the
     * relation, there is a window where bufmgr/smgr have created a new
     * all-zero page but it hasn't yet been write-locked by _bt_getbuf(). If
     * we manage to scan such a page here, we'll improperly assume it can be
     * recycled.  Taking the lock synchronizes things enough to prevent a
     * problem: either num_pages won't include the new page, or _bt_getbuf
     * already has write lock on the buffer and it will be fully initialized
     * before we can examine it.  (See also vacuumlazy.c, which has the same
     * issue.)  Also, we need not worry if a page is added immediately after
     * we look; the page splitting code already has write-lock on the left
     * page before it adds a right page, so we must already have processed any
     * tuples due to be moved into such a page.
     *
     * We can skip locking for new or temp relations, however, since no one
     * else could be accessing them.
     */
    needLock = !RELATION_IS_LOCAL(rel);

    blkno = BTREE_METAPAGE + 1;
    for (;;)
    {
        /* Get the current relation length */
        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);
        num_pages = RelationGetNumberOfBlocks(rel);
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Quit if we've scanned the whole relation */
        if (blkno >= num_pages)
            break;
        /* Iterate over pages, then loop back to recheck length */
        for (; blkno < num_pages; blkno++)
        {
            btvacuumpage(&vstate, blkno, blkno);
        }
    }

    /*
     * In Hot Standby we need to scan right up to the end of the index for
     * correct locking, so we may need to write a WAL record for the final
     * block in the index if it was not vacuumed. It's possible that VACUUMing
     * has actually removed zeroed pages at the end of the index so we need to
     * take care to issue the record for the last actual block and not for the
     * last block that was scanned. Ignore empty indexes.
     */
    if (XLogStandbyInfoActive() &&
        num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1))
    {
        Buffer      buf;

        /*
         * We can't use _bt_getbuf() here because it always applies
         * _bt_checkpage(), which will barf on an all-zero page. We want to
         * recycle all-zero pages, not fail.  Also, we want to use a
         * nondefault buffer access strategy.
         */
        buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
                                 info->strategy);
        LockBufferForCleanup(buf);
        _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
        _bt_relbuf(rel, buf);
    }

    MemoryContextDelete(vstate.pagedelcontext);

    /* update statistics */
    stats->num_pages = num_pages;
    stats->pages_free = vstate.totFreePages;
}