#include "postgres.h"
#include "access/multixact.h"
#include "access/relscan.h"
#include "access/rewriteheap.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/heap.h"
#include "catalog/index.h"
#include "catalog/namespace.h"
#include "catalog/objectaccess.h"
#include "catalog/toasting.h"
#include "commands/cluster.h"
#include "commands/matview.h"
#include "commands/tablecmds.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/planner.h"
#include "storage/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/smgr.h"
#include "utils/acl.h"
#include "utils/fmgroids.h"
#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_rusage.h"
#include "utils/relmapper.h"
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/tqual.h"
#include "utils/tuplesort.h"

Include dependency graph for cluster.c:

Data Structures
struct	RelToCluster
Functions
static void	rebuild_relation (Relation OldHeap, Oid indexOid, int freeze_min_age, int freeze_table_age, bool verbose)
static void	copy_heap_data (Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int freeze_min_age, int freeze_table_age, bool verbose, bool *pSwapToastByContent, TransactionId *pFreezeXid, MultiXactId *pFreezeMulti)
static List *	get_tables_to_cluster (MemoryContext cluster_context)
static void	reform_and_rewrite_tuple (HeapTuple tuple, TupleDesc oldTupDesc, TupleDesc newTupDesc, Datum *values, bool *isnull, bool newRelHasOids, RewriteState rwstate)
void	cluster (ClusterStmt *stmt, bool isTopLevel)
void	cluster_rel (Oid tableOid, Oid indexOid, bool recheck, bool verbose, int freeze_min_age, int freeze_table_age)
void	check_index_is_clusterable (Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
void	mark_index_clustered (Relation rel, Oid indexOid, bool is_internal)
Oid	make_new_heap (Oid OIDOldHeap, Oid NewTableSpace)
static void	swap_relation_files (Oid r1, Oid r2, bool target_is_pg_class, bool swap_toast_by_content, bool is_internal, TransactionId frozenXid, MultiXactId frozenMulti, Oid *mapped_tables)
void	finish_heap_swap (Oid OIDOldHeap, Oid OIDNewHeap, bool is_system_catalog, bool swap_toast_by_content, bool check_constraints, bool is_internal, TransactionId frozenXid, MultiXactId frozenMulti)

Function Documentation

void check_index_is_clusterable	(	Relation	OldHeap,
		Oid	indexOid,
		bool	recheck,
		LOCKMODE	lockmode
	)

Definition at line 421 of file cluster.c.

References Anum_pg_index_indpred, ereport, errcode(), errmsg(), ERROR, heap_attisnull(), index_close(), index_open(), IndexIsValid, NoLock, NULL, RelationData::rd_am, RelationData::rd_index, RelationData::rd_indextuple, RelationGetRelationName, and RelationGetRelid.

Referenced by ATExecClusterOn(), and cluster_rel().

{
    /*
     * Verify that indexOid can be used as the ordering index for clustering
     * OldHeap.  The index is opened with the caller-supplied lockmode; each
     * test that fails is reported with ereport(ERROR).  The "recheck"
     * argument is accepted but not consulted in this body.
     */
    Relation    indexRel = index_open(indexOid, lockmode);
    const char *indexName = RelationGetRelationName(indexRel);

    /* It must carry pg_index data and be built on the given heap */
    if (!(indexRel->rd_index != NULL &&
          indexRel->rd_index->indrelid == RelationGetRelid(OldHeap)))
        ereport(ERROR,
                (errcode(ERRCODE_WRONG_OBJECT_TYPE),
                 errmsg("\"%s\" is not an index for table \"%s\"",
                        indexName,
                        RelationGetRelationName(OldHeap))));

    /* The access method has to advertise support for clustering */
    if (!indexRel->rd_am->amclusterable)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
                        indexName)));

    /*
     * A partial index (non-null indpred) might not cover every heap row, so
     * it is rejected.  Supporting it would take an extra seqscan pass to
     * pick up the rows the index misses, which is judged expensive and
     * tedious.
     */
    if (!heap_attisnull(indexRel->rd_indextuple, Anum_pg_index_indpred))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster on partial index \"%s\"",
                        indexName)));

    /*
     * An index left behind by a failed CREATE INDEX CONCURRENTLY may lack
     * entries for some rows or be internally inconsistent, so refuse it.
     * indcheckxmin is deliberately not examined: at worst, following broken
     * HOT chains places recently-dead tuples out of order in the new table,
     * which does little harm.
     */
    if (!IndexIsValid(indexRel->rd_index))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster on invalid index \"%s\"",
                        indexName)));

    /* Release our relcache reference; NoLock means the lock stays held */
    index_close(indexRel, NoLock);
}

void cluster	(	ClusterStmt *	stmt,
		bool	isTopLevel
	)

Definition at line 105 of file cluster.c.

References AccessExclusiveLock, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, ALLOCSET_DEFAULT_MINSIZE, AllocSetContextCreate(), cluster_rel(), CommitTransactionCommand(), elog, ereport, errcode(), errmsg(), ERROR, get_relname_relid(), get_tables_to_cluster(), GETSTRUCT, GetTransactionSnapshot(), heap_close, heap_open(), HeapTupleIsValid, ClusterStmt::indexname, RelToCluster::indexOid, INDEXRELID, lfirst, lfirst_oid, MemoryContextDelete(), NoLock, NULL, ObjectIdGetDatum, OidIsValid, PopActiveSnapshot(), PortalContext, PreventTransactionChain(), PushActiveSnapshot(), RangeVarCallbackOwnsTable(), RangeVarGetRelidExtended(), RelationData::rd_rel, ClusterStmt::relation, RELATION_IS_OTHER_TEMP, RelationGetIndexList(), ReleaseSysCache(), RangeVar::relname, SearchSysCache1, StartTransactionCommand(), RelToCluster::tableOid, and ClusterStmt::verbose.

Referenced by standard_ProcessUtility().

{
    /*
     * Entry point for the CLUSTER statement.
     *
     * With no relation named, every table that has an index marked
     * indisclustered is processed, one transaction per table.  With a
     * relation named, that single table is clustered on either the named
     * index or the index already marked indisclustered.
     */
    Oid         tableOid;
    Oid         indexOid = InvalidOid;
    Relation    rel;

    if (stmt->relation == NULL)
    {
        /* Multi-relation form: handle it here and return. */
        MemoryContext cluster_context;
        List       *targets;
        ListCell   *cell;

        /*
         * This form may not run inside a user transaction block, since
         * locks would be held far too long.
         */
        PreventTransactionChain(isTopLevel, "CLUSTER");

        /*
         * Working storage has to survive across transactions, so give it
         * its own context.  Being a child of PortalContext, it is cleaned
         * up even if an error occurs.
         */
        cluster_context = AllocSetContextCreate(PortalContext,
                                                "Cluster",
                                                ALLOCSET_DEFAULT_MINSIZE,
                                                ALLOCSET_DEFAULT_INITSIZE,
                                                ALLOCSET_DEFAULT_MAXSIZE);

        /* Collect the work list; it is allocated in cluster_context. */
        targets = get_tables_to_cluster(cluster_context);

        /* Leave the transaction we were started in */
        PopActiveSnapshot();
        CommitTransactionCommand();

        /* Process each collected table in a transaction of its own */
        foreach(cell, targets)
        {
            RelToCluster *item = (RelToCluster *) lfirst(cell);

            StartTransactionCommand();
            /* index functions may need an active snapshot */
            PushActiveSnapshot(GetTransactionSnapshot());
            cluster_rel(item->tableOid, item->indexOid, true, stmt->verbose,
                        -1, -1);
            PopActiveSnapshot();
            CommitTransactionCommand();
        }

        /* Open a fresh transaction in which to do the cleanup */
        StartTransactionCommand();

        /* Discard the cross-transaction working storage */
        MemoryContextDelete(cluster_context);
        return;
    }

    /* Single-relation form: look up, lock and permission-check the table */
    tableOid = RangeVarGetRelidExtended(stmt->relation,
                                        AccessExclusiveLock,
                                        false, false,
                                        RangeVarCallbackOwnsTable, NULL);
    rel = heap_open(tableOid, NoLock);

    /*
     * Temp tables belonging to another session are off limits; their local
     * buffer manager is not going to cope.
     */
    if (RELATION_IS_OTHER_TEMP(rel))
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster temporary tables of other sessions")));

    if (stmt->indexname != NULL)
    {
        /* The named index is looked up in the table's own namespace. */
        indexOid = get_relname_relid(stmt->indexname,
                                     rel->rd_rel->relnamespace);
        if (!OidIsValid(indexOid))
            ereport(ERROR,
                    (errcode(ERRCODE_UNDEFINED_OBJECT),
                     errmsg("index \"%s\" for table \"%s\" does not exist",
                            stmt->indexname, stmt->relation->relname)));
    }
    else
    {
        ListCell   *lc;

        /* No index named: search for the one flagged indisclustered. */
        foreach(lc, RelationGetIndexList(rel))
        {
            Oid         candidate = lfirst_oid(lc);
            HeapTuple   tup = SearchSysCache1(INDEXRELID,
                                              ObjectIdGetDatum(candidate));
            bool        isClustered;

            if (!HeapTupleIsValid(tup))
                elog(ERROR, "cache lookup failed for index %u", candidate);

            /* Read the flag, then drop the cache reference right away */
            isClustered = ((Form_pg_index) GETSTRUCT(tup))->indisclustered;
            ReleaseSysCache(tup);

            if (isClustered)
            {
                indexOid = candidate;
                break;
            }
        }

        if (!OidIsValid(indexOid))
            ereport(ERROR,
                    (errcode(ERRCODE_UNDEFINED_OBJECT),
                     errmsg("there is no previously clustered index for table \"%s\"",
                            stmt->relation->relname)));
    }

    /* Drop the relcache reference but keep the lock until commit */
    heap_close(rel, NoLock);

    /* Hand the actual work to cluster_rel */
    cluster_rel(tableOid, indexOid, false, stmt->verbose, -1, -1);
}

void cluster_rel	(	Oid	tableOid,
		Oid	indexOid,
		bool	recheck,
		bool	verbose,
		int	freeze_min_age,
		int	freeze_table_age
	)

Definition at line 261 of file cluster.c.

References AccessExclusiveLock, CHECK_FOR_INTERRUPTS, check_index_is_clusterable(), CheckTableNotInUse(), ereport, errcode(), errmsg(), ERROR, GETSTRUCT, GetUserId(), HeapTupleIsValid, INDEXRELID, ObjectIdGetDatum, OidIsValid, pg_class_ownercheck(), RelationData::rd_ispopulated, RelationData::rd_rel, rebuild_relation(), relation_close(), RELATION_IS_OTHER_TEMP, ReleaseSysCache(), RELKIND_MATVIEW, RELOID, SearchSysCache1, SearchSysCacheExists1, TransferPredicateLocksToHeapRelation(), and try_relation_open().

Referenced by cluster(), and vacuum_rel().

{
    /*
     * Rebuild one table, ordered by indexOid when that OID is valid.
     *
     * An invalid indexOid selects the VACUUM wording in the messages below
     * and skips the index checks.  When "recheck" is true the table and
     * index are re-validated first, and the table is silently skipped if
     * any of those tests fails.
     */
    Relation    heapRel;
    const bool  haveIndex = OidIsValid(indexOid);

    /* Give the user a chance to cancel. */
    CHECK_FOR_INTERRUPTS();

    /*
     * Take exclusive access to the table for the rest of the transaction.
     * (In the single-transaction case cluster() has already done so, making
     * this redundant.)  The index is locked later, inside
     * check_index_is_clusterable.
     */
    heapRel = try_relation_open(tableOid, AccessExclusiveLock);

    /* Nothing to do if the table has been dropped meanwhile */
    if (!heapRel)
        return;

    /*
     * Each relation may be handled in a transaction of its own, so the
     * relation has to be re-verified as still being what we expect.
     *
     * A single-transaction CLUSTER can skip these tests, and it *must* skip
     * the indisclustered one, which would otherwise reject clustering on an
     * index that was not previously clustered.
     */
    if (recheck)
    {
        /* The invoking user must still own the table */
        if (!pg_class_ownercheck(tableOid, GetUserId()))
            goto skip_relation;

        /*
         * Quietly pass over temp tables of other sessions.  Doing this only
         * in the "recheck" case (currently meaning a database-wide CLUSTER)
         * is appropriate, because cluster() has its own check that stops
         * any attempt to cluster such a table by name.  A further check
         * later in this function is redundant but kept for extra safety.
         */
        if (RELATION_IS_OTHER_TEMP(heapRel))
            goto skip_relation;

        if (haveIndex)
        {
            HeapTuple   tup;
            bool        stillClustered;

            /* The index has to exist still */
            if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
                goto skip_relation;

            /* ... and it has to still be the one flagged indisclustered */
            tup = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
            if (!HeapTupleIsValid(tup))     /* probably can't happen */
                goto skip_relation;

            stillClustered = ((Form_pg_index) GETSTRUCT(tup))->indisclustered;
            ReleaseSysCache(tup);

            if (!stillClustered)
                goto skip_relation;
        }
    }

    /*
     * Shared catalogs may be processed by VACUUM FULL but not by CLUSTER.
     * CLUSTER would mostly work, but indisclustered would be set only in
     * the current database, giving unexpected behavior if CLUSTER were
     * later run from another database.
     */
    if (haveIndex && heapRel->rd_rel->relisshared)
        ereport(ERROR,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("cannot cluster a shared catalog")));

    /*
     * Refuse temp tables owned by other backends; their local buffer
     * manager is not going to cope.  The wording depends on whether we were
     * asked to cluster or merely to rewrite.
     */
    if (RELATION_IS_OTHER_TEMP(heapRel))
    {
        if (!haveIndex)
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot vacuum temporary tables of other sessions")));
        else
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("cannot cluster temporary tables of other sessions")));
    }

    /*
     * The table must not be in active use within this transaction, which
     * covers open scans and pending AFTER trigger events.
     */
    CheckTableNotInUse(heapRel, haveIndex ? "CLUSTER" : "VACUUM");

    /* Confirm the index is acceptable for clustering this heap */
    if (haveIndex)
        check_index_is_clusterable(heapRel, indexOid, recheck, AccessExclusiveLock);

    /*
     * An unpopulated materialized view holds no data, so there is nothing
     * to rewrite.  Skip it silently rather than raising an error, which
     * would be unwelcome in a multi-relation request such as a
     * database-wide CLUSTER.
     */
    if (heapRel->rd_rel->relkind == RELKIND_MATVIEW &&
        !heapRel->rd_ispopulated)
        goto skip_relation;

    /*
     * Moving tuples around is about to invalidate all tuple- and page-level
     * predicate locks, so promote them to relation locks.  Index predicate
     * locks get promoted when the indexes are reindexed.
     */
    TransferPredicateLocksToHeapRelation(heapRel);

    /* rebuild_relation does the real work and also closes heapRel */
    rebuild_relation(heapRel, indexOid, freeze_min_age, freeze_table_age,
                     verbose);
    return;

skip_relation:
    /* Common exit for every "silently skip" case: close and unlock */
    relation_close(heapRel, AccessExclusiveLock);
}

static void copy_heap_data	(	Oid	OIDNewHeap,
		Oid	OIDOldHeap,
		Oid	OIDOldIndex,
		int	freeze_min_age,
		int	freeze_table_age,
		bool	verbose,
		bool *	pSwapToastByContent,
		TransactionId *	pFreezeXid,
		MultiXactId *	pFreezeMulti
	)			`[static]`

Definition at line 731 of file cluster.c.

References AccessExclusiveLock, Assert, begin_heap_rewrite(), BTREE_AM_OID, buf, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, CHECK_FOR_INTERRUPTS, elevel, elog, end_heap_rewrite(), ereport, errdetail(), errmsg(), ERROR, ForwardScanDirection, get_namespace_name(), heap_beginscan(), heap_close, heap_endscan(), heap_freetuple(), heap_getnext(), heap_open(), HEAPTUPLE_DEAD, HEAPTUPLE_DELETE_IN_PROGRESS, HEAPTUPLE_INSERT_IN_PROGRESS, HEAPTUPLE_LIVE, HEAPTUPLE_RECENTLY_DEAD, HeapTupleHeaderGetUpdateXid, HeapTupleHeaderGetXmin, HeapTupleSatisfiesVacuum(), index_beginscan(), index_close(), index_endscan(), index_getnext(), index_open(), index_rescan(), INFO, InvalidBlockNumber, IsSystemRelation(), LockBuffer(), LockRelationOid(), maintenance_work_mem, MultiXactFrzLimit, tupleDesc::natts, NoLock, NULL, OidIsValid, OldestXmin, palloc(), pfree(), pg_rusage_init(), pg_rusage_show(), plan_cluster_use_sort(), RelationData::rd_rel, RelationData::rd_toastoid, reform_and_rewrite_tuple(), RelationGetDescr, RelationGetNamespace, RelationGetNumberOfBlocks, RelationGetRelationName, RelationGetTargetBlock, RelationNeedsWAL, RELKIND_MATVIEW, rewrite_heap_dead_tuple(), HeapScanDescData::rs_cbuf, SetMatViewToPopulated(), SnapshotAny, HeapTupleData::t_data, TransactionIdIsCurrentTransactionId(), TransactionIdPrecedes(), tuplesort_begin_cluster(), tuplesort_end(), tuplesort_getheaptuple(), tuplesort_performsort(), tuplesort_putheaptuple(), vacuum_set_xid_limits(), values, WARNING, XLogIsNeeded, IndexScanDescData::xs_cbuf, and IndexScanDescData::xs_recheck.

Referenced by rebuild_relation().

{
    /*
     * Copy the contents of relation OIDOldHeap into relation OIDNewHeap by
     * way of the heap-rewrite module.
     *
     * OIDOldIndex, when valid, determines the order of the copy: tuples are
     * read through an index scan, or, for a btree index when
     * plan_cluster_use_sort() says so, read sequentially and sorted first.
     * With an invalid OIDOldIndex the old heap is read sequentially and
     * copied in that order.
     *
     * freeze_min_age and freeze_table_age are handed to
     * vacuum_set_xid_limits().  verbose selects INFO instead of DEBUG2 for
     * the progress messages.
     *
     * Output parameters:
     *   *pSwapToastByContent: true when both heaps have a toast table.
     *   *pFreezeXid: freeze cutoff XID used for the rewrite; never earlier
     *       than the old heap's relfrozenxid.
     *   *pFreezeMulti: multixact limit obtained from vacuum_set_xid_limits().
     *
     * All relations opened here are closed with NoLock before returning.
     */
    Relation    NewHeap,
                OldHeap,
                OldIndex;
    TupleDesc   oldTupDesc;
    TupleDesc   newTupDesc;
    int         natts;
    Datum      *values;
    bool       *isnull;
    IndexScanDesc indexScan;
    HeapScanDesc heapScan;
    bool        use_wal;
    bool        is_system_catalog;
    TransactionId OldestXmin;
    TransactionId FreezeXid;
    MultiXactId MultiXactFrzLimit;
    RewriteState rwstate;
    bool        use_sort;
    Tuplesortstate *tuplesort;
    double      num_tuples = 0,
                tups_vacuumed = 0,
                tups_recently_dead = 0;
    /* Message level for the progress reports issued below */
    int         elevel = verbose ? INFO : DEBUG2;
    PGRUsage    ru0;

    /* Start the resource-usage measurement reported in the final message */
    pg_rusage_init(&ru0);

    /*
     * Open the relations we need.
     */
    NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
    OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
    if (OidIsValid(OIDOldIndex))
        OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
    else
        OldIndex = NULL;

    /*
     * Their tuple descriptors should be exactly alike, but here we only need
     * assume that they have the same number of columns.
     */
    oldTupDesc = RelationGetDescr(OldHeap);
    newTupDesc = RelationGetDescr(NewHeap);
    Assert(newTupDesc->natts == oldTupDesc->natts);

    /* Preallocate values/isnull arrays */
    natts = newTupDesc->natts;
    values = (Datum *) palloc(natts * sizeof(Datum));
    isnull = (bool *) palloc(natts * sizeof(bool));

    /*
     * If the OldHeap has a toast table, get lock on the toast table to keep
     * it from being vacuumed.  This is needed because autovacuum processes
     * toast tables independently of their main tables, with no lock on the
     * latter.  If an autovacuum were to start on the toast table after we
     * compute our OldestXmin below, it would use a later OldestXmin, and then
     * possibly remove as DEAD toast tuples belonging to main tuples we think
     * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
     * tuples.
     *
     * We don't need to open the toast relation here, just lock it.  The lock
     * will be held till end of transaction.
     */
    if (OldHeap->rd_rel->reltoastrelid)
        LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);

    /*
     * We need to log the copied data in WAL iff WAL archiving/streaming is
     * enabled AND it's a WAL-logged rel.
     */
    use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);

    /* use_wal off requires smgr_targblock be initially invalid */
    Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);

    /*
     * If both tables have TOAST tables, perform toast swap by content.  It is
     * possible that the old table has a toast table but the new one doesn't,
     * if toastable columns have been dropped.  In that case we have to do
     * swap by links.  This is okay because swap by content is only essential
     * for system catalogs, and we don't support schema changes for them.
     */
    if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
    {
        *pSwapToastByContent = true;

        /*
         * When doing swap by content, any toast pointers written into NewHeap
         * must use the old toast table's OID, because that's where the toast
         * data will eventually be found.  Set this up by setting rd_toastoid.
         * This also tells toast_save_datum() to preserve the toast value
         * OIDs, which we want so as not to invalidate toast pointers in
         * system catalog caches, and to avoid making multiple copies of a
         * single toast value.
         *
         * Note that we must hold NewHeap open until we are done writing data,
         * since the relcache will not guarantee to remember this setting once
         * the relation is closed.  Also, this technique depends on the fact
         * that no one will try to read from the NewHeap until after we've
         * finished writing it and swapping the rels --- otherwise they could
         * follow the toast pointers to the wrong place.  (It would actually
         * work for values copied over from the old toast table, but not for
         * any values that we toast which were previously not toasted.)
         */
        NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
    }
    else
        *pSwapToastByContent = false;

    /*
     * compute xids used to freeze and weed out dead tuples.  We use -1
     * freeze_min_age to avoid having CLUSTER freeze tuples earlier than a
     * plain VACUUM would.
     */
    vacuum_set_xid_limits(freeze_min_age, freeze_table_age,
                          OldHeap->rd_rel->relisshared,
                          &OldestXmin, &FreezeXid, NULL, &MultiXactFrzLimit);

    /*
     * FreezeXid will become the table's new relfrozenxid, and that mustn't go
     * backwards, so take the max.
     */
    if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
        FreezeXid = OldHeap->rd_rel->relfrozenxid;

    /* return selected values to caller */
    *pFreezeXid = FreezeXid;
    *pFreezeMulti = MultiXactFrzLimit;

    /* Remember if it's a system catalog */
    is_system_catalog = IsSystemRelation(OldHeap);

    /* Initialize the rewrite operation */
    rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid,
                                 MultiXactFrzLimit, use_wal);

    /*
     * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
     * the OldHeap.  We know how to use a sort to duplicate the ordering of a
     * btree index, and will use seqscan-and-sort for that case if the planner
     * tells us it's cheaper.  Otherwise, always indexscan if an index is
     * provided, else plain seqscan.
     */
    if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
        use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
    else
        use_sort = false;

    /* Set up sorting if wanted */
    if (use_sort)
        tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
                                            maintenance_work_mem, false);
    else
        tuplesort = NULL;

    /*
     * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
     * that still need to be copied, we scan with SnapshotAny and use
     * HeapTupleSatisfiesVacuum for the visibility test.
     *
     * Exactly one of indexScan and heapScan is non-NULL from here on.
     */
    if (OldIndex != NULL && !use_sort)
    {
        heapScan = NULL;
        indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
        index_rescan(indexScan, NULL, 0, NULL, 0);
    }
    else
    {
        heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
        indexScan = NULL;
    }

    /* Log what we're doing */
    if (indexScan != NULL)
        ereport(elevel,
                (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
                        get_namespace_name(RelationGetNamespace(OldHeap)),
                        RelationGetRelationName(OldHeap),
                        RelationGetRelationName(OldIndex))));
    else if (tuplesort != NULL)
        ereport(elevel,
                (errmsg("clustering \"%s.%s\" using sequential scan and sort",
                        get_namespace_name(RelationGetNamespace(OldHeap)),
                        RelationGetRelationName(OldHeap))));
    else
        ereport(elevel,
                (errmsg("vacuuming \"%s.%s\"",
                        get_namespace_name(RelationGetNamespace(OldHeap)),
                        RelationGetRelationName(OldHeap))));

    if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW)
        /* Make sure the heap looks good even if no rows are written. */
        SetMatViewToPopulated(NewHeap);

    /*
     * Scan through the OldHeap, either in OldIndex order or sequentially;
     * copy each tuple into the NewHeap, or transiently to the tuplesort
     * module.  Note that we don't bother sorting dead tuples (they won't get
     * to the new table anyway).
     */
    for (;;)
    {
        HeapTuple   tuple;
        Buffer      buf;
        bool        isdead;

        CHECK_FOR_INTERRUPTS();

        /* Fetch the next tuple and the buffer it lives in */
        if (indexScan != NULL)
        {
            tuple = index_getnext(indexScan, ForwardScanDirection);
            if (tuple == NULL)
                break;

            /* Since we used no scan keys, should never need to recheck */
            if (indexScan->xs_recheck)
                elog(ERROR, "CLUSTER does not support lossy index conditions");

            buf = indexScan->xs_cbuf;
        }
        else
        {
            tuple = heap_getnext(heapScan, ForwardScanDirection);
            if (tuple == NULL)
                break;

            buf = heapScan->rs_cbuf;
        }

        /* The visibility test is made with the buffer share-locked */
        LockBuffer(buf, BUFFER_LOCK_SHARE);

        switch (HeapTupleSatisfiesVacuum(tuple->t_data, OldestXmin, buf))
        {
            case HEAPTUPLE_DEAD:
                /* Definitely dead */
                isdead = true;
                break;
            case HEAPTUPLE_RECENTLY_DEAD:
                tups_recently_dead += 1;
                /* fall through */
            case HEAPTUPLE_LIVE:
                /* Live or recently dead, must copy it */
                isdead = false;
                break;
            case HEAPTUPLE_INSERT_IN_PROGRESS:

                /*
                 * Since we hold exclusive lock on the relation, normally the
                 * only way to see this is if it was inserted earlier in our
                 * own transaction.  However, it can happen in system
                 * catalogs, since we tend to release write lock before commit
                 * there.  Give a warning if neither case applies; but in any
                 * case we had better copy it.
                 */
                if (!is_system_catalog &&
                    !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
                    elog(WARNING, "concurrent insert in progress within table \"%s\"",
                         RelationGetRelationName(OldHeap));
                /* treat as live */
                isdead = false;
                break;
            case HEAPTUPLE_DELETE_IN_PROGRESS:

                /*
                 * Similar situation to INSERT_IN_PROGRESS case.
                 */
                if (!is_system_catalog &&
                    !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
                    elog(WARNING, "concurrent delete in progress within table \"%s\"",
                         RelationGetRelationName(OldHeap));
                /* treat as recently dead */
                tups_recently_dead += 1;
                isdead = false;
                break;
            default:
                elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
                isdead = false; /* keep compiler quiet */
                break;
        }

        LockBuffer(buf, BUFFER_LOCK_UNLOCK);

        if (isdead)
        {
            /* Dead tuples are counted and reported, but not copied */
            tups_vacuumed += 1;
            /* heap rewrite module still needs to see it... */
            if (rewrite_heap_dead_tuple(rwstate, tuple))
            {
                /* A previous recently-dead tuple is now known dead */
                tups_vacuumed += 1;
                tups_recently_dead -= 1;
            }
            continue;
        }

        /*
         * Everything that is not dead is carried over: handed to the sort
         * when one is in use, otherwise written to the new heap right away.
         */
        num_tuples += 1;
        if (tuplesort != NULL)
            tuplesort_putheaptuple(tuplesort, tuple);
        else
            reform_and_rewrite_tuple(tuple,
                                     oldTupDesc, newTupDesc,
                                     values, isnull,
                                     NewHeap->rd_rel->relhasoids, rwstate);
    }

    /* Shut down whichever scan was started above */
    if (indexScan != NULL)
        index_endscan(indexScan);
    if (heapScan != NULL)
        heap_endscan(heapScan);

    /*
     * In scan-and-sort mode, complete the sort, then read out all live tuples
     * from the tuplestore and write them to the new relation.
     */
    if (tuplesort != NULL)
    {
        tuplesort_performsort(tuplesort);

        for (;;)
        {
            HeapTuple   tuple;
            bool        shouldfree;

            CHECK_FOR_INTERRUPTS();

            tuple = tuplesort_getheaptuple(tuplesort, true, &shouldfree);
            if (tuple == NULL)
                break;

            reform_and_rewrite_tuple(tuple,
                                     oldTupDesc, newTupDesc,
                                     values, isnull,
                                     NewHeap->rd_rel->relhasoids, rwstate);

            /* Free the tuple only when the sort says it is ours to free */
            if (shouldfree)
                heap_freetuple(tuple);
        }

        tuplesort_end(tuplesort);
    }

    /* Write out any remaining tuples, and fsync if needed */
    end_heap_rewrite(rwstate);

    /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
    NewHeap->rd_toastoid = InvalidOid;

    /* Log what we did */
    ereport(elevel,
            (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
                    RelationGetRelationName(OldHeap),
                    tups_vacuumed, num_tuples,
                    RelationGetNumberOfBlocks(OldHeap)),
             errdetail("%.0f dead row versions cannot be removed yet.\n"
                       "%s.",
                       tups_recently_dead,
                       pg_rusage_show(&ru0))));

    /* Clean up */
    pfree(values);
    pfree(isnull);

    /* Close the relations; NoLock leaves the locks taken above in place */
    if (OldIndex != NULL)
        index_close(OldIndex, NoLock);
    heap_close(OldHeap, NoLock);
    heap_close(NewHeap, NoLock);
}

/*
 * finish_heap_swap
 *
 * Complete a table rewrite once the transient heap has been filled:
 * exchange the storage of OIDOldHeap and OIDNewHeap, rebuild the indexes of
 * OIDOldHeap, drop the transient relation OIDNewHeap, and clean up the
 * relation-map entries and toast names left behind.
 *
 *	OIDOldHeap				relation that keeps its identity and receives the
 *							new contents
 *	OIDNewHeap				transient relation; dropped before returning
 *	is_system_catalog		if true, a catalog-wide cache invalidation is
 *							queued for OIDOldHeap
 *	swap_toast_by_content	handed to swap_relation_files; when false the
 *							toast table and its index are renamed afterwards
 *	check_constraints		adds REINDEX_REL_CHECK_CONSTRAINTS to the reindex
 *	is_internal				handed to swap_relation_files
 *	frozenXid, frozenMulti	handed to swap_relation_files, which stores them
 *							as the old heap's relfrozenxid / relminmxid
 */
void finish_heap_swap	(	Oid	OIDOldHeap,
		Oid	OIDNewHeap,
		bool	is_system_catalog,
		bool	swap_toast_by_content,
		bool	check_constraints,
		bool	is_internal,
		TransactionId	frozenXid,
		MultiXactId	frozenMulti
	)

Definition at line 1445 of file cluster.c.

References AccessShareLock, CacheInvalidateCatalog(), DROP_RESTRICT, heap_open(), i, NAMEDATALEN, NoLock, OidIsValid, PERFORM_DELETION_INTERNAL, performDeletion(), RelationData::rd_rel, reindex_relation(), relation_close(), relation_open(), RelationMapRemoveMapping(), RelationRelationId, RenameRelationInternal(), snprintf(), and swap_relation_files().

Referenced by ATRewriteTables(), ExecRefreshMatView(), and rebuild_relation().

{
    Oid         mapped_tables[4];
    Oid        *mapped;
    ObjectAddress transientHeap;
    int         reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
    Relation    targetRel;
    Oid         toastOid;

    /*
     * swap_relation_files reports mapped relations through this array; start
     * with every slot invalid so the list is terminated wherever it stops.
     */
    memset(mapped_tables, 0, sizeof(mapped_tables));

    /*
     * Exchange the contents of the two heaps (and of their toast tables, if
     * any).  This also stamps the old heap with frozenXid as relfrozenxid.
     */
    swap_relation_files(OIDOldHeap, OIDNewHeap,
                        (OIDOldHeap == RelationRelationId),
                        swap_toast_by_content, is_internal,
                        frozenXid, frozenMulti, mapped_tables);

    /*
     * For a system catalog, queue an sinval message that flushes all
     * catcaches on it at the next CommandCounterIncrement.
     */
    if (is_system_catalog)
        CacheInvalidateCatalog(OIDOldHeap);

    /*
     * Rebuild every index of the relation (the toast table is all-new and
     * needs nothing).  This has to precede the DROP below: if we are
     * rewriting a system catalog that the DROP itself consults, its indexes
     * must already be usable.  Nothing is lost by this ordering, since the
     * whole operation is transactional and no disk space can be reclaimed
     * before commit anyway.  reindex_relation ends with a
     * CommandCounterIncrement, so we need not issue one ourselves.
     *
     * Note: index_build, reached through reindex_relation, never sets
     * indcheckxmin.  That is fine even though these are in effect new
     * indexes: the new heap contains no HOT chains at all, hence no broken
     * ones, so indcheckxmin cannot be required.
     */
    if (check_constraints)
        reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
    reindex_relation(OIDOldHeap, reindex_flags);

    /*
     * Drop the transient heap, which now owns the old filenode.  It is local
     * to our transaction and nothing can depend on it, so DROP_RESTRICT is
     * sufficient.  performDeletion finishes with CommandCounterIncrement.
     */
    transientHeap.classId = RelationRelationId;
    transientHeap.objectId = OIDNewHeap;
    transientHeap.objectSubId = 0;
    performDeletion(&transientHeap, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);

    /*
     * Remove the relation mapping entries that were set up for the transient
     * table and, if present, its toast table and toast index.  Leaving them
     * until commit would make the relmapper complain about new permanent map
     * entries being added post-bootstrap.
     */
    for (mapped = mapped_tables; OidIsValid(*mapped); mapped++)
        RelationMapRemoveMapping(*mapped);

    /* With a by-content toast swap the toast names are already right. */
    if (swap_toast_by_content)
        return;

    /*
     * The toast swap was done by links, so the toast table still carries a
     * name derived from the transient table.  The backend only ever refers
     * to it by OID, but the stale name could confuse anyone reading the
     * catalogs, so rename it (and its index) after the permanent relation.
     *
     * NoLock is fine here: we already hold an exclusive lock on the heap.
     */
    targetRel = heap_open(OIDOldHeap, NoLock);
    toastOid = targetRel->rd_rel->reltoastrelid;
    if (OidIsValid(toastOid))
    {
        char        toastName[NAMEDATALEN];
        Relation    toastRel;
        Oid         toastIndexOid;

        /* Look up the toast index while the toast relation is open */
        toastRel = relation_open(toastOid, AccessShareLock);
        toastIndexOid = toastRel->rd_rel->reltoastidxid;
        relation_close(toastRel, AccessShareLock);

        /* First the toast table itself ... */
        snprintf(toastName, NAMEDATALEN, "pg_toast_%u", OIDOldHeap);
        RenameRelationInternal(toastOid, toastName, true);

        /* ... then its index */
        snprintf(toastName, NAMEDATALEN, "pg_toast_%u_index", OIDOldHeap);
        RenameRelationInternal(toastIndexOid, toastName, true);
    }
    relation_close(targetRel, NoLock);
}

/*
 * get_tables_to_cluster
 *
 * Scan pg_index for entries with indisclustered set and return a list of
 * RelToCluster items (table OID plus index OID) for those whose table passes
 * pg_class_ownercheck for the current user.  The list cells and items are
 * allocated in cluster_context.  Items are prepended, so the list is in
 * reverse scan order.
 */
static List * get_tables_to_cluster ( MemoryContext cluster_context ) [static]

Definition at line 1569 of file cluster.c.

References AccessShareLock, Anum_pg_index_indisclustered, BoolGetDatum, BTEqualStrategyNumber, ForwardScanDirection, GETSTRUCT, GetUserId(), heap_beginscan(), heap_endscan(), heap_getnext(), heap_open(), RelToCluster::indexOid, IndexRelationId, lcons(), MemoryContextSwitchTo(), NULL, palloc(), pg_class_ownercheck(), relation_close(), ScanKeyInit(), SnapshotNow, and RelToCluster::tableOid.

Referenced by cluster().

{
    List       *result = NIL;
    Relation    pgIndexRel;
    HeapScanDesc scanDesc;
    ScanKeyData key;

    /*
     * Look for every index flagged indisclustered whose table belongs to an
     * appropriate user.  System relations and nailed-in relations can never
     * carry the flag, because CLUSTER refuses to set it when invoked on one
     * of them.
     */
    pgIndexRel = heap_open(IndexRelationId, AccessShareLock);
    ScanKeyInit(&key,
                Anum_pg_index_indisclustered,
                BTEqualStrategyNumber, F_BOOLEQ,
                BoolGetDatum(true));
    scanDesc = heap_beginscan(pgIndexRel, SnapshotNow, 1, &key);

    for (;;)
    {
        HeapTuple   tup;
        Form_pg_index indexForm;

        tup = heap_getnext(scanDesc, ForwardScanDirection);
        if (tup == NULL)
            break;

        indexForm = (Form_pg_index) GETSTRUCT(tup);

        /* Only tables that pass the ownership check are collected */
        if (pg_class_ownercheck(indexForm->indrelid, GetUserId()))
        {
            MemoryContext savedContext;
            RelToCluster *item;

            /*
             * Build the list in the caller-supplied context so that it
             * survives the cross-transaction processing.
             */
            savedContext = MemoryContextSwitchTo(cluster_context);

            item = (RelToCluster *) palloc(sizeof(RelToCluster));
            item->tableOid = indexForm->indrelid;
            item->indexOid = indexForm->indexrelid;
            result = lcons(item, result);

            MemoryContextSwitchTo(savedContext);
        }
    }

    heap_endscan(scanDesc);
    relation_close(pgIndexRel, AccessShareLock);

    return result;
}

/*
 * make_new_heap
 *
 * Create a transient heap named "pg_temp_<OIDOldHeap>" in the namespace of
 * OIDOldHeap, placed in tablespace NewTableSpace, with the same tuple
 * descriptor, owner, relkind, relpersistence and reloptions as the old heap.
 * If the old heap has a toast table, AlterTableCreateToastTable is asked to
 * create one for the new heap using the old toast table's reloptions.
 *
 * The old heap is opened with AccessExclusiveLock and the lock is retained
 * when the relation is closed.  Returns the OID of the new heap.
 */
Oid make_new_heap	(	Oid	OIDOldHeap,
		Oid	NewTableSpace
	)

Definition at line 614 of file cluster.c.

References AccessExclusiveLock, AlterTableCreateToastTable(), Anum_pg_class_reloptions, Assert, CommandCounterIncrement(), elog, ERROR, heap_close, heap_create_with_catalog(), heap_open(), HeapTupleIsValid, InvalidOid, NIL, NoLock, ObjectIdGetDatum, OidIsValid, ONCOMMIT_NOOP, RelationData::rd_rel, RelationGetDescr, RelationGetNamespace, RelationIsMapped, ReleaseSysCache(), RELOID, SearchSysCache1, snprintf(), and SysCacheGetAttr().

Referenced by ATRewriteTables(), ExecRefreshMatView(), and rebuild_relation().

{
    Relation    oldRel;
    TupleDesc   oldDesc;
    HeapTuple   classTup;
    Datum       options;
    bool        optionsNull;
    char        transientName[NAMEDATALEN];
    Oid         newHeapOid;
    Oid         oldToastOid;

    oldRel = heap_open(OIDOldHeap, AccessExclusiveLock);
    oldDesc = RelationGetDescr(oldRel);

    /*
     * The new heap gets none of the old heap's defaults or constraints: they
     * are not needed, and inserting them into the catalogs only to delete
     * them again would be wasted effort.
     *
     * The old heap's reloptions, however, are carried over.
     */
    classTup = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
    if (!HeapTupleIsValid(classTup))
        elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
    options = SysCacheGetAttr(RELOID, classTup, Anum_pg_class_reloptions,
                              &optionsNull);
    if (optionsNull)
        options = (Datum) 0;

    /*
     * Build the new heap under a temporary name in the old table's own
     * namespace.  NOTE: a collision with a user relname is possible, but
     * avoiding it seems more trouble than it's worth; in particular the new
     * heap cannot live in a different namespace from the old one without
     * upsetting the TEMP status of temp tables.
     *
     * Note: the new heap is never a shared relation, even when a shared rel
     * is being rebuilt.  It is, however, made mapped whenever the source is
     * mapped; that simplifies swap_relation_files and is absolutely
     * necessary for rebuilding pg_class, for reasons explained there.
     */
    snprintf(transientName, NAMEDATALEN, "pg_temp_%u", OIDOldHeap);

    newHeapOid = heap_create_with_catalog(transientName,
                                          RelationGetNamespace(oldRel),
                                          NewTableSpace,
                                          InvalidOid,
                                          InvalidOid,
                                          InvalidOid,
                                          oldRel->rd_rel->relowner,
                                          oldDesc,
                                          NIL,
                                          oldRel->rd_rel->relkind,
                                          oldRel->rd_rel->relpersistence,
                                          false,
                                          RelationIsMapped(oldRel),
                                          true,
                                          0,
                                          ONCOMMIT_NOOP,
                                          options,
                                          false,
                                          true,
                                          true);
    Assert(OidIsValid(newHeapOid));

    /* The reloptions datum has been consumed; let go of the pg_class tuple */
    ReleaseSysCache(classTup);

    /*
     * Bump the command counter so that heap_open can see the catalog tuples
     * of the relation we just created.
     */
    CommandCounterIncrement();

    /*
     * Give the new relation a TOAST table if it needs one.
     *
     * A relation that had no TOAST table cannot need one now.  The reverse
     * can happen, though: if wide columns were dropped,
     * AlterTableCreateToastTable may conclude that the new table needs no
     * TOAST table at all.
     *
     * AlterTableCreateToastTable finishes with CommandCounterIncrement, so
     * the TOAST table is visible for insertion afterwards.
     */
    oldToastOid = oldRel->rd_rel->reltoastrelid;
    if (OidIsValid(oldToastOid))
    {
        /* carry over the existing toast table's reloptions, if it has any */
        classTup = SearchSysCache1(RELOID, ObjectIdGetDatum(oldToastOid));
        if (!HeapTupleIsValid(classTup))
            elog(ERROR, "cache lookup failed for relation %u", oldToastOid);
        options = SysCacheGetAttr(RELOID, classTup, Anum_pg_class_reloptions,
                                  &optionsNull);
        if (optionsNull)
            options = (Datum) 0;

        AlterTableCreateToastTable(newHeapOid, options);

        ReleaseSysCache(classTup);
    }

    /* Drop the relcache reference but keep the lock */
    heap_close(oldRel, NoLock);

    return newHeapOid;
}

/*
 * mark_index_clustered
 *
 * Make indexOid the only index of rel with pg_index.indisclustered set.
 * Every other index of rel that currently has the flag gets it cleared.
 * When indexOid is not a valid OID, no index is marked and any existing
 * flag is simply cleared.
 *
 * The object-post-alter hook is invoked for each index of rel, with
 * is_internal passed through, whether or not its row was changed.
 */
void mark_index_clustered	(	Relation	rel,
		Oid	indexOid,
		bool	is_internal
	)

Definition at line 486 of file cluster.c.

References CatalogUpdateIndexes(), elog, ERROR, GETSTRUCT, heap_close, heap_freetuple(), heap_open(), HeapTupleIsValid, IndexIsValid, IndexRelationId, INDEXRELID, InvalidOid, InvokeObjectPostAlterHookArg, lfirst_oid, ObjectIdGetDatum, OidIsValid, RelationGetIndexList(), ReleaseSysCache(), RowExclusiveLock, SearchSysCache1, SearchSysCacheCopy1, simple_heap_update(), and HeapTupleData::t_self.

Referenced by ATExecClusterOn(), ATExecDropCluster(), and rebuild_relation().

{
    Relation    pgIndexRel;
    ListCell   *cell;

    /*
     * Nothing to do if the requested index already carries the flag.
     */
    if (OidIsValid(indexOid))
    {
        HeapTuple   probe;
        bool        alreadyClustered;

        probe = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
        if (!HeapTupleIsValid(probe))
            elog(ERROR, "cache lookup failed for index %u", indexOid);
        alreadyClustered = ((Form_pg_index) GETSTRUCT(probe))->indisclustered;
        ReleaseSysCache(probe);

        if (alreadyClustered)
            return;
    }

    /*
     * Walk the relation's indexes, setting or clearing the flag on each as
     * required.
     */
    pgIndexRel = heap_open(IndexRelationId, RowExclusiveLock);

    foreach(cell, RelationGetIndexList(rel))
    {
        Oid         curIndexOid = lfirst_oid(cell);
        HeapTuple   tup;
        Form_pg_index form;
        bool        modified = false;

        /* Work on a private, writable copy of the pg_index row */
        tup = SearchSysCacheCopy1(INDEXRELID,
                                  ObjectIdGetDatum(curIndexOid));
        if (!HeapTupleIsValid(tup))
            elog(ERROR, "cache lookup failed for index %u", curIndexOid);
        form = (Form_pg_index) GETSTRUCT(tup);

        if (form->indisclustered)
        {
            /*
             * A set flag here must be stale: the early-exit test above
             * already established that the target index is not the one
             * holding it.
             */
            form->indisclustered = false;
            modified = true;
        }
        else if (curIndexOid == indexOid)
        {
            /* verified by callers already, but make absolutely sure */
            if (!IndexIsValid(form))
                elog(ERROR, "cannot cluster on invalid index %u", indexOid);
            form->indisclustered = true;
            modified = true;
        }

        /* Write the row back, and keep pg_index's own indexes in step */
        if (modified)
        {
            simple_heap_update(pgIndexRel, &tup->t_self, tup);
            CatalogUpdateIndexes(pgIndexRel, tup);
        }

        InvokeObjectPostAlterHookArg(IndexRelationId, curIndexOid, 0,
                                     InvalidOid, is_internal);

        heap_freetuple(tup);
    }

    heap_close(pgIndexRel, RowExclusiveLock);
}

/*
 * rebuild_relation
 *
 * Rewrite OldHeap into a fresh transient heap and swap it into place.
 *
 * OldHeap must be open on entry; it is closed here (the lock is kept).
 * indexOid, if valid, is marked as the clustered index and is passed on to
 * copy_heap_data.  freeze_min_age, freeze_table_age and verbose are passed
 * through to copy_heap_data.
 */
static void rebuild_relation	(	Relation	OldHeap,
		Oid	indexOid,
		int	freeze_min_age,
		int	freeze_table_age,
		bool	verbose
	)			`[static]`

Definition at line 565 of file cluster.c.

References copy_heap_data(), finish_heap_swap(), heap_close, IsSystemRelation(), make_new_heap(), mark_index_clustered(), NoLock, OidIsValid, RelationData::rd_rel, and RelationGetRelid.

Referenced by cluster_rel().

{
    /* Capture what we need from the relcache entry before closing it */
    Oid         oldHeapOid = RelationGetRelid(OldHeap);
    Oid         targetSpace = OldHeap->rd_rel->reltablespace;
    Oid         transientOid;
    bool        is_system_catalog;
    bool        swap_toast_by_content;
    TransactionId frozenXid;
    MultiXactId frozenMulti;

    /* Record the chosen index as the clustered one */
    if (OidIsValid(indexOid))
        mark_index_clustered(OldHeap, indexOid, true);

    /* Note whether we are dealing with a system catalog */
    is_system_catalog = IsSystemRelation(OldHeap);

    /* Release the relcache reference; the lock stays until commit */
    heap_close(OldHeap, NoLock);

    /* Set up the transient table that receives the re-ordered rows */
    transientOid = make_new_heap(oldHeapOid, targetSpace);

    /* Move the heap data across in the requested order */
    copy_heap_data(transientOid, oldHeapOid, indexOid,
                   freeze_min_age, freeze_table_age, verbose,
                   &swap_toast_by_content, &frozenXid, &frozenMulti);

    /*
     * Exchange the physical files of the real and transient tables, rebuild
     * the real table's indexes, and discard the transient table.
     */
    finish_heap_swap(oldHeapOid, transientOid, is_system_catalog,
                     swap_toast_by_content, false, true,
                     frozenXid, frozenMulti);
}

/*
 * reform_and_rewrite_tuple
 *
 * Deform tuple according to oldTupDesc, force every column that newTupDesc
 * marks as dropped to NULL, re-form the tuple with newTupDesc, and pass the
 * original and the re-formed tuple to rewrite_heap_tuple.
 *
 * values and isnull are caller-provided scratch arrays that are overwritten
 * here.  When newRelHasOids is true the original tuple's OID is copied onto
 * the re-formed tuple.  The re-formed tuple is freed before returning.
 */
static void reform_and_rewrite_tuple	(	HeapTuple	tuple,
		TupleDesc	oldTupDesc,
		TupleDesc	newTupDesc,
		Datum *	values,
		bool *	isnull,
		bool	newRelHasOids,
		RewriteState	rwstate
	)			`[static]`

Definition at line 1637 of file cluster.c.

References tupleDesc::attrs, heap_deform_tuple(), heap_form_tuple(), heap_freetuple(), HeapTupleGetOid, HeapTupleSetOid, i, tupleDesc::natts, and rewrite_heap_tuple().

Referenced by copy_heap_data().

{
    HeapTuple   rebuilt;
    int         attno = 0;

    /* Break the old tuple apart into the scratch arrays */
    heap_deform_tuple(tuple, oldTupDesc, values, isnull);

    /* Dropped columns must come out as NULL in the new tuple */
    while (attno < newTupDesc->natts)
    {
        if (newTupDesc->attrs[attno]->attisdropped)
            isnull[attno] = true;
        attno++;
    }

    rebuilt = heap_form_tuple(newTupDesc, values, isnull);

    /* Carry the OID over when the new relation stores OIDs */
    if (newRelHasOids)
        HeapTupleSetOid(rebuilt, HeapTupleGetOid(tuple));

    /* Everything else is the heap rewrite module's job */
    rewrite_heap_tuple(rwstate, tuple, rebuilt);

    heap_freetuple(rebuilt);
}

/*
 * swap_relation_files
 *
 * Exchange the physical storage of relations r1 and r2.
 *
 * For ordinary relations this swaps relfilenode and reltablespace (and, when
 * swap_toast_by_content is false, reltoastrelid) in the two pg_class rows.
 * For mapped relations (both pg_class relfilenode fields invalid) the
 * relation-map entries are swapped instead, and r2 is appended to
 * mapped_tables.  In both cases r1 receives frozenXid / frozenMulti as
 * relfrozenxid / relminmxid unless it is an index, and relpages, reltuples
 * and relallvisible are swapped.
 *
 *	r1, r2					OIDs of the relations to swap
 *	target_is_pg_class		if true, the pg_class rows are not updated (only
 *							relcache invalidations are sent)
 *	swap_toast_by_content	if true, recurse to swap the toast tables' (and
 *							toast indexes') files; if false, swap the toast
 *							links and fix up the dependency records
 *	is_internal				passed to the post-alter hook for r1 (the hook
 *							for r2 always gets true)
 *	frozenXid, frozenMulti	new relfrozenxid / relminmxid for r1
 *	mapped_tables			output cursor into a caller-supplied array; each
 *							mapped r2 is stored and the local cursor advanced
 *
 * The function calls itself for the toast table and for the toast index, and
 * finishes by closing the smgr links of both relcache entries.
 */
static void swap_relation_files	(	Oid	r1,
		Oid	r2,
		bool	target_is_pg_class,
		bool	swap_toast_by_content,
		bool	is_internal,
		TransactionId	frozenXid,
		MultiXactId	frozenMulti,
		Oid *	mapped_tables
	)			`[static]`

Definition at line 1128 of file cluster.c.

References Assert, CacheInvalidateRelcacheByTuple(), CatalogCloseIndexes(), CatalogIndexInsert(), CatalogOpenIndexes(), ObjectAddress::classId, deleteDependencyRecordsFor(), DEPENDENCY_INTERNAL, elog, ERROR, GETSTRUCT, heap_close, heap_freetuple(), heap_open(), HeapTupleIsValid, InvalidMultiXactId, InvalidOid, InvalidTransactionId, InvokeObjectPostAlterHookArg, IsSystemClass(), MultiXactIdIsValid, NameStr, ObjectAddress::objectId, ObjectIdGetDatum, ObjectAddress::objectSubId, OidIsValid, recordDependencyOn(), RelationCloseSmgrByOid(), RelationMapOidToFilenode(), RelationMapUpdateMap(), RelationRelationId, RELKIND_INDEX, RELOID, RowExclusiveLock, SearchSysCacheCopy1, simple_heap_update(), HeapTupleData::t_self, and TransactionIdIsNormal.

Referenced by finish_heap_swap().

{
    Relation    relRelation;
    HeapTuple   reltup1,
                reltup2;
    Form_pg_class relform1,
                relform2;
    Oid         relfilenode1,
                relfilenode2;
    Oid         swaptemp;
    CatalogIndexState indstate;

    /* We need writable copies of both pg_class tuples. */
    relRelation = heap_open(RelationRelationId, RowExclusiveLock);

    reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
    if (!HeapTupleIsValid(reltup1))
        elog(ERROR, "cache lookup failed for relation %u", r1);
    relform1 = (Form_pg_class) GETSTRUCT(reltup1);

    reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
    if (!HeapTupleIsValid(reltup2))
        elog(ERROR, "cache lookup failed for relation %u", r2);
    relform2 = (Form_pg_class) GETSTRUCT(reltup2);

    /*
     * The validity of the pg_class relfilenode fields decides which path is
     * taken: both valid means ordinary relations, anything else is handled
     * by the mapped-relation branch.
     */
    relfilenode1 = relform1->relfilenode;
    relfilenode2 = relform2->relfilenode;

    if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
    {
        /* Normal non-mapped relations: swap relfilenodes and reltablespaces */
        Assert(!target_is_pg_class);

        swaptemp = relform1->relfilenode;
        relform1->relfilenode = relform2->relfilenode;
        relform2->relfilenode = swaptemp;

        swaptemp = relform1->reltablespace;
        relform1->reltablespace = relform2->reltablespace;
        relform2->reltablespace = swaptemp;

        /* Also swap toast links, if we're swapping by links */
        if (!swap_toast_by_content)
        {
            swaptemp = relform1->reltoastrelid;
            relform1->reltoastrelid = relform2->reltoastrelid;
            relform2->reltoastrelid = swaptemp;

            /* we should NOT swap reltoastidxid */
        }
    }
    else
    {
        /*
         * Mapped-relation case.  Here we have to swap the relation mappings
         * instead of modifying the pg_class columns.  Both must be mapped.
         */
        if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
            elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
                 NameStr(relform1->relname));

        /*
         * We can't change the tablespace of a mapped rel, and we can't handle
         * toast link swapping for one either, because we must not apply any
         * critical changes to its pg_class row.  These cases should be
         * prevented by upstream permissions tests, so this check is a
         * non-user-facing emergency backstop.
         */
        if (relform1->reltablespace != relform2->reltablespace)
            elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
                 NameStr(relform1->relname));
        if (!swap_toast_by_content &&
            (relform1->reltoastrelid || relform2->reltoastrelid))
            elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
                 NameStr(relform1->relname));

        /*
         * Fetch the mappings --- shouldn't fail, but be paranoid
         */
        relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
        if (!OidIsValid(relfilenode1))
            elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
                 NameStr(relform1->relname), r1);
        relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
        if (!OidIsValid(relfilenode2))
            elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
                 NameStr(relform2->relname), r2);

        /*
         * Send replacement mappings to relmapper.  Note these won't actually
         * take effect until CommandCounterIncrement.
         */
        RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
        RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);

        /*
         * Pass OIDs of mapped r2 tables back to caller.  Only our local copy
         * of the pointer advances, so the recursive calls below append after
         * this entry while the caller's own pointer is left untouched.
         */
        *mapped_tables++ = r2;
    }

    /*
     * In the case of a shared catalog, these next few steps will only affect
     * our own database's pg_class row; but that's okay, because they are all
     * noncritical updates.  That's also an important fact for the case of a
     * mapped catalog, because it's possible that we'll commit the map change
     * and then fail to commit the pg_class update.
     */

    /* set rel1's frozen Xid and minimum MultiXid */
    if (relform1->relkind != RELKIND_INDEX)
    {
        Assert(TransactionIdIsNormal(frozenXid));
        relform1->relfrozenxid = frozenXid;
        Assert(MultiXactIdIsValid(frozenMulti));
        relform1->relminmxid = frozenMulti;
    }

    /* swap size statistics too, since new rel has freshly-updated stats */
    {
        int32       swap_pages;
        float4      swap_tuples;
        int32       swap_allvisible;

        swap_pages = relform1->relpages;
        relform1->relpages = relform2->relpages;
        relform2->relpages = swap_pages;

        swap_tuples = relform1->reltuples;
        relform1->reltuples = relform2->reltuples;
        relform2->reltuples = swap_tuples;

        swap_allvisible = relform1->relallvisible;
        relform1->relallvisible = relform2->relallvisible;
        relform2->relallvisible = swap_allvisible;
    }

    /*
     * Update the tuples in pg_class --- unless the target relation of the
     * swap is pg_class itself.  In that case, there is zero point in making
     * changes because we'd be updating the old data that we're about to throw
     * away.  Because the real work being done here for a mapped relation is
     * just to change the relation map settings, it's all right to not update
     * the pg_class rows in this case.
     */
    if (!target_is_pg_class)
    {
        simple_heap_update(relRelation, &reltup1->t_self, reltup1);
        simple_heap_update(relRelation, &reltup2->t_self, reltup2);

        /* Keep system catalogs current */
        indstate = CatalogOpenIndexes(relRelation);
        CatalogIndexInsert(indstate, reltup1);
        CatalogIndexInsert(indstate, reltup2);
        CatalogCloseIndexes(indstate);
    }
    else
    {
        /* no update ... but we do still need relcache inval */
        CacheInvalidateRelcacheByTuple(reltup1);
        CacheInvalidateRelcacheByTuple(reltup2);
    }

    /*
     * Post alter hook for modified relations. The change to r2 is always
     * internal, but r1 depends on the invocation context.
     */
    InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
                                 InvalidOid, is_internal);
    InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
                                 InvalidOid, true);

    /*
     * If we have toast tables associated with the relations being swapped,
     * deal with them too.
     *
     * Note that relform1/relform2 are our modified copies, so when the toast
     * links were swapped above, reltoastrelid here already holds the
     * post-swap values.
     */
    if (relform1->reltoastrelid || relform2->reltoastrelid)
    {
        if (swap_toast_by_content)
        {
            if (relform1->reltoastrelid && relform2->reltoastrelid)
            {
                /* Recursively swap the contents of the toast tables */
                swap_relation_files(relform1->reltoastrelid,
                                    relform2->reltoastrelid,
                                    target_is_pg_class,
                                    swap_toast_by_content,
                                    is_internal,
                                    frozenXid,
                                    frozenMulti,
                                    mapped_tables);
            }
            else
            {
                /* caller messed up */
                elog(ERROR, "cannot swap toast files by content when there's only one");
            }
        }
        else
        {
            /*
             * We swapped the ownership links, so we need to change dependency
             * data to match.
             *
             * NOTE: it is possible that only one table has a toast table.
             *
             * NOTE: at present, a TOAST table's only dependency is the one on
             * its owning table.  If more are ever created, we'd need to use
             * something more selective than deleteDependencyRecordsFor() to
             * get rid of just the link we want.
             */
            ObjectAddress baseobject,
                        toastobject;
            long        count;

            /*
             * We disallow this case for system catalogs, to avoid the
             * possibility that the catalog we're rebuilding is one of the
             * ones the dependency changes would change.  It's too late to be
             * making any data changes to the target catalog.
             */
            if (IsSystemClass(relform1))
                elog(ERROR, "cannot swap toast files by links for system catalogs");

            /* Delete old dependencies */
            if (relform1->reltoastrelid)
            {
                count = deleteDependencyRecordsFor(RelationRelationId,
                                                   relform1->reltoastrelid,
                                                   false);
                if (count != 1)
                    elog(ERROR, "expected one dependency record for TOAST table, found %ld",
                         count);
            }
            if (relform2->reltoastrelid)
            {
                count = deleteDependencyRecordsFor(RelationRelationId,
                                                   relform2->reltoastrelid,
                                                   false);
                if (count != 1)
                    elog(ERROR, "expected one dependency record for TOAST table, found %ld",
                         count);
            }

            /* Register new dependencies */
            baseobject.classId = RelationRelationId;
            baseobject.objectSubId = 0;
            toastobject.classId = RelationRelationId;
            toastobject.objectSubId = 0;

            if (relform1->reltoastrelid)
            {
                baseobject.objectId = r1;
                toastobject.objectId = relform1->reltoastrelid;
                recordDependencyOn(&toastobject, &baseobject,
                                   DEPENDENCY_INTERNAL);
            }

            if (relform2->reltoastrelid)
            {
                baseobject.objectId = r2;
                toastobject.objectId = relform2->reltoastrelid;
                recordDependencyOn(&toastobject, &baseobject,
                                   DEPENDENCY_INTERNAL);
            }
        }
    }

    /*
     * If we're swapping two toast tables by content, do the same for their
     * indexes.
     *
     * Invalid frozen values are passed here; presumably the relations named
     * by reltoastidxid have relkind RELKIND_INDEX, in which case the relkind
     * test above means the values are never stored -- confirm.
     */
    if (swap_toast_by_content &&
        relform1->reltoastidxid && relform2->reltoastidxid)
        swap_relation_files(relform1->reltoastidxid,
                            relform2->reltoastidxid,
                            target_is_pg_class,
                            swap_toast_by_content,
                            is_internal,
                            InvalidTransactionId,
                            InvalidMultiXactId,
                            mapped_tables);

    /* Clean up. */
    heap_freetuple(reltup1);
    heap_freetuple(reltup2);

    heap_close(relRelation, RowExclusiveLock);

    /*
     * Close both relcache entries' smgr links.  We need this kluge because
     * both links will be invalidated during upcoming CommandCounterIncrement.
     * Whichever of the rels is the second to be cleared will have a dangling
     * reference to the other's smgr entry.  Rather than trying to avoid this
     * by ordering operations just so, it's easiest to close the links first.
     * (Fortunately, since one of the entries is local in our transaction,
     * it's sufficient to clear out our own relcache this way; the problem
     * cannot arise for other backends when they see our update on the
     * non-transient relation.)
     *
     * Caution: the placement of this step interacts with the decision to
     * handle toast rels by recursion.  When we are trying to rebuild pg_class
     * itself, the smgr close on pg_class must happen after all accesses in
     * this function.
     */
    RelationCloseSmgrByOid(r1);
    RelationCloseSmgrByOid(r2);
}

Header And Logo

cluster.c File Reference

Data Structures

Functions

Function Documentation