PostgreSQL Source Code: src/backend/commands/cluster.c Source File

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * cluster.c
00004  *    CLUSTER a table on an index.  This is now also used for VACUUM FULL.
00005  *
00006  * There is hardly anything left of Paul Brown's original implementation...
00007  *
00008  *
00009  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00010  * Portions Copyright (c) 1994-5, Regents of the University of California
00011  *
00012  *
00013  * IDENTIFICATION
00014  *    src/backend/commands/cluster.c
00015  *
00016  *-------------------------------------------------------------------------
00017  */
00018 #include "postgres.h"
00019 
00020 #include "access/multixact.h"
00021 #include "access/relscan.h"
00022 #include "access/rewriteheap.h"
00023 #include "access/transam.h"
00024 #include "access/xact.h"
00025 #include "catalog/catalog.h"
00026 #include "catalog/dependency.h"
00027 #include "catalog/heap.h"
00028 #include "catalog/index.h"
00029 #include "catalog/namespace.h"
00030 #include "catalog/objectaccess.h"
00031 #include "catalog/toasting.h"
00032 #include "commands/cluster.h"
00033 #include "commands/matview.h"
00034 #include "commands/tablecmds.h"
00035 #include "commands/vacuum.h"
00036 #include "miscadmin.h"
00037 #include "optimizer/planner.h"
00038 #include "storage/bufmgr.h"
00039 #include "storage/lmgr.h"
00040 #include "storage/predicate.h"
00041 #include "storage/smgr.h"
00042 #include "utils/acl.h"
00043 #include "utils/fmgroids.h"
00044 #include "utils/inval.h"
00045 #include "utils/lsyscache.h"
00046 #include "utils/memutils.h"
00047 #include "utils/pg_rusage.h"
00048 #include "utils/relmapper.h"
00049 #include "utils/snapmgr.h"
00050 #include "utils/syscache.h"
00051 #include "utils/tqual.h"
00052 #include "utils/tuplesort.h"
00053 
00054 
00055 /*
00056  * RelToCluster identifies one table to be clustered together with the index
00057  * to cluster it on.  We need this so we can make a list of such pairs when
00058  * invoked without a specific table/index pair; cluster() walks that list and
       * hands each entry's two OIDs to cluster_rel().
00059  */
00060 typedef struct
00061 {
00062     Oid         tableOid;       /* OID of the table to cluster */
00063     Oid         indexOid;       /* OID of the index to cluster it on */
00064 } RelToCluster;
00065 
00066 
/* Prototypes for helper functions private to this file (all are static). */
00067 static void rebuild_relation(Relation OldHeap, Oid indexOid,
00068                  int freeze_min_age, int freeze_table_age, bool verbose);
00069 static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
00070                int freeze_min_age, int freeze_table_age, bool verbose,
00071                bool *pSwapToastByContent, TransactionId *pFreezeXid,
00072                MultiXactId *pFreezeMulti);
00073 static List *get_tables_to_cluster(MemoryContext cluster_context);
00074 static void reform_and_rewrite_tuple(HeapTuple tuple,
00075                          TupleDesc oldTupDesc, TupleDesc newTupDesc,
00076                          Datum *values, bool *isnull,
00077                          bool newRelHasOids, RewriteState rwstate);
00078 
00079 
00080 /*---------------------------------------------------------------------------
00081  * This cluster code allows for clustering multiple tables at once. Because
00082  * of this, we cannot just run everything in a single transaction, or we
00083  * would be forced to acquire exclusive locks on all the tables being
00084  * clustered, simultaneously --- very likely leading to deadlock.
00085  *
00086  * To solve this we follow a strategy similar to the VACUUM code,
00087  * clustering each relation in a separate transaction. For this to work,
00088  * we need to:
00089  *  - provide a separate memory context so that we can pass information in
00090  *    a way that survives across transactions
00091  *  - start a new transaction every time a new relation is clustered
00092  *  - check for validity of the information on to-be-clustered relations,
00093  *    as someone might have deleted a relation behind our back, or
00094  *    clustered one on a different index
00095  *  - end the transaction
00096  *
00097  * The single-relation case does not have any such overhead.
00098  *
00099  * We also allow a relation to be specified without index.  In that case,
00100  * the indisclustered bit will be looked up, and an ERROR will be thrown
00101  * if there is no index with the bit set.
       *
       * stmt: the CLUSTER statement.  stmt->relation is NULL for the
       *      multi-relation form; stmt->indexname may be NULL when a relation is
       *      given; stmt->verbose is passed through to cluster_rel().
       * isTopLevel: handed to PreventTransactionChain() in the multi-relation
       *      case, which must not run inside a user transaction block.
00102  *---------------------------------------------------------------------------
00103  */
00104 void
00105 cluster(ClusterStmt *stmt, bool isTopLevel)
00106 {
00107     if (stmt->relation != NULL)
00108     {
00109         /* This is the single-relation case. */
00110         Oid         tableOid,
00111                     indexOid = InvalidOid;
00112         Relation    rel;
00113 
00114         /* Find, lock, and check permissions on the table */
00115         tableOid = RangeVarGetRelidExtended(stmt->relation,
00116                                             AccessExclusiveLock,
00117                                             false, false,
00118                                             RangeVarCallbackOwnsTable, NULL);
00119         rel = heap_open(tableOid, NoLock);
00120 
00121         /*
00122          * Reject clustering a remote temp table ... their local buffer
00123          * manager is not going to cope.
00124          */
00125         if (RELATION_IS_OTHER_TEMP(rel))
00126             ereport(ERROR,
00127                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
00128                errmsg("cannot cluster temporary tables of other sessions")));
00129 
00130         if (stmt->indexname == NULL)
00131         {
00132             ListCell   *index;
00133 
00134             /*
                   * No index was named, so we need to find the index that has
                   * indisclustered set.  indexOid ends up InvalidOid if none does.
                   */
00135             foreach(index, RelationGetIndexList(rel))
00136             {
00137                 HeapTuple   idxtuple;
00138                 Form_pg_index indexForm;
00139 
00140                 indexOid = lfirst_oid(index);
00141                 idxtuple = SearchSysCache1(INDEXRELID,
00142                                            ObjectIdGetDatum(indexOid));
00143                 if (!HeapTupleIsValid(idxtuple))
00144                     elog(ERROR, "cache lookup failed for index %u", indexOid);
00145                 indexForm = (Form_pg_index) GETSTRUCT(idxtuple);
00146                 if (indexForm->indisclustered)
00147                 {
00148                     ReleaseSysCache(idxtuple);
00149                     break;
00150                 }
00151                 ReleaseSysCache(idxtuple);
00152                 indexOid = InvalidOid;  /* not this one; forget it */
00153             }
00154 
00155             if (!OidIsValid(indexOid))
00156                 ereport(ERROR,
00157                         (errcode(ERRCODE_UNDEFINED_OBJECT),
00158                          errmsg("there is no previously clustered index for table \"%s\"",
00159                                 stmt->relation->relname)));
00160         }
00161         else
00162         {
00163             /*
00164              * The index is expected to be in the same namespace as the
00165              * relation, so look the name up there only.
00166              */
00167             indexOid = get_relname_relid(stmt->indexname,
00168                                          rel->rd_rel->relnamespace);
00169             if (!OidIsValid(indexOid))
00170                 ereport(ERROR,
00171                         (errcode(ERRCODE_UNDEFINED_OBJECT),
00172                        errmsg("index \"%s\" for table \"%s\" does not exist",
00173                               stmt->indexname, stmt->relation->relname)));
00174         }
00175 
00176         /* close relation, keep lock till commit */
00177         heap_close(rel, NoLock);
00178 
00179         /* Do the job: recheck = false, and -1 for both freeze-age arguments */
00180         cluster_rel(tableOid, indexOid, false, stmt->verbose, -1, -1);
00181     }
00182     else
00183     {
00184         /*
00185          * This is the "multi relation" case. We need to cluster all tables
00186          * that have some index with indisclustered set.
00187          */
00188         MemoryContext cluster_context;
00189         List       *rvs;
00190         ListCell   *rv;
00191 
00192         /*
00193          * We cannot run this form of CLUSTER inside a user transaction block;
00194          * we'd be holding locks way too long.
00195          */
00196         PreventTransactionChain(isTopLevel, "CLUSTER");
00197 
00198         /*
00199          * Create special memory context for cross-transaction storage.
00200          *
00201          * Since it is a child of PortalContext, it will go away even in case
00202          * of error.
00203          */
00204         cluster_context = AllocSetContextCreate(PortalContext,
00205                                                 "Cluster",
00206                                                 ALLOCSET_DEFAULT_MINSIZE,
00207                                                 ALLOCSET_DEFAULT_INITSIZE,
00208                                                 ALLOCSET_DEFAULT_MAXSIZE);
00209 
00210         /*
00211          * Build the list of relations to cluster.  Note that this lives in
00212          * cluster_context.
00213          */
00214         rvs = get_tables_to_cluster(cluster_context);
00215 
00216         /*
                   * Drop the active snapshot and commit the transaction we were
                   * started in; each table below gets a transaction of its own.
                   */
00217         PopActiveSnapshot();
00218         CommitTransactionCommand();
00219 
00220         /* Ok, now that we've got them all, cluster them one by one */
00221         foreach(rv, rvs)
00222         {
00223             RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
00224 
00225             /* Start a new transaction for each relation. */
00226             StartTransactionCommand();
00227             /* functions in indexes may want a snapshot set */
00228             PushActiveSnapshot(GetTransactionSnapshot());
                  /* recheck = true: the list was built in an earlier transaction */
00229             cluster_rel(rvtc->tableOid, rvtc->indexOid, true, stmt->verbose,
00230                         -1, -1);
00231             PopActiveSnapshot();
00232             CommitTransactionCommand();
00233         }
00234 
00235         /* Start a new transaction for the cleanup work. */
00236         StartTransactionCommand();
00237 
00238         /* Clean up working storage */
00239         MemoryContextDelete(cluster_context);
00240     }
00241 }
00242 
00243 /*
00244  * cluster_rel
00245  *
00246  * This clusters the table by creating a new, clustered table and
00247  * swapping the relfilenodes of the new table and the old table, so
00248  * the OID of the original table is preserved.  Thus we do not lose
00249  * GRANT, inheritance nor references to this table (this was a bug
00250  * in releases thru 7.3).
00251  *
00252  * Indexes are rebuilt too, via REINDEX. Since we are effectively bulk-loading
00253  * the new table, it's better to create the indexes afterwards than to fill
00254  * them incrementally while we load the table.
00255  *
00256  * If indexOid is InvalidOid, the table will be rewritten in physical order
00257  * instead of index order.  This is the new implementation of VACUUM FULL,
00258  * and error messages should refer to the operation as VACUUM not CLUSTER.
       *
       * recheck: if true (cluster() passes this in its multi-relation path),
       * re-verify ownership, temp-table status and the index's indisclustered
       * flag, and return silently without doing anything if any check fails.
       * The function also returns silently if the table no longer exists or is
       * an unpopulated materialized view.
       *
       * verbose, freeze_min_age, freeze_table_age: passed on unchanged to
       * rebuild_relation().
00259  */
00260 void
00261 cluster_rel(Oid tableOid, Oid indexOid, bool recheck, bool verbose,
00262             int freeze_min_age, int freeze_table_age)
00263 {
00264     Relation    OldHeap;
00265 
00266     /* Check for user-requested abort. */
00267     CHECK_FOR_INTERRUPTS();
00268 
00269     /*
00270      * We grab exclusive access to the target rel and index for the duration
00271      * of the transaction.  (This is redundant for the single-transaction
00272      * case, since cluster() already did it.)  The index lock is taken inside
00273      * check_index_is_clusterable.
00274      */
00275     OldHeap = try_relation_open(tableOid, AccessExclusiveLock);
00276 
00277     /* If the table has gone away, we can skip processing it */
00278     if (!OldHeap)
00279         return;
00280 
00281     /*
00282      * Since we may open a new transaction for each relation, we have to check
00283      * that the relation still is what we think it is.
00284      *
00285      * If this is a single-transaction CLUSTER, we can skip these tests. We
00286      * *must* skip the one on indisclustered since it would reject an attempt
00287      * to cluster a not-previously-clustered index.
00288      */
00289     if (recheck)
00290     {
00291         HeapTuple   tuple;
00292         Form_pg_index indexForm;
00293 
00294         /* Check that the user still owns the relation */
00295         if (!pg_class_ownercheck(tableOid, GetUserId()))
00296         {
00297             relation_close(OldHeap, AccessExclusiveLock);
00298             return;
00299         }
00300 
00301         /*
00302          * Silently skip a temp table for a remote session.  Only doing this
00303          * check in the "recheck" case is appropriate (which currently means
00304          * somebody is executing a database-wide CLUSTER), because there is
00305          * another check in cluster() which will stop any attempt to cluster
00306          * remote temp tables by name.  The unconditional check further down
00307          * in this function is redundant, but we leave it for extra safety.
00308          */
00309         if (RELATION_IS_OTHER_TEMP(OldHeap))
00310         {
00311             relation_close(OldHeap, AccessExclusiveLock);
00312             return;
00313         }
00314 
00315         if (OidIsValid(indexOid))
00316         {
00317             /*
00318              * Check that the index still exists
00319              */
00320             if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(indexOid)))
00321             {
00322                 relation_close(OldHeap, AccessExclusiveLock);
00323                 return;
00324             }
00325 
00326             /*
00327              * Check that the index is still the one with indisclustered set.
00328              */
00329             tuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
00330             if (!HeapTupleIsValid(tuple))       /* probably can't happen */
00331             {
00332                 relation_close(OldHeap, AccessExclusiveLock);
00333                 return;
00334             }
00335             indexForm = (Form_pg_index) GETSTRUCT(tuple);
00336             if (!indexForm->indisclustered)
00337             {
00338                 ReleaseSysCache(tuple);
00339                 relation_close(OldHeap, AccessExclusiveLock);
00340                 return;
00341             }
00342             ReleaseSysCache(tuple);
00343         }
00344     }
00345 
00346     /*
00347      * We allow VACUUM FULL, but not CLUSTER, on shared catalogs.  CLUSTER
00348      * would work in most respects, but the index would only get marked as
00349      * indisclustered in the current database, leading to unexpected behavior
00350      * if CLUSTER were later invoked in another database.
00351      */
00352     if (OidIsValid(indexOid) && OldHeap->rd_rel->relisshared)
00353         ereport(ERROR,
00354                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
00355                  errmsg("cannot cluster a shared catalog")));
00356 
00357     /*
00358      * Don't process temp tables of other backends ... their local buffer
00359      * manager is not going to cope.  The message names CLUSTER or VACUUM
             * depending on whether an index was supplied.
00360      */
00361     if (RELATION_IS_OTHER_TEMP(OldHeap))
00362     {
00363         if (OidIsValid(indexOid))
00364             ereport(ERROR,
00365                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
00366                errmsg("cannot cluster temporary tables of other sessions")));
00367         else
00368             ereport(ERROR,
00369                     (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
00370                 errmsg("cannot vacuum temporary tables of other sessions")));
00371     }
00372 
00373     /*
00374      * Also check for active uses of the relation in the current transaction,
00375      * including open scans and pending AFTER trigger events.
00376      */
00377     CheckTableNotInUse(OldHeap, OidIsValid(indexOid) ? "CLUSTER" : "VACUUM");
00378 
00379     /* Check heap and index are valid to cluster on */
00380     if (OidIsValid(indexOid))
00381         check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
00382 
00383     /*
00384      * Quietly ignore the request if this is a materialized view which has not
00385      * been populated from its query. No harm is done because there is no data
00386      * to deal with, and we don't want to throw an error if this is part of a
00387      * multi-relation request -- for example, CLUSTER was run on the entire
00388      * database.
00389      */
00390     if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW &&
00391         !OldHeap->rd_ispopulated)
00392     {
00393         relation_close(OldHeap, AccessExclusiveLock);
00394         return;
00395     }
00396 
00397     /*
00398      * All predicate locks on the tuples or pages are about to be made
00399      * invalid, because we move tuples around.  Promote them to relation
00400      * locks.  Predicate locks on indexes will be promoted when they are
00401      * reindexed.
00402      */
00403     TransferPredicateLocksToHeapRelation(OldHeap);
00404 
00405     /* rebuild_relation does all the dirty work */
00406     rebuild_relation(OldHeap, indexOid, freeze_min_age, freeze_table_age,
00407                      verbose);
00408 
00409     /* NB: rebuild_relation does heap_close() on OldHeap */
00410 }
00411 
00412 /*
00413  * Verify that the specified heap and index are valid to cluster on
00414  *
       * Raises ERROR if indexOid is not an index on OldHeap, if its access
       * method does not allow clustering, if it is a partial index, or if it is
       * not marked valid.  Returns normally otherwise.  The recheck argument is
       * not examined anywhere in this function.
       *
00415  * Side effect: obtains a lock of strength lockmode on the index and keeps it
00416  * (cluster_rel passes AccessExclusiveLock).  The caller should
00417  * already have exclusive lock on the table, so the index lock is likely
00418  * redundant, but it seems best to grab it anyway to ensure the index
       * definition can't change under us.
00419  */
00420 void
00421 check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode)
00422 {
00423     Relation    OldIndex;
00424 
00425     OldIndex = index_open(indexOid, lockmode);
00426 
00427     /*
00428      * Check that index is in fact an index on the given relation
00429      */
00430     if (OldIndex->rd_index == NULL ||
00431         OldIndex->rd_index->indrelid != RelationGetRelid(OldHeap))
00432         ereport(ERROR,
00433                 (errcode(ERRCODE_WRONG_OBJECT_TYPE),
00434                  errmsg("\"%s\" is not an index for table \"%s\"",
00435                         RelationGetRelationName(OldIndex),
00436                         RelationGetRelationName(OldHeap))));
00437 
00438     /* Index AM must allow clustering */
00439     if (!OldIndex->rd_am->amclusterable)
00440         ereport(ERROR,
00441                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
00442                  errmsg("cannot cluster on index \"%s\" because access method does not support clustering",
00443                         RelationGetRelationName(OldIndex))));
00444 
00445     /*
00446      * Disallow clustering on incomplete indexes (those that might not index
00447      * every row of the relation).  We could relax this by making a separate
00448      * seqscan pass over the table to copy the missing rows, but that seems
00449      * expensive and tedious.  A non-null indpred is what identifies a
             * partial index here.
00450      */
00451     if (!heap_attisnull(OldIndex->rd_indextuple, Anum_pg_index_indpred))
00452         ereport(ERROR,
00453                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
00454                  errmsg("cannot cluster on partial index \"%s\"",
00455                         RelationGetRelationName(OldIndex))));
00456 
00457     /*
00458      * Disallow if index is left over from a failed CREATE INDEX CONCURRENTLY;
00459      * it might well not contain entries for every heap row, or might not even
00460      * be internally consistent.  (But note that we don't check indcheckxmin;
00461      * the worst consequence of following broken HOT chains would be that we
00462      * might put recently-dead tuples out-of-order in the new table, and there
00463      * is little harm in that.)
00464      */
00465     if (!IndexIsValid(OldIndex->rd_index))
00466         ereport(ERROR,
00467                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
00468                  errmsg("cannot cluster on invalid index \"%s\"",
00469                         RelationGetRelationName(OldIndex))));
00470 
00471     /* Drop relcache refcnt on OldIndex, but keep lock */
00472     index_close(OldIndex, NoLock);
00473 }
00474 
00475 /*
00476  * mark_index_clustered: mark the specified index as the one clustered on
00477  *
00478  * With indexOid == InvalidOid, will mark all indexes of rel not-clustered.
00479  *
       * rel: the table whose indexes are examined (via RelationGetIndexList).
       * indexOid: index to flag as clustered, or InvalidOid as described above.
       * is_internal: forwarded to the object post-alter hook.  That hook is
       * invoked once for every index of rel that the loop visits, whether or
       * not that index's pg_index row was actually changed.
       *
       * Returns early, touching nothing, if indexOid is already marked clustered.
       *
00480  * Note: we do transactional updates of the pg_index rows, which are unsafe
00481  * against concurrent SnapshotNow scans of pg_index.  Therefore this is unsafe
00482  * to execute with less than full exclusive lock on the parent table;
00483  * otherwise concurrent executions of RelationGetIndexList could miss indexes.
00484  */
00485 void
00486 mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
00487 {
00488     HeapTuple   indexTuple;
00489     Form_pg_index indexForm;
00490     Relation    pg_index;
00491     ListCell   *index;
00492 
00493     /*
00494      * If the index is already marked clustered, no need to do anything.
00495      */
00496     if (OidIsValid(indexOid))
00497     {
00498         indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(indexOid));
00499         if (!HeapTupleIsValid(indexTuple))
00500             elog(ERROR, "cache lookup failed for index %u", indexOid);
00501         indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
00502 
00503         if (indexForm->indisclustered)
00504         {
00505             ReleaseSysCache(indexTuple);
00506             return;
00507         }
00508 
00509         ReleaseSysCache(indexTuple);
00510     }
00511 
00512     /*
00513      * Check each index of the relation and set/clear the bit as needed.
00514      */
00515     pg_index = heap_open(IndexRelationId, RowExclusiveLock);
00516 
00517     foreach(index, RelationGetIndexList(rel))
00518     {
00519         Oid         thisIndexOid = lfirst_oid(index);
00520 
              /* fetch a copy, since the tuple is modified in place below */
00521         indexTuple = SearchSysCacheCopy1(INDEXRELID,
00522                                          ObjectIdGetDatum(thisIndexOid));
00523         if (!HeapTupleIsValid(indexTuple))
00524             elog(ERROR, "cache lookup failed for index %u", thisIndexOid);
00525         indexForm = (Form_pg_index) GETSTRUCT(indexTuple);
00526 
00527         /*
00528          * Unset the bit if set.  We know it's wrong because we checked this
00529          * earlier: had the requested index carried the bit, we would have
                 * returned before reaching this loop.
00530          */
00531         if (indexForm->indisclustered)
00532         {
00533             indexForm->indisclustered = false;
00534             simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
00535             CatalogUpdateIndexes(pg_index, indexTuple);
00536         }
00537         else if (thisIndexOid == indexOid)
00538         {
00539             /* this was checked earlier, but let's be real sure */
00540             if (!IndexIsValid(indexForm))
00541                 elog(ERROR, "cannot cluster on invalid index %u", indexOid);
00542             indexForm->indisclustered = true;
00543             simple_heap_update(pg_index, &indexTuple->t_self, indexTuple);
00544             CatalogUpdateIndexes(pg_index, indexTuple);
00545         }
00546 
              /* fired for every index visited, changed or not */
00547         InvokeObjectPostAlterHookArg(IndexRelationId, thisIndexOid, 0,
00548                                      InvalidOid, is_internal);
00549 
00550         heap_freetuple(indexTuple);     /* release the copy made above */
00551     }
00552 
00553     heap_close(pg_index, RowExclusiveLock);
00554 }
00555 
00556 /*
00557  * rebuild_relation: rebuild an existing relation in index or physical order
00558  *
00559  * OldHeap: table to rebuild --- must be opened and exclusive-locked!
00560  * indexOid: index to cluster by, or InvalidOid to rewrite in physical order.
       * freeze_min_age, freeze_table_age, verbose: passed on to copy_heap_data.
       *
       * The steps, in order: mark the chosen index as clustered (only when an
       * index was given), close OldHeap, create a transient heap in the same
       * tablespace with make_new_heap, fill it with copy_heap_data, and let
       * finish_heap_swap exchange the two and finish up.
00561  *
00562  * NB: this routine closes OldHeap at the right time; caller should not.
00563  */
00564 static void
00565 rebuild_relation(Relation OldHeap, Oid indexOid,
00566                  int freeze_min_age, int freeze_table_age, bool verbose)
00567 {
          /* capture what we need from OldHeap before it is closed below */
00568     Oid         tableOid = RelationGetRelid(OldHeap);
00569     Oid         tableSpace = OldHeap->rd_rel->reltablespace;
00570     Oid         OIDNewHeap;
00571     bool        is_system_catalog;
00572     bool        swap_toast_by_content;
00573     TransactionId frozenXid;
00574     MultiXactId frozenMulti;
00575 
00576     /* Mark the correct index as clustered (is_internal = true) */
00577     if (OidIsValid(indexOid))
00578         mark_index_clustered(OldHeap, indexOid, true);
00579 
00580     /* Remember if it's a system catalog */
00581     is_system_catalog = IsSystemRelation(OldHeap);
00582 
00583     /* Close relcache entry, but keep lock until transaction commit */
00584     heap_close(OldHeap, NoLock);
00585 
00586     /* Create the transient table that will receive the re-ordered data */
00587     OIDNewHeap = make_new_heap(tableOid, tableSpace);
00588 
00589     /*
             * Copy the heap data into the new table in the desired order.  The
             * last three arguments are outputs that are handed to
             * finish_heap_swap below.
             */
00590     copy_heap_data(OIDNewHeap, tableOid, indexOid,
00591                    freeze_min_age, freeze_table_age, verbose,
00592                    &swap_toast_by_content, &frozenXid, &frozenMulti);
00593 
00594     /*
00595      * Swap the physical files of the target and transient tables, then
00596      * rebuild the target's indexes and throw away the transient table.
             *
             * NOTE(review): the literal "false, true" arguments are positional
             * flags of finish_heap_swap; their meaning is not visible here, so
             * check its declaration before changing them.
00597      */
00598     finish_heap_swap(tableOid, OIDNewHeap, is_system_catalog,
00599                      swap_toast_by_content, false, true,
00600                      frozenXid, frozenMulti);
00601 }
00602 
00603 
00604 /*
00605  * Create the transient table that will be filled with new data during
00606  * CLUSTER, ALTER TABLE, and similar operations.  The transient table
00607  * duplicates the logical structure of the OldHeap, but is placed in
00608  * NewTableSpace which might be different from OldHeap's.
00609  *
       * OIDOldHeap: OID of the existing table to model the new one on.  It is
       * opened here with AccessExclusiveLock, and that lock is not released by
       * this function (the relation is closed with NoLock).
       * NewTableSpace: tablespace OID passed to heap_create_with_catalog.
       *
       * Returns the OID of the newly created transient heap.  If the old heap
       * has a TOAST table, AlterTableCreateToastTable is called for the new
       * heap with the old TOAST table's reloptions.
       *
00610  * After this, the caller should load the new heap with transferred/modified
00611  * data, then call finish_heap_swap to complete the operation.
00612  */
00613 Oid
00614 make_new_heap(Oid OIDOldHeap, Oid NewTableSpace)
00615 {
00616     TupleDesc   OldHeapDesc;
00617     char        NewHeapName[NAMEDATALEN];
00618     Oid         OIDNewHeap;
00619     Oid         toastid;
00620     Relation    OldHeap;
00621     HeapTuple   tuple;
00622     Datum       reloptions;
00623     bool        isNull;
00624 
00625     OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
00626     OldHeapDesc = RelationGetDescr(OldHeap);
00627 
00628     /*
00629      * Note that the NewHeap will not receive any of the defaults or
00630      * constraints associated with the OldHeap; we don't need 'em, and there's
00631      * no reason to spend cycles inserting them into the catalogs only to
00632      * delete them.
00633      */
00634 
00635     /*
00636      * But we do want to use reloptions of the old heap for new heap.  The
             * syscache tuple stays pinned until after heap_create_with_catalog
             * returns (presumably because reloptions may point into it; confirm
             * before moving the ReleaseSysCache call).
00637      */
00638     tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(OIDOldHeap));
00639     if (!HeapTupleIsValid(tuple))
00640         elog(ERROR, "cache lookup failed for relation %u", OIDOldHeap);
00641     reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
00642                                  &isNull);
00643     if (isNull)
00644         reloptions = (Datum) 0;
00645 
00646     /*
00647      * Create the new heap, using a temporary name in the same namespace as
00648      * the existing table.  NOTE: there is some risk of collision with user
00649      * relnames.  Working around this seems more trouble than it's worth; in
00650      * particular, we can't create the new heap in a different namespace from
00651      * the old, or we will have problems with the TEMP status of temp tables.
00652      *
00653      * Note: the new heap is not a shared relation, even if we are rebuilding
00654      * a shared rel.  However, we do make the new heap mapped if the source is
00655      * mapped.  This simplifies swap_relation_files, and is absolutely
00656      * necessary for rebuilding pg_class, for reasons explained there.
             *
             * The name is "pg_temp_" followed by the old heap's OID.  Owner,
             * tuple descriptor, relkind and relpersistence are copied from the
             * old heap.
             *
             * NOTE(review): the remaining literal arguments (InvalidOid, NIL,
             * false, true, 0, ...) are positional; their meanings are not visible
             * here, so consult heap_create_with_catalog's declaration.
00657      */
00658     snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap);
00659 
00660     OIDNewHeap = heap_create_with_catalog(NewHeapName,
00661                                           RelationGetNamespace(OldHeap),
00662                                           NewTableSpace,
00663                                           InvalidOid,
00664                                           InvalidOid,
00665                                           InvalidOid,
00666                                           OldHeap->rd_rel->relowner,
00667                                           OldHeapDesc,
00668                                           NIL,
00669                                           OldHeap->rd_rel->relkind,
00670                                           OldHeap->rd_rel->relpersistence,
00671                                           false,
00672                                           RelationIsMapped(OldHeap),
00673                                           true,
00674                                           0,
00675                                           ONCOMMIT_NOOP,
00676                                           reloptions,
00677                                           false,
00678                                           true,
00679                                           true);
00680     Assert(OIDNewHeap != InvalidOid);
00681 
00682     ReleaseSysCache(tuple);
00683 
00684     /*
00685      * Advance command counter so that the newly-created relation's catalog
00686      * tuples will be visible to heap_open.
00687      */
00688     CommandCounterIncrement();
00689 
00690     /*
00691      * If necessary, create a TOAST table for the new relation.
00692      *
00693      * If the relation doesn't have a TOAST table already, we can't need one
00694      * for the new relation.  The other way around is possible though: if some
00695      * wide columns have been dropped, AlterTableCreateToastTable can decide
00696      * that no TOAST table is needed for the new table.
00697      *
00698      * Note that AlterTableCreateToastTable ends with CommandCounterIncrement,
00699      * so that the TOAST table will be visible for insertion.
00700      */
00701     toastid = OldHeap->rd_rel->reltoastrelid;
00702     if (OidIsValid(toastid))
00703     {
00704         /* keep the existing toast table's reloptions, if any */
00705         tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(toastid));
00706         if (!HeapTupleIsValid(tuple))
00707             elog(ERROR, "cache lookup failed for relation %u", toastid);
00708         reloptions = SysCacheGetAttr(RELOID, tuple, Anum_pg_class_reloptions,
00709                                      &isNull);
00710         if (isNull)
00711             reloptions = (Datum) 0;
00712 
00713         AlterTableCreateToastTable(OIDNewHeap, reloptions);
00714 
00715         ReleaseSysCache(tuple);
00716     }
00717 
          /* drop our reference to the old heap but keep the lock taken above */
00718     heap_close(OldHeap, NoLock);
00719 
00720     return OIDNewHeap;
00721 }
00722 
00723 /*
00724  * Do the physical copying of heap data.
00725  *
00726  * There are three output parameters:
00727  * *pSwapToastByContent is set true if toast tables must be swapped by content.
00728  * *pFreezeXid receives the TransactionId used as freeze cutoff point.
       * *pFreezeMulti receives the MultiXactId counterpart of that cutoff.
00729  */
00730 static void
00731 copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
00732                int freeze_min_age, int freeze_table_age, bool verbose,
00733                bool *pSwapToastByContent, TransactionId *pFreezeXid,
00734                MultiXactId *pFreezeMulti)
00735 {
00736     Relation    NewHeap,
00737                 OldHeap,
00738                 OldIndex;
00739     TupleDesc   oldTupDesc;
00740     TupleDesc   newTupDesc;
00741     int         natts;
00742     Datum      *values;
00743     bool       *isnull;
00744     IndexScanDesc indexScan;
00745     HeapScanDesc heapScan;
00746     bool        use_wal;
00747     bool        is_system_catalog;
00748     TransactionId OldestXmin;
00749     TransactionId FreezeXid;
00750     MultiXactId MultiXactFrzLimit;
00751     RewriteState rwstate;
00752     bool        use_sort;
00753     Tuplesortstate *tuplesort;
00754     double      num_tuples = 0,
00755                 tups_vacuumed = 0,
00756                 tups_recently_dead = 0;
00757     int         elevel = verbose ? INFO : DEBUG2;
00758     PGRUsage    ru0;
00759 
00760     pg_rusage_init(&ru0);
00761 
00762     /*
00763      * Open the relations we need.
00764      */
00765     NewHeap = heap_open(OIDNewHeap, AccessExclusiveLock);
00766     OldHeap = heap_open(OIDOldHeap, AccessExclusiveLock);
00767     if (OidIsValid(OIDOldIndex))
00768         OldIndex = index_open(OIDOldIndex, AccessExclusiveLock);
00769     else
00770         OldIndex = NULL;
00771 
00772     /*
00773      * Their tuple descriptors should be exactly alike, but here we only need
00774      * assume that they have the same number of columns.
00775      */
00776     oldTupDesc = RelationGetDescr(OldHeap);
00777     newTupDesc = RelationGetDescr(NewHeap);
00778     Assert(newTupDesc->natts == oldTupDesc->natts);
00779 
00780     /* Preallocate values/isnull arrays */
00781     natts = newTupDesc->natts;
00782     values = (Datum *) palloc(natts * sizeof(Datum));
00783     isnull = (bool *) palloc(natts * sizeof(bool));
00784 
00785     /*
00786      * If the OldHeap has a toast table, get lock on the toast table to keep
00787      * it from being vacuumed.  This is needed because autovacuum processes
00788      * toast tables independently of their main tables, with no lock on the
00789      * latter.  If an autovacuum were to start on the toast table after we
00790      * compute our OldestXmin below, it would use a later OldestXmin, and then
00791      * possibly remove as DEAD toast tuples belonging to main tuples we think
00792      * are only RECENTLY_DEAD.  Then we'd fail while trying to copy those
00793      * tuples.
00794      *
00795      * We don't need to open the toast relation here, just lock it.  The lock
00796      * will be held till end of transaction.
00797      */
00798     if (OldHeap->rd_rel->reltoastrelid)
00799         LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
00800 
00801     /*
00802      * We need to log the copied data in WAL iff WAL archiving/streaming is
00803      * enabled AND it's a WAL-logged rel.
00804      */
00805     use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
00806 
00807     /* use_wal off requires smgr_targblock be initially invalid */
00808     Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
00809 
00810     /*
00811      * If both tables have TOAST tables, perform toast swap by content.  It is
00812      * possible that the old table has a toast table but the new one doesn't,
00813      * if toastable columns have been dropped.  In that case we have to do
00814      * swap by links.  This is okay because swap by content is only essential
00815      * for system catalogs, and we don't support schema changes for them.
00816      */
00817     if (OldHeap->rd_rel->reltoastrelid && NewHeap->rd_rel->reltoastrelid)
00818     {
00819         *pSwapToastByContent = true;
00820 
00821         /*
00822          * When doing swap by content, any toast pointers written into NewHeap
00823          * must use the old toast table's OID, because that's where the toast
00824          * data will eventually be found.  Set this up by setting rd_toastoid.
00825          * This also tells toast_save_datum() to preserve the toast value
00826          * OIDs, which we want so as not to invalidate toast pointers in
00827          * system catalog caches, and to avoid making multiple copies of a
00828          * single toast value.
00829          *
00830          * Note that we must hold NewHeap open until we are done writing data,
00831          * since the relcache will not guarantee to remember this setting once
00832          * the relation is closed.  Also, this technique depends on the fact
00833          * that no one will try to read from the NewHeap until after we've
00834          * finished writing it and swapping the rels --- otherwise they could
00835          * follow the toast pointers to the wrong place.  (It would actually
00836          * work for values copied over from the old toast table, but not for
00837          * any values that we toast which were previously not toasted.)
00838          */
00839         NewHeap->rd_toastoid = OldHeap->rd_rel->reltoastrelid;
00840     }
00841     else
00842         *pSwapToastByContent = false;
00843 
00844     /*
00845      * compute xids used to freeze and weed out dead tuples.  We use -1
00846      * freeze_min_age to avoid having CLUSTER freeze tuples earlier than a
00847      * plain VACUUM would.
00848      */
00849     vacuum_set_xid_limits(freeze_min_age, freeze_table_age,
00850                           OldHeap->rd_rel->relisshared,
00851                           &OldestXmin, &FreezeXid, NULL, &MultiXactFrzLimit);
00852 
00853     /*
00854      * FreezeXid will become the table's new relfrozenxid, and that mustn't go
00855      * backwards, so take the max.
00856      */
00857     if (TransactionIdPrecedes(FreezeXid, OldHeap->rd_rel->relfrozenxid))
00858         FreezeXid = OldHeap->rd_rel->relfrozenxid;
00859 
00860     /* return selected values to caller */
00861     *pFreezeXid = FreezeXid;
00862     *pFreezeMulti = MultiXactFrzLimit;
00863 
00864     /* Remember if it's a system catalog */
00865     is_system_catalog = IsSystemRelation(OldHeap);
00866 
00867     /* Initialize the rewrite operation */
00868     rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid,
00869                                  MultiXactFrzLimit, use_wal);
00870 
00871     /*
00872      * Decide whether to use an indexscan or seqscan-and-optional-sort to scan
00873      * the OldHeap.  We know how to use a sort to duplicate the ordering of a
00874      * btree index, and will use seqscan-and-sort for that case if the planner
00875      * tells us it's cheaper.  Otherwise, always indexscan if an index is
00876      * provided, else plain seqscan.
00877      */
00878     if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID)
00879         use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex);
00880     else
00881         use_sort = false;
00882 
00883     /* Set up sorting if wanted */
00884     if (use_sort)
00885         tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
00886                                             maintenance_work_mem, false);
00887     else
00888         tuplesort = NULL;
00889 
00890     /*
00891      * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
00892      * that still need to be copied, we scan with SnapshotAny and use
00893      * HeapTupleSatisfiesVacuum for the visibility test.
00894      */
00895     if (OldIndex != NULL && !use_sort)
00896     {
00897         heapScan = NULL;
00898         indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
00899         index_rescan(indexScan, NULL, 0, NULL, 0);
00900     }
00901     else
00902     {
00903         heapScan = heap_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
00904         indexScan = NULL;
00905     }
00906 
00907     /* Log what we're doing */
00908     if (indexScan != NULL)
00909         ereport(elevel,
00910                 (errmsg("clustering \"%s.%s\" using index scan on \"%s\"",
00911                         get_namespace_name(RelationGetNamespace(OldHeap)),
00912                         RelationGetRelationName(OldHeap),
00913                         RelationGetRelationName(OldIndex))));
00914     else if (tuplesort != NULL)
00915         ereport(elevel,
00916                 (errmsg("clustering \"%s.%s\" using sequential scan and sort",
00917                         get_namespace_name(RelationGetNamespace(OldHeap)),
00918                         RelationGetRelationName(OldHeap))));
00919     else
00920         ereport(elevel,
00921                 (errmsg("vacuuming \"%s.%s\"",
00922                         get_namespace_name(RelationGetNamespace(OldHeap)),
00923                         RelationGetRelationName(OldHeap))));
00924 
00925     if (OldHeap->rd_rel->relkind == RELKIND_MATVIEW)
00926         /* Make sure the heap looks good even if no rows are written. */
00927         SetMatViewToPopulated(NewHeap);
00928 
00929     /*
00930      * Scan through the OldHeap, either in OldIndex order or sequentially;
00931      * copy each tuple into the NewHeap, or transiently to the tuplesort
00932      * module.  Note that we don't bother sorting dead tuples (they won't get
00933      * to the new table anyway).
00934      */
00935     for (;;)
00936     {
00937         HeapTuple   tuple;
00938         Buffer      buf;
00939         bool        isdead;
00940 
00941         CHECK_FOR_INTERRUPTS();
00942 
00943         if (indexScan != NULL)
00944         {
00945             tuple = index_getnext(indexScan, ForwardScanDirection);
00946             if (tuple == NULL)
00947                 break;
00948 
00949             /* Since we used no scan keys, should never need to recheck */
00950             if (indexScan->xs_recheck)
00951                 elog(ERROR, "CLUSTER does not support lossy index conditions");
00952 
00953             buf = indexScan->xs_cbuf;
00954         }
00955         else
00956         {
00957             tuple = heap_getnext(heapScan, ForwardScanDirection);
00958             if (tuple == NULL)
00959                 break;
00960 
00961             buf = heapScan->rs_cbuf;
00962         }
00963 
00964         LockBuffer(buf, BUFFER_LOCK_SHARE);
00965 
00966         switch (HeapTupleSatisfiesVacuum(tuple->t_data, OldestXmin, buf))
00967         {
00968             case HEAPTUPLE_DEAD:
00969                 /* Definitely dead */
00970                 isdead = true;
00971                 break;
00972             case HEAPTUPLE_RECENTLY_DEAD:
00973                 tups_recently_dead += 1;
00974                 /* fall through */
00975             case HEAPTUPLE_LIVE:
00976                 /* Live or recently dead, must copy it */
00977                 isdead = false;
00978                 break;
00979             case HEAPTUPLE_INSERT_IN_PROGRESS:
00980 
00981                 /*
00982                  * Since we hold exclusive lock on the relation, normally the
00983                  * only way to see this is if it was inserted earlier in our
00984                  * own transaction.  However, it can happen in system
00985                  * catalogs, since we tend to release write lock before commit
00986                  * there.  Give a warning if neither case applies; but in any
00987                  * case we had better copy it.
00988                  */
00989                 if (!is_system_catalog &&
00990                     !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
00991                     elog(WARNING, "concurrent insert in progress within table \"%s\"",
00992                          RelationGetRelationName(OldHeap));
00993                 /* treat as live */
00994                 isdead = false;
00995                 break;
00996             case HEAPTUPLE_DELETE_IN_PROGRESS:
00997 
00998                 /*
00999                  * Similar situation to INSERT_IN_PROGRESS case.
01000                  */
01001                 if (!is_system_catalog &&
01002                     !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
01003                     elog(WARNING, "concurrent delete in progress within table \"%s\"",
01004                          RelationGetRelationName(OldHeap));
01005                 /* treat as recently dead */
01006                 tups_recently_dead += 1;
01007                 isdead = false;
01008                 break;
01009             default:
01010                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
01011                 isdead = false; /* keep compiler quiet */
01012                 break;
01013         }
01014 
01015         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
01016 
01017         if (isdead)
01018         {
01019             tups_vacuumed += 1;
01020             /* heap rewrite module still needs to see it... */
01021             if (rewrite_heap_dead_tuple(rwstate, tuple))
01022             {
01023                 /* A previous recently-dead tuple is now known dead */
01024                 tups_vacuumed += 1;
01025                 tups_recently_dead -= 1;
01026             }
01027             continue;
01028         }
01029 
01030         num_tuples += 1;
01031         if (tuplesort != NULL)
01032             tuplesort_putheaptuple(tuplesort, tuple);
01033         else
01034             reform_and_rewrite_tuple(tuple,
01035                                      oldTupDesc, newTupDesc,
01036                                      values, isnull,
01037                                      NewHeap->rd_rel->relhasoids, rwstate);
01038     }
01039 
01040     if (indexScan != NULL)
01041         index_endscan(indexScan);
01042     if (heapScan != NULL)
01043         heap_endscan(heapScan);
01044 
01045     /*
01046      * In scan-and-sort mode, complete the sort, then read out all live tuples
01047      * from the tuplestore and write them to the new relation.
01048      */
01049     if (tuplesort != NULL)
01050     {
01051         tuplesort_performsort(tuplesort);
01052 
01053         for (;;)
01054         {
01055             HeapTuple   tuple;
01056             bool        shouldfree;
01057 
01058             CHECK_FOR_INTERRUPTS();
01059 
01060             tuple = tuplesort_getheaptuple(tuplesort, true, &shouldfree);
01061             if (tuple == NULL)
01062                 break;
01063 
01064             reform_and_rewrite_tuple(tuple,
01065                                      oldTupDesc, newTupDesc,
01066                                      values, isnull,
01067                                      NewHeap->rd_rel->relhasoids, rwstate);
01068 
01069             if (shouldfree)
01070                 heap_freetuple(tuple);
01071         }
01072 
01073         tuplesort_end(tuplesort);
01074     }
01075 
01076     /* Write out any remaining tuples, and fsync if needed */
01077     end_heap_rewrite(rwstate);
01078 
01079     /* Reset rd_toastoid just to be tidy --- it shouldn't be looked at again */
01080     NewHeap->rd_toastoid = InvalidOid;
01081 
01082     /* Log what we did */
01083     ereport(elevel,
01084             (errmsg("\"%s\": found %.0f removable, %.0f nonremovable row versions in %u pages",
01085                     RelationGetRelationName(OldHeap),
01086                     tups_vacuumed, num_tuples,
01087                     RelationGetNumberOfBlocks(OldHeap)),
01088              errdetail("%.0f dead row versions cannot be removed yet.\n"
01089                        "%s.",
01090                        tups_recently_dead,
01091                        pg_rusage_show(&ru0))));
01092 
01093     /* Clean up */
01094     pfree(values);
01095     pfree(isnull);
01096 
01097     if (OldIndex != NULL)
01098         index_close(OldIndex, NoLock);
01099     heap_close(OldHeap, NoLock);
01100     heap_close(NewHeap, NoLock);
01101 }
01102 
01103 /*
01104  * Swap the physical files of two given relations.
01105  *
01106  * We swap the physical identity (reltablespace and relfilenode) while
01107  * keeping the same logical identities of the two relations.
01108  *
01109  * We can swap associated TOAST data in either of two ways: recursively swap
01110  * the physical content of the toast tables (and their indexes), or swap the
01111  * TOAST links in the given relations' pg_class entries.  The former is needed
01112  * to manage rewrites of shared catalogs (where we cannot change the pg_class
01113  * links) while the latter is the only way to handle cases in which a toast
01114  * table is added or removed altogether.
01115  *
01116  * Additionally, unless it is an index, the first relation is marked with
01117  * relfrozenxid set to frozenXid and relminmxid set to frozenMulti.  It seems
01118  * a bit ugly to have this here, but the caller would have to do it anyway,
01119  * so having it here saves a heap_update.  Note: in the swap-toast-links
01120  * case, we assume we don't need to change the toast
01121  * table's relfrozenxid: the new version of the toast table should already
       * have relfrozenxid set to RecentXmin, which is good enough.
01122  *
01123  * Lastly, if r2 and its toast table and toast index (if any) are mapped,
01124  * their OIDs are emitted into mapped_tables[].  This is hacky but beats
01125  * having to look the information up again later in finish_heap_swap.
       *
       * Parameters:
       * r1, r2: OIDs of the two relations whose files are exchanged.
       * target_is_pg_class: when true, the pg_class rows are not updated (only
       *      relcache invalidations are issued); it is passed unchanged to the
       *      recursive calls for toast tables and toast indexes.
       * swap_toast_by_content: true to swap toast tables (and their indexes)
       *      by recursion, false to swap the reltoastrelid links and fix up
       *      the dependency records instead.
       * is_internal: handed to the post-alter hook invocation for r1; the
       *      hook invocation for r2 always reports an internal change.
       * frozenXid, frozenMulti: new relfrozenxid/relminmxid for r1 when r1 is
       *      not an index.
       * mapped_tables: output array written only in the mapped-relation case;
       *      the caller must provide enough room for every OID emitted.
01126  */
01127 static void
01128 swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class,
01129                     bool swap_toast_by_content,
01130                     bool is_internal,
01131                     TransactionId frozenXid,
01132                     MultiXactId frozenMulti,
01133                     Oid *mapped_tables)
01134 {
01135     Relation    relRelation;
01136     HeapTuple   reltup1,
01137                 reltup2;
01138     Form_pg_class relform1,
01139                 relform2;
01140     Oid         relfilenode1,
01141                 relfilenode2;
01142     Oid         swaptemp;
01143     CatalogIndexState indstate;
01144 
01145     /* We need writable copies of both pg_class tuples. */
01146     relRelation = heap_open(RelationRelationId, RowExclusiveLock);
01147 
01148     reltup1 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r1));
01149     if (!HeapTupleIsValid(reltup1))
01150         elog(ERROR, "cache lookup failed for relation %u", r1);
01151     relform1 = (Form_pg_class) GETSTRUCT(reltup1);
01152 
01153     reltup2 = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(r2));
01154     if (!HeapTupleIsValid(reltup2))
01155         elog(ERROR, "cache lookup failed for relation %u", r2);
01156     relform2 = (Form_pg_class) GETSTRUCT(reltup2);
01157 
          /*
           * A valid relfilenode in pg_class is treated as a normal relation
           * below; an invalid one is treated as a mapped relation.
           */
01158     relfilenode1 = relform1->relfilenode;
01159     relfilenode2 = relform2->relfilenode;
01160 
01161     if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2))
01162     {
01163         /* Normal non-mapped relations: swap relfilenodes and reltablespaces */
01164         Assert(!target_is_pg_class);
01165 
01166         swaptemp = relform1->relfilenode;
01167         relform1->relfilenode = relform2->relfilenode;
01168         relform2->relfilenode = swaptemp;
01169 
01170         swaptemp = relform1->reltablespace;
01171         relform1->reltablespace = relform2->reltablespace;
01172         relform2->reltablespace = swaptemp;
01173 
01174         /* Also swap toast links, if we're swapping by links */
01175         if (!swap_toast_by_content)
01176         {
01177             swaptemp = relform1->reltoastrelid;
01178             relform1->reltoastrelid = relform2->reltoastrelid;
01179             relform2->reltoastrelid = swaptemp;
01180 
01181             /* we should NOT swap reltoastidxid */
01182         }
01183     }
01184     else
01185     {
01186         /*
01187          * Mapped-relation case.  Here we have to swap the relation mappings
01188          * instead of modifying the pg_class columns.  Both must be mapped.
01189          */
01190         if (OidIsValid(relfilenode1) || OidIsValid(relfilenode2))
01191             elog(ERROR, "cannot swap mapped relation \"%s\" with non-mapped relation",
01192                  NameStr(relform1->relname));
01193 
01194         /*
01195          * We can't change the tablespace of a mapped rel, and we can't handle
01196          * toast link swapping for one either, because we must not apply any
01197          * critical changes to its pg_class row.  These cases should be
01198          * prevented by upstream permissions tests, so this check is a
01199          * non-user-facing emergency backstop.
01200          */
01201         if (relform1->reltablespace != relform2->reltablespace)
01202             elog(ERROR, "cannot change tablespace of mapped relation \"%s\"",
01203                  NameStr(relform1->relname));
01204         if (!swap_toast_by_content &&
01205             (relform1->reltoastrelid || relform2->reltoastrelid))
01206             elog(ERROR, "cannot swap toast by links for mapped relation \"%s\"",
01207                  NameStr(relform1->relname));
01208 
01209         /*
01210          * Fetch the mappings --- shouldn't fail, but be paranoid
01211          */
01212         relfilenode1 = RelationMapOidToFilenode(r1, relform1->relisshared);
01213         if (!OidIsValid(relfilenode1))
01214             elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
01215                  NameStr(relform1->relname), r1);
01216         relfilenode2 = RelationMapOidToFilenode(r2, relform2->relisshared);
01217         if (!OidIsValid(relfilenode2))
01218             elog(ERROR, "could not find relation mapping for relation \"%s\", OID %u",
01219                  NameStr(relform2->relname), r2);
01220 
01221         /*
01222          * Send replacement mappings to relmapper.  Note these won't actually
01223          * take effect until CommandCounterIncrement.
01224          */
01225         RelationMapUpdateMap(r1, relfilenode2, relform1->relisshared, false);
01226         RelationMapUpdateMap(r2, relfilenode1, relform2->relisshared, false);
01227 
01228         /*
               * Pass OIDs of mapped r2 tables back to caller.  The post-increment
               * advances only this invocation's copy of the pointer; the
               * recursive calls further down are given the advanced pointer, so
               * they store into the following array slot.
               */
01229         *mapped_tables++ = r2;
01230     }
01231 
01232     /*
01233      * In the case of a shared catalog, these next few steps will only affect
01234      * our own database's pg_class row; but that's okay, because they are all
01235      * noncritical updates.  That's also an important fact for the case of a
01236      * mapped catalog, because it's possible that we'll commit the map change
01237      * and then fail to commit the pg_class update.
01238      */
01239 
01240     /*
           * Set rel1's frozen Xid and minimum MultiXid.  Indexes are skipped;
           * the toast-index recursion below passes invalid values for both.
           */
01241     if (relform1->relkind != RELKIND_INDEX)
01242     {
01243         Assert(TransactionIdIsNormal(frozenXid));
01244         relform1->relfrozenxid = frozenXid;
01245         Assert(MultiXactIdIsValid(frozenMulti));
01246         relform1->relminmxid = frozenMulti;
01247     }
01248 
01249     /* swap size statistics too, since new rel has freshly-updated stats */
01250     {
01251         int32       swap_pages;
01252         float4      swap_tuples;
01253         int32       swap_allvisible;
01254 
01255         swap_pages = relform1->relpages;
01256         relform1->relpages = relform2->relpages;
01257         relform2->relpages = swap_pages;
01258 
01259         swap_tuples = relform1->reltuples;
01260         relform1->reltuples = relform2->reltuples;
01261         relform2->reltuples = swap_tuples;
01262 
01263         swap_allvisible = relform1->relallvisible;
01264         relform1->relallvisible = relform2->relallvisible;
01265         relform2->relallvisible = swap_allvisible;
01266     }
01267 
01268     /*
01269      * Update the tuples in pg_class --- unless the target relation of the
01270      * swap is pg_class itself.  In that case, there is zero point in making
01271      * changes because we'd be updating the old data that we're about to throw
01272      * away.  Because the real work being done here for a mapped relation is
01273      * just to change the relation map settings, it's all right to not update
01274      * the pg_class rows in this case.
01275      */
01276     if (!target_is_pg_class)
01277     {
01278         simple_heap_update(relRelation, &reltup1->t_self, reltup1);
01279         simple_heap_update(relRelation, &reltup2->t_self, reltup2);
01280 
01281         /* Keep system catalogs current */
01282         indstate = CatalogOpenIndexes(relRelation);
01283         CatalogIndexInsert(indstate, reltup1);
01284         CatalogIndexInsert(indstate, reltup2);
01285         CatalogCloseIndexes(indstate);
01286     }
01287     else
01288     {
01289         /* no update ... but we do still need relcache inval */
01290         CacheInvalidateRelcacheByTuple(reltup1);
01291         CacheInvalidateRelcacheByTuple(reltup2);
01292     }
01293 
01294     /*
01295      * Post alter hook for modified relations. The change to r2 is always
01296      * internal, but r1 depends on the invocation context.
01297      */
01298     InvokeObjectPostAlterHookArg(RelationRelationId, r1, 0,
01299                                  InvalidOid, is_internal);
01300     InvokeObjectPostAlterHookArg(RelationRelationId, r2, 0,
01301                                  InvalidOid, true);
01302 
01303     /*
01304      * If we have toast tables associated with the relations being swapped,
01305      * deal with them too.  Note that relform1/relform2 are our modified
           * local copies, so in the swap-by-links case the reltoastrelid values
           * tested here are the already-exchanged ones.
01306      */
01307     if (relform1->reltoastrelid || relform2->reltoastrelid)
01308     {
01309         if (swap_toast_by_content)
01310         {
01311             if (relform1->reltoastrelid && relform2->reltoastrelid)
01312             {
01313                 /* Recursively swap the contents of the toast tables */
01314                 swap_relation_files(relform1->reltoastrelid,
01315                                     relform2->reltoastrelid,
01316                                     target_is_pg_class,
01317                                     swap_toast_by_content,
01318                                     is_internal,
01319                                     frozenXid,
01320                                     frozenMulti,
01321                                     mapped_tables);
01322             }
01323             else
01324             {
01325                 /* caller messed up */
01326                 elog(ERROR, "cannot swap toast files by content when there's only one");
01327             }
01328         }
01329         else
01330         {
01331             /*
01332              * We swapped the ownership links, so we need to change dependency
01333              * data to match.
01334              *
01335              * NOTE: it is possible that only one table has a toast table.
01336              *
01337              * NOTE: at present, a TOAST table's only dependency is the one on
01338              * its owning table.  If more are ever created, we'd need to use
01339              * something more selective than deleteDependencyRecordsFor() to
01340              * get rid of just the link we want.
01341              */
01342             ObjectAddress baseobject,
01343                         toastobject;
01344             long        count;
01345 
01346             /*
01347              * We disallow this case for system catalogs, to avoid the
01348              * possibility that the catalog we're rebuilding is one of the
01349              * ones the dependency changes would change.  It's too late to be
01350              * making any data changes to the target catalog.
01351              */
01352             if (IsSystemClass(relform1))
01353                 elog(ERROR, "cannot swap toast files by links for system catalogs");
01354 
01355             /* Delete old dependencies */
01356             if (relform1->reltoastrelid)
01357             {
01358                 count = deleteDependencyRecordsFor(RelationRelationId,
01359                                                    relform1->reltoastrelid,
01360                                                    false);
01361                 if (count != 1)
01362                     elog(ERROR, "expected one dependency record for TOAST table, found %ld",
01363                          count);
01364             }
01365             if (relform2->reltoastrelid)
01366             {
01367                 count = deleteDependencyRecordsFor(RelationRelationId,
01368                                                    relform2->reltoastrelid,
01369                                                    false);
01370                 if (count != 1)
01371                     elog(ERROR, "expected one dependency record for TOAST table, found %ld",
01372                          count);
01373             }
01374 
01375             /* Register new dependencies: each toast table on its new owner */
01376             baseobject.classId = RelationRelationId;
01377             baseobject.objectSubId = 0;
01378             toastobject.classId = RelationRelationId;
01379             toastobject.objectSubId = 0;
01380 
01381             if (relform1->reltoastrelid)
01382             {
01383                 baseobject.objectId = r1;
01384                 toastobject.objectId = relform1->reltoastrelid;
01385                 recordDependencyOn(&toastobject, &baseobject,
01386                                    DEPENDENCY_INTERNAL);
01387             }
01388 
01389             if (relform2->reltoastrelid)
01390             {
01391                 baseobject.objectId = r2;
01392                 toastobject.objectId = relform2->reltoastrelid;
01393                 recordDependencyOn(&toastobject, &baseobject,
01394                                    DEPENDENCY_INTERNAL);
01395             }
01396         }
01397     }
01398 
01399     /*
01400      * If we're swapping two toast tables by content, do the same for their
01401      * indexes.  Invalid frozen values are passed because the index case
           * does not store them.
01402      */
01403     if (swap_toast_by_content &&
01404         relform1->reltoastidxid && relform2->reltoastidxid)
01405         swap_relation_files(relform1->reltoastidxid,
01406                             relform2->reltoastidxid,
01407                             target_is_pg_class,
01408                             swap_toast_by_content,
01409                             is_internal,
01410                             InvalidTransactionId,
01411                             InvalidMultiXactId,
01412                             mapped_tables);
01413 
01414     /* Clean up. */
01415     heap_freetuple(reltup1);
01416     heap_freetuple(reltup2);
01417 
01418     heap_close(relRelation, RowExclusiveLock);
01419 
01420     /*
01421      * Close both relcache entries' smgr links.  We need this kluge because
01422      * both links will be invalidated during upcoming CommandCounterIncrement.
01423      * Whichever of the rels is the second to be cleared will have a dangling
01424      * reference to the other's smgr entry.  Rather than trying to avoid this
01425      * by ordering operations just so, it's easiest to close the links first.
01426      * (Fortunately, since one of the entries is local in our transaction,
01427      * it's sufficient to clear out our own relcache this way; the problem
01428      * cannot arise for other backends when they see our update on the
01429      * non-transient relation.)
01430      *
01431      * Caution: the placement of this step interacts with the decision to
01432      * handle toast rels by recursion.  When we are trying to rebuild pg_class
01433      * itself, the smgr close on pg_class must happen after all accesses in
01434      * this function.
01435      */
01436     RelationCloseSmgrByOid(r1);
01437     RelationCloseSmgrByOid(r2);
01438 }
01439 
01440 /*
01441  * Remove the transient table that was built by make_new_heap, and finish
01442  * cleaning up (including rebuilding all indexes on the old heap).
01443  */
01444 void
01445 finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
01446                  bool is_system_catalog,
01447                  bool swap_toast_by_content,
01448                  bool check_constraints,
01449                  bool is_internal,
01450                  TransactionId frozenXid,
01451                  MultiXactId frozenMulti)
01452 {
01453     ObjectAddress object;
01454     Oid         mapped_tables[4];
01455     int         reindex_flags;
01456     int         i;
01457 
01458     /* Zero out possible results from swapped_relation_files */
01459     memset(mapped_tables, 0, sizeof(mapped_tables));
01460 
01461     /*
01462      * Swap the contents of the heap relations (including any toast tables).
01463      * Also set old heap's relfrozenxid to frozenXid.
01464      */
01465     swap_relation_files(OIDOldHeap, OIDNewHeap,
01466                         (OIDOldHeap == RelationRelationId),
01467                         swap_toast_by_content, is_internal,
01468                         frozenXid, frozenMulti, mapped_tables);
01469 
01470     /*
01471      * If it's a system catalog, queue an sinval message to flush all
01472      * catcaches on the catalog when we reach CommandCounterIncrement.
01473      */
01474     if (is_system_catalog)
01475         CacheInvalidateCatalog(OIDOldHeap);
01476 
01477     /*
01478      * Rebuild each index on the relation (but not the toast table, which is
01479      * all-new at this point).  It is important to do this before the DROP
01480      * step because if we are processing a system catalog that will be used
01481      * during DROP, we want to have its indexes available.  There is no
01482      * advantage to the other order anyway because this is all transactional,
01483      * so no chance to reclaim disk space before commit.  We do not need a
01484      * final CommandCounterIncrement() because reindex_relation does it.
01485      *
01486      * Note: because index_build is called via reindex_relation, it will never
01487      * set indcheckxmin true for the indexes.  This is OK even though in some
01488      * sense we are building new indexes rather than rebuilding existing ones,
01489      * because the new heap won't contain any HOT chains at all, let alone
01490      * broken ones, so it can't be necessary to set indcheckxmin.
01491      */
01492     reindex_flags = REINDEX_REL_SUPPRESS_INDEX_USE;
01493     if (check_constraints)
01494         reindex_flags |= REINDEX_REL_CHECK_CONSTRAINTS;
01495     reindex_relation(OIDOldHeap, reindex_flags);
01496 
01497     /* Destroy new heap with old filenode */
01498     object.classId = RelationRelationId;
01499     object.objectId = OIDNewHeap;
01500     object.objectSubId = 0;
01501 
01502     /*
01503      * The new relation is local to our transaction and we know nothing
01504      * depends on it, so DROP_RESTRICT should be OK.
01505      */
01506     performDeletion(&object, DROP_RESTRICT, PERFORM_DELETION_INTERNAL);
01507 
01508     /* performDeletion does CommandCounterIncrement at end */
01509 
01510     /*
01511      * Now we must remove any relation mapping entries that we set up for the
01512      * transient table, as well as its toast table and toast index if any. If
01513      * we fail to do this before commit, the relmapper will complain about new
01514      * permanent map entries being added post-bootstrap.
01515      */
01516     for (i = 0; OidIsValid(mapped_tables[i]); i++)
01517         RelationMapRemoveMapping(mapped_tables[i]);
01518 
01519     /*
01520      * At this point, everything is kosher except that, if we did toast swap
01521      * by links, the toast table's name corresponds to the transient table.
01522      * The name is irrelevant to the backend because it's referenced by OID,
01523      * but users looking at the catalogs could be confused.  Rename it to
01524      * prevent this problem.
01525      *
01526      * Note no lock required on the relation, because we already hold an
01527      * exclusive lock on it.
01528      */
01529     if (!swap_toast_by_content)
01530     {
01531         Relation    newrel;
01532 
01533         newrel = heap_open(OIDOldHeap, NoLock);
01534         if (OidIsValid(newrel->rd_rel->reltoastrelid))
01535         {
01536             Relation    toastrel;
01537             Oid         toastidx;
01538             char        NewToastName[NAMEDATALEN];
01539 
01540             toastrel = relation_open(newrel->rd_rel->reltoastrelid,
01541                                      AccessShareLock);
01542             toastidx = toastrel->rd_rel->reltoastidxid;
01543             relation_close(toastrel, AccessShareLock);
01544 
01545             /* rename the toast table ... */
01546             snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u",
01547                      OIDOldHeap);
01548             RenameRelationInternal(newrel->rd_rel->reltoastrelid,
01549                                    NewToastName, true);
01550 
01551             /* ... and its index too */
01552             snprintf(NewToastName, NAMEDATALEN, "pg_toast_%u_index",
01553                      OIDOldHeap);
01554             RenameRelationInternal(toastidx,
01555                                    NewToastName, true);
01556         }
01557         relation_close(newrel, NoLock);
01558     }
01559 }
01560 
01561 
01562 /*
01563  * Get a list of tables that the current user owns and
01564  * have indisclustered set.  Return the list in a List * of rvsToCluster
01565  * with the tableOid and the indexOid on which the table is already
01566  * clustered.
01567  */
01568 static List *
01569 get_tables_to_cluster(MemoryContext cluster_context)
01570 {
01571     Relation    indRelation;
01572     HeapScanDesc scan;
01573     ScanKeyData entry;
01574     HeapTuple   indexTuple;
01575     Form_pg_index index;
01576     MemoryContext old_context;
01577     RelToCluster *rvtc;
01578     List       *rvs = NIL;
01579 
01580     /*
01581      * Get all indexes that have indisclustered set and are owned by
01582      * appropriate user. System relations or nailed-in relations cannot ever
01583      * have indisclustered set, because CLUSTER will refuse to set it when
01584      * called with one of them as argument.
01585      */
01586     indRelation = heap_open(IndexRelationId, AccessShareLock);
01587     ScanKeyInit(&entry,
01588                 Anum_pg_index_indisclustered,
01589                 BTEqualStrategyNumber, F_BOOLEQ,
01590                 BoolGetDatum(true));
01591     scan = heap_beginscan(indRelation, SnapshotNow, 1, &entry);
01592     while ((indexTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
01593     {
01594         index = (Form_pg_index) GETSTRUCT(indexTuple);
01595 
01596         if (!pg_class_ownercheck(index->indrelid, GetUserId()))
01597             continue;
01598 
01599         /*
01600          * We have to build the list in a different memory context so it will
01601          * survive the cross-transaction processing
01602          */
01603         old_context = MemoryContextSwitchTo(cluster_context);
01604 
01605         rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
01606         rvtc->tableOid = index->indrelid;
01607         rvtc->indexOid = index->indexrelid;
01608         rvs = lcons(rvtc, rvs);
01609 
01610         MemoryContextSwitchTo(old_context);
01611     }
01612     heap_endscan(scan);
01613 
01614     relation_close(indRelation, AccessShareLock);
01615 
01616     return rvs;
01617 }
01618 
01619 
01620 /*
01621  * Reconstruct and rewrite the given tuple
01622  *
01623  * We cannot simply copy the tuple as-is, for several reasons:
01624  *
01625  * 1. We'd like to squeeze out the values of any dropped columns, both
01626  * to save space and to ensure we have no corner-case failures. (It's
01627  * possible for example that the new table hasn't got a TOAST table
01628  * and so is unable to store any large values of dropped cols.)
01629  *
01630  * 2. The tuple might not even be legal for the new table; this is
01631  * currently only known to happen as an after-effect of ALTER TABLE
01632  * SET WITHOUT OIDS.
01633  *
01634  * So, we must reconstruct the tuple from component Datums.
01635  */
01636 static void
01637 reform_and_rewrite_tuple(HeapTuple tuple,
01638                          TupleDesc oldTupDesc, TupleDesc newTupDesc,
01639                          Datum *values, bool *isnull,
01640                          bool newRelHasOids, RewriteState rwstate)
01641 {
01642     HeapTuple   copiedTuple;
01643     int         i;
01644 
01645     heap_deform_tuple(tuple, oldTupDesc, values, isnull);
01646 
01647     /* Be sure to null out any dropped columns */
01648     for (i = 0; i < newTupDesc->natts; i++)
01649     {
01650         if (newTupDesc->attrs[i]->attisdropped)
01651             isnull[i] = true;
01652     }
01653 
01654     copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
01655 
01656     /* Preserve OID, if any */
01657     if (newRelHasOids)
01658         HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple));
01659 
01660     /* The heap rewrite module does the rest */
01661     rewrite_heap_tuple(rwstate, tuple, copiedTuple);
01662 
01663     heap_freetuple(copiedTuple);
01664 }
Header And Logo

cluster.c