Header And Logo

PostgreSQL
| The world's most advanced open source database.

relmapper.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * relmapper.c
00004  *    Catalog-to-filenode mapping
00005  *
00006  * For most tables, the physical file underlying the table is specified by
00007  * pg_class.relfilenode.  However, that obviously won't work for pg_class
00008  * itself, nor for the other "nailed" catalogs for which we have to be able
00009  * to set up working Relation entries without access to pg_class.  It also
00010  * does not work for shared catalogs, since there is no practical way to
00011  * update other databases' pg_class entries when relocating a shared catalog.
00012  * Therefore, for these special catalogs (henceforth referred to as "mapped
00013  * catalogs") we rely on a separately maintained file that shows the mapping
00014  * from catalog OIDs to filenode numbers.  Each database has a map file for
00015  * its local mapped catalogs, and there is a separate map file for shared
00016  * catalogs.  Mapped catalogs have zero in their pg_class.relfilenode entries.
00017  *
00018  * Relocation of a normal table is committed (ie, the new physical file becomes
00019  * authoritative) when the pg_class row update commits.  For mapped catalogs,
00020  * the act of updating the map file is effectively commit of the relocation.
00021  * We postpone the file update till just before commit of the transaction
00022  * doing the rewrite, but there is necessarily a window between.  Therefore
00023  * mapped catalogs can only be relocated by operations such as VACUUM FULL
00024  * and CLUSTER, which make no transactionally-significant changes: it must be
00025  * safe for the new file to replace the old, even if the transaction itself
00026  * aborts.  An important factor here is that the indexes and toast table of
00027  * a mapped catalog must also be mapped, so that the rewrites/relocations of
00028  * all these files commit in a single map file update rather than being tied
00029  * to transaction commit.
00030  *
00031  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00032  * Portions Copyright (c) 1994, Regents of the University of California
00033  *
00034  *
00035  * IDENTIFICATION
00036  *    src/backend/utils/cache/relmapper.c
00037  *
00038  *-------------------------------------------------------------------------
00039  */
00040 #include "postgres.h"
00041 
00042 #include <fcntl.h>
00043 #include <sys/stat.h>
00044 #include <unistd.h>
00045 
00046 #include "access/xact.h"
00047 #include "catalog/catalog.h"
00048 #include "catalog/pg_tablespace.h"
00049 #include "catalog/storage.h"
00050 #include "miscadmin.h"
00051 #include "storage/fd.h"
00052 #include "storage/lwlock.h"
00053 #include "utils/inval.h"
00054 #include "utils/relmapper.h"
00055 
00056 
/*
 * On-disk format parameters for relation map files.
 *
 * A map file is critical data: there is no automatic way to recover from
 * losing or corrupting one.  A CRC stored in the file lets us detect
 * corruption.  To keep the chance of a failed update small, the file is
 * limited to one standard-size disk sector (512 bytes) and is overwritten
 * in place; we do not write a new file and rename it.  The RelMapFile
 * struct is laid out to be exactly 512 bytes, which may also make
 * filesystem updates slightly cheaper.
 *
 * The mappings[] array is not kept in any particular order.  Requiring OID
 * order would allow faster lookups, but with so few entries it is not worth
 * the extra work.
 */
#define RELMAPPER_FILENAME      "pg_filenode.map"

/* version ID value stored in (and expected from) every map file */
#define RELMAPPER_FILEMAGIC     0x592717

/* capacity of mappings[]: 62 * 8 + 16 = 512 */
#define MAX_MAPPINGS            62
00075 
00076 typedef struct RelMapping
00077 {
00078     Oid         mapoid;         /* OID of a catalog */
00079     Oid         mapfilenode;    /* its filenode number */
00080 } RelMapping;
00081 
00082 typedef struct RelMapFile
00083 {
00084     int32       magic;          /* always RELMAPPER_FILEMAGIC */
00085     int32       num_mappings;   /* number of valid RelMapping entries */
00086     RelMapping  mappings[MAX_MAPPINGS];
00087     int32       crc;            /* CRC of all above */
00088     int32       pad;            /* to make the struct size be 512 exactly */
00089 } RelMapFile;
00090 
00091 /*
00092  * The currently known contents of the shared map file and our database's
00093  * local map file are stored here.  These can be reloaded from disk
00094  * immediately whenever we receive an update sinval message.
00095  */
00096 static RelMapFile shared_map;
00097 static RelMapFile local_map;
00098 
00099 /*
00100  * We use the same RelMapFile data structure to track uncommitted local
00101  * changes in the mappings (but note the magic and crc fields are not made
00102  * valid in these variables).  Currently, map updates are not allowed within
00103  * subtransactions, so one set of transaction-level changes is sufficient.
00104  *
00105  * The active_xxx variables contain updates that are valid in our transaction
00106  * and should be honored by RelationMapOidToFilenode.  The pending_xxx
00107  * variables contain updates we have been told about that aren't active yet;
00108  * they will become active at the next CommandCounterIncrement.  This setup
00109  * lets map updates act similarly to updates of pg_class rows, ie, they
00110  * become visible only at the next CommandCounterIncrement boundary.
00111  */
00112 static RelMapFile active_shared_updates;
00113 static RelMapFile active_local_updates;
00114 static RelMapFile pending_shared_updates;
00115 static RelMapFile pending_local_updates;
00116 
00117 
00118 /* non-export function prototypes */
00119 static void apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode,
00120                  bool add_okay);
00121 static void merge_map_updates(RelMapFile *map, const RelMapFile *updates,
00122                   bool add_okay);
00123 static void load_relmap_file(bool shared);
00124 static void write_relmap_file(bool shared, RelMapFile *newmap,
00125                   bool write_wal, bool send_sinval, bool preserve_files,
00126                   Oid dbid, Oid tsid, const char *dbpath);
00127 static void perform_relmap_update(bool shared, const RelMapFile *updates);
00128 
00129 
00130 /*
00131  * RelationMapOidToFilenode
00132  *
00133  * The raison d' etre ... given a relation OID, look up its filenode.
00134  *
00135  * Although shared and local relation OIDs should never overlap, the caller
00136  * always knows which we need --- so pass that information to avoid useless
00137  * searching.
00138  *
00139  * Returns InvalidOid if the OID is not known (which should never happen,
00140  * but the caller is in a better position to report a meaningful error).
00141  */
00142 Oid
00143 RelationMapOidToFilenode(Oid relationId, bool shared)
00144 {
00145     const RelMapFile *map;
00146     int32       i;
00147 
00148     /* If there are active updates, believe those over the main maps */
00149     if (shared)
00150     {
00151         map = &active_shared_updates;
00152         for (i = 0; i < map->num_mappings; i++)
00153         {
00154             if (relationId == map->mappings[i].mapoid)
00155                 return map->mappings[i].mapfilenode;
00156         }
00157         map = &shared_map;
00158         for (i = 0; i < map->num_mappings; i++)
00159         {
00160             if (relationId == map->mappings[i].mapoid)
00161                 return map->mappings[i].mapfilenode;
00162         }
00163     }
00164     else
00165     {
00166         map = &active_local_updates;
00167         for (i = 0; i < map->num_mappings; i++)
00168         {
00169             if (relationId == map->mappings[i].mapoid)
00170                 return map->mappings[i].mapfilenode;
00171         }
00172         map = &local_map;
00173         for (i = 0; i < map->num_mappings; i++)
00174         {
00175             if (relationId == map->mappings[i].mapoid)
00176                 return map->mappings[i].mapfilenode;
00177         }
00178     }
00179 
00180     return InvalidOid;
00181 }
00182 
00183 /*
00184  * RelationMapUpdateMap
00185  *
00186  * Install a new relfilenode mapping for the specified relation.
00187  *
00188  * If immediate is true (or we're bootstrapping), the mapping is activated
00189  * immediately.  Otherwise it is made pending until CommandCounterIncrement.
00190  */
00191 void
00192 RelationMapUpdateMap(Oid relationId, Oid fileNode, bool shared,
00193                      bool immediate)
00194 {
00195     RelMapFile *map;
00196 
00197     if (IsBootstrapProcessingMode())
00198     {
00199         /*
00200          * In bootstrap mode, the mapping gets installed in permanent map.
00201          */
00202         if (shared)
00203             map = &shared_map;
00204         else
00205             map = &local_map;
00206     }
00207     else
00208     {
00209         /*
00210          * We don't currently support map changes within subtransactions. This
00211          * could be done with more bookkeeping infrastructure, but it doesn't
00212          * presently seem worth it.
00213          */
00214         if (GetCurrentTransactionNestLevel() > 1)
00215             elog(ERROR, "cannot change relation mapping within subtransaction");
00216 
00217         if (immediate)
00218         {
00219             /* Make it active, but only locally */
00220             if (shared)
00221                 map = &active_shared_updates;
00222             else
00223                 map = &active_local_updates;
00224         }
00225         else
00226         {
00227             /* Make it pending */
00228             if (shared)
00229                 map = &pending_shared_updates;
00230             else
00231                 map = &pending_local_updates;
00232         }
00233     }
00234     apply_map_update(map, relationId, fileNode, true);
00235 }
00236 
00237 /*
00238  * apply_map_update
00239  *
00240  * Insert a new mapping into the given map variable, replacing any existing
00241  * mapping for the same relation.
00242  *
00243  * In some cases the caller knows there must be an existing mapping; pass
00244  * add_okay = false to draw an error if not.
00245  */
00246 static void
00247 apply_map_update(RelMapFile *map, Oid relationId, Oid fileNode, bool add_okay)
00248 {
00249     int32       i;
00250 
00251     /* Replace any existing mapping */
00252     for (i = 0; i < map->num_mappings; i++)
00253     {
00254         if (relationId == map->mappings[i].mapoid)
00255         {
00256             map->mappings[i].mapfilenode = fileNode;
00257             return;
00258         }
00259     }
00260 
00261     /* Nope, need to add a new mapping */
00262     if (!add_okay)
00263         elog(ERROR, "attempt to apply a mapping to unmapped relation %u",
00264              relationId);
00265     if (map->num_mappings >= MAX_MAPPINGS)
00266         elog(ERROR, "ran out of space in relation map");
00267     map->mappings[map->num_mappings].mapoid = relationId;
00268     map->mappings[map->num_mappings].mapfilenode = fileNode;
00269     map->num_mappings++;
00270 }
00271 
00272 /*
00273  * merge_map_updates
00274  *
00275  * Merge all the updates in the given pending-update map into the target map.
00276  * This is just a bulk form of apply_map_update.
00277  */
00278 static void
00279 merge_map_updates(RelMapFile *map, const RelMapFile *updates, bool add_okay)
00280 {
00281     int32       i;
00282 
00283     for (i = 0; i < updates->num_mappings; i++)
00284     {
00285         apply_map_update(map,
00286                          updates->mappings[i].mapoid,
00287                          updates->mappings[i].mapfilenode,
00288                          add_okay);
00289     }
00290 }
00291 
00292 /*
00293  * RelationMapRemoveMapping
00294  *
00295  * Remove a relation's entry in the map.  This is only allowed for "active"
00296  * (but not committed) local mappings.  We need it so we can back out the
00297  * entry for the transient target file when doing VACUUM FULL/CLUSTER on
00298  * a mapped relation.
00299  */
00300 void
00301 RelationMapRemoveMapping(Oid relationId)
00302 {
00303     RelMapFile *map = &active_local_updates;
00304     int32       i;
00305 
00306     for (i = 0; i < map->num_mappings; i++)
00307     {
00308         if (relationId == map->mappings[i].mapoid)
00309         {
00310             /* Found it, collapse it out */
00311             map->mappings[i] = map->mappings[map->num_mappings - 1];
00312             map->num_mappings--;
00313             return;
00314         }
00315     }
00316     elog(ERROR, "could not find temporary mapping for relation %u",
00317          relationId);
00318 }
00319 
00320 /*
00321  * RelationMapInvalidate
00322  *
00323  * This routine is invoked for SI cache flush messages.  We must re-read
00324  * the indicated map file.  However, we might receive a SI message in a
00325  * process that hasn't yet, and might never, load the mapping files;
00326  * for example the autovacuum launcher, which *must not* try to read
00327  * a local map since it is attached to no particular database.
00328  * So, re-read only if the map is valid now.
00329  */
00330 void
00331 RelationMapInvalidate(bool shared)
00332 {
00333     if (shared)
00334     {
00335         if (shared_map.magic == RELMAPPER_FILEMAGIC)
00336             load_relmap_file(true);
00337     }
00338     else
00339     {
00340         if (local_map.magic == RELMAPPER_FILEMAGIC)
00341             load_relmap_file(false);
00342     }
00343 }
00344 
00345 /*
00346  * RelationMapInvalidateAll
00347  *
00348  * Reload all map files.  This is used to recover from SI message buffer
00349  * overflow: we can't be sure if we missed an inval message.
00350  * Again, reload only currently-valid maps.
00351  */
00352 void
00353 RelationMapInvalidateAll(void)
00354 {
00355     if (shared_map.magic == RELMAPPER_FILEMAGIC)
00356         load_relmap_file(true);
00357     if (local_map.magic == RELMAPPER_FILEMAGIC)
00358         load_relmap_file(false);
00359 }
00360 
00361 /*
00362  * AtCCI_RelationMap
00363  *
00364  * Activate any "pending" relation map updates at CommandCounterIncrement time.
00365  */
00366 void
00367 AtCCI_RelationMap(void)
00368 {
00369     if (pending_shared_updates.num_mappings != 0)
00370     {
00371         merge_map_updates(&active_shared_updates,
00372                           &pending_shared_updates,
00373                           true);
00374         pending_shared_updates.num_mappings = 0;
00375     }
00376     if (pending_local_updates.num_mappings != 0)
00377     {
00378         merge_map_updates(&active_local_updates,
00379                           &pending_local_updates,
00380                           true);
00381         pending_local_updates.num_mappings = 0;
00382     }
00383 }
00384 
00385 /*
00386  * AtEOXact_RelationMap
00387  *
00388  * Handle relation mapping at main-transaction commit or abort.
00389  *
00390  * During commit, this must be called as late as possible before the actual
00391  * transaction commit, so as to minimize the window where the transaction
00392  * could still roll back after committing map changes.  Although nothing
00393  * critically bad happens in such a case, we still would prefer that it
00394  * not happen, since we'd possibly be losing useful updates to the relations'
00395  * pg_class row(s).
00396  *
00397  * During abort, we just have to throw away any pending map changes.
00398  * Normal post-abort cleanup will take care of fixing relcache entries.
00399  */
00400 void
00401 AtEOXact_RelationMap(bool isCommit)
00402 {
00403     if (isCommit)
00404     {
00405         /*
00406          * We should not get here with any "pending" updates.  (We could
00407          * logically choose to treat such as committed, but in the current
00408          * code this should never happen.)
00409          */
00410         Assert(pending_shared_updates.num_mappings == 0);
00411         Assert(pending_local_updates.num_mappings == 0);
00412 
00413         /*
00414          * Write any active updates to the actual map files, then reset them.
00415          */
00416         if (active_shared_updates.num_mappings != 0)
00417         {
00418             perform_relmap_update(true, &active_shared_updates);
00419             active_shared_updates.num_mappings = 0;
00420         }
00421         if (active_local_updates.num_mappings != 0)
00422         {
00423             perform_relmap_update(false, &active_local_updates);
00424             active_local_updates.num_mappings = 0;
00425         }
00426     }
00427     else
00428     {
00429         /* Abort --- drop all local and pending updates */
00430         active_shared_updates.num_mappings = 0;
00431         active_local_updates.num_mappings = 0;
00432         pending_shared_updates.num_mappings = 0;
00433         pending_local_updates.num_mappings = 0;
00434     }
00435 }
00436 
00437 /*
00438  * AtPrepare_RelationMap
00439  *
00440  * Handle relation mapping at PREPARE.
00441  *
00442  * Currently, we don't support preparing any transaction that changes the map.
00443  */
00444 void
00445 AtPrepare_RelationMap(void)
00446 {
00447     if (active_shared_updates.num_mappings != 0 ||
00448         active_local_updates.num_mappings != 0 ||
00449         pending_shared_updates.num_mappings != 0 ||
00450         pending_local_updates.num_mappings != 0)
00451         ereport(ERROR,
00452                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
00453                  errmsg("cannot PREPARE a transaction that modified relation mapping")));
00454 }
00455 
00456 /*
00457  * CheckPointRelationMap
00458  *
00459  * This is called during a checkpoint.  It must ensure that any relation map
00460  * updates that were WAL-logged before the start of the checkpoint are
00461  * securely flushed to disk and will not need to be replayed later.  This
00462  * seems unlikely to be a performance-critical issue, so we use a simple
00463  * method: we just take and release the RelationMappingLock.  This ensures
00464  * that any already-logged map update is complete, because write_relmap_file
00465  * will fsync the map file before the lock is released.
00466  */
00467 void
00468 CheckPointRelationMap(void)
00469 {
00470     LWLockAcquire(RelationMappingLock, LW_SHARED);
00471     LWLockRelease(RelationMappingLock);
00472 }
00473 
00474 /*
00475  * RelationMapFinishBootstrap
00476  *
00477  * Write out the initial relation mapping files at the completion of
00478  * bootstrap.  All the mapped files should have been made known to us
00479  * via RelationMapUpdateMap calls.
00480  */
00481 void
00482 RelationMapFinishBootstrap(void)
00483 {
00484     Assert(IsBootstrapProcessingMode());
00485 
00486     /* Shouldn't be anything "pending" ... */
00487     Assert(active_shared_updates.num_mappings == 0);
00488     Assert(active_local_updates.num_mappings == 0);
00489     Assert(pending_shared_updates.num_mappings == 0);
00490     Assert(pending_local_updates.num_mappings == 0);
00491 
00492     /* Write the files; no WAL or sinval needed */
00493     write_relmap_file(true, &shared_map, false, false, false,
00494                       InvalidOid, GLOBALTABLESPACE_OID, NULL);
00495     write_relmap_file(false, &local_map, false, false, false,
00496                       MyDatabaseId, MyDatabaseTableSpace, DatabasePath);
00497 }
00498 
00499 /*
00500  * RelationMapInitialize
00501  *
00502  * This initializes the mapper module at process startup.  We can't access the
00503  * database yet, so just make sure the maps are empty.
00504  */
00505 void
00506 RelationMapInitialize(void)
00507 {
00508     /* The static variables should initialize to zeroes, but let's be sure */
00509     shared_map.magic = 0;       /* mark it not loaded */
00510     local_map.magic = 0;
00511     shared_map.num_mappings = 0;
00512     local_map.num_mappings = 0;
00513     active_shared_updates.num_mappings = 0;
00514     active_local_updates.num_mappings = 0;
00515     pending_shared_updates.num_mappings = 0;
00516     pending_local_updates.num_mappings = 0;
00517 }
00518 
00519 /*
00520  * RelationMapInitializePhase2
00521  *
00522  * This is called to prepare for access to pg_database during startup.
00523  * We should be able to read the shared map file now.
00524  */
00525 void
00526 RelationMapInitializePhase2(void)
00527 {
00528     /*
00529      * In bootstrap mode, the map file isn't there yet, so do nothing.
00530      */
00531     if (IsBootstrapProcessingMode())
00532         return;
00533 
00534     /*
00535      * Load the shared map file, die on error.
00536      */
00537     load_relmap_file(true);
00538 }
00539 
00540 /*
00541  * RelationMapInitializePhase3
00542  *
00543  * This is called as soon as we have determined MyDatabaseId and set up
00544  * DatabasePath.  At this point we should be able to read the local map file.
00545  */
00546 void
00547 RelationMapInitializePhase3(void)
00548 {
00549     /*
00550      * In bootstrap mode, the map file isn't there yet, so do nothing.
00551      */
00552     if (IsBootstrapProcessingMode())
00553         return;
00554 
00555     /*
00556      * Load the local map file, die on error.
00557      */
00558     load_relmap_file(false);
00559 }
00560 
00561 /*
00562  * load_relmap_file -- load data from the shared or local map file
00563  *
00564  * Because the map file is essential for access to core system catalogs,
00565  * failure to read it is a fatal error.
00566  *
00567  * Note that the local case requires DatabasePath to be set up.
00568  */
00569 static void
00570 load_relmap_file(bool shared)
00571 {
00572     RelMapFile *map;
00573     char        mapfilename[MAXPGPATH];
00574     pg_crc32    crc;
00575     int         fd;
00576 
00577     if (shared)
00578     {
00579         snprintf(mapfilename, sizeof(mapfilename), "global/%s",
00580                  RELMAPPER_FILENAME);
00581         map = &shared_map;
00582     }
00583     else
00584     {
00585         snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
00586                  DatabasePath, RELMAPPER_FILENAME);
00587         map = &local_map;
00588     }
00589 
00590     /* Read data ... */
00591     fd = OpenTransientFile(mapfilename,
00592                            O_RDONLY | PG_BINARY, S_IRUSR | S_IWUSR);
00593     if (fd < 0)
00594         ereport(FATAL,
00595                 (errcode_for_file_access(),
00596                  errmsg("could not open relation mapping file \"%s\": %m",
00597                         mapfilename)));
00598 
00599     /*
00600      * Note: we could take RelationMappingLock in shared mode here, but it
00601      * seems unnecessary since our read() should be atomic against any
00602      * concurrent updater's write().  If the file is updated shortly after we
00603      * look, the sinval signaling mechanism will make us re-read it before we
00604      * are able to access any relation that's affected by the change.
00605      */
00606     if (read(fd, map, sizeof(RelMapFile)) != sizeof(RelMapFile))
00607         ereport(FATAL,
00608                 (errcode_for_file_access(),
00609                  errmsg("could not read relation mapping file \"%s\": %m",
00610                         mapfilename)));
00611 
00612     CloseTransientFile(fd);
00613 
00614     /* check for correct magic number, etc */
00615     if (map->magic != RELMAPPER_FILEMAGIC ||
00616         map->num_mappings < 0 ||
00617         map->num_mappings > MAX_MAPPINGS)
00618         ereport(FATAL,
00619                 (errmsg("relation mapping file \"%s\" contains invalid data",
00620                         mapfilename)));
00621 
00622     /* verify the CRC */
00623     INIT_CRC32(crc);
00624     COMP_CRC32(crc, (char *) map, offsetof(RelMapFile, crc));
00625     FIN_CRC32(crc);
00626 
00627     if (!EQ_CRC32(crc, map->crc))
00628         ereport(FATAL,
00629           (errmsg("relation mapping file \"%s\" contains incorrect checksum",
00630                   mapfilename)));
00631 }
00632 
00633 /*
00634  * Write out a new shared or local map file with the given contents.
00635  *
00636  * The magic number and CRC are automatically updated in *newmap.  On
00637  * success, we copy the data to the appropriate permanent static variable.
00638  *
00639  * If write_wal is TRUE then an appropriate WAL message is emitted.
00640  * (It will be false for bootstrap and WAL replay cases.)
00641  *
00642  * If send_sinval is TRUE then a SI invalidation message is sent.
00643  * (This should be true except in bootstrap case.)
00644  *
00645  * If preserve_files is TRUE then the storage manager is warned not to
00646  * delete the files listed in the map.
00647  *
00648  * Because this may be called during WAL replay when MyDatabaseId,
00649  * DatabasePath, etc aren't valid, we require the caller to pass in suitable
00650  * values.  The caller is also responsible for being sure no concurrent
00651  * map update could be happening.
00652  */
00653 static void
00654 write_relmap_file(bool shared, RelMapFile *newmap,
00655                   bool write_wal, bool send_sinval, bool preserve_files,
00656                   Oid dbid, Oid tsid, const char *dbpath)
00657 {
00658     int         fd;
00659     RelMapFile *realmap;
00660     char        mapfilename[MAXPGPATH];
00661 
00662     /*
00663      * Fill in the overhead fields and update CRC.
00664      */
00665     newmap->magic = RELMAPPER_FILEMAGIC;
00666     if (newmap->num_mappings < 0 || newmap->num_mappings > MAX_MAPPINGS)
00667         elog(ERROR, "attempt to write bogus relation mapping");
00668 
00669     INIT_CRC32(newmap->crc);
00670     COMP_CRC32(newmap->crc, (char *) newmap, offsetof(RelMapFile, crc));
00671     FIN_CRC32(newmap->crc);
00672 
00673     /*
00674      * Open the target file.  We prefer to do this before entering the
00675      * critical section, so that an open() failure need not force PANIC.
00676      */
00677     if (shared)
00678     {
00679         snprintf(mapfilename, sizeof(mapfilename), "global/%s",
00680                  RELMAPPER_FILENAME);
00681         realmap = &shared_map;
00682     }
00683     else
00684     {
00685         snprintf(mapfilename, sizeof(mapfilename), "%s/%s",
00686                  dbpath, RELMAPPER_FILENAME);
00687         realmap = &local_map;
00688     }
00689 
00690     fd = OpenTransientFile(mapfilename,
00691                            O_WRONLY | O_CREAT | PG_BINARY,
00692                            S_IRUSR | S_IWUSR);
00693     if (fd < 0)
00694         ereport(ERROR,
00695                 (errcode_for_file_access(),
00696                  errmsg("could not open relation mapping file \"%s\": %m",
00697                         mapfilename)));
00698 
00699     if (write_wal)
00700     {
00701         xl_relmap_update xlrec;
00702         XLogRecData rdata[2];
00703         XLogRecPtr  lsn;
00704 
00705         /* now errors are fatal ... */
00706         START_CRIT_SECTION();
00707 
00708         xlrec.dbid = dbid;
00709         xlrec.tsid = tsid;
00710         xlrec.nbytes = sizeof(RelMapFile);
00711 
00712         rdata[0].data = (char *) (&xlrec);
00713         rdata[0].len = MinSizeOfRelmapUpdate;
00714         rdata[0].buffer = InvalidBuffer;
00715         rdata[0].next = &(rdata[1]);
00716         rdata[1].data = (char *) newmap;
00717         rdata[1].len = sizeof(RelMapFile);
00718         rdata[1].buffer = InvalidBuffer;
00719         rdata[1].next = NULL;
00720 
00721         lsn = XLogInsert(RM_RELMAP_ID, XLOG_RELMAP_UPDATE, rdata);
00722 
00723         /* As always, WAL must hit the disk before the data update does */
00724         XLogFlush(lsn);
00725     }
00726 
00727     errno = 0;
00728     if (write(fd, newmap, sizeof(RelMapFile)) != sizeof(RelMapFile))
00729     {
00730         /* if write didn't set errno, assume problem is no disk space */
00731         if (errno == 0)
00732             errno = ENOSPC;
00733         ereport(ERROR,
00734                 (errcode_for_file_access(),
00735                  errmsg("could not write to relation mapping file \"%s\": %m",
00736                         mapfilename)));
00737     }
00738 
00739     /*
00740      * We choose to fsync the data to disk before considering the task done.
00741      * It would be possible to relax this if it turns out to be a performance
00742      * issue, but it would complicate checkpointing --- see notes for
00743      * CheckPointRelationMap.
00744      */
00745     if (pg_fsync(fd) != 0)
00746         ereport(ERROR,
00747                 (errcode_for_file_access(),
00748                  errmsg("could not fsync relation mapping file \"%s\": %m",
00749                         mapfilename)));
00750 
00751     if (CloseTransientFile(fd))
00752         ereport(ERROR,
00753                 (errcode_for_file_access(),
00754                  errmsg("could not close relation mapping file \"%s\": %m",
00755                         mapfilename)));
00756 
00757     /*
00758      * Now that the file is safely on disk, send sinval message to let other
00759      * backends know to re-read it.  We must do this inside the critical
00760      * section: if for some reason we fail to send the message, we have to
00761      * force a database-wide PANIC.  Otherwise other backends might continue
00762      * execution with stale mapping information, which would be catastrophic
00763      * as soon as others began to use the now-committed data.
00764      */
00765     if (send_sinval)
00766         CacheInvalidateRelmap(dbid);
00767 
00768     /*
00769      * Make sure that the files listed in the map are not deleted if the outer
00770      * transaction aborts.  This had better be within the critical section
00771      * too: it's not likely to fail, but if it did, we'd arrive at transaction
00772      * abort with the files still vulnerable.  PANICing will leave things in a
00773      * good state on-disk.
00774      *
00775      * Note: we're cheating a little bit here by assuming that mapped files
00776      * are either in pg_global or the database's default tablespace.
00777      */
00778     if (preserve_files)
00779     {
00780         int32       i;
00781 
00782         for (i = 0; i < newmap->num_mappings; i++)
00783         {
00784             RelFileNode rnode;
00785 
00786             rnode.spcNode = tsid;
00787             rnode.dbNode = dbid;
00788             rnode.relNode = newmap->mappings[i].mapfilenode;
00789             RelationPreserveStorage(rnode, false);
00790         }
00791     }
00792 
00793     /* Success, update permanent copy */
00794     memcpy(realmap, newmap, sizeof(RelMapFile));
00795 
00796     /* Critical section done */
00797     if (write_wal)
00798         END_CRIT_SECTION();
00799 }
00800 
00801 /*
00802  * Merge the specified updates into the appropriate "real" map,
00803  * and write out the changes.  This function must be used for committing
00804  * updates during normal multiuser operation.
00805  */
00806 static void
00807 perform_relmap_update(bool shared, const RelMapFile *updates)
00808 {
00809     RelMapFile  newmap;
00810 
00811     /*
00812      * Anyone updating a relation's mapping info should take exclusive lock on
00813      * that rel and hold it until commit.  This ensures that there will not be
00814      * concurrent updates on the same mapping value; but there could easily be
00815      * concurrent updates on different values in the same file. We cover that
00816      * by acquiring the RelationMappingLock, re-reading the target file to
00817      * ensure it's up to date, applying the updates, and writing the data
00818      * before releasing RelationMappingLock.
00819      *
00820      * There is only one RelationMappingLock.  In principle we could try to
00821      * have one per mapping file, but it seems unlikely to be worth the
00822      * trouble.
00823      */
00824     LWLockAcquire(RelationMappingLock, LW_EXCLUSIVE);
00825 
00826     /* Be certain we see any other updates just made */
00827     load_relmap_file(shared);
00828 
00829     /* Prepare updated data in a local variable */
00830     if (shared)
00831         memcpy(&newmap, &shared_map, sizeof(RelMapFile));
00832     else
00833         memcpy(&newmap, &local_map, sizeof(RelMapFile));
00834 
00835     /*
00836      * Apply the updates to newmap.  No new mappings should appear, unless
00837      * somebody is adding indexes to system catalogs.
00838      */
00839     merge_map_updates(&newmap, updates, allowSystemTableMods);
00840 
00841     /* Write out the updated map and do other necessary tasks */
00842     write_relmap_file(shared, &newmap, true, true, true,
00843                       (shared ? InvalidOid : MyDatabaseId),
00844                       (shared ? GLOBALTABLESPACE_OID : MyDatabaseTableSpace),
00845                       DatabasePath);
00846 
00847     /* Now we can release the lock */
00848     LWLockRelease(RelationMappingLock);
00849 }
00850 
00851 /*
00852  * RELMAP resource manager's routines
00853  */
00854 void
00855 relmap_redo(XLogRecPtr lsn, XLogRecord *record)
00856 {
00857     uint8       info = record->xl_info & ~XLR_INFO_MASK;
00858 
00859     /* Backup blocks are not used in relmap records */
00860     Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
00861 
00862     if (info == XLOG_RELMAP_UPDATE)
00863     {
00864         xl_relmap_update *xlrec = (xl_relmap_update *) XLogRecGetData(record);
00865         RelMapFile  newmap;
00866         char       *dbpath;
00867 
00868         if (xlrec->nbytes != sizeof(RelMapFile))
00869             elog(PANIC, "relmap_redo: wrong size %u in relmap update record",
00870                  xlrec->nbytes);
00871         memcpy(&newmap, xlrec->data, sizeof(newmap));
00872 
00873         /* We need to construct the pathname for this database */
00874         dbpath = GetDatabasePath(xlrec->dbid, xlrec->tsid);
00875 
00876         /*
00877          * Write out the new map and send sinval, but of course don't write a
00878          * new WAL entry.  There's no surrounding transaction to tell to
00879          * preserve files, either.
00880          *
00881          * There shouldn't be anyone else updating relmaps during WAL replay,
00882          * so we don't bother to take the RelationMappingLock.  We would need
00883          * to do so if load_relmap_file needed to interlock against writers.
00884          */
00885         write_relmap_file((xlrec->dbid == InvalidOid), &newmap,
00886                           false, true, false,
00887                           xlrec->dbid, xlrec->tsid, dbpath);
00888 
00889         pfree(dbpath);
00890     }
00891     else
00892         elog(PANIC, "relmap_redo: unknown op code %u", info);
00893 }