Header And Logo

PostgreSQL
| The world's most advanced open source database.

md.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * md.c
00004  *    This code manages relations that reside on magnetic disk.
00005  *
00006  * Or at least, that was what the Berkeley folk had in mind when they named
00007  * this file.  In reality, what this code provides is an interface from
00008  * the smgr API to Unix-like filesystem APIs, so it will work with any type
00009  * of device for which the operating system provides filesystem support.
00010  * It doesn't matter whether the bits are on spinning rust or some other
00011  * storage technology.
00012  *
00013  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00014  * Portions Copyright (c) 1994, Regents of the University of California
00015  *
00016  *
00017  * IDENTIFICATION
00018  *    src/backend/storage/smgr/md.c
00019  *
00020  *-------------------------------------------------------------------------
00021  */
00022 #include "postgres.h"
00023 
00024 #include <unistd.h>
00025 #include <fcntl.h>
00026 #include <sys/file.h>
00027 
00028 #include "miscadmin.h"
00029 #include "access/xlog.h"
00030 #include "catalog/catalog.h"
00031 #include "common/relpath.h"
00032 #include "portability/instr_time.h"
00033 #include "postmaster/bgwriter.h"
00034 #include "storage/fd.h"
00035 #include "storage/bufmgr.h"
00036 #include "storage/relfilenode.h"
00037 #include "storage/smgr.h"
00038 #include "utils/hsearch.h"
00039 #include "utils/memutils.h"
00040 #include "pg_trace.h"
00041 
00042 
/*
 * Intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt.
 * FSYNCS_PER_ABSORB and UNLINKS_PER_ABSORB are both plain integer counts.
 */
#define FSYNCS_PER_ABSORB       10
#define UNLINKS_PER_ABSORB      10
00046 
/*
 * Special values for the segno arg to RememberFsyncRequest.
 *
 * Each value is derived from InvalidBlockNumber (the value itself and the
 * two values immediately below it).
 *
 * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
 * fsync request from the queue if an identical, subsequent request is found.
 * See comments there before making changes here.
 */
#define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
00057 
/*
 * FILE_POSSIBLY_DELETED(err) -- could errno value "err" mean the file is gone?
 *
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting canceled ... see mdsync).
 *
 * On every other platform only ENOENT qualifies.
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT || (err) == EACCES)
#endif
00070 
00071 /*
00072  *  The magnetic disk storage manager keeps track of open file
00073  *  descriptors in its own descriptor pool.  This is done to make it
00074  *  easier to support relations that are larger than the operating
00075  *  system's file size limit (often 2GBytes).  In order to do that,
00076  *  we break relations up into "segment" files that are each shorter than
00077  *  the OS file size limit.  The segment size is set by the RELSEG_SIZE
00078  *  configuration constant in pg_config.h.
00079  *
00080  *  On disk, a relation must consist of consecutively numbered segment
00081  *  files in the pattern
00082  *      -- Zero or more full segments of exactly RELSEG_SIZE blocks each
00083  *      -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
00084  *      -- Optionally, any number of inactive segments of size 0 blocks.
00085  *  The full and partial segments are collectively the "active" segments.
00086  *  Inactive segments are those that once contained data but are currently
00087  *  not needed because of an mdtruncate() operation.  The reason for leaving
00088  *  them present at size zero, rather than unlinking them, is that other
00089  *  backends and/or the checkpointer might be holding open file references to
00090  *  such segments.  If the relation expands again after mdtruncate(), such
00091  *  that a deactivated segment becomes active again, it is important that
00092  *  such file references still be valid --- else data might get written
00093  *  out to an unlinked old copy of a segment file that will eventually
00094  *  disappear.
00095  *
00096  *  The file descriptor pointer (md_fd field) stored in the SMgrRelation
00097  *  cache is, therefore, just the head of a list of MdfdVec objects, one
00098  *  per segment.  But note the md_fd pointer can be NULL, indicating
00099  *  relation not open.
00100  *
00101  *  Also note that mdfd_chain == NULL does not necessarily mean the relation
00102  *  doesn't have another segment after this one; we may just not have
00103  *  opened the next segment yet.  (We could not have "all segments are
00104  *  in the chain" as an invariant anyway, since another backend could
00105  *  extend the relation when we weren't looking.)  We do not make chain
00106  *  entries for inactive segments, however; as soon as we find a partial
00107  *  segment, we assume that any subsequent segments are inactive.
00108  *
00109  *  All MdfdVec objects are palloc'd in the MdCxt memory context.
00110  */
00111 
00112 typedef struct _MdfdVec
00113 {
00114     File        mdfd_vfd;       /* fd number in fd.c's pool */
00115     BlockNumber mdfd_segno;     /* segment number, from 0 */
00116     struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
00117 } MdfdVec;
00118 
00119 static MemoryContext MdCxt;     /* context for all md.c allocations */
00120 
00121 
00122 /*
00123  * In some contexts (currently, standalone backends and the checkpointer)
00124  * we keep track of pending fsync operations: we need to remember all relation
00125  * segments that have been written since the last checkpoint, so that we can
00126  * fsync them down to disk before completing the next checkpoint.  This hash
00127  * table remembers the pending operations.  We use a hash table mostly as
00128  * a convenient way of merging duplicate requests.
00129  *
00130  * We use a similar mechanism to remember no-longer-needed files that can
00131  * be deleted after the next checkpoint, but we use a linked list instead of
00132  * a hash table, because we don't expect there to be any duplicate requests.
00133  *
00134  * These mechanisms are only used for non-temp relations; we never fsync
00135  * temp rels, nor do we need to postpone their deletion (see comments in
00136  * mdunlink).
00137  *
00138  * (Regular backends do not track pending operations locally, but forward
00139  * them to the checkpointer.)
00140  */
00141 typedef uint16 CycleCtr;        /* can be any convenient integer size */
00142 
00143 typedef struct
00144 {
00145     RelFileNode rnode;          /* hash table key (must be first!) */
00146     CycleCtr    cycle_ctr;      /* mdsync_cycle_ctr of oldest request */
00147     /* requests[f] has bit n set if we need to fsync segment n of fork f */
00148     Bitmapset  *requests[MAX_FORKNUM + 1];
00149     /* canceled[f] is true if we canceled fsyncs for fork "recently" */
00150     bool        canceled[MAX_FORKNUM + 1];
00151 } PendingOperationEntry;
00152 
00153 typedef struct
00154 {
00155     RelFileNode rnode;          /* the dead relation to delete */
00156     CycleCtr    cycle_ctr;      /* mdckpt_cycle_ctr when request was made */
00157 } PendingUnlinkEntry;
00158 
00159 static HTAB *pendingOpsTable = NULL;
00160 static List *pendingUnlinks = NIL;
00161 
00162 static CycleCtr mdsync_cycle_ctr = 0;
00163 static CycleCtr mdckpt_cycle_ctr = 0;
00164 
00165 
/*
 * Behavior for mdopen & _mdfd_getseg when the wanted segment is not present.
 */
typedef enum
{
    EXTENSION_FAIL,             /* ereport if segment not present */
    EXTENSION_RETURN_NULL,      /* return NULL if not present */
    EXTENSION_CREATE            /* create new segments as needed */
} ExtensionBehavior;
00172 
00173 /* local routines */
00174 static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
00175              bool isRedo);
00176 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
00177        ExtensionBehavior behavior);
00178 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
00179                        MdfdVec *seg);
00180 static void register_unlink(RelFileNodeBackend rnode);
00181 static MdfdVec *_fdvec_alloc(void);
00182 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
00183               BlockNumber segno);
00184 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
00185               BlockNumber segno, int oflags);
00186 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
00187              BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
00188 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
00189            MdfdVec *seg);
00190 
00191 
00192 /*
00193  *  mdinit() -- Initialize private state for magnetic disk storage manager.
00194  */
00195 void
00196 mdinit(void)
00197 {
00198     MdCxt = AllocSetContextCreate(TopMemoryContext,
00199                                   "MdSmgr",
00200                                   ALLOCSET_DEFAULT_MINSIZE,
00201                                   ALLOCSET_DEFAULT_INITSIZE,
00202                                   ALLOCSET_DEFAULT_MAXSIZE);
00203 
00204     /*
00205      * Create pending-operations hashtable if we need it.  Currently, we need
00206      * it if we are standalone (not under a postmaster) or if we are a startup
00207      * or checkpointer auxiliary process.
00208      */
00209     if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
00210     {
00211         HASHCTL     hash_ctl;
00212 
00213         MemSet(&hash_ctl, 0, sizeof(hash_ctl));
00214         hash_ctl.keysize = sizeof(RelFileNode);
00215         hash_ctl.entrysize = sizeof(PendingOperationEntry);
00216         hash_ctl.hash = tag_hash;
00217         hash_ctl.hcxt = MdCxt;
00218         pendingOpsTable = hash_create("Pending Ops Table",
00219                                       100L,
00220                                       &hash_ctl,
00221                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
00222         pendingUnlinks = NIL;
00223     }
00224 }
00225 
00226 /*
00227  * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
00228  * already created the pendingOpsTable during initialization of the startup
00229  * process.  Calling this function drops the local pendingOpsTable so that
00230  * subsequent requests will be forwarded to checkpointer.
00231  */
00232 void
00233 SetForwardFsyncRequests(void)
00234 {
00235     /* Perform any pending fsyncs we may have queued up, then drop table */
00236     if (pendingOpsTable)
00237     {
00238         mdsync();
00239         hash_destroy(pendingOpsTable);
00240     }
00241     pendingOpsTable = NULL;
00242 
00243     /*
00244      * We should not have any pending unlink requests, since mdunlink doesn't
00245      * queue unlink requests when isRedo.
00246      */
00247     Assert(pendingUnlinks == NIL);
00248 }
00249 
00250 /*
00251  *  mdexists() -- Does the physical file exist?
00252  *
00253  * Note: this will return true for lingering files, with pending deletions
00254  */
00255 bool
00256 mdexists(SMgrRelation reln, ForkNumber forkNum)
00257 {
00258     /*
00259      * Close it first, to ensure that we notice if the fork has been unlinked
00260      * since we opened it.
00261      */
00262     mdclose(reln, forkNum);
00263 
00264     return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
00265 }
00266 
00267 /*
00268  *  mdcreate() -- Create a new relation on magnetic disk.
00269  *
00270  * If isRedo is true, it's okay for the relation to exist already.
00271  */
00272 void
00273 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
00274 {
00275     char       *path;
00276     File        fd;
00277 
00278     if (isRedo && reln->md_fd[forkNum] != NULL)
00279         return;                 /* created and opened already... */
00280 
00281     Assert(reln->md_fd[forkNum] == NULL);
00282 
00283     path = relpath(reln->smgr_rnode, forkNum);
00284 
00285     fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
00286 
00287     if (fd < 0)
00288     {
00289         int         save_errno = errno;
00290 
00291         /*
00292          * During bootstrap, there are cases where a system relation will be
00293          * accessed (by internal backend processes) before the bootstrap
00294          * script nominally creates it.  Therefore, allow the file to exist
00295          * already, even if isRedo is not set.  (See also mdopen)
00296          */
00297         if (isRedo || IsBootstrapProcessingMode())
00298             fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
00299         if (fd < 0)
00300         {
00301             /* be sure to report the error reported by create, not open */
00302             errno = save_errno;
00303             ereport(ERROR,
00304                     (errcode_for_file_access(),
00305                      errmsg("could not create file \"%s\": %m", path)));
00306         }
00307     }
00308 
00309     pfree(path);
00310 
00311     reln->md_fd[forkNum] = _fdvec_alloc();
00312 
00313     reln->md_fd[forkNum]->mdfd_vfd = fd;
00314     reln->md_fd[forkNum]->mdfd_segno = 0;
00315     reln->md_fd[forkNum]->mdfd_chain = NULL;
00316 }
00317 
00318 /*
00319  *  mdunlink() -- Unlink a relation.
00320  *
00321  * Note that we're passed a RelFileNodeBackend --- by the time this is called,
00322  * there won't be an SMgrRelation hashtable entry anymore.
00323  *
00324  * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber
00325  * to delete all forks.
00326  *
00327  * For regular relations, we don't unlink the first segment file of the rel,
00328  * but just truncate it to zero length, and record a request to unlink it after
00329  * the next checkpoint.  Additional segments can be unlinked immediately,
00330  * however.  Leaving the empty file in place prevents that relfilenode
00331  * number from being reused.  The scenario this protects us from is:
00332  * 1. We delete a relation (and commit, and actually remove its file).
00333  * 2. We create a new relation, which by chance gets the same relfilenode as
00334  *    the just-deleted one (OIDs must've wrapped around for that to happen).
00335  * 3. We crash before another checkpoint occurs.
00336  * During replay, we would delete the file and then recreate it, which is fine
00337  * if the contents of the file were repopulated by subsequent WAL entries.
00338  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
00339  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
00340  * the contents of the file would be lost forever.  By leaving the empty file
00341  * until after the next checkpoint, we prevent reassignment of the relfilenode
00342  * number until it's safe, because relfilenode assignment skips over any
00343  * existing file.
00344  *
00345  * We do not need to go through this dance for temp relations, though, because
00346  * we never make WAL entries for temp rels, and so a temp rel poses no threat
00347  * to the health of a regular rel that has taken over its relfilenode number.
00348  * The fact that temp rels and regular rels have different file naming
00349  * patterns provides additional safety.
00350  *
00351  * All the above applies only to the relation's main fork; other forks can
00352  * just be removed immediately, since they are not needed to prevent the
00353  * relfilenode number from being recycled.  Also, we do not carefully
00354  * track whether other forks have been created or not, but just attempt to
00355  * unlink them unconditionally; so we should never complain about ENOENT.
00356  *
00357  * If isRedo is true, it's unsurprising for the relation to be already gone.
00358  * Also, we should remove the file immediately instead of queuing a request
00359  * for later, since during redo there's no possibility of creating a
00360  * conflicting relation.
00361  *
00362  * Note: any failure should be reported as WARNING not ERROR, because
00363  * we are usually not in a transaction anymore when this is called.
00364  */
00365 void
00366 mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
00367 {
00368     /*
00369      * We have to clean out any pending fsync requests for the doomed
00370      * relation, else the next mdsync() will fail.  There can't be any such
00371      * requests for a temp relation, though.  We can send just one request
00372      * even when deleting multiple forks, since the fsync queuing code accepts
00373      * the "InvalidForkNumber = all forks" convention.
00374      */
00375     if (!RelFileNodeBackendIsTemp(rnode))
00376         ForgetRelationFsyncRequests(rnode.node, forkNum);
00377 
00378     /* Now do the per-fork work */
00379     if (forkNum == InvalidForkNumber)
00380     {
00381         for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++)
00382             mdunlinkfork(rnode, forkNum, isRedo);
00383     }
00384     else
00385         mdunlinkfork(rnode, forkNum, isRedo);
00386 }
00387 
00388 static void
00389 mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
00390 {
00391     char       *path;
00392     int         ret;
00393 
00394     path = relpath(rnode, forkNum);
00395 
00396     /*
00397      * Delete or truncate the first segment.
00398      */
00399     if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
00400     {
00401         ret = unlink(path);
00402         if (ret < 0 && errno != ENOENT)
00403             ereport(WARNING,
00404                     (errcode_for_file_access(),
00405                      errmsg("could not remove file \"%s\": %m", path)));
00406     }
00407     else
00408     {
00409         /* truncate(2) would be easier here, but Windows hasn't got it */
00410         int         fd;
00411 
00412         fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0);
00413         if (fd >= 0)
00414         {
00415             int         save_errno;
00416 
00417             ret = ftruncate(fd, 0);
00418             save_errno = errno;
00419             CloseTransientFile(fd);
00420             errno = save_errno;
00421         }
00422         else
00423             ret = -1;
00424         if (ret < 0 && errno != ENOENT)
00425             ereport(WARNING,
00426                     (errcode_for_file_access(),
00427                      errmsg("could not truncate file \"%s\": %m", path)));
00428 
00429         /* Register request to unlink first segment later */
00430         register_unlink(rnode);
00431     }
00432 
00433     /*
00434      * Delete any additional segments.
00435      */
00436     if (ret >= 0)
00437     {
00438         char       *segpath = (char *) palloc(strlen(path) + 12);
00439         BlockNumber segno;
00440 
00441         /*
00442          * Note that because we loop until getting ENOENT, we will correctly
00443          * remove all inactive segments as well as active ones.
00444          */
00445         for (segno = 1;; segno++)
00446         {
00447             sprintf(segpath, "%s.%u", path, segno);
00448             if (unlink(segpath) < 0)
00449             {
00450                 /* ENOENT is expected after the last segment... */
00451                 if (errno != ENOENT)
00452                     ereport(WARNING,
00453                             (errcode_for_file_access(),
00454                        errmsg("could not remove file \"%s\": %m", segpath)));
00455                 break;
00456             }
00457         }
00458         pfree(segpath);
00459     }
00460 
00461     pfree(path);
00462 }
00463 
00464 /*
00465  *  mdextend() -- Add a block to the specified relation.
00466  *
00467  *      The semantics are nearly the same as mdwrite(): write at the
00468  *      specified position.  However, this is to be used for the case of
00469  *      extending a relation (i.e., blocknum is at or beyond the current
00470  *      EOF).  Note that we assume writing a block beyond current EOF
00471  *      causes intervening file space to become filled with zeroes.
00472  */
00473 void
00474 mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
00475          char *buffer, bool skipFsync)
00476 {
00477     off_t       seekpos;
00478     int         nbytes;
00479     MdfdVec    *v;
00480 
00481     /* This assert is too expensive to have on normally ... */
00482 #ifdef CHECK_WRITE_VS_EXTEND
00483     Assert(blocknum >= mdnblocks(reln, forknum));
00484 #endif
00485 
00486     /*
00487      * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
00488      * more --- we mustn't create a block whose number actually is
00489      * InvalidBlockNumber.
00490      */
00491     if (blocknum == InvalidBlockNumber)
00492         ereport(ERROR,
00493                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
00494                  errmsg("cannot extend file \"%s\" beyond %u blocks",
00495                         relpath(reln->smgr_rnode, forknum),
00496                         InvalidBlockNumber)));
00497 
00498     v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
00499 
00500     seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
00501 
00502     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
00503 
00504     /*
00505      * Note: because caller usually obtained blocknum by calling mdnblocks,
00506      * which did a seek(SEEK_END), this seek is often redundant and will be
00507      * optimized away by fd.c.  It's not redundant, however, if there is a
00508      * partial page at the end of the file. In that case we want to try to
00509      * overwrite the partial page with a full page.  It's also not redundant
00510      * if bufmgr.c had to dump another buffer of the same file to make room
00511      * for the new page's buffer.
00512      */
00513     if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
00514         ereport(ERROR,
00515                 (errcode_for_file_access(),
00516                  errmsg("could not seek to block %u in file \"%s\": %m",
00517                         blocknum, FilePathName(v->mdfd_vfd))));
00518 
00519     if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
00520     {
00521         if (nbytes < 0)
00522             ereport(ERROR,
00523                     (errcode_for_file_access(),
00524                      errmsg("could not extend file \"%s\": %m",
00525                             FilePathName(v->mdfd_vfd)),
00526                      errhint("Check free disk space.")));
00527         /* short write: complain appropriately */
00528         ereport(ERROR,
00529                 (errcode(ERRCODE_DISK_FULL),
00530                  errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
00531                         FilePathName(v->mdfd_vfd),
00532                         nbytes, BLCKSZ, blocknum),
00533                  errhint("Check free disk space.")));
00534     }
00535 
00536     if (!skipFsync && !SmgrIsTemp(reln))
00537         register_dirty_segment(reln, forknum, v);
00538 
00539     Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
00540 }
00541 
00542 /*
00543  *  mdopen() -- Open the specified relation.
00544  *
00545  * Note we only open the first segment, when there are multiple segments.
00546  *
00547  * If first segment is not present, either ereport or return NULL according
00548  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
00549  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
00550  * invent one out of whole cloth.
00551  */
00552 static MdfdVec *
00553 mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
00554 {
00555     MdfdVec    *mdfd;
00556     char       *path;
00557     File        fd;
00558 
00559     /* No work if already open */
00560     if (reln->md_fd[forknum])
00561         return reln->md_fd[forknum];
00562 
00563     path = relpath(reln->smgr_rnode, forknum);
00564 
00565     fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
00566 
00567     if (fd < 0)
00568     {
00569         /*
00570          * During bootstrap, there are cases where a system relation will be
00571          * accessed (by internal backend processes) before the bootstrap
00572          * script nominally creates it.  Therefore, accept mdopen() as a
00573          * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
00574          */
00575         if (IsBootstrapProcessingMode())
00576             fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
00577         if (fd < 0)
00578         {
00579             if (behavior == EXTENSION_RETURN_NULL &&
00580                 FILE_POSSIBLY_DELETED(errno))
00581             {
00582                 pfree(path);
00583                 return NULL;
00584             }
00585             ereport(ERROR,
00586                     (errcode_for_file_access(),
00587                      errmsg("could not open file \"%s\": %m", path)));
00588         }
00589     }
00590 
00591     pfree(path);
00592 
00593     reln->md_fd[forknum] = mdfd = _fdvec_alloc();
00594 
00595     mdfd->mdfd_vfd = fd;
00596     mdfd->mdfd_segno = 0;
00597     mdfd->mdfd_chain = NULL;
00598     Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
00599 
00600     return mdfd;
00601 }
00602 
00603 /*
00604  *  mdclose() -- Close the specified relation, if it isn't closed already.
00605  */
00606 void
00607 mdclose(SMgrRelation reln, ForkNumber forknum)
00608 {
00609     MdfdVec    *v = reln->md_fd[forknum];
00610 
00611     /* No work if already closed */
00612     if (v == NULL)
00613         return;
00614 
00615     reln->md_fd[forknum] = NULL;    /* prevent dangling pointer after error */
00616 
00617     while (v != NULL)
00618     {
00619         MdfdVec    *ov = v;
00620 
00621         /* if not closed already */
00622         if (v->mdfd_vfd >= 0)
00623             FileClose(v->mdfd_vfd);
00624         /* Now free vector */
00625         v = v->mdfd_chain;
00626         pfree(ov);
00627     }
00628 }
00629 
00630 /*
00631  *  mdprefetch() -- Initiate asynchronous read of the specified block of a relation
00632  */
00633 void
00634 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
00635 {
00636 #ifdef USE_PREFETCH
00637     off_t       seekpos;
00638     MdfdVec    *v;
00639 
00640     v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
00641 
00642     seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
00643 
00644     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
00645 
00646     (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
00647 #endif   /* USE_PREFETCH */
00648 }
00649 
00650 
00651 /*
00652  *  mdread() -- Read the specified block from a relation.
00653  */
00654 void
00655 mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
00656        char *buffer)
00657 {
00658     off_t       seekpos;
00659     int         nbytes;
00660     MdfdVec    *v;
00661 
00662     TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
00663                                         reln->smgr_rnode.node.spcNode,
00664                                         reln->smgr_rnode.node.dbNode,
00665                                         reln->smgr_rnode.node.relNode,
00666                                         reln->smgr_rnode.backend);
00667 
00668     v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
00669 
00670     seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
00671 
00672     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
00673 
00674     if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
00675         ereport(ERROR,
00676                 (errcode_for_file_access(),
00677                  errmsg("could not seek to block %u in file \"%s\": %m",
00678                         blocknum, FilePathName(v->mdfd_vfd))));
00679 
00680     nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
00681 
00682     TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
00683                                        reln->smgr_rnode.node.spcNode,
00684                                        reln->smgr_rnode.node.dbNode,
00685                                        reln->smgr_rnode.node.relNode,
00686                                        reln->smgr_rnode.backend,
00687                                        nbytes,
00688                                        BLCKSZ);
00689 
00690     if (nbytes != BLCKSZ)
00691     {
00692         if (nbytes < 0)
00693             ereport(ERROR,
00694                     (errcode_for_file_access(),
00695                      errmsg("could not read block %u in file \"%s\": %m",
00696                             blocknum, FilePathName(v->mdfd_vfd))));
00697 
00698         /*
00699          * Short read: we are at or past EOF, or we read a partial block at
00700          * EOF.  Normally this is an error; upper levels should never try to
00701          * read a nonexistent block.  However, if zero_damaged_pages is ON or
00702          * we are InRecovery, we should instead return zeroes without
00703          * complaining.  This allows, for example, the case of trying to
00704          * update a block that was later truncated away.
00705          */
00706         if (zero_damaged_pages || InRecovery)
00707             MemSet(buffer, 0, BLCKSZ);
00708         else
00709             ereport(ERROR,
00710                     (errcode(ERRCODE_DATA_CORRUPTED),
00711                      errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
00712                             blocknum, FilePathName(v->mdfd_vfd),
00713                             nbytes, BLCKSZ)));
00714     }
00715 }
00716 
00717 /*
00718  *  mdwrite() -- Write the supplied block at the appropriate location.
00719  *
00720  *      This is to be used only for updating already-existing blocks of a
00721  *      relation (ie, those before the current EOF).  To extend a relation,
00722  *      use mdextend().
00723  */
00724 void
00725 mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
00726         char *buffer, bool skipFsync)
00727 {
00728     off_t       seekpos;
00729     int         nbytes;
00730     MdfdVec    *v;
00731 
00732     /* This assert is too expensive to have on normally ... */
00733 #ifdef CHECK_WRITE_VS_EXTEND
00734     Assert(blocknum < mdnblocks(reln, forknum));
00735 #endif
00736 
00737     TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
00738                                          reln->smgr_rnode.node.spcNode,
00739                                          reln->smgr_rnode.node.dbNode,
00740                                          reln->smgr_rnode.node.relNode,
00741                                          reln->smgr_rnode.backend);
00742 
00743     v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL);
00744 
00745     seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
00746 
00747     Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
00748 
00749     if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
00750         ereport(ERROR,
00751                 (errcode_for_file_access(),
00752                  errmsg("could not seek to block %u in file \"%s\": %m",
00753                         blocknum, FilePathName(v->mdfd_vfd))));
00754 
00755     nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
00756 
00757     TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
00758                                         reln->smgr_rnode.node.spcNode,
00759                                         reln->smgr_rnode.node.dbNode,
00760                                         reln->smgr_rnode.node.relNode,
00761                                         reln->smgr_rnode.backend,
00762                                         nbytes,
00763                                         BLCKSZ);
00764 
00765     if (nbytes != BLCKSZ)
00766     {
00767         if (nbytes < 0)
00768             ereport(ERROR,
00769                     (errcode_for_file_access(),
00770                      errmsg("could not write block %u in file \"%s\": %m",
00771                             blocknum, FilePathName(v->mdfd_vfd))));
00772         /* short write: complain appropriately */
00773         ereport(ERROR,
00774                 (errcode(ERRCODE_DISK_FULL),
00775                  errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
00776                         blocknum,
00777                         FilePathName(v->mdfd_vfd),
00778                         nbytes, BLCKSZ),
00779                  errhint("Check free disk space.")));
00780     }
00781 
00782     if (!skipFsync && !SmgrIsTemp(reln))
00783         register_dirty_segment(reln, forknum, v);
00784 }
00785 
00786 /*
00787  *  mdnblocks() -- Get the number of blocks stored in a relation.
00788  *
00789  *      Important side effect: all active segments of the relation are opened
00790  *      and added to the mdfd_chain list.  If this routine has not been
00791  *      called, then only segments up to the last one actually touched
00792  *      are present in the chain.
00793  */
00794 BlockNumber
00795 mdnblocks(SMgrRelation reln, ForkNumber forknum)
00796 {
00797     MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
00798     BlockNumber nblocks;
00799     BlockNumber segno = 0;
00800 
00801     /*
00802      * Skip through any segments that aren't the last one, to avoid redundant
00803      * seeks on them.  We have previously verified that these segments are
00804      * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
00805      *
00806      * NOTE: this assumption could only be wrong if another backend has
00807      * truncated the relation.  We rely on higher code levels to handle that
00808      * scenario by closing and re-opening the md fd, which is handled via
00809      * relcache flush.  (Since the checkpointer doesn't participate in
00810      * relcache flush, it could have segment chain entries for inactive
00811      * segments; that's OK because the checkpointer never needs to compute
00812      * relation size.)
00813      */
00814     while (v->mdfd_chain != NULL)
00815     {
00816         segno++;
00817         v = v->mdfd_chain;
00818     }
00819 
00820     for (;;)
00821     {
00822         nblocks = _mdnblocks(reln, forknum, v);
00823         if (nblocks > ((BlockNumber) RELSEG_SIZE))
00824             elog(FATAL, "segment too big");
00825         if (nblocks < ((BlockNumber) RELSEG_SIZE))
00826             return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
00827 
00828         /*
00829          * If segment is exactly RELSEG_SIZE, advance to next one.
00830          */
00831         segno++;
00832 
00833         if (v->mdfd_chain == NULL)
00834         {
00835             /*
00836              * Because we pass O_CREAT, we will create the next segment (with
00837              * zero length) immediately, if the last segment is of length
00838              * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
00839              * the logic simple.
00840              */
00841             v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
00842             if (v->mdfd_chain == NULL)
00843                 ereport(ERROR,
00844                         (errcode_for_file_access(),
00845                          errmsg("could not open file \"%s\": %m",
00846                                 _mdfd_segpath(reln, forknum, segno))));
00847         }
00848 
00849         v = v->mdfd_chain;
00850     }
00851 }
00852 
00853 /*
00854  *  mdtruncate() -- Truncate relation to specified number of blocks.
00855  */
00856 void
00857 mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
00858 {
00859     MdfdVec    *v;
00860     BlockNumber curnblk;
00861     BlockNumber priorblocks;
00862 
00863     /*
00864      * NOTE: mdnblocks makes sure we have opened all active segments, so that
00865      * truncation loop will get them all!
00866      */
00867     curnblk = mdnblocks(reln, forknum);
00868     if (nblocks > curnblk)
00869     {
00870         /* Bogus request ... but no complaint if InRecovery */
00871         if (InRecovery)
00872             return;
00873         ereport(ERROR,
00874                 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
00875                         relpath(reln->smgr_rnode, forknum),
00876                         nblocks, curnblk)));
00877     }
00878     if (nblocks == curnblk)
00879         return;                 /* no work */
00880 
00881     v = mdopen(reln, forknum, EXTENSION_FAIL);
00882 
00883     priorblocks = 0;
00884     while (v != NULL)
00885     {
00886         MdfdVec    *ov = v;
00887 
00888         if (priorblocks > nblocks)
00889         {
00890             /*
00891              * This segment is no longer active (and has already been unlinked
00892              * from the mdfd_chain). We truncate the file, but do not delete
00893              * it, for reasons explained in the header comments.
00894              */
00895             if (FileTruncate(v->mdfd_vfd, 0) < 0)
00896                 ereport(ERROR,
00897                         (errcode_for_file_access(),
00898                          errmsg("could not truncate file \"%s\": %m",
00899                                 FilePathName(v->mdfd_vfd))));
00900 
00901             if (!SmgrIsTemp(reln))
00902                 register_dirty_segment(reln, forknum, v);
00903             v = v->mdfd_chain;
00904             Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
00905                                                  * segment */
00906             pfree(ov);
00907         }
00908         else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
00909         {
00910             /*
00911              * This is the last segment we want to keep. Truncate the file to
00912              * the right length, and clear chain link that points to any
00913              * remaining segments (which we shall zap). NOTE: if nblocks is
00914              * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
00915              * segment to 0 length but keep it. This adheres to the invariant
00916              * given in the header comments.
00917              */
00918             BlockNumber lastsegblocks = nblocks - priorblocks;
00919 
00920             if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
00921                 ereport(ERROR,
00922                         (errcode_for_file_access(),
00923                     errmsg("could not truncate file \"%s\" to %u blocks: %m",
00924                            FilePathName(v->mdfd_vfd),
00925                            nblocks)));
00926             if (!SmgrIsTemp(reln))
00927                 register_dirty_segment(reln, forknum, v);
00928             v = v->mdfd_chain;
00929             ov->mdfd_chain = NULL;
00930         }
00931         else
00932         {
00933             /*
00934              * We still need this segment and 0 or more blocks beyond it, so
00935              * nothing to do here.
00936              */
00937             v = v->mdfd_chain;
00938         }
00939         priorblocks += RELSEG_SIZE;
00940     }
00941 }
00942 
00943 /*
00944  *  mdimmedsync() -- Immediately sync a relation to stable storage.
00945  *
00946  * Note that only writes already issued are synced; this routine knows
00947  * nothing of dirty buffers that may exist inside the buffer manager.
00948  */
00949 void
00950 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
00951 {
00952     MdfdVec    *v;
00953 
00954     /*
00955      * NOTE: mdnblocks makes sure we have opened all active segments, so that
00956      * fsync loop will get them all!
00957      */
00958     mdnblocks(reln, forknum);
00959 
00960     v = mdopen(reln, forknum, EXTENSION_FAIL);
00961 
00962     while (v != NULL)
00963     {
00964         if (FileSync(v->mdfd_vfd) < 0)
00965             ereport(ERROR,
00966                     (errcode_for_file_access(),
00967                      errmsg("could not fsync file \"%s\": %m",
00968                             FilePathName(v->mdfd_vfd))));
00969         v = v->mdfd_chain;
00970     }
00971 }
00972 
00973 /*
00974  *  mdsync() -- Sync previous writes to stable storage.
       *
       *  Walks pendingOpsTable and fsyncs every segment whose request was
       *  entered before this call began; entries stamped with the newly
       *  advanced mdsync_cycle_ctr are left for the next call.  An entry is
       *  removed from the table once all of its forks have been processed,
       *  unless new requests arrived for it in the meantime.
       *
       *  Raises ERROR if there is no pendingOpsTable.  A segment that cannot
       *  be opened or fsync'd raises ERROR unless the failure looks like
       *  "file possibly deleted" and it is the first failure for that
       *  segment; in that case pending requests are absorbed and the segment
       *  is either dropped (a cancel arrived for the fork) or retried once.
       *
       *  On success the number of files synced and the longest and total
       *  sync times (in microseconds) are stored in CheckpointStats.
00975  */
00976 void
00977 mdsync(void)
00978 {
      /* static, so a call that fails partway leaves it set for the next call */
00979     static bool mdsync_in_progress = false;
00980 
00981     HASH_SEQ_STATUS hstat;
00982     PendingOperationEntry *entry;
00983     int         absorb_counter;
00984 
00985     /* Statistics on sync times */
00986     int         processed = 0;
00987     instr_time  sync_start,
00988                 sync_end,
00989                 sync_diff;
00990     uint64      elapsed;
00991     uint64      longest = 0;
00992     uint64      total_elapsed = 0;
00993 
00994     /*
00995      * This is only called during checkpoints, and checkpoints should only
00996      * occur in processes that have created a pendingOpsTable.
00997      */
00998     if (!pendingOpsTable)
00999         elog(ERROR, "cannot sync without a pendingOpsTable");
01000 
01001     /*
01002      * If we are in the checkpointer, the sync had better include all fsync
01003      * requests that were queued by backends up to this point.  The tightest
01004      * race condition that could occur is that a buffer that must be written
01005      * and fsync'd for the checkpoint could have been dumped by a backend just
01006      * before it was visited by BufferSync().  We know the backend will have
01007      * queued an fsync request before clearing the buffer's dirtybit, so we
01008      * are safe as long as we do an Absorb after completing BufferSync().
01009      */
01010     AbsorbFsyncRequests();
01011 
01012     /*
01013      * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
01014      * checkpoint), we want to ignore fsync requests that are entered into the
01015      * hashtable after this point --- they should be processed next time,
01016      * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
01017      * ones: new ones will have cycle_ctr equal to the incremented value of
01018      * mdsync_cycle_ctr.
01019      *
01020      * In normal circumstances, all entries present in the table at this point
01021      * will have cycle_ctr exactly equal to the current (about to be old)
01022      * value of mdsync_cycle_ctr.  However, if we fail partway through the
01023      * fsync'ing loop, then older values of cycle_ctr might remain when we
01024      * come back here to try again.  Repeated checkpoint failures would
01025      * eventually wrap the counter around to the point where an old entry
01026      * might appear new, causing us to skip it, possibly allowing a checkpoint
01027      * to succeed that should not have.  To forestall wraparound, any time the
01028      * previous mdsync() failed to complete, run through the table and
01029      * forcibly set cycle_ctr = mdsync_cycle_ctr.
01030      *
01031      * Think not to merge this loop with the main loop, as the problem is
01032      * exactly that that loop may fail before having visited all the entries.
01033      * From a performance point of view it doesn't matter anyway, as this path
01034      * will never be taken in a system that's functioning normally.
01035      */
01036     if (mdsync_in_progress)
01037     {
01038         /* prior try failed, so update any stale cycle_ctr values */
01039         hash_seq_init(&hstat, pendingOpsTable);
01040         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
01041         {
01042             entry->cycle_ctr = mdsync_cycle_ctr;
01043         }
01044     }
01045 
01046     /* Advance counter so that new hashtable entries are distinguishable */
01047     mdsync_cycle_ctr++;
01048 
01049     /* Set flag to detect failure if we don't reach the end of the loop */
01050     mdsync_in_progress = true;
01051 
01052     /* Now scan the hashtable for fsync requests to process */
01053     absorb_counter = FSYNCS_PER_ABSORB;
01054     hash_seq_init(&hstat, pendingOpsTable);
01055     while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
01056     {
01057         ForkNumber  forknum;
01058 
01059         /*
01060          * If the entry is new then don't process it this time; it might
01061          * contain multiple fsync-request bits, but they are all new.  Note
01062          * "continue" bypasses the hash-remove call at the bottom of the loop.
01063          */
01064         if (entry->cycle_ctr == mdsync_cycle_ctr)
01065             continue;
01066 
01067         /* Else assert we haven't missed it */
01068         Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
01069 
01070         /*
01071          * Scan over the forks and segments represented by the entry.
01072          *
01073          * The bitmap manipulations are slightly tricky, because we can call
01074          * AbsorbFsyncRequests() inside the loop and that could result in
01075          * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
01076          * This is okay because we unlink each bitmapset from the hashtable
01077          * entry before scanning it.  That means that any incoming fsync
01078          * requests will be processed now if they reach the table before we
01079          * begin to scan their fork.
01080          */
01081         for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
01082         {
01083             Bitmapset  *requests = entry->requests[forknum];
01084             int         segno;
01085 
                  /*
                   * Detach the set and clear the cancel flag, so that a flag
                   * found set later must have been set while this fork was
                   * being scanned.
                   */
01086             entry->requests[forknum] = NULL;
01087             entry->canceled[forknum] = false;
01088 
                  /*
                   * NOTE(review): this loop only advances if
                   * bms_first_member() removes the member it returns --
                   * confirm against the bitmapset implementation.
                   */
01089             while ((segno = bms_first_member(requests)) >= 0)
01090             {
01091                 int         failures;
01092 
01093                 /*
01094                  * If fsync is off then we don't have to bother opening the
01095                  * file at all.  (We delay checking until this point so that
01096                  * changing fsync on the fly behaves sensibly.)
01097                  */
01098                 if (!enableFsync)
01099                     continue;
01100 
01101                 /*
01102                  * If in checkpointer, we want to absorb pending requests
01103                  * every so often to prevent overflow of the fsync request
01104                  * queue.  It is unspecified whether newly-added entries will
01105                  * be visited by hash_seq_search, but we don't care since we
01106                  * don't need to process them anyway.
01107                  */
01108                 if (--absorb_counter <= 0)
01109                 {
01110                     AbsorbFsyncRequests();
01111                     absorb_counter = FSYNCS_PER_ABSORB;
01112                 }
01113 
01114                 /*
01115                  * The fsync table could contain requests to fsync segments
01116                  * that have been deleted (unlinked) by the time we get to
01117                  * them. Rather than just hoping an ENOENT (or EACCES on
01118                  * Windows) error can be ignored, what we do on error is
01119                  * absorb pending requests and then retry.  Since mdunlink()
01120                  * queues a "cancel" message before actually unlinking, the
01121                  * fsync request is guaranteed to be marked canceled after the
01122                  * absorb if it really was this case. DROP DATABASE likewise
01123                  * has to tell us to forget fsync requests before it starts
01124                  * deletions.
01125                  */
01126                 for (failures = 0;; failures++) /* loop exits at "break" */
01127                 {
01128                     SMgrRelation reln;
01129                     MdfdVec    *seg;
01130                     char       *path;
01131                     int         save_errno;
01132 
01133                     /*
01134                      * Find or create an smgr hash entry for this relation.
01135                      * This may seem a bit unclean -- md calling smgr?  But
01136                      * it's really the best solution.  It ensures that the
01137                      * open file reference isn't permanently leaked if we get
01138                      * an error here. (You may say "but an unreferenced
01139                      * SMgrRelation is still a leak!" Not really, because the
01140                      * only case in which a checkpoint is done by a process
01141                      * that isn't about to shut down is in the checkpointer,
01142                      * and it will periodically do smgrcloseall(). This fact
01143                      * justifies our not closing the reln in the success path
01144                      * either, which is a good thing since in non-checkpointer
01145                      * cases we couldn't safely do that.)
01146                      */
01147                     reln = smgropen(entry->rnode, InvalidBackendId);
01148 
01149                     /* Attempt to open and fsync the target segment */
01150                     seg = _mdfd_getseg(reln, forknum,
01151                              (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
01152                                        false, EXTENSION_RETURN_NULL);
01153 
01154                     INSTR_TIME_SET_CURRENT(sync_start);
01155 
01156                     if (seg != NULL &&
01157                         FileSync(seg->mdfd_vfd) >= 0)
01158                     {
01159                         /* Success; update statistics about sync timing */
01160                         INSTR_TIME_SET_CURRENT(sync_end);
                              /* elapsed = sync_end - sync_start, in microseconds */
01161                         sync_diff = sync_end;
01162                         INSTR_TIME_SUBTRACT(sync_diff, sync_start);
01163                         elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
01164                         if (elapsed > longest)
01165                             longest = elapsed;
01166                         total_elapsed += elapsed;
01167                         processed++;
                              /* elapsed is in microseconds; divide to report msec */
01168                         if (log_checkpoints)
01169                             elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
01170                                  processed,
01171                                  FilePathName(seg->mdfd_vfd),
01172                                  (double) elapsed / 1000);
01173 
01174                         break;  /* out of retry loop */
01175                     }
01176 
                          /*
                           * errno is saved and restored around _mdfd_segpath()
                           * so that the %m in the messages below still reports
                           * the open/fsync failure.
                           */
01177                     /* Compute file name for use in message */
01178                     save_errno = errno;
01179                     path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
01180                     errno = save_errno;
01181 
01182                     /*
01183                      * It is possible that the relation has been dropped or
01184                      * truncated since the fsync request was entered.
01185                      * Therefore, allow ENOENT, but only if we didn't fail
01186                      * already on this file.  This applies both for
01187                      * _mdfd_getseg() and for FileSync, since fd.c might have
01188                      * closed the file behind our back.
01189                      *
01190                      * XXX is there any point in allowing more than one retry?
01191                      * Don't see one at the moment, but easy to change the
01192                      * test here if so.
01193                      */
01194                     if (!FILE_POSSIBLY_DELETED(errno) ||
01195                         failures > 0)
01196                         ereport(ERROR,
01197                                 (errcode_for_file_access(),
01198                                  errmsg("could not fsync file \"%s\": %m",
01199                                         path)));
01200                     else
01201                         ereport(DEBUG1,
01202                                 (errcode_for_file_access(),
01203                         errmsg("could not fsync file \"%s\" but retrying: %m",
01204                                path)));
01205                     pfree(path);
01206 
01207                     /*
01208                      * Absorb incoming requests and check to see if a cancel
01209                      * arrived for this relation fork.
01210                      */
01211                     AbsorbFsyncRequests();
01212                     absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
01213 
                          /* canceled: give up on this segment; else go around and retry */
01214                     if (entry->canceled[forknum])
01215                         break;
01216                 }               /* end retry loop */
01217             }
01218             bms_free(requests);
01219         }
01220 
01221         /*
01222          * We've finished everything that was requested before we started to
01223          * scan the entry.  If no new requests have been inserted meanwhile,
01224          * remove the entry.  Otherwise, update its cycle counter, as all the
01225          * requests now in it must have arrived during this cycle.
01226          */
01227         for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
01228         {
01229             if (entry->requests[forknum] != NULL)
01230                 break;
01231         }
              /* loop broke early: at least one fork has picked up new requests */
01232         if (forknum <= MAX_FORKNUM)
01233             entry->cycle_ctr = mdsync_cycle_ctr;
01234         else
01235         {
01236             /* Okay to remove it */
01237             if (hash_search(pendingOpsTable, &entry->rnode,
01238                             HASH_REMOVE, NULL) == NULL)
01239                 elog(ERROR, "pendingOpsTable corrupted");
01240         }
01241     }                           /* end loop over hashtable entries */
01242 
01243     /* Return sync performance metrics for report at checkpoint end */
01244     CheckpointStats.ckpt_sync_rels = processed;
01245     CheckpointStats.ckpt_longest_sync = longest;
01246     CheckpointStats.ckpt_agg_sync_time = total_elapsed;
01247 
01248     /* Flag successful completion of mdsync */
01249     mdsync_in_progress = false;
01250 }
01251 
01252 /*
01253  * mdpreckpt() -- Do pre-checkpoint work
01254  *
01255  * To distinguish unlink requests that arrived before this checkpoint
01256  * started from those that arrived during the checkpoint, we use a cycle
01257  * counter similar to the one we use for fsync requests. That cycle
01258  * counter is incremented here.
01259  *
01260  * This must be called *before* the checkpoint REDO point is determined.
01261  * That ensures that we won't delete files too soon.
01262  *
01263  * Note that we can't do anything here that depends on the assumption
01264  * that the checkpoint will be completed.
01265  */
01266 void
01267 mdpreckpt(void)
01268 {
01269     /*
01270      * Any unlink requests arriving after this point will be assigned the next
01271      * cycle counter, and won't be unlinked until next checkpoint.
01272      */
01273     mdckpt_cycle_ctr++;
01274 }
01275 
01276 /*
01277  * mdpostckpt() -- Do post-checkpoint work
01278  *
01279  * Remove any lingering files that can now be safely removed.
01280  */
01281 void
01282 mdpostckpt(void)
01283 {
01284     int         absorb_counter;
01285 
01286     absorb_counter = UNLINKS_PER_ABSORB;
01287     while (pendingUnlinks != NIL)
01288     {
01289         PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
01290         char       *path;
01291 
01292         /*
01293          * New entries are appended to the end, so if the entry is new we've
01294          * reached the end of old entries.
01295          *
01296          * Note: if just the right number of consecutive checkpoints fail, we
01297          * could be fooled here by cycle_ctr wraparound.  However, the only
01298          * consequence is that we'd delay unlinking for one more checkpoint,
01299          * which is perfectly tolerable.
01300          */
01301         if (entry->cycle_ctr == mdckpt_cycle_ctr)
01302             break;
01303 
01304         /* Unlink the file */
01305         path = relpathperm(entry->rnode, MAIN_FORKNUM);
01306         if (unlink(path) < 0)
01307         {
01308             /*
01309              * There's a race condition, when the database is dropped at the
01310              * same time that we process the pending unlink requests. If the
01311              * DROP DATABASE deletes the file before we do, we will get ENOENT
01312              * here. rmtree() also has to ignore ENOENT errors, to deal with
01313              * the possibility that we delete the file first.
01314              */
01315             if (errno != ENOENT)
01316                 ereport(WARNING,
01317                         (errcode_for_file_access(),
01318                          errmsg("could not remove file \"%s\": %m", path)));
01319         }
01320         pfree(path);
01321 
01322         /* And remove the list entry */
01323         pendingUnlinks = list_delete_first(pendingUnlinks);
01324         pfree(entry);
01325 
01326         /*
01327          * As in mdsync, we don't want to stop absorbing fsync requests for a
01328          * long time when there are many deletions to be done.  We can safely
01329          * call AbsorbFsyncRequests() at this point in the loop (note it might
01330          * try to delete list entries).
01331          */
01332         if (--absorb_counter <= 0)
01333         {
01334             AbsorbFsyncRequests();
01335             absorb_counter = UNLINKS_PER_ABSORB;
01336         }
01337     }
01338 }
01339 
01340 /*
01341  * register_dirty_segment() -- Mark a relation segment as needing fsync
01342  *
01343  * If there is a local pending-ops table, just make an entry in it for
01344  * mdsync to process later.  Otherwise, try to pass off the fsync request
01345  * to the checkpointer process.  If that fails, just do the fsync
01346  * locally before returning (we hope this will not happen often enough
01347  * to be a performance problem).
01348  */
01349 static void
01350 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
01351 {
01352     /* Temp relations should never be fsync'd */
01353     Assert(!SmgrIsTemp(reln));
01354 
01355     if (pendingOpsTable)
01356     {
01357         /* push it into local pending-ops table */
01358         RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
01359     }
01360     else
01361     {
01362         if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
01363             return;             /* passed it off successfully */
01364 
01365         ereport(DEBUG1,
01366                 (errmsg("could not forward fsync request because request queue is full")));
01367 
01368         if (FileSync(seg->mdfd_vfd) < 0)
01369             ereport(ERROR,
01370                     (errcode_for_file_access(),
01371                      errmsg("could not fsync file \"%s\": %m",
01372                             FilePathName(seg->mdfd_vfd))));
01373     }
01374 }
01375 
01376 /*
01377  * register_unlink() -- Schedule a file to be deleted after next checkpoint
01378  *
01379  * We don't bother passing in the fork number, because this is only used
01380  * with main forks.
01381  *
01382  * As with register_dirty_segment, this could involve either a local or
01383  * a remote pending-ops table.
01384  */
01385 static void
01386 register_unlink(RelFileNodeBackend rnode)
01387 {
01388     /* Should never be used with temp relations */
01389     Assert(!RelFileNodeBackendIsTemp(rnode));
01390 
01391     if (pendingOpsTable)
01392     {
01393         /* push it into local pending-ops table */
01394         RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
01395                              UNLINK_RELATION_REQUEST);
01396     }
01397     else
01398     {
01399         /*
01400          * Notify the checkpointer about it.  If we fail to queue the request
01401          * message, we have to sleep and try again, because we can't simply
01402          * delete the file now.  Ugly, but hopefully won't happen often.
01403          *
01404          * XXX should we just leave the file orphaned instead?
01405          */
01406         Assert(IsUnderPostmaster);
01407         while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
01408                                     UNLINK_RELATION_REQUEST))
01409             pg_usleep(10000L);  /* 10 msec seems a good number */
01410     }
01411 }
01412 
01413 /*
01414  * RememberFsyncRequest() -- callback from checkpointer side of fsync request
01415  *
01416  * We stuff fsync requests into the local hash table for execution
01417  * during the checkpointer's next checkpoint.  UNLINK requests go into a
01418  * separate linked list, however, because they get processed separately.
01419  *
01420  * The range of possible segment numbers is way less than the range of
01421  * BlockNumber, so we can reserve high values of segno for special purposes.
01422  * We define three:
01423  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
01424  *   either for one fork, or all forks if forknum is InvalidForkNumber
01425  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
01426  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
01427  *   checkpoint.
01428  * Note also that we're assuming real segment numbers don't exceed INT_MAX.
01429  *
01430  * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
01431  * table has to be searched linearly, but dropping a database is a pretty
01432  * heavyweight operation anyhow, so we'll live with it.)
01433  */
01434 void
01435 RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
01436 {
01437     Assert(pendingOpsTable);
01438 
01439     if (segno == FORGET_RELATION_FSYNC)
01440     {
01441         /* Remove any pending requests for the relation (one or all forks) */
01442         PendingOperationEntry *entry;
01443 
01444         entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
01445                                                       &rnode,
01446                                                       HASH_FIND,
01447                                                       NULL);
01448         if (entry)
01449         {
01450             /*
01451              * We can't just delete the entry since mdsync could have an
01452              * active hashtable scan.  Instead we delete the bitmapsets; this
01453              * is safe because of the way mdsync is coded.  We also set the
01454              * "canceled" flags so that mdsync can tell that a cancel arrived
01455              * for the fork(s).
01456              */
01457             if (forknum == InvalidForkNumber)
01458             {
01459                 /* remove requests for all forks */
01460                 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
01461                 {
01462                     bms_free(entry->requests[forknum]);
01463                     entry->requests[forknum] = NULL;
01464                     entry->canceled[forknum] = true;
01465                 }
01466             }
01467             else
01468             {
01469                 /* remove requests for single fork */
01470                 bms_free(entry->requests[forknum]);
01471                 entry->requests[forknum] = NULL;
01472                 entry->canceled[forknum] = true;
01473             }
01474         }
01475     }
01476     else if (segno == FORGET_DATABASE_FSYNC)
01477     {
01478         /* Remove any pending requests for the entire database */
01479         HASH_SEQ_STATUS hstat;
01480         PendingOperationEntry *entry;
01481         ListCell   *cell,
01482                    *prev,
01483                    *next;
01484 
01485         /* Remove fsync requests */
01486         hash_seq_init(&hstat, pendingOpsTable);
01487         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
01488         {
01489             if (entry->rnode.dbNode == rnode.dbNode)
01490             {
01491                 /* remove requests for all forks */
01492                 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
01493                 {
01494                     bms_free(entry->requests[forknum]);
01495                     entry->requests[forknum] = NULL;
01496                     entry->canceled[forknum] = true;
01497                 }
01498             }
01499         }
01500 
01501         /* Remove unlink requests */
01502         prev = NULL;
01503         for (cell = list_head(pendingUnlinks); cell; cell = next)
01504         {
01505             PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
01506 
01507             next = lnext(cell);
01508             if (entry->rnode.dbNode == rnode.dbNode)
01509             {
01510                 pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
01511                 pfree(entry);
01512             }
01513             else
01514                 prev = cell;
01515         }
01516     }
01517     else if (segno == UNLINK_RELATION_REQUEST)
01518     {
01519         /* Unlink request: put it in the linked list */
01520         MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
01521         PendingUnlinkEntry *entry;
01522 
01523         /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
01524         Assert(forknum == MAIN_FORKNUM);
01525 
01526         entry = palloc(sizeof(PendingUnlinkEntry));
01527         entry->rnode = rnode;
01528         entry->cycle_ctr = mdckpt_cycle_ctr;
01529 
01530         pendingUnlinks = lappend(pendingUnlinks, entry);
01531 
01532         MemoryContextSwitchTo(oldcxt);
01533     }
01534     else
01535     {
01536         /* Normal case: enter a request to fsync this segment */
01537         MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
01538         PendingOperationEntry *entry;
01539         bool        found;
01540 
01541         entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
01542                                                       &rnode,
01543                                                       HASH_ENTER,
01544                                                       &found);
01545         /* if new entry, initialize it */
01546         if (!found)
01547         {
01548             entry->cycle_ctr = mdsync_cycle_ctr;
01549             MemSet(entry->requests, 0, sizeof(entry->requests));
01550             MemSet(entry->canceled, 0, sizeof(entry->canceled));
01551         }
01552 
01553         /*
01554          * NB: it's intentional that we don't change cycle_ctr if the entry
01555          * already exists.  The cycle_ctr must represent the oldest fsync
01556          * request that could be in the entry.
01557          */
01558 
01559         entry->requests[forknum] = bms_add_member(entry->requests[forknum],
01560                                                   (int) segno);
01561 
01562         MemoryContextSwitchTo(oldcxt);
01563     }
01564 }
01565 
01566 /*
01567  * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
01568  *
01569  * forknum == InvalidForkNumber means all forks, although this code doesn't
01570  * actually know that, since it's just forwarding the request elsewhere.
01571  */
01572 void
01573 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
01574 {
01575     if (pendingOpsTable)
01576     {
01577         /* standalone backend or startup process: fsync state is local */
01578         RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
01579     }
01580     else if (IsUnderPostmaster)
01581     {
01582         /*
01583          * Notify the checkpointer about it.  If we fail to queue the cancel
01584          * message, we have to sleep and try again ... ugly, but hopefully
01585          * won't happen often.
01586          *
01587          * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
01588          * error would leave the no-longer-used file still present on disk,
01589          * which would be bad, so I'm inclined to assume that the checkpointer
01590          * will always empty the queue soon.
01591          */
01592         while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
01593             pg_usleep(10000L);  /* 10 msec seems a good number */
01594 
01595         /*
01596          * Note we don't wait for the checkpointer to actually absorb the
01597          * cancel message; see mdsync() for the implications.
01598          */
01599     }
01600 }
01601 
01602 /*
01603  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
01604  */
01605 void
01606 ForgetDatabaseFsyncRequests(Oid dbid)
01607 {
01608     RelFileNode rnode;
01609 
01610     rnode.dbNode = dbid;
01611     rnode.spcNode = 0;
01612     rnode.relNode = 0;
01613 
01614     if (pendingOpsTable)
01615     {
01616         /* standalone backend or startup process: fsync state is local */
01617         RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
01618     }
01619     else if (IsUnderPostmaster)
01620     {
01621         /* see notes in ForgetRelationFsyncRequests */
01622         while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
01623                                     FORGET_DATABASE_FSYNC))
01624             pg_usleep(10000L);  /* 10 msec seems a good number */
01625     }
01626 }
01627 
01628 
01629 /*
01630  *  _fdvec_alloc() -- Make a MdfdVec object.
01631  */
01632 static MdfdVec *
01633 _fdvec_alloc(void)
01634 {
01635     return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
01636 }
01637 
01638 /*
01639  * Return the filename for the specified segment of the relation. The
01640  * returned string is palloc'd.
01641  */
01642 static char *
01643 _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
01644 {
01645     char       *path,
01646                *fullpath;
01647 
01648     path = relpath(reln->smgr_rnode, forknum);
01649 
01650     if (segno > 0)
01651     {
01652         /* be sure we have enough space for the '.segno' */
01653         fullpath = (char *) palloc(strlen(path) + 12);
01654         sprintf(fullpath, "%s.%u", path, segno);
01655         pfree(path);
01656     }
01657     else
01658         fullpath = path;
01659 
01660     return fullpath;
01661 }
01662 
01663 /*
01664  * Open the specified segment of the relation,
01665  * and make a MdfdVec object for it.  Returns NULL on failure.
01666  */
01667 static MdfdVec *
01668 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
01669               int oflags)
01670 {
01671     MdfdVec    *v;
01672     int         fd;
01673     char       *fullpath;
01674 
01675     fullpath = _mdfd_segpath(reln, forknum, segno);
01676 
01677     /* open the file */
01678     fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
01679 
01680     pfree(fullpath);
01681 
01682     if (fd < 0)
01683         return NULL;
01684 
01685     /* allocate an mdfdvec entry for it */
01686     v = _fdvec_alloc();
01687 
01688     /* fill the entry */
01689     v->mdfd_vfd = fd;
01690     v->mdfd_segno = segno;
01691     v->mdfd_chain = NULL;
01692     Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
01693 
01694     /* all done */
01695     return v;
01696 }
01697 
01698 /*
01699  *  _mdfd_getseg() -- Find the segment of the relation holding the
01700  *      specified block.
01701  *
01702  * If the segment doesn't exist, we ereport, return NULL, or create the
01703  * segment, according to "behavior".  Note: skipFsync is only used in the
01704  * EXTENSION_CREATE case.
01705  */
01706 static MdfdVec *
01707 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
01708              bool skipFsync, ExtensionBehavior behavior)
01709 {
01710     MdfdVec    *v = mdopen(reln, forknum, behavior);
01711     BlockNumber targetseg;
01712     BlockNumber nextsegno;
01713 
01714     if (!v)
01715         return NULL;            /* only possible if EXTENSION_RETURN_NULL */
01716 
01717     targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
01718     for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
01719     {
01720         Assert(nextsegno == v->mdfd_segno + 1);
01721 
01722         if (v->mdfd_chain == NULL)
01723         {
01724             /*
01725              * Normally we will create new segments only if authorized by the
01726              * caller (i.e., we are doing mdextend()).  But when doing WAL
01727              * recovery, create segments anyway; this allows cases such as
01728              * replaying WAL data that has a write into a high-numbered
01729              * segment of a relation that was later deleted.  We want to go
01730              * ahead and create the segments so we can finish out the replay.
01731              *
01732              * We have to maintain the invariant that segments before the last
01733              * active segment are of size RELSEG_SIZE; therefore, pad them out
01734              * with zeroes if needed.  (This only matters if caller is
01735              * extending the relation discontiguously, but that can happen in
01736              * hash indexes.)
01737              */
01738             if (behavior == EXTENSION_CREATE || InRecovery)
01739             {
01740                 if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
01741                 {
01742                     char       *zerobuf = palloc0(BLCKSZ);
01743 
01744                     mdextend(reln, forknum,
01745                              nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
01746                              zerobuf, skipFsync);
01747                     pfree(zerobuf);
01748                 }
01749                 v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
01750             }
01751             else
01752             {
01753                 /* We won't create segment if not existent */
01754                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
01755             }
01756             if (v->mdfd_chain == NULL)
01757             {
01758                 if (behavior == EXTENSION_RETURN_NULL &&
01759                     FILE_POSSIBLY_DELETED(errno))
01760                     return NULL;
01761                 ereport(ERROR,
01762                         (errcode_for_file_access(),
01763                    errmsg("could not open file \"%s\" (target block %u): %m",
01764                           _mdfd_segpath(reln, forknum, nextsegno),
01765                           blkno)));
01766             }
01767         }
01768         v = v->mdfd_chain;
01769     }
01770     return v;
01771 }
01772 
01773 /*
01774  * Get number of blocks present in a single disk file
01775  */
01776 static BlockNumber
01777 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
01778 {
01779     off_t       len;
01780 
01781     len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
01782     if (len < 0)
01783         ereport(ERROR,
01784                 (errcode_for_file_access(),
01785                  errmsg("could not seek to end of file \"%s\": %m",
01786                         FilePathName(seg->mdfd_vfd))));
01787     /* note that this calculation will ignore any partial block at EOF */
01788     return (BlockNumber) (len / BLCKSZ);
01789 }