#include "postgres.h"
#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>
#include "miscadmin.h"
#include "access/xlog.h"
#include "catalog/catalog.h"
#include "common/relpath.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "pg_trace.h"

Include dependency graph for md.c:

Data Structures
struct	_MdfdVec
struct	PendingOperationEntry
struct	PendingUnlinkEntry
Defines
#define	FSYNCS_PER_ABSORB 10
#define	UNLINKS_PER_ABSORB 10
#define	FORGET_RELATION_FSYNC (InvalidBlockNumber)
#define	FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
#define	UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
#define	FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
Typedefs
typedef struct _MdfdVec	MdfdVec
typedef uint16	CycleCtr
Enumerations
enum	ExtensionBehavior { EXTENSION_FAIL, EXTENSION_RETURN_NULL, EXTENSION_CREATE }
Functions
static void	mdunlinkfork (RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
static MdfdVec *	mdopen (SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
static void	register_dirty_segment (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
static void	register_unlink (RelFileNodeBackend rnode)
static MdfdVec *	_fdvec_alloc (void)
static char *	_mdfd_segpath (SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
static MdfdVec *	_mdfd_openseg (SMgrRelation reln, ForkNumber forkno, BlockNumber segno, int oflags)
static MdfdVec *	_mdfd_getseg (SMgrRelation reln, ForkNumber forkno, BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior)
static BlockNumber	_mdnblocks (SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
void	mdinit (void)
void	SetForwardFsyncRequests (void)
bool	mdexists (SMgrRelation reln, ForkNumber forkNum)
void	mdcreate (SMgrRelation reln, ForkNumber forkNum, bool isRedo)
void	mdunlink (RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
void	mdextend (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
void	mdclose (SMgrRelation reln, ForkNumber forknum)
void	mdprefetch (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
void	mdread (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer)
void	mdwrite (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync)
BlockNumber	mdnblocks (SMgrRelation reln, ForkNumber forknum)
void	mdtruncate (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
void	mdimmedsync (SMgrRelation reln, ForkNumber forknum)
void	mdsync (void)
void	mdpreckpt (void)
void	mdpostckpt (void)
void	RememberFsyncRequest (RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
void	ForgetRelationFsyncRequests (RelFileNode rnode, ForkNumber forknum)
void	ForgetDatabaseFsyncRequests (Oid dbid)
Variables
static MemoryContext	MdCxt
static HTAB *	pendingOpsTable = NULL
static List *	pendingUnlinks = NIL
static CycleCtr	mdsync_cycle_ctr = 0
static CycleCtr	mdckpt_cycle_ctr = 0

Define Documentation

#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)

Definition at line 66 of file md.c.

Referenced by _mdfd_getseg(), mdopen(), and mdsync().

#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)

Definition at line 55 of file md.c.

Referenced by ForgetDatabaseFsyncRequests() and RememberFsyncRequest().

#define FORGET_RELATION_FSYNC (InvalidBlockNumber)

Definition at line 54 of file md.c.

Referenced by ForgetRelationFsyncRequests() and RememberFsyncRequest().

#define FSYNCS_PER_ABSORB 10

Definition at line 44 of file md.c.

#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)

Definition at line 56 of file md.c.

Referenced by register_unlink() and RememberFsyncRequest().

#define UNLINKS_PER_ABSORB 10

Definition at line 45 of file md.c.

Typedef Documentation

typedef uint16 CycleCtr

Definition at line 141 of file md.c.

typedef struct _MdfdVec MdfdVec

Enumeration Type Documentation

enum ExtensionBehavior

Enumerator:

EXTENSION_FAIL
EXTENSION_RETURN_NULL
EXTENSION_CREATE

Definition at line 166 of file md.c.

/*
 * Selector passed to mdopen() and _mdfd_getseg() that says what to do when
 * the file or segment being looked for does not exist.
 */
{
    EXTENSION_FAIL,             /* ereport(ERROR) if segment not present */
    EXTENSION_RETURN_NULL,      /* return NULL if not present (open failed
                                 * with ENOENT) */
    EXTENSION_CREATE            /* create new segments as needed */
} ExtensionBehavior;

Function Documentation

static MdfdVec * _fdvec_alloc ( void ) [static]

Definition at line 1633 of file md.c.

References MemoryContextAlloc().

Referenced by _mdfd_openseg(), mdcreate(), and mdopen().

/*
 * Allocate room for one MdfdVec in the md-private memory context MdCxt.
 * The fields are not set here; every caller fills them in itself.
 */
{
    MdfdVec    *vec;

    vec = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));

    return vec;
}

static MdfdVec * _mdfd_getseg	(	SMgrRelation	reln,
		ForkNumber	forkno,
		BlockNumber	blkno,
		bool	skipFsync,
		ExtensionBehavior	behavior
	)			`[static]`

Definition at line 1707 of file md.c.

References _mdfd_openseg(), _mdfd_segpath(), _mdnblocks(), Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_CREATE, EXTENSION_RETURN_NULL, FILE_POSSIBLY_DELETED, InRecovery, mdextend(), _MdfdVec::mdfd_chain, _MdfdVec::mdfd_segno, mdopen(), NULL, palloc0(), and pfree().

Referenced by mdextend(), mdprefetch(), mdread(), mdsync(), and mdwrite().

/*
 * Find (opening it if necessary) the segment of the given relation fork
 * that contains block blkno, walking the segment chain from segment 0.
 *
 * Returns the MdfdVec of the target segment.  Returns NULL only when
 * behavior is EXTENSION_RETURN_NULL and a needed file could not be opened
 * because it does not exist (ENOENT); any other open failure is reported
 * with ereport(ERROR).
 *
 * skipFsync is handed through to mdextend() when an earlier segment has to
 * be padded out to RELSEG_SIZE blocks.
 *
 * NOTE(review): the signature shown for this function names the fork
 * parameter "forkno" while the body uses "forknum" --- confirm which name
 * the definition actually declares.
 */
{
    MdfdVec    *v = mdopen(reln, forknum, behavior);
    BlockNumber targetseg;
    BlockNumber nextsegno;

    if (!v)
        return NULL;            /* only possible if EXTENSION_RETURN_NULL */

    /* segment 0 is already open; step forward one segment at a time */
    targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
    for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
    {
        Assert(nextsegno == v->mdfd_segno + 1);

        if (v->mdfd_chain == NULL)
        {
            /*
             * Normally we will create new segments only if authorized by the
             * caller (i.e., we are doing mdextend()).  But when doing WAL
             * recovery, create segments anyway; this allows cases such as
             * replaying WAL data that has a write into a high-numbered
             * segment of a relation that was later deleted.  We want to go
             * ahead and create the segments so we can finish out the replay.
             *
             * We have to maintain the invariant that segments before the last
             * active segment are of size RELSEG_SIZE; therefore, pad them out
             * with zeroes if needed.  (This only matters if caller is
             * extending the relation discontiguously, but that can happen in
             * hash indexes.)
             */
            if (behavior == EXTENSION_CREATE || InRecovery)
            {
                if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
                {
                    char       *zerobuf = palloc0(BLCKSZ);

                    /* writing the last block of the segment pads it out */
                    mdextend(reln, forknum,
                             nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
                             zerobuf, skipFsync);
                    pfree(zerobuf);
                }
                v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, O_CREAT);
            }
            else
            {
                /* We won't create segment if not existent */
                v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
            }
            if (v->mdfd_chain == NULL)
            {
                /* a missing file is tolerated only for EXTENSION_RETURN_NULL */
                if (behavior == EXTENSION_RETURN_NULL &&
                    FILE_POSSIBLY_DELETED(errno))
                    return NULL;
                ereport(ERROR,
                        (errcode_for_file_access(),
                   errmsg("could not open file \"%s\" (target block %u): %m",
                          _mdfd_segpath(reln, forknum, nextsegno),
                          blkno)));
            }
        }
        v = v->mdfd_chain;
    }
    return v;
}

static MdfdVec * _mdfd_openseg	(	SMgrRelation	reln,
		ForkNumber	forkno,
		BlockNumber	segno,
		int	oflags
	)			`[static]`

Definition at line 1668 of file md.c.

References _fdvec_alloc(), _mdfd_segpath(), _mdnblocks(), Assert, _MdfdVec::mdfd_chain, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, PathNameOpenFile(), pfree(), and PG_BINARY.

Referenced by _mdfd_getseg() and mdnblocks().

/*
 * Open segment number segno of the given relation fork and wrap it in a
 * freshly allocated MdfdVec.  oflags is OR'ed into the open flags, so a
 * caller can pass O_CREAT to have a missing segment created.
 *
 * Returns NULL, without raising an error, if the file cannot be opened.
 *
 * NOTE(review): the signature shown for this function names the fork
 * parameter "forkno" while the body uses "forknum" --- confirm which name
 * the definition actually declares.
 */
{
    char       *segpath = _mdfd_segpath(reln, forknum, segno);
    int         vfd;
    MdfdVec    *seg;

    /* try the open, then drop the path string whatever the outcome */
    vfd = PathNameOpenFile(segpath, O_RDWR | PG_BINARY | oflags, 0600);
    pfree(segpath);

    if (vfd < 0)
        return NULL;

    /* build the chain entry describing the newly opened segment */
    seg = _fdvec_alloc();
    seg->mdfd_chain = NULL;
    seg->mdfd_segno = segno;
    seg->mdfd_vfd = vfd;

    /* no segment may hold more than RELSEG_SIZE blocks */
    Assert(_mdnblocks(reln, forknum, seg) <= ((BlockNumber) RELSEG_SIZE));

    return seg;
}

static char * _mdfd_segpath	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	segno
	)			`[static]`

Definition at line 1643 of file md.c.

References palloc(), pfree(), relpath, and SMgrRelationData::smgr_rnode.

Referenced by _mdfd_getseg(), _mdfd_openseg(), mdnblocks(), and mdsync().

/*
 * Build the file path of segment segno of the given relation fork.
 *
 * Segment 0 uses the plain relation path; later segments get a ".segno"
 * suffix.  The result is a palloc'd string that the caller must pfree.
 */
{
    char       *path,
               *fullpath;

    path = relpath(reln->smgr_rnode, forknum);

    if (segno > 0)
    {
        /*
         * 12 extra bytes: the '.', up to 10 decimal digits (enough for a
         * 32-bit segno), and the terminating NUL.  Format with an explicit
         * bound so the write can never run past the allocation.
         */
        size_t      fullpathlen = strlen(path) + 12;

        fullpath = (char *) palloc(fullpathlen);
        snprintf(fullpath, fullpathlen, "%s.%u", path, segno);
        pfree(path);
    }
    else
        fullpath = path;

    return fullpath;
}

static BlockNumber _mdnblocks	(	SMgrRelation	reln,
		ForkNumber	forknum,
		MdfdVec *	seg
	)			`[static]`

Definition at line 1777 of file md.c.

References ereport, errcode_for_file_access(), errmsg(), ERROR, FilePathName(), FileSeek(), and _MdfdVec::mdfd_vfd.

Referenced by _mdfd_getseg(), _mdfd_openseg(), mdextend(), mdnblocks(), and mdopen().

/*
 * Return the number of whole blocks in one segment file, found by seeking
 * to the end of the file.  A failed seek is reported with ereport(ERROR).
 */
{
    off_t       endpos = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);

    if (endpos < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to end of file \"%s\": %m",
                        FilePathName(seg->mdfd_vfd))));

    /* integer division drops any partial block at EOF */
    return (BlockNumber) (endpos / BLCKSZ);
}

void ForgetDatabaseFsyncRequests ( Oid dbid )

Definition at line 1606 of file md.c.

References RelFileNode::dbNode, FORGET_DATABASE_FSYNC, ForwardFsyncRequest(), InvalidForkNumber, IsUnderPostmaster, pg_usleep(), RelFileNode::relNode, RememberFsyncRequest(), and RelFileNode::spcNode.

Referenced by dbase_redo(), and dropdb().

/*
 * Issue a FORGET_DATABASE_FSYNC request for database dbid.
 *
 * If this process has its own pendingOpsTable the request is recorded
 * locally; otherwise, when running under the postmaster, it is forwarded
 * via ForwardFsyncRequest(), retrying until it is accepted.
 */
{
    RelFileNode dbkey;

    /* only the database OID is meaningful in this request */
    dbkey.spcNode = 0;
    dbkey.dbNode = dbid;
    dbkey.relNode = 0;

    if (pendingOpsTable)
    {
        /* standalone backend or startup process: fsync state is local */
        RememberFsyncRequest(dbkey, InvalidForkNumber, FORGET_DATABASE_FSYNC);
        return;
    }

    if (!IsUnderPostmaster)
        return;

    /* keep retrying, sleeping 10 msec between attempts, until queued */
    for (;;)
    {
        if (ForwardFsyncRequest(dbkey, InvalidForkNumber,
                                FORGET_DATABASE_FSYNC))
            break;
        pg_usleep(10000L);
    }
}

void ForgetRelationFsyncRequests	(	RelFileNode	rnode,
		ForkNumber	forknum
	)

Definition at line 1573 of file md.c.

References FORGET_RELATION_FSYNC, ForwardFsyncRequest(), IsUnderPostmaster, pg_usleep(), and RememberFsyncRequest().

Referenced by mdunlink().

/*
 * Issue a FORGET_RELATION_FSYNC request for one fork of a relation.
 *
 * If this process has its own pendingOpsTable the request is recorded
 * locally; otherwise, when running under the postmaster, it is forwarded
 * via ForwardFsyncRequest(), retrying until it is accepted.
 */
{
    if (pendingOpsTable)
    {
        /* standalone backend or startup process: fsync state is local */
        RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
        return;
    }

    if (!IsUnderPostmaster)
        return;

    /*
     * Tell the checkpointer.  If the cancel message cannot be queued, sleep
     * 10 msec and retry; ugly, but it should be rare.
     *
     * XXX should this loop CHECK_FOR_INTERRUPTS?  Bailing out with an error
     * would leave the no-longer-used file on disk, which would be bad, so
     * we assume the checkpointer will empty the queue soon.
     */
    for (;;)
    {
        if (ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
            break;
        pg_usleep(10000L);
    }

    /*
     * We do not wait for the checkpointer to actually absorb the cancel
     * message; see mdsync() for the implications.
     */
}

void mdclose	(	SMgrRelation	reln,
		ForkNumber	forknum
	)

Definition at line 607 of file md.c.

References FileClose(), SMgrRelationData::md_fd, _MdfdVec::mdfd_chain, _MdfdVec::mdfd_vfd, NULL, and pfree().

Referenced by mdexists().

/*
 * Close every open segment of the given relation fork and free the whole
 * MdfdVec chain.  Does nothing if the fork is not open.
 */
{
    MdfdVec    *seg = reln->md_fd[forknum];
    MdfdVec    *next;

    if (seg == NULL)
        return;                 /* nothing open */

    /* detach the chain first so an error cannot leave a dangling pointer */
    reln->md_fd[forknum] = NULL;

    for (; seg != NULL; seg = next)
    {
        /* grab the link before this entry is freed */
        next = seg->mdfd_chain;

        /* a negative vfd means this segment is already closed */
        if (seg->mdfd_vfd >= 0)
            FileClose(seg->mdfd_vfd);

        pfree(seg);
    }
}

void mdcreate	(	SMgrRelation	reln,
		ForkNumber	forkNum,
		bool	isRedo
	)

Definition at line 273 of file md.c.

References _fdvec_alloc(), Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, IsBootstrapProcessingMode, SMgrRelationData::md_fd, _MdfdVec::mdfd_chain, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, NULL, PathNameOpenFile(), pfree(), PG_BINARY, relpath, and SMgrRelationData::smgr_rnode.

/*
 * Create the first segment file of the given relation fork and record it
 * in reln->md_fd[forkNum].
 *
 * With isRedo set, an already-open fork is left alone and an already
 * existing file is simply opened.  An existing file is also accepted in
 * bootstrap mode.  Any other failure is reported with ereport(ERROR).
 */
{
    char       *relfile;
    File        vfd;
    MdfdVec    *seg;

    /* during redo the fork may have been created and opened already */
    if (isRedo && reln->md_fd[forkNum] != NULL)
        return;

    Assert(reln->md_fd[forkNum] == NULL);

    relfile = relpath(reln->smgr_rnode, forkNum);

    vfd = PathNameOpenFile(relfile,
                           O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

    if (vfd < 0)
    {
        int         create_errno = errno;

        /*
         * In bootstrap mode a system relation can be accessed (by internal
         * backend processes) before the bootstrap script nominally creates
         * it, so tolerate a pre-existing file there as well as during redo.
         * (See also mdopen)
         */
        if (isRedo || IsBootstrapProcessingMode())
            vfd = PathNameOpenFile(relfile, O_RDWR | PG_BINARY, 0600);

        if (vfd < 0)
        {
            /* report the error from the create attempt, not the re-open */
            errno = create_errno;
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not create file \"%s\": %m", relfile)));
        }
    }

    pfree(relfile);

    /* describe segment 0 and hang it on the relation */
    seg = _fdvec_alloc();
    seg->mdfd_vfd = vfd;
    seg->mdfd_segno = 0;
    seg->mdfd_chain = NULL;

    reln->md_fd[forkNum] = seg;
}

bool mdexists	(	SMgrRelation	reln,
		ForkNumber	forkNum
	)

Definition at line 256 of file md.c.

References EXTENSION_RETURN_NULL, mdclose(), mdopen(), and NULL.

/*
 * Report whether the given relation fork can currently be opened.
 */
{
    MdfdVec    *seg;

    /*
     * Drop any handle we already hold, so that a fork unlinked since we
     * opened it is noticed.
     */
    mdclose(reln, forkNum);

    seg = mdopen(reln, forkNum, EXTENSION_RETURN_NULL);

    return (seg != NULL);
}

void mdextend	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		char *	buffer,
		bool	skipFsync
	)

Definition at line 474 of file md.c.

References _mdfd_getseg(), _mdnblocks(), Assert, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_CREATE, FilePathName(), FileSeek(), FileWrite(), InvalidBlockNumber, _MdfdVec::mdfd_vfd, mdnblocks(), register_dirty_segment(), relpath, SMgrRelationData::smgr_rnode, and SmgrIsTemp.

Referenced by _mdfd_getseg().

/*
 * Write one block of BLCKSZ bytes from buffer at position blocknum of the
 * given relation fork, creating segments as needed (EXTENSION_CREATE).
 *
 * Unless skipFsync is set or the relation is temporary, the segment written
 * to is registered as dirty.  Seek failures, write failures and short
 * writes are all reported with ereport(ERROR).
 */
{
    MdfdVec    *seg;
    off_t       offset;
    int         written;

    /* This check is too expensive to leave enabled normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
    Assert(blocknum >= mdnblocks(reln, forknum));
#endif

    /*
     * Refuse to grow a relation to 2^32-1 blocks: a block whose number
     * actually is InvalidBlockNumber must never come into existence.
     */
    if (blocknum == InvalidBlockNumber)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("cannot extend file \"%s\" beyond %u blocks",
                        relpath(reln->smgr_rnode, forknum),
                        InvalidBlockNumber)));

    seg = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);

    /* byte offset of the block within its segment file */
    offset = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(offset < (off_t) BLCKSZ * RELSEG_SIZE);

    /*
     * The caller usually got blocknum from mdnblocks, which already did a
     * seek(SEEK_END), so this seek is often redundant and fd.c will optimize
     * it away.  It does matter when there is a partial page at EOF (we want
     * to overwrite it with a full page), and when bufmgr.c had to dump
     * another buffer of the same file to make room for the new page's
     * buffer.
     */
    if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to block %u in file \"%s\": %m",
                        blocknum, FilePathName(seg->mdfd_vfd))));

    written = FileWrite(seg->mdfd_vfd, buffer, BLCKSZ);
    if (written != BLCKSZ)
    {
        /* outright failure: errno describes the problem */
        if (written < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not extend file \"%s\": %m",
                            FilePathName(seg->mdfd_vfd)),
                     errhint("Check free disk space.")));

        /* short write: complain appropriately */
        ereport(ERROR,
                (errcode(ERRCODE_DISK_FULL),
                 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
                        FilePathName(seg->mdfd_vfd),
                        written, BLCKSZ, blocknum),
                 errhint("Check free disk space.")));
    }

    /* temp relations and skipFsync callers do not register for fsync */
    if (!(skipFsync || SmgrIsTemp(reln)))
        register_dirty_segment(reln, forknum, seg);

    Assert(_mdnblocks(reln, forknum, seg) <= ((BlockNumber) RELSEG_SIZE));
}

void mdimmedsync	(	SMgrRelation	reln,
		ForkNumber	forknum
	)

Definition at line 950 of file md.c.

References ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_FAIL, FilePathName(), FileSync(), _MdfdVec::mdfd_chain, _MdfdVec::mdfd_vfd, mdnblocks(), mdopen(), and NULL.

/*
 * Immediately fsync every segment of the given relation fork.  A failed
 * fsync is reported with ereport(ERROR).
 */
{
    MdfdVec    *seg;

    /*
     * NOTE: the result is not needed; mdnblocks is called because it makes
     * sure all active segments are open, so the loop below gets them all.
     */
    mdnblocks(reln, forknum);

    for (seg = mdopen(reln, forknum, EXTENSION_FAIL);
         seg != NULL;
         seg = seg->mdfd_chain)
    {
        if (FileSync(seg->mdfd_vfd) < 0)
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not fsync file \"%s\": %m",
                            FilePathName(seg->mdfd_vfd))));
    }
}

void mdinit ( void )

Definition at line 196 of file md.c.

References ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, ALLOCSET_DEFAULT_MINSIZE, AllocSetContextCreate(), AmCheckpointerProcess, AmStartupProcess, HASHCTL::entrysize, HASHCTL::hash, HASH_CONTEXT, hash_create(), HASH_ELEM, HASH_FUNCTION, HASHCTL::hcxt, IsUnderPostmaster, HASHCTL::keysize, MemSet, and TopMemoryContext.

/*
 * Initialize private state: create the MdCxt memory context and, in the
 * processes that need one, the pending-operations hash table.
 */
{
    HASHCTL     ctl;

    MdCxt = AllocSetContextCreate(TopMemoryContext,
                                  "MdSmgr",
                                  ALLOCSET_DEFAULT_MINSIZE,
                                  ALLOCSET_DEFAULT_INITSIZE,
                                  ALLOCSET_DEFAULT_MAXSIZE);

    /*
     * The pending-operations table is needed only when standalone (not
     * under a postmaster) or in a startup or checkpointer auxiliary
     * process.  Everyone else is done here.
     */
    if (IsUnderPostmaster && !AmStartupProcess() && !AmCheckpointerProcess())
        return;

    /* table is keyed by RelFileNode and allocated inside MdCxt */
    MemSet(&ctl, 0, sizeof(ctl));
    ctl.keysize = sizeof(RelFileNode);
    ctl.entrysize = sizeof(PendingOperationEntry);
    ctl.hash = tag_hash;
    ctl.hcxt = MdCxt;

    pendingOpsTable = hash_create("Pending Ops Table",
                                  100L,
                                  &ctl,
                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
    pendingUnlinks = NIL;
}

BlockNumber mdnblocks	(	SMgrRelation	reln,
		ForkNumber	forknum
	)

Definition at line 795 of file md.c.

References _mdfd_openseg(), _mdfd_segpath(), _mdnblocks(), elog, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_FAIL, FATAL, _MdfdVec::mdfd_chain, mdopen(), and NULL.

Referenced by mdextend(), mdimmedsync(), mdtruncate(), and mdwrite().

/*
 * Return the number of blocks in the given relation fork.
 *
 * As a side effect, whenever the last segment examined is exactly
 * RELSEG_SIZE blocks long the following segment is opened with O_CREAT,
 * so it is created (zero length) if it did not exist.
 */
{
    MdfdVec    *seg = mdopen(reln, forknum, EXTENSION_FAIL);
    BlockNumber segidx = 0;
    BlockNumber segblocks;

    /*
     * Jump straight to the last segment already in the chain, avoiding
     * redundant seeks.  The earlier ones were previously verified to be
     * exactly RELSEG_SIZE long and there is no point rechecking each time.
     *
     * NOTE: that could only be wrong if another backend truncated the
     * relation.  Higher code levels deal with that by closing and
     * re-opening the md fd, which happens via relcache flush.  (The
     * checkpointer takes no part in relcache flush, so it may hold chain
     * entries for inactive segments; that is fine because it never needs
     * to compute a relation size.)
     */
    for (; seg->mdfd_chain != NULL; seg = seg->mdfd_chain)
        segidx++;

    for (;;)
    {
        segblocks = _mdnblocks(reln, forknum, seg);

        if (segblocks > ((BlockNumber) RELSEG_SIZE))
            elog(FATAL, "segment too big");

        /* a short segment is the last one: total it up and finish */
        if (segblocks < ((BlockNumber) RELSEG_SIZE))
            return (segidx * ((BlockNumber) RELSEG_SIZE)) + segblocks;

        /* segment is exactly RELSEG_SIZE blocks; move on to the next */
        segidx++;

        if (seg->mdfd_chain == NULL)
        {
            /*
             * O_CREAT means the next segment is created (zero length) right
             * away when the last one is full.  Perhaps not strictly needed,
             * but it keeps the logic simple.
             */
            seg->mdfd_chain = _mdfd_openseg(reln, forknum, segidx, O_CREAT);
            if (seg->mdfd_chain == NULL)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not open file \"%s\": %m",
                                _mdfd_segpath(reln, forknum, segidx))));
        }

        seg = seg->mdfd_chain;
    }
}

static MdfdVec * mdopen	(	SMgrRelation	reln,
		ForkNumber	forknum,
		ExtensionBehavior	behavior
	)			`[static]`

Definition at line 553 of file md.c.

References _fdvec_alloc(), _mdnblocks(), Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_RETURN_NULL, FILE_POSSIBLY_DELETED, IsBootstrapProcessingMode, SMgrRelationData::md_fd, _MdfdVec::mdfd_chain, _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, PathNameOpenFile(), pfree(), PG_BINARY, relpath, and SMgrRelationData::smgr_rnode.

Referenced by _mdfd_getseg(), mdexists(), mdimmedsync(), mdnblocks(), and mdtruncate().

/*
 * Open segment 0 of the given relation fork, or return the cached entry if
 * the fork is already open.
 *
 * Returns NULL only when behavior is EXTENSION_RETURN_NULL and the open
 * failed with ENOENT; any other failure is reported with ereport(ERROR).
 * In bootstrap mode a missing file is created instead.
 */
{
    char       *relfile;
    File        vfd;
    MdfdVec    *seg;

    /* already open: hand back the head of the existing chain */
    seg = reln->md_fd[forknum];
    if (seg)
        return seg;

    relfile = relpath(reln->smgr_rnode, forknum);

    vfd = PathNameOpenFile(relfile, O_RDWR | PG_BINARY, 0600);

    /*
     * During bootstrap a system relation can be accessed (by internal
     * backend processes) before the bootstrap script nominally creates it,
     * so in bootstrap mode only, let mdopen() stand in for mdcreate().
     * (See mdcreate)
     */
    if (vfd < 0 && IsBootstrapProcessingMode())
        vfd = PathNameOpenFile(relfile,
                               O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

    if (vfd < 0)
    {
        /* a vanished file is tolerated only for EXTENSION_RETURN_NULL */
        if (behavior == EXTENSION_RETURN_NULL &&
            FILE_POSSIBLY_DELETED(errno))
        {
            pfree(relfile);
            return NULL;
        }
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", relfile)));
    }

    pfree(relfile);

    /* remember segment 0 as the head of this fork's chain */
    seg = _fdvec_alloc();
    reln->md_fd[forknum] = seg;

    seg->mdfd_vfd = vfd;
    seg->mdfd_segno = 0;
    seg->mdfd_chain = NULL;
    Assert(_mdnblocks(reln, forknum, seg) <= ((BlockNumber) RELSEG_SIZE));

    return seg;
}

void mdpostckpt ( void )

Definition at line 1282 of file md.c.

References AbsorbFsyncRequests(), PendingUnlinkEntry::cycle_ctr, ereport, errcode_for_file_access(), errmsg(), linitial, list_delete_first(), MAIN_FORKNUM, mdckpt_cycle_ctr, NIL, pfree(), relpathperm, PendingUnlinkEntry::rnode, unlink(), and WARNING.

/*
 * Process the pendingUnlinks list from the front: unlink each entry's main
 * fork file and drop the entry, stopping at the first entry stamped with
 * the current mdckpt_cycle_ctr.
 */
{
    int         unlinked_since_absorb = 0;

    while (pendingUnlinks != NIL)
    {
        PendingUnlinkEntry *req = (PendingUnlinkEntry *) linitial(pendingUnlinks);
        char       *relfile;

        /*
         * New entries are appended at the tail, so meeting a new one means
         * all the old ones have been handled.
         *
         * Note: if just the right number of consecutive checkpoints fail,
         * cycle_ctr wraparound could fool this test.  The only consequence
         * would be delaying the unlink by one more checkpoint, which is
         * perfectly tolerable.
         */
        if (req->cycle_ctr == mdckpt_cycle_ctr)
            break;

        /*
         * Remove the file.  ENOENT is ignored: a concurrent DROP DATABASE
         * may delete the file before we get to it.  rmtree() likewise has
         * to ignore ENOENT in case we delete the file first.
         */
        relfile = relpathperm(req->rnode, MAIN_FORKNUM);
        if (unlink(relfile) < 0 && errno != ENOENT)
            ereport(WARNING,
                    (errcode_for_file_access(),
                     errmsg("could not remove file \"%s\": %m", relfile)));
        pfree(relfile);

        /* done with this request: take it off the list and free it */
        pendingUnlinks = list_delete_first(pendingUnlinks);
        pfree(req);

        /*
         * As in mdsync, do not go too long without absorbing fsync requests
         * when there are many deletions to do.  Calling
         * AbsorbFsyncRequests() is safe at this point in the loop (note it
         * might try to delete list entries).
         */
        if (++unlinked_since_absorb >= UNLINKS_PER_ABSORB)
        {
            AbsorbFsyncRequests();
            unlinked_since_absorb = 0;
        }
    }
}

void mdpreckpt ( void )

Definition at line 1267 of file md.c.

References mdckpt_cycle_ctr.

/*
 * Advance the checkpoint cycle counter.
 */
{
    /*
     * Unlink requests that arrive from now on are stamped with the new
     * counter value and are not unlinked until the next checkpoint.
     */
    mdckpt_cycle_ctr += 1;
}

void mdprefetch	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum
	)

Definition at line 634 of file md.c.

References _mdfd_getseg(), Assert, EXTENSION_FAIL, FilePrefetch(), and _MdfdVec::mdfd_vfd.

/*
 * Issue a prefetch for one block of the given relation fork.  Compiles to
 * an empty function unless USE_PREFETCH is defined.
 */
{
#ifdef USE_PREFETCH
    MdfdVec    *seg = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

    /* byte offset of the block within its segment file */
    off_t       offset = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(offset < (off_t) BLCKSZ * RELSEG_SIZE);

    /* the result of the prefetch call is deliberately ignored */
    (void) FilePrefetch(seg->mdfd_vfd, offset, BLCKSZ);
#endif   /* USE_PREFETCH */
}

void mdread	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		char *	buffer
	)

Definition at line 655 of file md.c.

References _mdfd_getseg(), Assert, RelFileNodeBackend::backend, RelFileNode::dbNode, ereport, errcode(), errcode_for_file_access(), errmsg(), ERROR, EXTENSION_FAIL, FilePathName(), FileRead(), FileSeek(), InRecovery, _MdfdVec::mdfd_vfd, MemSet, RelFileNodeBackend::node, RelFileNode::relNode, SMgrRelationData::smgr_rnode, RelFileNode::spcNode, and zero_damaged_pages.

/*
 * Read block blocknum of the given relation fork into buffer (BLCKSZ
 * bytes).
 *
 * Seek and read failures are reported with ereport(ERROR).  A short read
 * is also an error, unless zero_damaged_pages is on or we are InRecovery,
 * in which case the buffer is filled with zeroes instead.
 */
{
    MdfdVec    *seg;
    off_t       offset;
    int         got;

    TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
                                        reln->smgr_rnode.node.spcNode,
                                        reln->smgr_rnode.node.dbNode,
                                        reln->smgr_rnode.node.relNode,
                                        reln->smgr_rnode.backend);

    seg = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

    /* byte offset of the block within its segment file */
    offset = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));

    Assert(offset < (off_t) BLCKSZ * RELSEG_SIZE);

    if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to block %u in file \"%s\": %m",
                        blocknum, FilePathName(seg->mdfd_vfd))));

    got = FileRead(seg->mdfd_vfd, buffer, BLCKSZ);

    TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
                                       reln->smgr_rnode.node.spcNode,
                                       reln->smgr_rnode.node.dbNode,
                                       reln->smgr_rnode.node.relNode,
                                       reln->smgr_rnode.backend,
                                       got,
                                       BLCKSZ);

    /* the common case: a full block was read */
    if (got == BLCKSZ)
        return;

    if (got < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not read block %u in file \"%s\": %m",
                        blocknum, FilePathName(seg->mdfd_vfd))));

    /*
     * Short read: we are at or past EOF, or got a partial block at EOF.
     * Normally that is an error, since upper levels should never ask for a
     * nonexistent block.  With zero_damaged_pages ON, or while InRecovery,
     * hand back zeroes without complaint instead; this covers, for example,
     * updating a block that was later truncated away.
     */
    if (zero_damaged_pages || InRecovery)
        MemSet(buffer, 0, BLCKSZ);
    else
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
                        blocknum, FilePathName(seg->mdfd_vfd),
                        got, BLCKSZ)));
}

void mdsync ( void )

Definition at line 977 of file md.c.

References _mdfd_getseg(), _mdfd_segpath(), AbsorbFsyncRequests(), Assert, bms_first_member(), bms_free(), PendingOperationEntry::canceled, CheckpointStats, CheckpointStatsData::ckpt_agg_sync_time, CheckpointStatsData::ckpt_longest_sync, CheckpointStatsData::ckpt_sync_rels, PendingOperationEntry::cycle_ctr, DEBUG1, elog, enableFsync, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_RETURN_NULL, FILE_POSSIBLY_DELETED, FilePathName(), FileSync(), HASH_REMOVE, hash_search(), hash_seq_init(), hash_seq_search(), INSTR_TIME_GET_MICROSEC, INSTR_TIME_SET_CURRENT, INSTR_TIME_SUBTRACT, InvalidBackendId, log_checkpoints, longest(), MAX_FORKNUM, _MdfdVec::mdfd_vfd, mdsync_cycle_ctr, NULL, pfree(), PendingOperationEntry::requests, PendingOperationEntry::rnode, and smgropen().

Referenced by SetForwardFsyncRequests().

{
    /*
     * mdsync() -- fsync every segment that has a request pending in
     * pendingOpsTable as of entry to this function.
     *
     * Requests that arrive while we are running are left for the next call
     * (they are told apart via mdsync_cycle_ctr).  On success the sync
     * statistics are stored into CheckpointStats.  If an fsync fails hard we
     * raise ERROR, but first put the unsatisfied requests back into the
     * hashtable entry so that the next call retries them instead of
     * silently forgetting them.
     */
    static bool mdsync_in_progress = false;

    HASH_SEQ_STATUS hstat;
    PendingOperationEntry *entry;
    int         absorb_counter;

    /* Statistics on sync times */
    int         processed = 0;
    instr_time  sync_start,
                sync_end,
                sync_diff;
    uint64      elapsed;
    uint64      longest = 0;
    uint64      total_elapsed = 0;

    /*
     * This is only called during checkpoints, and checkpoints should only
     * occur in processes that have created a pendingOpsTable.
     */
    if (!pendingOpsTable)
        elog(ERROR, "cannot sync without a pendingOpsTable");

    /*
     * If we are in the checkpointer, the sync had better include all fsync
     * requests that were queued by backends up to this point.  The tightest
     * race condition that could occur is that a buffer that must be written
     * and fsync'd for the checkpoint could have been dumped by a backend just
     * before it was visited by BufferSync().  We know the backend will have
     * queued an fsync request before clearing the buffer's dirtybit, so we
     * are safe as long as we do an Absorb after completing BufferSync().
     */
    AbsorbFsyncRequests();

    /*
     * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
     * checkpoint), we want to ignore fsync requests that are entered into the
     * hashtable after this point --- they should be processed next time,
     * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
     * ones: new ones will have cycle_ctr equal to the incremented value of
     * mdsync_cycle_ctr.
     *
     * In normal circumstances, all entries present in the table at this point
     * will have cycle_ctr exactly equal to the current (about to be old)
     * value of mdsync_cycle_ctr.  However, if we fail partway through the
     * fsync'ing loop, then older values of cycle_ctr might remain when we
     * come back here to try again.  Repeated checkpoint failures would
     * eventually wrap the counter around to the point where an old entry
     * might appear new, causing us to skip it, possibly allowing a checkpoint
     * to succeed that should not have.  To forestall wraparound, any time the
     * previous mdsync() failed to complete, run through the table and
     * forcibly set cycle_ctr = mdsync_cycle_ctr.
     *
     * Think not to merge this loop with the main loop, as the problem is
     * exactly that that loop may fail before having visited all the entries.
     * From a performance point of view it doesn't matter anyway, as this path
     * will never be taken in a system that's functioning normally.
     */
    if (mdsync_in_progress)
    {
        /* prior try failed, so update any stale cycle_ctr values */
        hash_seq_init(&hstat, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
        {
            entry->cycle_ctr = mdsync_cycle_ctr;
        }
    }

    /* Advance counter so that new hashtable entries are distinguishable */
    mdsync_cycle_ctr++;

    /* Set flag to detect failure if we don't reach the end of the loop */
    mdsync_in_progress = true;

    /* Now scan the hashtable for fsync requests to process */
    absorb_counter = FSYNCS_PER_ABSORB;
    hash_seq_init(&hstat, pendingOpsTable);
    while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
    {
        ForkNumber  forknum;

        /*
         * If the entry is new then don't process it this time; it might
         * contain multiple fsync-request bits, but they are all new.  Note
         * "continue" bypasses the hash-remove call at the bottom of the loop.
         */
        if (entry->cycle_ctr == mdsync_cycle_ctr)
            continue;

        /* Else assert we haven't missed it */
        Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

        /*
         * Scan over the forks and segments represented by the entry.
         *
         * The bitmap manipulations are slightly tricky, because we can call
         * AbsorbFsyncRequests() inside the loop and that could result in
         * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
         * This is okay because we unlink each bitmapset from the hashtable
         * entry before scanning it.  That means that any incoming fsync
         * requests will be processed now if they reach the table before we
         * begin to scan their fork.
         */
        for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
        {
            Bitmapset  *requests = entry->requests[forknum];
            int         segno;

            entry->requests[forknum] = NULL;
            entry->canceled[forknum] = false;

            while ((segno = bms_first_member(requests)) >= 0)
            {
                int         failures;

                /*
                 * If fsync is off then we don't have to bother opening the
                 * file at all.  (We delay checking until this point so that
                 * changing fsync on the fly behaves sensibly.)
                 */
                if (!enableFsync)
                    continue;

                /*
                 * If in checkpointer, we want to absorb pending requests
                 * every so often to prevent overflow of the fsync request
                 * queue.  It is unspecified whether newly-added entries will
                 * be visited by hash_seq_search, but we don't care since we
                 * don't need to process them anyway.
                 */
                if (--absorb_counter <= 0)
                {
                    AbsorbFsyncRequests();
                    absorb_counter = FSYNCS_PER_ABSORB;
                }

                /*
                 * The fsync table could contain requests to fsync segments
                 * that have been deleted (unlinked) by the time we get to
                 * them. Rather than just hoping an ENOENT (or EACCES on
                 * Windows) error can be ignored, what we do on error is
                 * absorb pending requests and then retry.  Since mdunlink()
                 * queues a "cancel" message before actually unlinking, the
                 * fsync request is guaranteed to be marked canceled after the
                 * absorb if it really was this case. DROP DATABASE likewise
                 * has to tell us to forget fsync requests before it starts
                 * deletions.
                 */
                for (failures = 0;; failures++) /* loop exits at "break" */
                {
                    SMgrRelation reln;
                    MdfdVec    *seg;
                    char       *path;
                    int         save_errno;

                    /*
                     * Find or create an smgr hash entry for this relation.
                     * This may seem a bit unclean -- md calling smgr?  But
                     * it's really the best solution.  It ensures that the
                     * open file reference isn't permanently leaked if we get
                     * an error here. (You may say "but an unreferenced
                     * SMgrRelation is still a leak!" Not really, because the
                     * only case in which a checkpoint is done by a process
                     * that isn't about to shut down is in the checkpointer,
                     * and it will periodically do smgrcloseall(). This fact
                     * justifies our not closing the reln in the success path
                     * either, which is a good thing since in non-checkpointer
                     * cases we couldn't safely do that.)
                     */
                    reln = smgropen(entry->rnode, InvalidBackendId);

                    /* Attempt to open and fsync the target segment */
                    seg = _mdfd_getseg(reln, forknum,
                             (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
                                       false, EXTENSION_RETURN_NULL);

                    INSTR_TIME_SET_CURRENT(sync_start);

                    if (seg != NULL &&
                        FileSync(seg->mdfd_vfd) >= 0)
                    {
                        /* Success; update statistics about sync timing */
                        INSTR_TIME_SET_CURRENT(sync_end);
                        sync_diff = sync_end;
                        INSTR_TIME_SUBTRACT(sync_diff, sync_start);
                        elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
                        if (elapsed > longest)
                            longest = elapsed;
                        total_elapsed += elapsed;
                        processed++;
                        if (log_checkpoints)
                            elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
                                 processed,
                                 FilePathName(seg->mdfd_vfd),
                                 (double) elapsed / 1000);

                        break;  /* out of retry loop */
                    }

                    /* Compute file name for use in message */
                    save_errno = errno;
                    path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
                    errno = save_errno;

                    /*
                     * It is possible that the relation has been dropped or
                     * truncated since the fsync request was entered.
                     * Therefore, allow ENOENT, but only if we didn't fail
                     * already on this file.  This applies both for
                     * _mdfd_getseg() and for FileSync, since fd.c might have
                     * closed the file behind our back.
                     *
                     * XXX is there any point in allowing more than one retry?
                     * Don't see one at the moment, but easy to change the
                     * test here if so.
                     */
                    if (!FILE_POSSIBLY_DELETED(errno) ||
                        failures > 0)
                    {
                        /*
                         * Hard failure.  bms_first_member() has already
                         * removed segno from our private set, and that set
                         * was detached from the hashtable entry above, so
                         * just throwing the error would forget this request
                         * and all not-yet-visited ones for this fork.  Put
                         * segno back and merge the unsatisfied requests with
                         * any that arrived since we started, so the next
                         * mdsync() tries them again (mdsync_in_progress is
                         * still set, so the entry's cycle_ctr gets reset
                         * then).  "requests" is non-NULL here because segno
                         * was just extracted from it.
                         */
                        requests = bms_add_member(requests, segno);
                        entry->requests[forknum] =
                            bms_join(entry->requests[forknum], requests);

                        /* restore errno for %m in case it got clobbered */
                        errno = save_errno;
                        ereport(ERROR,
                                (errcode_for_file_access(),
                                 errmsg("could not fsync file \"%s\": %m",
                                        path)));
                    }
                    else
                        ereport(DEBUG1,
                                (errcode_for_file_access(),
                                 errmsg("could not fsync file \"%s\" but retrying: %m",
                                        path)));
                    pfree(path);

                    /*
                     * Absorb incoming requests and check to see if a cancel
                     * arrived for this relation fork.
                     */
                    AbsorbFsyncRequests();
                    absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */

                    if (entry->canceled[forknum])
                        break;
                }               /* end retry loop */
            }
            bms_free(requests);
        }

        /*
         * We've finished everything that was requested before we started to
         * scan the entry.  If no new requests have been inserted meanwhile,
         * remove the entry.  Otherwise, update its cycle counter, as all the
         * requests now in it must have arrived during this cycle.
         */
        for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
        {
            if (entry->requests[forknum] != NULL)
                break;
        }
        if (forknum <= MAX_FORKNUM)
            entry->cycle_ctr = mdsync_cycle_ctr;
        else
        {
            /* Okay to remove it */
            if (hash_search(pendingOpsTable, &entry->rnode,
                            HASH_REMOVE, NULL) == NULL)
                elog(ERROR, "pendingOpsTable corrupted");
        }
    }                           /* end loop over hashtable entries */

    /* Return sync performance metrics for report at checkpoint end */
    CheckpointStats.ckpt_sync_rels = processed;
    CheckpointStats.ckpt_longest_sync = longest;
    CheckpointStats.ckpt_agg_sync_time = total_elapsed;

    /* Flag successful completion of mdsync */
    mdsync_in_progress = false;
}

void mdtruncate	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	nblocks
	)

Definition at line 857 of file md.c.

References Assert, ereport, errcode_for_file_access(), errmsg(), ERROR, EXTENSION_FAIL, FilePathName(), FileTruncate(), InRecovery, SMgrRelationData::md_fd, _MdfdVec::mdfd_chain, _MdfdVec::mdfd_vfd, mdnblocks(), mdopen(), NULL, pfree(), register_dirty_segment(), relpath, SMgrRelationData::smgr_rnode, and SmgrIsTemp.

{
    /*
     * Shrink the given fork of the relation to exactly nblocks blocks.
     * Segments wholly beyond the new end are truncated to zero length and
     * dropped from the in-memory chain; the segment containing the new end
     * is shortened; earlier segments are left alone.
     */
    BlockNumber curnblk;
    BlockNumber priorblocks;
    MdfdVec    *seg;

    /*
     * NOTE: mdnblocks makes sure we have opened all active segments, so the
     * loop below will visit every one of them.
     */
    curnblk = mdnblocks(reln, forknum);

    if (nblocks == curnblk)
        return;                 /* already the requested size */

    if (nblocks > curnblk)
    {
        /* Bogus request ... but stay silent during recovery */
        if (InRecovery)
            return;
        ereport(ERROR,
                (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
                        relpath(reln->smgr_rnode, forknum),
                        nblocks, curnblk)));
    }

    seg = mdopen(reln, forknum, EXTENSION_FAIL);

    /* priorblocks counts the blocks in all segments before the current one */
    for (priorblocks = 0; seg != NULL; priorblocks += RELSEG_SIZE)
    {
        MdfdVec    *cur = seg;

        /* Grab the successor now; "cur" may be freed or unlinked below */
        seg = cur->mdfd_chain;

        if (priorblocks > nblocks)
        {
            /*
             * Segment lies entirely past the new end (and has already been
             * unlinked from the mdfd_chain).  Empty the file but do not
             * delete it, for reasons explained in the header comments.
             */
            if (FileTruncate(cur->mdfd_vfd, 0) < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not truncate file \"%s\": %m",
                                FilePathName(cur->mdfd_vfd))));

            if (!SmgrIsTemp(reln))
                register_dirty_segment(reln, forknum, cur);

            /* the first segment is never dropped */
            Assert(cur != reln->md_fd[forknum]);
            pfree(cur);
        }
        else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
        {
            /*
             * Last segment to keep: cut the file to the right length and
             * sever the chain link to the remaining segments (which the
             * branch above will zap).  NOTE: when nblocks is an exact
             * multiple K of RELSEG_SIZE, the K+1st segment ends up here, is
             * truncated to zero length, and is kept.  This adheres to the
             * invariant given in the header comments.
             */
            BlockNumber keepblocks = nblocks - priorblocks;

            if (FileTruncate(cur->mdfd_vfd, (off_t) keepblocks * BLCKSZ) < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not truncate file \"%s\" to %u blocks: %m",
                                FilePathName(cur->mdfd_vfd),
                                nblocks)));

            if (!SmgrIsTemp(reln))
                register_dirty_segment(reln, forknum, cur);

            cur->mdfd_chain = NULL;
        }

        /*
         * Otherwise this segment and at least zero blocks beyond it are
         * still needed, so there is nothing to do for it.
         */
    }
}

void mdunlink	(	RelFileNodeBackend	rnode,
		ForkNumber	forkNum,
		bool	isRedo
	)

Definition at line 366 of file md.c.

References ForgetRelationFsyncRequests(), InvalidForkNumber, mdunlinkfork(), RelFileNodeBackend::node, and RelFileNodeBackendIsTemp.

{
    /*
     * Remove the on-disk files of one fork, or of all forks when forkNum is
     * InvalidForkNumber.
     */
    ForkNumber  firstFork = forkNum;
    ForkNumber  lastFork = forkNum;
    ForkNumber  fork;

    /*
     * Pending fsync requests for the doomed relation must be cleaned out
     * first, else the next mdsync() will fail.  A temp relation can't have
     * any, so skip the call for those.  A single request suffices even when
     * several forks are being removed, because the fsync queuing code
     * accepts the "InvalidForkNumber = all forks" convention.
     */
    if (!RelFileNodeBackendIsTemp(rnode))
        ForgetRelationFsyncRequests(rnode.node, forkNum);

    /* Widen the range to every fork if that is what was asked for */
    if (forkNum == InvalidForkNumber)
    {
        firstFork = 0;
        lastFork = MAX_FORKNUM;
    }

    /* Now do the per-fork work */
    for (fork = firstFork; fork <= lastFork; fork++)
        mdunlinkfork(rnode, fork, isRedo);
}

static void mdunlinkfork	(	RelFileNodeBackend	rnode,
		ForkNumber	forkNum,
		bool	isRedo
	)			`[static]`

Definition at line 389 of file md.c.

References CloseTransientFile(), ereport, errcode_for_file_access(), errmsg(), ftruncate, MAIN_FORKNUM, OpenTransientFile(), palloc(), pfree(), PG_BINARY, register_unlink(), RelFileNodeBackendIsTemp, relpath, unlink(), and WARNING.

Referenced by mdunlink().

{
    /*
     * Remove all segment files of a single relation fork.  The first
     * segment is either unlinked right away or merely truncated, with its
     * unlink handed off to register_unlink(); the numbered follow-on
     * segments are always unlinked here.
     */
    char       *path = relpath(rnode, forkNum);
    char       *segpath;
    BlockNumber segno;
    int         ret;
    bool        unlink_now;

    /* Only a non-temp main fork outside redo takes the truncate path */
    unlink_now = (isRedo ||
                  forkNum != MAIN_FORKNUM ||
                  RelFileNodeBackendIsTemp(rnode));

    /*
     * Delete or truncate the first segment.
     */
    if (!unlink_now)
    {
        /* truncate(2) would be easier here, but Windows hasn't got it */
        int         fd = OpenTransientFile(path, O_RDWR | PG_BINARY, 0);

        if (fd < 0)
            ret = -1;
        else
        {
            int         save_errno;

            ret = ftruncate(fd, 0);
            /* keep ftruncate's errno across the close for the check below */
            save_errno = errno;
            CloseTransientFile(fd);
            errno = save_errno;
        }

        if (ret < 0 && errno != ENOENT)
            ereport(WARNING,
                    (errcode_for_file_access(),
                     errmsg("could not truncate file \"%s\": %m", path)));

        /* Register request to unlink first segment later */
        register_unlink(rnode);
    }
    else
    {
        ret = unlink(path);
        if (ret < 0 && errno != ENOENT)
            ereport(WARNING,
                    (errcode_for_file_access(),
                     errmsg("could not remove file \"%s\": %m", path)));
    }

    /* If the first segment could not be handled, leave the rest alone */
    if (ret < 0)
    {
        pfree(path);
        return;
    }

    /*
     * Delete any additional segments.  The extra 12 bytes leave room for
     * ".", up to ten digits of segment number, and the terminator.
     */
    segpath = (char *) palloc(strlen(path) + 12);

    /*
     * Note that because we keep going until unlink fails (normally with
     * ENOENT), inactive segments get removed as well as active ones.
     */
    segno = 1;
    for (;;)
    {
        sprintf(segpath, "%s.%u", path, segno);
        if (unlink(segpath) >= 0)
        {
            segno++;
            continue;
        }

        /* ENOENT is expected after the last segment... */
        if (errno != ENOENT)
            ereport(WARNING,
                    (errcode_for_file_access(),
                     errmsg("could not remove file \"%s\": %m", segpath)));
        break;
    }

    pfree(segpath);
    pfree(path);
}

void mdwrite	(	SMgrRelation	reln,
		ForkNumber	forknum,
		BlockNumber	blocknum,
		char *	buffer,
		bool	skipFsync
	)

Definition at line 725 of file md.c.

References _mdfd_getseg(), Assert, RelFileNodeBackend::backend, RelFileNode::dbNode, ereport, errcode(), errcode_for_file_access(), errhint(), errmsg(), ERROR, EXTENSION_FAIL, FilePathName(), FileSeek(), FileWrite(), _MdfdVec::mdfd_vfd, mdnblocks(), RelFileNodeBackend::node, register_dirty_segment(), RelFileNode::relNode, SMgrRelationData::smgr_rnode, SmgrIsTemp, and RelFileNode::spcNode.

{
    /*
     * Write one block (BLCKSZ bytes from "buffer") at position blocknum of
     * the given fork.  Any seek failure, write failure or short write is
     * reported as ERROR.  Unless skipFsync is set or the relation is temp,
     * the touched segment is registered as needing fsync.
     */
    MdfdVec    *seg;
    BlockNumber blkInSeg;
    off_t       offset;
    int         written;

    /* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
    Assert(blocknum < mdnblocks(reln, forknum));
#endif

    TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
                                         reln->smgr_rnode.node.spcNode,
                                         reln->smgr_rnode.node.dbNode,
                                         reln->smgr_rnode.node.relNode,
                                         reln->smgr_rnode.backend);

    seg = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL);

    /* Byte offset of the block within its segment file */
    blkInSeg = blocknum % ((BlockNumber) RELSEG_SIZE);
    offset = (off_t) BLCKSZ * blkInSeg;

    Assert(offset < (off_t) BLCKSZ * RELSEG_SIZE);

    if (FileSeek(seg->mdfd_vfd, offset, SEEK_SET) != offset)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not seek to block %u in file \"%s\": %m",
                        blocknum, FilePathName(seg->mdfd_vfd))));

    written = FileWrite(seg->mdfd_vfd, buffer, BLCKSZ);

    TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
                                        reln->smgr_rnode.node.spcNode,
                                        reln->smgr_rnode.node.dbNode,
                                        reln->smgr_rnode.node.relNode,
                                        reln->smgr_rnode.backend,
                                        written,
                                        BLCKSZ);

    /* Outright failure: errno describes the problem */
    if (written < 0)
        ereport(ERROR,
                (errcode_for_file_access(),
                 errmsg("could not write block %u in file \"%s\": %m",
                        blocknum, FilePathName(seg->mdfd_vfd))));

    /* Short write: complain appropriately */
    if (written != BLCKSZ)
        ereport(ERROR,
                (errcode(ERRCODE_DISK_FULL),
                 errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
                        blocknum,
                        FilePathName(seg->mdfd_vfd),
                        written, BLCKSZ),
                 errhint("Check free disk space.")));

    /* No fsync bookkeeping when the caller opted out or the rel is temp */
    if (skipFsync || SmgrIsTemp(reln))
        return;

    register_dirty_segment(reln, forknum, seg);
}

static void register_dirty_segment	(	SMgrRelation	reln,
		ForkNumber	forknum,
		MdfdVec *	seg
	)			`[static]`

Definition at line 1350 of file md.c.

References Assert, DEBUG1, ereport, errcode_for_file_access(), errmsg(), ERROR, FilePathName(), FileSync(), ForwardFsyncRequest(), _MdfdVec::mdfd_segno, _MdfdVec::mdfd_vfd, RelFileNodeBackend::node, RememberFsyncRequest(), SMgrRelationData::smgr_rnode, and SmgrIsTemp.

Referenced by mdextend(), mdtruncate(), and mdwrite().

{
    /*
     * Mark a segment as needing fsync: remember the request locally if this
     * process has a pendingOpsTable, otherwise try to forward it, and if
     * forwarding fails fsync the file ourselves right now.
     */

    /* Temp relations should never be fsync'd */
    Assert(!SmgrIsTemp(reln));

    if (pendingOpsTable)
    {
        /* We own a pending-ops table: record the request there */
        RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
        return;
    }

    /* No local table: hand the request off if we can */
    if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
        return;

    ereport(DEBUG1,
            (errmsg("could not forward fsync request because request queue is full")));

    /* Forwarding failed, so do the fsync here and now */
    if (FileSync(seg->mdfd_vfd) >= 0)
        return;

    ereport(ERROR,
            (errcode_for_file_access(),
             errmsg("could not fsync file \"%s\": %m",
                    FilePathName(seg->mdfd_vfd))));
}

static void register_unlink ( RelFileNodeBackend rnode ) [static]

Definition at line 1386 of file md.c.

References Assert, ForwardFsyncRequest(), IsUnderPostmaster, MAIN_FORKNUM, RelFileNodeBackend::node, pg_usleep(), RelFileNodeBackendIsTemp, RememberFsyncRequest(), and UNLINK_RELATION_REQUEST.

Referenced by mdunlinkfork().

{
    /*
     * Schedule the deferred unlink of a relation's first main-fork segment:
     * queue it locally if this process has a pendingOpsTable, otherwise keep
     * trying to forward the request until it is accepted.
     */

    /* Should never be used with temp relations */
    Assert(!RelFileNodeBackendIsTemp(rnode));

    if (pendingOpsTable)
    {
        /* We own a pending-ops table: record the request there */
        RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
                             UNLINK_RELATION_REQUEST);
        return;
    }

    /*
     * Otherwise the request has to be forwarded.  If it cannot be queued we
     * must sleep and retry, because we can't simply delete the file now.
     * Ugly, but hopefully won't happen often.
     *
     * XXX should we just leave the file orphaned instead?
     */
    Assert(IsUnderPostmaster);
    for (;;)
    {
        if (ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
                                UNLINK_RELATION_REQUEST))
            break;
        pg_usleep(10000L);      /* 10 msec seems a good number */
    }
}

void RememberFsyncRequest	(	RelFileNode	rnode,
		ForkNumber	forknum,
		BlockNumber	segno
	)

Definition at line 1435 of file md.c.

References Assert, bms_add_member(), bms_free(), PendingOperationEntry::canceled, PendingOperationEntry::cycle_ctr, PendingUnlinkEntry::cycle_ctr, RelFileNode::dbNode, FORGET_DATABASE_FSYNC, FORGET_RELATION_FSYNC, HASH_ENTER, HASH_FIND, hash_search(), hash_seq_init(), hash_seq_search(), InvalidForkNumber, lappend(), lfirst, list_delete_cell(), list_head(), lnext, MAIN_FORKNUM, mdckpt_cycle_ctr, mdsync_cycle_ctr, MemoryContextSwitchTo(), MemSet, NULL, palloc(), pfree(), PendingOperationEntry::requests, PendingUnlinkEntry::rnode, PendingOperationEntry::rnode, and UNLINK_RELATION_REQUEST.

Referenced by AbsorbFsyncRequests(), ForgetDatabaseFsyncRequests(), ForgetRelationFsyncRequests(), register_dirty_segment(), and register_unlink().

{
    /*
     * Record a request in this process's pending-operations state.  The
     * segno argument doubles as a request code:
     *   FORGET_RELATION_FSYNC   - cancel fsyncs for one fork or all forks
     *   FORGET_DATABASE_FSYNC   - cancel fsyncs and unlinks for a database
     *   UNLINK_RELATION_REQUEST - queue a deferred unlink
     *   anything else           - queue an fsync of that segment number
     */
    MemoryContext oldcxt;
    PendingOperationEntry *entry;
    bool        found;

    Assert(pendingOpsTable);

    if (segno == FORGET_RELATION_FSYNC)
    {
        /* Cancel pending requests for the relation (one or all forks) */
        ForkNumber  firstFork = forknum;
        ForkNumber  lastFork = forknum;
        ForkNumber  fork;

        entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
                                                      &rnode,
                                                      HASH_FIND,
                                                      NULL);
        if (entry == NULL)
            return;             /* nothing queued for this relation */

        if (forknum == InvalidForkNumber)
        {
            firstFork = 0;
            lastFork = MAX_FORKNUM;
        }

        /*
         * The entry itself can't be deleted, since mdsync could have an
         * active hashtable scan.  Dropping the bitmapsets instead is safe
         * because of the way mdsync is coded.  The "canceled" flags let
         * mdsync see that a cancel arrived for the fork(s).
         */
        for (fork = firstFork; fork <= lastFork; fork++)
        {
            bms_free(entry->requests[fork]);
            entry->requests[fork] = NULL;
            entry->canceled[fork] = true;
        }
        return;
    }

    if (segno == FORGET_DATABASE_FSYNC)
    {
        /* Cancel everything pending for the entire database */
        HASH_SEQ_STATUS scan;
        ListCell   *cell;
        ListCell   *prev;

        /* First the fsync requests, for every fork of every matching rel */
        hash_seq_init(&scan, pendingOpsTable);
        while ((entry = (PendingOperationEntry *) hash_seq_search(&scan)) != NULL)
        {
            ForkNumber  fork;

            if (entry->rnode.dbNode != rnode.dbNode)
                continue;

            for (fork = 0; fork <= MAX_FORKNUM; fork++)
            {
                bms_free(entry->requests[fork]);
                entry->requests[fork] = NULL;
                entry->canceled[fork] = true;
            }
        }

        /* Then the unlink requests; "prev" trails the last cell we kept */
        prev = NULL;
        cell = list_head(pendingUnlinks);
        while (cell != NULL)
        {
            PendingUnlinkEntry *unl = (PendingUnlinkEntry *) lfirst(cell);
            ListCell   *next = lnext(cell);

            if (unl->rnode.dbNode != rnode.dbNode)
                prev = cell;
            else
            {
                pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
                pfree(unl);
            }
            cell = next;
        }
        return;
    }

    if (segno == UNLINK_RELATION_REQUEST)
    {
        /* Unlink request: append it to the pendingUnlinks list */
        PendingUnlinkEntry *unl;

        oldcxt = MemoryContextSwitchTo(MdCxt);

        /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
        Assert(forknum == MAIN_FORKNUM);

        unl = palloc(sizeof(PendingUnlinkEntry));
        unl->rnode = rnode;
        unl->cycle_ctr = mdckpt_cycle_ctr;

        pendingUnlinks = lappend(pendingUnlinks, unl);

        MemoryContextSwitchTo(oldcxt);
        return;
    }

    /* Normal case: enter a request to fsync this segment */
    oldcxt = MemoryContextSwitchTo(MdCxt);

    entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
                                                  &rnode,
                                                  HASH_ENTER,
                                                  &found);

    /* A freshly created entry needs initializing */
    if (!found)
    {
        entry->cycle_ctr = mdsync_cycle_ctr;
        MemSet(entry->requests, 0, sizeof(entry->requests));
        MemSet(entry->canceled, 0, sizeof(entry->canceled));
    }

    /*
     * NB: cycle_ctr is deliberately left alone when the entry already
     * exists.  It must represent the oldest fsync request that could be in
     * the entry.
     */
    entry->requests[forknum] = bms_add_member(entry->requests[forknum],
                                              (int) segno);

    MemoryContextSwitchTo(oldcxt);
}

void SetForwardFsyncRequests ( void )

Definition at line 233 of file md.c.

References Assert, hash_destroy(), mdsync(), and NIL.

Referenced by StartupXLOG().

{
    /*
     * Give up this process's pendingOpsTable: flush whatever fsyncs are
     * still queued in it, destroy it, and clear the pointer.
     */
    if (pendingOpsTable != NULL)
    {
        /* Perform any pending fsyncs we may have queued up, then drop table */
        mdsync();
        hash_destroy(pendingOpsTable);
        pendingOpsTable = NULL;
    }

    /*
     * No unlink requests should be pending here, since mdunlink doesn't
     * queue unlink requests when isRedo.
     */
    Assert(pendingUnlinks == NIL);
}