Header And Logo

PostgreSQL
| The world's most advanced open source database.

fd.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * fd.c
00004  *    Virtual file descriptor code.
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  * Portions Copyright (c) 1994, Regents of the University of California
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/storage/file/fd.c
00011  *
00012  * NOTES:
00013  *
00014  * This code manages a cache of 'virtual' file descriptors (VFDs).
00015  * The server opens many file descriptors for a variety of reasons,
00016  * including base tables, scratch files (e.g., sort and hash spool
00017  * files), and random calls to C library routines like system(3); it
00018  * is quite easy to exceed system limits on the number of open files a
00019  * single process can have.  (This is around 256 on many modern
00020  * operating systems, but can be as low as 32 on others.)
00021  *
00022  * VFDs are managed as an LRU pool, with actual OS file descriptors
00023  * being opened and closed as needed.  Obviously, if a file is
00024  * opened using these interfaces, all subsequent operations on it must
00025  * also be through these interfaces (the File type is not a real file
00026  * descriptor).
00027  *
00028  * For this scheme to work, most (if not all) routines throughout the
00029  * server should use these interfaces instead of calling the C library
00030  * routines (e.g., open(2) and fopen(3)) themselves.  Otherwise, we
00031  * may find ourselves short of real file descriptors anyway.
00032  *
00033  * INTERFACE ROUTINES
00034  *
00035  * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
00036  * A File opened with OpenTemporaryFile is automatically deleted when the
00037  * File is closed, either explicitly or implicitly at end of transaction or
00038  * process exit. PathNameOpenFile is intended for files that are held open
00039  * for a long time, like relation files. It is the caller's responsibility
00040  * to close them; there is no automatic mechanism in fd.c for that.
00041  *
00042  * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
00043  * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
00044  * They behave like the corresponding native functions, except that the handle
00045  * is registered with the current subtransaction, and will be automatically
00046  * closed at abort. These are intended for short operations like reading a
00047  * configuration file, and there is a fixed limit on the number of files that
00048  * can be opened using these functions at any one time.
00049  *
00050  * Finally, BasicOpenFile is just a thin wrapper around open() that can
00051  * release file descriptors in use by the virtual file descriptors if
00052  * necessary. There is no automatic cleanup of file descriptors returned by
00053  * BasicOpenFile; it is solely the caller's responsibility to close the file
00054  * descriptor by calling close(2).
00055  *
00056  *-------------------------------------------------------------------------
00057  */
00058 
00059 #include "postgres.h"
00060 
00061 #include <sys/file.h>
00062 #include <sys/param.h>
00063 #include <sys/stat.h>
00064 #include <unistd.h>
00065 #include <fcntl.h>
00066 #ifdef HAVE_SYS_RESOURCE_H
00067 #include <sys/resource.h>       /* for getrlimit */
00068 #endif
00069 
00070 #include "miscadmin.h"
00071 #include "access/xact.h"
00072 #include "catalog/catalog.h"
00073 #include "catalog/pg_tablespace.h"
00074 #include "common/relpath.h"
00075 #include "pgstat.h"
00076 #include "storage/fd.h"
00077 #include "storage/ipc.h"
00078 #include "utils/guc.h"
00079 #include "utils/resowner_private.h"
00080 
00081 
/*
 * We must leave some file descriptors free for system(), the dynamic loader,
 * and other code that tries to open files without consulting fd.c.  This
 * is the number left free.  (While we can be pretty sure we won't get
 * EMFILE, there's never any guarantee that we won't get ENFILE due to
 * other processes chewing up FDs.  So it's a bad idea to try to open files
 * without consulting fd.c.  Nonetheless we cannot control all code.)
 *
 * Because this is just a fixed setting, we are effectively assuming that
 * no such code will leave FDs open over the long term; otherwise the slop
 * is likely to be insufficient.  Note in particular that we expect that
 * loading a shared library does not result in any permanent increase in
 * the number of open files.  (This appears to be true on most if not
 * all platforms as of Feb 2004.)
 *
 * set_max_safe_fds() subtracts this amount from the limit it computes.
 */
#define NUM_RESERVED_FDS        10

/*
 * If we have fewer than this many usable FDs after allowing for the reserved
 * ones, choke: set_max_safe_fds() reports FATAL when max_safe_fds ends up
 * below this value.
 */
#define FD_MINFREE              10
00104 
00105 
/*
 * A number of platforms allow individual processes to open many more files
 * than they can really support when *many* processes do the same thing.
 * This GUC parameter lets the DBA limit max_safe_fds to something less than
 * what the postmaster's initial probe suggests will work.
 *
 * It is also passed to count_usable_fds() as the upper bound on how many
 * descriptors set_max_safe_fds() will probe.
 */
int         max_files_per_process = 1000;

/*
 * Maximum number of file descriptors to open for either VFD entries or
 * AllocateFile/AllocateDir/OpenTransientFile operations.  This is initialized
 * to a conservative value, and remains that way indefinitely in bootstrap or
 * standalone-backend cases.  In normal postmaster operation, the postmaster
 * calls set_max_safe_fds() late in initialization to update the value, and
 * that value is then inherited by forked subprocesses.
 *
 * Note: the value of max_files_per_process is taken into account while
 * setting this variable, and so need not be tested separately.
 *
 * Callers in this file compare nfile + numAllocatedDescs against this limit
 * before opening another kernel descriptor.
 */
int         max_safe_fds = 32;  /* default if not changed */
00126 
00127 
/*
 * Debugging support: DO_DB(stmt) expands to stmt when FDDEBUG is defined and
 * to nothing otherwise, so trace calls vanish from normal builds.
 */

#ifdef FDDEBUG
#define DO_DB(A) A
#else
#define DO_DB(A)                /* A */
#endif

/* value stored in Vfd.fd while no kernel descriptor is assigned */
#define VFD_CLOSED (-1)

/*
 * True if 'file' indexes an in-use VFD: it must be inside the cache, must
 * not be the list header at index 0, and must have a file name.
 * Note that the argument is evaluated more than once.
 */
#define FileIsValid(file) \
    ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)

/* True if the VFD currently has no kernel descriptor; does no range check */
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)

/* off_t sentinel; same value lseek() returns on failure */
#define FileUnknownPos ((off_t) -1)

/* these are the assigned bits in fdstate below: */
#define FD_TEMPORARY        (1 << 0)    /* T = delete when closed */
#define FD_XACT_TEMPORARY   (1 << 1)    /* T = delete at eoXact */
00148 
/*
 * One entry of the virtual file descriptor cache.
 *
 * An entry is "virtually open" while fileName is non-NULL, and additionally
 * "really open" (and linked into the LRU ring) while fd != VFD_CLOSED.
 * The saved name, flags, mode and seek position are what LruInsert() uses
 * to re-open the file after its kernel descriptor has been given up.
 */
typedef struct vfd
{
    int         fd;             /* current FD, or VFD_CLOSED if none */
    unsigned short fdstate;     /* bitflags for VFD's state (FD_* bits) */
    ResourceOwner resowner;     /* owner, for automatic cleanup */
    File        nextFree;       /* link to next free VFD, if in freelist */
    File        lruMoreRecently;    /* doubly linked recency-of-use list */
    File        lruLessRecently;
    off_t       seekPos;        /* current logical file position */
    off_t       fileSize;       /* current size of file (0 if not temporary) */
    char       *fileName;       /* name of file, or NULL for unused VFD */
    /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
    int         fileFlags;      /* open(2) flags for (re)opening the file */
    int         fileMode;       /* mode to pass to open(2) */
} Vfd;
00164 
/*
 * Virtual File Descriptor array pointer and size.  This grows as
 * needed (see AllocateVfd).  'File' values are indexes into this array.
 * Note that VfdCache[0] is not a usable VFD, just a list header: it anchors
 * both the free list (nextFree) and the LRU ring.
 * SizeVfdCache stays 0 until InitFileAccess() has run.
 */
static Vfd *VfdCache;
static Size SizeVfdCache = 0;

/*
 * Number of file descriptors known to be in use by VFD entries.
 * Incremented on each successful (re)open, decremented in LruDelete().
 */
static int  nfile = 0;

/*
 * Flag to tell whether it's worth scanning VfdCache looking for temp files
 * to close
 */
static bool have_xact_temporary_files = false;

/*
 * Tracks the total size of all temporary files.  Note: when temp_file_limit
 * is being enforced, this cannot overflow since the limit cannot be more
 * than INT_MAX kilobytes.  When not enforcing, it could theoretically
 * overflow, but we don't care.
 */
static uint64 temporary_files_size = 0;
00191 
/*
 * List of OS handles opened with AllocateFile, AllocateDir and
 * OpenTransientFile.
 *
 * Since we don't want to encourage heavy use of those functions,
 * it seems OK to put a pretty small maximum limit on the number of
 * simultaneously allocated descs.
 */
#define MAX_ALLOCATED_DESCS  32

/* Which kind of handle an AllocateDesc holds; selects the union member. */
typedef enum
{
    AllocateDescFile,
    AllocateDescPipe,
    AllocateDescDir,
    AllocateDescRawFD
} AllocateDescKind;

/* One tracked handle, tagged with the subtransaction that created it. */
typedef struct
{
    AllocateDescKind kind;
    union
    {
        FILE       *file;
        DIR        *dir;
        int         fd;
    }           desc;
    SubTransactionId create_subid;
} AllocateDesc;

/*
 * Fixed-size table of tracked handles and the number of entries in use.
 * numAllocatedDescs is counted together with nfile against max_safe_fds.
 */
static int  numAllocatedDescs = 0;
static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS];
00224 
/*
 * Number of temporary files opened during the current session;
 * this is used in generation of tempfile names.
 */
static long tempFileCounter = 0;

/*
 * Array of OIDs of temp tablespaces.  When numTempTableSpaces is -1,
 * this has not been set in the current transaction.
 * NOTE(review): nextTempTableSpace looks like the index of the next array
 * element to hand out; its users are outside this part of the file.
 */
static Oid *tempTableSpaces = NULL;
static int  numTempTableSpaces = -1;
static int  nextTempTableSpace = 0;
00238 
00239 
/*--------------------
 *
 * Private Routines
 *
 * Delete          - delete a file from the Lru ring
 * LruDelete       - remove a file from the Lru ring and close its FD
 * Insert          - put a file at the front of the Lru ring
 * LruInsert       - put a file at the front of the Lru ring and open it
 * ReleaseLruFile  - Release an fd by closing the last entry in the Lru ring
 * AllocateVfd     - grab a free (or new) file record (from VfdCache)
 * FreeVfd         - free a file record
 *
 * The Least Recently Used ring is a doubly linked list that begins and
 * ends on element zero.  Element zero is special -- it doesn't represent
 * a file and its "fd" field always == VFD_CLOSED.  Element zero is just an
 * anchor that shows us the beginning/end of the ring.
 * Only VFD elements that are currently really open (have an FD assigned) are
 * in the Lru ring.  Elements that are "virtually" open can be recognized
 * by having a non-null fileName field.
 *
 * example:
 *
 *     /--less----\                /---------\
 *     v           \              v           \
 *   #0 --more---> LeastRecentlyUsed --more-\ \
 *    ^\                                    | |
 *     \\less--> MostRecentlyUsedFile   <---/ |
 *      \more---/                    \--less--/
 *
 *--------------------
 */
/* LRU ring and VFD slot management */
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int  LruInsert(File file);
static bool ReleaseLruFile(void);
static File AllocateVfd(void);
static void FreeVfd(File file);

/* file access, temporary-file handling and cleanup helpers */
static int  FileAccess(File file);
static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
static void AtProcExit_Files(int code, Datum arg);
static void CleanupTempFiles(bool isProcExit);
static void RemovePgTempFilesInDir(const char *tmpdirname);
static void RemovePgTempRelationFiles(const char *tsdirname);
static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
static bool looks_like_temp_rel_name(const char *name);
00287 
00288 
/*
 * pg_fsync --- flush a file using the configured sync method
 *
 * When the build defines HAVE_FSYNC_WRITETHROUGH without
 * FSYNC_WRITETHROUGH_IS_FSYNC and sync_method selects writethrough, this
 * hands off to pg_fsync_writethrough(); in every other case it hands off
 * to pg_fsync_no_writethrough().  The callee's result is returned as-is.
 */
int
pg_fsync(int fd)
{
    /* the sync_method test is compiled out when it could never matter */
#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
    if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
        return pg_fsync_writethrough(fd);
#endif

    return pg_fsync_no_writethrough(fd);
}
00303 
00304 
00305 /*
00306  * pg_fsync_no_writethrough --- same as fsync except does nothing if
00307  *  enableFsync is off
00308  */
00309 int
00310 pg_fsync_no_writethrough(int fd)
00311 {
00312     if (enableFsync)
00313         return fsync(fd);
00314     else
00315         return 0;
00316 }
00317 
00318 /*
00319  * pg_fsync_writethrough
00320  */
00321 int
00322 pg_fsync_writethrough(int fd)
00323 {
00324     if (enableFsync)
00325     {
00326 #ifdef WIN32
00327         return _commit(fd);
00328 #elif defined(F_FULLFSYNC)
00329         return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
00330 #else
00331         errno = ENOSYS;
00332         return -1;
00333 #endif
00334     }
00335     else
00336         return 0;
00337 }
00338 
00339 /*
00340  * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
00341  *
00342  * Not all platforms have fdatasync; treat as fsync if not available.
00343  */
00344 int
00345 pg_fdatasync(int fd)
00346 {
00347     if (enableFsync)
00348     {
00349 #ifdef HAVE_FDATASYNC
00350         return fdatasync(fd);
00351 #else
00352         return fsync(fd);
00353 #endif
00354     }
00355     else
00356         return 0;
00357 }
00358 
00359 /*
00360  * pg_flush_data --- advise OS that the data described won't be needed soon
00361  *
00362  * Not all platforms have sync_file_range or posix_fadvise; treat as no-op
00363  * if not available.  Also, treat as no-op if enableFsync is off; this is
00364  * because the call isn't free, and some platforms such as Linux will actually
00365  * block the requestor until the write is scheduled.
00366  */
00367 int
00368 pg_flush_data(int fd, off_t offset, off_t amount)
00369 {
00370     if (enableFsync)
00371     {
00372 #if defined(HAVE_SYNC_FILE_RANGE)
00373         return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
00374 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
00375         return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
00376 #endif
00377     }
00378     return 0;
00379 }
00380 
00381 
00382 /*
00383  * InitFileAccess --- initialize this module during backend startup
00384  *
00385  * This is called during either normal or standalone backend start.
00386  * It is *not* called in the postmaster.
00387  */
00388 void
00389 InitFileAccess(void)
00390 {
00391     Assert(SizeVfdCache == 0);  /* call me only once */
00392 
00393     /* initialize cache header entry */
00394     VfdCache = (Vfd *) malloc(sizeof(Vfd));
00395     if (VfdCache == NULL)
00396         ereport(FATAL,
00397                 (errcode(ERRCODE_OUT_OF_MEMORY),
00398                  errmsg("out of memory")));
00399 
00400     MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
00401     VfdCache->fd = VFD_CLOSED;
00402 
00403     SizeVfdCache = 1;
00404 
00405     /* register proc-exit hook to ensure temp files are dropped at exit */
00406     on_proc_exit(AtProcExit_Files, 0);
00407 }
00408 
00409 /*
00410  * count_usable_fds --- count how many FDs the system will let us open,
00411  *      and estimate how many are already open.
00412  *
00413  * We stop counting if usable_fds reaches max_to_probe.  Note: a small
00414  * value of max_to_probe might result in an underestimate of already_open;
00415  * we must fill in any "gaps" in the set of used FDs before the calculation
00416  * of already_open will give the right answer.  In practice, max_to_probe
00417  * of a couple of dozen should be enough to ensure good results.
00418  *
00419  * We assume stdin (FD 0) is available for dup'ing
00420  */
00421 static void
00422 count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
00423 {
00424     int        *fd;
00425     int         size;
00426     int         used = 0;
00427     int         highestfd = 0;
00428     int         j;
00429 
00430 #ifdef HAVE_GETRLIMIT
00431     struct rlimit rlim;
00432     int         getrlimit_status;
00433 #endif
00434 
00435     size = 1024;
00436     fd = (int *) palloc(size * sizeof(int));
00437 
00438 #ifdef HAVE_GETRLIMIT
00439 #ifdef RLIMIT_NOFILE            /* most platforms use RLIMIT_NOFILE */
00440     getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
00441 #else                           /* but BSD doesn't ... */
00442     getrlimit_status = getrlimit(RLIMIT_OFILE, &rlim);
00443 #endif   /* RLIMIT_NOFILE */
00444     if (getrlimit_status != 0)
00445         ereport(WARNING, (errmsg("getrlimit failed: %m")));
00446 #endif   /* HAVE_GETRLIMIT */
00447 
00448     /* dup until failure or probe limit reached */
00449     for (;;)
00450     {
00451         int         thisfd;
00452 
00453 #ifdef HAVE_GETRLIMIT
00454 
00455         /*
00456          * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
00457          * some platforms
00458          */
00459         if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
00460             break;
00461 #endif
00462 
00463         thisfd = dup(0);
00464         if (thisfd < 0)
00465         {
00466             /* Expect EMFILE or ENFILE, else it's fishy */
00467             if (errno != EMFILE && errno != ENFILE)
00468                 elog(WARNING, "dup(0) failed after %d successes: %m", used);
00469             break;
00470         }
00471 
00472         if (used >= size)
00473         {
00474             size *= 2;
00475             fd = (int *) repalloc(fd, size * sizeof(int));
00476         }
00477         fd[used++] = thisfd;
00478 
00479         if (highestfd < thisfd)
00480             highestfd = thisfd;
00481 
00482         if (used >= max_to_probe)
00483             break;
00484     }
00485 
00486     /* release the files we opened */
00487     for (j = 0; j < used; j++)
00488         close(fd[j]);
00489 
00490     pfree(fd);
00491 
00492     /*
00493      * Return results.  usable_fds is just the number of successful dups. We
00494      * assume that the system limit is highestfd+1 (remember 0 is a legal FD
00495      * number) and so already_open is highestfd+1 - usable_fds.
00496      */
00497     *usable_fds = used;
00498     *already_open = highestfd + 1 - used;
00499 }
00500 
00501 /*
00502  * set_max_safe_fds
00503  *      Determine number of filedescriptors that fd.c is allowed to use
00504  */
00505 void
00506 set_max_safe_fds(void)
00507 {
00508     int         usable_fds;
00509     int         already_open;
00510 
00511     /*----------
00512      * We want to set max_safe_fds to
00513      *          MIN(usable_fds, max_files_per_process - already_open)
00514      * less the slop factor for files that are opened without consulting
00515      * fd.c.  This ensures that we won't exceed either max_files_per_process
00516      * or the experimentally-determined EMFILE limit.
00517      *----------
00518      */
00519     count_usable_fds(max_files_per_process,
00520                      &usable_fds, &already_open);
00521 
00522     max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
00523 
00524     /*
00525      * Take off the FDs reserved for system() etc.
00526      */
00527     max_safe_fds -= NUM_RESERVED_FDS;
00528 
00529     /*
00530      * Make sure we still have enough to get by.
00531      */
00532     if (max_safe_fds < FD_MINFREE)
00533         ereport(FATAL,
00534                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
00535                  errmsg("insufficient file descriptors available to start server process"),
00536                  errdetail("System allows %d, we need at least %d.",
00537                            max_safe_fds + NUM_RESERVED_FDS,
00538                            FD_MINFREE + NUM_RESERVED_FDS)));
00539 
00540     elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
00541          max_safe_fds, usable_fds, already_open);
00542 }
00543 
00544 /*
00545  * BasicOpenFile --- same as open(2) except can free other FDs if needed
00546  *
00547  * This is exported for use by places that really want a plain kernel FD,
00548  * but need to be proof against running out of FDs.  Once an FD has been
00549  * successfully returned, it is the caller's responsibility to ensure that
00550  * it will not be leaked on ereport()!  Most users should *not* call this
00551  * routine directly, but instead use the VFD abstraction level, which
00552  * provides protection against descriptor leaks as well as management of
00553  * files that need to be open for more than a short period of time.
00554  *
00555  * Ideally this should be the *only* direct call of open() in the backend.
00556  * In practice, the postmaster calls open() directly, and there are some
00557  * direct open() calls done early in backend startup.  Those are OK since
00558  * this module wouldn't have any open files to close at that point anyway.
00559  */
00560 int
00561 BasicOpenFile(FileName fileName, int fileFlags, int fileMode)
00562 {
00563     int         fd;
00564 
00565 tryAgain:
00566     fd = open(fileName, fileFlags, fileMode);
00567 
00568     if (fd >= 0)
00569         return fd;              /* success! */
00570 
00571     if (errno == EMFILE || errno == ENFILE)
00572     {
00573         int         save_errno = errno;
00574 
00575         ereport(LOG,
00576                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
00577                  errmsg("out of file descriptors: %m; release and retry")));
00578         errno = 0;
00579         if (ReleaseLruFile())
00580             goto tryAgain;
00581         errno = save_errno;
00582     }
00583 
00584     return -1;                  /* failure */
00585 }
00586 
#if defined(FDDEBUG)

/*
 * Debug aid: write the LRU ring to the log as one line, walking from the
 * most recently used entry towards the least recently used one.
 * Output is silently truncated to the size of the local buffer.
 */
static void
_dump_lru(void)
{
    int         cur = VfdCache[0].lruLessRecently;
    char        line[2048];
    size_t      len;

    snprintf(line, sizeof(line), "LRU: MOST %d ", cur);
    while (cur != 0)
    {
        /* step to the next-older entry and append its index */
        cur = VfdCache[cur].lruLessRecently;
        len = strlen(line);
        snprintf(line + len, sizeof(line) - len, "%d ", cur);
    }
    len = strlen(line);
    snprintf(line + len, sizeof(line) - len, "LEAST");
    elog(LOG, "%s", line);
}
#endif   /* FDDEBUG */
00607 
00608 static void
00609 Delete(File file)
00610 {
00611     Vfd        *vfdP;
00612 
00613     Assert(file != 0);
00614 
00615     DO_DB(elog(LOG, "Delete %d (%s)",
00616                file, VfdCache[file].fileName));
00617     DO_DB(_dump_lru());
00618 
00619     vfdP = &VfdCache[file];
00620 
00621     VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
00622     VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
00623 
00624     DO_DB(_dump_lru());
00625 }
00626 
00627 static void
00628 LruDelete(File file)
00629 {
00630     Vfd        *vfdP;
00631 
00632     Assert(file != 0);
00633 
00634     DO_DB(elog(LOG, "LruDelete %d (%s)",
00635                file, VfdCache[file].fileName));
00636 
00637     vfdP = &VfdCache[file];
00638 
00639     /* delete the vfd record from the LRU ring */
00640     Delete(file);
00641 
00642     /* save the seek position */
00643     vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
00644     Assert(vfdP->seekPos != (off_t) -1);
00645 
00646     /* close the file */
00647     if (close(vfdP->fd))
00648         elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);
00649 
00650     --nfile;
00651     vfdP->fd = VFD_CLOSED;
00652 }
00653 
00654 static void
00655 Insert(File file)
00656 {
00657     Vfd        *vfdP;
00658 
00659     Assert(file != 0);
00660 
00661     DO_DB(elog(LOG, "Insert %d (%s)",
00662                file, VfdCache[file].fileName));
00663     DO_DB(_dump_lru());
00664 
00665     vfdP = &VfdCache[file];
00666 
00667     vfdP->lruMoreRecently = 0;
00668     vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
00669     VfdCache[0].lruLessRecently = file;
00670     VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
00671 
00672     DO_DB(_dump_lru());
00673 }
00674 
00675 /* returns 0 on success, -1 on re-open failure (with errno set) */
00676 static int
00677 LruInsert(File file)
00678 {
00679     Vfd        *vfdP;
00680 
00681     Assert(file != 0);
00682 
00683     DO_DB(elog(LOG, "LruInsert %d (%s)",
00684                file, VfdCache[file].fileName));
00685 
00686     vfdP = &VfdCache[file];
00687 
00688     if (FileIsNotOpen(file))
00689     {
00690         while (nfile + numAllocatedDescs >= max_safe_fds)
00691         {
00692             if (!ReleaseLruFile())
00693                 break;
00694         }
00695 
00696         /*
00697          * The open could still fail for lack of file descriptors, eg due to
00698          * overall system file table being full.  So, be prepared to release
00699          * another FD if necessary...
00700          */
00701         vfdP->fd = BasicOpenFile(vfdP->fileName, vfdP->fileFlags,
00702                                  vfdP->fileMode);
00703         if (vfdP->fd < 0)
00704         {
00705             DO_DB(elog(LOG, "RE_OPEN FAILED: %d", errno));
00706             return vfdP->fd;
00707         }
00708         else
00709         {
00710             DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
00711             ++nfile;
00712         }
00713 
00714         /* seek to the right position */
00715         if (vfdP->seekPos != (off_t) 0)
00716         {
00717             off_t returnValue PG_USED_FOR_ASSERTS_ONLY;
00718 
00719             returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
00720             Assert(returnValue != (off_t) -1);
00721         }
00722     }
00723 
00724     /*
00725      * put it at the head of the Lru ring
00726      */
00727 
00728     Insert(file);
00729 
00730     return 0;
00731 }
00732 
00733 static bool
00734 ReleaseLruFile(void)
00735 {
00736     DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
00737 
00738     if (nfile > 0)
00739     {
00740         /*
00741          * There are opened files and so there should be at least one used vfd
00742          * in the ring.
00743          */
00744         Assert(VfdCache[0].lruMoreRecently != 0);
00745         LruDelete(VfdCache[0].lruMoreRecently);
00746         return true;            /* freed a file */
00747     }
00748     return false;               /* no files available to free */
00749 }
00750 
00751 static File
00752 AllocateVfd(void)
00753 {
00754     Index       i;
00755     File        file;
00756 
00757     DO_DB(elog(LOG, "AllocateVfd. Size %lu", SizeVfdCache));
00758 
00759     Assert(SizeVfdCache > 0);   /* InitFileAccess not called? */
00760 
00761     if (VfdCache[0].nextFree == 0)
00762     {
00763         /*
00764          * The free list is empty so it is time to increase the size of the
00765          * array.  We choose to double it each time this happens. However,
00766          * there's not much point in starting *real* small.
00767          */
00768         Size        newCacheSize = SizeVfdCache * 2;
00769         Vfd        *newVfdCache;
00770 
00771         if (newCacheSize < 32)
00772             newCacheSize = 32;
00773 
00774         /*
00775          * Be careful not to clobber VfdCache ptr if realloc fails.
00776          */
00777         newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
00778         if (newVfdCache == NULL)
00779             ereport(ERROR,
00780                     (errcode(ERRCODE_OUT_OF_MEMORY),
00781                      errmsg("out of memory")));
00782         VfdCache = newVfdCache;
00783 
00784         /*
00785          * Initialize the new entries and link them into the free list.
00786          */
00787         for (i = SizeVfdCache; i < newCacheSize; i++)
00788         {
00789             MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
00790             VfdCache[i].nextFree = i + 1;
00791             VfdCache[i].fd = VFD_CLOSED;
00792         }
00793         VfdCache[newCacheSize - 1].nextFree = 0;
00794         VfdCache[0].nextFree = SizeVfdCache;
00795 
00796         /*
00797          * Record the new size
00798          */
00799         SizeVfdCache = newCacheSize;
00800     }
00801 
00802     file = VfdCache[0].nextFree;
00803 
00804     VfdCache[0].nextFree = VfdCache[file].nextFree;
00805 
00806     return file;
00807 }
00808 
00809 static void
00810 FreeVfd(File file)
00811 {
00812     Vfd        *vfdP = &VfdCache[file];
00813 
00814     DO_DB(elog(LOG, "FreeVfd: %d (%s)",
00815                file, vfdP->fileName ? vfdP->fileName : ""));
00816 
00817     if (vfdP->fileName != NULL)
00818     {
00819         free(vfdP->fileName);
00820         vfdP->fileName = NULL;
00821     }
00822     vfdP->fdstate = 0x0;
00823 
00824     vfdP->nextFree = VfdCache[0].nextFree;
00825     VfdCache[0].nextFree = file;
00826 }
00827 
00828 /* returns 0 on success, -1 on re-open failure (with errno set) */
00829 static int
00830 FileAccess(File file)
00831 {
00832     int         returnValue;
00833 
00834     DO_DB(elog(LOG, "FileAccess %d (%s)",
00835                file, VfdCache[file].fileName));
00836 
00837     /*
00838      * Is the file open?  If not, open it and put it at the head of the LRU
00839      * ring (possibly closing the least recently used file to get an FD).
00840      */
00841 
00842     if (FileIsNotOpen(file))
00843     {
00844         returnValue = LruInsert(file);
00845         if (returnValue != 0)
00846             return returnValue;
00847     }
00848     else if (VfdCache[0].lruLessRecently != file)
00849     {
00850         /*
00851          * We now know that the file is open and that it is not the last one
00852          * accessed, so we need to move it to the head of the Lru ring.
00853          */
00854 
00855         Delete(file);
00856         Insert(file);
00857     }
00858 
00859     return 0;
00860 }
00861 
/*
 *  Called when we get a shared invalidation message on some relation.
 *
 *  Gives up the kernel descriptor of the VFD, if it has one; the VFD itself
 *  remains allocated.  Compiled only when NOT_USED is defined.
 */
#ifdef NOT_USED
void
FileInvalidate(File file)
{
    Assert(FileIsValid(file));

    if (FileIsNotOpen(file))
        return;                 /* nothing to close */

    LruDelete(file);
}
#endif
00874 
00875 /*
00876  * open a file in an arbitrary directory
00877  *
00878  * NB: if the passed pathname is relative (which it usually is),
00879  * it will be interpreted relative to the process' working directory
00880  * (which should always be $PGDATA when this code is running).
00881  */
00882 File
00883 PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
00884 {
00885     char       *fnamecopy;
00886     File        file;
00887     Vfd        *vfdP;
00888 
00889     DO_DB(elog(LOG, "PathNameOpenFile: %s %x %o",
00890                fileName, fileFlags, fileMode));
00891 
00892     /*
00893      * We need a malloc'd copy of the file name; fail cleanly if no room.
00894      */
00895     fnamecopy = strdup(fileName);
00896     if (fnamecopy == NULL)
00897         ereport(ERROR,
00898                 (errcode(ERRCODE_OUT_OF_MEMORY),
00899                  errmsg("out of memory")));
00900 
00901     file = AllocateVfd();
00902     vfdP = &VfdCache[file];
00903 
00904     while (nfile + numAllocatedDescs >= max_safe_fds)
00905     {
00906         if (!ReleaseLruFile())
00907             break;
00908     }
00909 
00910     vfdP->fd = BasicOpenFile(fileName, fileFlags, fileMode);
00911 
00912     if (vfdP->fd < 0)
00913     {
00914         FreeVfd(file);
00915         free(fnamecopy);
00916         return -1;
00917     }
00918     ++nfile;
00919     DO_DB(elog(LOG, "PathNameOpenFile: success %d",
00920                vfdP->fd));
00921 
00922     Insert(file);
00923 
00924     vfdP->fileName = fnamecopy;
00925     /* Saved flags are adjusted to be OK for re-opening file */
00926     vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
00927     vfdP->fileMode = fileMode;
00928     vfdP->seekPos = 0;
00929     vfdP->fileSize = 0;
00930     vfdP->fdstate = 0x0;
00931     vfdP->resowner = NULL;
00932 
00933     return file;
00934 }
00935 
00936 /*
00937  * Open a temporary file that will disappear when we close it.
00938  *
00939  * This routine takes care of generating an appropriate tempfile name.
00940  * There's no need to pass in fileFlags or fileMode either, since only
00941  * one setting makes any sense for a temp file.
00942  *
00943  * Unless interXact is true, the file is remembered by CurrentResourceOwner
00944  * to ensure it's closed and deleted when it's no longer needed, typically at
00945  * the end-of-transaction. In most cases, you don't want temporary files to
00946  * outlive the transaction that created them, so this should be false -- but
00947  * if you need "somewhat" temporary storage, this might be useful. In either
00948  * case, the file is removed when the File is explicitly closed.
00949  */
00950 File
00951 OpenTemporaryFile(bool interXact)
00952 {
00953     File        file = 0;
00954 
00955     /*
00956      * If some temp tablespace(s) have been given to us, try to use the next
00957      * one.  If a given tablespace can't be found, we silently fall back to
00958      * the database's default tablespace.
00959      *
00960      * BUT: if the temp file is slated to outlive the current transaction,
00961      * force it into the database's default tablespace, so that it will not
00962      * pose a threat to possible tablespace drop attempts.
00963      */
00964     if (numTempTableSpaces > 0 && !interXact)
00965     {
00966         Oid         tblspcOid = GetNextTempTableSpace();
00967 
00968         if (OidIsValid(tblspcOid))
00969             file = OpenTemporaryFileInTablespace(tblspcOid, false);
00970     }
00971 
00972     /*
00973      * If not, or if tablespace is bad, create in database's default
00974      * tablespace.  MyDatabaseTableSpace should normally be set before we get
00975      * here, but just in case it isn't, fall back to pg_default tablespace.
00976      */
00977     if (file <= 0)
00978         file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
00979                                              MyDatabaseTableSpace :
00980                                              DEFAULTTABLESPACE_OID,
00981                                              true);
00982 
00983     /* Mark it for deletion at close */
00984     VfdCache[file].fdstate |= FD_TEMPORARY;
00985 
00986     /* Register it with the current resource owner */
00987     if (!interXact)
00988     {
00989         VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
00990 
00991         ResourceOwnerEnlargeFiles(CurrentResourceOwner);
00992         ResourceOwnerRememberFile(CurrentResourceOwner, file);
00993         VfdCache[file].resowner = CurrentResourceOwner;
00994 
00995         /* ensure cleanup happens at eoxact */
00996         have_xact_temporary_files = true;
00997     }
00998 
00999     return file;
01000 }
01001 
01002 /*
01003  * Open a temporary file in a specific tablespace.
01004  * Subroutine for OpenTemporaryFile, which see for details.
01005  */
01006 static File
01007 OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
01008 {
01009     char        tempdirpath[MAXPGPATH];
01010     char        tempfilepath[MAXPGPATH];
01011     File        file;
01012 
01013     /*
01014      * Identify the tempfile directory for this tablespace.
01015      *
01016      * If someone tries to specify pg_global, use pg_default instead.
01017      */
01018     if (tblspcOid == DEFAULTTABLESPACE_OID ||
01019         tblspcOid == GLOBALTABLESPACE_OID)
01020     {
01021         /* The default tablespace is {datadir}/base */
01022         snprintf(tempdirpath, sizeof(tempdirpath), "base/%s",
01023                  PG_TEMP_FILES_DIR);
01024     }
01025     else
01026     {
01027         /* All other tablespaces are accessed via symlinks */
01028         snprintf(tempdirpath, sizeof(tempdirpath), "pg_tblspc/%u/%s/%s",
01029                  tblspcOid, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
01030     }
01031 
01032     /*
01033      * Generate a tempfile name that should be unique within the current
01034      * database instance.
01035      */
01036     snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
01037              tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
01038 
01039     /*
01040      * Open the file.  Note: we don't use O_EXCL, in case there is an orphaned
01041      * temp file that can be reused.
01042      */
01043     file = PathNameOpenFile(tempfilepath,
01044                             O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
01045                             0600);
01046     if (file <= 0)
01047     {
01048         /*
01049          * We might need to create the tablespace's tempfile directory, if no
01050          * one has yet done so.
01051          *
01052          * Don't check for error from mkdir; it could fail if someone else
01053          * just did the same thing.  If it doesn't work then we'll bomb out on
01054          * the second create attempt, instead.
01055          */
01056         mkdir(tempdirpath, S_IRWXU);
01057 
01058         file = PathNameOpenFile(tempfilepath,
01059                                 O_RDWR | O_CREAT | O_TRUNC | PG_BINARY,
01060                                 0600);
01061         if (file <= 0 && rejectError)
01062             elog(ERROR, "could not create temporary file \"%s\": %m",
01063                  tempfilepath);
01064     }
01065 
01066     return file;
01067 }
01068 
/*
 * close a file when done with it
 *
 * Closes the kernel descriptor if the VFD currently has one, deletes the
 * underlying file if the VFD is marked FD_TEMPORARY (reporting its size),
 * unregisters the File from its resource owner if it has one, and finally
 * returns the VFD slot to the free list with FreeVfd().
 *
 * A failing close() is reported with elog(ERROR); unlink/stat failures on
 * a temporary file are only logged.
 */
void
FileClose(File file)
{
    Vfd        *vfdP;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileClose: %d (%s)",
               file, VfdCache[file].fileName));

    vfdP = &VfdCache[file];

    if (!FileIsNotOpen(file))
    {
        /* remove the file from the lru ring */
        Delete(file);

        /* close the file */
        if (close(vfdP->fd))
            elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);

        /* one fewer kernel descriptor in use; mark the VFD as closed */
        --nfile;
        vfdP->fd = VFD_CLOSED;
    }

    /*
     * Delete the file if it was temporary, and make a log entry if wanted
     */
    if (vfdP->fdstate & FD_TEMPORARY)
    {
        struct stat filestats;
        int         stat_errno;

        /*
         * If we get an error, as could happen within the ereport/elog calls,
         * we'll come right back here during transaction abort.  Reset the
         * flag to ensure that we can't get into an infinite loop.  This code
         * is arranged to ensure that the worst-case consequence is failing to
         * emit log message(s), not failing to attempt the unlink.
         */
        vfdP->fdstate &= ~FD_TEMPORARY;

        /* Subtract its size from current usage (do first in case of error) */
        temporary_files_size -= vfdP->fileSize;
        vfdP->fileSize = 0;

        /*
         * first try the stat(); remember its errno now, since the unlink and
         * logging calls below may overwrite errno before we report it
         */
        if (stat(vfdP->fileName, &filestats))
            stat_errno = errno;
        else
            stat_errno = 0;

        /* in any case do the unlink */
        if (unlink(vfdP->fileName))
            elog(LOG, "could not unlink file \"%s\": %m", vfdP->fileName);

        /* and last report the stat results */
        if (stat_errno == 0)
        {
            pgstat_report_tempfile(filestats.st_size);

            /* a negative log_temp_files skips the log entry entirely */
            if (log_temp_files >= 0)
            {
                /* size is compared in kilobytes against log_temp_files */
                if ((filestats.st_size / 1024) >= log_temp_files)
                    ereport(LOG,
                            (errmsg("temporary file: path \"%s\", size %lu",
                                    vfdP->fileName,
                                    (unsigned long) filestats.st_size)));
            }
        }
        else
        {
            /* restore stat's errno so that %m describes the stat failure */
            errno = stat_errno;
            elog(LOG, "could not stat file \"%s\": %m", vfdP->fileName);
        }
    }

    /* Unregister it from the resource owner */
    if (vfdP->resowner)
        ResourceOwnerForgetFile(vfdP->resowner, file);

    /*
     * Return the Vfd slot to the free list
     */
    FreeVfd(file);
}
01158 
01159 /*
01160  * FilePrefetch - initiate asynchronous read of a given range of the file.
01161  * The logical seek position is unaffected.
01162  *
01163  * Currently the only implementation of this function is using posix_fadvise
01164  * which is the simplest standardized interface that accomplishes this.
01165  * We could add an implementation using libaio in the future; but note that
01166  * this API is inappropriate for libaio, which wants to have a buffer provided
01167  * to read into.
01168  */
01169 int
01170 FilePrefetch(File file, off_t offset, int amount)
01171 {
01172 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
01173     int         returnCode;
01174 
01175     Assert(FileIsValid(file));
01176 
01177     DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d",
01178                file, VfdCache[file].fileName,
01179                (int64) offset, amount));
01180 
01181     returnCode = FileAccess(file);
01182     if (returnCode < 0)
01183         return returnCode;
01184 
01185     returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
01186                                POSIX_FADV_WILLNEED);
01187 
01188     return returnCode;
01189 #else
01190     Assert(FileIsValid(file));
01191     return 0;
01192 #endif
01193 }
01194 
01195 int
01196 FileRead(File file, char *buffer, int amount)
01197 {
01198     int         returnCode;
01199 
01200     Assert(FileIsValid(file));
01201 
01202     DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
01203                file, VfdCache[file].fileName,
01204                (int64) VfdCache[file].seekPos,
01205                amount, buffer));
01206 
01207     returnCode = FileAccess(file);
01208     if (returnCode < 0)
01209         return returnCode;
01210 
01211 retry:
01212     returnCode = read(VfdCache[file].fd, buffer, amount);
01213 
01214     if (returnCode >= 0)
01215         VfdCache[file].seekPos += returnCode;
01216     else
01217     {
01218         /*
01219          * Windows may run out of kernel buffers and return "Insufficient
01220          * system resources" error.  Wait a bit and retry to solve it.
01221          *
01222          * It is rumored that EINTR is also possible on some Unix filesystems,
01223          * in which case immediate retry is indicated.
01224          */
01225 #ifdef WIN32
01226         DWORD       error = GetLastError();
01227 
01228         switch (error)
01229         {
01230             case ERROR_NO_SYSTEM_RESOURCES:
01231                 pg_usleep(1000L);
01232                 errno = EINTR;
01233                 break;
01234             default:
01235                 _dosmaperr(error);
01236                 break;
01237         }
01238 #endif
01239         /* OK to retry if interrupted */
01240         if (errno == EINTR)
01241             goto retry;
01242 
01243         /* Trouble, so assume we don't know the file position anymore */
01244         VfdCache[file].seekPos = FileUnknownPos;
01245     }
01246 
01247     return returnCode;
01248 }
01249 
01250 int
01251 FileWrite(File file, char *buffer, int amount)
01252 {
01253     int         returnCode;
01254 
01255     Assert(FileIsValid(file));
01256 
01257     DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
01258                file, VfdCache[file].fileName,
01259                (int64) VfdCache[file].seekPos,
01260                amount, buffer));
01261 
01262     returnCode = FileAccess(file);
01263     if (returnCode < 0)
01264         return returnCode;
01265 
01266     /*
01267      * If enforcing temp_file_limit and it's a temp file, check to see if the
01268      * write would overrun temp_file_limit, and throw error if so.  Note: it's
01269      * really a modularity violation to throw error here; we should set errno
01270      * and return -1.  However, there's no way to report a suitable error
01271      * message if we do that.  All current callers would just throw error
01272      * immediately anyway, so this is safe at present.
01273      */
01274     if (temp_file_limit >= 0 && (VfdCache[file].fdstate & FD_TEMPORARY))
01275     {
01276         off_t       newPos = VfdCache[file].seekPos + amount;
01277 
01278         if (newPos > VfdCache[file].fileSize)
01279         {
01280             uint64      newTotal = temporary_files_size;
01281 
01282             newTotal += newPos - VfdCache[file].fileSize;
01283             if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
01284                 ereport(ERROR,
01285                         (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
01286                  errmsg("temporary file size exceeds temp_file_limit (%dkB)",
01287                         temp_file_limit)));
01288         }
01289     }
01290 
01291 retry:
01292     errno = 0;
01293     returnCode = write(VfdCache[file].fd, buffer, amount);
01294 
01295     /* if write didn't set errno, assume problem is no disk space */
01296     if (returnCode != amount && errno == 0)
01297         errno = ENOSPC;
01298 
01299     if (returnCode >= 0)
01300     {
01301         VfdCache[file].seekPos += returnCode;
01302 
01303         /* maintain fileSize and temporary_files_size if it's a temp file */
01304         if (VfdCache[file].fdstate & FD_TEMPORARY)
01305         {
01306             off_t       newPos = VfdCache[file].seekPos;
01307 
01308             if (newPos > VfdCache[file].fileSize)
01309             {
01310                 temporary_files_size += newPos - VfdCache[file].fileSize;
01311                 VfdCache[file].fileSize = newPos;
01312             }
01313         }
01314     }
01315     else
01316     {
01317         /*
01318          * See comments in FileRead()
01319          */
01320 #ifdef WIN32
01321         DWORD       error = GetLastError();
01322 
01323         switch (error)
01324         {
01325             case ERROR_NO_SYSTEM_RESOURCES:
01326                 pg_usleep(1000L);
01327                 errno = EINTR;
01328                 break;
01329             default:
01330                 _dosmaperr(error);
01331                 break;
01332         }
01333 #endif
01334         /* OK to retry if interrupted */
01335         if (errno == EINTR)
01336             goto retry;
01337 
01338         /* Trouble, so assume we don't know the file position anymore */
01339         VfdCache[file].seekPos = FileUnknownPos;
01340     }
01341 
01342     return returnCode;
01343 }
01344 
01345 int
01346 FileSync(File file)
01347 {
01348     int         returnCode;
01349 
01350     Assert(FileIsValid(file));
01351 
01352     DO_DB(elog(LOG, "FileSync: %d (%s)",
01353                file, VfdCache[file].fileName));
01354 
01355     returnCode = FileAccess(file);
01356     if (returnCode < 0)
01357         return returnCode;
01358 
01359     return pg_fsync(VfdCache[file].fd);
01360 }
01361 
/*
 * FileSeek - change the logical seek position of a VFD.
 *
 * 'whence' is SEEK_SET, SEEK_CUR or SEEK_END; any other value raises
 * ERROR, as does a negative offset with SEEK_SET.
 *
 * If the VFD has no kernel descriptor open, SEEK_SET and SEEK_CUR merely
 * update the cached position; only SEEK_END forces the file to be opened
 * (via FileAccess) so that lseek() can be asked.  If the VFD is open,
 * lseek() is called except where the cached position already gives the
 * answer.
 *
 * Returns the cached seek position afterwards (which is lseek()'s result
 * wherever lseek() was called), or FileAccess()'s negative result if the
 * file could not be opened for SEEK_END.
 */
off_t
FileSeek(File file, off_t offset, int whence)
{
    int         returnCode;

    Assert(FileIsValid(file));

    DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
               file, VfdCache[file].fileName,
               (int64) VfdCache[file].seekPos,
               (int64) offset, whence));

    if (FileIsNotOpen(file))
    {
        /* No kernel descriptor: track the position in the VFD where we can */
        switch (whence)
        {
            case SEEK_SET:
                if (offset < 0)
                    elog(ERROR, "invalid seek offset: " INT64_FORMAT,
                         (int64) offset);
                VfdCache[file].seekPos = offset;
                break;
            case SEEK_CUR:
                VfdCache[file].seekPos += offset;
                break;
            case SEEK_END:
                /* end-relative position is not cached; must ask the kernel */
                returnCode = FileAccess(file);
                if (returnCode < 0)
                    return returnCode;
                VfdCache[file].seekPos = lseek(VfdCache[file].fd,
                                               offset, whence);
                break;
            default:
                elog(ERROR, "invalid whence: %d", whence);
                break;
        }
    }
    else
    {
        /* Descriptor is open: call lseek() unless it would be redundant */
        switch (whence)
        {
            case SEEK_SET:
                if (offset < 0)
                    elog(ERROR, "invalid seek offset: " INT64_FORMAT,
                         (int64) offset);
                /* skip the system call if we believe we are already there */
                if (VfdCache[file].seekPos != offset)
                    VfdCache[file].seekPos = lseek(VfdCache[file].fd,
                                                   offset, whence);
                break;
            case SEEK_CUR:
                /* a zero-offset seek is a no-op unless the position is unknown */
                if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
                    VfdCache[file].seekPos = lseek(VfdCache[file].fd,
                                                   offset, whence);
                break;
            case SEEK_END:
                VfdCache[file].seekPos = lseek(VfdCache[file].fd,
                                               offset, whence);
                break;
            default:
                elog(ERROR, "invalid whence: %d", whence);
                break;
        }
    }
    return VfdCache[file].seekPos;
}
01427 
/*
 * FileTell - report the cached seek position of a VFD.
 *
 * XXX not actually used but here for completeness; only compiled when
 * NOT_USED is defined.
 */
#ifdef NOT_USED
off_t
FileTell(File file)
{
    Vfd        *vfdP;

    Assert(FileIsValid(file));

    vfdP = &VfdCache[file];
    DO_DB(elog(LOG, "FileTell %d (%s)",
               file, vfdP->fileName));

    return vfdP->seekPos;
}
#endif
01441 
01442 int
01443 FileTruncate(File file, off_t offset)
01444 {
01445     int         returnCode;
01446 
01447     Assert(FileIsValid(file));
01448 
01449     DO_DB(elog(LOG, "FileTruncate %d (%s)",
01450                file, VfdCache[file].fileName));
01451 
01452     returnCode = FileAccess(file);
01453     if (returnCode < 0)
01454         return returnCode;
01455 
01456     returnCode = ftruncate(VfdCache[file].fd, offset);
01457 
01458     if (returnCode == 0 && VfdCache[file].fileSize > offset)
01459     {
01460         /* adjust our state for truncation of a temp file */
01461         Assert(VfdCache[file].fdstate & FD_TEMPORARY);
01462         temporary_files_size -= VfdCache[file].fileSize - offset;
01463         VfdCache[file].fileSize = offset;
01464     }
01465 
01466     return returnCode;
01467 }
01468 
01469 /*
01470  * Return the pathname associated with an open file.
01471  *
01472  * The returned string points to an internal buffer, which is valid until
01473  * the file is closed.
01474  */
01475 char *
01476 FilePathName(File file)
01477 {
01478     Assert(FileIsValid(file));
01479 
01480     return VfdCache[file].fileName;
01481 }
01482 
01483 
01484 /*
01485  * Routines that want to use stdio (ie, FILE*) should use AllocateFile
01486  * rather than plain fopen().  This lets fd.c deal with freeing FDs if
01487  * necessary to open the file.  When done, call FreeFile rather than fclose.
01488  *
01489  * Note that files that will be open for any significant length of time
01490  * should NOT be handled this way, since they cannot share kernel file
01491  * descriptors with other files; there is grave risk of running out of FDs
01492  * if anyone locks down too many FDs.  Most callers of this routine are
01493  * simply reading a config file that they will read and close immediately.
01494  *
01495  * fd.c will automatically close all files opened with AllocateFile at
01496  * transaction commit or abort; this prevents FD leakage if a routine
01497  * that calls AllocateFile is terminated prematurely by ereport(ERROR).
01498  *
01499  * Ideally this should be the *only* direct call of fopen() in the backend.
01500  */
01501 FILE *
01502 AllocateFile(const char *name, const char *mode)
01503 {
01504     FILE       *file;
01505 
01506     DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
01507                numAllocatedDescs, name));
01508 
01509     /*
01510      * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
01511      * allocatedFiles[]; the test against max_safe_fds prevents AllocateFile
01512      * from hogging every one of the available FDs, which'd lead to infinite
01513      * looping.
01514      */
01515     if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
01516         numAllocatedDescs >= max_safe_fds - 1)
01517         elog(ERROR, "exceeded MAX_ALLOCATED_DESCS while trying to open file \"%s\"",
01518              name);
01519 
01520 TryAgain:
01521     if ((file = fopen(name, mode)) != NULL)
01522     {
01523         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
01524 
01525         desc->kind = AllocateDescFile;
01526         desc->desc.file = file;
01527         desc->create_subid = GetCurrentSubTransactionId();
01528         numAllocatedDescs++;
01529         return desc->desc.file;
01530     }
01531 
01532     if (errno == EMFILE || errno == ENFILE)
01533     {
01534         int         save_errno = errno;
01535 
01536         ereport(LOG,
01537                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
01538                  errmsg("out of file descriptors: %m; release and retry")));
01539         errno = 0;
01540         if (ReleaseLruFile())
01541             goto TryAgain;
01542         errno = save_errno;
01543     }
01544 
01545     return NULL;
01546 }
01547 
01548 
01549 /*
01550  * Like AllocateFile, but returns an unbuffered fd like open(2)
01551  */
01552 int
01553 OpenTransientFile(FileName fileName, int fileFlags, int fileMode)
01554 {
01555     int         fd;
01556 
01557 
01558     DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
01559                numAllocatedDescs, fileName));
01560 
01561     /*
01562      * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
01563      * allocatedFiles[]; the test against max_safe_fds prevents BasicOpenFile
01564      * from hogging every one of the available FDs, which'd lead to infinite
01565      * looping.
01566      */
01567     if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
01568         numAllocatedDescs >= max_safe_fds - 1)
01569         elog(ERROR, "exceeded MAX_ALLOCATED_DESCS while trying to open file \"%s\"",
01570              fileName);
01571 
01572     fd = BasicOpenFile(fileName, fileFlags, fileMode);
01573 
01574     if (fd >= 0)
01575     {
01576         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
01577 
01578         desc->kind = AllocateDescRawFD;
01579         desc->desc.fd = fd;
01580         desc->create_subid = GetCurrentSubTransactionId();
01581         numAllocatedDescs++;
01582 
01583         return fd;
01584     }
01585 
01586     return -1;                  /* failure */
01587 }
01588 
01589 /*
01590  * Routines that want to initiate a pipe stream should use OpenPipeStream
01591  * rather than plain popen().  This lets fd.c deal with freeing FDs if
01592  * necessary.  When done, call ClosePipeStream rather than pclose.
01593  */
01594 FILE *
01595 OpenPipeStream(const char *command, const char *mode)
01596 {
01597     FILE       *file;
01598 
01599     DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
01600                numAllocatedDescs, command));
01601 
01602     /*
01603      * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
01604      * allocatedFiles[]; the test against max_safe_fds prevents AllocateFile
01605      * from hogging every one of the available FDs, which'd lead to infinite
01606      * looping.
01607      */
01608     if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
01609         numAllocatedDescs >= max_safe_fds - 1)
01610         elog(ERROR, "exceeded MAX_ALLOCATED_DESCS while trying to execute command \"%s\"",
01611              command);
01612 
01613 TryAgain:
01614     fflush(stdout);
01615     fflush(stderr);
01616     errno = 0;
01617     if ((file = popen(command, mode)) != NULL)
01618     {
01619         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
01620 
01621         desc->kind = AllocateDescPipe;
01622         desc->desc.file = file;
01623         desc->create_subid = GetCurrentSubTransactionId();
01624         numAllocatedDescs++;
01625         return desc->desc.file;
01626     }
01627 
01628     if (errno == EMFILE || errno == ENFILE)
01629     {
01630         int         save_errno = errno;
01631 
01632         ereport(LOG,
01633                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
01634                  errmsg("out of file descriptors: %m; release and retry")));
01635         errno = 0;
01636         if (ReleaseLruFile())
01637             goto TryAgain;
01638         errno = save_errno;
01639     }
01640 
01641     return NULL;
01642 }
01643 
01644 /*
01645  * Free an AllocateDesc of any type.
01646  *
01647  * The argument *must* point into the allocatedDescs[] array.
01648  */
01649 static int
01650 FreeDesc(AllocateDesc *desc)
01651 {
01652     int         result;
01653 
01654     /* Close the underlying object */
01655     switch (desc->kind)
01656     {
01657         case AllocateDescFile:
01658             result = fclose(desc->desc.file);
01659             break;
01660         case AllocateDescPipe:
01661             result = pclose(desc->desc.file);
01662             break;
01663         case AllocateDescDir:
01664             result = closedir(desc->desc.dir);
01665             break;
01666         case AllocateDescRawFD:
01667             result = close(desc->desc.fd);
01668             break;
01669         default:
01670             elog(ERROR, "AllocateDesc kind not recognized");
01671             result = 0;         /* keep compiler quiet */
01672             break;
01673     }
01674 
01675     /* Compact storage in the allocatedDescs array */
01676     numAllocatedDescs--;
01677     *desc = allocatedDescs[numAllocatedDescs];
01678 
01679     return result;
01680 }
01681 
01682 /*
01683  * Close a file returned by AllocateFile.
01684  *
01685  * Note we do not check fclose's return value --- it is up to the caller
01686  * to handle close errors.
01687  */
01688 int
01689 FreeFile(FILE *file)
01690 {
01691     int         i;
01692 
01693     DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
01694 
01695     /* Remove file from list of allocated files, if it's present */
01696     for (i = numAllocatedDescs; --i >= 0;)
01697     {
01698         AllocateDesc *desc = &allocatedDescs[i];
01699 
01700         if (desc->kind == AllocateDescFile && desc->desc.file == file)
01701             return FreeDesc(desc);
01702     }
01703 
01704     /* Only get here if someone passes us a file not in allocatedDescs */
01705     elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
01706 
01707     return fclose(file);
01708 }
01709 
01710 /*
01711  * Close a file returned by OpenTransientFile.
01712  *
01713  * Note we do not check close's return value --- it is up to the caller
01714  * to handle close errors.
01715  */
01716 int
01717 CloseTransientFile(int fd)
01718 {
01719     int         i;
01720 
01721     DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
01722 
01723     /* Remove fd from list of allocated files, if it's present */
01724     for (i = numAllocatedDescs; --i >= 0;)
01725     {
01726         AllocateDesc *desc = &allocatedDescs[i];
01727 
01728         if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
01729             return FreeDesc(desc);
01730     }
01731 
01732     /* Only get here if someone passes us a file not in allocatedDescs */
01733     elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
01734 
01735     return close(fd);
01736 }
01737 
01738 /*
01739  * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
01740  * rather than plain opendir().  This lets fd.c deal with freeing FDs if
01741  * necessary to open the directory, and with closing it after an elog.
01742  * When done, call FreeDir rather than closedir.
01743  *
01744  * Ideally this should be the *only* direct call of opendir() in the backend.
01745  */
01746 DIR *
01747 AllocateDir(const char *dirname)
01748 {
01749     DIR        *dir;
01750 
01751     DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
01752                numAllocatedDescs, dirname));
01753 
01754     /*
01755      * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
01756      * allocatedDescs[]; the test against max_safe_fds prevents AllocateDir
01757      * from hogging every one of the available FDs, which'd lead to infinite
01758      * looping.
01759      */
01760     if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
01761         numAllocatedDescs >= max_safe_fds - 1)
01762         elog(ERROR, "exceeded MAX_ALLOCATED_DESCS while trying to open directory \"%s\"",
01763              dirname);
01764 
01765 TryAgain:
01766     if ((dir = opendir(dirname)) != NULL)
01767     {
01768         AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
01769 
01770         desc->kind = AllocateDescDir;
01771         desc->desc.dir = dir;
01772         desc->create_subid = GetCurrentSubTransactionId();
01773         numAllocatedDescs++;
01774         return desc->desc.dir;
01775     }
01776 
01777     if (errno == EMFILE || errno == ENFILE)
01778     {
01779         int         save_errno = errno;
01780 
01781         ereport(LOG,
01782                 (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
01783                  errmsg("out of file descriptors: %m; release and retry")));
01784         errno = 0;
01785         if (ReleaseLruFile())
01786             goto TryAgain;
01787         errno = save_errno;
01788     }
01789 
01790     return NULL;
01791 }
01792 
01793 /*
01794  * Read a directory opened with AllocateDir, ereport'ing any error.
01795  *
01796  * This is easier to use than raw readdir() since it takes care of some
01797  * otherwise rather tedious and error-prone manipulation of errno.  Also,
01798  * if you are happy with a generic error message for AllocateDir failure,
01799  * you can just do
01800  *
01801  *      dir = AllocateDir(path);
01802  *      while ((dirent = ReadDir(dir, path)) != NULL)
01803  *          process dirent;
01804  *      FreeDir(dir);
01805  *
01806  * since a NULL dir parameter is taken as indicating AllocateDir failed.
01807  * (Make sure errno hasn't been changed since AllocateDir if you use this
01808  * shortcut.)
01809  *
01810  * The pathname passed to AllocateDir must be passed to this routine too,
01811  * but it is only used for error reporting.
01812  */
01813 struct dirent *
01814 ReadDir(DIR *dir, const char *dirname)
01815 {
01816     struct dirent *dent;
01817 
01818     /* Give a generic message for AllocateDir failure, if caller didn't */
01819     if (dir == NULL)
01820         ereport(ERROR,
01821                 (errcode_for_file_access(),
01822                  errmsg("could not open directory \"%s\": %m",
01823                         dirname)));
01824 
01825     errno = 0;
01826     if ((dent = readdir(dir)) != NULL)
01827         return dent;
01828 
01829 #ifdef WIN32
01830 
01831     /*
01832      * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
01833      * released version
01834      */
01835     if (GetLastError() == ERROR_NO_MORE_FILES)
01836         errno = 0;
01837 #endif
01838 
01839     if (errno)
01840         ereport(ERROR,
01841                 (errcode_for_file_access(),
01842                  errmsg("could not read directory \"%s\": %m",
01843                         dirname)));
01844     return NULL;
01845 }
01846 
01847 /*
01848  * Close a directory opened with AllocateDir.
01849  *
01850  * Note we do not check closedir's return value --- it is up to the caller
01851  * to handle close errors.
01852  */
01853 int
01854 FreeDir(DIR *dir)
01855 {
01856     int         i;
01857 
01858     DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
01859 
01860     /* Remove dir from list of allocated dirs, if it's present */
01861     for (i = numAllocatedDescs; --i >= 0;)
01862     {
01863         AllocateDesc *desc = &allocatedDescs[i];
01864 
01865         if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
01866             return FreeDesc(desc);
01867     }
01868 
01869     /* Only get here if someone passes us a dir not in allocatedDescs */
01870     elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
01871 
01872     return closedir(dir);
01873 }
01874 
01875 
01876 /*
01877  * Close a pipe stream returned by OpenPipeStream.
01878  */
01879 int
01880 ClosePipeStream(FILE *file)
01881 {
01882     int         i;
01883 
01884     DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
01885 
01886     /* Remove file from list of allocated files, if it's present */
01887     for (i = numAllocatedDescs; --i >= 0;)
01888     {
01889         AllocateDesc *desc = &allocatedDescs[i];
01890 
01891         if (desc->kind == AllocateDescPipe && desc->desc.file == file)
01892             return FreeDesc(desc);
01893     }
01894 
01895     /* Only get here if someone passes us a file not in allocatedDescs */
01896     elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
01897 
01898     return pclose(file);
01899 }
01900 
01901 /*
01902  * closeAllVfds
01903  *
01904  * Force all VFDs into the physically-closed state, so that the fewest
01905  * possible number of kernel file descriptors are in use.  There is no
01906  * change in the logical state of the VFDs.
01907  */
01908 void
01909 closeAllVfds(void)
01910 {
01911     Index       i;
01912 
01913     if (SizeVfdCache > 0)
01914     {
01915         Assert(FileIsNotOpen(0));       /* Make sure ring not corrupted */
01916         for (i = 1; i < SizeVfdCache; i++)
01917         {
01918             if (!FileIsNotOpen(i))
01919                 LruDelete(i);
01920         }
01921     }
01922 }
01923 
01924 
01925 /*
01926  * SetTempTablespaces
01927  *
01928  * Define a list (actually an array) of OIDs of tablespaces to use for
01929  * temporary files.  This list will be used until end of transaction,
01930  * unless this function is called again before then.  It is caller's
01931  * responsibility that the passed-in array has adequate lifespan (typically
01932  * it'd be allocated in TopTransactionContext).
01933  */
01934 void
01935 SetTempTablespaces(Oid *tableSpaces, int numSpaces)
01936 {
01937     Assert(numSpaces >= 0);
01938     tempTableSpaces = tableSpaces;
01939     numTempTableSpaces = numSpaces;
01940 
01941     /*
01942      * Select a random starting point in the list.  This is to minimize
01943      * conflicts between backends that are most likely sharing the same list
01944      * of temp tablespaces.  Note that if we create multiple temp files in the
01945      * same transaction, we'll advance circularly through the list --- this
01946      * ensures that large temporary sort files are nicely spread across all
01947      * available tablespaces.
01948      */
01949     if (numSpaces > 1)
01950         nextTempTableSpace = random() % numSpaces;
01951     else
01952         nextTempTableSpace = 0;
01953 }
01954 
01955 /*
01956  * TempTablespacesAreSet
01957  *
01958  * Returns TRUE if SetTempTablespaces has been called in current transaction.
01959  * (This is just so that tablespaces.c doesn't need its own per-transaction
01960  * state.)
01961  */
01962 bool
01963 TempTablespacesAreSet(void)
01964 {
01965     return (numTempTableSpaces >= 0);
01966 }
01967 
01968 /*
01969  * GetNextTempTableSpace
01970  *
01971  * Select the next temp tablespace to use.  A result of InvalidOid means
01972  * to use the current database's default tablespace.
01973  */
01974 Oid
01975 GetNextTempTableSpace(void)
01976 {
01977     if (numTempTableSpaces > 0)
01978     {
01979         /* Advance nextTempTableSpace counter with wraparound */
01980         if (++nextTempTableSpace >= numTempTableSpaces)
01981             nextTempTableSpace = 0;
01982         return tempTableSpaces[nextTempTableSpace];
01983     }
01984     return InvalidOid;
01985 }
01986 
01987 
01988 /*
01989  * AtEOSubXact_Files
01990  *
01991  * Take care of subtransaction commit/abort.  At abort, we close temp files
01992  * that the subtransaction may have opened.  At commit, we reassign the
01993  * files that were opened to the parent subtransaction.
01994  */
01995 void
01996 AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
01997                   SubTransactionId parentSubid)
01998 {
01999     Index       i;
02000 
02001     for (i = 0; i < numAllocatedDescs; i++)
02002     {
02003         if (allocatedDescs[i].create_subid == mySubid)
02004         {
02005             if (isCommit)
02006                 allocatedDescs[i].create_subid = parentSubid;
02007             else
02008             {
02009                 /* have to recheck the item after FreeDesc (ugly) */
02010                 FreeDesc(&allocatedDescs[i--]);
02011             }
02012         }
02013     }
02014 }
02015 
02016 /*
02017  * AtEOXact_Files
02018  *
02019  * This routine is called during transaction commit or abort (it doesn't
02020  * particularly care which).  All still-open per-transaction temporary file
02021  * VFDs are closed, which also causes the underlying files to be deleted
02022  * (although they should've been closed already by the ResourceOwner
02023  * cleanup). Furthermore, all "allocated" stdio files are closed. We also
02024  * forget any transaction-local temp tablespace list.
02025  */
02026 void
02027 AtEOXact_Files(void)
02028 {
02029     CleanupTempFiles(false);
02030     tempTableSpaces = NULL;
02031     numTempTableSpaces = -1;
02032 }
02033 
02034 /*
02035  * AtProcExit_Files
02036  *
02037  * on_proc_exit hook to clean up temp files during backend shutdown.
02038  * Here, we want to clean up *all* temp files including interXact ones.
02039  */
02040 static void
02041 AtProcExit_Files(int code, Datum arg)
02042 {
02043     CleanupTempFiles(true);
02044 }
02045 
02046 /*
02047  * Close temporary files and delete their underlying files.
02048  *
02049  * isProcExit: if true, this is being called as the backend process is
02050  * exiting. If that's the case, we should remove all temporary files; if
02051  * that's not the case, we are being called for transaction commit/abort
02052  * and should only remove transaction-local temp files.  In either case,
02053  * also clean up "allocated" stdio files, dirs and fds.
02054  */
02055 static void
02056 CleanupTempFiles(bool isProcExit)
02057 {
02058     Index       i;
02059 
02060     /*
02061      * Careful here: at proc_exit we need extra cleanup, not just
02062      * xact_temporary files.
02063      */
02064     if (isProcExit || have_xact_temporary_files)
02065     {
02066         Assert(FileIsNotOpen(0));       /* Make sure ring not corrupted */
02067         for (i = 1; i < SizeVfdCache; i++)
02068         {
02069             unsigned short fdstate = VfdCache[i].fdstate;
02070 
02071             if ((fdstate & FD_TEMPORARY) && VfdCache[i].fileName != NULL)
02072             {
02073                 /*
02074                  * If we're in the process of exiting a backend process, close
02075                  * all temporary files. Otherwise, only close temporary files
02076                  * local to the current transaction. They should be closed by
02077                  * the ResourceOwner mechanism already, so this is just a
02078                  * debugging cross-check.
02079                  */
02080                 if (isProcExit)
02081                     FileClose(i);
02082                 else if (fdstate & FD_XACT_TEMPORARY)
02083                 {
02084                     elog(WARNING,
02085                          "temporary file %s not closed at end-of-transaction",
02086                          VfdCache[i].fileName);
02087                     FileClose(i);
02088                 }
02089             }
02090         }
02091 
02092         have_xact_temporary_files = false;
02093     }
02094 
02095     /* Clean up "allocated" stdio files, dirs and fds. */
02096     while (numAllocatedDescs > 0)
02097         FreeDesc(&allocatedDescs[0]);
02098 }
02099 
02100 
02101 /*
02102  * Remove temporary and temporary relation files left over from a prior
02103  * postmaster session
02104  *
02105  * This should be called during postmaster startup.  It will forcibly
02106  * remove any leftover files created by OpenTemporaryFile and any leftover
02107  * temporary relation files created by mdcreate.
02108  *
02109  * NOTE: we could, but don't, call this during a post-backend-crash restart
02110  * cycle.  The argument for not doing it is that someone might want to examine
02111  * the temp files for debugging purposes.  This does however mean that
02112  * OpenTemporaryFile had better allow for collision with an existing temp
02113  * file name.
02114  */
02115 void
02116 RemovePgTempFiles(void)
02117 {
02118     char        temp_path[MAXPGPATH];
02119     DIR        *spc_dir;
02120     struct dirent *spc_de;
02121 
02122     /*
02123      * First process temp files in pg_default ($PGDATA/base)
02124      */
02125     snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
02126     RemovePgTempFilesInDir(temp_path);
02127     RemovePgTempRelationFiles("base");
02128 
02129     /*
02130      * Cycle through temp directories for all non-default tablespaces.
02131      */
02132     spc_dir = AllocateDir("pg_tblspc");
02133 
02134     while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
02135     {
02136         if (strcmp(spc_de->d_name, ".") == 0 ||
02137             strcmp(spc_de->d_name, "..") == 0)
02138             continue;
02139 
02140         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
02141             spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
02142         RemovePgTempFilesInDir(temp_path);
02143 
02144         snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
02145                  spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
02146         RemovePgTempRelationFiles(temp_path);
02147     }
02148 
02149     FreeDir(spc_dir);
02150 
02151     /*
02152      * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
02153      * DataDir as well.
02154      */
02155 #ifdef EXEC_BACKEND
02156     RemovePgTempFilesInDir(PG_TEMP_FILES_DIR);
02157 #endif
02158 }
02159 
02160 /* Process one pgsql_tmp directory for RemovePgTempFiles */
02161 static void
02162 RemovePgTempFilesInDir(const char *tmpdirname)
02163 {
02164     DIR        *temp_dir;
02165     struct dirent *temp_de;
02166     char        rm_path[MAXPGPATH];
02167 
02168     temp_dir = AllocateDir(tmpdirname);
02169     if (temp_dir == NULL)
02170     {
02171         /* anything except ENOENT is fishy */
02172         if (errno != ENOENT)
02173             elog(LOG,
02174                  "could not open temporary-files directory \"%s\": %m",
02175                  tmpdirname);
02176         return;
02177     }
02178 
02179     while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
02180     {
02181         if (strcmp(temp_de->d_name, ".") == 0 ||
02182             strcmp(temp_de->d_name, "..") == 0)
02183             continue;
02184 
02185         snprintf(rm_path, sizeof(rm_path), "%s/%s",
02186                  tmpdirname, temp_de->d_name);
02187 
02188         if (strncmp(temp_de->d_name,
02189                     PG_TEMP_FILE_PREFIX,
02190                     strlen(PG_TEMP_FILE_PREFIX)) == 0)
02191             unlink(rm_path);    /* note we ignore any error */
02192         else
02193             elog(LOG,
02194                  "unexpected file found in temporary-files directory: \"%s\"",
02195                  rm_path);
02196     }
02197 
02198     FreeDir(temp_dir);
02199 }
02200 
02201 /* Process one tablespace directory, look for per-DB subdirectories */
02202 static void
02203 RemovePgTempRelationFiles(const char *tsdirname)
02204 {
02205     DIR        *ts_dir;
02206     struct dirent *de;
02207     char        dbspace_path[MAXPGPATH];
02208 
02209     ts_dir = AllocateDir(tsdirname);
02210     if (ts_dir == NULL)
02211     {
02212         /* anything except ENOENT is fishy */
02213         if (errno != ENOENT)
02214             elog(LOG,
02215                  "could not open tablespace directory \"%s\": %m",
02216                  tsdirname);
02217         return;
02218     }
02219 
02220     while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
02221     {
02222         int         i = 0;
02223 
02224         /*
02225          * We're only interested in the per-database directories, which have
02226          * numeric names.  Note that this code will also (properly) ignore "."
02227          * and "..".
02228          */
02229         while (isdigit((unsigned char) de->d_name[i]))
02230             ++i;
02231         if (de->d_name[i] != '\0' || i == 0)
02232             continue;
02233 
02234         snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
02235                  tsdirname, de->d_name);
02236         RemovePgTempRelationFilesInDbspace(dbspace_path);
02237     }
02238 
02239     FreeDir(ts_dir);
02240 }
02241 
02242 /* Process one per-dbspace directory for RemovePgTempRelationFiles */
02243 static void
02244 RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
02245 {
02246     DIR        *dbspace_dir;
02247     struct dirent *de;
02248     char        rm_path[MAXPGPATH];
02249 
02250     dbspace_dir = AllocateDir(dbspacedirname);
02251     if (dbspace_dir == NULL)
02252     {
02253         /* we just saw this directory, so it really ought to be there */
02254         elog(LOG,
02255              "could not open dbspace directory \"%s\": %m",
02256              dbspacedirname);
02257         return;
02258     }
02259 
02260     while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
02261     {
02262         if (!looks_like_temp_rel_name(de->d_name))
02263             continue;
02264 
02265         snprintf(rm_path, sizeof(rm_path), "%s/%s",
02266                  dbspacedirname, de->d_name);
02267 
02268         unlink(rm_path);        /* note we ignore any error */
02269     }
02270 
02271     FreeDir(dbspace_dir);
02272 }
02273 
02274 /* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
02275 static bool
02276 looks_like_temp_rel_name(const char *name)
02277 {
02278     int         pos;
02279     int         savepos;
02280 
02281     /* Must start with "t". */
02282     if (name[0] != 't')
02283         return false;
02284 
02285     /* Followed by a non-empty string of digits and then an underscore. */
02286     for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
02287         ;
02288     if (pos == 1 || name[pos] != '_')
02289         return false;
02290 
02291     /* Followed by another nonempty string of digits. */
02292     for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
02293         ;
02294     if (savepos == pos)
02295         return false;
02296 
02297     /* We might have _forkname or .segment or both. */
02298     if (name[pos] == '_')
02299     {
02300         int         forkchar = forkname_chars(&name[pos + 1], NULL);
02301 
02302         if (forkchar <= 0)
02303             return false;
02304         pos += forkchar + 1;
02305     }
02306     if (name[pos] == '.')
02307     {
02308         int         segchar;
02309 
02310         for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
02311             ;
02312         if (segchar <= 1)
02313             return false;
02314         pos += segchar;
02315     }
02316 
02317     /* Now we should be at the end. */
02318     if (name[pos] != '\0')
02319         return false;
02320     return true;
02321 }