Header And Logo

PostgreSQL
| The world's most advanced open source database.

autovacuum.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * autovacuum.c
00004  *
00005  * PostgreSQL Integrated Autovacuum Daemon
00006  *
00007  * The autovacuum system is structured in two different kinds of processes: the
00008  * autovacuum launcher and the autovacuum worker.  The launcher is an
00009  * always-running process, started by the postmaster when the autovacuum GUC
00010  * parameter is set.  The launcher schedules autovacuum workers to be started
00011  * when appropriate.  The workers are the processes which execute the actual
00012  * vacuuming; they connect to a database as determined in the launcher, and
00013  * once connected they examine the catalogs to select the tables to vacuum.
00014  *
00015  * The autovacuum launcher cannot start the worker processes by itself,
00016  * because doing so would cause robustness issues (namely, failure to shut
00017  * them down on exceptional conditions, and also, since the launcher is
00018  * connected to shared memory and is thus subject to corruption there, it is
00019  * not as robust as the postmaster).  So it leaves that task to the postmaster.
00020  *
00021  * There is an autovacuum shared memory area, where the launcher stores
00022  * information about the database it wants vacuumed.  When it wants a new
00023  * worker to start, it sets a flag in shared memory and sends a signal to the
00024  * postmaster.  The postmaster then knows only that it must start a worker;
00025  * so it forks a new child, which turns into a worker.  This new process
00026  * connects to shared memory, and there it can inspect the information that the
00027  * launcher has set up.
00028  *
00029  * If the fork() call fails in the postmaster, it sets a flag in the shared
00030  * memory area, and sends a signal to the launcher.  The launcher, upon
00031  * noticing the flag, can try starting the worker again by resending the
00032  * signal.  Note that the failure can only be transient (fork failure due to
00033  * high load, memory pressure, too many processes, etc); more permanent
00034  * problems, like failure to connect to a database, are detected later in the
00035  * worker and dealt with just by having the worker exit normally.  The launcher
00036  * will launch a new worker again later, per schedule.
00037  *
00038  * When the worker is done vacuuming it sends SIGUSR2 to the launcher.  The
00039  * launcher then wakes up and is able to launch another worker, if the schedule
00040  * is so tight that a new worker is needed immediately.  At this time the
00041  * launcher can also balance the settings for the various remaining workers'
00042  * cost-based vacuum delay feature.
00043  *
00044  * Note that there can be more than one worker in a database concurrently.
00045  * They will store the table they are currently vacuuming in shared memory, so
00046  * that other workers avoid being blocked waiting for the vacuum lock for that
00047  * table.  They will also reload the pgstats data just before vacuuming each
00048  * table, to avoid vacuuming a table that was just finished being vacuumed by
00049  * another worker and thus is no longer noted in shared memory.  However,
00050  * there is a window (caused by pgstat delay) during which a worker may choose a
00051  * table that was already vacuumed; this is a bug in the current design.
00052  *
00053  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00054  * Portions Copyright (c) 1994, Regents of the University of California
00055  *
00056  *
00057  * IDENTIFICATION
00058  *    src/backend/postmaster/autovacuum.c
00059  *
00060  *-------------------------------------------------------------------------
00061  */
00062 #include "postgres.h"
00063 
00064 #include <signal.h>
00065 #include <sys/types.h>
00066 #include <sys/time.h>
00067 #include <time.h>
00068 #include <unistd.h>
00069 
00070 #include "access/heapam.h"
00071 #include "access/htup_details.h"
00072 #include "access/multixact.h"
00073 #include "access/reloptions.h"
00074 #include "access/transam.h"
00075 #include "access/xact.h"
00076 #include "catalog/dependency.h"
00077 #include "catalog/namespace.h"
00078 #include "catalog/pg_database.h"
00079 #include "commands/dbcommands.h"
00080 #include "commands/vacuum.h"
00081 #include "lib/ilist.h"
00082 #include "libpq/pqsignal.h"
00083 #include "miscadmin.h"
00084 #include "pgstat.h"
00085 #include "postmaster/autovacuum.h"
00086 #include "postmaster/fork_process.h"
00087 #include "postmaster/postmaster.h"
00088 #include "storage/bufmgr.h"
00089 #include "storage/ipc.h"
00090 #include "storage/latch.h"
00091 #include "storage/pmsignal.h"
00092 #include "storage/proc.h"
00093 #include "storage/procsignal.h"
00094 #include "storage/sinvaladt.h"
00095 #include "tcop/tcopprot.h"
00096 #include "utils/fmgroids.h"
00097 #include "utils/lsyscache.h"
00098 #include "utils/memutils.h"
00099 #include "utils/ps_status.h"
00100 #include "utils/rel.h"
00101 #include "utils/snapmgr.h"
00102 #include "utils/syscache.h"
00103 #include "utils/timeout.h"
00104 #include "utils/timestamp.h"
00105 #include "utils/tqual.h"
00106 
00107 
00108 /*
00109  * GUC parameters.  Only autovacuum_start_daemon and Log_autovacuum_min_duration are given explicit initializers here.
00110  */
00111 bool        autovacuum_start_daemon = false;
00112 int         autovacuum_max_workers; /* sizes the shared WorkerInfoData array (see that struct's comment) */
00113 int         autovacuum_naptime; /* used as a number of seconds by the launcher's sleep logic */
00114 int         autovacuum_vac_thresh;
00115 double      autovacuum_vac_scale;
00116 int         autovacuum_anl_thresh;
00117 double      autovacuum_anl_scale;
00118 int         autovacuum_freeze_max_age;
00119 
00120 int         autovacuum_vac_cost_delay;
00121 int         autovacuum_vac_cost_limit;
00122 
00123 int         Log_autovacuum_min_duration = -1; /* NOTE(review): presumably -1 means "disabled" -- confirm against the GUC definition */
00124 
00125 /* how long to keep pgstat data in the launcher, in milliseconds */
00126 #define STATS_READ_DELAY 1000
00127 
00128 /* the minimum allowed time between two awakenings of the launcher */
00129 #define MIN_AUTOVAC_SLEEPTIME 100.0     /* milliseconds */
00130 
00131 /* Flags to tell if we are in an autovacuum process; am_autovacuum_launcher is set by AutovacuumLauncherIAm() and by AutoVacLauncherMain() */
00132 static bool am_autovacuum_launcher = false;
00133 static bool am_autovacuum_worker = false;
00134 
00135 /* Flags set by signal handlers; the launcher's main loop tests them after each wakeup and clears got_SIGHUP and got_SIGUSR2 once handled */
00136 static volatile sig_atomic_t got_SIGHUP = false;
00137 static volatile sig_atomic_t got_SIGUSR2 = false;
00138 static volatile sig_atomic_t got_SIGTERM = false;
00139 
00140 /* Comparison points for determining whether freeze_max_age is exceeded */
00141 static TransactionId recentXid;
00142 static MultiXactId recentMulti;
00143 
00144 /* Default freeze ages to use for autovacuum (varies by database) */
00145 static int  default_freeze_min_age;
00146 static int  default_freeze_table_age;
00147 
00148 /* Memory context for long-lived data; in the launcher it is created under TopMemoryContext and reset during error recovery */
00149 static MemoryContext AutovacMemCxt;
00150 
00151 /* struct to keep track of databases in launcher; entries are linked into DatabaseList through adl_node */
00152 typedef struct avl_dbase
00153 {
00154     Oid         adl_datid;      /* hash key -- must be first */
00155     TimestampTz adl_next_worker;    /* time the launcher compares with "now" to decide whether a worker is due */
00156     int         adl_score;      /* NOTE(review): meaning not visible in this part of the file -- confirm */
00157     dlist_node  adl_node;       /* list link used with DatabaseList */
00158 } avl_dbase;
00159 
00160 /* struct to keep track of databases in worker (not used by the launcher code visible in this part of the file) */
00161 typedef struct avw_dbase
00162 {
00163     Oid         adw_datid;      /* database OID */
00164     char       *adw_name;       /* NOTE(review): presumably the database name -- confirm */
00165     TransactionId adw_frozenxid;    /* NOTE(review): presumably the database's frozen-XID horizon -- confirm */
00166     MultiXactId adw_frozenmulti;    /* NOTE(review): presumably the multixact counterpart of adw_frozenxid -- confirm */
00167     PgStat_StatDBEntry *adw_entry;  /* pointer to a pgstat per-database entry */
00168 } avw_dbase;
00169 
00170 /* struct to keep track of tables to vacuum and/or analyze, in 1st pass */
00171 typedef struct av_relation
00172 {
00173     Oid         ar_toastrelid;  /* hash key - must be first */
00174     Oid         ar_relid;
00175     bool        ar_hasrelopts;  /* NOTE(review): presumably tells whether ar_reloptions holds valid data -- confirm */
00176     AutoVacOpts ar_reloptions;  /* copy of AutoVacOpts from the main table's
00177                                  * reloptions; stored by value, so it cannot itself be NULL */
00178 } av_relation;
00179 
00180 /* struct to keep track of tables to vacuum and/or analyze, after rechecking */
00181 typedef struct autovac_table
00182 {
00183     Oid         at_relid;       /* OID of the table */
00184     bool        at_dovacuum;    /* NOTE(review): presumably "vacuum this table" -- confirm */
00185     bool        at_doanalyze;   /* NOTE(review): presumably "analyze this table" -- confirm */
00186     int         at_freeze_min_age;
00187     int         at_freeze_table_age;
00188     int         at_vacuum_cost_delay;
00189     int         at_vacuum_cost_limit;
00190     bool        at_wraparound;  /* NOTE(review): presumably set for anti-wraparound vacuums -- confirm */
00191     char       *at_relname;     /* NOTE(review): the three names below look like relation/schema/database names -- confirm */
00192     char       *at_nspname;
00193     char       *at_datname;
00194 } autovac_table;
00195 
00196 /*-------------
00197  * This struct holds information about a single worker's whereabouts.  We keep
00198  * an array of these in shared memory, sized according to
00199  * autovacuum_max_workers.
00200  *
00201  * wi_links     entry into free list or running list
00202  * wi_dboid     OID of the database this worker is supposed to work on
00203  * wi_tableoid  OID of the table currently being vacuumed, if any
00204  * wi_proc      pointer to PGPROC of the running worker, NULL if not started
00205  * wi_launchtime Time at which this worker was launched (the launcher compares it with the current time to give up on a worker that takes too long to start)
00206  * wi_cost_*    Vacuum cost-based delay parameters current in this worker
00207  *
00208  * All fields are protected by AutovacuumLock, except for wi_tableoid which is
00209  * protected by AutovacuumScheduleLock (which is read-only for everyone except
00210  * that worker itself).
00211  *-------------
00212  */
00213 typedef struct WorkerInfoData
00214 {
00215     dlist_node  wi_links;
00216     Oid         wi_dboid;
00217     Oid         wi_tableoid;
00218     PGPROC     *wi_proc;
00219     TimestampTz wi_launchtime;
00220     int         wi_cost_delay;
00221     int         wi_cost_limit;
00222     int         wi_cost_limit_base;
00223 }   WorkerInfoData;
00224 
00225 typedef struct WorkerInfoData *WorkerInfo; /* WorkerInfo is a pointer to one WorkerInfoData entry */
00226 
00227 /*
00228  * Possible signals received by the launcher from remote processes.  These are
00229  * stored atomically in shared memory so that other processes can set them
00230  * without locking.  Each value is an index into the av_signal array of the shared struct.
00231  */
00232 typedef enum
00233 {
00234     AutoVacForkFailed,          /* failed trying to start a worker */
00235     AutoVacRebalance,           /* rebalance the cost limits */
00236     AutoVacNumSignals           /* must be last; also used as the length of av_signal */
00237 }   AutoVacuumSignal;
00238 
00239 /*-------------
00240  * The main autovacuum shmem struct.  On shared memory we store this main
00241  * struct and the array of WorkerInfo structs.  This struct keeps:
00242  *
00243  * av_signal        set by other processes to indicate various conditions (one slot per AutoVacuumSignal value)
00244  * av_launcherpid   the PID of the autovacuum launcher (stored by the launcher at startup, reset to 0 on its normal exit)
00245  * av_freeWorkers   the WorkerInfo freelist
00246  * av_runningWorkers the WorkerInfo non-free queue
00247  * av_startingWorker pointer to WorkerInfo currently being started (cleared by
00248  *                  the worker itself as soon as it's up and running)
00249  *
00250  * This struct is protected by AutovacuumLock, except for av_signal and parts
00251  * of the worker list (see above).
00252  *-------------
00253  */
00254 typedef struct
00255 {
00256     sig_atomic_t av_signal[AutoVacNumSignals];
00257     pid_t       av_launcherpid;
00258     dlist_head  av_freeWorkers;
00259     dlist_head  av_runningWorkers;
00260     WorkerInfo  av_startingWorker;
00261 } AutoVacuumShmemStruct;
00262 
00263 static AutoVacuumShmemStruct *AutoVacuumShmem; /* this process's pointer to the shared struct; it is not assigned in the part of the file shown here */
00264 
00265 /*
00266  * the database list (of avl_dbase elements) in the launcher, and the context
00267  * that contains it.  Both are reset by the launcher's error recovery so they never point into freed memory.
00268  */
00269 static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList);
00270 static MemoryContext DatabaseListCxt = NULL;
00271 
00272 /* Pointer to my own WorkerInfo, valid on each worker */
00273 static WorkerInfo MyWorkerInfo = NULL;
00274 
00275 /* PID of launcher, valid only in worker while shutting down (not static, so visible to other files) */
00276 int         AutovacuumLauncherPid = 0;
00277 
00278 #ifdef EXEC_BACKEND /* fork+exec start helpers, only declared when EXEC_BACKEND is defined */
00279 static pid_t avlauncher_forkexec(void);
00280 static pid_t avworker_forkexec(void);
00281 #endif
00282 NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) __attribute__((noreturn));
00283 NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) __attribute__((noreturn));
00284 /* helpers called by the launcher code in this file: do_start_worker, launcher_determine_sleep, launch_worker, rebuild_database_list, autovac_balance_cost */
00285 static Oid  do_start_worker(void);
00286 static void launcher_determine_sleep(bool canlaunch, bool recursing,
00287                          struct timeval * nap);
00288 static void launch_worker(TimestampTz now);
00289 static List *get_database_list(void);
00290 static void rebuild_database_list(Oid newdb);
00291 static int  db_comparator(const void *a, const void *b);
00292 static void autovac_balance_cost(void);
00293 
00294 static void do_autovacuum(void);
00295 static void FreeWorkerInfo(int code, Datum arg);
00296 
00297 static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map,
00298                       TupleDesc pg_class_desc);
00299 static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
00300                           Form_pg_class classForm,
00301                           PgStat_StatTabEntry *tabentry,
00302                           bool *dovacuum, bool *doanalyze, bool *wraparound);
00303 
00304 static void autovacuum_do_vac_analyze(autovac_table *tab,
00305                           BufferAccessStrategy bstrategy);
00306 static AutoVacOpts *extract_autovac_opts(HeapTuple tup,
00307                      TupleDesc pg_class_desc);
00308 static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
00309                           PgStat_StatDBEntry *shared,
00310                           PgStat_StatDBEntry *dbentry);
00311 static void autovac_report_activity(autovac_table *tab);
00312 static void avl_sighup_handler(SIGNAL_ARGS); /* the three avl_*_handler functions are installed with pqsignal() in AutoVacLauncherMain */
00313 static void avl_sigusr2_handler(SIGNAL_ARGS);
00314 static void avl_sigterm_handler(SIGNAL_ARGS);
00315 static void autovac_refresh_stats(void);
00316 
00317 
00318 
00319 /********************************************************************
00320  *                    AUTOVACUUM LAUNCHER CODE
00321  ********************************************************************/
00322 
00323 #ifdef EXEC_BACKEND
00324 /*
00325  * forkexec routine for the autovacuum launcher process.
00326  *
00327  * Format up the arglist, then fork and exec.  Returns whatever postmaster_forkexec() returns.
00328  */
00329 static pid_t
00330 avlauncher_forkexec(void)
00331 {
00332     char       *av[10];
00333     int         ac = 0;
00334 
00335     av[ac++] = "postgres";
00336     av[ac++] = "--forkavlauncher";
00337     av[ac++] = NULL;            /* filled in by postmaster_forkexec */
00338     av[ac] = NULL;              /* terminator; not counted in ac */
00339 
00340     Assert(ac < lengthof(av)); /* sanity check that the fixed-size array was not overrun */
00341 
00342     return postmaster_forkexec(ac, av);
00343 }
00344 
00345 /*
00346  * Mark this process as the autovacuum launcher by setting am_autovacuum_launcher.  We need this set from the outside, before InitProcess is called
00347  */
00348 void
00349 AutovacuumLauncherIAm(void)
00350 {
00351     am_autovacuum_launcher = true;
00352 }
00353 #endif
00354 
00355 /*
00356  * Main entry point for autovacuum launcher process, to be called from the
00357  * postmaster.  Returns the child's PID (cast to int) in the parent, or 0 after logging a message if the fork failed.
00358  */
00359 int
00360 StartAutoVacLauncher(void)
00361 {
00362     pid_t       AutoVacPID;
00363 
00364 #ifdef EXEC_BACKEND
00365     switch ((AutoVacPID = avlauncher_forkexec()))
00366 #else
00367     switch ((AutoVacPID = fork_process()))
00368 #endif
00369     {
00370         case -1:
00371             ereport(LOG,
00372                  (errmsg("could not fork autovacuum launcher process: %m")));
00373             return 0;
00374 
00375 #ifndef EXEC_BACKEND
00376         case 0:
00377             /* in postmaster child ... */
00378             /* Close the postmaster's sockets */
00379             ClosePostmasterPorts(false);
00380 
00381             /* Lose the postmaster's on-exit routines */
00382             on_exit_reset();
00383 
00384             AutoVacLauncherMain(0, NULL); /* declared noreturn, so the break below is not expected to run */
00385             break;
00386 #endif
00387         default:
00388             return (int) AutoVacPID; /* parent: hand the launcher's PID back to the caller */
00389     }
00390 
00391     /* shouldn't get here */
00392     return 0;
00393 }
00394 
00395 /*
00396  * Main loop for the autovacuum launcher process.  Performs process setup (signal handlers, error recovery via sigsetjmp, GUC overrides), then loops: sleep, react to signals, and launch a worker when one is due.
00397  * Does not return; every exit path goes through proc_exit().  argc and argv are not referenced in this body. */
00398 NON_EXEC_STATIC void
00399 AutoVacLauncherMain(int argc, char *argv[])
00400 {
00401     sigjmp_buf  local_sigjmp_buf;
00402 
00403     /* we are a postmaster subprocess now */
00404     IsUnderPostmaster = true;
00405     am_autovacuum_launcher = true;
00406 
00407     /* reset MyProcPid */
00408     MyProcPid = getpid();
00409 
00410     /* record Start Time for logging */
00411     MyStartTime = time(NULL);
00412 
00413     /* Identify myself via ps */
00414     init_ps_display("autovacuum launcher process", "", "", "");
00415 
00416     ereport(LOG,
00417             (errmsg("autovacuum launcher started")));
00418 
00419     if (PostAuthDelay) /* optional startup delay; the value is scaled from seconds to microseconds */
00420         pg_usleep(PostAuthDelay * 1000000L);
00421 
00422     SetProcessingMode(InitProcessing);
00423 
00424     /*
00425      * If possible, make this process a group leader, so that the postmaster
00426      * can signal any child processes too.  (autovacuum probably never has any
00427      * child processes, but for consistency we make all postmaster child
00428      * processes do this.)
00429      */
00430 #ifdef HAVE_SETSID
00431     if (setsid() < 0)
00432         elog(FATAL, "setsid() failed: %m");
00433 #endif
00434 
00435     /*
00436      * Set up signal handlers.  We operate on databases much like a regular
00437      * backend, so we use the same signal handling.  See equivalent code in
00438      * tcop/postgres.c.
00439      */
00440     pqsignal(SIGHUP, avl_sighup_handler);
00441     pqsignal(SIGINT, StatementCancelHandler);
00442     pqsignal(SIGTERM, avl_sigterm_handler);
00443 
00444     pqsignal(SIGQUIT, quickdie);
00445     InitializeTimeouts();       /* establishes SIGALRM handler */
00446 
00447     pqsignal(SIGPIPE, SIG_IGN);
00448     pqsignal(SIGUSR1, procsignal_sigusr1_handler);
00449     pqsignal(SIGUSR2, avl_sigusr2_handler);
00450     pqsignal(SIGFPE, FloatExceptionHandler);
00451     pqsignal(SIGCHLD, SIG_DFL);
00452 
00453     /* Early initialization */
00454     BaseInit();
00455 
00456     /*
00457      * Create a per-backend PGPROC struct in shared memory, except in the
00458      * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
00459      * this before we can use LWLocks (and in the EXEC_BACKEND case we already
00460      * had to do some stuff with LWLocks).
00461      */
00462 #ifndef EXEC_BACKEND
00463     InitProcess();
00464 #endif
00465 
00466     InitPostgres(NULL, InvalidOid, NULL, NULL); /* no database name or OID is passed here */
00467 
00468     SetProcessingMode(NormalProcessing);
00469 
00470     /*
00471      * Create a memory context that we will do all our work in.  We do this so
00472      * that we can reset the context during error recovery and thereby avoid
00473      * possible memory leaks.
00474      */
00475     AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
00476                                           "Autovacuum Launcher",
00477                                           ALLOCSET_DEFAULT_MINSIZE,
00478                                           ALLOCSET_DEFAULT_INITSIZE,
00479                                           ALLOCSET_DEFAULT_MAXSIZE);
00480     MemoryContextSwitchTo(AutovacMemCxt);
00481 
00482     /*
00483      * If an exception is encountered, processing resumes here.
00484      *
00485      * This code is a stripped down version of PostgresMain error recovery.
00486      */
00487     if (sigsetjmp(local_sigjmp_buf, 1) != 0)
00488     {
00489         /* since not using PG_TRY, must reset error stack by hand */
00490         error_context_stack = NULL;
00491 
00492         /* Prevents interrupts while cleaning up */
00493         HOLD_INTERRUPTS();
00494 
00495         /* Forget any pending QueryCancel or timeout request */
00496         QueryCancelPending = false;
00497         disable_all_timeouts(false);
00498         QueryCancelPending = false;     /* again in case timeout occurred */
00499 
00500         /* Report the error to the server log */
00501         EmitErrorReport();
00502 
00503         /* Abort the current transaction in order to recover */
00504         AbortCurrentTransaction();
00505 
00506         /*
00507          * Now return to normal top-level context and clear ErrorContext for
00508          * next time.
00509          */
00510         MemoryContextSwitchTo(AutovacMemCxt);
00511         FlushErrorState();
00512 
00513         /* Flush any leaked data in the top-level context */
00514         MemoryContextResetAndDeleteChildren(AutovacMemCxt);
00515 
00516         /* don't leave dangling pointers to freed memory */
00517         DatabaseListCxt = NULL;
00518         dlist_init(&DatabaseList);
00519 
00520         /*
00521          * Make sure pgstat also considers our stat data as gone.  Note: we
00522          * mustn't use autovac_refresh_stats here.
00523          */
00524         pgstat_clear_snapshot();
00525 
00526         /* Now we can allow interrupts again */
00527         RESUME_INTERRUPTS();
00528         /* NOTE(review): got_SIGTERM is not examined on this recovery path; a pending shutdown is only honored once the main loop below reaches its got_SIGTERM test -- confirm that is acceptable */
00529         /*
00530          * Sleep at least 1 second after any error.  We don't want to be
00531          * filling the error logs as fast as we can.
00532          */
00533         pg_usleep(1000000L);
00534     }
00535 
00536     /* We can now handle ereport(ERROR) */
00537     PG_exception_stack = &local_sigjmp_buf;
00538 
00539     /* must unblock signals before calling rebuild_database_list */
00540     PG_SETMASK(&UnBlockSig);
00541 
00542     /*
00543      * Force zero_damaged_pages OFF in the autovac process, even if it is set
00544      * in postgresql.conf.  We don't really want such a dangerous option being
00545      * applied non-interactively.
00546      */
00547     SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
00548 
00549     /*
00550      * Force statement_timeout and lock_timeout to zero to avoid letting these
00551      * settings prevent regular maintenance from being executed.
00552      */
00553     SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
00554     SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
00555 
00556     /*
00557      * Force default_transaction_isolation to READ COMMITTED.  We don't want
00558      * to pay the overhead of serializable mode, nor add any risk of causing
00559      * deadlocks or delaying other transactions.
00560      */
00561     SetConfigOption("default_transaction_isolation", "read committed",
00562                     PGC_SUSET, PGC_S_OVERRIDE);
00563 
00564     /* in emergency mode (AutoVacuumingActive() is false), just start a worker and go away */
00565     if (!AutoVacuumingActive())
00566     {
00567         do_start_worker();
00568         proc_exit(0);           /* done */
00569     }
00570 
00571     AutoVacuumShmem->av_launcherpid = MyProcPid; /* advertise our PID in shared memory */
00572 
00573     /*
00574      * Create the initial database list.  The invariant we want this list to
00575      * keep is that it's ordered by decreasing next_time.  As soon as an entry
00576      * is updated to a higher time, it will be moved to the front (which is
00577      * correct because the only operation is to add autovacuum_naptime to the
00578      * entry, and time always increases).
00579      */
00580     rebuild_database_list(InvalidOid);
00581 
00582     for (;;)
00583     {
00584         struct timeval nap;
00585         TimestampTz current_time = 0;
00586         bool        can_launch;
00587         int         rc;
00588 
00589         /*
00590          * This loop is a bit different from the normal use of WaitLatch,
00591          * because we'd like to sleep before the first launch of a child
00592          * process.  So it's WaitLatch, then ResetLatch, then check for
00593          * wakening conditions.
00594          */
00595         /* NOTE(review): av_freeWorkers is inspected here without holding AutovacuumLock, unlike the check further down; presumably a stale answer only affects the sleep length -- confirm */
00596         launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
00597                                  false, &nap);
00598 
00599         /* Allow sinval catchup interrupts while sleeping */
00600         EnableCatchupInterrupt();
00601 
00602         /*
00603          * Wait until naptime expires or we get some type of signal (all the
00604          * signal handlers will wake us by calling SetLatch).  The timeout argument is nap converted to milliseconds.
00605          */
00606         rc = WaitLatch(&MyProc->procLatch,
00607                        WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
00608                        (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L));
00609 
00610         ResetLatch(&MyProc->procLatch);
00611 
00612         DisableCatchupInterrupt();
00613 
00614         /*
00615          * Emergency bailout if postmaster has died.  This is to avoid the
00616          * necessity for manual cleanup of all postmaster children.
00617          */
00618         if (rc & WL_POSTMASTER_DEATH)
00619             proc_exit(1);
00620 
00621         /* the normal shutdown case */
00622         if (got_SIGTERM)
00623             break;
00624 
00625         if (got_SIGHUP)
00626         {
00627             got_SIGHUP = false;
00628             ProcessConfigFile(PGC_SIGHUP);
00629 
00630             /* shutdown requested in config file? */
00631             if (!AutoVacuumingActive())
00632                 break;
00633 
00634             /* rebalance in case the default cost parameters changed */
00635             LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
00636             autovac_balance_cost();
00637             LWLockRelease(AutovacuumLock);
00638 
00639             /* rebuild the list in case the naptime changed */
00640             rebuild_database_list(InvalidOid);
00641         }
00642 
00643         /*
00644          * a worker finished, or postmaster signalled failure to start a
00645          * worker
00646          */
00647         if (got_SIGUSR2)
00648         {
00649             got_SIGUSR2 = false;
00650 
00651             /* rebalance cost limits, if needed */
00652             if (AutoVacuumShmem->av_signal[AutoVacRebalance])
00653             {
00654                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
00655                 AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
00656                 autovac_balance_cost();
00657                 LWLockRelease(AutovacuumLock);
00658             }
00659 
00660             if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
00661             {
00662                 /*
00663                  * If the postmaster failed to start a new worker, we sleep
00664                  * for a little while and resend the signal.  The new worker's
00665                  * state is still in memory, so this is sufficient.  After
00666                  * that, we restart the main loop.
00667                  *
00668                  * XXX should we put a limit to the number of times we retry?
00669                  * I don't think it makes much sense, because a future start
00670                  * of a worker will continue to fail in the same way.
00671                  */
00672                 AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
00673                 pg_usleep(1000000L);    /* 1s */
00674                 SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
00675                 continue;
00676             }
00677         }
00678 
00679         /*
00680          * There are some conditions that we need to check before trying to
00681          * start a worker.  First, we need to make sure that there is a
00682          * worker slot available (a free WorkerInfo).  Second, we need to make sure that no
00683          * other worker failed while starting up.
00684          */
00685 
00686         current_time = GetCurrentTimestamp();
00687         LWLockAcquire(AutovacuumLock, LW_SHARED);
00688 
00689         can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);
00690 
00691         if (AutoVacuumShmem->av_startingWorker != NULL)
00692         {
00693             int         waittime;
00694             WorkerInfo  worker = AutoVacuumShmem->av_startingWorker;
00695 
00696             /*
00697              * We can't launch another worker when another one is still
00698              * starting up (or failed while doing so), so just sleep for a bit
00699              * more; that worker will wake us up again as soon as it's ready.
00700              * We will only wait autovacuum_naptime seconds (up to a maximum
00701              * of 60 seconds) for this to happen however.  Note that failure
00702              * to connect to a particular database is not a problem here,
00703              * because the worker removes itself from the startingWorker
00704              * pointer before trying to connect.  Problems detected by the
00705              * postmaster (like fork() failure) are also reported and handled
00706              * differently.  The only problems that may cause this code to
00707              * fire are errors in the earlier sections of AutoVacWorkerMain,
00708              * before the worker removes the WorkerInfo from the
00709              * startingWorker pointer.
00710              */
00711             waittime = Min(autovacuum_naptime, 60) * 1000; /* seconds converted to milliseconds */
00712             if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
00713                                            waittime))
00714             {
00715                 LWLockRelease(AutovacuumLock); /* swap the shared lock for an exclusive one; this is two separate steps, hence the recheck below */
00716                 LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
00717 
00718                 /*
00719                  * No other process can put a worker in starting mode, so if
00720                  * startingWorker is still set (non-NULL) after exchanging our lock,
00721                  * we assume it's the same one we saw above (so we don't
00722                  * recheck the launch time).
00723                  */
00724                 if (AutoVacuumShmem->av_startingWorker != NULL)
00725                 {
00726                     worker = AutoVacuumShmem->av_startingWorker;
00727                     worker->wi_dboid = InvalidOid;
00728                     worker->wi_tableoid = InvalidOid;
00729                     worker->wi_proc = NULL;
00730                     worker->wi_launchtime = 0;
00731                     dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
00732                                     &worker->wi_links);
00733                     AutoVacuumShmem->av_startingWorker = NULL;
00734                     elog(WARNING, "worker took too long to start; canceled");
00735                 }
00736             }
00737             else
00738                 can_launch = false;
00739         }
00740         LWLockRelease(AutovacuumLock);  /* either shared or exclusive */
00741 
00742         /* if we can't do anything, just go back to sleep */
00743         if (!can_launch)
00744             continue;
00745 
00746         /* We're OK to start a new worker */
00747 
00748         if (dlist_is_empty(&DatabaseList))
00749         {
00750             /*
00751              * Special case when the list is empty: start a worker right away.
00752              * This covers the initial case, when no database is in pgstats
00753              * (thus the list is empty).  Note that the constraints in
00754              * launcher_determine_sleep keep us from starting workers too
00755              * quickly (at most once every autovacuum_naptime when the list is
00756              * empty).
00757              */
00758             launch_worker(current_time);
00759         }
00760         else
00761         {
00762             /*
00763              * because rebuild_database_list constructs a list with most
00764              * distant adl_next_worker first, we obtain our database from the
00765              * tail of the list.
00766              */
00767             avl_dbase  *avdb;
00768 
00769             avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
00770 
00771             /*
00772              * launch a worker if next_worker is right now or it is in the
00773              * past
00774              */
00775             if (TimestampDifferenceExceeds(avdb->adl_next_worker,
00776                                            current_time, 0))
00777                 launch_worker(current_time);
00778         }
00779     }
00780 
00781     /* Normal exit from the autovac launcher is here (reached by the two "break" statements in the loop above) */
00782     ereport(LOG,
00783             (errmsg("autovacuum launcher shutting down")));
00784     AutoVacuumShmem->av_launcherpid = 0;
00785 
00786     proc_exit(0);               /* done */
00787 }
00788 
00789 /*
00790  * Determine the time to sleep, based on the database list.
00791  *
00792  * The "canlaunch" parameter indicates whether we can start a worker right now,
00793  * for example due to the workers being all busy.  If this is false, we will
00794  * cause a long sleep, which will be interrupted when a worker exits.
00795  */
00796 static void
00797 launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval * nap)
00798 {
00799     /*
00800      * We sleep until the next scheduled vacuum.  We trust that when the
00801      * database list was built, care was taken so that no entries have times
00802      * in the past; if the first entry has too close a next_worker value, or a
00803      * time in the past, we will sleep a small nominal time.
00804      */
00805     if (!canlaunch)
00806     {
00807         nap->tv_sec = autovacuum_naptime;
00808         nap->tv_usec = 0;
00809     }
00810     else if (!dlist_is_empty(&DatabaseList))
00811     {
00812         TimestampTz current_time = GetCurrentTimestamp();
00813         TimestampTz next_wakeup;
00814         avl_dbase  *avdb;
00815         long        secs;
00816         int         usecs;
00817 
00818         avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
00819 
00820         next_wakeup = avdb->adl_next_worker;
00821         TimestampDifference(current_time, next_wakeup, &secs, &usecs);
00822 
00823         nap->tv_sec = secs;
00824         nap->tv_usec = usecs;
00825     }
00826     else
00827     {
00828         /* list is empty, sleep for whole autovacuum_naptime seconds  */
00829         nap->tv_sec = autovacuum_naptime;
00830         nap->tv_usec = 0;
00831     }
00832 
00833     /*
00834      * If the result is exactly zero, it means a database had an entry with
00835      * time in the past.  Rebuild the list so that the databases are evenly
00836      * distributed again, and recalculate the time to sleep.  This can happen
00837      * if there are more tables needing vacuum than workers, and they all take
00838      * longer to vacuum than autovacuum_naptime.
00839      *
00840      * We only recurse once.  rebuild_database_list should always return times
00841      * in the future, but it seems best not to trust too much on that.
00842      */
00843     if (nap->tv_sec == 0 && nap->tv_usec == 0 && !recursing)
00844     {
00845         rebuild_database_list(InvalidOid);
00846         launcher_determine_sleep(canlaunch, true, nap);
00847         return;
00848     }
00849 
00850     /* The smallest time we'll allow the launcher to sleep. */
00851     if (nap->tv_sec <= 0 && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * 1000)
00852     {
00853         nap->tv_sec = 0;
00854         nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * 1000;
00855     }
00856 }
00857 
00858 /*
00859  * Build an updated DatabaseList.  It must only contain databases that appear
00860  * in pgstats, and must be sorted by next_worker from highest to lowest,
00861  * distributed regularly across the next autovacuum_naptime interval.
00862  *
00863  * Receives the Oid of the database that made this list be generated (we call
00864  * this the "new" database, because when the database was already present on
00865  * the list, we expect that this function is not called at all).  The
00866  * preexisting list, if any, will be used to preserve the order of the
00867  * databases in the autovacuum_naptime period.  The new database is put at the
00868  * end of the interval.  The actual values are not saved, which should not be
00869  * much of a problem.
00870  */
00871 static void
00872 rebuild_database_list(Oid newdb)
00873 {
00874     List       *dblist;
00875     ListCell   *cell;
00876     MemoryContext newcxt;
00877     MemoryContext oldcxt;
00878     MemoryContext tmpcxt;
00879     HASHCTL     hctl;
00880     int         score;
00881     int         nelems;
00882     HTAB       *dbhash;
00883     dlist_iter  iter;
00884 
00885     /* use fresh stats */
00886     autovac_refresh_stats();
00887 
00888     newcxt = AllocSetContextCreate(AutovacMemCxt,
00889                                    "AV dblist",
00890                                    ALLOCSET_DEFAULT_MINSIZE,
00891                                    ALLOCSET_DEFAULT_INITSIZE,
00892                                    ALLOCSET_DEFAULT_MAXSIZE);
00893     tmpcxt = AllocSetContextCreate(newcxt,
00894                                    "tmp AV dblist",
00895                                    ALLOCSET_DEFAULT_MINSIZE,
00896                                    ALLOCSET_DEFAULT_INITSIZE,
00897                                    ALLOCSET_DEFAULT_MAXSIZE);
00898     oldcxt = MemoryContextSwitchTo(tmpcxt);
00899 
00900     /*
00901      * Implementing this is not as simple as it sounds, because we need to put
00902      * the new database at the end of the list; next the databases that were
00903      * already on the list, and finally (at the tail of the list) all the
00904      * other databases that are not on the existing list.
00905      *
00906      * To do this, we build an empty hash table of scored databases.  We will
00907      * start with the lowest score (zero) for the new database, then
00908      * increasing scores for the databases in the existing list, in order, and
00909      * lastly increasing scores for all databases gotten via
00910      * get_database_list() that are not already on the hash.
00911      *
00912      * Then we will put all the hash elements into an array, sort the array by
00913      * score, and finally put the array elements into the new doubly linked
00914      * list.
00915      */
00916     hctl.keysize = sizeof(Oid);
00917     hctl.entrysize = sizeof(avl_dbase);
00918     hctl.hash = oid_hash;
00919     hctl.hcxt = tmpcxt;
00920     dbhash = hash_create("db hash", 20, &hctl,  /* magic number here FIXME */
00921                          HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
00922 
00923     /* start by inserting the new database */
00924     score = 0;
00925     if (OidIsValid(newdb))
00926     {
00927         avl_dbase  *db;
00928         PgStat_StatDBEntry *entry;
00929 
00930         /* only consider this database if it has a pgstat entry */
00931         entry = pgstat_fetch_stat_dbentry(newdb);
00932         if (entry != NULL)
00933         {
00934             /* we assume it isn't found because the hash was just created */
00935             db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);
00936 
00937             /* hash_search already filled in the key */
00938             db->adl_score = score++;
00939             /* next_worker is filled in later */
00940         }
00941     }
00942 
00943     /* Now insert the databases from the existing list */
00944     dlist_foreach(iter, &DatabaseList)
00945     {
00946         avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
00947         avl_dbase  *db;
00948         bool        found;
00949         PgStat_StatDBEntry *entry;
00950 
00951         /*
00952          * skip databases with no stat entries -- in particular, this gets
00953          * rid of dropped databases
00954          */
00955         entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
00956         if (entry == NULL)
00957             continue;
00958 
00959         db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);
00960 
00961         if (!found)
00962         {
00963             /* hash_search already filled in the key */
00964             db->adl_score = score++;
00965             /* next_worker is filled in later */
00966         }
00967     }
00968 
00969     /* finally, insert all qualifying databases not previously inserted */
00970     dblist = get_database_list();
00971     foreach(cell, dblist)
00972     {
00973         avw_dbase  *avdb = lfirst(cell);
00974         avl_dbase  *db;
00975         bool        found;
00976         PgStat_StatDBEntry *entry;
00977 
00978         /* only consider databases with a pgstat entry */
00979         entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
00980         if (entry == NULL)
00981             continue;
00982 
00983         db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
00984         /* only update the score if the database was not already on the hash */
00985         if (!found)
00986         {
00987             /* hash_search already filled in the key */
00988             db->adl_score = score++;
00989             /* next_worker is filled in later */
00990         }
00991     }
00992     nelems = score;
00993 
00994     /* from here on, the allocated memory belongs to the new list */
00995     MemoryContextSwitchTo(newcxt);
00996     dlist_init(&DatabaseList);
00997 
00998     if (nelems > 0)
00999     {
01000         TimestampTz current_time;
01001         int         millis_increment;
01002         avl_dbase  *dbary;
01003         avl_dbase  *db;
01004         HASH_SEQ_STATUS seq;
01005         int         i;
01006 
01007         /* put all the hash elements into an array */
01008         dbary = palloc(nelems * sizeof(avl_dbase));
01009 
01010         i = 0;
01011         hash_seq_init(&seq, dbhash);
01012         while ((db = hash_seq_search(&seq)) != NULL)
01013             memcpy(&(dbary[i++]), db, sizeof(avl_dbase));
01014 
01015         /* sort the array */
01016         qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);
01017 
01018         /*
01019          * Determine the time interval between databases in the schedule. If
01020          * we see that the configured naptime would take us to sleep times
01021          * lower than our min sleep time (which launcher_determine_sleep is
01022          * coded not to allow), silently use a larger naptime (but don't touch
01023          * the GUC variable).
01024          */
01025         millis_increment = 1000.0 * autovacuum_naptime / nelems;
01026         if (millis_increment <= MIN_AUTOVAC_SLEEPTIME)
01027             millis_increment = MIN_AUTOVAC_SLEEPTIME * 1.1;
01028 
01029         current_time = GetCurrentTimestamp();
01030 
01031         /*
01032          * move the elements from the array into the dllist, setting the
01033          * next_worker while walking the array
01034          */
01035         for (i = 0; i < nelems; i++)
01036         {
01037             avl_dbase  *db = &(dbary[i]);
01038 
01039             current_time = TimestampTzPlusMilliseconds(current_time,
01040                                                        millis_increment);
01041             db->adl_next_worker = current_time;
01042 
01043             /* later elements should go closer to the head of the list */
01044             dlist_push_head(&DatabaseList, &db->adl_node);
01045         }
01046     }
01047 
01048     /* all done, clean up memory */
01049     if (DatabaseListCxt != NULL)
01050         MemoryContextDelete(DatabaseListCxt);
01051     MemoryContextDelete(tmpcxt);
01052     DatabaseListCxt = newcxt;
01053     MemoryContextSwitchTo(oldcxt);
01054 }
01055 
01056 /* qsort comparator for avl_dbase, using adl_score */
01057 static int
01058 db_comparator(const void *a, const void *b)
01059 {
01060     if (((const avl_dbase *) a)->adl_score == ((const avl_dbase *) b)->adl_score)
01061         return 0;
01062     else
01063         return (((const avl_dbase *) a)->adl_score < ((const avl_dbase *) b)->adl_score) ? 1 : -1;
01064 }
01065 
01066 /*
01067  * do_start_worker
01068  *
01069  * Bare-bones procedure for starting an autovacuum worker from the launcher.
01070  * It determines what database to work on, sets up shared memory stuff and
01071  * signals postmaster to start the worker.  It fails gracefully if invoked when
01072  * autovacuum_workers are already active.
01073  *
01074  * Return value is the OID of the database that the worker is going to process,
01075  * or InvalidOid if no worker was actually started.
01076  */
01077 static Oid
01078 do_start_worker(void)
01079 {
01080     List       *dblist;
01081     ListCell   *cell;
01082     TransactionId xidForceLimit;
01083     MultiXactId multiForceLimit;
01084     bool        for_xid_wrap;
01085     bool        for_multi_wrap;
01086     avw_dbase  *avdb;
01087     TimestampTz current_time;
01088     bool        skipit = false;
01089     Oid         retval = InvalidOid;
01090     MemoryContext tmpcxt,
01091                 oldcxt;
01092 
01093     /* return quickly when there are no free workers */
01094     LWLockAcquire(AutovacuumLock, LW_SHARED);
01095     if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers))
01096     {
01097         LWLockRelease(AutovacuumLock);
01098         return InvalidOid;
01099     }
01100     LWLockRelease(AutovacuumLock);
01101 
01102     /*
01103      * Create and switch to a temporary context to avoid leaking the memory
01104      * allocated for the database list.
01105      */
01106     tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
01107                                    "Start worker tmp cxt",
01108                                    ALLOCSET_DEFAULT_MINSIZE,
01109                                    ALLOCSET_DEFAULT_INITSIZE,
01110                                    ALLOCSET_DEFAULT_MAXSIZE);
01111     oldcxt = MemoryContextSwitchTo(tmpcxt);
01112 
01113     /* use fresh stats */
01114     autovac_refresh_stats();
01115 
01116     /* Get a list of databases */
01117     dblist = get_database_list();
01118 
01119     /*
01120      * Determine the oldest datfrozenxid/relfrozenxid that we will allow to
01121      * pass without forcing a vacuum.  (This limit can be tightened for
01122      * particular tables, but not loosened.)
01123      */
01124     recentXid = ReadNewTransactionId();
01125     xidForceLimit = recentXid - autovacuum_freeze_max_age;
01126     /* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
01127     /* this can cause the limit to go backwards by 3, but that's OK */
01128     if (xidForceLimit < FirstNormalTransactionId)
01129         xidForceLimit -= FirstNormalTransactionId;
01130 
01131     /* Also determine the oldest datminmxid we will consider. */
01132     recentMulti = ReadNextMultiXactId();
01133     multiForceLimit = recentMulti - autovacuum_freeze_max_age;
01134     if (multiForceLimit < FirstMultiXactId)
01135         multiForceLimit -= FirstMultiXactId;
01136 
01137     /*
01138      * Choose a database to connect to.  We pick the database that was least
01139      * recently auto-vacuumed, or one that needs vacuuming to prevent Xid
01140      * wraparound-related data loss.  If any db at risk of Xid wraparound is
01141      * found, we pick the one with oldest datfrozenxid, independently of
01142      * autovacuum times; similarly we pick the one with the oldest datminmxid
01143      * if any is in MultiXactId wraparound.  Note that those in Xid wraparound
01144      * danger are given more priority than those in multi wraparound danger.
01145      *
01146      * Note that a database with no stats entry is not considered, except for
01147      * Xid wraparound purposes.  The theory is that if no one has ever
01148      * connected to it since the stats were last initialized, it doesn't need
01149      * vacuuming.
01150      *
01151      * XXX This could be improved if we had more info about whether it needs
01152      * vacuuming before connecting to it.  Perhaps look through the pgstats
01153      * data for the database's tables?  One idea is to keep track of the
01154      * number of new and dead tuples per database in pgstats.  However it
01155      * isn't clear how to construct a metric that measures that and not cause
01156      * starvation for less busy databases.
01157      */
01158     avdb = NULL;
01159     for_xid_wrap = false;
01160     for_multi_wrap = false;
01161     current_time = GetCurrentTimestamp();
01162     foreach(cell, dblist)
01163     {
01164         avw_dbase  *tmp = lfirst(cell);
01165         dlist_iter iter;
01166 
01167         /* Check to see if this one is at risk of wraparound */
01168         if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
01169         {
01170             if (avdb == NULL ||
01171                 TransactionIdPrecedes(tmp->adw_frozenxid,
01172                                       avdb->adw_frozenxid))
01173                 avdb = tmp;
01174             for_xid_wrap = true;
01175             continue;
01176         }
01177         else if (for_xid_wrap)
01178             continue;           /* ignore not-at-risk DBs */
01179         else if (MultiXactIdPrecedes(tmp->adw_frozenmulti, multiForceLimit))
01180         {
01181             if (avdb == NULL ||
01182                 MultiXactIdPrecedes(tmp->adw_frozenmulti,
01183                                     avdb->adw_frozenmulti))
01184                 avdb = tmp;
01185             for_multi_wrap = true;
01186             continue;
01187         }
01188         else if (for_multi_wrap)
01189             continue;           /* ignore not-at-risk DBs */
01190 
01191         /* Find pgstat entry if any */
01192         tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
01193 
01194         /*
01195          * Skip a database with no pgstat entry; it means it hasn't seen any
01196          * activity.
01197          */
01198         if (!tmp->adw_entry)
01199             continue;
01200 
01201         /*
01202          * Also, skip a database that appears on the database list as having
01203          * been processed recently (less than autovacuum_naptime seconds ago).
01204          * We do this so that we don't select a database which we just
01205          * selected, but that pgstat hasn't gotten around to updating the last
01206          * autovacuum time yet.
01207          */
01208         skipit = false;
01209 
01210         dlist_reverse_foreach(iter, &DatabaseList)
01211         {
01212             avl_dbase  *dbp = dlist_container(avl_dbase, adl_node, iter.cur);
01213 
01214             if (dbp->adl_datid == tmp->adw_datid)
01215             {
01216                 /*
01217                  * Skip this database if its next_worker value falls between
01218                  * the current time and the current time plus naptime.
01219                  */
01220                 if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
01221                                                 current_time, 0) &&
01222                     !TimestampDifferenceExceeds(current_time,
01223                                                 dbp->adl_next_worker,
01224                                                 autovacuum_naptime * 1000))
01225                     skipit = true;
01226 
01227                 break;
01228             }
01229         }
01230         if (skipit)
01231             continue;
01232 
01233         /*
01234          * Remember the db with oldest autovac time.  (If we are here, both
01235          * tmp->entry and db->entry must be non-null.)
01236          */
01237         if (avdb == NULL ||
01238             tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
01239             avdb = tmp;
01240     }
01241 
01242     /* Found a database -- process it */
01243     if (avdb != NULL)
01244     {
01245         WorkerInfo  worker;
01246         dlist_node *wptr;
01247 
01248         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
01249 
01250         /*
01251          * Get a worker entry from the freelist.  We checked above, so there
01252          * really should be a free slot.
01253          */
01254         wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers);
01255 
01256         worker = dlist_container(WorkerInfoData, wi_links, wptr);
01257         worker->wi_dboid = avdb->adw_datid;
01258         worker->wi_proc = NULL;
01259         worker->wi_launchtime = GetCurrentTimestamp();
01260 
01261         AutoVacuumShmem->av_startingWorker = worker;
01262 
01263         LWLockRelease(AutovacuumLock);
01264 
01265         SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
01266 
01267         retval = avdb->adw_datid;
01268     }
01269     else if (skipit)
01270     {
01271         /*
01272          * If we skipped all databases on the list, rebuild it, because it
01273          * probably contains a dropped database.
01274          */
01275         rebuild_database_list(InvalidOid);
01276     }
01277 
01278     MemoryContextSwitchTo(oldcxt);
01279     MemoryContextDelete(tmpcxt);
01280 
01281     return retval;
01282 }
01283 
01284 /*
01285  * launch_worker
01286  *
01287  * Wrapper for starting a worker from the launcher.  Besides actually starting
01288  * it, update the database list to reflect the next time that another one will
01289  * need to be started on the selected database.  The actual database choice is
01290  * left to do_start_worker.
01291  *
01292  * This routine is also expected to insert an entry into the database list if
01293  * the selected database was previously absent from the list.
01294  */
01295 static void
01296 launch_worker(TimestampTz now)
01297 {
01298     Oid         dbid;
01299     dlist_iter  iter;
01300 
01301     dbid = do_start_worker();
01302     if (OidIsValid(dbid))
01303     {
01304         bool found = false;
01305 
01306         /*
01307          * Walk the database list and update the corresponding entry.  If the
01308          * database is not on the list, we'll recreate the list.
01309          */
01310         dlist_foreach(iter, &DatabaseList)
01311         {
01312             avl_dbase  *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
01313 
01314             if (avdb->adl_datid == dbid)
01315             {
01316                 found = true;
01317 
01318                 /*
01319                  * add autovacuum_naptime seconds to the current time, and use
01320                  * that as the new "next_worker" field for this database.
01321                  */
01322                 avdb->adl_next_worker =
01323                     TimestampTzPlusMilliseconds(now, autovacuum_naptime * 1000);
01324 
01325                 dlist_move_head(&DatabaseList, iter.cur);
01326                 break;
01327             }
01328         }
01329 
01330         /*
01331          * If the database was not present in the database list, we rebuild
01332          * the list.  It's possible that the database does not get into the
01333          * list anyway, for example if it's a database that doesn't have a
01334          * pgstat entry, but this is not a problem because we don't want to
01335          * schedule workers regularly into those in any case.
01336          */
01337         if (!found)
01338             rebuild_database_list(dbid);
01339     }
01340 }
01341 
01342 /*
01343  * Called from postmaster to signal a failure to fork a process to become
01344  * worker.  The postmaster should kill(SIGUSR2) the launcher shortly
01345  * after calling this function.
01346  */
01347 void
01348 AutoVacWorkerFailed(void)
01349 {
01350     AutoVacuumShmem->av_signal[AutoVacForkFailed] = true;
01351 }
01352 
01353 /* SIGHUP: set flag to re-read config file at next convenient time */
01354 static void
01355 avl_sighup_handler(SIGNAL_ARGS)
01356 {
01357     int         save_errno = errno;
01358 
01359     got_SIGHUP = true;
01360     if (MyProc)
01361         SetLatch(&MyProc->procLatch);
01362 
01363     errno = save_errno;
01364 }
01365 
01366 /* SIGUSR2: a worker is up and running, or just finished, or failed to fork */
01367 static void
01368 avl_sigusr2_handler(SIGNAL_ARGS)
01369 {
01370     int         save_errno = errno;
01371 
01372     got_SIGUSR2 = true;
01373     if (MyProc)
01374         SetLatch(&MyProc->procLatch);
01375 
01376     errno = save_errno;
01377 }
01378 
01379 /* SIGTERM: time to die */
01380 static void
01381 avl_sigterm_handler(SIGNAL_ARGS)
01382 {
01383     int         save_errno = errno;
01384 
01385     got_SIGTERM = true;
01386     if (MyProc)
01387         SetLatch(&MyProc->procLatch);
01388 
01389     errno = save_errno;
01390 }
01391 
01392 
01393 /********************************************************************
01394  *                    AUTOVACUUM WORKER CODE
01395  ********************************************************************/
01396 
01397 #ifdef EXEC_BACKEND
/*
 * forkexec routines for the autovacuum worker.
 *
 * Build the argument vector for the worker, then hand it to
 * postmaster_forkexec and return its result.
 */
static pid_t
avworker_forkexec(void)
{
    char       *argvec[10];
    int         nargs;

    argvec[0] = "postgres";
    argvec[1] = "--forkavworker";
    argvec[2] = NULL;           /* filled in by postmaster_forkexec */
    nargs = 3;

    /* terminate the vector; the terminator is not counted in nargs */
    argvec[nargs] = NULL;

    Assert(nargs < lengthof(argvec));

    return postmaster_forkexec(nargs, argvec);
}
01418 
01419 /*
01420  * We need this set from the outside, before InitProcess is called
01421  */
01422 void
01423 AutovacuumWorkerIAm(void)
01424 {
01425     am_autovacuum_worker = true;
01426 }
01427 #endif
01428 
01429 /*
01430  * Main entry point for autovacuum worker process.
01431  *
01432  * This code is heavily based on pgarch.c, q.v.
01433  */
01434 int
01435 StartAutoVacWorker(void)
01436 {
01437     pid_t       worker_pid;
01438 
01439 #ifdef EXEC_BACKEND
01440     switch ((worker_pid = avworker_forkexec()))
01441 #else
01442     switch ((worker_pid = fork_process()))
01443 #endif
01444     {
01445         case -1:
01446             ereport(LOG,
01447                     (errmsg("could not fork autovacuum worker process: %m")));
01448             return 0;
01449 
01450 #ifndef EXEC_BACKEND
01451         case 0:
01452             /* in postmaster child ... */
01453             /* Close the postmaster's sockets */
01454             ClosePostmasterPorts(false);
01455 
01456             /* Lose the postmaster's on-exit routines */
01457             on_exit_reset();
01458 
01459             AutoVacWorkerMain(0, NULL);
01460             break;
01461 #endif
01462         default:
01463             return (int) worker_pid;
01464     }
01465 
01466     /* shouldn't get here */
01467     return 0;
01468 }
01469 
01470 /*
01471  * AutoVacWorkerMain
01472  */
01473 NON_EXEC_STATIC void
01474 AutoVacWorkerMain(int argc, char *argv[])
01475 {
01476     sigjmp_buf  local_sigjmp_buf;
01477     Oid         dbid;
01478 
01479     /* we are a postmaster subprocess now */
01480     IsUnderPostmaster = true;
01481     am_autovacuum_worker = true;
01482 
01483     /* reset MyProcPid */
01484     MyProcPid = getpid();
01485 
01486     /* record Start Time for logging */
01487     MyStartTime = time(NULL);
01488 
01489     /* Identify myself via ps */
01490     init_ps_display("autovacuum worker process", "", "", "");
01491 
01492     SetProcessingMode(InitProcessing);
01493 
01494     /*
01495      * If possible, make this process a group leader, so that the postmaster
01496      * can signal any child processes too.  (autovacuum probably never has any
01497      * child processes, but for consistency we make all postmaster child
01498      * processes do this.)
01499      */
01500 #ifdef HAVE_SETSID
01501     if (setsid() < 0)
01502         elog(FATAL, "setsid() failed: %m");
01503 #endif
01504 
01505     /*
01506      * Set up signal handlers.  We operate on databases much like a regular
01507      * backend, so we use the same signal handling.  See equivalent code in
01508      * tcop/postgres.c.
01509      *
01510      * Currently, we don't pay attention to postgresql.conf changes that
01511      * happen during a single daemon iteration, so we can ignore SIGHUP.
01512      */
01513     pqsignal(SIGHUP, SIG_IGN);
01514 
01515     /*
01516      * SIGINT is used to signal canceling the current table's vacuum; SIGTERM
01517      * means abort and exit cleanly, and SIGQUIT means abandon ship.
01518      */
01519     pqsignal(SIGINT, StatementCancelHandler);
01520     pqsignal(SIGTERM, die);
01521     pqsignal(SIGQUIT, quickdie);
01522     InitializeTimeouts();       /* establishes SIGALRM handler */
01523 
01524     pqsignal(SIGPIPE, SIG_IGN);
01525     pqsignal(SIGUSR1, procsignal_sigusr1_handler);
01526     pqsignal(SIGUSR2, SIG_IGN);
01527     pqsignal(SIGFPE, FloatExceptionHandler);
01528     pqsignal(SIGCHLD, SIG_DFL);
01529 
01530     /* Early initialization */
01531     BaseInit();
01532 
01533     /*
01534      * Create a per-backend PGPROC struct in shared memory, except in the
01535      * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
01536      * this before we can use LWLocks (and in the EXEC_BACKEND case we already
01537      * had to do some stuff with LWLocks).
01538      */
01539 #ifndef EXEC_BACKEND
01540     InitProcess();
01541 #endif
01542 
01543     /*
01544      * If an exception is encountered, processing resumes here.
01545      *
01546      * See notes in postgres.c about the design of this coding.
01547      */
01548     if (sigsetjmp(local_sigjmp_buf, 1) != 0)
01549     {
01550         /* Prevents interrupts while cleaning up */
01551         HOLD_INTERRUPTS();
01552 
01553         /* Report the error to the server log */
01554         EmitErrorReport();
01555 
01556         /*
01557          * We can now go away.  Note that because we called InitProcess, a
01558          * callback was registered to do ProcKill, which will clean up
01559          * necessary state.
01560          */
01561         proc_exit(0);
01562     }
01563 
01564     /* We can now handle ereport(ERROR) */
01565     PG_exception_stack = &local_sigjmp_buf;
01566 
01567     PG_SETMASK(&UnBlockSig);
01568 
01569     /*
01570      * Force zero_damaged_pages OFF in the autovac process, even if it is set
01571      * in postgresql.conf.  We don't really want such a dangerous option being
01572      * applied non-interactively.
01573      */
01574     SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
01575 
01576     /*
01577      * Force statement_timeout and lock_timeout to zero to avoid letting these
01578      * settings prevent regular maintenance from being executed.
01579      */
01580     SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
01581     SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
01582 
01583     /*
01584      * Force default_transaction_isolation to READ COMMITTED.  We don't want
01585      * to pay the overhead of serializable mode, nor add any risk of causing
01586      * deadlocks or delaying other transactions.
01587      */
01588     SetConfigOption("default_transaction_isolation", "read committed",
01589                     PGC_SUSET, PGC_S_OVERRIDE);
01590 
01591     /*
01592      * Force synchronous replication off to allow regular maintenance even if
01593      * we are waiting for standbys to connect. This is important to ensure we
01594      * aren't blocked from performing anti-wraparound tasks.
01595      */
01596     if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH)
01597         SetConfigOption("synchronous_commit", "local",
01598                         PGC_SUSET, PGC_S_OVERRIDE);
01599 
01600     /*
01601      * Get the info about the database we're going to work on.
01602      */
01603     LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
01604 
01605     /*
01606      * beware of startingWorker being INVALID; this should normally not
01607      * happen, but if a worker fails after forking and before this, the
01608      * launcher might have decided to remove it from the queue and start
01609      * again.
01610      */
01611     if (AutoVacuumShmem->av_startingWorker != NULL)
01612     {
01613         MyWorkerInfo = AutoVacuumShmem->av_startingWorker;
01614         dbid = MyWorkerInfo->wi_dboid;
01615         MyWorkerInfo->wi_proc = MyProc;
01616 
01617         /* insert into the running list */
01618         dlist_push_head(&AutoVacuumShmem->av_runningWorkers,
01619                         &MyWorkerInfo->wi_links);
01620 
01621         /*
01622          * remove from the "starting" pointer, so that the launcher can start
01623          * a new worker if required
01624          */
01625         AutoVacuumShmem->av_startingWorker = NULL;
01626         LWLockRelease(AutovacuumLock);
01627 
01628         on_shmem_exit(FreeWorkerInfo, 0);
01629 
01630         /* wake up the launcher */
01631         if (AutoVacuumShmem->av_launcherpid != 0)
01632             kill(AutoVacuumShmem->av_launcherpid, SIGUSR2);
01633     }
01634     else
01635     {
01636         /* no worker entry for me, go away */
01637         elog(WARNING, "autovacuum worker started without a worker entry");
01638         dbid = InvalidOid;
01639         LWLockRelease(AutovacuumLock);
01640     }
01641 
01642     if (OidIsValid(dbid))
01643     {
01644         char        dbname[NAMEDATALEN];
01645 
01646         /*
01647          * Report autovac startup to the stats collector.  We deliberately do
01648          * this before InitPostgres, so that the last_autovac_time will get
01649          * updated even if the connection attempt fails.  This is to prevent
01650          * autovac from getting "stuck" repeatedly selecting an unopenable
01651          * database, rather than making any progress on stuff it can connect
01652          * to.
01653          */
01654         pgstat_report_autovac(dbid);
01655 
01656         /*
01657          * Connect to the selected database
01658          *
01659          * Note: if we have selected a just-deleted database (due to using
01660          * stale stats info), we'll fail and exit here.
01661          */
01662         InitPostgres(NULL, dbid, NULL, dbname);
01663         SetProcessingMode(NormalProcessing);
01664         set_ps_display(dbname, false);
01665         ereport(DEBUG1,
01666                 (errmsg("autovacuum: processing database \"%s\"", dbname)));
01667 
01668         if (PostAuthDelay)
01669             pg_usleep(PostAuthDelay * 1000000L);
01670 
01671         /* And do an appropriate amount of work */
01672         recentXid = ReadNewTransactionId();
01673         recentMulti = ReadNextMultiXactId();
01674         do_autovacuum();
01675     }
01676 
01677     /*
01678      * The launcher will be notified of my death in ProcKill, *if* we managed
01679      * to get a worker slot at all
01680      */
01681 
01682     /* All done, go away */
01683     proc_exit(0);
01684 }
01685 
01686 /*
01687  * Return a WorkerInfo to the free list
01688  */
01689 static void
01690 FreeWorkerInfo(int code, Datum arg)
01691 {
01692     if (MyWorkerInfo != NULL)
01693     {
01694         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
01695 
01696         /*
01697          * Wake the launcher up so that he can launch a new worker immediately
01698          * if required.  We only save the launcher's PID in local memory here;
01699          * the actual signal will be sent when the PGPROC is recycled.  Note
01700          * that we always do this, so that the launcher can rebalance the cost
01701          * limit setting of the remaining workers.
01702          *
01703          * We somewhat ignore the risk that the launcher changes its PID
01704          * between us reading it and the actual kill; we expect ProcKill to be
01705          * called shortly after us, and we assume that PIDs are not reused too
01706          * quickly after a process exits.
01707          */
01708         AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid;
01709 
01710         dlist_delete(&MyWorkerInfo->wi_links);
01711         MyWorkerInfo->wi_dboid = InvalidOid;
01712         MyWorkerInfo->wi_tableoid = InvalidOid;
01713         MyWorkerInfo->wi_proc = NULL;
01714         MyWorkerInfo->wi_launchtime = 0;
01715         MyWorkerInfo->wi_cost_delay = 0;
01716         MyWorkerInfo->wi_cost_limit = 0;
01717         MyWorkerInfo->wi_cost_limit_base = 0;
01718         dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
01719                         &MyWorkerInfo->wi_links);
01720         /* not mine anymore */
01721         MyWorkerInfo = NULL;
01722 
01723         /*
01724          * now that we're inactive, cause a rebalancing of the surviving
01725          * workers
01726          */
01727         AutoVacuumShmem->av_signal[AutoVacRebalance] = true;
01728         LWLockRelease(AutovacuumLock);
01729     }
01730 }
01731 
01732 /*
01733  * Update the cost-based delay parameters, so that multiple workers consume
01734  * each a fraction of the total available I/O.
01735  */
01736 void
01737 AutoVacuumUpdateDelay(void)
01738 {
01739     if (MyWorkerInfo)
01740     {
01741         VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
01742         VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
01743     }
01744 }
01745 
01746 /*
01747  * autovac_balance_cost
01748  *      Recalculate the cost limit setting for each active worker.
01749  *
01750  * Caller must hold the AutovacuumLock in exclusive mode.
01751  */
01752 static void
01753 autovac_balance_cost(void)
01754 {
01755     /*
01756      * The idea here is that we ration out I/O equally.  The amount of I/O
01757      * that a worker can consume is determined by cost_limit/cost_delay, so we
01758      * try to equalize those ratios rather than the raw limit settings.
01759      *
01760      * note: in cost_limit, zero also means use value from elsewhere, because
01761      * zero is not a valid value.
01762      */
01763     int         vac_cost_limit = (autovacuum_vac_cost_limit > 0 ?
01764                                 autovacuum_vac_cost_limit : VacuumCostLimit);
01765     int         vac_cost_delay = (autovacuum_vac_cost_delay >= 0 ?
01766                                 autovacuum_vac_cost_delay : VacuumCostDelay);
01767     double      cost_total;
01768     double      cost_avail;
01769     dlist_iter  iter;
01770 
01771     /* not set? nothing to do */
01772     if (vac_cost_limit <= 0 || vac_cost_delay <= 0)
01773         return;
01774 
01775     /* caculate the total base cost limit of active workers */
01776     cost_total = 0.0;
01777     dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
01778     {
01779         WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
01780 
01781         if (worker->wi_proc != NULL &&
01782             worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
01783             cost_total +=
01784                 (double) worker->wi_cost_limit_base / worker->wi_cost_delay;
01785     }
01786     /* there are no cost limits -- nothing to do */
01787     if (cost_total <= 0)
01788         return;
01789 
01790     /*
01791      * Adjust cost limit of each active worker to balance the total of cost
01792      * limit to autovacuum_vacuum_cost_limit.
01793      */
01794     cost_avail = (double) vac_cost_limit / vac_cost_delay;
01795     dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
01796     {
01797         WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
01798 
01799         if (worker->wi_proc != NULL &&
01800             worker->wi_cost_limit_base > 0 && worker->wi_cost_delay > 0)
01801         {
01802             int         limit = (int)
01803             (cost_avail * worker->wi_cost_limit_base / cost_total);
01804 
01805             /*
01806              * We put a lower bound of 1 on the cost_limit, to avoid division-
01807              * by-zero in the vacuum code.  Also, in case of roundoff trouble
01808              * in these calculations, let's be sure we don't ever set
01809              * cost_limit to more than the base value.
01810              */
01811             worker->wi_cost_limit = Max(Min(limit,
01812                                             worker->wi_cost_limit_base),
01813                                         1);
01814 
01815             elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, cost_limit=%d, cost_limit_base=%d, cost_delay=%d)",
01816                  worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
01817                  worker->wi_cost_limit, worker->wi_cost_limit_base,
01818                  worker->wi_cost_delay);
01819         }
01820     }
01821 }
01822 
01823 /*
01824  * get_database_list
01825  *      Return a list of all databases found in pg_database.
01826  *
01827  * The list and associated data is allocated in the caller's memory context,
01828  * which is in charge of ensuring that it's properly cleaned up afterwards.
01829  *
01830  * Note: this is the only function in which the autovacuum launcher uses a
01831  * transaction.  Although we aren't attached to any particular database and
01832  * therefore can't access most catalogs, we do have enough infrastructure
01833  * to do a seqscan on pg_database.
01834  */
01835 static List *
01836 get_database_list(void)
01837 {
01838     List       *dblist = NIL;
01839     Relation    rel;
01840     HeapScanDesc scan;
01841     HeapTuple   tup;
01842     MemoryContext resultcxt;
01843 
01844     /* This is the context that we will allocate our output data in */
01845     resultcxt = CurrentMemoryContext;
01846 
01847     /*
01848      * Start a transaction so we can access pg_database, and get a snapshot.
01849      * We don't have a use for the snapshot itself, but we're interested in
01850      * the secondary effect that it sets RecentGlobalXmin.  (This is critical
01851      * for anything that reads heap pages, because HOT may decide to prune
01852      * them even if the process doesn't attempt to modify any tuples.)
01853      */
01854     StartTransactionCommand();
01855     (void) GetTransactionSnapshot();
01856 
01857     rel = heap_open(DatabaseRelationId, AccessShareLock);
01858     scan = heap_beginscan(rel, SnapshotNow, 0, NULL);
01859 
01860     while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
01861     {
01862         Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup);
01863         avw_dbase  *avdb;
01864         MemoryContext oldcxt;
01865 
01866         /*
01867          * Allocate our results in the caller's context, not the
01868          * transaction's. We do this inside the loop, and restore the original
01869          * context at the end, so that leaky things like heap_getnext() are
01870          * not called in a potentially long-lived context.
01871          */
01872         oldcxt = MemoryContextSwitchTo(resultcxt);
01873 
01874         avdb = (avw_dbase *) palloc(sizeof(avw_dbase));
01875 
01876         avdb->adw_datid = HeapTupleGetOid(tup);
01877         avdb->adw_name = pstrdup(NameStr(pgdatabase->datname));
01878         avdb->adw_frozenxid = pgdatabase->datfrozenxid;
01879         avdb->adw_frozenmulti = pgdatabase->datminmxid;
01880         /* this gets set later: */
01881         avdb->adw_entry = NULL;
01882 
01883         dblist = lappend(dblist, avdb);
01884         MemoryContextSwitchTo(oldcxt);
01885     }
01886 
01887     heap_endscan(scan);
01888     heap_close(rel, AccessShareLock);
01889 
01890     CommitTransactionCommand();
01891 
01892     return dblist;
01893 }
01894 
01895 /*
01896  * Process a database table-by-table
01897  *
01898  * Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
01899  * order not to ignore shutdown commands for too long.
01900  */
01901 static void
01902 do_autovacuum(void)
01903 {
01904     Relation    classRel;
01905     HeapTuple   tuple;
01906     HeapScanDesc relScan;
01907     Form_pg_database dbForm;
01908     List       *table_oids = NIL;
01909     HASHCTL     ctl;
01910     HTAB       *table_toast_map;
01911     ListCell   *volatile cell;
01912     PgStat_StatDBEntry *shared;
01913     PgStat_StatDBEntry *dbentry;
01914     BufferAccessStrategy bstrategy;
01915     ScanKeyData key;
01916     TupleDesc   pg_class_desc;
01917 
01918     /*
01919      * StartTransactionCommand and CommitTransactionCommand will automatically
01920      * switch to other contexts.  We need this one to keep the list of
01921      * relations to vacuum/analyze across transactions.
01922      */
01923     AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
01924                                           "AV worker",
01925                                           ALLOCSET_DEFAULT_MINSIZE,
01926                                           ALLOCSET_DEFAULT_INITSIZE,
01927                                           ALLOCSET_DEFAULT_MAXSIZE);
01928     MemoryContextSwitchTo(AutovacMemCxt);
01929 
01930     /*
01931      * may be NULL if we couldn't find an entry (only happens if we are
01932      * forcing a vacuum for anti-wrap purposes).
01933      */
01934     dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
01935 
01936     /* Start a transaction so our commands have one to play into. */
01937     StartTransactionCommand();
01938 
01939     /*
01940      * Clean up any dead statistics collector entries for this DB. We always
01941      * want to do this exactly once per DB-processing cycle, even if we find
01942      * nothing worth vacuuming in the database.
01943      */
01944     pgstat_vacuum_stat();
01945 
01946     /*
01947      * Find the pg_database entry and select the default freeze ages. We use
01948      * zero in template and nonconnectable databases, else the system-wide
01949      * default.
01950      */
01951     tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
01952     if (!HeapTupleIsValid(tuple))
01953         elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
01954     dbForm = (Form_pg_database) GETSTRUCT(tuple);
01955 
01956     if (dbForm->datistemplate || !dbForm->datallowconn)
01957     {
01958         default_freeze_min_age = 0;
01959         default_freeze_table_age = 0;
01960     }
01961     else
01962     {
01963         default_freeze_min_age = vacuum_freeze_min_age;
01964         default_freeze_table_age = vacuum_freeze_table_age;
01965     }
01966 
01967     ReleaseSysCache(tuple);
01968 
01969     /* StartTransactionCommand changed elsewhere */
01970     MemoryContextSwitchTo(AutovacMemCxt);
01971 
01972     /* The database hash where pgstat keeps shared relations */
01973     shared = pgstat_fetch_stat_dbentry(InvalidOid);
01974 
01975     classRel = heap_open(RelationRelationId, AccessShareLock);
01976 
01977     /* create a copy so we can use it after closing pg_class */
01978     pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel));
01979 
01980     /* create hash table for toast <-> main relid mapping */
01981     MemSet(&ctl, 0, sizeof(ctl));
01982     ctl.keysize = sizeof(Oid);
01983     ctl.entrysize = sizeof(av_relation);
01984     ctl.hash = oid_hash;
01985 
01986     table_toast_map = hash_create("TOAST to main relid map",
01987                                   100,
01988                                   &ctl,
01989                                   HASH_ELEM | HASH_FUNCTION);
01990 
01991     /*
01992      * Scan pg_class to determine which tables to vacuum.
01993      *
01994      * We do this in two passes: on the first one we collect the list of plain
01995      * relations and materialized views, and on the second one we collect
01996      * TOAST tables. The reason for doing the second pass is that during it we
01997      * want to use the main relation's pg_class.reloptions entry if the TOAST
01998      * table does not have any, and we cannot obtain it unless we know
01999      * beforehand what's the main  table OID.
02000      *
02001      * We need to check TOAST tables separately because in cases with short,
02002      * wide tables there might be proportionally much more activity in the
02003      * TOAST table than in its parent.
02004      */
02005     relScan = heap_beginscan(classRel, SnapshotNow, 0, NULL);
02006 
02007     /*
02008      * On the first pass, we collect main tables to vacuum, and also the main
02009      * table relid to TOAST relid mapping.
02010      */
02011     while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
02012     {
02013         Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
02014         PgStat_StatTabEntry *tabentry;
02015         AutoVacOpts *relopts;
02016         Oid         relid;
02017         bool        dovacuum;
02018         bool        doanalyze;
02019         bool        wraparound;
02020 
02021         if (classForm->relkind != RELKIND_RELATION &&
02022             classForm->relkind != RELKIND_MATVIEW)
02023             continue;
02024 
02025         relid = HeapTupleGetOid(tuple);
02026 
02027         /* Fetch reloptions and the pgstat entry for this table */
02028         relopts = extract_autovac_opts(tuple, pg_class_desc);
02029         tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
02030                                              shared, dbentry);
02031 
02032         /* Check if it needs vacuum or analyze */
02033         relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
02034                                   &dovacuum, &doanalyze, &wraparound);
02035 
02036         /*
02037          * Check if it is a temp table (presumably, of some other backend's).
02038          * We cannot safely process other backends' temp tables.
02039          */
02040         if (classForm->relpersistence == RELPERSISTENCE_TEMP)
02041         {
02042             int         backendID;
02043 
02044             backendID = GetTempNamespaceBackendId(classForm->relnamespace);
02045 
02046             /* We just ignore it if the owning backend is still active */
02047             if (backendID == MyBackendId || BackendIdGetProc(backendID) == NULL)
02048             {
02049                 /*
02050                  * We found an orphan temp table (which was probably left
02051                  * behind by a crashed backend).  If it's so old as to need
02052                  * vacuum for wraparound, forcibly drop it.  Otherwise just
02053                  * log a complaint.
02054                  */
02055                 if (wraparound)
02056                 {
02057                     ObjectAddress object;
02058 
02059                     ereport(LOG,
02060                             (errmsg("autovacuum: dropping orphan temp table \"%s\".\"%s\" in database \"%s\"",
02061                                  get_namespace_name(classForm->relnamespace),
02062                                     NameStr(classForm->relname),
02063                                     get_database_name(MyDatabaseId))));
02064                     object.classId = RelationRelationId;
02065                     object.objectId = relid;
02066                     object.objectSubId = 0;
02067                     performDeletion(&object, DROP_CASCADE, PERFORM_DELETION_INTERNAL);
02068                 }
02069                 else
02070                 {
02071                     ereport(LOG,
02072                             (errmsg("autovacuum: found orphan temp table \"%s\".\"%s\" in database \"%s\"",
02073                                  get_namespace_name(classForm->relnamespace),
02074                                     NameStr(classForm->relname),
02075                                     get_database_name(MyDatabaseId))));
02076                 }
02077             }
02078         }
02079         else
02080         {
02081             /* relations that need work are added to table_oids */
02082             if (dovacuum || doanalyze)
02083                 table_oids = lappend_oid(table_oids, relid);
02084 
02085             /*
02086              * Remember the association for the second pass.  Note: we must do
02087              * this even if the table is going to be vacuumed, because we
02088              * don't automatically vacuum toast tables along the parent table.
02089              */
02090             if (OidIsValid(classForm->reltoastrelid))
02091             {
02092                 av_relation *hentry;
02093                 bool        found;
02094 
02095                 hentry = hash_search(table_toast_map,
02096                                      &classForm->reltoastrelid,
02097                                      HASH_ENTER, &found);
02098 
02099                 if (!found)
02100                 {
02101                     /* hash_search already filled in the key */
02102                     hentry->ar_relid = relid;
02103                     hentry->ar_hasrelopts = false;
02104                     if (relopts != NULL)
02105                     {
02106                         hentry->ar_hasrelopts = true;
02107                         memcpy(&hentry->ar_reloptions, relopts,
02108                                sizeof(AutoVacOpts));
02109                     }
02110                 }
02111             }
02112         }
02113     }
02114 
02115     heap_endscan(relScan);
02116 
02117     /* second pass: check TOAST tables */
02118     ScanKeyInit(&key,
02119                 Anum_pg_class_relkind,
02120                 BTEqualStrategyNumber, F_CHAREQ,
02121                 CharGetDatum(RELKIND_TOASTVALUE));
02122 
02123     relScan = heap_beginscan(classRel, SnapshotNow, 1, &key);
02124     while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
02125     {
02126         Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
02127         PgStat_StatTabEntry *tabentry;
02128         Oid         relid;
02129         AutoVacOpts *relopts = NULL;
02130         bool        dovacuum;
02131         bool        doanalyze;
02132         bool        wraparound;
02133 
02134         /*
02135          * We cannot safely process other backends' temp tables, so skip 'em.
02136          */
02137         if (classForm->relpersistence == RELPERSISTENCE_TEMP)
02138             continue;
02139 
02140         relid = HeapTupleGetOid(tuple);
02141 
02142         /*
02143          * fetch reloptions -- if this toast table does not have them, try the
02144          * main rel
02145          */
02146         relopts = extract_autovac_opts(tuple, pg_class_desc);
02147         if (relopts == NULL)
02148         {
02149             av_relation *hentry;
02150             bool        found;
02151 
02152             hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
02153             if (found && hentry->ar_hasrelopts)
02154                 relopts = &hentry->ar_reloptions;
02155         }
02156 
02157         /* Fetch the pgstat entry for this table */
02158         tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
02159                                              shared, dbentry);
02160 
02161         relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
02162                                   &dovacuum, &doanalyze, &wraparound);
02163 
02164         /* ignore analyze for toast tables */
02165         if (dovacuum)
02166             table_oids = lappend_oid(table_oids, relid);
02167     }
02168 
02169     heap_endscan(relScan);
02170     heap_close(classRel, AccessShareLock);
02171 
02172     /*
02173      * Create a buffer access strategy object for VACUUM to use.  We want to
02174      * use the same one across all the vacuum operations we perform, since the
02175      * point is for VACUUM not to blow out the shared cache.
02176      */
02177     bstrategy = GetAccessStrategy(BAS_VACUUM);
02178 
02179     /*
02180      * create a memory context to act as fake PortalContext, so that the
02181      * contexts created in the vacuum code are cleaned up for each table.
02182      */
02183     PortalContext = AllocSetContextCreate(AutovacMemCxt,
02184                                           "Autovacuum Portal",
02185                                           ALLOCSET_DEFAULT_INITSIZE,
02186                                           ALLOCSET_DEFAULT_MINSIZE,
02187                                           ALLOCSET_DEFAULT_MAXSIZE);
02188 
02189     /*
02190      * Perform operations on collected tables.
02191      */
02192     foreach(cell, table_oids)
02193     {
02194         Oid         relid = lfirst_oid(cell);
02195         autovac_table *tab;
02196         bool        skipit;
02197         int         stdVacuumCostDelay;
02198         int         stdVacuumCostLimit;
02199         dlist_iter  iter;
02200 
02201         CHECK_FOR_INTERRUPTS();
02202 
02203         /*
02204          * hold schedule lock from here until we're sure that this table still
02205          * needs vacuuming.  We also need the AutovacuumLock to walk the
02206          * worker array, but we'll let go of that one quickly.
02207          */
02208         LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
02209         LWLockAcquire(AutovacuumLock, LW_SHARED);
02210 
02211         /*
02212          * Check whether the table is being vacuumed concurrently by another
02213          * worker.
02214          */
02215         skipit = false;
02216         dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
02217         {
02218             WorkerInfo  worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
02219 
02220             /* ignore myself */
02221             if (worker == MyWorkerInfo)
02222                 continue;
02223 
02224             /* ignore workers in other databases */
02225             if (worker->wi_dboid != MyDatabaseId)
02226                 continue;
02227 
02228             if (worker->wi_tableoid == relid)
02229             {
02230                 skipit = true;
02231                 break;
02232             }
02233         }
02234         LWLockRelease(AutovacuumLock);
02235         if (skipit)
02236         {
02237             LWLockRelease(AutovacuumScheduleLock);
02238             continue;
02239         }
02240 
02241         /*
02242          * Check whether pgstat data still says we need to vacuum this table.
02243          * It could have changed if something else processed the table while
02244          * we weren't looking.
02245          *
02246          * Note: we have a special case in pgstat code to ensure that the
02247          * stats we read are as up-to-date as possible, to avoid the problem
02248          * that somebody just finished vacuuming this table.  The window to
02249          * the race condition is not closed but it is very small.
02250          */
02251         MemoryContextSwitchTo(AutovacMemCxt);
02252         tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc);
02253         if (tab == NULL)
02254         {
02255             /* someone else vacuumed the table, or it went away */
02256             LWLockRelease(AutovacuumScheduleLock);
02257             continue;
02258         }
02259 
02260         /*
02261          * Ok, good to go.  Store the table in shared memory before releasing
02262          * the lock so that other workers don't vacuum it concurrently.
02263          */
02264         MyWorkerInfo->wi_tableoid = relid;
02265         LWLockRelease(AutovacuumScheduleLock);
02266 
02267         /*
02268          * Remember the prevailing values of the vacuum cost GUCs.  We have to
02269          * restore these at the bottom of the loop, else we'll compute wrong
02270          * values in the next iteration of autovac_balance_cost().
02271          */
02272         stdVacuumCostDelay = VacuumCostDelay;
02273         stdVacuumCostLimit = VacuumCostLimit;
02274 
02275         /* Must hold AutovacuumLock while mucking with cost balance info */
02276         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
02277 
02278         /* advertise my cost delay parameters for the balancing algorithm */
02279         MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
02280         MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
02281         MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
02282 
02283         /* do a balance */
02284         autovac_balance_cost();
02285 
02286         /* set the active cost parameters from the result of that */
02287         AutoVacuumUpdateDelay();
02288 
02289         /* done */
02290         LWLockRelease(AutovacuumLock);
02291 
02292         /* clean up memory before each iteration */
02293         MemoryContextResetAndDeleteChildren(PortalContext);
02294 
02295         /*
02296          * Save the relation name for a possible error message, to avoid a
02297          * catalog lookup in case of an error.  If any of these return NULL,
02298          * then the relation has been dropped since last we checked; skip it.
02299          * Note: they must live in a long-lived memory context because we call
02300          * vacuum and analyze in different transactions.
02301          */
02302 
02303         tab->at_relname = get_rel_name(tab->at_relid);
02304         tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid));
02305         tab->at_datname = get_database_name(MyDatabaseId);
02306         if (!tab->at_relname || !tab->at_nspname || !tab->at_datname)
02307             goto deleted;
02308 
02309         /*
02310          * We will abort vacuuming the current table if something errors out,
02311          * and continue with the next one in schedule; in particular, this
02312          * happens if we are interrupted with SIGINT.
02313          */
02314         PG_TRY();
02315         {
02316             /* have at it */
02317             MemoryContextSwitchTo(TopTransactionContext);
02318             autovacuum_do_vac_analyze(tab, bstrategy);
02319 
02320             /*
02321              * Clear a possible query-cancel signal, to avoid a late reaction
02322              * to an automatically-sent signal because of vacuuming the
02323              * current table (we're done with it, so it would make no sense to
02324              * cancel at this point.)
02325              */
02326             QueryCancelPending = false;
02327         }
02328         PG_CATCH();
02329         {
02330             /*
02331              * Abort the transaction, start a new one, and proceed with the
02332              * next table in our list.
02333              */
02334             HOLD_INTERRUPTS();
02335             if (tab->at_dovacuum)
02336                 errcontext("automatic vacuum of table \"%s.%s.%s\"",
02337                            tab->at_datname, tab->at_nspname, tab->at_relname);
02338             else
02339                 errcontext("automatic analyze of table \"%s.%s.%s\"",
02340                            tab->at_datname, tab->at_nspname, tab->at_relname);
02341             EmitErrorReport();
02342 
02343             /* this resets the PGXACT flags too */
02344             AbortOutOfAnyTransaction();
02345             FlushErrorState();
02346             MemoryContextResetAndDeleteChildren(PortalContext);
02347 
02348             /* restart our transaction for the following operations */
02349             StartTransactionCommand();
02350             RESUME_INTERRUPTS();
02351         }
02352         PG_END_TRY();
02353 
02354         /* the PGXACT flags are reset at the next end of transaction */
02355 
02356         /* be tidy */
02357 deleted:
02358         if (tab->at_datname != NULL)
02359             pfree(tab->at_datname);
02360         if (tab->at_nspname != NULL)
02361             pfree(tab->at_nspname);
02362         if (tab->at_relname != NULL)
02363             pfree(tab->at_relname);
02364         pfree(tab);
02365 
02366         /*
02367          * Remove my info from shared memory.  We could, but intentionally
02368          * don't, clear wi_cost_limit and friends --- this is on the
02369          * assumption that we probably have more to do with similar cost
02370          * settings, so we don't want to give up our share of I/O for a very
02371          * short interval and thereby thrash the global balance.
02372          */
02373         LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
02374         MyWorkerInfo->wi_tableoid = InvalidOid;
02375         LWLockRelease(AutovacuumLock);
02376 
02377         /* restore vacuum cost GUCs for the next iteration */
02378         VacuumCostDelay = stdVacuumCostDelay;
02379         VacuumCostLimit = stdVacuumCostLimit;
02380     }
02381 
02382     /*
02383      * We leak table_toast_map here (among other things), but since we're
02384      * going away soon, it's not a problem.
02385      */
02386 
02387     /*
02388      * Update pg_database.datfrozenxid, and truncate pg_clog if possible. We
02389      * only need to do this once, not after each table.
02390      */
02391     vac_update_datfrozenxid();
02392 
02393     /* Finally close out the last transaction. */
02394     CommitTransactionCommand();
02395 }
02396 
02397 /*
02398  * extract_autovac_opts
02399  *
02400  * Given a relation's pg_class tuple, return the AutoVacOpts portion of
02401  * reloptions, if set; otherwise, return NULL.
02402  */
02403 static AutoVacOpts *
02404 extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc)
02405 {
02406     bytea      *relopts;
02407     AutoVacOpts *av;
02408 
02409     Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION ||
02410            ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW ||
02411            ((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE);
02412 
02413     relopts = extractRelOptions(tup, pg_class_desc, InvalidOid);
02414     if (relopts == NULL)
02415         return NULL;
02416 
02417     av = palloc(sizeof(AutoVacOpts));
02418     memcpy(av, &(((StdRdOptions *) relopts)->autovacuum), sizeof(AutoVacOpts));
02419     pfree(relopts);
02420 
02421     return av;
02422 }
02423 
02424 /*
02425  * get_pgstat_tabentry_relid
02426  *
02427  * Fetch the pgstat entry of a table, either local to a database or shared.
02428  */
02429 static PgStat_StatTabEntry *
02430 get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
02431                           PgStat_StatDBEntry *dbentry)
02432 {
02433     PgStat_StatTabEntry *tabentry = NULL;
02434 
02435     if (isshared)
02436     {
02437         if (PointerIsValid(shared))
02438             tabentry = hash_search(shared->tables, &relid,
02439                                    HASH_FIND, NULL);
02440     }
02441     else if (PointerIsValid(dbentry))
02442         tabentry = hash_search(dbentry->tables, &relid,
02443                                HASH_FIND, NULL);
02444 
02445     return tabentry;
02446 }
02447 
02448 /*
02449  * table_recheck_autovac
02450  *
02451  * Recheck whether a table still needs vacuum or analyze.  Return value is a
02452  * valid autovac_table pointer if it does, NULL otherwise.
02453  *
02454  * Note that the returned autovac_table does not have the name fields set.
02455  */
02456 static autovac_table *
02457 table_recheck_autovac(Oid relid, HTAB *table_toast_map,
02458                       TupleDesc pg_class_desc)
02459 {
02460     Form_pg_class classForm;
02461     HeapTuple   classTup;
02462     bool        dovacuum;
02463     bool        doanalyze;
02464     autovac_table *tab = NULL;
02465     PgStat_StatTabEntry *tabentry;
02466     PgStat_StatDBEntry *shared;
02467     PgStat_StatDBEntry *dbentry;
02468     bool        wraparound;
02469     AutoVacOpts *avopts;
02470 
02471     /* use fresh stats */
02472     autovac_refresh_stats();
02473 
02474     shared = pgstat_fetch_stat_dbentry(InvalidOid);
02475     dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
02476 
02477     /* fetch the relation's relcache entry */
02478     classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
02479     if (!HeapTupleIsValid(classTup))
02480         return NULL;
02481     classForm = (Form_pg_class) GETSTRUCT(classTup);
02482 
02483     /*
02484      * Get the applicable reloptions.  If it is a TOAST table, try to get the
02485      * main table reloptions if the toast table itself doesn't have.
02486      */
02487     avopts = extract_autovac_opts(classTup, pg_class_desc);
02488     if (classForm->relkind == RELKIND_TOASTVALUE &&
02489         avopts == NULL && table_toast_map != NULL)
02490     {
02491         av_relation *hentry;
02492         bool        found;
02493 
02494         hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
02495         if (found && hentry->ar_hasrelopts)
02496             avopts = &hentry->ar_reloptions;
02497     }
02498 
02499     /* fetch the pgstat table entry */
02500     tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
02501                                          shared, dbentry);
02502 
02503     relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
02504                               &dovacuum, &doanalyze, &wraparound);
02505 
02506     /* ignore ANALYZE for toast tables */
02507     if (classForm->relkind == RELKIND_TOASTVALUE)
02508         doanalyze = false;
02509 
02510     /* OK, it needs something done */
02511     if (doanalyze || dovacuum)
02512     {
02513         int         freeze_min_age;
02514         int         freeze_table_age;
02515         int         vac_cost_limit;
02516         int         vac_cost_delay;
02517 
02518         /*
02519          * Calculate the vacuum cost parameters and the freeze ages.  If there
02520          * are options set in pg_class.reloptions, use them; in the case of a
02521          * toast table, try the main table too.  Otherwise use the GUC
02522          * defaults, autovacuum's own first and plain vacuum second.
02523          */
02524 
02525         /* -1 in autovac setting means use plain vacuum_cost_delay */
02526         vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= 0)
02527             ? avopts->vacuum_cost_delay
02528             : (autovacuum_vac_cost_delay >= 0)
02529             ? autovacuum_vac_cost_delay
02530             : VacuumCostDelay;
02531 
02532         /* 0 or -1 in autovac setting means use plain vacuum_cost_limit */
02533         vac_cost_limit = (avopts && avopts->vacuum_cost_limit > 0)
02534             ? avopts->vacuum_cost_limit
02535             : (autovacuum_vac_cost_limit > 0)
02536             ? autovacuum_vac_cost_limit
02537             : VacuumCostLimit;
02538 
02539         /* these do not have autovacuum-specific settings */
02540         freeze_min_age = (avopts && avopts->freeze_min_age >= 0)
02541             ? avopts->freeze_min_age
02542             : default_freeze_min_age;
02543 
02544         freeze_table_age = (avopts && avopts->freeze_table_age >= 0)
02545             ? avopts->freeze_table_age
02546             : default_freeze_table_age;
02547 
02548         tab = palloc(sizeof(autovac_table));
02549         tab->at_relid = relid;
02550         tab->at_dovacuum = dovacuum;
02551         tab->at_doanalyze = doanalyze;
02552         tab->at_freeze_min_age = freeze_min_age;
02553         tab->at_freeze_table_age = freeze_table_age;
02554         tab->at_vacuum_cost_limit = vac_cost_limit;
02555         tab->at_vacuum_cost_delay = vac_cost_delay;
02556         tab->at_wraparound = wraparound;
02557         tab->at_relname = NULL;
02558         tab->at_nspname = NULL;
02559         tab->at_datname = NULL;
02560     }
02561 
02562     heap_freetuple(classTup);
02563 
02564     return tab;
02565 }
02566 
02567 /*
02568  * relation_needs_vacanalyze
02569  *
02570  * Check whether a relation needs to be vacuumed or analyzed; return each into
02571  * "dovacuum" and "doanalyze", respectively.  Also return whether the vacuum is
02572  * being forced because of Xid wraparound.
02573  *
02574  * relopts is a pointer to the AutoVacOpts options (either for itself in the
02575  * case of a plain table, or for either itself or its parent table in the case
02576  * of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be
02577  * NULL.
02578  *
02579  * A table needs to be vacuumed if the number of dead tuples exceeds a
02580  * threshold.  This threshold is calculated as
02581  *
02582  * threshold = vac_base_thresh + vac_scale_factor * reltuples
02583  *
02584  * For analyze, the analysis done is that the number of tuples inserted,
02585  * deleted and updated since the last analyze exceeds a threshold calculated
02586  * in the same fashion as above.  Note that the collector actually stores
02587  * the number of tuples (both live and dead) that there were as of the last
02588  * analyze.  This is asymmetric to the VACUUM case.
02589  *
02590  * We also force vacuum if the table's relfrozenxid is more than freeze_max_age
02591  * transactions back.
02592  *
02593  * A table whose autovacuum_enabled option is false is
02594  * automatically skipped (unless we have to vacuum it due to freeze_max_age).
02595  * Thus autovacuum can be disabled for specific tables. Also, when the stats
02596  * collector does not have data about a table, it will be skipped.
02597  *
02598  * A table whose vac_base_thresh value is < 0 takes the base value from the
02599  * autovacuum_vacuum_threshold GUC variable.  Similarly, a vac_scale_factor
02600  * value < 0 is substituted with the value of
02601  * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
02602  */
02603 static void
02604 relation_needs_vacanalyze(Oid relid,
02605                           AutoVacOpts *relopts,
02606                           Form_pg_class classForm,
02607                           PgStat_StatTabEntry *tabentry,
02608  /* output params below */
02609                           bool *dovacuum,
02610                           bool *doanalyze,
02611                           bool *wraparound)
02612 {
02613     bool        force_vacuum;
02614     bool        av_enabled;
02615     float4      reltuples;      /* pg_class.reltuples */
02616 
02617     /* constants from reloptions or GUC variables */
02618     int         vac_base_thresh,
02619                 anl_base_thresh;
02620     float4      vac_scale_factor,
02621                 anl_scale_factor;
02622 
02623     /* thresholds calculated from above constants */
02624     float4      vacthresh,
02625                 anlthresh;
02626 
02627     /* number of vacuum (resp. analyze) tuples at this time */
02628     float4      vactuples,
02629                 anltuples;
02630 
02631     /* freeze parameters */
02632     int         freeze_max_age;
02633     TransactionId xidForceLimit;
02634     MultiXactId multiForceLimit;
02635 
02636     AssertArg(classForm != NULL);
02637     AssertArg(OidIsValid(relid));
02638 
02639     /*
02640      * Determine vacuum/analyze equation parameters.  We have two possible
02641      * sources: the passed reloptions (which could be a main table or a toast
02642      * table), or the autovacuum GUC variables.
02643      */
02644 
02645     /* -1 in autovac setting means use plain vacuum_cost_delay */
02646     vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= 0)
02647         ? relopts->vacuum_scale_factor
02648         : autovacuum_vac_scale;
02649 
02650     vac_base_thresh = (relopts && relopts->vacuum_threshold >= 0)
02651         ? relopts->vacuum_threshold
02652         : autovacuum_vac_thresh;
02653 
02654     anl_scale_factor = (relopts && relopts->analyze_scale_factor >= 0)
02655         ? relopts->analyze_scale_factor
02656         : autovacuum_anl_scale;
02657 
02658     anl_base_thresh = (relopts && relopts->analyze_threshold >= 0)
02659         ? relopts->analyze_threshold
02660         : autovacuum_anl_thresh;
02661 
02662     freeze_max_age = (relopts && relopts->freeze_max_age >= 0)
02663         ? Min(relopts->freeze_max_age, autovacuum_freeze_max_age)
02664         : autovacuum_freeze_max_age;
02665 
02666     av_enabled = (relopts ? relopts->enabled : true);
02667 
02668     /* Force vacuum if table is at risk of wraparound */
02669     xidForceLimit = recentXid - freeze_max_age;
02670     if (xidForceLimit < FirstNormalTransactionId)
02671         xidForceLimit -= FirstNormalTransactionId;
02672     force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
02673                     TransactionIdPrecedes(classForm->relfrozenxid,
02674                                           xidForceLimit));
02675     if (!force_vacuum)
02676     {
02677         multiForceLimit = recentMulti - autovacuum_freeze_max_age;
02678         if (multiForceLimit < FirstMultiXactId)
02679             multiForceLimit -= FirstMultiXactId;
02680         force_vacuum = MultiXactIdPrecedes(classForm->relminmxid,
02681                                            multiForceLimit);
02682     }
02683     *wraparound = force_vacuum;
02684 
02685     /* User disabled it in pg_class.reloptions?  (But ignore if at risk) */
02686     if (!force_vacuum && !av_enabled)
02687     {
02688         *doanalyze = false;
02689         *dovacuum = false;
02690         return;
02691     }
02692 
02693     if (PointerIsValid(tabentry))
02694     {
02695         reltuples = classForm->reltuples;
02696         vactuples = tabentry->n_dead_tuples;
02697         anltuples = tabentry->changes_since_analyze;
02698 
02699         vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
02700         anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
02701 
02702         /*
02703          * Note that we don't need to take special consideration for stat
02704          * reset, because if that happens, the last vacuum and analyze counts
02705          * will be reset too.
02706          */
02707         elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)",
02708              NameStr(classForm->relname),
02709              vactuples, vacthresh, anltuples, anlthresh);
02710 
02711         /* Determine if this table needs vacuum or analyze. */
02712         *dovacuum = force_vacuum || (vactuples > vacthresh);
02713         *doanalyze = (anltuples > anlthresh);
02714     }
02715     else
02716     {
02717         /*
02718          * Skip a table not found in stat hash, unless we have to force vacuum
02719          * for anti-wrap purposes.  If it's not acted upon, there's no need to
02720          * vacuum it.
02721          */
02722         *dovacuum = force_vacuum;
02723         *doanalyze = false;
02724     }
02725 
02726     /* ANALYZE refuses to work with pg_statistics */
02727     if (relid == StatisticRelationId)
02728         *doanalyze = false;
02729 }
02730 
02731 /*
02732  * autovacuum_do_vac_analyze
02733  *      Vacuum and/or analyze the specified table
02734  */
02735 static void
02736 autovacuum_do_vac_analyze(autovac_table *tab,
02737                           BufferAccessStrategy bstrategy)
02738 {
02739     VacuumStmt  vacstmt;
02740     RangeVar    rangevar;
02741 
02742     /* Set up command parameters --- use local variables instead of palloc */
02743     MemSet(&vacstmt, 0, sizeof(vacstmt));
02744     MemSet(&rangevar, 0, sizeof(rangevar));
02745 
02746     rangevar.schemaname = tab->at_nspname;
02747     rangevar.relname = tab->at_relname;
02748     rangevar.location = -1;
02749 
02750     vacstmt.type = T_VacuumStmt;
02751     if (!tab->at_wraparound)
02752         vacstmt.options = VACOPT_NOWAIT;
02753     if (tab->at_dovacuum)
02754         vacstmt.options |= VACOPT_VACUUM;
02755     if (tab->at_doanalyze)
02756         vacstmt.options |= VACOPT_ANALYZE;
02757     vacstmt.freeze_min_age = tab->at_freeze_min_age;
02758     vacstmt.freeze_table_age = tab->at_freeze_table_age;
02759     /* we pass the OID, but might need this anyway for an error message */
02760     vacstmt.relation = &rangevar;
02761     vacstmt.va_cols = NIL;
02762 
02763     /* Let pgstat know what we're doing */
02764     autovac_report_activity(tab);
02765 
02766     vacuum(&vacstmt, tab->at_relid, false, bstrategy, tab->at_wraparound, true);
02767 }
02768 
02769 /*
02770  * autovac_report_activity
02771  *      Report to pgstat what autovacuum is doing
02772  *
02773  * We send a SQL string corresponding to what the user would see if the
02774  * equivalent command was to be issued manually.
02775  *
02776  * Note we assume that we are going to report the next command as soon as we're
02777  * done with the current one, and exit right after the last one, so we don't
02778  * bother to report "<IDLE>" or some such.
02779  */
02780 static void
02781 autovac_report_activity(autovac_table *tab)
02782 {
02783 #define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56)
02784     char        activity[MAX_AUTOVAC_ACTIV_LEN];
02785     int         len;
02786 
02787     /* Report the command and possible options */
02788     if (tab->at_dovacuum)
02789         snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
02790                  "autovacuum: VACUUM%s",
02791                  tab->at_doanalyze ? " ANALYZE" : "");
02792     else
02793         snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
02794                  "autovacuum: ANALYZE");
02795 
02796     /*
02797      * Report the qualified name of the relation.
02798      */
02799     len = strlen(activity);
02800 
02801     snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
02802              " %s.%s%s", tab->at_nspname, tab->at_relname,
02803              tab->at_wraparound ? " (to prevent wraparound)" : "");
02804 
02805     /* Set statement_timestamp() to current time for pg_stat_activity */
02806     SetCurrentStatementStartTimestamp();
02807 
02808     pgstat_report_activity(STATE_RUNNING, activity);
02809 }
02810 
02811 /*
02812  * AutoVacuumingActive
02813  *      Check GUC vars and report whether the autovacuum process should be
02814  *      running.
02815  */
02816 bool
02817 AutoVacuumingActive(void)
02818 {
02819     if (!autovacuum_start_daemon || !pgstat_track_counts)
02820         return false;
02821     return true;
02822 }
02823 
02824 /*
02825  * autovac_init
02826  *      This is called at postmaster initialization.
02827  *
02828  * All we do here is annoy the user if he got it wrong.
02829  */
02830 void
02831 autovac_init(void)
02832 {
02833     if (autovacuum_start_daemon && !pgstat_track_counts)
02834         ereport(WARNING,
02835                 (errmsg("autovacuum not started because of misconfiguration"),
02836                  errhint("Enable the \"track_counts\" option.")));
02837 }
02838 
02839 /*
02840  * IsAutoVacuum functions
02841  *      Return whether this is either a launcher autovacuum process or a worker
02842  *      process.
02843  */
02844 bool
02845 IsAutoVacuumLauncherProcess(void)
02846 {
02847     return am_autovacuum_launcher;
02848 }
02849 
02850 bool
02851 IsAutoVacuumWorkerProcess(void)
02852 {
02853     return am_autovacuum_worker;
02854 }
02855 
02856 
02857 /*
02858  * AutoVacuumShmemSize
02859  *      Compute space needed for autovacuum-related shared memory
02860  */
02861 Size
02862 AutoVacuumShmemSize(void)
02863 {
02864     Size        size;
02865 
02866     /*
02867      * Need the fixed struct and the array of WorkerInfoData.
02868      */
02869     size = sizeof(AutoVacuumShmemStruct);
02870     size = MAXALIGN(size);
02871     size = add_size(size, mul_size(autovacuum_max_workers,
02872                                    sizeof(WorkerInfoData)));
02873     return size;
02874 }
02875 
02876 /*
02877  * AutoVacuumShmemInit
02878  *      Allocate and initialize autovacuum-related shared memory
02879  */
02880 void
02881 AutoVacuumShmemInit(void)
02882 {
02883     bool        found;
02884 
02885     AutoVacuumShmem = (AutoVacuumShmemStruct *)
02886         ShmemInitStruct("AutoVacuum Data",
02887                         AutoVacuumShmemSize(),
02888                         &found);
02889 
02890     if (!IsUnderPostmaster)
02891     {
02892         WorkerInfo  worker;
02893         int         i;
02894 
02895         Assert(!found);
02896 
02897         AutoVacuumShmem->av_launcherpid = 0;
02898         dlist_init(&AutoVacuumShmem->av_freeWorkers);
02899         dlist_init(&AutoVacuumShmem->av_runningWorkers);
02900         AutoVacuumShmem->av_startingWorker = NULL;
02901 
02902         worker = (WorkerInfo) ((char *) AutoVacuumShmem +
02903                                MAXALIGN(sizeof(AutoVacuumShmemStruct)));
02904 
02905         /* initialize the WorkerInfo free list */
02906         for (i = 0; i < autovacuum_max_workers; i++)
02907             dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
02908                             &worker[i].wi_links);
02909     }
02910     else
02911         Assert(found);
02912 }
02913 
02914 /*
02915  * autovac_refresh_stats
02916  *      Refresh pgstats data for an autovacuum process
02917  *
02918  * Cause the next pgstats read operation to obtain fresh data, but throttle
02919  * such refreshing in the autovacuum launcher.  This is mostly to avoid
02920  * rereading the pgstats files too many times in quick succession when there
02921  * are many databases.
02922  *
02923  * Note: we avoid throttling in the autovac worker, as it would be
02924  * counterproductive in the recheck logic.
02925  */
02926 static void
02927 autovac_refresh_stats(void)
02928 {
02929     if (IsAutoVacuumLauncherProcess())
02930     {
02931         static TimestampTz last_read = 0;
02932         TimestampTz current_time;
02933 
02934         current_time = GetCurrentTimestamp();
02935 
02936         if (!TimestampDifferenceExceeds(last_read, current_time,
02937                                         STATS_READ_DELAY))
02938             return;
02939 
02940         last_read = current_time;
02941     }
02942 
02943     pgstat_clear_snapshot();
02944 }