syncscan.c
00001 /*-------------------------------------------------------------------------
00002  *
00003  * syncscan.c
00004  *    heap scan synchronization support
00005  *
00006  * When multiple backends run a sequential scan on the same table, we try
00007  * to keep them synchronized to reduce the overall I/O needed.  The goal is
00008  * to read each page into shared buffer cache only once, and let all backends
00009  * that take part in the shared scan process the page before it falls out of
00010  * the cache.
00011  *
00012  * Since the "leader" in a pack of backends doing a seqscan will have to wait
00013  * for I/O, while the "followers" don't, there is a strong self-synchronizing
00014  * effect once we can get the backends examining approximately the same part
00015  * of the table at the same time.  Hence all that is really needed is to get
00016  * a new backend beginning a seqscan to begin it close to where other backends
00017  * are reading.  We can scan the table circularly, from block X up to the
00018  * end and then from block 0 to X-1, to ensure we visit all rows while still
00019  * participating in the common scan.
00020  *
00021  * To accomplish that, we keep track of the scan position of each table, and
00022  * start new scans close to where the previous scan(s) are.  We don't try to
00023  * do any extra synchronization to keep the scans together afterwards; some
00024  * scans might progress much more slowly than others, for example if the
00025  * results need to be transferred to the client over a slow network, and we
00026  * don't want such queries to slow down others.
00027  *
00028  * There can realistically only be a few large sequential scans on different
00029  * tables in progress at any time.  Therefore we just keep the scan positions
00030  * in a small LRU list which we scan every time we need to look up or update a
00031  * scan position.  The whole mechanism is only applied for tables exceeding
00032  * a threshold size (but that is not the concern of this module).
00033  *
00034  * INTERFACE ROUTINES
00035  *      ss_get_location     - return current scan location of a relation
00036  *      ss_report_location  - update current scan location
00037  *
00038  *
00039  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00040  * Portions Copyright (c) 1994, Regents of the University of California
00041  *
00042  * IDENTIFICATION
00043  *    src/backend/access/heap/syncscan.c
00044  *
00045  *-------------------------------------------------------------------------
00046  */
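/*
 * Illustrative sketch (not part of syncscan.c): the "circular" visit order
 * described in the header comment above.  A scan that joins the pack at
 * block X still sees every block exactly once by reading X, X+1, ...,
 * nblocks-1, then wrapping to 0 and stopping just before X.  This is
 * standalone C with a toy process_block() standing in for the real page
 * processing; all names here are illustrative only.
 */
#include <stdio.h>

static void
process_block(unsigned blkno)
{
    printf("reading block %u\n", blkno);
}

static void
circular_scan(unsigned start_block, unsigned nblocks)
{
    unsigned    blkno = start_block;
    unsigned    i;

    for (i = 0; i < nblocks; i++)
    {
        process_block(blkno);

        /* advance, wrapping around at the end of the table */
        if (++blkno >= nblocks)
            blkno = 0;
    }
}

int
main(void)
{
    /* e.g. join an existing scan that is currently around block 5 of 8 */
    circular_scan(5, 8);
    return 0;
}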
00047 #include "postgres.h"
00048 
00049 #include "access/heapam.h"
00050 #include "miscadmin.h"
00051 #include "utils/rel.h"
00052 
00053 
00054 /* GUC variables */
00055 #ifdef TRACE_SYNCSCAN
00056 bool        trace_syncscan = false;
00057 #endif
00058 
00059 
00060 /*
00061  * Size of the LRU list.
00062  *
00063  * Note: the code assumes that SYNC_SCAN_NELEM > 1.
00064  *
00065  * XXX: What's a good value? It should be large enough to hold the
00066  * maximum number of large tables scanned simultaneously.  But a larger value
00067  * means more traversing of the LRU list when starting a new scan.
00068  */
00069 #define SYNC_SCAN_NELEM 20
00070 
00071 /*
00072  * Interval between reports of the location of the current scan, in pages.
00073  *
00074  * Note: This should be smaller than the ring size (see buffer/freelist.c)
00075  * we use for bulk reads.  Otherwise a scan joining other scans might start
00076  * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
00077  * there's no guarantee that the new scan will read the page before it leaves
00078  * the buffer cache anyway, and on the other hand the page is most likely
00079  * still in the OS cache.
00080  */
00081 #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
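/* with the default BLCKSZ of 8192, this works out to 16 pages (128 kB) */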
00082 
00083 
00084 /*
00085  * The scan locations structure is essentially a doubly-linked LRU with head
00086  * and tail pointer, but designed to hold a fixed maximum number of elements in
00087  * fixed-size shared memory.
00088  */
00089 typedef struct ss_scan_location_t
00090 {
00091     RelFileNode relfilenode;    /* identity of a relation */
00092     BlockNumber location;       /* last-reported location in the relation */
00093 } ss_scan_location_t;
00094 
00095 typedef struct ss_lru_item_t
00096 {
00097     struct ss_lru_item_t *prev;
00098     struct ss_lru_item_t *next;
00099     ss_scan_location_t location;
00100 } ss_lru_item_t;
00101 
00102 typedef struct ss_scan_locations_t
00103 {
00104     ss_lru_item_t *head;
00105     ss_lru_item_t *tail;
00106     ss_lru_item_t items[1];     /* SYNC_SCAN_NELEM items */
00107 } ss_scan_locations_t;
00108 
00109 #define SizeOfScanLocations(N) offsetof(ss_scan_locations_t, items[N])
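/*
 * Illustrative sketch (not part of syncscan.c): sizing a variable-length
 * struct with offsetof(), as SizeOfScanLocations does above.  The struct
 * declares a one-element array, but the allocation is made large enough for
 * N elements by taking the offset of items[N].  Standalone C; the struct and
 * names below are illustrative only.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_NELEM 20

typedef struct demo_locations_t
{
    int         head;
    int         tail;
    long        items[1];      /* really DEMO_NELEM items, see malloc below */
} demo_locations_t;

#define SizeOfDemoLocations(N) offsetof(demo_locations_t, items[N])

int
main(void)
{
    demo_locations_t *p;

    /* allocate room for the header fields plus DEMO_NELEM array elements */
    p = (demo_locations_t *) malloc(SizeOfDemoLocations(DEMO_NELEM));
    if (p == NULL)
        return 1;

    printf("header-only size:   %zu bytes\n", SizeOfDemoLocations(0));
    printf("size for %d items:  %zu bytes\n",
           DEMO_NELEM, SizeOfDemoLocations(DEMO_NELEM));

    free(p);
    return 0;
}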
00110 
00111 /* Pointer to struct in shared memory */
00112 static ss_scan_locations_t *scan_locations;
00113 
00114 /* prototypes for internal functions */
00115 static BlockNumber ss_search(RelFileNode relfilenode,
00116           BlockNumber location, bool set);
00117 
00118 
00119 /*
00120  * SyncScanShmemSize --- report amount of shared memory space needed
00121  */
00122 Size
00123 SyncScanShmemSize(void)
00124 {
00125     return SizeOfScanLocations(SYNC_SCAN_NELEM);
00126 }
00127 
00128 /*
00129  * SyncScanShmemInit --- initialize this module's shared memory
00130  */
00131 void
00132 SyncScanShmemInit(void)
00133 {
00134     int         i;
00135     bool        found;
00136 
00137     scan_locations = (ss_scan_locations_t *)
00138         ShmemInitStruct("Sync Scan Locations List",
00139                         SizeOfScanLocations(SYNC_SCAN_NELEM),
00140                         &found);
00141 
00142     if (!IsUnderPostmaster)
00143     {
00144         /* Initialize shared memory area */
00145         Assert(!found);
00146 
00147         scan_locations->head = &scan_locations->items[0];
00148         scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
00149 
00150         for (i = 0; i < SYNC_SCAN_NELEM; i++)
00151         {
00152             ss_lru_item_t *item = &scan_locations->items[i];
00153 
00154             /*
00155              * Initialize all slots with invalid values. As scans are started,
00156              * these invalid entries will fall off the LRU list and get
00157              * replaced with real entries.
00158              */
00159             item->location.relfilenode.spcNode = InvalidOid;
00160             item->location.relfilenode.dbNode = InvalidOid;
00161             item->location.relfilenode.relNode = InvalidOid;
00162             item->location.location = InvalidBlockNumber;
00163 
00164             item->prev = (i > 0) ?
00165                 (&scan_locations->items[i - 1]) : NULL;
00166             item->next = (i < SYNC_SCAN_NELEM - 1) ?
00167                 (&scan_locations->items[i + 1]) : NULL;
00168         }
00169     }
00170     else
00171         Assert(found);
00172 }
00173 
00174 /*
00175  * ss_search --- search the scan_locations structure for an entry with the
00176  *      given relfilenode.
00177  *
00178  * If "set" is true, the location is updated to the given location.  If no
00179  * entry for the given relfilenode is found, it will be created at the head
00180  * of the list with the given location, even if "set" is false.
00181  *
00182  * In any case, the location after possible update is returned.
00183  *
00184  * Caller is responsible for having acquired suitable lock on the shared
00185  * data structure.
00186  */
00187 static BlockNumber
00188 ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
00189 {
00190     ss_lru_item_t *item;
00191 
00192     item = scan_locations->head;
00193     for (;;)
00194     {
00195         bool        match;
00196 
00197         match = RelFileNodeEquals(item->location.relfilenode, relfilenode);
00198 
00199         if (match || item->next == NULL)
00200         {
00201             /*
00202              * If we reached the end of list and no match was found, take over
00203              * the last entry
00204              */
00205             if (!match)
00206             {
00207                 item->location.relfilenode = relfilenode;
00208                 item->location.location = location;
00209             }
00210             else if (set)
00211                 item->location.location = location;
00212 
00213             /* Move the entry to the front of the LRU list */
00214             if (item != scan_locations->head)
00215             {
00216                 /* unlink */
00217                 if (item == scan_locations->tail)
00218                     scan_locations->tail = item->prev;
00219                 item->prev->next = item->next;
00220                 if (item->next)
00221                     item->next->prev = item->prev;
00222 
00223                 /* link */
00224                 item->prev = NULL;
00225                 item->next = scan_locations->head;
00226                 scan_locations->head->prev = item;
00227                 scan_locations->head = item;
00228             }
00229 
00230             return item->location.location;
00231         }
00232 
00233         item = item->next;
00234     }
00235 
00236     /* not reached */
00237 }
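/*
 * Illustrative sketch (not part of syncscan.c): the move-to-front step used
 * by ss_search() above, on a doubly-linked list whose nodes live in a fixed
 * array.  Unlinking touches the tail pointer only when the moved node is the
 * tail, exactly as in the real code.  Standalone C with toy types; all names
 * are illustrative only.
 */
#include <stdio.h>

#define NELEM 4

typedef struct node_t
{
    struct node_t *prev;
    struct node_t *next;
    int         value;
} node_t;

static node_t items[NELEM];
static node_t *head;
static node_t *tail;

static void
init_list(void)
{
    int         i;

    head = &items[0];
    tail = &items[NELEM - 1];
    for (i = 0; i < NELEM; i++)
    {
        items[i].value = i;
        items[i].prev = (i > 0) ? &items[i - 1] : NULL;
        items[i].next = (i < NELEM - 1) ? &items[i + 1] : NULL;
    }
}

static void
move_to_front(node_t *item)
{
    if (item == head)
        return;                 /* already at the front */

    /* unlink */
    if (item == tail)
        tail = item->prev;
    item->prev->next = item->next;
    if (item->next)
        item->next->prev = item->prev;

    /* link at the head */
    item->prev = NULL;
    item->next = head;
    head->prev = item;
    head = item;
}

int
main(void)
{
    node_t     *n;

    init_list();
    move_to_front(&items[2]);   /* most recently used entry goes first */

    for (n = head; n != NULL; n = n->next)
        printf("%d ", n->value);    /* prints: 2 0 1 3 */
    printf("\n");
    return 0;
}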
00238 
00239 /*
00240  * ss_get_location --- get the optimal starting location for scan
00241  *
00242  * Returns the last-reported location of a sequential scan on the
00243  * relation, or 0 if no valid location is found.
00244  *
00245  * We expect the caller has just done RelationGetNumberOfBlocks(), and
00246  * so that number is passed in rather than computing it again.  The result
00247  * is guaranteed less than relnblocks (assuming that's > 0).
00248  */
00249 BlockNumber
00250 ss_get_location(Relation rel, BlockNumber relnblocks)
00251 {
00252     BlockNumber startloc;
00253 
00254     LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
00255     startloc = ss_search(rel->rd_node, 0, false);
00256     LWLockRelease(SyncScanLock);
00257 
00258     /*
00259      * If the location is not a valid block number for this scan, start at 0.
00260      *
00261      * This can happen if for instance a VACUUM truncated the table since the
00262      * location was saved.
00263      */
00264     if (startloc >= relnblocks)
00265         startloc = 0;
00266 
00267 #ifdef TRACE_SYNCSCAN
00268     if (trace_syncscan)
00269         elog(LOG,
00270              "SYNC_SCAN: start \"%s\" (size %u) at %u",
00271              RelationGetRelationName(rel), relnblocks, startloc);
00272 #endif
00273 
00274     return startloc;
00275 }
00276 
00277 /*
00278  * ss_report_location --- update the current scan location
00279  *
00280  * Writes an entry into the shared Sync Scan state of the form
00281  * (relfilenode, blocknumber), overwriting any existing entry for the
00282  * same relfilenode.
00283  */
00284 void
00285 ss_report_location(Relation rel, BlockNumber location)
00286 {
00287 #ifdef TRACE_SYNCSCAN
00288     if (trace_syncscan)
00289     {
00290         if ((location % 1024) == 0)
00291             elog(LOG,
00292                  "SYNC_SCAN: scanning \"%s\" at %u",
00293                  RelationGetRelationName(rel), location);
00294     }
00295 #endif
00296 
00297     /*
00298      * To reduce lock contention, only report scan progress every N pages. For
00299      * the same reason, don't block if the lock isn't immediately available.
00300      * Missing a few updates isn't critical, it just means that a new scan
00301      * that wants to join the pack will start a little bit behind the head of
00302      * the scan.  Hopefully the pages are still in OS cache and the scan
00303      * catches up quickly.
00304      */
00305     if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
00306     {
00307         if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
00308         {
00309             (void) ss_search(rel->rd_node, location, true);
00310             LWLockRelease(SyncScanLock);
00311         }
00312 #ifdef TRACE_SYNCSCAN
00313         else if (trace_syncscan)
00314             elog(LOG,
00315                  "SYNC_SCAN: missed update for \"%s\" at %u",
00316                  RelationGetRelationName(rel), location);
00317 #endif
00318     }
00319 }
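/*
 * Illustrative sketch (not part of syncscan.c): the "report only every N
 * pages, and never block on the lock" pattern used by ss_report_location()
 * above.  Here a pthread mutex and trylock stand in for SyncScanLock and
 * LWLockConditionalAcquire(); a missed update is simply skipped.  Standalone
 * C (compile with -pthread); all names are illustrative only.
 */
#include <pthread.h>
#include <stdio.h>

#define REPORT_INTERVAL 16      /* pages between reports */

static pthread_mutex_t report_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned shared_location;    /* protected by report_lock */

static void
report_location(unsigned location)
{
    /* only attempt a report every REPORT_INTERVAL pages */
    if ((location % REPORT_INTERVAL) != 0)
        return;

    /* don't block: if someone else holds the lock, just skip this update */
    if (pthread_mutex_trylock(&report_lock) == 0)
    {
        shared_location = location;
        pthread_mutex_unlock(&report_lock);
    }
    else
        printf("missed update at %u\n", location);
}

int
main(void)
{
    unsigned    blkno;

    for (blkno = 0; blkno < 64; blkno++)
        report_location(blkno);

    printf("last reported location: %u\n", shared_location);
    return 0;
}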