00001 /*------------------------------------------------------------------------- 00002 * 00003 * syncscan.c 00004 * heap scan synchronization support 00005 * 00006 * When multiple backends run a sequential scan on the same table, we try 00007 * to keep them synchronized to reduce the overall I/O needed. The goal is 00008 * to read each page into shared buffer cache only once, and let all backends 00009 * that take part in the shared scan process the page before it falls out of 00010 * the cache. 00011 * 00012 * Since the "leader" in a pack of backends doing a seqscan will have to wait 00013 * for I/O, while the "followers" don't, there is a strong self-synchronizing 00014 * effect once we can get the backends examining approximately the same part 00015 * of the table at the same time. Hence all that is really needed is to get 00016 * a new backend beginning a seqscan to begin it close to where other backends 00017 * are reading. We can scan the table circularly, from block X up to the 00018 * end and then from block 0 to X-1, to ensure we visit all rows while still 00019 * participating in the common scan. 00020 * 00021 * To accomplish that, we keep track of the scan position of each table, and 00022 * start new scans close to where the previous scan(s) are. We don't try to 00023 * do any extra synchronization to keep the scans together afterwards; some 00024 * scans might progress much more slowly than others, for example if the 00025 * results need to be transferred to the client over a slow network, and we 00026 * don't want such queries to slow down others. 00027 * 00028 * There can realistically only be a few large sequential scans on different 00029 * tables in progress at any time. Therefore we just keep the scan positions 00030 * in a small LRU list which we scan every time we need to look up or update a 00031 * scan position. The whole mechanism is only applied for tables exceeding 00032 * a threshold size (but that is not the concern of this module). 00033 * 00034 * INTERFACE ROUTINES 00035 * ss_get_location - return current scan location of a relation 00036 * ss_report_location - update current scan location 00037 * 00038 * 00039 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group 00040 * Portions Copyright (c) 1994, Regents of the University of California 00041 * 00042 * IDENTIFICATION 00043 * src/backend/access/heap/syncscan.c 00044 * 00045 *------------------------------------------------------------------------- 00046 */ 00047 #include "postgres.h" 00048 00049 #include "access/heapam.h" 00050 #include "miscadmin.h" 00051 #include "utils/rel.h" 00052 00053 00054 /* GUC variables */ 00055 #ifdef TRACE_SYNCSCAN 00056 bool trace_syncscan = false; 00057 #endif 00058 00059 00060 /* 00061 * Size of the LRU list. 00062 * 00063 * Note: the code assumes that SYNC_SCAN_NELEM > 1. 00064 * 00065 * XXX: What's a good value? It should be large enough to hold the 00066 * maximum number of large tables scanned simultaneously. But a larger value 00067 * means more traversing of the LRU list when starting a new scan. 00068 */ 00069 #define SYNC_SCAN_NELEM 20 00070 00071 /* 00072 * Interval between reports of the location of the current scan, in pages. 00073 * 00074 * Note: This should be smaller than the ring size (see buffer/freelist.c) 00075 * we use for bulk reads. Otherwise a scan joining other scans might start 00076 * from a page that's no longer in the buffer cache. 

/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
	RelFileNode relfilenode;	/* identity of a relation */
	BlockNumber location;		/* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
	struct ss_lru_item_t *prev;
	struct ss_lru_item_t *next;
	ss_scan_location_t location;
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
	ss_lru_item_t *head;
	ss_lru_item_t *tail;
	ss_lru_item_t items[1];		/* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;

#define SizeOfScanLocations(N) offsetof(ss_scan_locations_t, items[N])

/* Pointer to struct in shared memory */
static ss_scan_locations_t *scan_locations;

/* prototypes for internal functions */
static BlockNumber ss_search(RelFileNode relfilenode,
		  BlockNumber location, bool set);


/*
 * SyncScanShmemSize --- report amount of shared memory space needed
 */
Size
SyncScanShmemSize(void)
{
	return SizeOfScanLocations(SYNC_SCAN_NELEM);
}

/*
 * SyncScanShmemInit --- initialize this module's shared memory
 */
void
SyncScanShmemInit(void)
{
	int			i;
	bool		found;

	scan_locations = (ss_scan_locations_t *)
		ShmemInitStruct("Sync Scan Locations List",
						SizeOfScanLocations(SYNC_SCAN_NELEM),
						&found);

	if (!IsUnderPostmaster)
	{
		/* Initialize shared memory area */
		Assert(!found);

		scan_locations->head = &scan_locations->items[0];
		scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];

		for (i = 0; i < SYNC_SCAN_NELEM; i++)
		{
			ss_lru_item_t *item = &scan_locations->items[i];

			/*
			 * Initialize all slots with invalid values. As scans are
			 * started, these invalid entries will fall off the LRU list
			 * and get replaced with real entries.
			 */
			item->location.relfilenode.spcNode = InvalidOid;
			item->location.relfilenode.dbNode = InvalidOid;
			item->location.relfilenode.relNode = InvalidOid;
			item->location.location = InvalidBlockNumber;

			item->prev = (i > 0) ?
				(&scan_locations->items[i - 1]) : NULL;
			item->next = (i < SYNC_SCAN_NELEM - 1) ?
				(&scan_locations->items[i + 1]) : NULL;
		}
	}
	else
		Assert(found);
}
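
/*
 * To illustrate the LRU discipline that ss_search (below) maintains: if the
 * list currently reads A -> B -> C and a backend looks up C, the entry is
 * unlinked and relinked at the front, giving C -> A -> B.  A lookup of a
 * relation D that is not in the list takes over the tail entry (evicting C)
 * and likewise moves it to the front: D -> A -> B.
 */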

/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilenode.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilenode is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
{
	ss_lru_item_t *item;

	item = scan_locations->head;
	for (;;)
	{
		bool		match;

		match = RelFileNodeEquals(item->location.relfilenode, relfilenode);

		if (match || item->next == NULL)
		{
			/*
			 * If we reached the end of list and no match was found, take
			 * over the last entry
			 */
			if (!match)
			{
				item->location.relfilenode = relfilenode;
				item->location.location = location;
			}
			else if (set)
				item->location.location = location;

			/* Move the entry to the front of the LRU list */
			if (item != scan_locations->head)
			{
				/* unlink */
				if (item == scan_locations->tail)
					scan_locations->tail = item->prev;
				item->prev->next = item->next;
				if (item->next)
					item->next->prev = item->prev;

				/* link */
				item->prev = NULL;
				item->next = scan_locations->head;
				scan_locations->head->prev = item;
				scan_locations->head = item;
			}

			return item->location.location;
		}

		item = item->next;
	}

	/* not reached */
}

/*
 * ss_get_location --- get the optimal starting location for scan
 *
 * Returns the last-reported location of a sequential scan on the
 * relation, or 0 if no valid location is found.
 *
 * We expect the caller has just done RelationGetNumberOfBlocks(), and
 * so that number is passed in rather than computing it again.  The result
 * is guaranteed less than relnblocks (assuming that's > 0).
 */
BlockNumber
ss_get_location(Relation rel, BlockNumber relnblocks)
{
	BlockNumber startloc;

	LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
	startloc = ss_search(rel->rd_node, 0, false);
	LWLockRelease(SyncScanLock);

	/*
	 * If the location is not a valid block number for this scan, start at 0.
	 *
	 * This can happen if for instance a VACUUM truncated the table since the
	 * location was saved.
	 */
	if (startloc >= relnblocks)
		startloc = 0;

#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
		elog(LOG,
			 "SYNC_SCAN: start \"%s\" (size %u) at %u",
			 RelationGetRelationName(rel), relnblocks, startloc);
#endif

	return startloc;
}
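
/*
 * For orientation, a sketch of how a sequential scan might drive this
 * module (the real caller is heapam.c; read_page here is a hypothetical
 * stand-in for the actual page-processing logic):
 *
 *		BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
 *		BlockNumber start = ss_get_location(rel, nblocks);
 *		BlockNumber blkno = start;
 *
 *		do
 *		{
 *			read_page(rel, blkno);
 *			ss_report_location(rel, blkno);
 *			blkno = (blkno + 1) % nblocks;
 *		} while (blkno != start);
 *
 * This is the circular scan described in the file header comment: start at
 * the advertised position, run to the end of the table, then wrap around
 * to cover blocks 0 .. start-1.
 */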

/*
 * ss_report_location --- update the current scan location
 *
 * Writes an entry into the shared Sync Scan state of the form
 * (relfilenode, blocknumber), overwriting any existing entry for the
 * same relfilenode.
 */
void
ss_report_location(Relation rel, BlockNumber location)
{
#ifdef TRACE_SYNCSCAN
	if (trace_syncscan)
	{
		if ((location % 1024) == 0)
			elog(LOG,
				 "SYNC_SCAN: scanning \"%s\" at %u",
				 RelationGetRelationName(rel), location);
	}
#endif

	/*
	 * To reduce lock contention, only report scan progress every N pages.
	 * For the same reason, don't block if the lock isn't immediately
	 * available.  Missing a few updates isn't critical, it just means that
	 * a new scan that wants to join the pack will start a little bit behind
	 * the head of the scan.  Hopefully the pages are still in OS cache and
	 * the scan catches up quickly.
	 */
	if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
	{
		if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
		{
			(void) ss_search(rel->rd_node, location, true);
			LWLockRelease(SyncScanLock);
		}
#ifdef TRACE_SYNCSCAN
		else if (trace_syncscan)
			elog(LOG,
				 "SYNC_SCAN: missed update for \"%s\" at %u",
				 RelationGetRelationName(rel), location);
#endif
	}
}
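
/*
 * Note: the TRACE_SYNCSCAN blocks above are compiled in only when the
 * server is built with the TRACE_SYNCSCAN macro defined (for example by
 * adding -DTRACE_SYNCSCAN to CPPFLAGS); the trace_syncscan GUC then
 * switches the logging on and off at run time.
 */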