00001 /*------------------------------------------------------------------------- 00002 * 00003 * fsmpage.c 00004 * routines to search and manipulate one FSM page. 00005 * 00006 * 00007 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group 00008 * Portions Copyright (c) 1994, Regents of the University of California 00009 * 00010 * IDENTIFICATION 00011 * src/backend/storage/freespace/fsmpage.c 00012 * 00013 * NOTES: 00014 * 00015 * The public functions in this file form an API that hides the internal 00016 * structure of a FSM page. This allows freespace.c to treat each FSM page 00017 * as a black box with SlotsPerPage "slots". fsm_set_avail() and 00018 * fsm_get_avail() let you get/set the value of a slot, and 00019 * fsm_search_avail() lets you search for a slot with value >= X. 00020 * 00021 *------------------------------------------------------------------------- 00022 */ 00023 #include "postgres.h" 00024 00025 #include "storage/bufmgr.h" 00026 #include "storage/fsm_internals.h" 00027 00028 /* Macros to navigate the tree within a page. Root has index zero. */ 00029 #define leftchild(x) (2 * (x) + 1) 00030 #define rightchild(x) (2 * (x) + 2) 00031 #define parentof(x) (((x) - 1) / 2) 00032 00033 /* 00034 * Find right neighbor of x, wrapping around within the level 00035 */ 00036 static int 00037 rightneighbor(int x) 00038 { 00039 /* 00040 * Move right. This might wrap around, stepping to the leftmost node at 00041 * the next level. 00042 */ 00043 x++; 00044 00045 /* 00046 * Check if we stepped to the leftmost node at next level, and correct if 00047 * so. The leftmost nodes at each level are numbered x = 2^level - 1, so 00048 * check if (x + 1) is a power of two, using a standard 00049 * twos-complement-arithmetic trick. 00050 */ 00051 if (((x + 1) & x) == 0) 00052 x = parentof(x); 00053 00054 return x; 00055 } 00056 00057 /* 00058 * Sets the value of a slot on page. Returns true if the page was modified. 00059 * 00060 * The caller must hold an exclusive lock on the page. 00061 */ 00062 bool 00063 fsm_set_avail(Page page, int slot, uint8 value) 00064 { 00065 int nodeno = NonLeafNodesPerPage + slot; 00066 FSMPage fsmpage = (FSMPage) PageGetContents(page); 00067 uint8 oldvalue; 00068 00069 Assert(slot < LeafNodesPerPage); 00070 00071 oldvalue = fsmpage->fp_nodes[nodeno]; 00072 00073 /* If the value hasn't changed, we don't need to do anything */ 00074 if (oldvalue == value && value <= fsmpage->fp_nodes[0]) 00075 return false; 00076 00077 fsmpage->fp_nodes[nodeno] = value; 00078 00079 /* 00080 * Propagate up, until we hit the root or a node that doesn't need to be 00081 * updated. 00082 */ 00083 do 00084 { 00085 uint8 newvalue = 0; 00086 int lchild; 00087 int rchild; 00088 00089 nodeno = parentof(nodeno); 00090 lchild = leftchild(nodeno); 00091 rchild = lchild + 1; 00092 00093 newvalue = fsmpage->fp_nodes[lchild]; 00094 if (rchild < NodesPerPage) 00095 newvalue = Max(newvalue, 00096 fsmpage->fp_nodes[rchild]); 00097 00098 oldvalue = fsmpage->fp_nodes[nodeno]; 00099 if (oldvalue == newvalue) 00100 break; 00101 00102 fsmpage->fp_nodes[nodeno] = newvalue; 00103 } while (nodeno > 0); 00104 00105 /* 00106 * sanity check: if the new value is (still) higher than the value at the 00107 * top, the tree is corrupt. If so, rebuild. 00108 */ 00109 if (value > fsmpage->fp_nodes[0]) 00110 fsm_rebuild_page(page); 00111 00112 return true; 00113 } 00114 00115 /* 00116 * Returns the value of given slot on page. 00117 * 00118 * Since this is just a read-only access of a single byte, the page doesn't 00119 * need to be locked. 00120 */ 00121 uint8 00122 fsm_get_avail(Page page, int slot) 00123 { 00124 FSMPage fsmpage = (FSMPage) PageGetContents(page); 00125 00126 Assert(slot < LeafNodesPerPage); 00127 00128 return fsmpage->fp_nodes[NonLeafNodesPerPage + slot]; 00129 } 00130 00131 /* 00132 * Returns the value at the root of a page. 00133 * 00134 * Since this is just a read-only access of a single byte, the page doesn't 00135 * need to be locked. 00136 */ 00137 uint8 00138 fsm_get_max_avail(Page page) 00139 { 00140 FSMPage fsmpage = (FSMPage) PageGetContents(page); 00141 00142 return fsmpage->fp_nodes[0]; 00143 } 00144 00145 /* 00146 * Searches for a slot with category at least minvalue. 00147 * Returns slot number, or -1 if none found. 00148 * 00149 * The caller must hold at least a shared lock on the page, and this 00150 * function can unlock and lock the page again in exclusive mode if it 00151 * needs to be updated. exclusive_lock_held should be set to true if the 00152 * caller is already holding an exclusive lock, to avoid extra work. 00153 * 00154 * If advancenext is false, fp_next_slot is set to point to the returned 00155 * slot, and if it's true, to the slot after the returned slot. 00156 */ 00157 int 00158 fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, 00159 bool exclusive_lock_held) 00160 { 00161 Page page = BufferGetPage(buf); 00162 FSMPage fsmpage = (FSMPage) PageGetContents(page); 00163 int nodeno; 00164 int target; 00165 uint16 slot; 00166 00167 restart: 00168 00169 /* 00170 * Check the root first, and exit quickly if there's no leaf with enough 00171 * free space 00172 */ 00173 if (fsmpage->fp_nodes[0] < minvalue) 00174 return -1; 00175 00176 /* 00177 * Start search using fp_next_slot. It's just a hint, so check that it's 00178 * sane. (This also handles wrapping around when the prior call returned 00179 * the last slot on the page.) 00180 */ 00181 target = fsmpage->fp_next_slot; 00182 if (target < 0 || target >= LeafNodesPerPage) 00183 target = 0; 00184 target += NonLeafNodesPerPage; 00185 00186 /*---------- 00187 * Start the search from the target slot. At every step, move one 00188 * node to the right, then climb up to the parent. Stop when we reach 00189 * a node with enough free space (as we must, since the root has enough 00190 * space). 00191 * 00192 * The idea is to gradually expand our "search triangle", that is, all 00193 * nodes covered by the current node, and to be sure we search to the 00194 * right from the start point. At the first step, only the target slot 00195 * is examined. When we move up from a left child to its parent, we are 00196 * adding the right-hand subtree of that parent to the search triangle. 00197 * When we move right then up from a right child, we are dropping the 00198 * current search triangle (which we know doesn't contain any suitable 00199 * page) and instead looking at the next-larger-size triangle to its 00200 * right. So we never look left from our original start point, and at 00201 * each step the size of the search triangle doubles, ensuring it takes 00202 * only log2(N) work to search N pages. 00203 * 00204 * The "move right" operation will wrap around if it hits the right edge 00205 * of the tree, so the behavior is still good if we start near the right. 00206 * Note also that the move-and-climb behavior ensures that we can't end 00207 * up on one of the missing nodes at the right of the leaf level. 00208 * 00209 * For example, consider this tree: 00210 * 00211 * 7 00212 * 7 6 00213 * 5 7 6 5 00214 * 4 5 5 7 2 6 5 2 00215 * T 00216 * 00217 * Assume that the target node is the node indicated by the letter T, 00218 * and we're searching for a node with value of 6 or higher. The search 00219 * begins at T. At the first iteration, we move to the right, then to the 00220 * parent, arriving at the rightmost 5. At the second iteration, we move 00221 * to the right, wrapping around, then climb up, arriving at the 7 on the 00222 * third level. 7 satisfies our search, so we descend down to the bottom, 00223 * following the path of sevens. This is in fact the first suitable page 00224 * to the right of (allowing for wraparound) our start point. 00225 *---------- 00226 */ 00227 nodeno = target; 00228 while (nodeno > 0) 00229 { 00230 if (fsmpage->fp_nodes[nodeno] >= minvalue) 00231 break; 00232 00233 /* 00234 * Move to the right, wrapping around on same level if necessary, then 00235 * climb up. 00236 */ 00237 nodeno = parentof(rightneighbor(nodeno)); 00238 } 00239 00240 /* 00241 * We're now at a node with enough free space, somewhere in the middle of 00242 * the tree. Descend to the bottom, following a path with enough free 00243 * space, preferring to move left if there's a choice. 00244 */ 00245 while (nodeno < NonLeafNodesPerPage) 00246 { 00247 int childnodeno = leftchild(nodeno); 00248 00249 if (childnodeno < NodesPerPage && 00250 fsmpage->fp_nodes[childnodeno] >= minvalue) 00251 { 00252 nodeno = childnodeno; 00253 continue; 00254 } 00255 childnodeno++; /* point to right child */ 00256 if (childnodeno < NodesPerPage && 00257 fsmpage->fp_nodes[childnodeno] >= minvalue) 00258 { 00259 nodeno = childnodeno; 00260 } 00261 else 00262 { 00263 /* 00264 * Oops. The parent node promised that either left or right child 00265 * has enough space, but neither actually did. This can happen in 00266 * case of a "torn page", IOW if we crashed earlier while writing 00267 * the page to disk, and only part of the page made it to disk. 00268 * 00269 * Fix the corruption and restart. 00270 */ 00271 RelFileNode rnode; 00272 ForkNumber forknum; 00273 BlockNumber blknum; 00274 00275 BufferGetTag(buf, &rnode, &forknum, &blknum); 00276 elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", 00277 blknum, rnode.spcNode, rnode.dbNode, rnode.relNode); 00278 00279 /* make sure we hold an exclusive lock */ 00280 if (!exclusive_lock_held) 00281 { 00282 LockBuffer(buf, BUFFER_LOCK_UNLOCK); 00283 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); 00284 exclusive_lock_held = true; 00285 } 00286 fsm_rebuild_page(page); 00287 MarkBufferDirtyHint(buf); 00288 goto restart; 00289 } 00290 } 00291 00292 /* We're now at the bottom level, at a node with enough space. */ 00293 slot = nodeno - NonLeafNodesPerPage; 00294 00295 /* 00296 * Update the next-target pointer. Note that we do this even if we're only 00297 * holding a shared lock, on the grounds that it's better to use a shared 00298 * lock and get a garbled next pointer every now and then, than take the 00299 * concurrency hit of an exclusive lock. 00300 * 00301 * Wrap-around is handled at the beginning of this function. 00302 */ 00303 fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); 00304 00305 return slot; 00306 } 00307 00308 /* 00309 * Sets the available space to zero for all slots numbered >= nslots. 00310 * Returns true if the page was modified. 00311 */ 00312 bool 00313 fsm_truncate_avail(Page page, int nslots) 00314 { 00315 FSMPage fsmpage = (FSMPage) PageGetContents(page); 00316 uint8 *ptr; 00317 bool changed = false; 00318 00319 Assert(nslots >= 0 && nslots < LeafNodesPerPage); 00320 00321 /* Clear all truncated leaf nodes */ 00322 ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots]; 00323 for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++) 00324 { 00325 if (*ptr != 0) 00326 changed = true; 00327 *ptr = 0; 00328 } 00329 00330 /* Fix upper nodes. */ 00331 if (changed) 00332 fsm_rebuild_page(page); 00333 00334 return changed; 00335 } 00336 00337 /* 00338 * Reconstructs the upper levels of a page. Returns true if the page 00339 * was modified. 00340 */ 00341 bool 00342 fsm_rebuild_page(Page page) 00343 { 00344 FSMPage fsmpage = (FSMPage) PageGetContents(page); 00345 bool changed = false; 00346 int nodeno; 00347 00348 /* 00349 * Start from the lowest non-leaf level, at last node, working our way 00350 * backwards, through all non-leaf nodes at all levels, up to the root. 00351 */ 00352 for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--) 00353 { 00354 int lchild = leftchild(nodeno); 00355 int rchild = lchild + 1; 00356 uint8 newvalue = 0; 00357 00358 /* The first few nodes we examine might have zero or one child. */ 00359 if (lchild < NodesPerPage) 00360 newvalue = fsmpage->fp_nodes[lchild]; 00361 00362 if (rchild < NodesPerPage) 00363 newvalue = Max(newvalue, 00364 fsmpage->fp_nodes[rchild]); 00365 00366 if (fsmpage->fp_nodes[nodeno] != newvalue) 00367 { 00368 fsmpage->fp_nodes[nodeno] = newvalue; 00369 changed = true; 00370 } 00371 } 00372 00373 return changed; 00374 }