Header And Logo

PostgreSQL
| The world's most advanced open source database.

fsmpage.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * fsmpage.c
00004  *    routines to search and manipulate one FSM page.
00005  *
00006  *
00007  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00008  * Portions Copyright (c) 1994, Regents of the University of California
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/storage/freespace/fsmpage.c
00012  *
00013  * NOTES:
00014  *
00015  *  The public functions in this file form an API that hides the internal
00016  *  structure of a FSM page. This allows freespace.c to treat each FSM page
00017  *  as a black box with SlotsPerPage "slots". fsm_set_avail() and
00018  *  fsm_get_avail() let you get/set the value of a slot, and
00019  *  fsm_search_avail() lets you search for a slot with value >= X.
00020  *
00021  *-------------------------------------------------------------------------
00022  */
00023 #include "postgres.h"
00024 
00025 #include "storage/bufmgr.h"
00026 #include "storage/fsm_internals.h"
00027 
00028 /* Macros to navigate the tree within a page. Root has index zero. */
00029 #define leftchild(x)    (2 * (x) + 1)
00030 #define rightchild(x)   (2 * (x) + 2)
00031 #define parentof(x)     (((x) - 1) / 2)
00032 
00033 /*
00034  * Find right neighbor of x, wrapping around within the level
00035  */
00036 static int
00037 rightneighbor(int x)
00038 {
00039     /*
00040      * Move right. This might wrap around, stepping to the leftmost node at
00041      * the next level.
00042      */
00043     x++;
00044 
00045     /*
00046      * Check if we stepped to the leftmost node at next level, and correct if
00047      * so. The leftmost nodes at each level are numbered x = 2^level - 1, so
00048      * check if (x + 1) is a power of two, using a standard
00049      * twos-complement-arithmetic trick.
00050      */
00051     if (((x + 1) & x) == 0)
00052         x = parentof(x);
00053 
00054     return x;
00055 }
00056 
00057 /*
00058  * Sets the value of a slot on page. Returns true if the page was modified.
00059  *
00060  * The caller must hold an exclusive lock on the page.
00061  */
00062 bool
00063 fsm_set_avail(Page page, int slot, uint8 value)
00064 {
00065     int         nodeno = NonLeafNodesPerPage + slot;
00066     FSMPage     fsmpage = (FSMPage) PageGetContents(page);
00067     uint8       oldvalue;
00068 
00069     Assert(slot < LeafNodesPerPage);
00070 
00071     oldvalue = fsmpage->fp_nodes[nodeno];
00072 
00073     /* If the value hasn't changed, we don't need to do anything */
00074     if (oldvalue == value && value <= fsmpage->fp_nodes[0])
00075         return false;
00076 
00077     fsmpage->fp_nodes[nodeno] = value;
00078 
00079     /*
00080      * Propagate up, until we hit the root or a node that doesn't need to be
00081      * updated.
00082      */
00083     do
00084     {
00085         uint8       newvalue = 0;
00086         int         lchild;
00087         int         rchild;
00088 
00089         nodeno = parentof(nodeno);
00090         lchild = leftchild(nodeno);
00091         rchild = lchild + 1;
00092 
00093         newvalue = fsmpage->fp_nodes[lchild];
00094         if (rchild < NodesPerPage)
00095             newvalue = Max(newvalue,
00096                            fsmpage->fp_nodes[rchild]);
00097 
00098         oldvalue = fsmpage->fp_nodes[nodeno];
00099         if (oldvalue == newvalue)
00100             break;
00101 
00102         fsmpage->fp_nodes[nodeno] = newvalue;
00103     } while (nodeno > 0);
00104 
00105     /*
00106      * sanity check: if the new value is (still) higher than the value at the
00107      * top, the tree is corrupt.  If so, rebuild.
00108      */
00109     if (value > fsmpage->fp_nodes[0])
00110         fsm_rebuild_page(page);
00111 
00112     return true;
00113 }
00114 
00115 /*
00116  * Returns the value of given slot on page.
00117  *
00118  * Since this is just a read-only access of a single byte, the page doesn't
00119  * need to be locked.
00120  */
00121 uint8
00122 fsm_get_avail(Page page, int slot)
00123 {
00124     FSMPage     fsmpage = (FSMPage) PageGetContents(page);
00125 
00126     Assert(slot < LeafNodesPerPage);
00127 
00128     return fsmpage->fp_nodes[NonLeafNodesPerPage + slot];
00129 }
00130 
00131 /*
00132  * Returns the value at the root of a page.
00133  *
00134  * Since this is just a read-only access of a single byte, the page doesn't
00135  * need to be locked.
00136  */
00137 uint8
00138 fsm_get_max_avail(Page page)
00139 {
00140     FSMPage     fsmpage = (FSMPage) PageGetContents(page);
00141 
00142     return fsmpage->fp_nodes[0];
00143 }
00144 
00145 /*
00146  * Searches for a slot with category at least minvalue.
00147  * Returns slot number, or -1 if none found.
00148  *
00149  * The caller must hold at least a shared lock on the page, and this
00150  * function can unlock and lock the page again in exclusive mode if it
00151  * needs to be updated. exclusive_lock_held should be set to true if the
00152  * caller is already holding an exclusive lock, to avoid extra work.
00153  *
00154  * If advancenext is false, fp_next_slot is set to point to the returned
00155  * slot, and if it's true, to the slot after the returned slot.
00156  */
00157 int
00158 fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext,
00159                  bool exclusive_lock_held)
00160 {
00161     Page        page = BufferGetPage(buf);
00162     FSMPage     fsmpage = (FSMPage) PageGetContents(page);
00163     int         nodeno;
00164     int         target;
00165     uint16      slot;
00166 
00167 restart:
00168 
00169     /*
00170      * Check the root first, and exit quickly if there's no leaf with enough
00171      * free space
00172      */
00173     if (fsmpage->fp_nodes[0] < minvalue)
00174         return -1;
00175 
00176     /*
00177      * Start search using fp_next_slot.  It's just a hint, so check that it's
00178      * sane.  (This also handles wrapping around when the prior call returned
00179      * the last slot on the page.)
00180      */
00181     target = fsmpage->fp_next_slot;
00182     if (target < 0 || target >= LeafNodesPerPage)
00183         target = 0;
00184     target += NonLeafNodesPerPage;
00185 
00186     /*----------
00187      * Start the search from the target slot.  At every step, move one
00188      * node to the right, then climb up to the parent.  Stop when we reach
00189      * a node with enough free space (as we must, since the root has enough
00190      * space).
00191      *
00192      * The idea is to gradually expand our "search triangle", that is, all
00193      * nodes covered by the current node, and to be sure we search to the
00194      * right from the start point.  At the first step, only the target slot
00195      * is examined.  When we move up from a left child to its parent, we are
00196      * adding the right-hand subtree of that parent to the search triangle.
00197      * When we move right then up from a right child, we are dropping the
00198      * current search triangle (which we know doesn't contain any suitable
00199      * page) and instead looking at the next-larger-size triangle to its
00200      * right.  So we never look left from our original start point, and at
00201      * each step the size of the search triangle doubles, ensuring it takes
00202      * only log2(N) work to search N pages.
00203      *
00204      * The "move right" operation will wrap around if it hits the right edge
00205      * of the tree, so the behavior is still good if we start near the right.
00206      * Note also that the move-and-climb behavior ensures that we can't end
00207      * up on one of the missing nodes at the right of the leaf level.
00208      *
00209      * For example, consider this tree:
00210      *
00211      *         7
00212      *     7       6
00213      *   5   7   6   5
00214      *  4 5 5 7 2 6 5 2
00215      *              T
00216      *
00217      * Assume that the target node is the node indicated by the letter T,
00218      * and we're searching for a node with value of 6 or higher. The search
00219      * begins at T. At the first iteration, we move to the right, then to the
00220      * parent, arriving at the rightmost 5. At the second iteration, we move
00221      * to the right, wrapping around, then climb up, arriving at the 7 on the
00222      * third level.  7 satisfies our search, so we descend down to the bottom,
00223      * following the path of sevens.  This is in fact the first suitable page
00224      * to the right of (allowing for wraparound) our start point.
00225      *----------
00226      */
00227     nodeno = target;
00228     while (nodeno > 0)
00229     {
00230         if (fsmpage->fp_nodes[nodeno] >= minvalue)
00231             break;
00232 
00233         /*
00234          * Move to the right, wrapping around on same level if necessary, then
00235          * climb up.
00236          */
00237         nodeno = parentof(rightneighbor(nodeno));
00238     }
00239 
00240     /*
00241      * We're now at a node with enough free space, somewhere in the middle of
00242      * the tree. Descend to the bottom, following a path with enough free
00243      * space, preferring to move left if there's a choice.
00244      */
00245     while (nodeno < NonLeafNodesPerPage)
00246     {
00247         int         childnodeno = leftchild(nodeno);
00248 
00249         if (childnodeno < NodesPerPage &&
00250             fsmpage->fp_nodes[childnodeno] >= minvalue)
00251         {
00252             nodeno = childnodeno;
00253             continue;
00254         }
00255         childnodeno++;          /* point to right child */
00256         if (childnodeno < NodesPerPage &&
00257             fsmpage->fp_nodes[childnodeno] >= minvalue)
00258         {
00259             nodeno = childnodeno;
00260         }
00261         else
00262         {
00263             /*
00264              * Oops. The parent node promised that either left or right child
00265              * has enough space, but neither actually did. This can happen in
00266              * case of a "torn page", IOW if we crashed earlier while writing
00267              * the page to disk, and only part of the page made it to disk.
00268              *
00269              * Fix the corruption and restart.
00270              */
00271             RelFileNode rnode;
00272             ForkNumber  forknum;
00273             BlockNumber blknum;
00274 
00275             BufferGetTag(buf, &rnode, &forknum, &blknum);
00276             elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
00277                  blknum, rnode.spcNode, rnode.dbNode, rnode.relNode);
00278 
00279             /* make sure we hold an exclusive lock */
00280             if (!exclusive_lock_held)
00281             {
00282                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
00283                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
00284                 exclusive_lock_held = true;
00285             }
00286             fsm_rebuild_page(page);
00287             MarkBufferDirtyHint(buf);
00288             goto restart;
00289         }
00290     }
00291 
00292     /* We're now at the bottom level, at a node with enough space. */
00293     slot = nodeno - NonLeafNodesPerPage;
00294 
00295     /*
00296      * Update the next-target pointer. Note that we do this even if we're only
00297      * holding a shared lock, on the grounds that it's better to use a shared
00298      * lock and get a garbled next pointer every now and then, than take the
00299      * concurrency hit of an exclusive lock.
00300      *
00301      * Wrap-around is handled at the beginning of this function.
00302      */
00303     fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0);
00304 
00305     return slot;
00306 }
00307 
00308 /*
00309  * Sets the available space to zero for all slots numbered >= nslots.
00310  * Returns true if the page was modified.
00311  */
00312 bool
00313 fsm_truncate_avail(Page page, int nslots)
00314 {
00315     FSMPage     fsmpage = (FSMPage) PageGetContents(page);
00316     uint8      *ptr;
00317     bool        changed = false;
00318 
00319     Assert(nslots >= 0 && nslots < LeafNodesPerPage);
00320 
00321     /* Clear all truncated leaf nodes */
00322     ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots];
00323     for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++)
00324     {
00325         if (*ptr != 0)
00326             changed = true;
00327         *ptr = 0;
00328     }
00329 
00330     /* Fix upper nodes. */
00331     if (changed)
00332         fsm_rebuild_page(page);
00333 
00334     return changed;
00335 }
00336 
00337 /*
00338  * Reconstructs the upper levels of a page. Returns true if the page
00339  * was modified.
00340  */
00341 bool
00342 fsm_rebuild_page(Page page)
00343 {
00344     FSMPage     fsmpage = (FSMPage) PageGetContents(page);
00345     bool        changed = false;
00346     int         nodeno;
00347 
00348     /*
00349      * Start from the lowest non-leaf level, at last node, working our way
00350      * backwards, through all non-leaf nodes at all levels, up to the root.
00351      */
00352     for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--)
00353     {
00354         int         lchild = leftchild(nodeno);
00355         int         rchild = lchild + 1;
00356         uint8       newvalue = 0;
00357 
00358         /* The first few nodes we examine might have zero or one child. */
00359         if (lchild < NodesPerPage)
00360             newvalue = fsmpage->fp_nodes[lchild];
00361 
00362         if (rchild < NodesPerPage)
00363             newvalue = Max(newvalue,
00364                            fsmpage->fp_nodes[rchild]);
00365 
00366         if (fsmpage->fp_nodes[nodeno] != newvalue)
00367         {
00368             fsmpage->fp_nodes[nodeno] = newvalue;
00369             changed = true;
00370         }
00371     }
00372 
00373     return changed;
00374 }