Header And Logo

PostgreSQL
| The world's most advanced open source database.

freespace.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * freespace.c
00004  *    POSTGRES free space map for quickly finding free space in relations
00005  *
00006  *
00007  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00008  * Portions Copyright (c) 1994, Regents of the University of California
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/storage/freespace/freespace.c
00012  *
00013  *
00014  * NOTES:
00015  *
00016  *  Free Space Map keeps track of the amount of free space on pages, and
00017  *  allows quickly searching for a page with enough free space. The FSM is
00018  *  stored in a dedicated relation fork of all heap relations, and those
00019  *  index access methods that need it (see also indexfsm.c). See README for
00020  *  more information.
00021  *
00022  *-------------------------------------------------------------------------
00023  */
00024 #include "postgres.h"
00025 
00026 #include "access/htup_details.h"
00027 #include "access/xlogutils.h"
00028 #include "miscadmin.h"
00029 #include "storage/freespace.h"
00030 #include "storage/fsm_internals.h"
00031 #include "storage/lmgr.h"
00032 #include "storage/smgr.h"
00033 
00034 
00035 /*
00036  * We use just one byte to store the amount of free space on a page, so we
00037  * divide the amount of free space a page can have into 256 different
00038  * categories. The highest category, 255, represents a page with at least
00039  * MaxFSMRequestSize bytes of free space, and the second highest category
00040  * represents the range from 254 * FSM_CAT_STEP, inclusive, to
00041  * MaxFSMRequestSize, exclusive.
00042  *
00043  * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
00044  * default 8k BLCKSZ, and that MaxFSMRequestSize is 24 bytes, the categories
00045  * look like this
00046  *
00047  *
00048  * Range     Category
00049  * 0    - 31   0
00050  * 32   - 63   1
00051  * ...    ...  ...
00052  * 8096 - 8127 253
00053  * 8128 - 8163 254
00054  * 8164 - 8192 255
00055  *
00056  * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
00057  * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
00058  * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
00059  * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
00060  * completely empty page, that would mean that we could never satisfy a
00061  * request of exactly MaxFSMRequestSize bytes.
00062  */
00063 #define FSM_CATEGORIES  256
00064 #define FSM_CAT_STEP    (BLCKSZ / FSM_CATEGORIES)
00065 #define MaxFSMRequestSize   MaxHeapTupleSize
00066 
00067 /*
00068  * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
00069  * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
00070  * 216 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
00071  * this means that 4096 bytes is the smallest BLCKSZ that we can get away
00072  * with a 3-level tree, and 512 is the smallest we support.
00073  */
00074 #define FSM_TREE_DEPTH  ((SlotsPerFSMPage >= 1626) ? 3 : 4)
00075 
00076 #define FSM_ROOT_LEVEL  (FSM_TREE_DEPTH - 1)
00077 #define FSM_BOTTOM_LEVEL 0
00078 
00079 /*
00080  * The internal FSM routines work on a logical addressing scheme. Each
00081  * level of the tree can be thought of as a separately addressable file.
00082  */
00083 typedef struct
00084 {
00085     int         level;          /* level */
00086     int         logpageno;      /* page number within the level */
00087 } FSMAddress;
00088 
00089 /* Address of the root page. */
00090 static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
00091 
00092 /* functions to navigate the tree */
00093 static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
00094 static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
00095 static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
00096 static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
00097 static BlockNumber fsm_logical_to_physical(FSMAddress addr);
00098 
00099 static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
00100 static void fsm_extend(Relation rel, BlockNumber fsm_nblocks);
00101 
00102 /* functions to convert amount of free space to a FSM category */
00103 static uint8 fsm_space_avail_to_cat(Size avail);
00104 static uint8 fsm_space_needed_to_cat(Size needed);
00105 static Size fsm_space_cat_to_avail(uint8 cat);
00106 
00107 /* workhorse functions for various operations */
00108 static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
00109                    uint8 newValue, uint8 minValue);
00110 static BlockNumber fsm_search(Relation rel, uint8 min_cat);
00111 static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof);
00112 
00113 
00114 /******** Public API ********/
00115 
00116 /*
00117  * GetPageWithFreeSpace - try to find a page in the given relation with
00118  *      at least the specified amount of free space.
00119  *
00120  * If successful, return the block number; if not, return InvalidBlockNumber.
00121  *
00122  * The caller must be prepared for the possibility that the returned page
00123  * will turn out to have too little space available by the time the caller
00124  * gets a lock on it.  In that case, the caller should report the actual
00125  * amount of free space available on that page and then try again (see
00126  * RecordAndGetPageWithFreeSpace).  If InvalidBlockNumber is returned,
00127  * extend the relation.
00128  */
00129 BlockNumber
00130 GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
00131 {
00132     uint8       min_cat = fsm_space_needed_to_cat(spaceNeeded);
00133 
00134     return fsm_search(rel, min_cat);
00135 }
00136 
00137 /*
00138  * RecordAndGetPageWithFreeSpace - update info about a page and try again.
00139  *
00140  * We provide this combo form to save some locking overhead, compared to
00141  * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
00142  * also some effort to return a page close to the old page; if there's a
00143  * page with enough free space on the same FSM page where the old one page
00144  * is located, it is preferred.
00145  */
00146 BlockNumber
00147 RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
00148                               Size oldSpaceAvail, Size spaceNeeded)
00149 {
00150     int         old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
00151     int         search_cat = fsm_space_needed_to_cat(spaceNeeded);
00152     FSMAddress  addr;
00153     uint16      slot;
00154     int         search_slot;
00155 
00156     /* Get the location of the FSM byte representing the heap block */
00157     addr = fsm_get_location(oldPage, &slot);
00158 
00159     search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
00160 
00161     /*
00162      * If fsm_set_and_search found a suitable new block, return that.
00163      * Otherwise, search as usual.
00164      */
00165     if (search_slot != -1)
00166         return fsm_get_heap_blk(addr, search_slot);
00167     else
00168         return fsm_search(rel, search_cat);
00169 }
00170 
00171 /*
00172  * RecordPageWithFreeSpace - update info about a page.
00173  *
00174  * Note that if the new spaceAvail value is higher than the old value stored
00175  * in the FSM, the space might not become visible to searchers until the next
00176  * FreeSpaceMapVacuum call, which updates the upper level pages.
00177  */
00178 void
00179 RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
00180 {
00181     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
00182     FSMAddress  addr;
00183     uint16      slot;
00184 
00185     /* Get the location of the FSM byte representing the heap block */
00186     addr = fsm_get_location(heapBlk, &slot);
00187 
00188     fsm_set_and_search(rel, addr, slot, new_cat, 0);
00189 }
00190 
00191 /*
00192  * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
00193  *      WAL replay
00194  */
00195 void
00196 XLogRecordPageWithFreeSpace(RelFileNode rnode, BlockNumber heapBlk,
00197                             Size spaceAvail)
00198 {
00199     int         new_cat = fsm_space_avail_to_cat(spaceAvail);
00200     FSMAddress  addr;
00201     uint16      slot;
00202     BlockNumber blkno;
00203     Buffer      buf;
00204     Page        page;
00205 
00206     /* Get the location of the FSM byte representing the heap block */
00207     addr = fsm_get_location(heapBlk, &slot);
00208     blkno = fsm_logical_to_physical(addr);
00209 
00210     /* If the page doesn't exist already, extend */
00211     buf = XLogReadBufferExtended(rnode, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR);
00212     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
00213 
00214     page = BufferGetPage(buf);
00215     if (PageIsNew(page))
00216         PageInit(page, BLCKSZ, 0);
00217 
00218     if (fsm_set_avail(page, slot, new_cat))
00219         MarkBufferDirtyHint(buf);
00220     UnlockReleaseBuffer(buf);
00221 }
00222 
00223 /*
00224  * GetRecordedFreePage - return the amount of free space on a particular page,
00225  *      according to the FSM.
00226  */
00227 Size
00228 GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
00229 {
00230     FSMAddress  addr;
00231     uint16      slot;
00232     Buffer      buf;
00233     uint8       cat;
00234 
00235     /* Get the location of the FSM byte representing the heap block */
00236     addr = fsm_get_location(heapBlk, &slot);
00237 
00238     buf = fsm_readbuf(rel, addr, false);
00239     if (!BufferIsValid(buf))
00240         return 0;
00241     cat = fsm_get_avail(BufferGetPage(buf), slot);
00242     ReleaseBuffer(buf);
00243 
00244     return fsm_space_cat_to_avail(cat);
00245 }
00246 
00247 /*
00248  * FreeSpaceMapTruncateRel - adjust for truncation of a relation.
00249  *
00250  * The caller must hold AccessExclusiveLock on the relation, to ensure that
00251  * other backends receive the smgr invalidation event that this function sends
00252  * before they access the FSM again.
00253  *
00254  * nblocks is the new size of the heap.
00255  */
00256 void
00257 FreeSpaceMapTruncateRel(Relation rel, BlockNumber nblocks)
00258 {
00259     BlockNumber new_nfsmblocks;
00260     FSMAddress  first_removed_address;
00261     uint16      first_removed_slot;
00262     Buffer      buf;
00263 
00264     RelationOpenSmgr(rel);
00265 
00266     /*
00267      * If no FSM has been created yet for this relation, there's nothing to
00268      * truncate.
00269      */
00270     if (!smgrexists(rel->rd_smgr, FSM_FORKNUM))
00271         return;
00272 
00273     /* Get the location in the FSM of the first removed heap block */
00274     first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
00275 
00276     /*
00277      * Zero out the tail of the last remaining FSM page. If the slot
00278      * representing the first removed heap block is at a page boundary, as the
00279      * first slot on the FSM page that first_removed_address points to, we can
00280      * just truncate that page altogether.
00281      */
00282     if (first_removed_slot > 0)
00283     {
00284         buf = fsm_readbuf(rel, first_removed_address, false);
00285         if (!BufferIsValid(buf))
00286             return;             /* nothing to do; the FSM was already smaller */
00287         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
00288         fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
00289         MarkBufferDirtyHint(buf);
00290         UnlockReleaseBuffer(buf);
00291 
00292         new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
00293     }
00294     else
00295     {
00296         new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
00297         if (smgrnblocks(rel->rd_smgr, FSM_FORKNUM) <= new_nfsmblocks)
00298             return;             /* nothing to do; the FSM was already smaller */
00299     }
00300 
00301     /* Truncate the unused FSM pages, and send smgr inval message */
00302     smgrtruncate(rel->rd_smgr, FSM_FORKNUM, new_nfsmblocks);
00303 
00304     /*
00305      * We might as well update the local smgr_fsm_nblocks setting.
00306      * smgrtruncate sent an smgr cache inval message, which will cause other
00307      * backends to invalidate their copy of smgr_fsm_nblocks, and this one too
00308      * at the next command boundary.  But this ensures it isn't outright wrong
00309      * until then.
00310      */
00311     if (rel->rd_smgr)
00312         rel->rd_smgr->smgr_fsm_nblocks = new_nfsmblocks;
00313 }
00314 
00315 /*
00316  * FreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM
00317  */
00318 void
00319 FreeSpaceMapVacuum(Relation rel)
00320 {
00321     bool        dummy;
00322 
00323     /*
00324      * Traverse the tree in depth-first order. The tree is stored physically
00325      * in depth-first order, so this should be pretty I/O efficient.
00326      */
00327     fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, &dummy);
00328 }
00329 
00330 /******** Internal routines ********/
00331 
00332 /*
00333  * Return category corresponding x bytes of free space
00334  */
00335 static uint8
00336 fsm_space_avail_to_cat(Size avail)
00337 {
00338     int         cat;
00339 
00340     Assert(avail < BLCKSZ);
00341 
00342     if (avail >= MaxFSMRequestSize)
00343         return 255;
00344 
00345     cat = avail / FSM_CAT_STEP;
00346 
00347     /*
00348      * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
00349      * more.
00350      */
00351     if (cat > 254)
00352         cat = 254;
00353 
00354     return (uint8) cat;
00355 }
00356 
00357 /*
00358  * Return the lower bound of the range of free space represented by given
00359  * category.
00360  */
00361 static Size
00362 fsm_space_cat_to_avail(uint8 cat)
00363 {
00364     /* The highest category represents exactly MaxFSMRequestSize bytes. */
00365     if (cat == 255)
00366         return MaxFSMRequestSize;
00367     else
00368         return cat * FSM_CAT_STEP;
00369 }
00370 
00371 /*
00372  * Which category does a page need to have, to accommodate x bytes of data?
00373  * While fsm_size_to_avail_cat() rounds down, this needs to round up.
00374  */
00375 static uint8
00376 fsm_space_needed_to_cat(Size needed)
00377 {
00378     int         cat;
00379 
00380     /* Can't ask for more space than the highest category represents */
00381     if (needed > MaxFSMRequestSize)
00382         elog(ERROR, "invalid FSM request size %lu",
00383              (unsigned long) needed);
00384 
00385     if (needed == 0)
00386         return 1;
00387 
00388     cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
00389 
00390     if (cat > 255)
00391         cat = 255;
00392 
00393     return (uint8) cat;
00394 }
00395 
00396 /*
00397  * Returns the physical block number an FSM page
00398  */
00399 static BlockNumber
00400 fsm_logical_to_physical(FSMAddress addr)
00401 {
00402     BlockNumber pages;
00403     int         leafno;
00404     int         l;
00405 
00406     /*
00407      * Calculate the logical page number of the first leaf page below the
00408      * given page.
00409      */
00410     leafno = addr.logpageno;
00411     for (l = 0; l < addr.level; l++)
00412         leafno *= SlotsPerFSMPage;
00413 
00414     /* Count upper level nodes required to address the leaf page */
00415     pages = 0;
00416     for (l = 0; l < FSM_TREE_DEPTH; l++)
00417     {
00418         pages += leafno + 1;
00419         leafno /= SlotsPerFSMPage;
00420     }
00421 
00422     /*
00423      * If the page we were asked for wasn't at the bottom level, subtract the
00424      * additional lower level pages we counted above.
00425      */
00426     pages -= addr.level;
00427 
00428     /* Turn the page count into 0-based block number */
00429     return pages - 1;
00430 }
00431 
00432 /*
00433  * Return the FSM location corresponding to given heap block.
00434  */
00435 static FSMAddress
00436 fsm_get_location(BlockNumber heapblk, uint16 *slot)
00437 {
00438     FSMAddress  addr;
00439 
00440     addr.level = FSM_BOTTOM_LEVEL;
00441     addr.logpageno = heapblk / SlotsPerFSMPage;
00442     *slot = heapblk % SlotsPerFSMPage;
00443 
00444     return addr;
00445 }
00446 
00447 /*
00448  * Return the heap block number corresponding to given location in the FSM.
00449  */
00450 static BlockNumber
00451 fsm_get_heap_blk(FSMAddress addr, uint16 slot)
00452 {
00453     Assert(addr.level == FSM_BOTTOM_LEVEL);
00454     return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
00455 }
00456 
00457 /*
00458  * Given a logical address of a child page, get the logical page number of
00459  * the parent, and the slot within the parent corresponding to the child.
00460  */
00461 static FSMAddress
00462 fsm_get_parent(FSMAddress child, uint16 *slot)
00463 {
00464     FSMAddress  parent;
00465 
00466     Assert(child.level < FSM_ROOT_LEVEL);
00467 
00468     parent.level = child.level + 1;
00469     parent.logpageno = child.logpageno / SlotsPerFSMPage;
00470     *slot = child.logpageno % SlotsPerFSMPage;
00471 
00472     return parent;
00473 }
00474 
00475 /*
00476  * Given a logical address of a parent page, and a slot number get the
00477  * logical address of the corresponding child page.
00478  */
00479 static FSMAddress
00480 fsm_get_child(FSMAddress parent, uint16 slot)
00481 {
00482     FSMAddress  child;
00483 
00484     Assert(parent.level > FSM_BOTTOM_LEVEL);
00485 
00486     child.level = parent.level - 1;
00487     child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
00488 
00489     return child;
00490 }
00491 
00492 /*
00493  * Read a FSM page.
00494  *
00495  * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
00496  * true, the FSM file is extended.
00497  */
00498 static Buffer
00499 fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
00500 {
00501     BlockNumber blkno = fsm_logical_to_physical(addr);
00502     Buffer      buf;
00503 
00504     RelationOpenSmgr(rel);
00505 
00506     /*
00507      * If we haven't cached the size of the FSM yet, check it first.  Also
00508      * recheck if the requested block seems to be past end, since our cached
00509      * value might be stale.  (We send smgr inval messages on truncation, but
00510      * not on extension.)
00511      */
00512     if (rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber ||
00513         blkno >= rel->rd_smgr->smgr_fsm_nblocks)
00514     {
00515         if (smgrexists(rel->rd_smgr, FSM_FORKNUM))
00516             rel->rd_smgr->smgr_fsm_nblocks = smgrnblocks(rel->rd_smgr,
00517                                                          FSM_FORKNUM);
00518         else
00519             rel->rd_smgr->smgr_fsm_nblocks = 0;
00520     }
00521 
00522     /* Handle requests beyond EOF */
00523     if (blkno >= rel->rd_smgr->smgr_fsm_nblocks)
00524     {
00525         if (extend)
00526             fsm_extend(rel, blkno + 1);
00527         else
00528             return InvalidBuffer;
00529     }
00530 
00531     /*
00532      * Use ZERO_ON_ERROR mode, and initialize the page if necessary. The FSM
00533      * information is not accurate anyway, so it's better to clear corrupt
00534      * pages than error out. Since the FSM changes are not WAL-logged, the
00535      * so-called torn page problem on crash can lead to pages with corrupt
00536      * headers, for example.
00537      */
00538     buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
00539     if (PageIsNew(BufferGetPage(buf)))
00540         PageInit(BufferGetPage(buf), BLCKSZ, 0);
00541     return buf;
00542 }
00543 
00544 /*
00545  * Ensure that the FSM fork is at least fsm_nblocks long, extending
00546  * it if necessary with empty pages. And by empty, I mean pages filled
00547  * with zeros, meaning there's no free space.
00548  */
00549 static void
00550 fsm_extend(Relation rel, BlockNumber fsm_nblocks)
00551 {
00552     BlockNumber fsm_nblocks_now;
00553     Page        pg;
00554 
00555     pg = (Page) palloc(BLCKSZ);
00556     PageInit(pg, BLCKSZ, 0);
00557 
00558     /*
00559      * We use the relation extension lock to lock out other backends trying to
00560      * extend the FSM at the same time. It also locks out extension of the
00561      * main fork, unnecessarily, but extending the FSM happens seldom enough
00562      * that it doesn't seem worthwhile to have a separate lock tag type for
00563      * it.
00564      *
00565      * Note that another backend might have extended or created the relation
00566      * by the time we get the lock.
00567      */
00568     LockRelationForExtension(rel, ExclusiveLock);
00569 
00570     /* Might have to re-open if a cache flush happened */
00571     RelationOpenSmgr(rel);
00572 
00573     /*
00574      * Create the FSM file first if it doesn't exist.  If smgr_fsm_nblocks is
00575      * positive then it must exist, no need for an smgrexists call.
00576      */
00577     if ((rel->rd_smgr->smgr_fsm_nblocks == 0 ||
00578          rel->rd_smgr->smgr_fsm_nblocks == InvalidBlockNumber) &&
00579         !smgrexists(rel->rd_smgr, FSM_FORKNUM))
00580         smgrcreate(rel->rd_smgr, FSM_FORKNUM, false);
00581 
00582     fsm_nblocks_now = smgrnblocks(rel->rd_smgr, FSM_FORKNUM);
00583 
00584     while (fsm_nblocks_now < fsm_nblocks)
00585     {
00586         PageSetChecksumInplace(pg, fsm_nblocks_now);
00587 
00588         smgrextend(rel->rd_smgr, FSM_FORKNUM, fsm_nblocks_now,
00589                    (char *) pg, false);
00590         fsm_nblocks_now++;
00591     }
00592 
00593     /* Update local cache with the up-to-date size */
00594     rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now;
00595 
00596     UnlockRelationForExtension(rel, ExclusiveLock);
00597 
00598     pfree(pg);
00599 }
00600 
00601 /*
00602  * Set value in given FSM page and slot.
00603  *
00604  * If minValue > 0, the updated page is also searched for a page with at
00605  * least minValue of free space. If one is found, its slot number is
00606  * returned, -1 otherwise.
00607  */
00608 static int
00609 fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
00610                    uint8 newValue, uint8 minValue)
00611 {
00612     Buffer      buf;
00613     Page        page;
00614     int         newslot = -1;
00615 
00616     buf = fsm_readbuf(rel, addr, true);
00617     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
00618 
00619     page = BufferGetPage(buf);
00620 
00621     if (fsm_set_avail(page, slot, newValue))
00622         MarkBufferDirtyHint(buf);
00623 
00624     if (minValue != 0)
00625     {
00626         /* Search while we still hold the lock */
00627         newslot = fsm_search_avail(buf, minValue,
00628                                    addr.level == FSM_BOTTOM_LEVEL,
00629                                    true);
00630     }
00631 
00632     UnlockReleaseBuffer(buf);
00633 
00634     return newslot;
00635 }
00636 
00637 /*
00638  * Search the tree for a heap page with at least min_cat of free space
00639  */
00640 static BlockNumber
00641 fsm_search(Relation rel, uint8 min_cat)
00642 {
00643     int         restarts = 0;
00644     FSMAddress  addr = FSM_ROOT_ADDRESS;
00645 
00646     for (;;)
00647     {
00648         int         slot;
00649         Buffer      buf;
00650         uint8       max_avail = 0;
00651 
00652         /* Read the FSM page. */
00653         buf = fsm_readbuf(rel, addr, false);
00654 
00655         /* Search within the page */
00656         if (BufferIsValid(buf))
00657         {
00658             LockBuffer(buf, BUFFER_LOCK_SHARE);
00659             slot = fsm_search_avail(buf, min_cat,
00660                                     (addr.level == FSM_BOTTOM_LEVEL),
00661                                     false);
00662             if (slot == -1)
00663                 max_avail = fsm_get_max_avail(BufferGetPage(buf));
00664             UnlockReleaseBuffer(buf);
00665         }
00666         else
00667             slot = -1;
00668 
00669         if (slot != -1)
00670         {
00671             /*
00672              * Descend the tree, or return the found block if we're at the
00673              * bottom.
00674              */
00675             if (addr.level == FSM_BOTTOM_LEVEL)
00676                 return fsm_get_heap_blk(addr, slot);
00677 
00678             addr = fsm_get_child(addr, slot);
00679         }
00680         else if (addr.level == FSM_ROOT_LEVEL)
00681         {
00682             /*
00683              * At the root, failure means there's no page with enough free
00684              * space in the FSM. Give up.
00685              */
00686             return InvalidBlockNumber;
00687         }
00688         else
00689         {
00690             uint16      parentslot;
00691             FSMAddress  parent;
00692 
00693             /*
00694              * At lower level, failure can happen if the value in the upper-
00695              * level node didn't reflect the value on the lower page. Update
00696              * the upper node, to avoid falling into the same trap again, and
00697              * start over.
00698              *
00699              * There's a race condition here, if another backend updates this
00700              * page right after we release it, and gets the lock on the parent
00701              * page before us. We'll then update the parent page with the now
00702              * stale information we had. It's OK, because it should happen
00703              * rarely, and will be fixed by the next vacuum.
00704              */
00705             parent = fsm_get_parent(addr, &parentslot);
00706             fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
00707 
00708             /*
00709              * If the upper pages are badly out of date, we might need to loop
00710              * quite a few times, updating them as we go. Any inconsistencies
00711              * should eventually be corrected and the loop should end. Looping
00712              * indefinitely is nevertheless scary, so provide an emergency
00713              * valve.
00714              */
00715             if (restarts++ > 10000)
00716                 return InvalidBlockNumber;
00717 
00718             /* Start search all over from the root */
00719             addr = FSM_ROOT_ADDRESS;
00720         }
00721     }
00722 }
00723 
00724 
00725 /*
00726  * Recursive guts of FreeSpaceMapVacuum
00727  */
00728 static uint8
00729 fsm_vacuum_page(Relation rel, FSMAddress addr, bool *eof_p)
00730 {
00731     Buffer      buf;
00732     Page        page;
00733     uint8       max_avail;
00734 
00735     /* Read the page if it exists, or return EOF */
00736     buf = fsm_readbuf(rel, addr, false);
00737     if (!BufferIsValid(buf))
00738     {
00739         *eof_p = true;
00740         return 0;
00741     }
00742     else
00743         *eof_p = false;
00744 
00745     page = BufferGetPage(buf);
00746 
00747     /*
00748      * Recurse into children, and fix the information stored about them at
00749      * this level.
00750      */
00751     if (addr.level > FSM_BOTTOM_LEVEL)
00752     {
00753         int         slot;
00754         bool        eof = false;
00755 
00756         for (slot = 0; slot < SlotsPerFSMPage; slot++)
00757         {
00758             int         child_avail;
00759 
00760             CHECK_FOR_INTERRUPTS();
00761 
00762             /* After we hit end-of-file, just clear the rest of the slots */
00763             if (!eof)
00764                 child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), &eof);
00765             else
00766                 child_avail = 0;
00767 
00768             /* Update information about the child */
00769             if (fsm_get_avail(page, slot) != child_avail)
00770             {
00771                 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
00772                 fsm_set_avail(BufferGetPage(buf), slot, child_avail);
00773                 MarkBufferDirtyHint(buf);
00774                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
00775             }
00776         }
00777     }
00778 
00779     max_avail = fsm_get_max_avail(BufferGetPage(buf));
00780 
00781     /*
00782      * Reset the next slot pointer. This encourages the use of low-numbered
00783      * pages, increasing the chances that a later vacuum can truncate the
00784      * relation.
00785      */
00786     ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
00787 
00788     ReleaseBuffer(buf);
00789 
00790     return max_avail;
00791 }