/*-------------------------------------------------------------------------
 *
 * nbtpage.c
 *    BTree-specific page management code for the Postgres btree access
 *    method.
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtpage.c
 *
 *  NOTES
 *     Postgres btree pages look like ordinary relation pages.  The opaque
 *     data at high addresses includes pointers to left and right siblings
 *     and flag data describing page state.  The first page in a btree, page
 *     zero, is special -- it stores meta-information describing the tree.
 *     Pages one and higher store the actual tree data.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/transam.h"
#include "miscadmin.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "utils/inval.h"
#include "utils/snapmgr.h"


/*
 *  _bt_initmetapage() -- Fill a page buffer with a correct metapage image
 */
void
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
{
    BTMetaPageData *metad;
    BTPageOpaque metaopaque;

    _bt_pageinit(page, BLCKSZ);

    metad = BTPageGetMeta(page);
    metad->btm_magic = BTREE_MAGIC;
    metad->btm_version = BTREE_VERSION;
    metad->btm_root = rootbknum;
    metad->btm_level = level;
    metad->btm_fastroot = rootbknum;
    metad->btm_fastlevel = level;

    metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
    metaopaque->btpo_flags = BTP_META;

    /*
     * Set pd_lower just past the end of the metadata.  This is not essential
     * but it makes the page look compressible to xlog.c.
     */
    ((PageHeader) page)->pd_lower =
        ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
}
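
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * a caller building an empty index can allocate a scratch page and fill it
 * with a metapage image before writing it out as block BTREE_METAPAGE.
 * Passing P_NONE as the root block means "no root exists yet".  The helper
 * name is hypothetical; compiled out unless BTREE_EXAMPLE_SKETCHES is set.
 */
#ifdef BTREE_EXAMPLE_SKETCHES
static void
example_build_empty_metapage(void)
{
    Page        metapage = (Page) palloc(BLCKSZ);

    /* no root yet: both root pointers are P_NONE, tree height is zero */
    _bt_initmetapage(metapage, P_NONE, 0);

    /* ... caller would now write the image to block BTREE_METAPAGE ... */
    pfree(metapage);
}
#endif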

/*
 *  _bt_getroot() -- Get the root page of the btree.
 *
 *      Since the root page can move around the btree file, we have to read
 *      its location from the metadata page, and then read the root page
 *      itself.  If no root page exists yet, we have to create one.  The
 *      standard class of race conditions exists here; I think I covered
 *      them all in the Hopi Indian rain dance of lock requests below.
 *
 *      The access type parameter (BT_READ or BT_WRITE) controls whether
 *      a new root page will be created or not.  If access = BT_READ,
 *      and no root page exists, we just return InvalidBuffer.  For
 *      BT_WRITE, we try to create the root page if it doesn't exist.
 *      NOTE that the returned root page will have only a read lock set
 *      on it even if access = BT_WRITE!
 *
 *      The returned page is not necessarily the true root --- it could be
 *      a "fast root" (a page that is alone in its level due to deletions).
 *      Also, if the root page is split while we are "in flight" to it,
 *      what we will return is the old root, which is now just the leftmost
 *      page on a probably-not-very-wide level.  For most purposes this is
 *      as good as or better than the true root, so we do not bother to
 *      insist on finding the true root.  We do, however, guarantee to
 *      return a live (not deleted or half-dead) page.
 *
 *      On successful return, the root page is pinned and read-locked.
 *      The metadata page is not locked or pinned on exit.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * Try to use previously-cached metapage data to find the root.  This
     * normally saves one buffer access per index search, which is a very
     * helpful savings in bufmgr traffic and hence contention.
     */
    if (rel->rd_amcache != NULL)
    {
        metad = (BTMetaPageData *) rel->rd_amcache;
        /* We shouldn't have cached it if any of these fail */
        Assert(metad->btm_magic == BTREE_MAGIC);
        Assert(metad->btm_version == BTREE_VERSION);
        Assert(metad->btm_root != P_NONE);

        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        /*
         * Since the cache might be stale, we check the page more carefully
         * here than normal.  We *must* check that it's not deleted. If it's
         * not alone on its level, then we reject too --- this may be overly
         * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
         * because that's not set in a "fast root".
         */
        if (!P_IGNORE(rootopaque) &&
            rootopaque->btpo.level == rootlevel &&
            P_LEFTMOST(rootopaque) &&
            P_RIGHTMOST(rootopaque))
        {
            /* OK, accept cached page as the root */
            return rootbuf;
        }
        _bt_relbuf(rel, rootbuf);
        /* Cache is stale, throw it away */
        if (rel->rd_amcache)
            pfree(rel->rd_amcache);
        rel->rd_amcache = NULL;
    }

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    /* sanity-check the metapage */
    if (!(metaopaque->btpo_flags & BTP_META) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version != BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION)));

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE)
    {
        /* If access = BT_READ, caller doesn't want us to create root yet */
        if (access == BT_READ)
        {
            _bt_relbuf(rel, metabuf);
            return InvalidBuffer;
        }

        /* trade in our read lock for a write lock */
        LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(metabuf, BT_WRITE);

        /*
         * Race condition:  if someone else initialized the metadata between
         * the time we released the read lock and acquired the write lock, we
         * must avoid doing it again.
         */
        if (metad->btm_root != P_NONE)
        {
            /*
             * Metadata initialized by someone else.  In order to guarantee no
             * deadlocks, we have to release the metadata page and start all
             * over again.  (Is that really true? But it's hardly worth trying
             * to optimize this case.)
             */
            _bt_relbuf(rel, metabuf);
            return _bt_getroot(rel, access);
        }

        /*
         * Get, initialize, write, and leave a lock of the appropriate type on
         * the new root page.  Since this is the first page in the tree, it's
         * a leaf as well as the root.
         */
        rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
        rootblkno = BufferGetBlockNumber(rootbuf);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
        rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
        rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
        rootopaque->btpo.level = 0;
        rootopaque->btpo_cycleid = 0;

        /* NO ELOG(ERROR) till meta is updated */
        START_CRIT_SECTION();

        metad->btm_root = rootblkno;
        metad->btm_level = 0;
        metad->btm_fastroot = rootblkno;
        metad->btm_fastlevel = 0;

        MarkBufferDirty(rootbuf);
        MarkBufferDirty(metabuf);

        /* XLOG stuff */
        if (RelationNeedsWAL(rel))
        {
            xl_btree_newroot xlrec;
            XLogRecPtr  recptr;
            XLogRecData rdata;

            xlrec.node = rel->rd_node;
            xlrec.rootblk = rootblkno;
            xlrec.level = 0;

            rdata.data = (char *) &xlrec;
            rdata.len = SizeOfBtreeNewroot;
            rdata.buffer = InvalidBuffer;
            rdata.next = NULL;

            recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, &rdata);

            PageSetLSN(rootpage, recptr);
            PageSetLSN(metapg, recptr);
        }

        END_CRIT_SECTION();

        /*
         * Send out relcache inval for metapage change (probably unnecessary
         * here, but let's be safe).
         */
        CacheInvalidateRelcache(rel);

        /*
         * swap root write lock for read lock.  There is no danger of anyone
         * else accessing the new root page while it's unlocked, since no one
         * else knows where it is yet.
         */
        LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
        LockBuffer(rootbuf, BT_READ);

        /* okay, metadata is correct, release lock on it */
        _bt_relbuf(rel, metabuf);
    }
    else
    {
        rootblkno = metad->btm_fastroot;
        Assert(rootblkno != P_NONE);
        rootlevel = metad->btm_fastlevel;

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        /*
         * We are done with the metapage; arrange to release it via first
         * _bt_relandgetbuf call
         */
        rootbuf = metabuf;

        for (;;)
        {
            rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
            rootpage = BufferGetPage(rootbuf);
            rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

            if (!P_IGNORE(rootopaque))
                break;

            /* it's dead, Jim.  step right one page */
            if (P_RIGHTMOST(rootopaque))
                elog(ERROR, "no live root page found in index \"%s\"",
                     RelationGetRelationName(rel));
            rootblkno = rootopaque->btpo_next;
        }

        /* Note: can't check btpo.level on deleted pages */
        if (rootopaque->btpo.level != rootlevel)
            elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
                 rootblkno, RelationGetRelationName(rel),
                 rootopaque->btpo.level, rootlevel);
    }

    /*
     * By here, we have a pin and read lock on the root page, and no lock set
     * on the metadata page.  Return the root page's buffer.
     */
    return rootbuf;
}
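
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the read-path calling convention for _bt_getroot.  With BT_READ the
 * function may return InvalidBuffer for an empty index, so callers must
 * check before touching the buffer.  Hypothetical helper, compiled out
 * unless BTREE_EXAMPLE_SKETCHES is set.
 */
#ifdef BTREE_EXAMPLE_SKETCHES
static void
example_visit_root(Relation rel)
{
    Buffer      rootbuf = _bt_getroot(rel, BT_READ);

    if (!BufferIsValid(rootbuf))
        return;                 /* empty index: nothing to descend into */

    /* rootbuf is pinned and read-locked here; use it, then release */
    _bt_relbuf(rel, rootbuf);
}
#endif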

/*
 *  _bt_gettrueroot() -- Get the true root page of the btree.
 *
 *      This is the same as the BT_READ case of _bt_getroot(), except
 *      we follow the true-root link not the fast-root link.
 *
 * By the time we acquire lock on the root page, it might have been split and
 * not be the true root anymore.  This is okay for the present uses of this
 * routine; we only really need to be able to move up at least one tree level
 * from whatever non-root page we were at.  If we ever do need to lock the
 * one true root page, we could loop here, re-reading the metapage on each
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
 * moving to the root --- that'd deadlock against any concurrent root split.)
 */
Buffer
_bt_gettrueroot(Relation rel)
{
    Buffer      metabuf;
    Page        metapg;
    BTPageOpaque metaopaque;
    Buffer      rootbuf;
    Page        rootpage;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    uint32      rootlevel;
    BTMetaPageData *metad;

    /*
     * We don't try to use cached metapage data here, since (a) this path is
     * not performance-critical, and (b) if we are here it suggests our cache
     * is out-of-date anyway.  In light of point (b), it's probably safest to
     * actively flush any cached metapage info.
     */
    if (rel->rd_amcache)
        pfree(rel->rd_amcache);
    rel->rd_amcache = NULL;

    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    metad = BTPageGetMeta(metapg);

    if (!(metaopaque->btpo_flags & BTP_META) ||
        metad->btm_magic != BTREE_MAGIC)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" is not a btree",
                        RelationGetRelationName(rel))));

    if (metad->btm_version != BTREE_VERSION)
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                        RelationGetRelationName(rel),
                        metad->btm_version, BTREE_VERSION)));

    /* if no root page initialized yet, fail */
    if (metad->btm_root == P_NONE)
    {
        _bt_relbuf(rel, metabuf);
        return InvalidBuffer;
    }

    rootblkno = metad->btm_root;
    rootlevel = metad->btm_level;

    /*
     * We are done with the metapage; arrange to release it via first
     * _bt_relandgetbuf call
     */
    rootbuf = metabuf;

    for (;;)
    {
        rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
        rootpage = BufferGetPage(rootbuf);
        rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);

        if (!P_IGNORE(rootopaque))
            break;

        /* it's dead, Jim.  step right one page */
        if (P_RIGHTMOST(rootopaque))
            elog(ERROR, "no live root page found in index \"%s\"",
                 RelationGetRelationName(rel));
        rootblkno = rootopaque->btpo_next;
    }

    /* Note: can't check btpo.level on deleted pages */
    if (rootopaque->btpo.level != rootlevel)
        elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
             rootblkno, RelationGetRelationName(rel),
             rootopaque->btpo.level, rootlevel);

    return rootbuf;
}

/*
 *  _bt_getrootheight() -- Get the height of the btree search tree.
 *
 *      We return the level (counting from zero) of the current fast root.
 *      This represents the number of tree levels we'd have to descend through
 *      to start any btree index search.
 *
 *      This is used by the planner for cost-estimation purposes.  Since it's
 *      only an estimate, slightly-stale data is fine, hence we don't worry
 *      about updating previously cached data.
 */
int
_bt_getrootheight(Relation rel)
{
    BTMetaPageData *metad;

    /*
     * We can get what we need from the cached metapage data.  If it's not
     * cached yet, load it.  Sanity checks here must match _bt_getroot().
     */
    if (rel->rd_amcache == NULL)
    {
        Buffer      metabuf;
        Page        metapg;
        BTPageOpaque metaopaque;

        metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
        metapg = BufferGetPage(metabuf);
        metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
        metad = BTPageGetMeta(metapg);

        /* sanity-check the metapage */
        if (!(metaopaque->btpo_flags & BTP_META) ||
            metad->btm_magic != BTREE_MAGIC)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("index \"%s\" is not a btree",
                            RelationGetRelationName(rel))));

        if (metad->btm_version != BTREE_VERSION)
            ereport(ERROR,
                    (errcode(ERRCODE_INDEX_CORRUPTED),
                     errmsg("version mismatch in index \"%s\": file version %d, code version %d",
                            RelationGetRelationName(rel),
                            metad->btm_version, BTREE_VERSION)));

        /*
         * If there's no root page yet, _bt_getroot() doesn't expect a cache
         * to be made, so just stop here and report the index height is zero.
         * (XXX perhaps _bt_getroot() should be changed to allow this case.)
         */
        if (metad->btm_root == P_NONE)
        {
            _bt_relbuf(rel, metabuf);
            return 0;
        }

        /*
         * Cache the metapage data for next time
         */
        rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
                                             sizeof(BTMetaPageData));
        memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

        _bt_relbuf(rel, metabuf);
    }

    metad = (BTMetaPageData *) rel->rd_amcache;
    /* We shouldn't have cached it if any of these fail */
    Assert(metad->btm_magic == BTREE_MAGIC);
    Assert(metad->btm_version == BTREE_VERSION);
    Assert(metad->btm_fastroot != P_NONE);

    return metad->btm_fastlevel;
}
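
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the planner consumes this height as a descent-cost input.  The charge
 * below is purely for illustration --- it is not the actual cost model ---
 * but it shows the shape of a per-level CPU charge.  Hypothetical helper,
 * compiled out unless BTREE_EXAMPLE_SKETCHES is set.
 */
#ifdef BTREE_EXAMPLE_SKETCHES
static double
example_descent_cost(Relation indexrel, double per_level_cpu_cost)
{
    int         tree_height = _bt_getrootheight(indexrel);

    /* charge some CPU per level we would descend through */
    return (tree_height + 1) * per_level_cpu_cost;
}
#endif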

/*
 *  _bt_checkpage() -- Verify that a freshly-read page looks sane.
 */
void
_bt_checkpage(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);

    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
             errmsg("index \"%s\" contains unexpected zero page at block %u",
                    RelationGetRelationName(rel),
                    BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));

    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
}

/*
 * Log the reuse of a page from the FSM.
 */
static void
_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
{
    if (!RelationNeedsWAL(rel))
        return;

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /*
     * We don't do MarkBufferDirty here because we're about to initialise
     * the page, and nobody else can see it yet.
     */

    /* XLOG stuff */
    {
        XLogRecData rdata[1];
        xl_btree_reuse_page xlrec_reuse;

        xlrec_reuse.node = rel->rd_node;
        xlrec_reuse.block = blkno;
        xlrec_reuse.latestRemovedXid = latestRemovedXid;
        rdata[0].data = (char *) &xlrec_reuse;
        rdata[0].len = SizeOfBtreeReusePage;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = NULL;

        XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata);

        /*
         * We don't do PageSetLSN here because we're about to initialise
         * the page, so no need.
         */
    }

    END_CRIT_SECTION();
}

/*
 *  _bt_getbuf() -- Get a buffer by block number for read or write.
 *
 *      blkno == P_NEW means to get an unallocated index page.  The page
 *      will be initialized before returning it.
 *
 *      When this routine returns, the appropriate lock is set on the
 *      requested buffer and its reference count has been incremented
 *      (ie, the buffer is "locked and pinned").  Also, we apply
 *      _bt_checkpage to sanity-check the page (except in P_NEW case).
 */
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno != P_NEW)
    {
        /* Read an existing block of the relation */
        buf = ReadBuffer(rel, blkno);
        LockBuffer(buf, access);
        _bt_checkpage(rel, buf);
    }
    else
    {
        bool        needLock;
        Page        page;

        Assert(access == BT_WRITE);

        /*
         * First see if the FSM knows of any free pages.
         *
         * We can't trust the FSM's report unreservedly; we have to check that
         * the page is still free.  (For example, an already-free page could
         * have been re-used between the time the last VACUUM scanned it and
         * the time the VACUUM made its FSM updates.)
         *
         * In fact, it's worse than that: we can't even assume that it's safe
         * to take a lock on the reported page.  If somebody else has a lock
         * on it, or even worse our own caller does, we could deadlock.  (The
         * own-caller scenario is actually not improbable. Consider an index
         * on a serial or timestamp column.  Nearly all splits will be at the
         * rightmost page, so it's entirely likely that _bt_split will call us
         * while holding a lock on the page most recently acquired from FSM. A
         * VACUUM running concurrently with the previous split could well have
         * placed that page back in FSM.)
         *
         * To get around that, we ask for only a conditional lock on the
         * reported page.  If we fail, then someone else is using the page,
         * and we may reasonably assume it's not free.  (If we happen to be
         * wrong, the worst consequence is the page will be lost to use till
         * the next VACUUM, which is no big problem.)
         */
        for (;;)
        {
            blkno = GetFreeIndexPage(rel);
            if (blkno == InvalidBlockNumber)
                break;
            buf = ReadBuffer(rel, blkno);
            if (ConditionalLockBuffer(buf))
            {
                page = BufferGetPage(buf);
                if (_bt_page_recyclable(page))
                {
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby.
                     */
                    if (XLogStandbyInfoActive())
                    {
                        BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

                        _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
                    }

                    /* Okay to use page.  Re-initialize and return it */
                    _bt_pageinit(page, BufferGetPageSize(buf));
                    return buf;
                }
                elog(DEBUG2, "FSM returned nonrecyclable page");
                _bt_relbuf(rel, buf);
            }
            else
            {
                elog(DEBUG2, "FSM returned nonlockable page");
                /* couldn't get lock, so just drop pin */
                ReleaseBuffer(buf);
            }
        }

        /*
         * Extend the relation by one page.
         *
         * We have to use a lock to ensure no one else is extending the rel at
         * the same time, else we will both try to initialize the same new
         * page.  We can skip locking for new or temp relations, however,
         * since no one else could be accessing them.
         */
        needLock = !RELATION_IS_LOCAL(rel);

        if (needLock)
            LockRelationForExtension(rel, ExclusiveLock);

        buf = ReadBuffer(rel, P_NEW);

        /* Acquire buffer lock on new page */
        LockBuffer(buf, BT_WRITE);

        /*
         * Release the file-extension lock; it's now OK for someone else to
         * extend the relation some more.  Note that we cannot release this
         * lock before we have buffer lock on the new page, or we risk a race
         * condition against btvacuumscan --- see comments therein.
         */
        if (needLock)
            UnlockRelationForExtension(rel, ExclusiveLock);

        /* Initialize the new page before returning it */
        page = BufferGetPage(buf);
        Assert(PageIsNew(page));
        _bt_pageinit(page, BufferGetPageSize(buf));
    }

    /* ref count and lock type are correct */
    return buf;
}
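
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the two calling conventions side by side.  A page split asks for P_NEW
 * with BT_WRITE and gets back a freshly _bt_pageinit'd, write-locked page;
 * a reader asks for a specific block, which _bt_checkpage has vetted by
 * the time it is returned.  Hypothetical helper, compiled out unless
 * BTREE_EXAMPLE_SKETCHES is set.
 */
#ifdef BTREE_EXAMPLE_SKETCHES
static void
example_getbuf_conventions(Relation rel, BlockNumber blkno)
{
    Buffer      newbuf;
    Buffer      readbuf;

    /* allocate (or recycle via the FSM) a new page, exclusively locked */
    newbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
    /* a real caller would fill the page before releasing it */
    _bt_relbuf(rel, newbuf);

    /* read and share-lock an existing page */
    readbuf = _bt_getbuf(rel, blkno, BT_READ);
    _bt_relbuf(rel, readbuf);
}
#endif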

/*
 *  _bt_relandgetbuf() -- release a locked buffer and get another one.
 *
 * This is equivalent to _bt_relbuf followed by _bt_getbuf, with the
 * exception that blkno may not be P_NEW.  Also, if obuf is InvalidBuffer
 * then it reduces to just _bt_getbuf; allowing this case simplifies some
 * callers.
 *
 * The original motivation for using this was to avoid two entries to the
 * bufmgr when one would do.  However, now it's mainly just a notational
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
 * is when the target page is the same one already in the buffer.
 */
Buffer
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
{
    Buffer      buf;

    Assert(blkno != P_NEW);
    if (BufferIsValid(obuf))
        LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
    buf = ReleaseAndReadBuffer(obuf, rel, blkno);
    LockBuffer(buf, access);
    _bt_checkpage(rel, buf);
    return buf;
}
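
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the step-right idiom this routine exists for --- drop the lock and pin
 * on the current page and acquire its right sibling in one call, so we
 * never hold two page locks at once while walking a level.  Hypothetical
 * helper, compiled out unless BTREE_EXAMPLE_SKETCHES is set.
 */
#ifdef BTREE_EXAMPLE_SKETCHES
static Buffer
example_step_right(Relation rel, Buffer buf)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);

    return _bt_relandgetbuf(rel, buf, opaque->btpo_next, BT_READ);
}
#endif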

/*
 *  _bt_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.
 */
void
_bt_relbuf(Relation rel, Buffer buf)
{
    UnlockReleaseBuffer(buf);
}

/*
 *  _bt_pageinit() -- Initialize a new page.
 *
 * On return, the page header is initialized; data space is empty;
 * special space is zeroed out.
 */
void
_bt_pageinit(Page page, Size size)
{
    PageInit(page, size, sizeof(BTPageOpaqueData));
}

/*
 *  _bt_page_recyclable() -- Is an existing page recyclable?
 *
 * This exists to make sure _bt_getbuf and btvacuumscan have the same
 * policy about whether a page is safe to re-use.
 */
bool
_bt_page_recyclable(Page page)
{
    BTPageOpaque opaque;

    /*
     * It's possible to find an all-zeroes page in an index --- for example, a
     * backend might successfully extend the relation one page and then crash
     * before it is able to make a WAL entry for adding the page. If we find a
     * zeroed page then reclaim it.
     */
    if (PageIsNew(page))
        return true;

    /*
     * Otherwise, recycle if deleted and too old to have any processes
     * interested in it.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    if (P_ISDELETED(opaque) &&
        TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin))
        return true;
    return false;
}
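
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the vacuum side of the shared policy --- btvacuumscan applies the same
 * test before returning a deleted page to the FSM, which is what keeps
 * _bt_getbuf and VACUUM in agreement.  Hypothetical helper, compiled out
 * unless BTREE_EXAMPLE_SKETCHES is set.
 */
#ifdef BTREE_EXAMPLE_SKETCHES
static void
example_record_if_recyclable(Relation rel, Buffer buf, BlockNumber blkno)
{
    Page        page = BufferGetPage(buf);

    if (_bt_page_recyclable(page))
        RecordFreeIndexPage(rel, blkno);    /* from storage/indexfsm.h */
}
#endif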

/*
 * Delete item(s) from a btree page during VACUUM.
 *
 * This must only be used for deleting leaf items.  Deleting an item on a
 * non-leaf page has to be done as part of an atomic action that includes
 * deleting the page it points to.
 *
 * This routine assumes that the caller has pinned and locked the buffer.
 * Also, the given itemnos *must* appear in increasing order in the array.
 *
 * We record VACUUMs and b-tree deletes differently in WAL. InHotStandby
 * we need to be able to pin all of the blocks in the btree in physical
 * order when replaying the effects of a VACUUM, just as we do for the
 * original VACUUM itself. lastBlockVacuumed allows us to tell whether an
 * intermediate range of blocks has had no changes at all by VACUUM,
 * and so must be scanned anyway during replay. We always write a WAL record
 * for the last block in the index, whether or not it contained any items
 * to be removed. This allows us to scan right up to end of index to
 * ensure correct locking.
 */
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
                    OffsetNumber *itemnos, int nitems,
                    BlockNumber lastBlockVacuumed)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    if (nitems > 0)
        PageIndexMultiDelete(page, itemnos, nitems);

    /*
     * We can clear the vacuum cycle ID since this page has certainly been
     * processed by the current vacuum scan.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_cycleid = 0;

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't included in our target-item list), but it will almost always be
     * true and it doesn't seem worth an additional page scan to check it.
     * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     */
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        XLogRecData rdata[2];
        xl_btree_vacuum xlrec_vacuum;

        xlrec_vacuum.node = rel->rd_node;
        xlrec_vacuum.block = BufferGetBlockNumber(buf);

        xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
        rdata[0].data = (char *) &xlrec_vacuum;
        rdata[0].len = SizeOfBtreeVacuum;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);

        /*
         * The target-offsets array is not in the buffer, but pretend that it
         * is.  When XLogInsert stores the whole buffer, the offsets array
         * need not be stored too.
         */
        if (nitems > 0)
        {
            rdata[1].data = (char *) itemnos;
            rdata[1].len = nitems * sizeof(OffsetNumber);
        }
        else
        {
            rdata[1].data = NULL;
            rdata[1].len = 0;
        }
        rdata[1].buffer = buf;
        rdata[1].buffer_std = true;
        rdata[1].next = NULL;

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}
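
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * the shape of VACUUM's per-page call site.  The caller passes offsets in
 * increasing order and threads the last WAL-logged block number through
 * so replay can pin intermediate blocks; compare btvacuumpage in nbtree.c.
 * Hypothetical helper, compiled out unless BTREE_EXAMPLE_SKETCHES is set.
 */
#ifdef BTREE_EXAMPLE_SKETCHES
static BlockNumber
example_vacuum_one_leaf(Relation rel, Buffer buf,
                        OffsetNumber *deletable, int ndeletable,
                        BlockNumber lastBlockVacuumed)
{
    _bt_delitems_vacuum(rel, buf, deletable, ndeletable, lastBlockVacuumed);

    /* this block becomes the latest one vacuumed */
    return BufferGetBlockNumber(buf);
}
#endif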

/*
 * Delete item(s) from a btree page during single-page cleanup.
 *
 * As above, must only be used on leaf pages.
 *
 * This routine assumes that the caller has pinned and locked the buffer.
 * Also, the given itemnos *must* appear in increasing order in the array.
 *
 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
 * the page, but the WAL logging considerations are quite different.  See
 * comments for _bt_delitems_vacuum.
 */
void
_bt_delitems_delete(Relation rel, Buffer buf,
                    OffsetNumber *itemnos, int nitems,
                    Relation heapRel)
{
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque;

    /* Shouldn't be called unless there's something to do */
    Assert(nitems > 0);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    /* Fix the page */
    PageIndexMultiDelete(page, itemnos, nitems);

    /*
     * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID,
     * because this is not called by VACUUM.
     */

    /*
     * Mark the page as not containing any LP_DEAD items.  This is not
     * certainly true (there might be some that have recently been marked, but
     * weren't included in our target-item list), but it will almost always be
     * true and it doesn't seem worth an additional page scan to check it.
     * Remember that BTP_HAS_GARBAGE is only a hint anyway.
     */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

    MarkBufferDirty(buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
        XLogRecPtr  recptr;
        XLogRecData rdata[3];
        xl_btree_delete xlrec_delete;

        xlrec_delete.node = rel->rd_node;
        xlrec_delete.hnode = heapRel->rd_node;
        xlrec_delete.block = BufferGetBlockNumber(buf);
        xlrec_delete.nitems = nitems;

        rdata[0].data = (char *) &xlrec_delete;
        rdata[0].len = SizeOfBtreeDelete;
        rdata[0].buffer = InvalidBuffer;
        rdata[0].next = &(rdata[1]);

        /*
         * We need the target-offsets array whether or not we store the whole
         * buffer, to allow us to find the latestRemovedXid on a standby
         * server.
         */
        rdata[1].data = (char *) itemnos;
        rdata[1].len = nitems * sizeof(OffsetNumber);
        rdata[1].buffer = InvalidBuffer;
        rdata[1].next = &(rdata[2]);

        rdata[2].data = NULL;
        rdata[2].len = 0;
        rdata[2].buffer = buf;
        rdata[2].buffer_std = true;
        rdata[2].next = NULL;

        recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);

        PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
}
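
/*
 * Illustrative sketch (added commentary, not part of the original file):
 * single-page cleanup collects the offsets of items already marked LP_DEAD
 * --- necessarily in increasing order --- and hands them over in one call;
 * compare _bt_vacuum_one_page in nbtinsert.c.  Hypothetical helper,
 * compiled out unless BTREE_EXAMPLE_SKETCHES is set.
 */
#ifdef BTREE_EXAMPLE_SKETCHES
static void
example_delete_lp_dead_items(Relation rel, Buffer buf, Relation heapRel)
{
    OffsetNumber deletable[MaxOffsetNumber];
    int         ndeletable = 0;
    Page        page = BufferGetPage(buf);
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    OffsetNumber offnum,
                maxoff = PageGetMaxOffsetNumber(page);

    for (offnum = P_FIRSTDATAKEY(opaque);
         offnum <= maxoff;
         offnum = OffsetNumberNext(offnum))
    {
        ItemId      itemId = PageGetItemId(page, offnum);

        if (ItemIdIsDead(itemId))
            deletable[ndeletable++] = offnum;
    }

    if (ndeletable > 0)
        _bt_delitems_delete(rel, buf, deletable, ndeletable, heapRel);
}
#endif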
00955 
00956 /*
00957  * Subroutine to pre-check whether a page deletion is safe, that is, its
00958  * parent page would be left in a valid or deletable state.
00959  *
00960  * "target" is the page we wish to delete, and "stack" is a search stack
00961  * leading to it (approximately).  Note that we will update the stack
00962  * entry(s) to reflect current downlink positions --- this is harmless and
00963  * indeed saves later search effort in _bt_pagedel.
00964  *
00965  * Note: it's OK to release page locks after checking, because a safe
00966  * deletion can't become unsafe due to concurrent activity.  A non-rightmost
00967  * page cannot become rightmost unless there's a concurrent page deletion,
00968  * but only VACUUM does page deletion and we only allow one VACUUM on an index
00969  * at a time.  An only child could acquire a sibling (of the same parent) only
00970  * by being split ... but that would make it a non-rightmost child so the
00971  * deletion is still safe.
00972  */
00973 static bool
00974 _bt_parent_deletion_safe(Relation rel, BlockNumber target, BTStack stack)
00975 {
00976     BlockNumber parent;
00977     OffsetNumber poffset,
00978                 maxoff;
00979     Buffer      pbuf;
00980     Page        page;
00981     BTPageOpaque opaque;
00982 
00983     /*
00984      * In recovery mode, assume the deletion being replayed is valid.  We
00985      * can't always check it because we won't have a full search stack, and we
00986      * should complain if there's a problem, anyway.
00987      */
00988     if (InRecovery)
00989         return true;
00990 
00991     /* Locate the parent's downlink (updating the stack entry if needed) */
00992     ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
00993     pbuf = _bt_getstackbuf(rel, stack, BT_READ);
00994     if (pbuf == InvalidBuffer)
00995         elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
00996              RelationGetRelationName(rel), target);
00997     parent = stack->bts_blkno;
00998     poffset = stack->bts_offset;
00999 
01000     page = BufferGetPage(pbuf);
01001     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01002     maxoff = PageGetMaxOffsetNumber(page);
01003 
01004     /*
01005      * If the target is the rightmost child of its parent, then we can't
01006      * delete, unless it's also the only child.
01007      */
01008     if (poffset >= maxoff)
01009     {
01010         /* It's rightmost child... */
01011         if (poffset == P_FIRSTDATAKEY(opaque))
01012         {
01013             /*
01014              * It's only child, so safe if parent would itself be removable.
01015              * We have to check the parent itself, and then recurse to test
01016              * the conditions at the parent's parent.
01017              */
01018             if (P_RIGHTMOST(opaque) || P_ISROOT(opaque))
01019             {
01020                 _bt_relbuf(rel, pbuf);
01021                 return false;
01022             }
01023 
01024             _bt_relbuf(rel, pbuf);
01025             return _bt_parent_deletion_safe(rel, parent, stack->bts_parent);
01026         }
01027         else
01028         {
01029             /* Unsafe to delete */
01030             _bt_relbuf(rel, pbuf);
01031             return false;
01032         }
01033     }
01034     else
01035     {
01036         /* Not rightmost child, so safe to delete */
01037         _bt_relbuf(rel, pbuf);
01038         return true;
01039     }
01040 }
01041 
01042 /*
01043  * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
01044  *
01045  * This action unlinks the page from the b-tree structure, removing all
01046  * pointers leading to it --- but not touching its own left and right links.
01047  * The page cannot be physically reclaimed right away, since other processes
01048  * may currently be trying to follow links leading to the page; they have to
01049  * be allowed to use its right-link to recover.  See nbtree/README.
01050  *
01051  * On entry, the target buffer must be pinned and locked (either read or write
01052  * lock is OK).  This lock and pin will be dropped before exiting.
01053  *
01054  * The "stack" argument can be a search stack leading (approximately) to the
01055  * target page, or NULL --- outside callers typically pass NULL since they
01056  * have not done such a search, but internal recursion cases pass the stack
01057  * to avoid duplicated search effort.
01058  *
01059  * Returns the number of pages successfully deleted (zero if page cannot
01060  * be deleted now; could be more than one if parent pages were deleted too).
01061  *
01062  * NOTE: this leaks memory.  Rather than trying to clean up everything
01063  * carefully, it's better to run it in a temp context that can be reset
01064  * frequently.
01065  */
01066 int
01067 _bt_pagedel(Relation rel, Buffer buf, BTStack stack)
01068 {
01069     int         result;
01070     BlockNumber target,
01071                 leftsib,
01072                 rightsib,
01073                 parent;
01074     OffsetNumber poffset,
01075                 maxoff;
01076     uint32      targetlevel,
01077                 ilevel;
01078     ItemId      itemid;
01079     IndexTuple  targetkey,
01080                 itup;
01081     ScanKey     itup_scankey;
01082     Buffer      lbuf,
01083                 rbuf,
01084                 pbuf;
01085     bool        parent_half_dead;
01086     bool        parent_one_child;
01087     bool        rightsib_empty;
01088     Buffer      metabuf = InvalidBuffer;
01089     Page        metapg = NULL;
01090     BTMetaPageData *metad = NULL;
01091     Page        page;
01092     BTPageOpaque opaque;
01093 
01094     /*
01095      * We can never delete rightmost pages nor root pages.  While at it, check
01096      * that page is not already deleted and is empty.
01097      */
01098     page = BufferGetPage(buf);
01099     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01100     if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
01101         P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
01102     {
01103         /* Should never fail to delete a half-dead page */
01104         Assert(!P_ISHALFDEAD(opaque));
01105 
01106         _bt_relbuf(rel, buf);
01107         return 0;
01108     }
01109 
01110     /*
01111      * Save info about page, including a copy of its high key (it must have
01112      * one, being non-rightmost).
01113      */
01114     target = BufferGetBlockNumber(buf);
01115     targetlevel = opaque->btpo.level;
01116     leftsib = opaque->btpo_prev;
01117     itemid = PageGetItemId(page, P_HIKEY);
01118     targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
01119 
01120     /*
01121      * To avoid deadlocks, we'd better drop the target page lock before going
01122      * further.
01123      */
01124     _bt_relbuf(rel, buf);
01125 
01126     /*
01127      * We need an approximate pointer to the page's parent page.  We use the
01128      * standard search mechanism to search for the page's high key; this will
01129      * give us a link to either the current parent or someplace to its left
01130      * (if there are multiple equal high keys).  In recursion cases, the
01131      * caller already generated a search stack and we can just re-use that
01132      * work.
01133      */
01134     if (stack == NULL)
01135     {
01136         if (!InRecovery)
01137         {
01138             /* we need an insertion scan key to do our search, so build one */
01139             itup_scankey = _bt_mkscankey(rel, targetkey);
01140             /* find the leftmost leaf page containing this key */
01141             stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false,
01142                                &lbuf, BT_READ);
01143             /* don't need a pin on that either */
01144             _bt_relbuf(rel, lbuf);
01145 
01146             /*
01147              * If we are trying to delete an interior page, _bt_search did
01148              * more than we needed.  Locate the stack item pointing to our
01149              * parent level.
01150              */
01151             ilevel = 0;
01152             for (;;)
01153             {
01154                 if (stack == NULL)
01155                     elog(ERROR, "not enough stack items");
01156                 if (ilevel == targetlevel)
01157                     break;
01158                 stack = stack->bts_parent;
01159                 ilevel++;
01160             }
01161         }
01162         else
01163         {
01164             /*
01165              * During WAL recovery, we can't use _bt_search (for one reason,
01166              * it might invoke user-defined comparison functions that expect
01167              * facilities not available in recovery mode).  Instead, just set
01168              * up a dummy stack pointing to the left end of the parent tree
01169              * level, from which _bt_getstackbuf will walk right to the parent
01170              * page.  Painful, but we don't care too much about performance in
01171              * this scenario.
01172              */
01173             pbuf = _bt_get_endpoint(rel, targetlevel + 1, false);
01174             stack = (BTStack) palloc(sizeof(BTStackData));
01175             stack->bts_blkno = BufferGetBlockNumber(pbuf);
01176             stack->bts_offset = InvalidOffsetNumber;
01177             /* bts_btentry will be initialized below */
01178             stack->bts_parent = NULL;
01179             _bt_relbuf(rel, pbuf);
01180         }
01181     }
01182 
01183     /*
01184      * We cannot delete a page that is the rightmost child of its immediate
01185      * parent, unless it is the only child --- in which case the parent has to
01186      * be deleted too, and the same condition applies recursively to it. We
01187      * have to check this condition all the way up before trying to delete. We
01188      * don't need to re-test when deleting a non-leaf page, though.
01189      */
01190     if (targetlevel == 0 &&
01191         !_bt_parent_deletion_safe(rel, target, stack))
01192         return 0;
01193 
01194     /*
01195      * We have to lock the pages we need to modify in the standard order:
01196      * moving right, then up.  Else we will deadlock against other writers.
01197      *
01198      * So, we need to find and write-lock the current left sibling of the
01199      * target page.  The sibling that was current a moment ago could have
01200      * split, so we may have to move right.  This search could fail if either
01201      * the sibling or the target page was deleted by someone else meanwhile;
01202      * if so, give up.  (Right now, that should never happen, since page
01203      * deletion is only done in VACUUM and there shouldn't be multiple VACUUMs
01204      * concurrently on the same table.)
01205      */
01206     if (leftsib != P_NONE)
01207     {
01208         lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
01209         page = BufferGetPage(lbuf);
01210         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01211         while (P_ISDELETED(opaque) || opaque->btpo_next != target)
01212         {
01213             /* step right one page */
01214             leftsib = opaque->btpo_next;
01215             _bt_relbuf(rel, lbuf);
01216             if (leftsib == P_NONE)
01217             {
01218                 elog(LOG, "no left sibling (concurrent deletion?) in \"%s\"",
01219                      RelationGetRelationName(rel));
01220                 return 0;
01221             }
01222             lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
01223             page = BufferGetPage(lbuf);
01224             opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01225         }
01226     }
01227     else
01228         lbuf = InvalidBuffer;
01229 
01230     /*
01231      * Next write-lock the target page itself.  It should be okay to take just
01232      * a write lock not a superexclusive lock, since no scans would stop on an
01233      * empty page.
01234      */
01235     buf = _bt_getbuf(rel, target, BT_WRITE);
01236     page = BufferGetPage(buf);
01237     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01238 
01239     /*
01240      * Check page is still empty etc, else abandon deletion.  The empty check
01241      * is necessary since someone else might have inserted into it while we
01242      * didn't have it locked; the others are just for paranoia's sake.
01243      */
01244     if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
01245         P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
01246     {
01247         _bt_relbuf(rel, buf);
01248         if (BufferIsValid(lbuf))
01249             _bt_relbuf(rel, lbuf);
01250         return 0;
01251     }
01252     if (opaque->btpo_prev != leftsib)
01253         elog(ERROR, "left link changed unexpectedly in block %u of index \"%s\"",
01254              target, RelationGetRelationName(rel));
01255 
01256     /*
01257      * And next write-lock the (current) right sibling.
01258      */
01259     rightsib = opaque->btpo_next;
01260     rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
01261     page = BufferGetPage(rbuf);
01262     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01263     if (opaque->btpo_prev != target)
01264         elog(ERROR, "right sibling's left-link doesn't match: "
01265              "block %u links to %u instead of expected %u in index \"%s\"",
01266              rightsib, opaque->btpo_prev, target,
01267              RelationGetRelationName(rel));
01268 
01269     /*
01270      * Any insert which would have gone on the target block will now go to the
01271      * right sibling block.
01272      */
01273     PredicateLockPageCombine(rel, target, rightsib);
01274 
01275     /*
01276      * Next find and write-lock the current parent of the target page. This is
01277      * essentially the same as the corresponding step of splitting.
01278      */
01279     ItemPointerSet(&(stack->bts_btentry.t_tid), target, P_HIKEY);
01280     pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
01281     if (pbuf == InvalidBuffer)
01282         elog(ERROR, "failed to re-find parent key in index \"%s\" for deletion target page %u",
01283              RelationGetRelationName(rel), target);
01284     parent = stack->bts_blkno;
01285     poffset = stack->bts_offset;
01286 
01287     /*
01288      * If the target is the rightmost child of its parent, then we can't
01289      * delete, unless it's also the only child --- in which case the parent
01290      * changes to half-dead status.  The "can't delete" case should have been
01291      * detected by _bt_parent_deletion_safe, so complain if we see it now.
01292      */
01293     page = BufferGetPage(pbuf);
01294     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01295     maxoff = PageGetMaxOffsetNumber(page);
01296     parent_half_dead = false;
01297     parent_one_child = false;
01298     if (poffset >= maxoff)
01299     {
01300         if (poffset == P_FIRSTDATAKEY(opaque))
01301             parent_half_dead = true;
01302         else
01303             elog(ERROR, "failed to delete rightmost child %u of block %u in index \"%s\"",
01304                  target, parent, RelationGetRelationName(rel));
01305     }
01306     else
01307     {
01308         /* Will there be exactly one child left in this parent? */
01309         if (OffsetNumberNext(P_FIRSTDATAKEY(opaque)) == maxoff)
01310             parent_one_child = true;
01311     }
01312 
01313     /*
01314      * If we are deleting the next-to-last page on the target's level, then
01315      * the rightsib is a candidate to become the new fast root. (In theory, it
01316      * might be possible to push the fast root even further down, but the odds
01317      * of doing so are slim, and the locking considerations daunting.)
01318      *
01319      * We don't support handling this in the case where the parent is becoming
01320      * half-dead, even though it theoretically could occur.
01321      *
01322      * We can safely acquire a lock on the metapage here --- see comments for
01323      * _bt_newroot().
01324      */
01325     if (leftsib == P_NONE && !parent_half_dead)
01326     {
01327         page = BufferGetPage(rbuf);
01328         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01329         Assert(opaque->btpo.level == targetlevel);
01330         if (P_RIGHTMOST(opaque))
01331         {
01332             /* rightsib will be the only one left on the level */
01333             metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
01334             metapg = BufferGetPage(metabuf);
01335             metad = BTPageGetMeta(metapg);
01336 
01337             /*
01338              * The expected case here is btm_fastlevel == targetlevel+1; if
01339              * the fastlevel is <= targetlevel, something is wrong, and we
01340              * choose to overwrite it to fix it.
01341              */
01342             if (metad->btm_fastlevel > targetlevel + 1)
01343             {
01344                 /* no update wanted */
01345                 _bt_relbuf(rel, metabuf);
01346                 metabuf = InvalidBuffer;
01347             }
01348         }
01349     }
01350 
01351     /*
01352      * Check that the parent-page index items we're about to delete/overwrite
01353      * contain what we expect.  This can fail if the index has become corrupt
01354      * for some reason.  We want to throw any error before entering the
01355      * critical section --- otherwise it'd be a PANIC.
01356      *
01357      * The test on the target item is just an Assert because _bt_getstackbuf
01358      * should have guaranteed it has the expected contents.  The test on the
01359      * next-child downlink is known to sometimes fail in the field, though.
01360      */
01361     page = BufferGetPage(pbuf);
01362     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01363 
01364 #ifdef USE_ASSERT_CHECKING
01365     itemid = PageGetItemId(page, poffset);
01366     itup = (IndexTuple) PageGetItem(page, itemid);
01367     Assert(ItemPointerGetBlockNumber(&(itup->t_tid)) == target);
01368 #endif
01369 
01370     if (!parent_half_dead)
01371     {
01372         OffsetNumber nextoffset;
01373 
01374         nextoffset = OffsetNumberNext(poffset);
01375         itemid = PageGetItemId(page, nextoffset);
01376         itup = (IndexTuple) PageGetItem(page, itemid);
01377         if (ItemPointerGetBlockNumber(&(itup->t_tid)) != rightsib)
01378             elog(ERROR, "right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
01379                  rightsib, target, ItemPointerGetBlockNumber(&(itup->t_tid)),
01380                  parent, RelationGetRelationName(rel));
01381     }
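    /*
     * This cross-check works because downlink order in the parent mirrors
     * sibling order on the child level: the item just after the target's
     * downlink must point to the same page the target's btpo_next does.  A
     * mismatch means the index is corrupt, and we must find that out here,
     * before the critical section would turn the ERROR into a PANIC.
     */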
01382 
01383     /*
01384      * Here we begin doing the deletion.
01385      */
01386 
01387     /* No ereport(ERROR) until changes are logged */
01388     START_CRIT_SECTION();
01389 
01390     /*
01391      * Update parent.  The normal case is a tad tricky because we want to
01392      * delete the target's downlink and the *following* key.  Easiest way is
01393      * to copy the right sibling's downlink over the target downlink, and then
01394      * delete the following item.
01395      */
01396     if (parent_half_dead)
01397     {
01398         PageIndexTupleDelete(page, poffset);
01399         opaque->btpo_flags |= BTP_HALF_DEAD;
01400     }
01401     else
01402     {
01403         OffsetNumber nextoffset;
01404 
01405         itemid = PageGetItemId(page, poffset);
01406         itup = (IndexTuple) PageGetItem(page, itemid);
01407         ItemPointerSet(&(itup->t_tid), rightsib, P_HIKEY);
01408 
01409         nextoffset = OffsetNumberNext(poffset);
01410         PageIndexTupleDelete(page, nextoffset);
01411     }
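    /*
     * As a sketch, if the parent previously held
     *
     *     ... | (keyT -> target) | (keyR -> rightsib) | ...
     *
     * it now holds
     *
     *     ... | (keyT -> rightsib) | ...
     *
     * keyT remains a valid lower bound for rightsib, since rightsib takes
     * over the keyspace the target used to cover.
     */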
01412 
01413     /*
01414      * Update siblings' side-links.  Note the target page's side-links will
01415      * continue to point to the siblings.  Asserts here are just rechecking
01416      * things we already verified above.
01417      */
01418     if (BufferIsValid(lbuf))
01419     {
01420         page = BufferGetPage(lbuf);
01421         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01422         Assert(opaque->btpo_next == target);
01423         opaque->btpo_next = rightsib;
01424     }
01425     page = BufferGetPage(rbuf);
01426     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01427     Assert(opaque->btpo_prev == target);
01428     opaque->btpo_prev = leftsib;
01429     rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
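    /*
     * rightsib_empty is true when the right sibling retains no data items
     * (at most a high key); it feeds the decision below about recursing to
     * delete rightsib as well.
     */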
01430 
01431     /*
01432      * Mark the page itself deleted.  It can be recycled when all current
01433      * transactions are gone.  Storing GetTopTransactionId() would work, but
01434      * we're in VACUUM and would not otherwise have an XID.  Having already
01435      * updated links to the target, ReadNewTransactionId() suffices as an
01436      * upper bound.  Any scan having retained a now-stale link is advertising
01437      * in its PGXACT an xmin less than or equal to the value we read here.  It
01438      * will continue to do so, holding back RecentGlobalXmin, for the duration
01439      * of that scan.
01440      */
01441     page = BufferGetPage(buf);
01442     opaque = (BTPageOpaque) PageGetSpecialPointer(page);
01443     opaque->btpo_flags &= ~BTP_HALF_DEAD;
01444     opaque->btpo_flags |= BTP_DELETED;
01445     opaque->btpo.xact = ReadNewTransactionId();
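    /*
     * This is the XID that _bt_page_recyclable() will later compare
     * against RecentGlobalXmin: only after every snapshot that could hold
     * a stale link has gone away may the deleted page be returned to the
     * free space map.
     */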
01446 
01447     /* And update the metapage, if needed */
01448     if (BufferIsValid(metabuf))
01449     {
01450         metad->btm_fastroot = rightsib;
01451         metad->btm_fastlevel = targetlevel;
01452         MarkBufferDirty(metabuf);
01453     }
01454 
01455     /* Must mark buffers dirty before XLogInsert */
01456     MarkBufferDirty(pbuf);
01457     MarkBufferDirty(rbuf);
01458     MarkBufferDirty(buf);
01459     if (BufferIsValid(lbuf))
01460         MarkBufferDirty(lbuf);
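    /*
     * Dirtying before XLogInsert follows the usual WAL rule (see
     * src/backend/access/transam/README): if we dirtied afterwards, a
     * checkpoint could skip the still-clean buffers yet put its redo
     * pointer past our record, losing the change in a crash.
     */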
01461 
01462     /* XLOG stuff */
01463     if (RelationNeedsWAL(rel))
01464     {
01465         xl_btree_delete_page xlrec;
01466         xl_btree_metadata xlmeta;
01467         uint8       xlinfo;
01468         XLogRecPtr  recptr;
01469         XLogRecData rdata[5];
01470         XLogRecData *nextrdata;
01471 
01472         xlrec.target.node = rel->rd_node;
01473         ItemPointerSet(&(xlrec.target.tid), parent, poffset);
01474         xlrec.deadblk = target;
01475         xlrec.leftblk = leftsib;
01476         xlrec.rightblk = rightsib;
01477         xlrec.btpo_xact = opaque->btpo.xact;
01478 
01479         rdata[0].data = (char *) &xlrec;
01480         rdata[0].len = SizeOfBtreeDeletePage;
01481         rdata[0].buffer = InvalidBuffer;
01482         rdata[0].next = nextrdata = &(rdata[1]);
01483 
01484         if (BufferIsValid(metabuf))
01485         {
01486             xlmeta.root = metad->btm_root;
01487             xlmeta.level = metad->btm_level;
01488             xlmeta.fastroot = metad->btm_fastroot;
01489             xlmeta.fastlevel = metad->btm_fastlevel;
01490 
01491             nextrdata->data = (char *) &xlmeta;
01492             nextrdata->len = sizeof(xl_btree_metadata);
01493             nextrdata->buffer = InvalidBuffer;
01494             nextrdata->next = nextrdata + 1;
01495             nextrdata++;
01496             xlinfo = XLOG_BTREE_DELETE_PAGE_META;
01497         }
01498         else if (parent_half_dead)
01499             xlinfo = XLOG_BTREE_DELETE_PAGE_HALF;
01500         else
01501             xlinfo = XLOG_BTREE_DELETE_PAGE;
01502 
01503         nextrdata->data = NULL;
01504         nextrdata->len = 0;
01505         nextrdata->next = nextrdata + 1;
01506         nextrdata->buffer = pbuf;
01507         nextrdata->buffer_std = true;
01508         nextrdata++;
01509 
01510         nextrdata->data = NULL;
01511         nextrdata->len = 0;
01512         nextrdata->buffer = rbuf;
01513         nextrdata->buffer_std = true;
01514         nextrdata->next = NULL;
01515 
01516         if (BufferIsValid(lbuf))
01517         {
01518             nextrdata->next = nextrdata + 1;
01519             nextrdata++;
01520             nextrdata->data = NULL;
01521             nextrdata->len = 0;
01522             nextrdata->buffer = lbuf;
01523             nextrdata->buffer_std = true;
01524             nextrdata->next = NULL;
01525         }
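        /*
         * To recap the chain just built: rdata[0] is the fixed-size
         * xl_btree_delete_page record; next comes xl_btree_metadata when
         * the metapage is changing; then one data-less entry per
         * parent/sibling buffer (pbuf, rbuf, lbuf if any).  Those buffer
         * entries carry no payload --- they exist so XLogInsert can attach
         * full-page images for those pages when needed.  The target page
         * itself is not registered, since redo can reconstruct the deleted
         * page from the record alone.
         */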
01526 
01527         recptr = XLogInsert(RM_BTREE_ID, xlinfo, rdata);
01528 
01529         if (BufferIsValid(metabuf))
01530         {
01531             PageSetLSN(metapg, recptr);
01532         }
01533         page = BufferGetPage(pbuf);
01534         PageSetLSN(page, recptr);
01535         page = BufferGetPage(rbuf);
01536         PageSetLSN(page, recptr);
01537         page = BufferGetPage(buf);
01538         PageSetLSN(page, recptr);
01539         if (BufferIsValid(lbuf))
01540         {
01541             page = BufferGetPage(lbuf);
01542             PageSetLSN(page, recptr);
01543         }
01544     }
01545 
01546     END_CRIT_SECTION();
01547 
01548     /* release metapage; send out relcache inval if metapage changed */
01549     if (BufferIsValid(metabuf))
01550     {
01551         CacheInvalidateRelcache(rel);
01552         _bt_relbuf(rel, metabuf);
01553     }
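    /*
     * The invalidation matters because backends cache metapage contents in
     * rel->rd_amcache (see _bt_getroot); flushing the relcache entry makes
     * them re-read the metapage and pick up the new fast root.
     */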
01554     /* can always release leftsib immediately */
01555     if (BufferIsValid(lbuf))
01556         _bt_relbuf(rel, lbuf);
01557 
01558     /*
01559      * If parent became half dead, recurse to delete it. Otherwise, if right
01560      * sibling is empty and is now the last child of the parent, recurse to
01561      * try to delete it.  (These cases cannot apply at the same time, though
01562      * the second case might itself recurse to the first.)
01563      *
01564      * When recursing to parent, we hold the lock on the target page until
01565      * done.  This delays any insertions into the keyspace that was just
01566      * effectively reassigned to the parent's right sibling.  If we allowed
01567      * that, and there were enough such insertions before we finish deleting
01568      * the parent, page splits within that keyspace could lead to inserting
01569      * out-of-order keys into the grandparent level.  It is thought that this
01570      * wouldn't have any serious consequences, but it still seems like a
01571      * pretty bad idea.
01572      */
01573     if (parent_half_dead)
01574     {
01575         /* recursive call will release pbuf */
01576         _bt_relbuf(rel, rbuf);
01577         result = _bt_pagedel(rel, pbuf, stack->bts_parent) + 1;
01578         _bt_relbuf(rel, buf);
01579     }
01580     else if (parent_one_child && rightsib_empty)
01581     {
01582         _bt_relbuf(rel, pbuf);
01583         _bt_relbuf(rel, buf);
01584         /* recursive call will release rbuf */
01585         result = _bt_pagedel(rel, rbuf, stack) + 1;
01586     }
01587     else
01588     {
01589         _bt_relbuf(rel, pbuf);
01590         _bt_relbuf(rel, buf);
01591         _bt_relbuf(rel, rbuf);
01592         result = 1;
01593     }
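    /*
     * result is the total number of pages deleted: one for the target,
     * plus whatever the recursive _bt_pagedel call (if any) deleted at the
     * parent or right-sibling level.
     */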
01594 
01595     return result;
01596 }