nbtsort.c

/*-------------------------------------------------------------------------
 *
 * nbtsort.c
 *      Build a btree from sorted input by loading leaf pages sequentially.
 *
 * NOTES
 *
 * We use tuplesort.c to sort the given index tuples into order.
 * Then we scan the index tuples in order and build the btree pages
 * for each level.  We load source tuples into leaf-level pages.
 * Whenever we fill a page at one level, we add a link to it in its
 * parent level (starting a new parent level if necessary).  When
 * done, we write out each final page on each level, adding it to
 * its parent level.  When we have only one page on a level, it must be
 * the root -- it can be attached to the btree metapage and we are done.
 *
 * This code is moderately slow (~10% slower) compared to the regular
 * btree (insertion) build code on sorted or well-clustered data.  On
 * random data, however, the insertion build code is unusable -- the
 * difference on a 60MB heap is a factor of 15 because the random
 * probes into the btree thrash the buffer pool.  (NOTE: the above
 * "10%" estimate is probably obsolete, since it refers to an old and
 * not very good external sort implementation that used to exist in
 * this module.  tuplesort.c is almost certainly faster.)
 *
 * It is not wise to pack the pages entirely full, since then *any*
 * insertion would cause a split (and not only of the leaf page; the need
 * for a split would cascade right up the tree).  The steady-state load
 * factor for btrees is usually estimated at 70%.  We choose to pack leaf
 * pages to the user-controllable fill factor (default 90%) while upper pages
 * are always packed to 70%.  This gives us reasonable density (there aren't
 * many upper pages if the keys are reasonable-size) without risking a lot of
 * cascading splits during early insertions.
 *
 * Formerly the index pages being built were kept in shared buffers, but
 * that is of no value (since other backends have no interest in them yet)
 * and it created locking problems for CHECKPOINT, because the upper-level
 * pages were held exclusive-locked for long periods.  Now we just build
 * the pages in local memory and smgrwrite or smgrextend them as we finish
 * them.  They will need to be re-read into shared buffers on first use after
 * the build finishes.
 *
 * Since the index will never be used unless it is completely built,
 * from a crash-recovery point of view there is no need to WAL-log the
 * steps of the build.  After completing the index build, we can just sync
 * the whole file to disk using smgrimmedsync() before exiting this module.
 * This can be seen to be sufficient for crash recovery by considering that
 * it's effectively equivalent to what would happen if a CHECKPOINT occurred
 * just after the index build.  However, it is clearly not sufficient if the
 * DBA is using the WAL log for PITR or replication purposes, since another
 * machine would not be able to reconstruct the index from WAL.  Therefore,
 * we log the completed index pages to WAL if and only if WAL archiving is
 * active.
 *
 * This code isn't concerned about the FSM at all.  The caller is responsible
 * for initializing that.
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/nbtree/nbtsort.c
 *
 *-------------------------------------------------------------------------
 */
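
/*
 * Overview of the build pipeline in this file (a road map; see each
 * function for details):
 *
 *   _bt_spoolinit()     -- create a BTSpool backed by a tuplesort
 *   _bt_spool()         -- feed one index tuple into the sort
 *   _bt_leafbuild()     -- perform the sort(s), then drive _bt_load()
 *   _bt_load()          -- read tuples back in order, _bt_buildadd() each
 *   _bt_uppershutdown() -- finish off every level and write the metapage
 *   _bt_spooldestroy()  -- release a spool when done
 */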

#include "postgres.h"

#include "access/heapam_xlog.h"
#include "access/nbtree.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "tcop/tcopprot.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"


/*
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 */
struct BTSpool
{
    Tuplesortstate *sortstate;  /* state data for tuplesort.c */
    Relation    heap;
    Relation    index;
    bool        isunique;
};

/*
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * The reason we need to store a copy of the minimum key is that we'll
 * need to propagate it to the parent node when this page is linked
 * into its parent.  However, if the page is not a leaf page, the first
 * entry on the page doesn't need to contain a key, so we will not have
 * stored the key itself on the page.  (You might think we could skip
 * copying the minimum key on leaf pages, but actually we must have a
 * writable copy anyway because we'll poke the page's address into it
 * before passing it up to the parent...)
 */
typedef struct BTPageState
{
    Page        btps_page;      /* workspace for page building */
    BlockNumber btps_blkno;     /* block # to write this page at */
    IndexTuple  btps_minkey;    /* copy of minimum key (first item) on page */
    OffsetNumber btps_lastoff;  /* last item offset loaded */
    uint32      btps_level;     /* tree level (0 = leaf) */
    Size        btps_full;      /* "full" if less than this much free space */
    struct BTPageState *btps_next;      /* link to parent level, if any */
} BTPageState;

/*
 * Overall status record for index writing phase.
 */
typedef struct BTWriteState
{
    Relation    heap;
    Relation    index;
    bool        btws_use_wal;   /* dump pages to WAL? */
    BlockNumber btws_pages_alloced;     /* # pages allocated */
    BlockNumber btws_pages_written;     /* # pages written out */
    Page        btws_zeropage;  /* workspace for filling zeroes */
} BTWriteState;
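
/*
 * (btws_pages_alloced can run ahead of btws_pages_written, since block
 * numbers are handed out when pages are started but pages are written only
 * when finished; _bt_blwritepage() zero-fills any gap in between.)
 */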


static Page _bt_blnewpage(uint32 level);
static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
static void _bt_slideleft(Page page);
static void _bt_sortaddtup(Page page, Size itemsize,
               IndexTuple itup, OffsetNumber itup_off);
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
             IndexTuple itup);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
static void _bt_load(BTWriteState *wstate,
         BTSpool *btspool, BTSpool *btspool2);


/*
 * Interface routines
 */


/*
 * create and initialize a spool structure
 */
BTSpool *
_bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
{
    BTSpool    *btspool = (BTSpool *) palloc0(sizeof(BTSpool));
    int         btKbytes;

    btspool->heap = heap;
    btspool->index = index;
    btspool->isunique = isunique;

    /*
     * We size the sort area as maintenance_work_mem rather than work_mem to
     * speed index creation.  This should be OK since a single backend can't
     * run multiple index creations in parallel.  Note that creation of a
     * unique index actually requires two BTSpool objects.  We expect that the
     * second one (for dead tuples) won't get very full, so we give it only
     * work_mem.
     */
    btKbytes = isdead ? work_mem : maintenance_work_mem;
    btspool->sortstate = tuplesort_begin_index_btree(heap, index, isunique,
                                                     btKbytes, false);

    return btspool;
}

/*
 * clean up a spool structure and its substructures.
 */
void
_bt_spooldestroy(BTSpool *btspool)
{
    tuplesort_end(btspool->sortstate);
    pfree(btspool);
}

/*
 * spool an index entry into the sort file.
 */
void
_bt_spool(IndexTuple itup, BTSpool *btspool)
{
    tuplesort_putindextuple(btspool->sortstate, itup);
}

/*
 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
 */
void
_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
{
    BTWriteState wstate;

#ifdef BTREE_BUILD_STATS
    if (log_btree_build_stats)
    {
        ShowUsage("BTREE BUILD (Spool) STATISTICS");
        ResetUsage();
    }
#endif   /* BTREE_BUILD_STATS */

    tuplesort_performsort(btspool->sortstate);
    if (btspool2)
        tuplesort_performsort(btspool2->sortstate);

    wstate.heap = btspool->heap;
    wstate.index = btspool->index;

    /*
     * We need to log index creation in WAL iff WAL archiving/streaming is
     * enabled, unless the index isn't WAL-logged anyway (e.g., it belongs
     * to a temporary or unlogged relation).
     */
    wstate.btws_use_wal = XLogIsNeeded() && RelationNeedsWAL(wstate.index);

    /* reserve the metapage */
    wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
    wstate.btws_pages_written = 0;
    wstate.btws_zeropage = NULL;    /* until needed */

    _bt_load(&wstate, btspool, btspool2);
}
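
/*
 * Typical caller usage (a sketch; in PostgreSQL the actual driver is
 * btbuild() in nbtree.c, which feeds tuples from its heap-scan callback):
 *
 *     BTSpool    *spool = _bt_spoolinit(heap, index, isunique, false);
 *
 *     ... for each index tuple: _bt_spool(itup, spool); ...
 *
 *     _bt_leafbuild(spool, NULL);   -- second spool is passed only when
 *     _bt_spooldestroy(spool);      -- uniqueness checking requires one
 */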


/*
 * Internal routines.
 */


/*
 * allocate workspace for a new, clean btree page, not linked to any siblings.
 */
static Page
_bt_blnewpage(uint32 level)
{
    Page        page;
    BTPageOpaque opaque;

    page = (Page) palloc(BLCKSZ);

    /* Zero the page and set up standard page header info */
    _bt_pageinit(page, BLCKSZ);

    /* Initialize BT opaque state */
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    opaque->btpo_prev = opaque->btpo_next = P_NONE;
    opaque->btpo.level = level;
    opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
    opaque->btpo_cycleid = 0;

    /* Make the P_HIKEY line pointer appear allocated */
    ((PageHeader) page)->pd_lower += sizeof(ItemIdData);

    return page;
}

/*
 * emit a completed btree page, and release the working storage.
 */
static void
_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
{
    /* Ensure rd_smgr is open (could have been closed by relcache flush!) */
    RelationOpenSmgr(wstate->index);

    /* XLOG stuff */
    if (wstate->btws_use_wal)
    {
        /* We use the heap NEWPAGE record type for this */
        log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page);
    }

    /*
     * If we have to write pages nonsequentially, fill in the space with
     * zeroes until we come back and overwrite.  This is not logically
     * necessary on standard Unix filesystems (unwritten space will read as
     * zeroes anyway), but it should help to avoid fragmentation.  The dummy
     * pages aren't WAL-logged though.
     */
    while (blkno > wstate->btws_pages_written)
    {
        if (!wstate->btws_zeropage)
            wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
        /* don't set checksum for all-zero page */
        smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM,
                   wstate->btws_pages_written++,
                   (char *) wstate->btws_zeropage,
                   true);
    }

    PageSetChecksumInplace(page, blkno);

    /*
     * Now write the page.  There's no need for smgr to schedule an fsync for
     * this write; we'll do it ourselves before ending the build.
     */
    if (blkno == wstate->btws_pages_written)
    {
        /* extending the file... */
        smgrextend(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
                   (char *) page, true);
        wstate->btws_pages_written++;
    }
    else
    {
        /* overwriting a block we zero-filled before */
        smgrwrite(wstate->index->rd_smgr, MAIN_FORKNUM, blkno,
                  (char *) page, true);
    }

    pfree(page);
}
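
/*
 * (Nonsequential writes arise because a block number is assigned to a page
 * when it is started -- see _bt_pagestate() and _bt_buildadd() -- but the
 * page is written only once it is finished, so an upper-level page can be
 * written long after leaf pages with higher block numbers.)
 */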

/*
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
 */
static BTPageState *
_bt_pagestate(BTWriteState *wstate, uint32 level)
{
    BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));

    /* create initial page for level */
    state->btps_page = _bt_blnewpage(level);

    /* and assign it a page position */
    state->btps_blkno = wstate->btws_pages_alloced++;

    state->btps_minkey = NULL;
    /* initialize lastoff so first item goes into P_FIRSTKEY */
    state->btps_lastoff = P_HIKEY;
    state->btps_level = level;
    /* set "full" threshold based on level.  See notes at head of file. */
    if (level > 0)
        state->btps_full = (BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100);
    else
        state->btps_full = RelationGetTargetPageFreeSpace(wstate->index,
                                                   BTREE_DEFAULT_FILLFACTOR);
    /* no parent level, yet */
    state->btps_next = NULL;

    return state;
}
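
/*
 * Example of the "full" threshold arithmetic above: assuming the default
 * 8192-byte BLCKSZ and BTREE_NONLEAF_FILLFACTOR of 70, an upper page is
 * treated as full once its free space drops below 8192 * 30 / 100 = 2457
 * bytes, i.e. once the page is roughly 70% packed.
 */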

/*
 * slide an array of ItemIds back one slot (from P_FIRSTKEY to
 * P_HIKEY, overwriting P_HIKEY).  we need to do this when we discover
 * that we have built an ItemId array in what has turned out to be a
 * P_RIGHTMOST page.
 */
static void
_bt_slideleft(Page page)
{
    OffsetNumber off;
    OffsetNumber maxoff;
    ItemId      previi;
    ItemId      thisii;

    if (!PageIsEmpty(page))
    {
        maxoff = PageGetMaxOffsetNumber(page);
        previi = PageGetItemId(page, P_HIKEY);
        for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
        {
            thisii = PageGetItemId(page, off);
            *previi = *thisii;
            previi = thisii;
        }
        ((PageHeader) page)->pd_lower -= sizeof(ItemIdData);
    }
}

/*
 * Add an item to a page being built.
 *
 * The main difference between this routine and a bare PageAddItem call
 * is that this code knows that the leftmost data item on a non-leaf
 * btree page doesn't need to have a key.  Therefore, it strips such
 * items down to just the item header.  (A descent through an internal
 * page treats its first data key as minus infinity and never compares
 * against it, so the key bytes would be dead weight.)
 *
 * This is almost like nbtinsert.c's _bt_pgaddtup(), but we can't use
 * that because it assumes that P_RIGHTMOST() will return the correct
 * answer for the page.  Here, we don't know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always the first data key.
 */
static void
_bt_sortaddtup(Page page,
               Size itemsize,
               IndexTuple itup,
               OffsetNumber itup_off)
{
    BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    IndexTupleData trunctuple;

    if (!P_ISLEAF(opaque) && itup_off == P_FIRSTKEY)
    {
        trunctuple = *itup;
        trunctuple.t_info = sizeof(IndexTupleData);
        itup = &trunctuple;
        itemsize = sizeof(IndexTupleData);
    }

    if (PageAddItem(page, (Item) itup, itemsize, itup_off,
                    false, false) == InvalidOffsetNumber)
        elog(ERROR, "failed to add item to the index page");
}

/*----------
 * Add an item to a disk page from the sort output.
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *   stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |          ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.
 *
 * 'last' pointer indicates the last offset added to the page.
 *----------
 */
static void
_bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
{
    Page        npage;
    BlockNumber nblkno;
    OffsetNumber last_off;
    Size        pgspc;
    Size        itupsz;

    /*
     * This is a handy place to check for cancel interrupts during the btree
     * load phase of index creation.
     */
    CHECK_FOR_INTERRUPTS();

    npage = state->btps_page;
    nblkno = state->btps_blkno;
    last_off = state->btps_lastoff;

    pgspc = PageGetFreeSpace(npage);
    itupsz = IndexTupleDSize(*itup);
    itupsz = MAXALIGN(itupsz);

    /*
     * Check whether the item can fit on a btree page at all. (Eventually, we
     * ought to try to apply TOAST methods if not.) We actually need to be
     * able to fit three items on every page, so restrict any one item to 1/3
     * the per-page available space. Note that at this point, itupsz doesn't
     * include the ItemId.
     *
     * NOTE: similar code appears in _bt_insertonpg() to defend against
     * oversize items being inserted into an already-existing index. But
     * during creation of an index, we don't go through there.
     */
    if (itupsz > BTMaxItemSize(npage))
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
            errmsg("index row size %lu exceeds maximum %lu for index \"%s\"",
                   (unsigned long) itupsz,
                   (unsigned long) BTMaxItemSize(npage),
                   RelationGetRelationName(wstate->index)),
        errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
                "Consider a function index of an MD5 hash of the value, "
                "or use full text indexing."),
                 errtableconstraint(wstate->heap,
                                    RelationGetRelationName(wstate->index))));

    /*
     * Check to see if page is "full".  It's definitely full if the item won't
     * fit.  Otherwise, compare to the target freespace derived from the
     * fillfactor.  However, we must put at least two items on each page, so
     * disregard fillfactor if we don't have that many.
     */
    if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
    {
        /*
         * Finish off the page and write it out.
         */
        Page        opage = npage;
        BlockNumber oblkno = nblkno;
        ItemId      ii;
        ItemId      hii;
        IndexTuple  oitup;

        /* Create new page of same level */
        npage = _bt_blnewpage(state->btps_level);

        /* and assign it a page position */
        nblkno = wstate->btws_pages_alloced++;

        /*
         * We copy the last item on the page into the new page, and then
         * rearrange the old page so that the 'last item' becomes its high key
         * rather than a true data item.  There had better be at least two
         * items on the page already, else the page would be empty of useful
         * data.
         */
        Assert(last_off > P_FIRSTKEY);
        ii = PageGetItemId(opage, last_off);
        oitup = (IndexTuple) PageGetItem(opage, ii);
        _bt_sortaddtup(npage, ItemIdGetLength(ii), oitup, P_FIRSTKEY);

        /*
         * Move 'last' into the high key position on opage
         */
        hii = PageGetItemId(opage, P_HIKEY);
        *hii = *ii;
        ItemIdSetUnused(ii);    /* redundant */
        ((PageHeader) opage)->pd_lower -= sizeof(ItemIdData);

        /*
         * Link the old page into its parent, using its minimum key.  If we
         * don't have a parent, we have to create one; this adds a new btree
         * level.
         */
        if (state->btps_next == NULL)
            state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);

        Assert(state->btps_minkey != NULL);
        ItemPointerSet(&(state->btps_minkey->t_tid), oblkno, P_HIKEY);
        _bt_buildadd(wstate, state->btps_next, state->btps_minkey);
        pfree(state->btps_minkey);

        /*
         * Save a copy of the minimum key for the new page.  We have to copy
         * it off the old page, not the new one, in case we are not at leaf
         * level.
         */
        state->btps_minkey = CopyIndexTuple(oitup);

        /*
         * Set the sibling links for both pages.
         */
        {
            BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(opage);
            BTPageOpaque nopaque = (BTPageOpaque) PageGetSpecialPointer(npage);

            oopaque->btpo_next = nblkno;
            nopaque->btpo_prev = oblkno;
            nopaque->btpo_next = P_NONE;        /* redundant */
        }

        /*
         * Write out the old page.  We never need to touch it again, so we can
         * free the opage workspace too.
         */
        _bt_blwritepage(wstate, opage, oblkno);

        /*
         * Reset last_off to point to new page
         */
        last_off = P_FIRSTKEY;
    }

    /*
     * If the new item is the first for its page, stash a copy for later.  Note
     * this will only happen for the first item on a level; on later pages,
     * the first item for a page is copied from the prior page in the code
     * above.
     */
    if (last_off == P_HIKEY)
    {
        Assert(state->btps_minkey == NULL);
        state->btps_minkey = CopyIndexTuple(itup);
    }

    /*
     * Add the new item into the current page.
     */
    last_off = OffsetNumberNext(last_off);
    _bt_sortaddtup(npage, itupsz, itup, last_off);

    state->btps_page = npage;
    state->btps_blkno = nblkno;
    state->btps_lastoff = last_off;
}
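
/*
 * (When a page fills, _bt_buildadd() recurses into the parent level via
 * state->btps_next, so its recursion depth is bounded by the height of
 * the finished btree -- in practice just a few levels.)
 */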

/*
 * Finish writing out the completed btree.
 */
static void
_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
{
    BTPageState *s;
    BlockNumber rootblkno = P_NONE;
    uint32      rootlevel = 0;
    Page        metapage;

    /*
     * Each iteration of this loop completes one more level of the tree.
     */
    for (s = state; s != NULL; s = s->btps_next)
    {
        BlockNumber blkno;
        BTPageOpaque opaque;

        blkno = s->btps_blkno;
        opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page);

        /*
         * We have to link the last page on this level to somewhere.
         *
         * If we're at the top, it's the root, so attach it to the metapage.
         * Otherwise, add an entry for it to its parent using its minimum key.
         * This may cause the last page of the parent level to split, but
         * that's not a problem -- we haven't gotten to it yet.
         */
        if (s->btps_next == NULL)
        {
            opaque->btpo_flags |= BTP_ROOT;
            rootblkno = blkno;
            rootlevel = s->btps_level;
        }
        else
        {
            Assert(s->btps_minkey != NULL);
            ItemPointerSet(&(s->btps_minkey->t_tid), blkno, P_HIKEY);
            _bt_buildadd(wstate, s->btps_next, s->btps_minkey);
            pfree(s->btps_minkey);
            s->btps_minkey = NULL;
        }

        /*
         * This is the rightmost page, so the ItemId array needs to be slid
         * back one slot.  Then we can dump out the page.
         */
        _bt_slideleft(s->btps_page);
        _bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
        s->btps_page = NULL;    /* writepage freed the workspace */
    }

    /*
     * As the last step in the process, construct the metapage and make it
     * point to the new root (unless we had no data at all, in which case it's
     * set to point to "P_NONE").  This changes the index to the "valid" state
     * by filling in a valid magic number in the metapage.
     */
    metapage = (Page) palloc(BLCKSZ);
    _bt_initmetapage(metapage, rootblkno, rootlevel);
    _bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
}

/*
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
 */
static void
_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
{
    BTPageState *state = NULL;
    bool        merge = (btspool2 != NULL);
    IndexTuple  itup,
                itup2 = NULL;
    bool        should_free,
                should_free2,
                load1;
    TupleDesc   tupdes = RelationGetDescr(wstate->index);
    int         i,
                keysz = RelationGetNumberOfAttributes(wstate->index);
    ScanKey     indexScanKey = NULL;

    if (merge)
    {
        /*
         * Another BTSpool for dead tuples exists.  Now we have to merge
         * btspool and btspool2.
         */

        /* the preparation of merge */
        itup = tuplesort_getindextuple(btspool->sortstate,
                                       true, &should_free);
        itup2 = tuplesort_getindextuple(btspool2->sortstate,
                                        true, &should_free2);
        indexScanKey = _bt_mkscankey_nodata(wstate->index);

        for (;;)
        {
            load1 = true;       /* load from btspool next? */
            if (itup2 == NULL)
            {
                if (itup == NULL)
                    break;
            }
            else if (itup != NULL)
            {
                for (i = 1; i <= keysz; i++)
                {
                    ScanKey     entry;
                    Datum       attrDatum1,
                                attrDatum2;
                    bool        isNull1,
                                isNull2;
                    int32       compare;

                    entry = indexScanKey + i - 1;
                    attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
                    attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);
                    if (isNull1)
                    {
                        if (isNull2)
                            compare = 0;        /* NULL "=" NULL */
                        else if (entry->sk_flags & SK_BT_NULLS_FIRST)
                            compare = -1;       /* NULL "<" NOT_NULL */
                        else
                            compare = 1;        /* NULL ">" NOT_NULL */
                    }
                    else if (isNull2)
                    {
                        if (entry->sk_flags & SK_BT_NULLS_FIRST)
                            compare = 1;        /* NOT_NULL ">" NULL */
                        else
                            compare = -1;       /* NOT_NULL "<" NULL */
                    }
                    else
                    {
                        compare =
                            DatumGetInt32(FunctionCall2Coll(&entry->sk_func,
                                                         entry->sk_collation,
                                                            attrDatum1,
                                                            attrDatum2));

                        if (entry->sk_flags & SK_BT_DESC)
                            compare = -compare;
                    }
                    if (compare > 0)
                    {
                        load1 = false;
                        break;
                    }
                    else if (compare < 0)
                        break;
                }
            }
            else
                load1 = false;

            /* When we see the first tuple, create the first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            if (load1)
            {
                _bt_buildadd(wstate, state, itup);
                if (should_free)
                    pfree(itup);
                itup = tuplesort_getindextuple(btspool->sortstate,
                                               true, &should_free);
            }
            else
            {
                _bt_buildadd(wstate, state, itup2);
                if (should_free2)
                    pfree(itup2);
                itup2 = tuplesort_getindextuple(btspool2->sortstate,
                                                true, &should_free2);
            }
        }
        _bt_freeskey(indexScanKey);
    }
    else
    {
        /* merge is unnecessary */
        while ((itup = tuplesort_getindextuple(btspool->sortstate,
                                               true, &should_free)) != NULL)
        {
            /* When we see the first tuple, create the first index page */
            if (state == NULL)
                state = _bt_pagestate(wstate, 0);

            _bt_buildadd(wstate, state, itup);
            if (should_free)
                pfree(itup);
        }
    }

    /* Close down final pages and write the metapage */
    _bt_uppershutdown(wstate, state);

    /*
     * If the index is WAL-logged, we must fsync it down to disk before it's
     * safe to commit the transaction.  (For a non-WAL-logged index we don't
     * care since the index will be uninteresting after a crash anyway.)
     *
     * It's obvious that we must do this when not WAL-logging the build.  It's
     * less obvious that we have to do it even if we did WAL-log the index
     * pages.  The reason is that since we're building outside shared buffers,
     * a CHECKPOINT occurring during the build has no way to flush the
     * previously written data to disk (indeed it won't know the index even
     * exists).  A crash later on would replay WAL from the checkpoint,
     * therefore it wouldn't replay our earlier WAL entries.  If we do not
     * fsync those pages here, they might still not be on disk when the crash
     * occurs.
     */
    if (RelationNeedsWAL(wstate->index))
    {
        RelationOpenSmgr(wstate->index);
        smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
    }
}