Header And Logo

PostgreSQL
| The world's most advanced open source database.

gistbuild.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * gistbuild.c
00004  *    build algorithm for GiST indexes implementation.
00005  *
00006  *
00007  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00008  * Portions Copyright (c) 1994, Regents of the University of California
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/access/gist/gistbuild.c
00012  *
00013  *-------------------------------------------------------------------------
00014  */
00015 #include "postgres.h"
00016 
00017 #include <math.h>
00018 
00019 #include "access/genam.h"
00020 #include "access/gist_private.h"
00021 #include "catalog/index.h"
00022 #include "miscadmin.h"
00023 #include "optimizer/cost.h"
00024 #include "storage/bufmgr.h"
00025 #include "storage/smgr.h"
00026 #include "utils/memutils.h"
00027 #include "utils/rel.h"
00028 
/*
 * Number of index tuples to process between checks of whether to switch to
 * buffering build mode (the check calls smgrnblocks(), so we don't want to
 * do it for every tuple).
 */
#define BUFFERING_MODE_SWITCH_CHECK_STEP 256

/*
 * Number of tuples to process in the slow way before switching to buffering
 * mode, when buffering is explicitly turned on. Also, the number of tuples
 * to process between readjusting the buffer size parameter, while in
 * buffering mode.
 */
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
00039 
/*
 * State of the buffering build algorithm. A build starts in DISABLED, AUTO
 * or STATS mode depending on the "buffering" reloption, and can move to
 * ACTIVE via gistInitBuffering() (or fall back to DISABLED if there is not
 * enough memory for a buffering build).
 */
typedef enum
{
    GIST_BUFFERING_DISABLED,    /* in regular build mode and aren't going to
                                 * switch */
    GIST_BUFFERING_AUTO,        /* in regular build mode, but will switch to
                                 * buffering build mode if the index grows too
                                 * big */
    GIST_BUFFERING_STATS,       /* gathering statistics of index tuple size
                                 * before switching to the buffering build
                                 * mode */
    GIST_BUFFERING_ACTIVE       /* in buffering build mode */
} GistBufferingMode;
00052 
/* Working state for gistbuild and its callback */
typedef struct
{
    Relation    indexrel;       /* index relation being built */
    GISTSTATE  *giststate;      /* GiST opclass support info; its tempCxt is
                                 * reset once per heap tuple */

    int64       indtuples;      /* number of tuples indexed */
    int64       indtuplesSize;  /* total size of all indexed tuples */

    Size        freespace;      /* amount of free space to leave on pages */

    /*
     * Extra data structures used during a buffering build. 'gfbb' contains
     * information related to managing the build buffers. 'parentMap' is a
     * lookup table of the parent of each internal page. Both are set up by
     * gistInitBuffering() and are only valid while bufferingMode is
     * GIST_BUFFERING_ACTIVE.
     */
    GISTBuildBuffers *gfbb;
    HTAB       *parentMap;

    GistBufferingMode bufferingMode;    /* current mode, see enum above */
} GISTBuildState;
00074 
00075 /* prototypes for private functions */
00076 static void gistInitBuffering(GISTBuildState *buildstate);
00077 static int  calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
00078 static void gistBuildCallback(Relation index,
00079                   HeapTuple htup,
00080                   Datum *values,
00081                   bool *isnull,
00082                   bool tupleIsAlive,
00083                   void *state);
00084 static void gistBufferingBuildInsert(GISTBuildState *buildstate,
00085                          IndexTuple itup);
00086 static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
00087                 BlockNumber startblkno, int startlevel);
00088 static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
00089                           Buffer buffer, int level,
00090                           IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
00091                           BlockNumber parentblk, OffsetNumber downlinkoffnum);
00092 static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
00093                                BlockNumber childblkno, int level,
00094                                BlockNumber *parentblk,
00095                                OffsetNumber *downlinkoffnum);
00096 static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
00097 static void gistEmptyAllBuffers(GISTBuildState *buildstate);
00098 static int  gistGetMaxLevel(Relation index);
00099 
00100 static void gistInitParentMap(GISTBuildState *buildstate);
00101 static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
00102                    BlockNumber parent);
00103 static void gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parent);
00104 static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);
00105 
/*
 * Main entry point to GiST index build. Initially calls insert over and over,
 * but switches to more efficient buffering build algorithm after a certain
 * number of tuples (unless buffering mode is disabled).
 */
Datum
gistbuild(PG_FUNCTION_ARGS)
{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    double      reltuples;
    GISTBuildState buildstate;
    Buffer      buffer;
    Page        page;
    MemoryContext oldcxt = CurrentMemoryContext;
    int         fillfactor;

    buildstate.indexrel = index;
    if (index->rd_options)
    {
        /* Get buffering mode from the options string */
        GiSTOptions *options = (GiSTOptions *) index->rd_options;
        char       *bufferingMode = (char *) options + options->bufferingModeOffset;

        /*
         * "on" doesn't start buffering immediately: we first gather tuple
         * size statistics in STATS mode, since the buffer-sizing math in
         * gistInitBuffering() needs average tuple sizes.
         */
        if (strcmp(bufferingMode, "on") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_STATS;
        else if (strcmp(bufferingMode, "off") == 0)
            buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
        else
            buildstate.bufferingMode = GIST_BUFFERING_AUTO;

        fillfactor = options->fillfactor;
    }
    else
    {
        /*
         * By default, switch to buffering mode when the index grows too large
         * to fit in cache.
         */
        buildstate.bufferingMode = GIST_BUFFERING_AUTO;
        fillfactor = GIST_DEFAULT_FILLFACTOR;
    }
    /* Calculate target amount of free space to leave on pages */
    buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* no locking is needed */
    buildstate.giststate = initGISTstate(index);

    /*
     * Create a temporary memory context that is reset once for each tuple
     * processed.  (Note: we don't bother to make this a child of the
     * giststate's scanCxt, so we have to delete it separately at the end.)
     */
    buildstate.giststate->tempCxt = createTempGistContext();

    /* initialize the root page */
    buffer = gistNewBuffer(index);
    Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
    page = BufferGetPage(buffer);

    START_CRIT_SECTION();

    /* The empty root starts out as a leaf page */
    GISTInitBuffer(buffer, F_LEAF);

    MarkBufferDirty(buffer);

    if (RelationNeedsWAL(index))
    {
        XLogRecPtr  recptr;
        XLogRecData rdata;

        rdata.data = (char *) &(index->rd_node);
        rdata.len = sizeof(RelFileNode);
        rdata.buffer = InvalidBuffer;
        rdata.next = NULL;

        recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
        PageSetLSN(page, recptr);
    }
    else
        PageSetLSN(page, gistGetFakeLSN(heap));

    UnlockReleaseBuffer(buffer);

    END_CRIT_SECTION();

    /* build the index */
    buildstate.indtuples = 0;
    buildstate.indtuplesSize = 0;

    /*
     * Do the heap scan.  gistBuildCallback is invoked per live heap tuple and
     * may switch us into buffering mode along the way.
     */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   gistBuildCallback, (void *) &buildstate);

    /*
     * If buffering was used, flush out all the tuples that are still in the
     * buffers.  (gfbb exists only if we actually reached ACTIVE mode.)
     */
    if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
    {
        elog(DEBUG1, "all tuples processed, emptying buffers");
        gistEmptyAllBuffers(&buildstate);
        gistFreeBuildBuffers(buildstate.gfbb);
    }

    /* okay, all heap tuples are indexed */
    MemoryContextSwitchTo(oldcxt);
    MemoryContextDelete(buildstate.giststate->tempCxt);

    freeGISTstate(buildstate.giststate);

    /*
     * Return statistics
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = (double) buildstate.indtuples;

    PG_RETURN_POINTER(result);
}
00239 
00240 /*
00241  * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
00242  * and "auto" values.
00243  */
00244 void
00245 gistValidateBufferingOption(char *value)
00246 {
00247     if (value == NULL ||
00248         (strcmp(value, "on") != 0 &&
00249          strcmp(value, "off") != 0 &&
00250          strcmp(value, "auto") != 0))
00251     {
00252         ereport(ERROR,
00253                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
00254                  errmsg("invalid value for \"buffering\" option"),
00255               errdetail("Valid values are \"on\", \"off\", and \"auto\".")));
00256     }
00257 }
00258 
/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
 * GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
    Relation    index = buildstate->indexrel;
    int         pagesPerBuffer;
    Size        pageFreeSpace;
    Size        itupAvgSize,
                itupMinSize;
    double      avgIndexTuplesPerPage,
                maxIndexTuplesPerPage;
    int         i;
    int         levelStep;

    /* Calc space of index page which is available for index tuples */
    pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
        - sizeof(ItemIdData)
        - buildstate->freespace;

    /*
     * Calculate average size of already inserted index tuples using gathered
     * statistics.
     */
    itupAvgSize = (double) buildstate->indtuplesSize /
        (double) buildstate->indtuples;

    /*
     * Calculate minimal possible size of index tuple by index metadata.
     * Minimal possible size of varlena is VARHDRSZ.
     *
     * XXX: that's not actually true, as a short varlen can be just 2 bytes.
     * And we should take padding into account here.
     */
    itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
    for (i = 0; i < index->rd_att->natts; i++)
    {
        if (index->rd_att->attrs[i]->attlen < 0)
            itupMinSize += VARHDRSZ;
        else
            itupMinSize += index->rd_att->attrs[i]->attlen;
    }

    /* Calculate average and maximal number of index tuples which fit to page */
    avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
    maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;

    /*
     * We need to calculate two parameters for the buffering algorithm:
     * levelStep and pagesPerBuffer.
     *
     * levelStep determines the size of subtree that we operate on, while
     * emptying a buffer. A higher value is better, as you need fewer buffer
     * emptying steps to build the index. However, if you set it too high, the
     * subtree doesn't fit in cache anymore, and you quickly lose the benefit
     * of the buffers.
     *
     * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
     * the number of tuples on page (ie. fanout), and M is the amount of
     * internal memory available. Curiously, they don't explain *why* that
     * setting is optimal. We calculate it by taking the highest levelStep so
     * that a subtree still fits in cache. For a small B, our way of
     * calculating levelStep is very close to Arge et al's formula. For a
     * large B, our formula gives a value that is 2x higher.
     *
     * The average size (in pages) of a subtree of depth n can be calculated
     * as a geometric series:
     *
     * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
     *
     * where B is the average number of index tuples on page. The subtree is
     * cached in the shared buffer cache and the OS cache, so we choose
     * levelStep so that the subtree size is comfortably smaller than
     * effective_cache_size, with a safety factor of 4.
     *
     * The estimate on the average number of index tuples on page is based on
     * average tuple sizes observed before switching to buffered build, so the
     * real subtree size can be somewhat larger. Also, it would be selfish to
     * gobble the whole cache for our index build. The safety factor of 4
     * should account for those effects.
     *
     * The other limiting factor for setting levelStep is that while
     * processing a subtree, we need to hold one page for each buffer at the
     * next lower buffered level. The max. number of buffers needed for that
     * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
     * hopefully maintenance_work_mem is set high enough that you're
     * constrained by effective_cache_size rather than maintenance_work_mem.
     *
     * XXX: the buffer hash table consumes a fair amount of memory too per
     * buffer, but that is not currently taken into account. That scales on
     * the total number of buffers used, ie. the index size and on levelStep.
     * Note that a higher levelStep *reduces* the amount of memory needed for
     * the hash table.
     */
    levelStep = 1;
    for (;;)
    {
        double      subtreesize;
        double      maxlowestlevelpages;

        /* size of an average subtree at this levelStep (in pages). */
        subtreesize =
            (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
            (1 - avgIndexTuplesPerPage);

        /* max number of pages at the lowest level of a subtree */
        maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

        /* subtree must fit in cache (with safety factor of 4) */
        if (subtreesize > effective_cache_size / 4)
            break;

        /* each node in the lowest level of a subtree has one page in memory */
        if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
            break;

        /* Good, we can handle this levelStep. See if we can go one higher. */
        levelStep++;
    }

    /*
     * We just reached an unacceptable value of levelStep in previous loop.
     * So, decrease levelStep to get last acceptable value.
     */
    levelStep--;

    /*
     * If there's not enough cache or maintenance_work_mem, fall back to plain
     * inserts.
     */
    if (levelStep <= 0)
    {
        elog(DEBUG1, "failed to switch to buffered GiST build");
        buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
        return;
    }

    /*
     * The second parameter to set is pagesPerBuffer, which determines the
     * size of each buffer. We adjust pagesPerBuffer also during the build,
     * which is why this calculation is in a separate function.
     */
    pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

    /* Initialize GISTBuildBuffers with these parameters */
    buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
                                            gistGetMaxLevel(index));

    gistInitParentMap(buildstate);

    buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;

    elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
         levelStep, pagesPerBuffer);
}
00420 
00421 /*
00422  * Calculate pagesPerBuffer parameter for the buffering algorithm.
00423  *
00424  * Buffer size is chosen so that assuming that tuples are distributed
00425  * randomly, emptying half a buffer fills on average one page in every buffer
00426  * at the next lower level.
00427  */
00428 static int
00429 calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
00430 {
00431     double      pagesPerBuffer;
00432     double      avgIndexTuplesPerPage;
00433     double      itupAvgSize;
00434     Size        pageFreeSpace;
00435 
00436     /* Calc space of index page which is available for index tuples */
00437     pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
00438         - sizeof(ItemIdData)
00439         - buildstate->freespace;
00440 
00441     /*
00442      * Calculate average size of already inserted index tuples using gathered
00443      * statistics.
00444      */
00445     itupAvgSize = (double) buildstate->indtuplesSize /
00446         (double) buildstate->indtuples;
00447 
00448     avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
00449 
00450     /*
00451      * Recalculate required size of buffers.
00452      */
00453     pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
00454 
00455     return (int) rint(pagesPerBuffer);
00456 }
00457 
/*
 * Per-tuple callback from IndexBuildHeapScan.
 *
 * Forms an index tuple for the heap tuple and inserts it either directly
 * (regular mode) or via the build buffers (buffering mode).  Also drives the
 * mode transitions: periodic buffer-size readjustment while buffering, and
 * the switch into buffering mode for the AUTO/STATS cases.
 */
static void
gistBuildCallback(Relation index,
                  HeapTuple htup,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    GISTBuildState *buildstate = (GISTBuildState *) state;
    IndexTuple  itup;
    MemoryContext oldCtx;

    /* All per-tuple allocations live in tempCxt, reset below */
    oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

    /* form an index tuple and point it at the heap tuple */
    itup = gistFormTuple(buildstate->giststate, index, values, isnull, true);
    itup->t_tid = htup->t_self;

    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
    {
        /* We have buffers, so use them. */
        gistBufferingBuildInsert(buildstate, itup);
    }
    else
    {
        /*
         * There's no buffers (yet). Since we already have the index relation
         * locked, we call gistdoinsert directly.
         */
        gistdoinsert(index, itup, buildstate->freespace,
                     buildstate->giststate);
    }

    /* Update tuple count and total size. */
    buildstate->indtuples += 1;
    buildstate->indtuplesSize += IndexTupleSize(itup);

    /* note: itup is invalid after this reset */
    MemoryContextSwitchTo(oldCtx);
    MemoryContextReset(buildstate->giststate->tempCxt);

    if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
        buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
    {
        /* Adjust the target buffer size now */
        buildstate->gfbb->pagesPerBuffer =
            calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
    }

    /*
     * In 'auto' mode, check if the index has grown too large to fit in cache,
     * and switch to buffering mode if it has.
     *
     * To avoid excessive calls to smgrnblocks(), only check this every
     * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
     *
     * In 'stats' mode, switch as soon as we have gathered enough tuple-size
     * statistics for the buffer-sizing calculations.
     */
    if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
         buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
         effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
        (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
         buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
    {
        /*
         * Index doesn't fit in effective cache anymore. Try to switch to
         * buffering build mode.
         */
        gistInitBuffering(buildstate);
    }
}
00529 
00530 /*
00531  * Insert function for buffering index build.
00532  */
00533 static void
00534 gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
00535 {
00536     /* Insert the tuple to buffers. */
00537     gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);
00538 
00539     /* If we filled up (half of a) buffer, process buffer emptying. */
00540     gistProcessEmptyingQueue(buildstate);
00541 }
00542 
/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop buffer emptying process (because one of child buffers can't take
 * index tuples anymore).
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
                BlockNumber startblkno, int startlevel)
{
    GISTSTATE  *giststate = buildstate->giststate;
    GISTBuildBuffers *gfbb = buildstate->gfbb;
    Relation    indexrel = buildstate->indexrel;
    BlockNumber childblkno;
    Buffer      buffer;
    bool        result = false;
    BlockNumber blkno;
    int         level;
    OffsetNumber downlinkoffnum = InvalidOffsetNumber;
    BlockNumber parentblkno = InvalidBlockNumber;

    CHECK_FOR_INTERRUPTS();

    /*
     * Loop until we reach a leaf page (level == 0) or a level with buffers
     * (not including the level we start at, because we would otherwise make
     * no progress).
     */
    blkno = startblkno;
    level = startlevel;
    for (;;)
    {
        ItemId      iid;
        IndexTuple  idxtuple,
                    newtup;
        Page        page;
        OffsetNumber childoffnum;

        /* Have we reached a level with buffers? */
        if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
            break;

        /* Have we reached a leaf page? */
        if (level == 0)
            break;

        /*
         * Nope. Descend down to the next level then. Choose a child to
         * descend down to.
         */

        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);

        page = (Page) BufferGetPage(buffer);
        childoffnum = gistchoose(indexrel, page, itup, giststate);
        iid = PageGetItemId(page, childoffnum);
        idxtuple = (IndexTuple) PageGetItem(page, iid);
        childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

        /* The parent map only tracks parents of internal pages (level > 0) */
        if (level > 1)
            gistMemorizeParent(buildstate, childblkno, blkno);

        /*
         * Check that the key representing the target child node is consistent
         * with the key we're inserting. Update it if it's not.
         */
        newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
        if (newtup)
        {
            blkno = gistbufferinginserttuples(buildstate, buffer, level,
                                              &newtup, 1, childoffnum,
                                     InvalidBlockNumber, InvalidOffsetNumber);
            /* gistbufferinginserttuples() released the buffer */
        }
        else
            UnlockReleaseBuffer(buffer);

        /* Descend to the child */
        parentblkno = blkno;
        blkno = childblkno;
        downlinkoffnum = childoffnum;
        Assert(level > 0);
        level--;
    }

    if (LEVEL_HAS_BUFFERS(level, gfbb))
    {
        /*
         * We've reached level with buffers. Place the index tuple to the
         * buffer, and add the buffer to the emptying queue if it overflows.
         */
        GISTNodeBuffer *childNodeBuffer;

        /* Find the buffer or create a new one */
        childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);

        /* Add index tuple to it */
        gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

        /* Tell caller to pause emptying if this child buffer overflowed */
        if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
            result = true;
    }
    else
    {
        /*
         * We've reached a leaf page. Place the tuple here.
         */
        Assert(level == 0);
        buffer = ReadBuffer(indexrel, blkno);
        LockBuffer(buffer, GIST_EXCLUSIVE);
        gistbufferinginserttuples(buildstate, buffer, level,
                                  &itup, 1, InvalidOffsetNumber,
                                  parentblkno, downlinkoffnum);
        /* gistbufferinginserttuples() released the buffer */
    }

    return result;
}
00662 
00663 /*
00664  * Insert tuples to a given page.
00665  *
00666  * This is analogous with gistinserttuples() in the regular insertion code.
00667  *
00668  * Returns the block number of the page where the (first) new or updated tuple
00669  * was inserted. Usually that's the original page, but might be a sibling page
00670  * if the original page was split.
00671  *
00672  * Caller should hold a lock on 'buffer' on entry. This function will unlock
00673  * and unpin it.
00674  */
00675 static BlockNumber
00676 gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
00677                           IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
00678                           BlockNumber parentblk, OffsetNumber downlinkoffnum)
00679 {
00680     GISTBuildBuffers *gfbb = buildstate->gfbb;
00681     List       *splitinfo;
00682     bool        is_split;
00683     BlockNumber placed_to_blk = InvalidBlockNumber;
00684 
00685     is_split = gistplacetopage(buildstate->indexrel,
00686                                buildstate->freespace,
00687                                buildstate->giststate,
00688                                buffer,
00689                                itup, ntup, oldoffnum, &placed_to_blk,
00690                                InvalidBuffer,
00691                                &splitinfo,
00692                                false);
00693 
00694     /*
00695      * If this is a root split, update the root path item kept in memory. This
00696      * ensures that all path stacks are always complete, including all parent
00697      * nodes up to the root. That simplifies the algorithm to re-find correct
00698      * parent.
00699      */
00700     if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
00701     {
00702         Page        page = BufferGetPage(buffer);
00703         OffsetNumber off;
00704         OffsetNumber maxoff;
00705 
00706         Assert(level == gfbb->rootlevel);
00707         gfbb->rootlevel++;
00708 
00709         elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);
00710 
00711         /*
00712          * All the downlinks on the old root page are now on one of the child
00713          * pages. Visit all the new child pages to memorize the parents of the
00714          * grandchildren.
00715          */
00716         if (gfbb->rootlevel > 1)
00717         {
00718             maxoff = PageGetMaxOffsetNumber(page);
00719             for (off = FirstOffsetNumber; off <= maxoff; off++)
00720             {
00721                 ItemId      iid = PageGetItemId(page, off);
00722                 IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
00723                 BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
00724                 Buffer      childbuf = ReadBuffer(buildstate->indexrel, childblkno);
00725 
00726                 LockBuffer(childbuf, GIST_SHARE);
00727                 gistMemorizeAllDownlinks(buildstate, childbuf);
00728                 UnlockReleaseBuffer(childbuf);
00729 
00730                 /*
00731                  * Also remember that the parent of the new child page is the
00732                  * root block.
00733                  */
00734                 gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
00735             }
00736         }
00737     }
00738 
00739     if (splitinfo)
00740     {
00741         /*
00742          * Insert the downlinks to the parent. This is analogous with
00743          * gistfinishsplit() in the regular insertion code, but the locking is
00744          * simpler, and we have to maintain the buffers on internal nodes and
00745          * the parent map.
00746          */
00747         IndexTuple *downlinks;
00748         int         ndownlinks,
00749                     i;
00750         Buffer      parentBuffer;
00751         ListCell   *lc;
00752 
00753         /* Parent may have changed since we memorized this path. */
00754         parentBuffer =
00755             gistBufferingFindCorrectParent(buildstate,
00756                                            BufferGetBlockNumber(buffer),
00757                                            level,
00758                                            &parentblk,
00759                                            &downlinkoffnum);
00760 
00761         /*
00762          * If there's a buffer associated with this page, that needs to be
00763          * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
00764          * downlinks in 'splitinfo', to make sure they're consistent not only
00765          * with the tuples already on the pages, but also the tuples in the
00766          * buffers that will eventually be inserted to them.
00767          */
00768         gistRelocateBuildBuffersOnSplit(gfbb,
00769                                         buildstate->giststate,
00770                                         buildstate->indexrel,
00771                                         level,
00772                                         buffer, splitinfo);
00773 
00774         /* Create an array of all the downlink tuples */
00775         ndownlinks = list_length(splitinfo);
00776         downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
00777         i = 0;
00778         foreach(lc, splitinfo)
00779         {
00780             GISTPageSplitInfo *splitinfo = lfirst(lc);
00781 
00782             /*
00783              * Remember the parent of each new child page in our parent map.
00784              * This assumes that the downlinks fit on the parent page. If the
00785              * parent page is split, too, when we recurse up to insert the
00786              * downlinks, the recursive gistbufferinginserttuples() call will
00787              * update the map again.
00788              */
00789             if (level > 0)
00790                 gistMemorizeParent(buildstate,
00791                                    BufferGetBlockNumber(splitinfo->buf),
00792                                    BufferGetBlockNumber(parentBuffer));
00793 
00794             /*
00795              * Also update the parent map for all the downlinks that got moved
00796              * to a different page. (actually this also loops through the
00797              * downlinks that stayed on the original page, but it does no
00798              * harm).
00799              */
00800             if (level > 1)
00801                 gistMemorizeAllDownlinks(buildstate, splitinfo->buf);
00802 
00803             /*
00804              * Since there's no concurrent access, we can release the lower
00805              * level buffers immediately. This includes the original page.
00806              */
00807             UnlockReleaseBuffer(splitinfo->buf);
00808             downlinks[i++] = splitinfo->downlink;
00809         }
00810 
00811         /* Insert them into parent. */
00812         gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
00813                                   downlinks, ndownlinks, downlinkoffnum,
00814                                   InvalidBlockNumber, InvalidOffsetNumber);
00815 
00816         list_free_deep(splitinfo);      /* we don't need this anymore */
00817     }
00818     else
00819         UnlockReleaseBuffer(buffer);
00820 
00821     return placed_to_blk;
00822 }
00823 
00824 /*
00825  * Find the downlink pointing to a child page.
00826  *
00827  * 'childblkno' indicates the child page to find the parent for. 'level' is
00828  * the level of the child. On entry, *parentblkno and *downlinkoffnum can
00829  * point to a location where the downlink used to be - we will check that
00830  * location first, and save some cycles if it hasn't moved. The function
00831  * returns a buffer containing the downlink, exclusively-locked, and
00832  * *parentblkno and *downlinkoffnum are set to the real location of the
00833  * downlink.
00834  *
00835  * If the child page is a leaf (level == 0), the caller must supply a correct
00836  * parentblkno. Otherwise we use the parent map hash table to find the parent
00837  * block.
00838  *
00839  * This function serves the same purpose as gistFindCorrectParent() during
00840  * normal index inserts, but this is simpler because we don't need to deal
00841  * with concurrent inserts.
00842  */
00843 static Buffer
00844 gistBufferingFindCorrectParent(GISTBuildState *buildstate,
00845                                BlockNumber childblkno, int level,
00846                                BlockNumber *parentblkno,
00847                                OffsetNumber *downlinkoffnum)
00848 {
00849     BlockNumber parent;
00850     Buffer      buffer;
00851     Page        page;
00852     OffsetNumber maxoff;
00853     OffsetNumber off;
00854 
00855     if (level > 0)
00856         parent = gistGetParent(buildstate, childblkno);
00857     else
00858     {
00859         /*
00860          * For a leaf page, the caller must supply a correct parent block
00861          * number.
00862          */
00863         if (*parentblkno == InvalidBlockNumber)
00864             elog(ERROR, "no parent buffer provided of child %d", childblkno);
00865         parent = *parentblkno;
00866     }
00867 
00868     buffer = ReadBuffer(buildstate->indexrel, parent);
00869     page = BufferGetPage(buffer);
00870     LockBuffer(buffer, GIST_EXCLUSIVE);
00871     gistcheckpage(buildstate->indexrel, buffer);
00872     maxoff = PageGetMaxOffsetNumber(page);
00873 
00874     /* Check if it was not moved */
00875     if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
00876         *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
00877     {
00878         ItemId      iid = PageGetItemId(page, *downlinkoffnum);
00879         IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
00880 
00881         if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
00882         {
00883             /* Still there */
00884             return buffer;
00885         }
00886     }
00887 
00888     /*
00889      * Downlink was not at the offset where it used to be. Scan the page to
00890      * find it. During normal gist insertions, it might've moved to another
00891      * page, to the right, but during a buffering build, we keep track of the
00892      * parent of each page in the lookup table so we should always know what
00893      * page it's on.
00894      */
00895     for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
00896     {
00897         ItemId      iid = PageGetItemId(page, off);
00898         IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
00899 
00900         if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
00901         {
00902             /* yes!!, found it */
00903             *downlinkoffnum = off;
00904             return buffer;
00905         }
00906     }
00907 
00908     elog(ERROR, "failed to re-find parent for block %u", childblkno);
00909     return InvalidBuffer;       /* keep compiler quiet */
00910 }
00911 
/*
 * Process buffers emptying stack. Emptying of one buffer can cause emptying
 * of other buffers. This function iterates until this cascading emptying
 * process finished, i.e. until buffers emptying stack is empty.
 */
static void
gistProcessEmptyingQueue(GISTBuildState *buildstate)
{
	GISTBuildBuffers *gfbb = buildstate->gfbb;

	/* Iterate while we have elements in buffers emptying stack. */
	while (gfbb->bufferEmptyingQueue != NIL)
	{
		GISTNodeBuffer *emptyingNodeBuffer;

		/* Get node buffer from emptying stack. */
		emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
		gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
		/* Clear the flag so this buffer can be queued again later if needed */
		emptyingNodeBuffer->queuedForEmptying = false;

		/*
		 * We are going to load last pages of buffers where emptying will be
		 * to. So let's unload any previously loaded buffers.
		 */
		gistUnloadNodeBuffers(gfbb);

		/*
		 * Pop tuples from the buffer and run them down to the buffers at
		 * lower level, or leaf pages. We continue until one of the lower
		 * level buffers fills up, or this buffer runs empty.
		 *
		 * In Arge et al's paper, the buffer emptying is stopped after
		 * processing 1/2 node buffer worth of tuples, to avoid overfilling
		 * any of the lower level buffers. However, it's more efficient to
		 * keep going until one of the lower level buffers actually fills up,
		 * so that's what we do. This doesn't need to be exact, if a buffer
		 * overfills by a few tuples, there's no harm done.
		 */
		while (true)
		{
			IndexTuple  itup;

			/* Get next index tuple from the buffer */
			if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
				break;			/* this buffer is now empty */

			/*
			 * Run it down to the underlying node buffer or leaf page.
			 *
			 * Note: it's possible that the buffer we're emptying splits as a
			 * result of this call. If that happens, our emptyingNodeBuffer
			 * points to the left half of the split. After split, it's very
			 * likely that the new left buffer is no longer over the half-full
			 * threshold, but we might as well keep flushing tuples from it
			 * until we fill a lower-level buffer.
			 */
			if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
			{
				/*
				 * A lower level buffer filled up. Stop emptying this buffer,
				 * to avoid overflowing the lower level buffer.
				 */
				break;
			}

			/* Free all the memory allocated during index tuple processing */
			MemoryContextReset(buildstate->giststate->tempCxt);
		}
	}
}
00982 
/*
 * Empty all node buffers, from top to bottom. This is done at the end of
 * index build to flush all remaining tuples to the index.
 *
 * Note: This destroys the buffersOnLevels lists, so the buffers should not
 * be inserted to after this call.
 */
static void
gistEmptyAllBuffers(GISTBuildState *buildstate)
{
	GISTBuildBuffers *gfbb = buildstate->gfbb;
	MemoryContext oldCtx;
	int         i;

	/* Do per-tuple work in the short-lived temp context; restored at exit. */
	oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

	/*
	 * Iterate through the levels from top to bottom.
	 */
	for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
	{
		/*
		 * Empty all buffers on this level. Note that new buffers can pop up
		 * in the list during the processing, as a result of page splits, so a
		 * simple walk through the list won't work. We remove buffers from the
		 * list when we see them empty; a buffer can't become non-empty once
		 * it's been fully emptied.
		 */
		while (gfbb->buffersOnLevels[i] != NIL)
		{
			GISTNodeBuffer *nodeBuffer;

			nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);

			if (nodeBuffer->blocksCount != 0)
			{
				/*
				 * Add this buffer to the emptying queue, and proceed to empty
				 * the queue.
				 */
				if (!nodeBuffer->queuedForEmptying)
				{
					/*
					 * The queue list must outlive the temp context resets
					 * done during emptying, so build it in gfbb's context.
					 */
					MemoryContextSwitchTo(gfbb->context);
					nodeBuffer->queuedForEmptying = true;
					gfbb->bufferEmptyingQueue =
						lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
					MemoryContextSwitchTo(buildstate->giststate->tempCxt);
				}
				gistProcessEmptyingQueue(buildstate);
			}
			else
				gfbb->buffersOnLevels[i] =
					list_delete_first(gfbb->buffersOnLevels[i]);
		}
		elog(DEBUG2, "emptied all buffers at level %d", i);
	}
	MemoryContextSwitchTo(oldCtx);
}
01041 
01042 /*
01043  * Get the depth of the GiST index.
01044  */
01045 static int
01046 gistGetMaxLevel(Relation index)
01047 {
01048     int         maxLevel;
01049     BlockNumber blkno;
01050 
01051     /*
01052      * Traverse down the tree, starting from the root, until we hit the leaf
01053      * level.
01054      */
01055     maxLevel = 0;
01056     blkno = GIST_ROOT_BLKNO;
01057     while (true)
01058     {
01059         Buffer      buffer;
01060         Page        page;
01061         IndexTuple  itup;
01062 
01063         buffer = ReadBuffer(index, blkno);
01064 
01065         /*
01066          * There's no concurrent access during index build, so locking is just
01067          * pro forma.
01068          */
01069         LockBuffer(buffer, GIST_SHARE);
01070         page = (Page) BufferGetPage(buffer);
01071 
01072         if (GistPageIsLeaf(page))
01073         {
01074             /* We hit the bottom, so we're done. */
01075             UnlockReleaseBuffer(buffer);
01076             break;
01077         }
01078 
01079         /*
01080          * Pick the first downlink on the page, and follow it. It doesn't
01081          * matter which downlink we choose, the tree has the same depth
01082          * everywhere, so we just pick the first one.
01083          */
01084         itup = (IndexTuple) PageGetItem(page,
01085                                      PageGetItemId(page, FirstOffsetNumber));
01086         blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
01087         UnlockReleaseBuffer(buffer);
01088 
01089         /*
01090          * We're going down on the tree. It means that there is yet one more
01091          * level in the tree.
01092          */
01093         maxLevel++;
01094     }
01095     return maxLevel;
01096 }
01097 
01098 
01099 /*
01100  * Routines for managing the parent map.
01101  *
01102  * Whenever a page is split, we need to insert the downlinks into the parent.
01103  * We need to somehow find the parent page to do that. In normal insertions,
01104  * we keep a stack of nodes visited when we descend the tree. However, in
01105  * buffering build, we can start descending the tree from any internal node,
01106  * when we empty a buffer by cascading tuples to its children. So we don't
01107  * have a full stack up to the root available at that time.
01108  *
01109  * So instead, we maintain a hash table to track the parent of every internal
01110  * page. We don't need to track the parents of leaf nodes, however. Whenever
01111  * we insert to a leaf, we've just descended down from its parent, so we know
01112  * its immediate parent already. This helps a lot to limit the memory used
01113  * by this hash table.
01114  *
 * Whenever an internal node is split, the parent map needs to be updated.
 * The parent of the new child page needs to be recorded, and also the
 * entries for all pages whose downlinks are moved to a new page at the split
 * need to be updated.
01119  *
01120  * We also update the parent map whenever we descend the tree. That might seem
01121  * unnecessary, because we maintain the map whenever a downlink is moved or
01122  * created, but it is needed because we switch to buffering mode after
01123  * creating a tree with regular index inserts. Any pages created before
01124  * switching to buffering mode will not be present in the parent map initially,
01125  * but will be added there the first time we visit them.
01126  */
01127 
/* Parent map entry: maps a child block number to its parent's block number */
typedef struct
{
	BlockNumber childblkno;		/* hash key */
	BlockNumber parentblkno;	/* block number of the child's parent page */
} ParentMapEntry;
01133 
01134 static void
01135 gistInitParentMap(GISTBuildState *buildstate)
01136 {
01137     HASHCTL     hashCtl;
01138 
01139     hashCtl.keysize = sizeof(BlockNumber);
01140     hashCtl.entrysize = sizeof(ParentMapEntry);
01141     hashCtl.hcxt = CurrentMemoryContext;
01142     hashCtl.hash = oid_hash;
01143     buildstate->parentMap = hash_create("gistbuild parent map",
01144                                         1024,
01145                                         &hashCtl,
01146                                         HASH_ELEM | HASH_CONTEXT
01147                                         | HASH_FUNCTION);
01148 }
01149 
01150 static void
01151 gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
01152 {
01153     ParentMapEntry *entry;
01154     bool        found;
01155 
01156     entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
01157                                            (const void *) &child,
01158                                            HASH_ENTER,
01159                                            &found);
01160     entry->parentblkno = parent;
01161 }
01162 
01163 /*
01164  * Scan all downlinks on a page, and memorize their parent.
01165  */
01166 static void
01167 gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
01168 {
01169     OffsetNumber maxoff;
01170     OffsetNumber off;
01171     BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
01172     Page        page = BufferGetPage(parentbuf);
01173 
01174     Assert(!GistPageIsLeaf(page));
01175 
01176     maxoff = PageGetMaxOffsetNumber(page);
01177     for (off = FirstOffsetNumber; off <= maxoff; off++)
01178     {
01179         ItemId      iid = PageGetItemId(page, off);
01180         IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
01181         BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
01182 
01183         gistMemorizeParent(buildstate, childblkno, parentblkno);
01184     }
01185 }
01186 
01187 static BlockNumber
01188 gistGetParent(GISTBuildState *buildstate, BlockNumber child)
01189 {
01190     ParentMapEntry *entry;
01191     bool        found;
01192 
01193     /* Find node buffer in hash table */
01194     entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
01195                                            (const void *) &child,
01196                                            HASH_FIND,
01197                                            &found);
01198     if (!found)
01199         elog(ERROR, "could not find parent of block %d in lookup table", child);
01200 
01201     return entry->parentblkno;
01202 }