hashpage.c

00001 /*-------------------------------------------------------------------------
00002  *
00003  * hashpage.c
00004  *    Hash table page management code for the Postgres hash access method
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  * Portions Copyright (c) 1994, Regents of the University of California
00008  *
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/access/hash/hashpage.c
00012  *
00013  * NOTES
00014  *    Postgres hash pages look like ordinary relation pages.  The opaque
00015  *    data at high addresses includes information about the page, such as
00016  *    whether a page is an overflow page or a true bucket, the bucket
00017  *    number, and the block numbers of the preceding and following pages
00018  *    in the same bucket.
00019  *
00020  *    The first page in a hash relation, page zero, is special -- it stores
00021  *    information describing the hash table; it is referred to as the
00022  *    "meta page." Pages one and higher store the actual data.
00023  *
00024  *    There are also bitmap pages, which are not manipulated here;
00025  *    see hashovfl.c.
00026  *
00027  *-------------------------------------------------------------------------
00028  */
00029 #include "postgres.h"
00030 
00031 #include "access/hash.h"
00032 #include "miscadmin.h"
00033 #include "storage/lmgr.h"
00034 #include "storage/smgr.h"
00035 
00036 
00037 static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
00038                     uint32 nblocks);
00039 static void _hash_splitbucket(Relation rel, Buffer metabuf,
00040                   Bucket obucket, Bucket nbucket,
00041                   BlockNumber start_oblkno,
00042                   BlockNumber start_nblkno,
00043                   uint32 maxbucket,
00044                   uint32 highmask, uint32 lowmask);
00045 
00046 
00047 /*
00048  * We use high-concurrency locking on hash indexes (see README for an overview
00049  * of the locking rules).  However, we can skip taking lmgr locks when the
00050  * index is local to the current backend (ie, either temp or new in the
00051  * current transaction).  No one else can see it, so there's no reason to
00052  * take locks.  We still take buffer-level locks, but not lmgr locks.
00053  */
00054 #define USELOCKING(rel)     (!RELATION_IS_LOCAL(rel))
00055 
00056 
00057 /*
00058  * _hash_getlock() -- Acquire an lmgr lock.
00059  *
00060  * 'whichlock' should be the block number of a bucket's primary bucket page to
00061  * acquire the per-bucket lock.  (See README for details of the use of these
00062  * locks.)
00063  *
00064  * 'access' must be HASH_SHARE or HASH_EXCLUSIVE.
00065  */
00066 void
00067 _hash_getlock(Relation rel, BlockNumber whichlock, int access)
00068 {
00069     if (USELOCKING(rel))
00070         LockPage(rel, whichlock, access);
00071 }
00072 
00073 /*
00074  * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free.
00075  *
00076  * Same as above, except we return FALSE without blocking if the lock isn't free.
00077  */
00078 bool
00079 _hash_try_getlock(Relation rel, BlockNumber whichlock, int access)
00080 {
00081     if (USELOCKING(rel))
00082         return ConditionalLockPage(rel, whichlock, access);
00083     else
00084         return true;
00085 }
00086 
00087 /*
00088  * _hash_droplock() -- Release an lmgr lock.
00089  */
00090 void
00091 _hash_droplock(Relation rel, BlockNumber whichlock, int access)
00092 {
00093     if (USELOCKING(rel))
00094         UnlockPage(rel, whichlock, access);
00095 }
00096 
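/*
 * Usage sketch for the lmgr-lock helpers above (a hypothetical helper, not
 * part of this file's API): a reader following the README locking rules
 * takes the per-bucket lmgr lock on the bucket's primary page, then the
 * buffer-level lock, and releases them in the opposite order.  It assumes
 * the caller has already read the metapage to obtain 'metap'.
 */
static void
read_bucket_sketch(Relation rel, HashMetaPage metap, Bucket bucket)
{
    BlockNumber blkno = BUCKET_TO_BLKNO(metap, bucket);
    Buffer      buf;

    _hash_getlock(rel, blkno, HASH_SHARE);      /* per-bucket lmgr lock */
    buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);

    /* ... examine tuples on the bucket's primary page here ... */

    _hash_relbuf(rel, buf);                     /* drop buffer lock and pin */
    _hash_droplock(rel, blkno, HASH_SHARE);     /* release lmgr lock */
}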
00097 /*
00098  *  _hash_getbuf() -- Get a buffer by block number for read or write.
00099  *
00100  *      'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
00101  *      'flags' is a bitwise OR of the allowed page types.
00102  *
00103  *      This must be used only to fetch pages that are expected to be valid
00104  *      already.  _hash_checkpage() is applied using the given flags.
00105  *
00106  *      When this routine returns, the appropriate lock is set on the
00107  *      requested buffer and its reference count has been incremented
00108  *      (ie, the buffer is "locked and pinned").
00109  *
00110  *      P_NEW is disallowed because this routine can only be used
00111  *      to access pages that are known to be before the filesystem EOF.
00112  *      Extending the index should be done with _hash_getnewbuf.
00113  */
00114 Buffer
00115 _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
00116 {
00117     Buffer      buf;
00118 
00119     if (blkno == P_NEW)
00120         elog(ERROR, "hash AM does not use P_NEW");
00121 
00122     buf = ReadBuffer(rel, blkno);
00123 
00124     if (access != HASH_NOLOCK)
00125         LockBuffer(buf, access);
00126 
00127     /* ref count and lock type are correct */
00128 
00129     _hash_checkpage(rel, buf, flags);
00130 
00131     return buf;
00132 }
00133 
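/*
 * Sketch of the intended use of the 'flags' argument (a hypothetical helper,
 * assuming the caller already holds the appropriate bucket lock): the first
 * page of a bucket chain must be LH_BUCKET_PAGE, while every later page is
 * LH_OVERFLOW_PAGE, so the expected page type changes as we walk the chain.
 */
static int
count_bucket_pages_sketch(Relation rel, BlockNumber bucket_blkno)
{
    BlockNumber blkno = bucket_blkno;
    int         flags = LH_BUCKET_PAGE;
    int         npages = 0;

    while (BlockNumberIsValid(blkno))
    {
        Buffer      buf = _hash_getbuf(rel, blkno, HASH_READ, flags);
        Page        page = BufferGetPage(buf);
        HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);

        npages++;
        blkno = opaque->hasho_nextblkno;
        _hash_relbuf(rel, buf);
        flags = LH_OVERFLOW_PAGE;   /* pages after the first are overflow pages */
    }
    return npages;
}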
00134 /*
00135  *  _hash_getinitbuf() -- Get and initialize a buffer by block number.
00136  *
00137  *      This must be used only to fetch pages that are known to be before
00138  *      the index's filesystem EOF, but are to be filled from scratch.
00139  *      _hash_pageinit() is applied automatically.  Otherwise it has
00140  *      effects similar to _hash_getbuf() with access = HASH_WRITE.
00141  *
00142  *      When this routine returns, a write lock is set on the
00143  *      requested buffer and its reference count has been incremented
00144  *      (ie, the buffer is "locked and pinned").
00145  *
00146  *      P_NEW is disallowed because this routine can only be used
00147  *      to access pages that are known to be before the filesystem EOF.
00148  *      Extending the index should be done with _hash_getnewbuf.
00149  */
00150 Buffer
00151 _hash_getinitbuf(Relation rel, BlockNumber blkno)
00152 {
00153     Buffer      buf;
00154 
00155     if (blkno == P_NEW)
00156         elog(ERROR, "hash AM does not use P_NEW");
00157 
00158     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
00159 
00160     LockBuffer(buf, HASH_WRITE);
00161 
00162     /* ref count and lock type are correct */
00163 
00164     /* initialize the page */
00165     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
00166 
00167     return buf;
00168 }
00169 
00170 /*
00171  *  _hash_getnewbuf() -- Get a new page at the end of the index.
00172  *
00173  *      This has the same API as _hash_getinitbuf, except that we are adding
00174  *      a page to the index, and hence expect the page to be past the
00175  *      logical EOF.  (However, we have to support the case where it isn't,
00176  *      since a prior try might have crashed after extending the filesystem
00177  *      EOF but before updating the metapage to reflect the added page.)
00178  *
00179  *      It is caller's responsibility to ensure that only one process can
00180  *      extend the index at a time.
00181  */
00182 Buffer
00183 _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
00184 {
00185     BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum);
00186     Buffer      buf;
00187 
00188     if (blkno == P_NEW)
00189         elog(ERROR, "hash AM does not use P_NEW");
00190     if (blkno > nblocks)
00191         elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
00192              RelationGetRelationName(rel));
00193 
00194     /* smgr insists we use P_NEW to extend the relation */
00195     if (blkno == nblocks)
00196     {
00197         buf = ReadBufferExtended(rel, forkNum, P_NEW, RBM_NORMAL, NULL);
00198         if (BufferGetBlockNumber(buf) != blkno)
00199             elog(ERROR, "unexpected hash relation size: %u, should be %u",
00200                  BufferGetBlockNumber(buf), blkno);
00201     }
00202     else
00203         buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
00204 
00205     LockBuffer(buf, HASH_WRITE);
00206 
00207     /* ref count and lock type are correct */
00208 
00209     /* initialize the page */
00210     _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf));
00211 
00212     return buf;
00213 }
00214 
00215 /*
00216  *  _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy.
00217  *
00218  *      This is identical to _hash_getbuf() but also allows a buffer access
00219  *      strategy to be specified.  We use this for VACUUM operations.
00220  */
00221 Buffer
00222 _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
00223                            int access, int flags,
00224                            BufferAccessStrategy bstrategy)
00225 {
00226     Buffer      buf;
00227 
00228     if (blkno == P_NEW)
00229         elog(ERROR, "hash AM does not use P_NEW");
00230 
00231     buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy);
00232 
00233     if (access != HASH_NOLOCK)
00234         LockBuffer(buf, access);
00235 
00236     /* ref count and lock type are correct */
00237 
00238     _hash_checkpage(rel, buf, flags);
00239 
00240     return buf;
00241 }
00242 
00243 /*
00244  *  _hash_relbuf() -- release a locked buffer.
00245  *
00246  * Lock and pin (refcount) are both dropped.
00247  */
00248 void
00249 _hash_relbuf(Relation rel, Buffer buf)
00250 {
00251     UnlockReleaseBuffer(buf);
00252 }
00253 
00254 /*
00255  *  _hash_dropbuf() -- release an unlocked buffer.
00256  *
00257  * This is used to unpin a buffer on which we hold no lock.
00258  */
00259 void
00260 _hash_dropbuf(Relation rel, Buffer buf)
00261 {
00262     ReleaseBuffer(buf);
00263 }
00264 
00265 /*
00266  *  _hash_wrtbuf() -- write a hash page to disk.
00267  *
00268  *      This routine releases the lock held on the buffer and our refcount
00269  *      for it.  It is an error to call _hash_wrtbuf() without a write lock
00270  *      and a pin on the buffer.
00271  *
00272  * NOTE: this routine should go away when/if hash indexes are WAL-ified.
00273  * The correct sequence of operations is to mark the buffer dirty, then
00274  * write the WAL record, then release the lock and pin; so marking dirty
00275  * can't be combined with releasing.
00276  */
00277 void
00278 _hash_wrtbuf(Relation rel, Buffer buf)
00279 {
00280     MarkBufferDirty(buf);
00281     UnlockReleaseBuffer(buf);
00282 }
00283 
00284 /*
00285  * _hash_chgbufaccess() -- Change the lock type on a buffer, without
00286  *          dropping our pin on it.
00287  *
00288  * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK,
00289  * the last indicating that no buffer-level lock is held or wanted.
00290  *
00291  * When from_access == HASH_WRITE, we assume the buffer is dirty and tell
00292  * bufmgr it must be written out.  If the caller wants to release a write
00293  * lock on a page that's not been modified, it's okay to pass from_access
00294  * as HASH_READ (a bit ugly, but handy in some places).
00295  */
00296 void
00297 _hash_chgbufaccess(Relation rel,
00298                    Buffer buf,
00299                    int from_access,
00300                    int to_access)
00301 {
00302     if (from_access == HASH_WRITE)
00303         MarkBufferDirty(buf);
00304     if (from_access != HASH_NOLOCK)
00305         LockBuffer(buf, BUFFER_LOCK_UNLOCK);
00306     if (to_access != HASH_NOLOCK)
00307         LockBuffer(buf, to_access);
00308 }
00309 
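/*
 * Typical usage sketch: drop the buffer lock but keep the pin across a long
 * operation, then re-lock the same buffer afterwards, as _hash_metapinit()
 * below does with the metapage:
 *
 *      _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
 *      ... work that must not be done while holding a buffer lock ...
 *      _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 *
 * The first call also marks the buffer dirty, since from_access is
 * HASH_WRITE.
 */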
00310 
00311 /*
00312  *  _hash_metapinit() -- Initialize the metadata page of a hash index,
00313  *              the initial buckets, and the initial bitmap page.
00314  *
00315  * The initial number of buckets is dependent on num_tuples, an estimate
00316  * of the number of tuples to be loaded into the index initially.  The
00317  * chosen number of buckets is returned.
00318  *
00319  * We are fairly cavalier about locking here, since we know that no one else
00320  * could be accessing this index.  In particular the rule about not holding
00321  * multiple buffer locks is ignored.
00322  */
00323 uint32
00324 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
00325 {
00326     HashMetaPage metap;
00327     HashPageOpaque pageopaque;
00328     Buffer      metabuf;
00329     Buffer      buf;
00330     Page        pg;
00331     int32       data_width;
00332     int32       item_width;
00333     int32       ffactor;
00334     double      dnumbuckets;
00335     uint32      num_buckets;
00336     uint32      log2_num_buckets;
00337     uint32      i;
00338 
00339     /* safety check */
00340     if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0)
00341         elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
00342              RelationGetRelationName(rel));
00343 
00344     /*
00345      * Determine the target fill factor (in tuples per bucket) for this index.
00346      * The idea is to make the fill factor correspond to pages about as full
00347      * as the user-settable fillfactor parameter says.  We can compute it
00348      * exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
00349      */
00350     data_width = sizeof(uint32);
00351     item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) +
00352         sizeof(ItemIdData);     /* include the line pointer */
00353     ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width;
00354     /* keep to a sane range */
00355     if (ffactor < 10)
00356         ffactor = 10;
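    /*
     * Worked example (a sketch, assuming BLCKSZ = 8192, 8-byte MAXALIGN, and
     * the default hash fillfactor of 75): item_width = 8 + 8 + 4 = 20 bytes,
     * the target page usage is 8192 * 75 / 100 = 6144 bytes, and so ffactor
     * comes out to roughly 307 tuples per bucket.
     */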
00357 
00358     /*
00359      * Choose the number of initial bucket pages to match the fill factor
00360      * given the estimated number of tuples.  We round up the result to the
00361      * next power of 2, however, and always force at least 2 bucket pages. The
00362      * upper limit is determined by considerations explained in
00363      * _hash_expandtable().
00364      */
00365     dnumbuckets = num_tuples / ffactor;
00366     if (dnumbuckets <= 2.0)
00367         num_buckets = 2;
00368     else if (dnumbuckets >= (double) 0x40000000)
00369         num_buckets = 0x40000000;
00370     else
00371         num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets);
00372 
00373     log2_num_buckets = _hash_log2(num_buckets);
00374     Assert(num_buckets == (((uint32) 1) << log2_num_buckets));
00375     Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS);
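    /*
     * Example (sketch): with num_tuples = 100000 and ffactor = 307,
     * dnumbuckets is about 325; _hash_log2(325) is 9, so num_buckets is
     * rounded up to 512 and log2_num_buckets is 9.
     */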
00376 
00377     /*
00378      * We initialize the metapage, the first N bucket pages, and the first
00379      * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
00380      * calls to occur.  This ensures that the smgr level has the right idea of
00381      * the physical index length.
00382      */
00383     metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
00384     pg = BufferGetPage(metabuf);
00385 
00386     pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
00387     pageopaque->hasho_prevblkno = InvalidBlockNumber;
00388     pageopaque->hasho_nextblkno = InvalidBlockNumber;
00389     pageopaque->hasho_bucket = -1;
00390     pageopaque->hasho_flag = LH_META_PAGE;
00391     pageopaque->hasho_page_id = HASHO_PAGE_ID;
00392 
00393     metap = HashPageGetMeta(pg);
00394 
00395     metap->hashm_magic = HASH_MAGIC;
00396     metap->hashm_version = HASH_VERSION;
00397     metap->hashm_ntuples = 0;
00398     metap->hashm_nmaps = 0;
00399     metap->hashm_ffactor = ffactor;
00400     metap->hashm_bsize = HashGetMaxBitmapSize(pg);
00401     /* find largest bitmap array size that will fit in page size */
00402     for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
00403     {
00404         if ((1 << i) <= metap->hashm_bsize)
00405             break;
00406     }
00407     Assert(i > 0);
00408     metap->hashm_bmsize = 1 << i;
00409     metap->hashm_bmshift = i + BYTE_TO_BIT;
00410     Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
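    /*
     * Arithmetic sketch (assuming 8 KB blocks): hashm_bsize is the block
     * size minus page header and opaque overhead, a bit over 8100 bytes, so
     * the loop settles on i = 12; hashm_bmsize is then 4096 bytes and
     * hashm_bmshift is 15, i.e. 32768 overflow-page bits per bitmap page.
     */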
00411 
00412     /*
00413      * Label the index with its primary hash support function's OID.  This is
00414      * pretty useless for normal operation (in fact, hashm_procid is not used
00415      * anywhere), but it might be handy for forensic purposes so we keep it.
00416      */
00417     metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
00418 
00419     /*
00420      * We initialize the index with N buckets, 0 .. N-1, occupying physical
00421      * blocks 1 to N.  The first freespace bitmap page is in block N+1. Since
00422      * N is a power of 2, we can set the masks this way:
00423      */
00424     metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1;
00425     metap->hashm_highmask = (num_buckets << 1) - 1;
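    /*
     * Concrete example (sketch): with num_buckets = 4, buckets 0..3 occupy
     * blocks 1..4, the first bitmap page goes in block 5, and the masks are
     * hashm_maxbucket = hashm_lowmask = 3, hashm_highmask = 7.
     */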
00426 
00427     MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares));
00428     MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
00429 
00430     /* Set up mapping for one spare page after the initial splitpoints */
00431     metap->hashm_spares[log2_num_buckets] = 1;
00432     metap->hashm_ovflpoint = log2_num_buckets;
00433     metap->hashm_firstfree = 0;
00434 
00435     /*
00436      * Release buffer lock on the metapage while we initialize buckets.
00437      * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS
00438      * won't accomplish anything.  It's a bad idea to hold buffer locks for
00439      * long intervals in any case, since that can block the bgwriter.
00440      */
00441     _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
00442 
00443     /*
00444      * Initialize the first N buckets
00445      */
00446     for (i = 0; i < num_buckets; i++)
00447     {
00448         /* Allow interrupts, in case N is huge */
00449         CHECK_FOR_INTERRUPTS();
00450 
00451         buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
00452         pg = BufferGetPage(buf);
00453         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
00454         pageopaque->hasho_prevblkno = InvalidBlockNumber;
00455         pageopaque->hasho_nextblkno = InvalidBlockNumber;
00456         pageopaque->hasho_bucket = i;
00457         pageopaque->hasho_flag = LH_BUCKET_PAGE;
00458         pageopaque->hasho_page_id = HASHO_PAGE_ID;
00459         _hash_wrtbuf(rel, buf);
00460     }
00461 
00462     /* Now reacquire buffer lock on metapage */
00463     _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
00464 
00465     /*
00466      * Initialize first bitmap page
00467      */
00468     _hash_initbitmap(rel, metap, num_buckets + 1, forkNum);
00469 
00470     /* all done */
00471     _hash_wrtbuf(rel, metabuf);
00472 
00473     return num_buckets;
00474 }
00475 
00476 /*
00477  *  _hash_pageinit() -- Initialize a new hash index page.
00478  */
00479 void
00480 _hash_pageinit(Page page, Size size)
00481 {
00482     Assert(PageIsNew(page));
00483     PageInit(page, size, sizeof(HashPageOpaqueData));
00484 }
00485 
00486 /*
00487  * Attempt to expand the hash table by creating one new bucket.
00488  *
00489  * This will silently do nothing if it cannot get the needed locks.
00490  *
00491  * The caller should hold no locks on the hash index.
00492  *
00493  * The caller must hold a pin, but no lock, on the metapage buffer.
00494  * The buffer is returned in the same state.
00495  */
00496 void
00497 _hash_expandtable(Relation rel, Buffer metabuf)
00498 {
00499     HashMetaPage metap;
00500     Bucket      old_bucket;
00501     Bucket      new_bucket;
00502     uint32      spare_ndx;
00503     BlockNumber start_oblkno;
00504     BlockNumber start_nblkno;
00505     uint32      maxbucket;
00506     uint32      highmask;
00507     uint32      lowmask;
00508 
00509     /*
00510      * Write-lock the meta page.  It used to be necessary to acquire a
00511      * heavyweight lock to begin a split, but that is no longer required.
00512      */
00513     _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
00514 
00515     _hash_checkpage(rel, metabuf, LH_META_PAGE);
00516     metap = HashPageGetMeta(BufferGetPage(metabuf));
00517 
00518     /*
00519      * Check to see if split is still needed; someone else might have already
00520      * done one while we waited for the lock.
00521      *
00522      * Make sure this stays in sync with _hash_doinsert()
00523      */
00524     if (metap->hashm_ntuples <=
00525         (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
00526         goto fail;
00527 
00528     /*
00529      * Can't split anymore if maxbucket has reached its maximum possible
00530      * value.
00531      *
00532      * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because
00533      * the calculation maxbucket+1 mustn't overflow).  Currently we restrict
00534      * to half that because of overflow looping in _hash_log2() and
00535      * insufficient space in hashm_spares[].  It's moot anyway because an
00536      * index with 2^32 buckets would certainly overflow BlockNumber and hence
00537      * _hash_alloc_buckets() would fail, but if we supported buckets smaller
00538      * than a disk block then this would be an independent constraint.
00539      *
00540      * If you change this, see also the maximum initial number of buckets in
00541      * _hash_metapinit().
00542      */
00543     if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
00544         goto fail;
00545 
00546     /*
00547      * Determine which bucket is to be split, and attempt to lock the old
00548      * bucket.  If we can't get the lock, give up.
00549      *
00550      * The lock protects us against other backends, but not against our own
00551      * backend.  Must check for active scans separately.
00552      */
00553     new_bucket = metap->hashm_maxbucket + 1;
00554 
00555     old_bucket = (new_bucket & metap->hashm_lowmask);
00556 
00557     start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
00558 
00559     if (_hash_has_active_scan(rel, old_bucket))
00560         goto fail;
00561 
00562     if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
00563         goto fail;
00564 
00565     /*
00566      * Likewise lock the new bucket (should never fail).
00567      *
00568      * Note: it is safe to compute the new bucket's blkno here, even though we
00569      * may still need to update the BUCKET_TO_BLKNO mapping.  This is because
00570      * the current value of hashm_spares[hashm_ovflpoint] correctly shows
00571      * where we are going to put a new splitpoint's worth of buckets.
00572      */
00573     start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
00574 
00575     if (_hash_has_active_scan(rel, new_bucket))
00576         elog(ERROR, "scan in progress on supposedly new bucket");
00577 
00578     if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
00579         elog(ERROR, "could not get lock on supposedly new bucket");
00580 
00581     /*
00582      * If the split point is increasing (hashm_maxbucket's log base 2
00583      * increases), we need to allocate a new batch of bucket pages.
00584      */
00585     spare_ndx = _hash_log2(new_bucket + 1);
00586     if (spare_ndx > metap->hashm_ovflpoint)
00587     {
00588         Assert(spare_ndx == metap->hashm_ovflpoint + 1);
00589 
00590         /*
00591          * The number of buckets in the new splitpoint is equal to the total
00592          * number already in existence, i.e. new_bucket.  Currently this maps
00593          * one-to-one to blocks required, but someday we may need a more
00594          * complicated calculation here.
00595          */
00596         if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
00597         {
00598             /* can't split due to BlockNumber overflow */
00599             _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
00600             _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
00601             goto fail;
00602         }
00603     }
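    /*
     * Worked example (sketch): starting from a 4-bucket index (metapage in
     * block 0, buckets 0-3 in blocks 1-4, first bitmap page in block 5), the
     * first split creates bucket 4.  spare_ndx = _hash_log2(5) = 3 exceeds
     * hashm_ovflpoint = 2, so a whole splitpoint of new_bucket = 4 bucket
     * pages is allocated at blocks 6-9 for buckets 4-7; splits up to bucket
     * 7 then reuse that allocation without extending the relation again.
     */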
00604 
00605     /*
00606      * Okay to proceed with split.  Update the metapage bucket mapping info.
00607      *
00608      * Since we are scribbling on the metapage data right in the shared
00609      * buffer, any failure in this next little bit leaves us with a big
00610      * problem: the metapage is effectively corrupt but could get written back
00611      * to disk.  We don't really expect any failure, but just to be sure,
00612      * establish a critical section.
00613      */
00614     START_CRIT_SECTION();
00615 
00616     metap->hashm_maxbucket = new_bucket;
00617 
00618     if (new_bucket > metap->hashm_highmask)
00619     {
00620         /* Starting a new doubling */
00621         metap->hashm_lowmask = metap->hashm_highmask;
00622         metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
00623     }
00624 
00625     /*
00626      * If the split point is increasing (hashm_maxbucket's log base 2
00627      * increases), we need to adjust the hashm_spares[] array and
00628      * hashm_ovflpoint so that future overflow pages will be created beyond
00629      * this new batch of bucket pages.
00630      */
00631     if (spare_ndx > metap->hashm_ovflpoint)
00632     {
00633         metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
00634         metap->hashm_ovflpoint = spare_ndx;
00635     }
00636 
00637     /* Done mucking with metapage */
00638     END_CRIT_SECTION();
00639 
00640     /*
00641      * Copy bucket mapping info now; this saves re-accessing the meta page
00642      * inside _hash_splitbucket's inner loop.  Note that once we drop the
00643      * split lock, other splits could begin, so these values might be out of
00644      * date before _hash_splitbucket finishes.  That's okay, since all it
00645      * needs is to tell which of these two buckets to map hashkeys into.
00646      */
00647     maxbucket = metap->hashm_maxbucket;
00648     highmask = metap->hashm_highmask;
00649     lowmask = metap->hashm_lowmask;
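    /*
     * For reference, a sketch of the hashutil.c helper (reproduced here only
     * to show how these three copies are consumed): _hash_hashkey2bucket()
     * computes roughly
     *
     *      bucket = hashkey & highmask;
     *      if (bucket > maxbucket)
     *          bucket = bucket & lowmask;
     *
     * so every hashkey previously mapped to old_bucket now maps to either
     * old_bucket or new_bucket.
     */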
00650 
00651     /* Write out the metapage and drop lock, but keep pin */
00652     _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
00653 
00654     /* Relocate records to the new bucket */
00655     _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
00656                       start_oblkno, start_nblkno,
00657                       maxbucket, highmask, lowmask);
00658 
00659     /* Release bucket locks, allowing others to access them */
00660     _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
00661     _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
00662 
00663     return;
00664 
00665     /* Here if we decide not to split, or fail to acquire the old bucket lock */
00666 fail:
00667 
00668     /* We didn't write the metapage, so just drop lock */
00669     _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
00670 }
00671 
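/*
 * For context, a sketch (based on the fill-factor check mirrored above, not
 * code from this file) of how _hash_doinsert() in hashinsert.c drives this
 * routine after adding a tuple:
 *
 *      if (metap->hashm_ntuples >
 *          (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
 *          _hash_expandtable(rel, metabuf);
 *
 * A split that is skipped or fails here is therefore simply retried on some
 * later insertion.
 */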
00672 
00673 /*
00674  * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages
00675  *
00676  * This does not need to initialize the new bucket pages; we'll do that as
00677  * each one is used by _hash_expandtable().  But we have to extend the logical
00678  * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
00679  * sync with ours, so that we don't get complaints from smgr.
00680  *
00681  * We do this by writing a page of zeroes at the end of the splitpoint range.
00682  * We expect that the filesystem will ensure that the intervening pages read
00683  * as zeroes too.  On many filesystems this "hole" will not be allocated
00684  * immediately, which means that the index file may end up more fragmented
00685  * than if we forced it all to be allocated now; but since we don't scan
00686  * hash indexes sequentially anyway, that probably doesn't matter.
00687  *
00688  * XXX It's annoying that this code is executed with the metapage lock held.
00689  * We need to interlock against _hash_getovflpage() adding a new overflow page
00690  * concurrently, but it'd likely be better to use LockRelationForExtension
00691  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
00692  * so it may not be worth worrying about.
00693  *
00694  * Returns TRUE if successful, or FALSE if allocation failed due to
00695  * BlockNumber overflow.
00696  */
00697 static bool
00698 _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
00699 {
00700     BlockNumber lastblock;
00701     char        zerobuf[BLCKSZ];
00702 
00703     lastblock = firstblock + nblocks - 1;
00704 
00705     /*
00706      * Check for overflow in block number calculation; if so, we cannot extend
00707      * the index anymore.
00708      */
00709     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
00710         return false;
00711 
00712     MemSet(zerobuf, 0, sizeof(zerobuf));
00713 
00714     RelationOpenSmgr(rel);
00715     smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false);
00716 
00717     return true;
00718 }
00719 
00720 
00721 /*
00722  * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
00723  *
00724  * We are splitting a bucket that consists of a base bucket page and zero
00725  * or more overflow (bucket chain) pages.  We must relocate tuples that
00726  * belong in the new bucket, and compress out any free space in the old
00727  * bucket.
00728  *
00729  * The caller must hold exclusive locks on both buckets to ensure that
00730  * no one else is trying to access them (see README).
00731  *
00732  * The caller must hold a pin, but no lock, on the metapage buffer.
00733  * The buffer is returned in the same state.  (The metapage is only
00734  * touched if it becomes necessary to add or remove overflow pages.)
00735  */
00736 static void
00737 _hash_splitbucket(Relation rel,
00738                   Buffer metabuf,
00739                   Bucket obucket,
00740                   Bucket nbucket,
00741                   BlockNumber start_oblkno,
00742                   BlockNumber start_nblkno,
00743                   uint32 maxbucket,
00744                   uint32 highmask,
00745                   uint32 lowmask)
00746 {
00747     BlockNumber oblkno;
00748     BlockNumber nblkno;
00749     Buffer      obuf;
00750     Buffer      nbuf;
00751     Page        opage;
00752     Page        npage;
00753     HashPageOpaque oopaque;
00754     HashPageOpaque nopaque;
00755 
00756     /*
00757      * It should be okay to simultaneously write-lock pages from each bucket,
00758      * since no one else can be trying to acquire buffer lock on pages of
00759      * either bucket.
00760      */
00761     oblkno = start_oblkno;
00762     obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_BUCKET_PAGE);
00763     opage = BufferGetPage(obuf);
00764     oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
00765 
00766     nblkno = start_nblkno;
00767     nbuf = _hash_getnewbuf(rel, nblkno, MAIN_FORKNUM);
00768     npage = BufferGetPage(nbuf);
00769 
00770     /* initialize the new bucket's primary page */
00771     nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
00772     nopaque->hasho_prevblkno = InvalidBlockNumber;
00773     nopaque->hasho_nextblkno = InvalidBlockNumber;
00774     nopaque->hasho_bucket = nbucket;
00775     nopaque->hasho_flag = LH_BUCKET_PAGE;
00776     nopaque->hasho_page_id = HASHO_PAGE_ID;
00777 
00778     /*
00779      * Partition the tuples in the old bucket between the old bucket and the
00780      * new bucket, advancing along the old bucket's overflow bucket chain and
00781      * adding overflow pages to the new bucket as needed.  Outer loop iterates
00782      * once per page in old bucket.
00783      */
00784     for (;;)
00785     {
00786         OffsetNumber ooffnum;
00787         OffsetNumber omaxoffnum;
00788         OffsetNumber deletable[MaxOffsetNumber];
00789         int         ndeletable = 0;
00790 
00791         /* Scan each tuple in old page */
00792         omaxoffnum = PageGetMaxOffsetNumber(opage);
00793         for (ooffnum = FirstOffsetNumber;
00794              ooffnum <= omaxoffnum;
00795              ooffnum = OffsetNumberNext(ooffnum))
00796         {
00797             IndexTuple  itup;
00798             Size        itemsz;
00799             Bucket      bucket;
00800 
00801             /*
00802              * Fetch the item's hash key (conveniently stored in the item) and
00803              * determine which bucket it now belongs in.
00804              */
00805             itup = (IndexTuple) PageGetItem(opage,
00806                                             PageGetItemId(opage, ooffnum));
00807             bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
00808                                           maxbucket, highmask, lowmask);
00809 
00810             if (bucket == nbucket)
00811             {
00812                 /*
00813                  * insert the tuple into the new bucket.  if it doesn't fit on
00814                  * the current page in the new bucket, we must allocate a new
00815                  * overflow page and place the tuple on that page instead.
00816                  */
00817                 itemsz = IndexTupleDSize(*itup);
00818                 itemsz = MAXALIGN(itemsz);
00819 
00820                 if (PageGetFreeSpace(npage) < itemsz)
00821                 {
00822                     /* write out nbuf and drop lock, but keep pin */
00823                     _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
00824                     /* chain to a new overflow page */
00825                     nbuf = _hash_addovflpage(rel, metabuf, nbuf);
00826                     npage = BufferGetPage(nbuf);
00827                     /* we don't need nblkno or nopaque within the loop */
00828                 }
00829 
00830                 /*
00831                  * Insert tuple on new page, using _hash_pgaddtup to ensure
00832                  * correct ordering by hashkey.  This is a tad inefficient
00833                  * since we may have to shuffle itempointers repeatedly.
00834                  * Possible future improvement: accumulate all the items for
00835                  * the new page and qsort them before insertion.
00836                  */
00837                 (void) _hash_pgaddtup(rel, nbuf, itemsz, itup);
00838 
00839                 /*
00840                  * Mark tuple for deletion from old page.
00841                  */
00842                 deletable[ndeletable++] = ooffnum;
00843             }
00844             else
00845             {
00846                 /*
00847                  * the tuple stays on this page, so nothing to do.
00848                  */
00849                 Assert(bucket == obucket);
00850             }
00851         }
00852 
00853         oblkno = oopaque->hasho_nextblkno;
00854 
00855         /*
00856          * Done scanning this old page.  If we moved any tuples, delete them
00857          * from the old page.
00858          */
00859         if (ndeletable > 0)
00860         {
00861             PageIndexMultiDelete(opage, deletable, ndeletable);
00862             _hash_wrtbuf(rel, obuf);
00863         }
00864         else
00865             _hash_relbuf(rel, obuf);
00866 
00867         /* Exit loop if no more overflow pages in old bucket */
00868         if (!BlockNumberIsValid(oblkno))
00869             break;
00870 
00871         /* Else, advance to next old page */
00872         obuf = _hash_getbuf(rel, oblkno, HASH_WRITE, LH_OVERFLOW_PAGE);
00873         opage = BufferGetPage(obuf);
00874         oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
00875     }
00876 
00877     /*
00878      * We're at the end of the old bucket chain, so we're done partitioning
00879      * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
00880      * tuples remaining in the old bucket (including the overflow pages) are
00881      * packed as tightly as possible.  The new bucket is already tight.
00882      */
00883     _hash_wrtbuf(rel, nbuf);
00884 
00885     _hash_squeezebucket(rel, obucket, start_oblkno, NULL);
00886 }