Header And Logo

PostgreSQL
| The world's most advanced open source database.

tidbitmap.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * tidbitmap.c
00004  *    PostgreSQL tuple-id (TID) bitmap package
00005  *
00006  * This module provides bitmap data structures that are spiritually
00007  * similar to Bitmapsets, but are specially adapted to store sets of
00008  * tuple identifiers (TIDs), or ItemPointers.  In particular, the division
00009  * of an ItemPointer into BlockNumber and OffsetNumber is catered for.
00010  * Also, since we wish to be able to store very large tuple sets in
00011  * memory with this data structure, we support "lossy" storage, in which
00012  * we no longer remember individual tuple offsets on a page but only the
00013  * fact that a particular page needs to be visited.
00014  *
00015  * The "lossy" storage uses one bit per disk page, so at the standard 8K
00016  * BLCKSZ, we can represent all pages in 64Gb of disk space in about 1Mb
00017  * of memory.  People pushing around tables of that size should have a
00018  * couple of Mb to spare, so we don't worry about providing a second level
00019  * of lossiness.  In theory we could fall back to page ranges at some
00020  * point, but for now that seems useless complexity.
00021  *
00022  * We also support the notion of candidate matches, or rechecking.  This
00023  * means we know that a search need visit only some tuples on a page,
00024  * but we are not certain that all of those tuples are real matches.
00025  * So the eventual heap scan must recheck the quals for these tuples only,
00026  * rather than rechecking the quals for all tuples on the page as in the
00027  * lossy-bitmap case.  Rechecking can be specified when TIDs are inserted
00028  * into a bitmap, and it can also happen internally when we AND a lossy
00029  * and a non-lossy page.
00030  *
00031  *
00032  * Copyright (c) 2003-2013, PostgreSQL Global Development Group
00033  *
00034  * IDENTIFICATION
00035  *    src/backend/nodes/tidbitmap.c
00036  *
00037  *-------------------------------------------------------------------------
00038  */
00039 #include "postgres.h"
00040 
00041 #include <limits.h>
00042 
00043 #include "access/htup_details.h"
00044 #include "nodes/bitmapset.h"
00045 #include "nodes/tidbitmap.h"
00046 #include "utils/hsearch.h"
00047 
00048 /*
00049  * The maximum number of tuples per page is not large (typically 256 with
00050  * 8K pages, or 1024 with 32K pages).  So there's not much point in making
00051  * the per-page bitmaps variable size.  We just legislate that the size
00052  * is this:
00053  */
00054 #define MAX_TUPLES_PER_PAGE  MaxHeapTuplesPerPage
00055 
00056 /*
00057  * When we have to switch over to lossy storage, we use a data structure
00058  * with one bit per page, where all pages having the same number DIV
00059  * PAGES_PER_CHUNK are aggregated into one chunk.  When a chunk is present
00060  * and has the bit set for a given page, there must not be a per-page entry
00061  * for that page in the page table.
00062  *
00063  * We actually store both exact pages and lossy chunks in the same hash
00064  * table, using identical data structures.  (This is because dynahash.c's
00065  * memory management doesn't allow space to be transferred easily from one
00066  * hashtable to another.)  Therefore it's best if PAGES_PER_CHUNK is the
00067  * same as MAX_TUPLES_PER_PAGE, or at least not too different.  But we
00068  * also want PAGES_PER_CHUNK to be a power of 2 to avoid expensive integer
00069  * remainder operations.  So, define it like this:
00070  */
00071 #define PAGES_PER_CHUNK  (BLCKSZ / 32)
00072 
00073 /* We use BITS_PER_BITMAPWORD and typedef bitmapword from nodes/bitmapset.h */
00074 
00075 #define WORDNUM(x)  ((x) / BITS_PER_BITMAPWORD)
00076 #define BITNUM(x)   ((x) % BITS_PER_BITMAPWORD)
00077 
00078 /* number of active words for an exact page: */
00079 #define WORDS_PER_PAGE  ((MAX_TUPLES_PER_PAGE - 1) / BITS_PER_BITMAPWORD + 1)
00080 /* number of active words for a lossy chunk: */
00081 #define WORDS_PER_CHUNK  ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1)
00082 
00083 /*
00084  * The hashtable entries are represented by this data structure.  For
00085  * an exact page, blockno is the page number and bit k of the bitmap
00086  * represents tuple offset k+1.  For a lossy chunk, blockno is the first
00087  * page in the chunk (this must be a multiple of PAGES_PER_CHUNK) and
00088  * bit k represents page blockno+k.  Note that it is not possible to
00089  * have exact storage for the first page of a chunk if we are using
00090  * lossy storage for any page in the chunk's range, since the same
00091  * hashtable entry has to serve both purposes.
00092  *
00093  * recheck is used only on exact pages --- it indicates that although
00094  * only the stated tuples need be checked, the full index qual condition
00095  * must be checked for each (ie, these are candidate matches).
00096  */
00097 typedef struct PagetableEntry
00098 {
00099     BlockNumber blockno;        /* page number (hashtable key) */
00100     bool        ischunk;        /* T = lossy storage, F = exact */
00101     bool        recheck;        /* should the tuples be rechecked? */
00102     bitmapword  words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
00103 } PagetableEntry;
00104 
00105 /*
00106  * dynahash.c is optimized for relatively large, long-lived hash tables.
00107  * This is not ideal for TIDBitMap, particularly when we are using a bitmap
00108  * scan on the inside of a nestloop join: a bitmap may well live only long
00109  * enough to accumulate one entry in such cases.  We therefore avoid creating
00110  * an actual hashtable until we need two pagetable entries.  When just one
00111  * pagetable entry is needed, we store it in a fixed field of TIDBitMap.
00112  * (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
00113  * to zero or one page again.  So, status can be TBM_HASH even when nentries
00114  * is zero or one.)
00115  */
00116 typedef enum
00117 {
00118     TBM_EMPTY,                  /* no hashtable, nentries == 0 */
00119     TBM_ONE_PAGE,               /* entry1 contains the single entry */
00120     TBM_HASH                    /* pagetable is valid, entry1 is not */
00121 } TBMStatus;
00122 
00123 /*
00124  * Here is the representation for a whole TIDBitMap:
00125  */
00126 struct TIDBitmap
00127 {
00128     NodeTag     type;           /* to make it a valid Node */
00129     MemoryContext mcxt;         /* memory context containing me */
00130     TBMStatus   status;         /* see codes above */
00131     HTAB       *pagetable;      /* hash table of PagetableEntry's */
00132     int         nentries;       /* number of entries in pagetable */
00133     int         maxentries;     /* limit on same to meet maxbytes */
00134     int         npages;         /* number of exact entries in pagetable */
00135     int         nchunks;        /* number of lossy entries in pagetable */
00136     bool        iterating;      /* tbm_begin_iterate called? */
00137     PagetableEntry entry1;      /* used when status == TBM_ONE_PAGE */
00138     /* these are valid when iterating is true: */
00139     PagetableEntry **spages;    /* sorted exact-page list, or NULL */
00140     PagetableEntry **schunks;   /* sorted lossy-chunk list, or NULL */
00141 };
00142 
00143 /*
00144  * When iterating over a bitmap in sorted order, a TBMIterator is used to
00145  * track our progress.  There can be several iterators scanning the same
00146  * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
00147  * any iterator is created.
00148  */
00149 struct TBMIterator
00150 {
00151     TIDBitmap  *tbm;            /* TIDBitmap we're iterating over */
00152     int         spageptr;       /* next spages index */
00153     int         schunkptr;      /* next schunks index */
00154     int         schunkbit;      /* next bit to check in current schunk */
00155     TBMIterateResult output;    /* MUST BE LAST (because variable-size) */
00156 };
00157 
00158 
00159 /* Local function prototypes */
00160 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
00161 static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage,
00162                    const TIDBitmap *b);
00163 static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
00164                    BlockNumber pageno);
00165 static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
00166 static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
00167 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
00168 static void tbm_lossify(TIDBitmap *tbm);
00169 static int  tbm_comparator(const void *left, const void *right);
00170 
00171 
00172 /*
00173  * tbm_create - create an initially-empty bitmap
00174  *
00175  * The bitmap will live in the memory context that is CurrentMemoryContext
00176  * at the time of this call.  It will be limited to (approximately) maxbytes
00177  * total memory consumption.
00178  */
00179 TIDBitmap *
00180 tbm_create(long maxbytes)
00181 {
00182     TIDBitmap  *tbm;
00183     long        nbuckets;
00184 
00185     /* Create the TIDBitmap struct and zero all its fields */
00186     tbm = makeNode(TIDBitmap);
00187 
00188     tbm->mcxt = CurrentMemoryContext;
00189     tbm->status = TBM_EMPTY;
00190 
00191     /*
00192      * Estimate number of hashtable entries we can have within maxbytes. This
00193      * estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT)) plus a
00194      * pointer per hash entry, which is crude but good enough for our purpose.
00195      * Also count an extra Pointer per entry for the arrays created during
00196      * iteration readout.
00197      */
00198     nbuckets = maxbytes /
00199         (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
00200          + sizeof(Pointer) + sizeof(Pointer));
00201     nbuckets = Min(nbuckets, INT_MAX - 1);      /* safety limit */
00202     nbuckets = Max(nbuckets, 16);       /* sanity limit */
00203     tbm->maxentries = (int) nbuckets;
00204 
00205     return tbm;
00206 }
00207 
00208 /*
00209  * Actually create the hashtable.  Since this is a moderately expensive
00210  * proposition, we don't do it until we have to.
00211  */
00212 static void
00213 tbm_create_pagetable(TIDBitmap *tbm)
00214 {
00215     HASHCTL     hash_ctl;
00216 
00217     Assert(tbm->status != TBM_HASH);
00218     Assert(tbm->pagetable == NULL);
00219 
00220     /* Create the hashtable proper */
00221     MemSet(&hash_ctl, 0, sizeof(hash_ctl));
00222     hash_ctl.keysize = sizeof(BlockNumber);
00223     hash_ctl.entrysize = sizeof(PagetableEntry);
00224     hash_ctl.hash = tag_hash;
00225     hash_ctl.hcxt = tbm->mcxt;
00226     tbm->pagetable = hash_create("TIDBitmap",
00227                                  128,   /* start small and extend */
00228                                  &hash_ctl,
00229                                  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
00230 
00231     /* If entry1 is valid, push it into the hashtable */
00232     if (tbm->status == TBM_ONE_PAGE)
00233     {
00234         PagetableEntry *page;
00235         bool        found;
00236 
00237         page = (PagetableEntry *) hash_search(tbm->pagetable,
00238                                               (void *) &tbm->entry1.blockno,
00239                                               HASH_ENTER, &found);
00240         Assert(!found);
00241         memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
00242     }
00243 
00244     tbm->status = TBM_HASH;
00245 }
00246 
00247 /*
00248  * tbm_free - free a TIDBitmap
00249  */
00250 void
00251 tbm_free(TIDBitmap *tbm)
00252 {
00253     if (tbm->pagetable)
00254         hash_destroy(tbm->pagetable);
00255     if (tbm->spages)
00256         pfree(tbm->spages);
00257     if (tbm->schunks)
00258         pfree(tbm->schunks);
00259     pfree(tbm);
00260 }
00261 
00262 /*
00263  * tbm_add_tuples - add some tuple IDs to a TIDBitmap
00264  *
00265  * If recheck is true, then the recheck flag will be set in the
00266  * TBMIterateResult when any of these tuples are reported out.
00267  */
00268 void
00269 tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids,
00270                bool recheck)
00271 {
00272     int         i;
00273 
00274     Assert(!tbm->iterating);
00275     for (i = 0; i < ntids; i++)
00276     {
00277         BlockNumber blk = ItemPointerGetBlockNumber(tids + i);
00278         OffsetNumber off = ItemPointerGetOffsetNumber(tids + i);
00279         PagetableEntry *page;
00280         int         wordnum,
00281                     bitnum;
00282 
00283         /* safety check to ensure we don't overrun bit array bounds */
00284         if (off < 1 || off > MAX_TUPLES_PER_PAGE)
00285             elog(ERROR, "tuple offset out of range: %u", off);
00286 
00287         if (tbm_page_is_lossy(tbm, blk))
00288             continue;           /* whole page is already marked */
00289 
00290         page = tbm_get_pageentry(tbm, blk);
00291 
00292         if (page->ischunk)
00293         {
00294             /* The page is a lossy chunk header, set bit for itself */
00295             wordnum = bitnum = 0;
00296         }
00297         else
00298         {
00299             /* Page is exact, so set bit for individual tuple */
00300             wordnum = WORDNUM(off - 1);
00301             bitnum = BITNUM(off - 1);
00302         }
00303         page->words[wordnum] |= ((bitmapword) 1 << bitnum);
00304         page->recheck |= recheck;
00305 
00306         if (tbm->nentries > tbm->maxentries)
00307             tbm_lossify(tbm);
00308     }
00309 }
00310 
00311 /*
00312  * tbm_add_page - add a whole page to a TIDBitmap
00313  *
00314  * This causes the whole page to be reported (with the recheck flag)
00315  * when the TIDBitmap is scanned.
00316  */
00317 void
00318 tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)
00319 {
00320     /* Enter the page in the bitmap, or mark it lossy if already present */
00321     tbm_mark_page_lossy(tbm, pageno);
00322     /* If we went over the memory limit, lossify some more pages */
00323     if (tbm->nentries > tbm->maxentries)
00324         tbm_lossify(tbm);
00325 }
00326 
00327 /*
00328  * tbm_union - set union
00329  *
00330  * a is modified in-place, b is not changed
00331  */
00332 void
00333 tbm_union(TIDBitmap *a, const TIDBitmap *b)
00334 {
00335     Assert(!a->iterating);
00336     /* Nothing to do if b is empty */
00337     if (b->nentries == 0)
00338         return;
00339     /* Scan through chunks and pages in b, merge into a */
00340     if (b->status == TBM_ONE_PAGE)
00341         tbm_union_page(a, &b->entry1);
00342     else
00343     {
00344         HASH_SEQ_STATUS status;
00345         PagetableEntry *bpage;
00346 
00347         Assert(b->status == TBM_HASH);
00348         hash_seq_init(&status, b->pagetable);
00349         while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
00350             tbm_union_page(a, bpage);
00351     }
00352 }
00353 
00354 /* Process one page of b during a union op */
00355 static void
00356 tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
00357 {
00358     PagetableEntry *apage;
00359     int         wordnum;
00360 
00361     if (bpage->ischunk)
00362     {
00363         /* Scan b's chunk, mark each indicated page lossy in a */
00364         for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
00365         {
00366             bitmapword  w = bpage->words[wordnum];
00367 
00368             if (w != 0)
00369             {
00370                 BlockNumber pg;
00371 
00372                 pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
00373                 while (w != 0)
00374                 {
00375                     if (w & 1)
00376                         tbm_mark_page_lossy(a, pg);
00377                     pg++;
00378                     w >>= 1;
00379                 }
00380             }
00381         }
00382     }
00383     else if (tbm_page_is_lossy(a, bpage->blockno))
00384     {
00385         /* page is already lossy in a, nothing to do */
00386         return;
00387     }
00388     else
00389     {
00390         apage = tbm_get_pageentry(a, bpage->blockno);
00391         if (apage->ischunk)
00392         {
00393             /* The page is a lossy chunk header, set bit for itself */
00394             apage->words[0] |= ((bitmapword) 1 << 0);
00395         }
00396         else
00397         {
00398             /* Both pages are exact, merge at the bit level */
00399             for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
00400                 apage->words[wordnum] |= bpage->words[wordnum];
00401             apage->recheck |= bpage->recheck;
00402         }
00403     }
00404 
00405     if (a->nentries > a->maxentries)
00406         tbm_lossify(a);
00407 }
00408 
00409 /*
00410  * tbm_intersect - set intersection
00411  *
00412  * a is modified in-place, b is not changed
00413  */
00414 void
00415 tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
00416 {
00417     Assert(!a->iterating);
00418     /* Nothing to do if a is empty */
00419     if (a->nentries == 0)
00420         return;
00421     /* Scan through chunks and pages in a, try to match to b */
00422     if (a->status == TBM_ONE_PAGE)
00423     {
00424         if (tbm_intersect_page(a, &a->entry1, b))
00425         {
00426             /* Page is now empty, remove it from a */
00427             Assert(!a->entry1.ischunk);
00428             a->npages--;
00429             a->nentries--;
00430             Assert(a->nentries == 0);
00431             a->status = TBM_EMPTY;
00432         }
00433     }
00434     else
00435     {
00436         HASH_SEQ_STATUS status;
00437         PagetableEntry *apage;
00438 
00439         Assert(a->status == TBM_HASH);
00440         hash_seq_init(&status, a->pagetable);
00441         while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
00442         {
00443             if (tbm_intersect_page(a, apage, b))
00444             {
00445                 /* Page or chunk is now empty, remove it from a */
00446                 if (apage->ischunk)
00447                     a->nchunks--;
00448                 else
00449                     a->npages--;
00450                 a->nentries--;
00451                 if (hash_search(a->pagetable,
00452                                 (void *) &apage->blockno,
00453                                 HASH_REMOVE, NULL) == NULL)
00454                     elog(ERROR, "hash table corrupted");
00455             }
00456         }
00457     }
00458 }
00459 
00460 /*
00461  * Process one page of a during an intersection op
00462  *
00463  * Returns TRUE if apage is now empty and should be deleted from a
00464  */
00465 static bool
00466 tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, const TIDBitmap *b)
00467 {
00468     const PagetableEntry *bpage;
00469     int         wordnum;
00470 
00471     if (apage->ischunk)
00472     {
00473         /* Scan each bit in chunk, try to clear */
00474         bool        candelete = true;
00475 
00476         for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
00477         {
00478             bitmapword  w = apage->words[wordnum];
00479 
00480             if (w != 0)
00481             {
00482                 bitmapword  neww = w;
00483                 BlockNumber pg;
00484                 int         bitnum;
00485 
00486                 pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
00487                 bitnum = 0;
00488                 while (w != 0)
00489                 {
00490                     if (w & 1)
00491                     {
00492                         if (!tbm_page_is_lossy(b, pg) &&
00493                             tbm_find_pageentry(b, pg) == NULL)
00494                         {
00495                             /* Page is not in b at all, lose lossy bit */
00496                             neww &= ~((bitmapword) 1 << bitnum);
00497                         }
00498                     }
00499                     pg++;
00500                     bitnum++;
00501                     w >>= 1;
00502                 }
00503                 apage->words[wordnum] = neww;
00504                 if (neww != 0)
00505                     candelete = false;
00506             }
00507         }
00508         return candelete;
00509     }
00510     else if (tbm_page_is_lossy(b, apage->blockno))
00511     {
00512         /*
00513          * Some of the tuples in 'a' might not satisfy the quals for 'b', but
00514          * because the page 'b' is lossy, we don't know which ones. Therefore
00515          * we mark 'a' as requiring rechecks, to indicate that at most those
00516          * tuples set in 'a' are matches.
00517          */
00518         apage->recheck = true;
00519         return false;
00520     }
00521     else
00522     {
00523         bool        candelete = true;
00524 
00525         bpage = tbm_find_pageentry(b, apage->blockno);
00526         if (bpage != NULL)
00527         {
00528             /* Both pages are exact, merge at the bit level */
00529             Assert(!bpage->ischunk);
00530             for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
00531             {
00532                 apage->words[wordnum] &= bpage->words[wordnum];
00533                 if (apage->words[wordnum] != 0)
00534                     candelete = false;
00535             }
00536             apage->recheck |= bpage->recheck;
00537         }
00538         /* If there is no matching b page, we can just delete the a page */
00539         return candelete;
00540     }
00541 }
00542 
00543 /*
00544  * tbm_is_empty - is a TIDBitmap completely empty?
00545  */
00546 bool
00547 tbm_is_empty(const TIDBitmap *tbm)
00548 {
00549     return (tbm->nentries == 0);
00550 }
00551 
00552 /*
00553  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
00554  *
00555  * The TBMIterator struct is created in the caller's memory context.
00556  * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
00557  * okay to just allow the memory context to be released, too.  It is caller's
00558  * responsibility not to touch the TBMIterator anymore once the TIDBitmap
00559  * is freed.
00560  *
00561  * NB: after this is called, it is no longer allowed to modify the contents
00562  * of the bitmap.  However, you can call this multiple times to scan the
00563  * contents repeatedly, including parallel scans.
00564  */
00565 TBMIterator *
00566 tbm_begin_iterate(TIDBitmap *tbm)
00567 {
00568     TBMIterator *iterator;
00569 
00570     /*
00571      * Create the TBMIterator struct, with enough trailing space to serve the
00572      * needs of the TBMIterateResult sub-struct.
00573      */
00574     iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
00575                                  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
00576     iterator->tbm = tbm;
00577 
00578     /*
00579      * Initialize iteration pointers.
00580      */
00581     iterator->spageptr = 0;
00582     iterator->schunkptr = 0;
00583     iterator->schunkbit = 0;
00584 
00585     /*
00586      * If we have a hashtable, create and fill the sorted page lists, unless
00587      * we already did that for a previous iterator.  Note that the lists are
00588      * attached to the bitmap not the iterator, so they can be used by more
00589      * than one iterator.
00590      */
00591     if (tbm->status == TBM_HASH && !tbm->iterating)
00592     {
00593         HASH_SEQ_STATUS status;
00594         PagetableEntry *page;
00595         int         npages;
00596         int         nchunks;
00597 
00598         if (!tbm->spages && tbm->npages > 0)
00599             tbm->spages = (PagetableEntry **)
00600                 MemoryContextAlloc(tbm->mcxt,
00601                                    tbm->npages * sizeof(PagetableEntry *));
00602         if (!tbm->schunks && tbm->nchunks > 0)
00603             tbm->schunks = (PagetableEntry **)
00604                 MemoryContextAlloc(tbm->mcxt,
00605                                    tbm->nchunks * sizeof(PagetableEntry *));
00606 
00607         hash_seq_init(&status, tbm->pagetable);
00608         npages = nchunks = 0;
00609         while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
00610         {
00611             if (page->ischunk)
00612                 tbm->schunks[nchunks++] = page;
00613             else
00614                 tbm->spages[npages++] = page;
00615         }
00616         Assert(npages == tbm->npages);
00617         Assert(nchunks == tbm->nchunks);
00618         if (npages > 1)
00619             qsort(tbm->spages, npages, sizeof(PagetableEntry *),
00620                   tbm_comparator);
00621         if (nchunks > 1)
00622             qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
00623                   tbm_comparator);
00624     }
00625 
00626     tbm->iterating = true;
00627 
00628     return iterator;
00629 }
00630 
00631 /*
00632  * tbm_iterate - scan through next page of a TIDBitmap
00633  *
00634  * Returns a TBMIterateResult representing one page, or NULL if there are
00635  * no more pages to scan.  Pages are guaranteed to be delivered in numerical
00636  * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
00637  * remember the exact tuples to look at on this page --- the caller must
00638  * examine all tuples on the page and check if they meet the intended
00639  * condition.  If result->recheck is true, only the indicated tuples need
00640  * be examined, but the condition must be rechecked anyway.  (For ease of
00641  * testing, recheck is always set true when ntuples < 0.)
00642  */
00643 TBMIterateResult *
00644 tbm_iterate(TBMIterator *iterator)
00645 {
00646     TIDBitmap  *tbm = iterator->tbm;
00647     TBMIterateResult *output = &(iterator->output);
00648 
00649     Assert(tbm->iterating);
00650 
00651     /*
00652      * If lossy chunk pages remain, make sure we've advanced schunkptr/
00653      * schunkbit to the next set bit.
00654      */
00655     while (iterator->schunkptr < tbm->nchunks)
00656     {
00657         PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
00658         int         schunkbit = iterator->schunkbit;
00659 
00660         while (schunkbit < PAGES_PER_CHUNK)
00661         {
00662             int         wordnum = WORDNUM(schunkbit);
00663             int         bitnum = BITNUM(schunkbit);
00664 
00665             if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
00666                 break;
00667             schunkbit++;
00668         }
00669         if (schunkbit < PAGES_PER_CHUNK)
00670         {
00671             iterator->schunkbit = schunkbit;
00672             break;
00673         }
00674         /* advance to next chunk */
00675         iterator->schunkptr++;
00676         iterator->schunkbit = 0;
00677     }
00678 
00679     /*
00680      * If both chunk and per-page data remain, must output the numerically
00681      * earlier page.
00682      */
00683     if (iterator->schunkptr < tbm->nchunks)
00684     {
00685         PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
00686         BlockNumber chunk_blockno;
00687 
00688         chunk_blockno = chunk->blockno + iterator->schunkbit;
00689         if (iterator->spageptr >= tbm->npages ||
00690             chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
00691         {
00692             /* Return a lossy page indicator from the chunk */
00693             output->blockno = chunk_blockno;
00694             output->ntuples = -1;
00695             output->recheck = true;
00696             iterator->schunkbit++;
00697             return output;
00698         }
00699     }
00700 
00701     if (iterator->spageptr < tbm->npages)
00702     {
00703         PagetableEntry *page;
00704         int         ntuples;
00705         int         wordnum;
00706 
00707         /* In ONE_PAGE state, we don't allocate an spages[] array */
00708         if (tbm->status == TBM_ONE_PAGE)
00709             page = &tbm->entry1;
00710         else
00711             page = tbm->spages[iterator->spageptr];
00712 
00713         /* scan bitmap to extract individual offset numbers */
00714         ntuples = 0;
00715         for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
00716         {
00717             bitmapword  w = page->words[wordnum];
00718 
00719             if (w != 0)
00720             {
00721                 int         off = wordnum * BITS_PER_BITMAPWORD + 1;
00722 
00723                 while (w != 0)
00724                 {
00725                     if (w & 1)
00726                         output->offsets[ntuples++] = (OffsetNumber) off;
00727                     off++;
00728                     w >>= 1;
00729                 }
00730             }
00731         }
00732         output->blockno = page->blockno;
00733         output->ntuples = ntuples;
00734         output->recheck = page->recheck;
00735         iterator->spageptr++;
00736         return output;
00737     }
00738 
00739     /* Nothing more in the bitmap */
00740     return NULL;
00741 }
00742 
00743 /*
00744  * tbm_end_iterate - finish an iteration over a TIDBitmap
00745  *
00746  * Currently this is just a pfree, but it might do more someday.  (For
00747  * instance, it could be useful to count open iterators and allow the
00748  * bitmap to return to read/write status when there are no more iterators.)
00749  */
00750 void
00751 tbm_end_iterate(TBMIterator *iterator)
00752 {
00753     pfree(iterator);
00754 }
00755 
00756 /*
00757  * tbm_find_pageentry - find a PagetableEntry for the pageno
00758  *
00759  * Returns NULL if there is no non-lossy entry for the pageno.
00760  */
00761 static const PagetableEntry *
00762 tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
00763 {
00764     const PagetableEntry *page;
00765 
00766     if (tbm->nentries == 0)     /* in case pagetable doesn't exist */
00767         return NULL;
00768 
00769     if (tbm->status == TBM_ONE_PAGE)
00770     {
00771         page = &tbm->entry1;
00772         if (page->blockno != pageno)
00773             return NULL;
00774         Assert(!page->ischunk);
00775         return page;
00776     }
00777 
00778     page = (PagetableEntry *) hash_search(tbm->pagetable,
00779                                           (void *) &pageno,
00780                                           HASH_FIND, NULL);
00781     if (page == NULL)
00782         return NULL;
00783     if (page->ischunk)
00784         return NULL;            /* don't want a lossy chunk header */
00785     return page;
00786 }
00787 
00788 /*
00789  * tbm_get_pageentry - find or create a PagetableEntry for the pageno
00790  *
00791  * If new, the entry is marked as an exact (non-chunk) entry.
00792  *
00793  * This may cause the table to exceed the desired memory size.  It is
00794  * up to the caller to call tbm_lossify() at the next safe point if so.
00795  */
00796 static PagetableEntry *
00797 tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
00798 {
00799     PagetableEntry *page;
00800     bool        found;
00801 
00802     if (tbm->status == TBM_EMPTY)
00803     {
00804         /* Use the fixed slot */
00805         page = &tbm->entry1;
00806         found = false;
00807         tbm->status = TBM_ONE_PAGE;
00808     }
00809     else
00810     {
00811         if (tbm->status == TBM_ONE_PAGE)
00812         {
00813             page = &tbm->entry1;
00814             if (page->blockno == pageno)
00815                 return page;
00816             /* Time to switch from one page to a hashtable */
00817             tbm_create_pagetable(tbm);
00818         }
00819 
00820         /* Look up or create an entry */
00821         page = (PagetableEntry *) hash_search(tbm->pagetable,
00822                                               (void *) &pageno,
00823                                               HASH_ENTER, &found);
00824     }
00825 
00826     /* Initialize it if not present before */
00827     if (!found)
00828     {
00829         MemSet(page, 0, sizeof(PagetableEntry));
00830         page->blockno = pageno;
00831         /* must count it too */
00832         tbm->nentries++;
00833         tbm->npages++;
00834     }
00835 
00836     return page;
00837 }
00838 
00839 /*
00840  * tbm_page_is_lossy - is the page marked as lossily stored?
00841  */
00842 static bool
00843 tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
00844 {
00845     PagetableEntry *page;
00846     BlockNumber chunk_pageno;
00847     int         bitno;
00848 
00849     /* we can skip the lookup if there are no lossy chunks */
00850     if (tbm->nchunks == 0)
00851         return false;
00852     Assert(tbm->status == TBM_HASH);
00853 
00854     bitno = pageno % PAGES_PER_CHUNK;
00855     chunk_pageno = pageno - bitno;
00856     page = (PagetableEntry *) hash_search(tbm->pagetable,
00857                                           (void *) &chunk_pageno,
00858                                           HASH_FIND, NULL);
00859     if (page != NULL && page->ischunk)
00860     {
00861         int         wordnum = WORDNUM(bitno);
00862         int         bitnum = BITNUM(bitno);
00863 
00864         if ((page->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
00865             return true;
00866     }
00867     return false;
00868 }
00869 
00870 /*
00871  * tbm_mark_page_lossy - mark the page number as lossily stored
00872  *
00873  * This may cause the table to exceed the desired memory size.  It is
00874  * up to the caller to call tbm_lossify() at the next safe point if so.
00875  */
00876 static void
00877 tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
00878 {
00879     PagetableEntry *page;
00880     bool        found;
00881     BlockNumber chunk_pageno;
00882     int         bitno;
00883     int         wordnum;
00884     int         bitnum;
00885 
00886     /* We force the bitmap into hashtable mode whenever it's lossy */
00887     if (tbm->status != TBM_HASH)
00888         tbm_create_pagetable(tbm);
00889 
00890     bitno = pageno % PAGES_PER_CHUNK;
00891     chunk_pageno = pageno - bitno;
00892 
00893     /*
00894      * Remove any extant non-lossy entry for the page.  If the page is its own
00895      * chunk header, however, we skip this and handle the case below.
00896      */
00897     if (bitno != 0)
00898     {
00899         if (hash_search(tbm->pagetable,
00900                         (void *) &pageno,
00901                         HASH_REMOVE, NULL) != NULL)
00902         {
00903             /* It was present, so adjust counts */
00904             tbm->nentries--;
00905             tbm->npages--;      /* assume it must have been non-lossy */
00906         }
00907     }
00908 
00909     /* Look up or create entry for chunk-header page */
00910     page = (PagetableEntry *) hash_search(tbm->pagetable,
00911                                           (void *) &chunk_pageno,
00912                                           HASH_ENTER, &found);
00913 
00914     /* Initialize it if not present before */
00915     if (!found)
00916     {
00917         MemSet(page, 0, sizeof(PagetableEntry));
00918         page->blockno = chunk_pageno;
00919         page->ischunk = true;
00920         /* must count it too */
00921         tbm->nentries++;
00922         tbm->nchunks++;
00923     }
00924     else if (!page->ischunk)
00925     {
00926         /* chunk header page was formerly non-lossy, make it lossy */
00927         MemSet(page, 0, sizeof(PagetableEntry));
00928         page->blockno = chunk_pageno;
00929         page->ischunk = true;
00930         /* we assume it had some tuple bit(s) set, so mark it lossy */
00931         page->words[0] = ((bitmapword) 1 << 0);
00932         /* adjust counts */
00933         tbm->nchunks++;
00934         tbm->npages--;
00935     }
00936 
00937     /* Now set the original target page's bit */
00938     wordnum = WORDNUM(bitno);
00939     bitnum = BITNUM(bitno);
00940     page->words[wordnum] |= ((bitmapword) 1 << bitnum);
00941 }
00942 
00943 /*
00944  * tbm_lossify - lose some information to get back under the memory limit
00945  */
00946 static void
00947 tbm_lossify(TIDBitmap *tbm)
00948 {
00949     HASH_SEQ_STATUS status;
00950     PagetableEntry *page;
00951 
00952     /*
00953      * XXX Really stupid implementation: this just lossifies pages in
00954      * essentially random order.  We should be paying some attention to the
00955      * number of bits set in each page, instead.
00956      *
00957      * Since we are called as soon as nentries exceeds maxentries, we should
00958      * push nentries down to significantly less than maxentries, or else we'll
00959      * just end up doing this again very soon.  We shoot for maxentries/2.
00960      */
00961     Assert(!tbm->iterating);
00962     Assert(tbm->status == TBM_HASH);
00963 
00964     hash_seq_init(&status, tbm->pagetable);
00965     while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
00966     {
00967         if (page->ischunk)
00968             continue;           /* already a chunk header */
00969 
00970         /*
00971          * If the page would become a chunk header, we won't save anything by
00972          * converting it to lossy, so skip it.
00973          */
00974         if ((page->blockno % PAGES_PER_CHUNK) == 0)
00975             continue;
00976 
00977         /* This does the dirty work ... */
00978         tbm_mark_page_lossy(tbm, page->blockno);
00979 
00980         if (tbm->nentries <= tbm->maxentries / 2)
00981         {
00982             /* we have done enough */
00983             hash_seq_term(&status);
00984             break;
00985         }
00986 
00987         /*
00988          * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the
00989          * hashtable.  We can continue the same seq_search scan since we do
00990          * not care whether we visit lossy chunks or not.
00991          */
00992     }
00993 
00994     /*
00995      * With a big bitmap and small work_mem, it's possible that we cannot get
00996      * under maxentries.  Again, if that happens, we'd end up uselessly
00997      * calling tbm_lossify over and over.  To prevent this from becoming a
00998      * performance sink, force maxentries up to at least double the current
00999      * number of entries.  (In essence, we're admitting inability to fit
01000      * within work_mem when we do this.)  Note that this test will not fire if
01001      * we broke out of the loop early; and if we didn't, the current number of
01002      * entries is simply not reducible any further.
01003      */
01004     if (tbm->nentries > tbm->maxentries / 2)
01005         tbm->maxentries = Min(tbm->nentries, (INT_MAX - 1) / 2) * 2;
01006 }
01007 
01008 /*
01009  * qsort comparator to handle PagetableEntry pointers.
01010  */
01011 static int
01012 tbm_comparator(const void *left, const void *right)
01013 {
01014     BlockNumber l = (*((PagetableEntry *const *) left))->blockno;
01015     BlockNumber r = (*((PagetableEntry *const *) right))->blockno;
01016 
01017     if (l < r)
01018         return -1;
01019     else if (l > r)
01020         return 1;
01021     return 0;
01022 }