Header And Logo

PostgreSQL
| The world's most advanced open source database.

dynahash.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * dynahash.c
00004  *    dynamic hash tables
00005  *
00006  * dynahash.c supports both local-to-a-backend hash tables and hash tables in
00007  * shared memory.  For shared hash tables, it is the caller's responsibility
00008  * to provide appropriate access interlocking.  The simplest convention is
00009  * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
00010  * hash_seq_search) need only shared lock, but any update requires exclusive
00011  * lock.  For heavily-used shared tables, the single-lock approach creates a
00012  * concurrency bottleneck, so we also support "partitioned" locking wherein
00013  * there are multiple LWLocks guarding distinct subsets of the table.  To use
00014  * a hash table in partitioned mode, the HASH_PARTITION flag must be given
00015  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
00016  * Therefore, each hash bucket chain operates independently, and no fields
00017  * of the hash header change after init except nentries and freeList.
00018  * A partitioned table uses a spinlock to guard changes of those two fields.
00019  * This lets any subset of the hash buckets be treated as a separately
00020  * lockable partition.  We expect callers to use the low-order bits of a
00021  * lookup key's hash value as a partition number --- this will work because
00022  * of the way calc_bucket() maps hash values to bucket numbers.
00023  *
00024  * For hash tables in shared memory, the memory allocator function should
00025  * match malloc's semantics of returning NULL on failure.  For hash tables
00026  * in local memory, we typically use palloc() which will throw error on
00027  * failure.  The code in this file has to cope with both cases.
00028  *
00029  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00030  * Portions Copyright (c) 1994, Regents of the University of California
00031  *
00032  *
00033  * IDENTIFICATION
00034  *    src/backend/utils/hash/dynahash.c
00035  *
00036  *-------------------------------------------------------------------------
00037  */
00038 
00039 /*
00040  * Original comments:
00041  *
00042  * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
00043  * Coded into C, with minor code improvements, and with hsearch(3) interface,
00044  * by [email protected], Jul 26, 1988: 13:16;
00045  * also, hcreate/hdestroy routines added to simulate hsearch(3).
00046  *
00047  * These routines simulate hsearch(3) and family, with the important
00048  * difference that the hash table is dynamic - can grow indefinitely
00049  * beyond its original size (as supplied to hcreate()).
00050  *
00051  * Performance appears to be comparable to that of hsearch(3).
00052  * The 'source-code' options referred to in hsearch(3)'s 'man' page
00053  * are not implemented; otherwise functionality is identical.
00054  *
00055  * Compilation controls:
00056  * DEBUG controls some informative traces, mainly for debugging.
00057  * HASH_STATISTICS causes HashAccesses and HashCollisions to be maintained;
00058  * when combined with HASH_DEBUG, these are displayed by hdestroy().
00059  *
00060  * Problems & fixes to [email protected] WARNING: relies on pre-processor
00061  * concatenation property, in probably unnecessary code 'optimisation'.
00062  *
00063  * Modified [email protected] February 1990
00064  *      added multiple table interface
00065  * Modified by [email protected] April 1990
00066  *      changed ctl structure for shared memory
00067  */
00068 
00069 #include "postgres.h"
00070 
00071 #include <limits.h>
00072 
00073 #include "access/xact.h"
00074 #include "storage/shmem.h"
00075 #include "storage/spin.h"
00076 #include "utils/dynahash.h"
00077 #include "utils/memutils.h"
00078 
00079 
00080 /*
00081  * Constants
00082  *
00083  * A hash table has a top-level "directory", each of whose entries points
00084  * to a "segment" of ssize bucket headers.  The maximum number of hash
00085  * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
00086  * the number of records in the table can be larger, but we don't want a
00087  * whole lot of records per bucket or performance goes down.
00088  *
00089  * In a hash table allocated in shared memory, the directory cannot be
00090  * expanded because it must stay at a fixed address.  The directory size
00091  * should be selected using hash_select_dirsize (and you'd better have
00092  * a good idea of the maximum number of entries!).  For non-shared hash
00093  * tables, the initial directory size can be left at the default.
00094  */
00095 #define DEF_SEGSIZE            256
00096 #define DEF_SEGSIZE_SHIFT      8    /* must be log2(DEF_SEGSIZE) */
00097 #define DEF_DIRSIZE            256
00098 #define DEF_FFACTOR            1    /* default fill factor */
00099 
00100 
00101 /* A hash bucket is a linked list of HASHELEMENTs */
00102 typedef HASHELEMENT *HASHBUCKET;
00103 
00104 /* A hash segment is an array of bucket headers */
00105 typedef HASHBUCKET *HASHSEGMENT;
00106 
00107 /*
00108  * Header structure for a hash table --- contains all changeable info
00109  *
00110  * In a shared-memory hash table, the HASHHDR is in shared memory, while
00111  * each backend has a local HTAB struct.  For a non-shared table, there isn't
00112  * any functional difference between HASHHDR and HTAB, but we separate them
00113  * anyway to share code between shared and non-shared tables.
00114  */
00115 struct HASHHDR
00116 {
00117     /* In a partitioned table, take this lock to touch nentries or freeList */
00118     slock_t     mutex;          /* unused if not partitioned table */
00119 
00120     /* These fields change during entry addition/deletion */
00121     long        nentries;       /* number of entries in hash table */
00122     HASHELEMENT *freeList;      /* linked list of free elements */
00123 
00124     /* These fields can change, but not in a partitioned table */
00125     /* Also, dsize can't change in a shared table, even if unpartitioned */
00126     long        dsize;          /* directory size */
00127     long        nsegs;          /* number of allocated segments (<= dsize) */
00128     uint32      max_bucket;     /* ID of maximum bucket in use */
00129     uint32      high_mask;      /* mask to modulo into entire table */
00130     uint32      low_mask;       /* mask to modulo into lower half of table */
00131 
00132     /* These fields are fixed at hashtable creation */
00133     Size        keysize;        /* hash key length in bytes */
00134     Size        entrysize;      /* total user element size in bytes */
00135     long        num_partitions; /* # partitions (must be power of 2), or 0 */
00136     long        ffactor;        /* target fill factor */
00137     long        max_dsize;      /* 'dsize' limit if directory is fixed size */
00138     long        ssize;          /* segment size --- must be power of 2 */
00139     int         sshift;         /* segment shift = log2(ssize) */
00140     int         nelem_alloc;    /* number of entries to allocate at once */
00141 
00142 #ifdef HASH_STATISTICS
00143 
00144     /*
00145      * Count statistics here.  NB: stats code doesn't bother with mutex, so
00146      * counts could be corrupted a bit in a partitioned table.
00147      */
00148     long        accesses;
00149     long        collisions;
00150 #endif
00151 };
00152 
00153 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
00154 
00155 /*
00156  * Top control structure for a hashtable --- in a shared table, each backend
00157  * has its own copy (OK since no fields change at runtime)
00158  */
00159 struct HTAB
00160 {
00161     HASHHDR    *hctl;           /* => shared control information */
00162     HASHSEGMENT *dir;           /* directory of segment starts */
00163     HashValueFunc hash;         /* hash function */
00164     HashCompareFunc match;      /* key comparison function */
00165     HashCopyFunc keycopy;       /* key copying function */
00166     HashAllocFunc alloc;        /* memory allocator */
00167     MemoryContext hcxt;         /* memory context if default allocator used */
00168     char       *tabname;        /* table name (for error messages) */
00169     bool        isshared;       /* true if table is in shared memory */
00170     bool        isfixed;        /* if true, don't enlarge */
00171 
00172     /* freezing a shared table isn't allowed, so we can keep state here */
00173     bool        frozen;         /* true = no more inserts allowed */
00174 
00175     /* We keep local copies of these fixed values to reduce contention */
00176     Size        keysize;        /* hash key length in bytes */
00177     long        ssize;          /* segment size --- must be power of 2 */
00178     int         sshift;         /* segment shift = log2(ssize) */
00179 };
00180 
00181 /*
00182  * Key (also entry) part of a HASHELEMENT
00183  */
00184 #define ELEMENTKEY(helem)  (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
00185 
00186 /*
00187  * Obtain element pointer given pointer to key
00188  */
00189 #define ELEMENT_FROM_KEY(key)  \
00190     ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
00191 
00192 /*
00193  * Fast MOD arithmetic, assuming that y is a power of 2 !
00194  */
00195 #define MOD(x,y)               ((x) & ((y)-1))
00196 
00197 #if HASH_STATISTICS
00198 static long hash_accesses,
00199             hash_collisions,
00200             hash_expansions;
00201 #endif
00202 
00203 /*
00204  * Private function prototypes
00205  */
00206 static void *DynaHashAlloc(Size size);
00207 static HASHSEGMENT seg_alloc(HTAB *hashp);
00208 static bool element_alloc(HTAB *hashp, int nelem);
00209 static bool dir_realloc(HTAB *hashp);
00210 static bool expand_table(HTAB *hashp);
00211 static HASHBUCKET get_hash_entry(HTAB *hashp);
00212 static void hdefault(HTAB *hashp);
00213 static int  choose_nelem_alloc(Size entrysize);
00214 static bool init_htab(HTAB *hashp, long nelem);
00215 static void hash_corrupted(HTAB *hashp);
00216 static long next_pow2_long(long num);
00217 static int  next_pow2_int(long num);
00218 static void register_seq_scan(HTAB *hashp);
00219 static void deregister_seq_scan(HTAB *hashp);
00220 static bool has_seq_scans(HTAB *hashp);
00221 
00222 
00223 /*
00224  * memory allocation support
00225  */
00226 static MemoryContext CurrentDynaHashCxt = NULL;
00227 
00228 static void *
00229 DynaHashAlloc(Size size)
00230 {
00231     Assert(MemoryContextIsValid(CurrentDynaHashCxt));
00232     return MemoryContextAlloc(CurrentDynaHashCxt, size);
00233 }
00234 
00235 
00236 /*
00237  * HashCompareFunc for string keys
00238  *
00239  * Because we copy keys with strlcpy(), they will be truncated at keysize-1
00240  * bytes, so we can only compare that many ... hence strncmp is almost but
00241  * not quite the right thing.
00242  */
00243 static int
00244 string_compare(const char *key1, const char *key2, Size keysize)
00245 {
00246     return strncmp(key1, key2, keysize - 1);
00247 }
00248 
00249 
00250 /************************** CREATE ROUTINES **********************/
00251 
00252 /*
00253  * hash_create -- create a new dynamic hash table
00254  *
00255  *  tabname: a name for the table (for debugging purposes)
00256  *  nelem: maximum number of elements expected
00257  *  *info: additional table parameters, as indicated by flags
00258  *  flags: bitmask indicating which parameters to take from *info
00259  *
00260  * Note: for a shared-memory hashtable, nelem needs to be a pretty good
00261  * estimate, since we can't expand the table on the fly.  But an unshared
00262  * hashtable can be expanded on-the-fly, so it's better for nelem to be
00263  * on the small side and let the table grow if it's exceeded.  An overly
00264  * large nelem will penalize hash_seq_search speed without buying much.
00265  */
00266 HTAB *
00267 hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
00268 {
00269     HTAB       *hashp;
00270     HASHHDR    *hctl;
00271 
00272     /*
00273      * For shared hash tables, we have a local hash header (HTAB struct) that
00274      * we allocate in TopMemoryContext; all else is in shared memory.
00275      *
00276      * For non-shared hash tables, everything including the hash header is in
00277      * a memory context created specially for the hash table --- this makes
00278      * hash_destroy very simple.  The memory context is made a child of either
00279      * a context specified by the caller, or TopMemoryContext if nothing is
00280      * specified.
00281      */
00282     if (flags & HASH_SHARED_MEM)
00283     {
00284         /* Set up to allocate the hash header */
00285         CurrentDynaHashCxt = TopMemoryContext;
00286     }
00287     else
00288     {
00289         /* Create the hash table's private memory context */
00290         if (flags & HASH_CONTEXT)
00291             CurrentDynaHashCxt = info->hcxt;
00292         else
00293             CurrentDynaHashCxt = TopMemoryContext;
00294         CurrentDynaHashCxt = AllocSetContextCreate(CurrentDynaHashCxt,
00295                                                    tabname,
00296                                                    ALLOCSET_DEFAULT_MINSIZE,
00297                                                    ALLOCSET_DEFAULT_INITSIZE,
00298                                                    ALLOCSET_DEFAULT_MAXSIZE);
00299     }
00300 
00301     /* Initialize the hash header, plus a copy of the table name */
00302     hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) +1);
00303     MemSet(hashp, 0, sizeof(HTAB));
00304 
00305     hashp->tabname = (char *) (hashp + 1);
00306     strcpy(hashp->tabname, tabname);
00307 
00308     if (flags & HASH_FUNCTION)
00309         hashp->hash = info->hash;
00310     else
00311         hashp->hash = string_hash;      /* default hash function */
00312 
00313     /*
00314      * If you don't specify a match function, it defaults to string_compare if
00315      * you used string_hash (either explicitly or by default) and to memcmp
00316      * otherwise.  (Prior to PostgreSQL 7.4, memcmp was always used.)
00317      */
00318     if (flags & HASH_COMPARE)
00319         hashp->match = info->match;
00320     else if (hashp->hash == string_hash)
00321         hashp->match = (HashCompareFunc) string_compare;
00322     else
00323         hashp->match = memcmp;
00324 
00325     /*
00326      * Similarly, the key-copying function defaults to strlcpy or memcpy.
00327      */
00328     if (flags & HASH_KEYCOPY)
00329         hashp->keycopy = info->keycopy;
00330     else if (hashp->hash == string_hash)
00331         hashp->keycopy = (HashCopyFunc) strlcpy;
00332     else
00333         hashp->keycopy = memcpy;
00334 
00335     if (flags & HASH_ALLOC)
00336         hashp->alloc = info->alloc;
00337     else
00338         hashp->alloc = DynaHashAlloc;
00339 
00340     if (flags & HASH_SHARED_MEM)
00341     {
00342         /*
00343          * ctl structure and directory are preallocated for shared memory
00344          * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
00345          * well.
00346          */
00347         hashp->hctl = info->hctl;
00348         hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
00349         hashp->hcxt = NULL;
00350         hashp->isshared = true;
00351 
00352         /* hash table already exists, we're just attaching to it */
00353         if (flags & HASH_ATTACH)
00354         {
00355             /* make local copies of some heavily-used values */
00356             hctl = hashp->hctl;
00357             hashp->keysize = hctl->keysize;
00358             hashp->ssize = hctl->ssize;
00359             hashp->sshift = hctl->sshift;
00360 
00361             return hashp;
00362         }
00363     }
00364     else
00365     {
00366         /* setup hash table defaults */
00367         hashp->hctl = NULL;
00368         hashp->dir = NULL;
00369         hashp->hcxt = CurrentDynaHashCxt;
00370         hashp->isshared = false;
00371     }
00372 
00373     if (!hashp->hctl)
00374     {
00375         hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
00376         if (!hashp->hctl)
00377             ereport(ERROR,
00378                     (errcode(ERRCODE_OUT_OF_MEMORY),
00379                      errmsg("out of memory")));
00380     }
00381 
00382     hashp->frozen = false;
00383 
00384     hdefault(hashp);
00385 
00386     hctl = hashp->hctl;
00387 
00388     if (flags & HASH_PARTITION)
00389     {
00390         /* Doesn't make sense to partition a local hash table */
00391         Assert(flags & HASH_SHARED_MEM);
00392 
00393         /*
00394          * The number of partitions had better be a power of 2. Also, it must
00395          * be less than INT_MAX (see init_htab()), so call the int version of
00396          * next_pow2.
00397          */
00398         Assert(info->num_partitions == next_pow2_int(info->num_partitions));
00399 
00400         hctl->num_partitions = info->num_partitions;
00401     }
00402 
00403     if (flags & HASH_SEGMENT)
00404     {
00405         hctl->ssize = info->ssize;
00406         hctl->sshift = my_log2(info->ssize);
00407         /* ssize had better be a power of 2 */
00408         Assert(hctl->ssize == (1L << hctl->sshift));
00409     }
00410     if (flags & HASH_FFACTOR)
00411         hctl->ffactor = info->ffactor;
00412 
00413     /*
00414      * SHM hash tables have fixed directory size passed by the caller.
00415      */
00416     if (flags & HASH_DIRSIZE)
00417     {
00418         hctl->max_dsize = info->max_dsize;
00419         hctl->dsize = info->dsize;
00420     }
00421 
00422     /*
00423      * hash table now allocates space for key and data but you have to say how
00424      * much space to allocate
00425      */
00426     if (flags & HASH_ELEM)
00427     {
00428         Assert(info->entrysize >= info->keysize);
00429         hctl->keysize = info->keysize;
00430         hctl->entrysize = info->entrysize;
00431     }
00432 
00433     /* make local copies of heavily-used constant fields */
00434     hashp->keysize = hctl->keysize;
00435     hashp->ssize = hctl->ssize;
00436     hashp->sshift = hctl->sshift;
00437 
00438     /* Build the hash directory structure */
00439     if (!init_htab(hashp, nelem))
00440         elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
00441 
00442     /*
00443      * For a shared hash table, preallocate the requested number of elements.
00444      * This reduces problems with run-time out-of-shared-memory conditions.
00445      *
00446      * For a non-shared hash table, preallocate the requested number of
00447      * elements if it's less than our chosen nelem_alloc.  This avoids wasting
00448      * space if the caller correctly estimates a small table size.
00449      */
00450     if ((flags & HASH_SHARED_MEM) ||
00451         nelem < hctl->nelem_alloc)
00452     {
00453         if (!element_alloc(hashp, (int) nelem))
00454             ereport(ERROR,
00455                     (errcode(ERRCODE_OUT_OF_MEMORY),
00456                      errmsg("out of memory")));
00457     }
00458 
00459     if (flags & HASH_FIXED_SIZE)
00460         hashp->isfixed = true;
00461     return hashp;
00462 }
00463 
00464 /*
00465  * Set default HASHHDR parameters.
00466  */
00467 static void
00468 hdefault(HTAB *hashp)
00469 {
00470     HASHHDR    *hctl = hashp->hctl;
00471 
00472     MemSet(hctl, 0, sizeof(HASHHDR));
00473 
00474     hctl->nentries = 0;
00475     hctl->freeList = NULL;
00476 
00477     hctl->dsize = DEF_DIRSIZE;
00478     hctl->nsegs = 0;
00479 
00480     /* rather pointless defaults for key & entry size */
00481     hctl->keysize = sizeof(char *);
00482     hctl->entrysize = 2 * sizeof(char *);
00483 
00484     hctl->num_partitions = 0;   /* not partitioned */
00485 
00486     hctl->ffactor = DEF_FFACTOR;
00487 
00488     /* table has no fixed maximum size */
00489     hctl->max_dsize = NO_MAX_DSIZE;
00490 
00491     hctl->ssize = DEF_SEGSIZE;
00492     hctl->sshift = DEF_SEGSIZE_SHIFT;
00493 
00494 #ifdef HASH_STATISTICS
00495     hctl->accesses = hctl->collisions = 0;
00496 #endif
00497 }
00498 
00499 /*
00500  * Given the user-specified entry size, choose nelem_alloc, ie, how many
00501  * elements to add to the hash table when we need more.
00502  */
00503 static int
00504 choose_nelem_alloc(Size entrysize)
00505 {
00506     int         nelem_alloc;
00507     Size        elementSize;
00508     Size        allocSize;
00509 
00510     /* Each element has a HASHELEMENT header plus user data. */
00511     /* NB: this had better match element_alloc() */
00512     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
00513 
00514     /*
00515      * The idea here is to choose nelem_alloc at least 32, but round up so
00516      * that the allocation request will be a power of 2 or just less. This
00517      * makes little difference for hash tables in shared memory, but for hash
00518      * tables managed by palloc, the allocation request will be rounded up to
00519      * a power of 2 anyway.  If we fail to take this into account, we'll waste
00520      * as much as half the allocated space.
00521      */
00522     allocSize = 32 * 4;         /* assume elementSize at least 8 */
00523     do
00524     {
00525         allocSize <<= 1;
00526         nelem_alloc = allocSize / elementSize;
00527     } while (nelem_alloc < 32);
00528 
00529     return nelem_alloc;
00530 }
00531 
00532 /*
00533  * Compute derived fields of hctl and build the initial directory/segment
00534  * arrays
00535  */
00536 static bool
00537 init_htab(HTAB *hashp, long nelem)
00538 {
00539     HASHHDR    *hctl = hashp->hctl;
00540     HASHSEGMENT *segp;
00541     int         nbuckets;
00542     int         nsegs;
00543 
00544     /*
00545      * initialize mutex if it's a partitioned table
00546      */
00547     if (IS_PARTITIONED(hctl))
00548         SpinLockInit(&hctl->mutex);
00549 
00550     /*
00551      * Divide number of elements by the fill factor to determine a desired
00552      * number of buckets.  Allocate space for the next greater power of two
00553      * number of buckets
00554      */
00555     nbuckets = next_pow2_int((nelem - 1) / hctl->ffactor + 1);
00556 
00557     /*
00558      * In a partitioned table, nbuckets must be at least equal to
00559      * num_partitions; were it less, keys with apparently different partition
00560      * numbers would map to the same bucket, breaking partition independence.
00561      * (Normally nbuckets will be much bigger; this is just a safety check.)
00562      */
00563     while (nbuckets < hctl->num_partitions)
00564         nbuckets <<= 1;
00565 
00566     hctl->max_bucket = hctl->low_mask = nbuckets - 1;
00567     hctl->high_mask = (nbuckets << 1) - 1;
00568 
00569     /*
00570      * Figure number of directory segments needed, round up to a power of 2
00571      */
00572     nsegs = (nbuckets - 1) / hctl->ssize + 1;
00573     nsegs = next_pow2_int(nsegs);
00574 
00575     /*
00576      * Make sure directory is big enough. If pre-allocated directory is too
00577      * small, choke (caller screwed up).
00578      */
00579     if (nsegs > hctl->dsize)
00580     {
00581         if (!(hashp->dir))
00582             hctl->dsize = nsegs;
00583         else
00584             return false;
00585     }
00586 
00587     /* Allocate a directory */
00588     if (!(hashp->dir))
00589     {
00590         CurrentDynaHashCxt = hashp->hcxt;
00591         hashp->dir = (HASHSEGMENT *)
00592             hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
00593         if (!hashp->dir)
00594             return false;
00595     }
00596 
00597     /* Allocate initial segments */
00598     for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
00599     {
00600         *segp = seg_alloc(hashp);
00601         if (*segp == NULL)
00602             return false;
00603     }
00604 
00605     /* Choose number of entries to allocate at a time */
00606     hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
00607 
00608 #if HASH_DEBUG
00609     fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n%s%ld\n",
00610             "TABLE POINTER   ", hashp,
00611             "DIRECTORY SIZE  ", hctl->dsize,
00612             "SEGMENT SIZE    ", hctl->ssize,
00613             "SEGMENT SHIFT   ", hctl->sshift,
00614             "FILL FACTOR     ", hctl->ffactor,
00615             "MAX BUCKET      ", hctl->max_bucket,
00616             "HIGH MASK       ", hctl->high_mask,
00617             "LOW  MASK       ", hctl->low_mask,
00618             "NSEGS           ", hctl->nsegs,
00619             "NENTRIES        ", hctl->nentries);
00620 #endif
00621     return true;
00622 }
00623 
00624 /*
00625  * Estimate the space needed for a hashtable containing the given number
00626  * of entries of given size.
00627  * NOTE: this is used to estimate the footprint of hashtables in shared
00628  * memory; therefore it does not count HTAB which is in local memory.
00629  * NB: assumes that all hash structure parameters have default values!
00630  */
00631 Size
00632 hash_estimate_size(long num_entries, Size entrysize)
00633 {
00634     Size        size;
00635     long        nBuckets,
00636                 nSegments,
00637                 nDirEntries,
00638                 nElementAllocs,
00639                 elementSize,
00640                 elementAllocCnt;
00641 
00642     /* estimate number of buckets wanted */
00643     nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
00644     /* # of segments needed for nBuckets */
00645     nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
00646     /* directory entries */
00647     nDirEntries = DEF_DIRSIZE;
00648     while (nDirEntries < nSegments)
00649         nDirEntries <<= 1;      /* dir_alloc doubles dsize at each call */
00650 
00651     /* fixed control info */
00652     size = MAXALIGN(sizeof(HASHHDR));   /* but not HTAB, per above */
00653     /* directory */
00654     size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
00655     /* segments */
00656     size = add_size(size, mul_size(nSegments,
00657                                 MAXALIGN(DEF_SEGSIZE * sizeof(HASHBUCKET))));
00658     /* elements --- allocated in groups of choose_nelem_alloc() entries */
00659     elementAllocCnt = choose_nelem_alloc(entrysize);
00660     nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
00661     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
00662     size = add_size(size,
00663                     mul_size(nElementAllocs,
00664                              mul_size(elementAllocCnt, elementSize)));
00665 
00666     return size;
00667 }
00668 
00669 /*
00670  * Select an appropriate directory size for a hashtable with the given
00671  * maximum number of entries.
00672  * This is only needed for hashtables in shared memory, whose directories
00673  * cannot be expanded dynamically.
00674  * NB: assumes that all hash structure parameters have default values!
00675  *
00676  * XXX this had better agree with the behavior of init_htab()...
00677  */
00678 long
00679 hash_select_dirsize(long num_entries)
00680 {
00681     long        nBuckets,
00682                 nSegments,
00683                 nDirEntries;
00684 
00685     /* estimate number of buckets wanted */
00686     nBuckets = next_pow2_long((num_entries - 1) / DEF_FFACTOR + 1);
00687     /* # of segments needed for nBuckets */
00688     nSegments = next_pow2_long((nBuckets - 1) / DEF_SEGSIZE + 1);
00689     /* directory entries */
00690     nDirEntries = DEF_DIRSIZE;
00691     while (nDirEntries < nSegments)
00692         nDirEntries <<= 1;      /* dir_alloc doubles dsize at each call */
00693 
00694     return nDirEntries;
00695 }
00696 
00697 /*
00698  * Compute the required initial memory allocation for a shared-memory
00699  * hashtable with the given parameters.  We need space for the HASHHDR
00700  * and for the (non expansible) directory.
00701  */
00702 Size
00703 hash_get_shared_size(HASHCTL *info, int flags)
00704 {
00705     Assert(flags & HASH_DIRSIZE);
00706     Assert(info->dsize == info->max_dsize);
00707     return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
00708 }
00709 
00710 
00711 /********************** DESTROY ROUTINES ************************/
00712 
00713 void
00714 hash_destroy(HTAB *hashp)
00715 {
00716     if (hashp != NULL)
00717     {
00718         /* allocation method must be one we know how to free, too */
00719         Assert(hashp->alloc == DynaHashAlloc);
00720         /* so this hashtable must have it's own context */
00721         Assert(hashp->hcxt != NULL);
00722 
00723         hash_stats("destroy", hashp);
00724 
00725         /*
00726          * Free everything by destroying the hash table's memory context.
00727          */
00728         MemoryContextDelete(hashp->hcxt);
00729     }
00730 }
00731 
00732 void
00733 hash_stats(const char *where, HTAB *hashp)
00734 {
00735 #if HASH_STATISTICS
00736     fprintf(stderr, "%s: this HTAB -- accesses %ld collisions %ld\n",
00737             where, hashp->hctl->accesses, hashp->hctl->collisions);
00738 
00739     fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
00740             hashp->hctl->nentries, (long) hashp->hctl->keysize,
00741             hashp->hctl->max_bucket, hashp->hctl->nsegs);
00742     fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
00743             where, hash_accesses, hash_collisions);
00744     fprintf(stderr, "hash_stats: total expansions %ld\n",
00745             hash_expansions);
00746 #endif
00747 }
00748 
00749 /*******************************SEARCH ROUTINES *****************************/
00750 
00751 
00752 /*
00753  * get_hash_value -- exported routine to calculate a key's hash value
00754  *
00755  * We export this because for partitioned tables, callers need to compute
00756  * the partition number (from the low-order bits of the hash value) before
00757  * searching.
00758  */
00759 uint32
00760 get_hash_value(HTAB *hashp, const void *keyPtr)
00761 {
00762     return hashp->hash(keyPtr, hashp->keysize);
00763 }
00764 
00765 /* Convert a hash value to a bucket number */
00766 static inline uint32
00767 calc_bucket(HASHHDR *hctl, uint32 hash_val)
00768 {
00769     uint32      bucket;
00770 
00771     bucket = hash_val & hctl->high_mask;
00772     if (bucket > hctl->max_bucket)
00773         bucket = bucket & hctl->low_mask;
00774 
00775     return bucket;
00776 }
00777 
00778 /*
00779  * hash_search -- look up key in table and perform action
00780  * hash_search_with_hash_value -- same, with key's hash value already computed
00781  *
00782  * action is one of:
00783  *      HASH_FIND: look up key in table
00784  *      HASH_ENTER: look up key in table, creating entry if not present
00785  *      HASH_ENTER_NULL: same, but return NULL if out of memory
00786  *      HASH_REMOVE: look up key in table, remove entry if present
00787  *
00788  * Return value is a pointer to the element found/entered/removed if any,
00789  * or NULL if no match was found.  (NB: in the case of the REMOVE action,
00790  * the result is a dangling pointer that shouldn't be dereferenced!)
00791  *
00792  * HASH_ENTER will normally ereport a generic "out of memory" error if
00793  * it is unable to create a new entry.  The HASH_ENTER_NULL operation is
00794  * the same except it will return NULL if out of memory.  Note that
00795  * HASH_ENTER_NULL cannot be used with the default palloc-based allocator,
00796  * since palloc internally ereports on out-of-memory.
00797  *
00798  * If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
00799  * existing entry in the table, FALSE otherwise.  This is needed in the
00800  * HASH_ENTER case, but is redundant with the return value otherwise.
00801  *
00802  * For hash_search_with_hash_value, the hashvalue parameter must have been
00803  * calculated with get_hash_value().
00804  */
00805 void *
00806 hash_search(HTAB *hashp,
00807             const void *keyPtr,
00808             HASHACTION action,
00809             bool *foundPtr)
00810 {
00811     return hash_search_with_hash_value(hashp,
00812                                        keyPtr,
00813                                        hashp->hash(keyPtr, hashp->keysize),
00814                                        action,
00815                                        foundPtr);
00816 }
00817 
00818 void *
00819 hash_search_with_hash_value(HTAB *hashp,
00820                             const void *keyPtr,
00821                             uint32 hashvalue,
00822                             HASHACTION action,
00823                             bool *foundPtr)
00824 {
00825     HASHHDR    *hctl = hashp->hctl;
00826     Size        keysize;
00827     uint32      bucket;
00828     long        segment_num;
00829     long        segment_ndx;
00830     HASHSEGMENT segp;
00831     HASHBUCKET  currBucket;
00832     HASHBUCKET *prevBucketPtr;
00833     HashCompareFunc match;
00834 
00835 #if HASH_STATISTICS
00836     hash_accesses++;
00837     hctl->accesses++;
00838 #endif
00839 
00840     /*
00841      * If inserting, check if it is time to split a bucket.
00842      *
00843      * NOTE: failure to expand table is not a fatal error, it just means we
00844      * have to run at higher fill factor than we wanted.  However, if we're
00845      * using the palloc allocator then it will throw error anyway on
00846      * out-of-memory, so we must do this before modifying the table.
00847      */
00848     if (action == HASH_ENTER || action == HASH_ENTER_NULL)
00849     {
00850         /*
00851          * Can't split if running in partitioned mode, nor if frozen, nor if
00852          * table is the subject of any active hash_seq_search scans.  Strange
00853          * order of these tests is to try to check cheaper conditions first.
00854          */
00855         if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
00856             hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
00857             !has_seq_scans(hashp))
00858             (void) expand_table(hashp);
00859     }
00860 
00861     /*
00862      * Do the initial lookup
00863      */
00864     bucket = calc_bucket(hctl, hashvalue);
00865 
00866     segment_num = bucket >> hashp->sshift;
00867     segment_ndx = MOD(bucket, hashp->ssize);
00868 
00869     segp = hashp->dir[segment_num];
00870 
00871     if (segp == NULL)
00872         hash_corrupted(hashp);
00873 
00874     prevBucketPtr = &segp[segment_ndx];
00875     currBucket = *prevBucketPtr;
00876 
00877     /*
00878      * Follow collision chain looking for matching key
00879      */
00880     match = hashp->match;       /* save one fetch in inner loop */
00881     keysize = hashp->keysize;   /* ditto */
00882 
00883     while (currBucket != NULL)
00884     {
00885         if (currBucket->hashvalue == hashvalue &&
00886             match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
00887             break;
00888         prevBucketPtr = &(currBucket->link);
00889         currBucket = *prevBucketPtr;
00890 #if HASH_STATISTICS
00891         hash_collisions++;
00892         hctl->collisions++;
00893 #endif
00894     }
00895 
00896     if (foundPtr)
00897         *foundPtr = (bool) (currBucket != NULL);
00898 
00899     /*
00900      * OK, now what?
00901      */
00902     switch (action)
00903     {
00904         case HASH_FIND:
00905             if (currBucket != NULL)
00906                 return (void *) ELEMENTKEY(currBucket);
00907             return NULL;
00908 
00909         case HASH_REMOVE:
00910             if (currBucket != NULL)
00911             {
00912                 /* use volatile pointer to prevent code rearrangement */
00913                 volatile HASHHDR *hctlv = hctl;
00914 
00915                 /* if partitioned, must lock to touch nentries and freeList */
00916                 if (IS_PARTITIONED(hctlv))
00917                     SpinLockAcquire(&hctlv->mutex);
00918 
00919                 Assert(hctlv->nentries > 0);
00920                 hctlv->nentries--;
00921 
00922                 /* remove record from hash bucket's chain. */
00923                 *prevBucketPtr = currBucket->link;
00924 
00925                 /* add the record to the freelist for this table.  */
00926                 currBucket->link = hctlv->freeList;
00927                 hctlv->freeList = currBucket;
00928 
00929                 if (IS_PARTITIONED(hctlv))
00930                     SpinLockRelease(&hctlv->mutex);
00931 
00932                 /*
00933                  * better hope the caller is synchronizing access to this
00934                  * element, because someone else is going to reuse it the next
00935                  * time something is added to the table
00936                  */
00937                 return (void *) ELEMENTKEY(currBucket);
00938             }
00939             return NULL;
00940 
00941         case HASH_ENTER_NULL:
00942             /* ENTER_NULL does not work with palloc-based allocator */
00943             Assert(hashp->alloc != DynaHashAlloc);
00944             /* FALL THRU */
00945 
00946         case HASH_ENTER:
00947             /* Return existing element if found, else create one */
00948             if (currBucket != NULL)
00949                 return (void *) ELEMENTKEY(currBucket);
00950 
00951             /* disallow inserts if frozen */
00952             if (hashp->frozen)
00953                 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
00954                      hashp->tabname);
00955 
00956             currBucket = get_hash_entry(hashp);
00957             if (currBucket == NULL)
00958             {
00959                 /* out of memory */
00960                 if (action == HASH_ENTER_NULL)
00961                     return NULL;
00962                 /* report a generic message */
00963                 if (hashp->isshared)
00964                     ereport(ERROR,
00965                             (errcode(ERRCODE_OUT_OF_MEMORY),
00966                              errmsg("out of shared memory")));
00967                 else
00968                     ereport(ERROR,
00969                             (errcode(ERRCODE_OUT_OF_MEMORY),
00970                              errmsg("out of memory")));
00971             }
00972 
00973             /* link into hashbucket chain */
00974             *prevBucketPtr = currBucket;
00975             currBucket->link = NULL;
00976 
00977             /* copy key into record */
00978             currBucket->hashvalue = hashvalue;
00979             hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
00980 
00981             /*
00982              * Caller is expected to fill the data field on return.  DO NOT
00983              * insert any code that could possibly throw error here, as doing
00984              * so would leave the table entry incomplete and hence corrupt the
00985              * caller's data structure.
00986              */
00987 
00988             return (void *) ELEMENTKEY(currBucket);
00989     }
00990 
00991     elog(ERROR, "unrecognized hash action code: %d", (int) action);
00992 
00993     return NULL;                /* keep compiler quiet */
00994 }
00995 
00996 /*
00997  * hash_update_hash_key -- change the hash key of an existing table entry
00998  *
00999  * This is equivalent to removing the entry, making a new entry, and copying
01000  * over its data, except that the entry never goes to the table's freelist.
01001  * Therefore this cannot suffer an out-of-memory failure, even if there are
01002  * other processes operating in other partitions of the hashtable.
01003  *
01004  * Returns TRUE if successful, FALSE if the requested new hash key is already
01005  * present.  Throws error if the specified entry pointer isn't actually a
01006  * table member.
01007  *
01008  * NB: currently, there is no special case for old and new hash keys being
01009  * identical, which means we'll report FALSE for that situation.  This is
01010  * preferable for existing uses.
01011  *
01012  * NB: for a partitioned hashtable, caller must hold lock on both relevant
01013  * partitions, if the new hash key would belong to a different partition.
01014  */
01015 bool
01016 hash_update_hash_key(HTAB *hashp,
01017                      void *existingEntry,
01018                      const void *newKeyPtr)
01019 {
01020     HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
01021     HASHHDR    *hctl = hashp->hctl;
01022     uint32      newhashvalue;
01023     Size        keysize;
01024     uint32      bucket;
01025     uint32      newbucket;
01026     long        segment_num;
01027     long        segment_ndx;
01028     HASHSEGMENT segp;
01029     HASHBUCKET  currBucket;
01030     HASHBUCKET *prevBucketPtr;
01031     HASHBUCKET *oldPrevPtr;
01032     HashCompareFunc match;
01033 
01034 #if HASH_STATISTICS
01035     hash_accesses++;
01036     hctl->accesses++;
01037 #endif
01038 
01039     /* disallow updates if frozen */
01040     if (hashp->frozen)
01041         elog(ERROR, "cannot update in frozen hashtable \"%s\"",
01042              hashp->tabname);
01043 
01044     /*
01045      * Lookup the existing element using its saved hash value.  We need to
01046      * do this to be able to unlink it from its hash chain, but as a side
01047      * benefit we can verify the validity of the passed existingEntry pointer.
01048      */
01049     bucket = calc_bucket(hctl, existingElement->hashvalue);
01050 
01051     segment_num = bucket >> hashp->sshift;
01052     segment_ndx = MOD(bucket, hashp->ssize);
01053 
01054     segp = hashp->dir[segment_num];
01055 
01056     if (segp == NULL)
01057         hash_corrupted(hashp);
01058 
01059     prevBucketPtr = &segp[segment_ndx];
01060     currBucket = *prevBucketPtr;
01061 
01062     while (currBucket != NULL)
01063     {
01064         if (currBucket == existingElement)
01065             break;
01066         prevBucketPtr = &(currBucket->link);
01067         currBucket = *prevBucketPtr;
01068     }
01069 
01070     if (currBucket == NULL)
01071         elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
01072              hashp->tabname);
01073 
01074     oldPrevPtr = prevBucketPtr;
01075 
01076     /*
01077      * Now perform the equivalent of a HASH_ENTER operation to locate the
01078      * hash chain we want to put the entry into.
01079      */
01080     newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
01081 
01082     newbucket = calc_bucket(hctl, newhashvalue);
01083 
01084     segment_num = newbucket >> hashp->sshift;
01085     segment_ndx = MOD(newbucket, hashp->ssize);
01086 
01087     segp = hashp->dir[segment_num];
01088 
01089     if (segp == NULL)
01090         hash_corrupted(hashp);
01091 
01092     prevBucketPtr = &segp[segment_ndx];
01093     currBucket = *prevBucketPtr;
01094 
01095     /*
01096      * Follow collision chain looking for matching key
01097      */
01098     match = hashp->match;       /* save one fetch in inner loop */
01099     keysize = hashp->keysize;   /* ditto */
01100 
01101     while (currBucket != NULL)
01102     {
01103         if (currBucket->hashvalue == newhashvalue &&
01104             match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
01105             break;
01106         prevBucketPtr = &(currBucket->link);
01107         currBucket = *prevBucketPtr;
01108 #if HASH_STATISTICS
01109         hash_collisions++;
01110         hctl->collisions++;
01111 #endif
01112     }
01113 
01114     if (currBucket != NULL)
01115         return false;           /* collision with an existing entry */
01116 
01117     currBucket = existingElement;
01118 
01119     /*
01120      * If old and new hash values belong to the same bucket, we need not
01121      * change any chain links, and indeed should not since this simplistic
01122      * update will corrupt the list if currBucket is the last element.  (We
01123      * cannot fall out earlier, however, since we need to scan the bucket to
01124      * check for duplicate keys.)
01125      */
01126     if (bucket != newbucket)
01127     {
01128         /* OK to remove record from old hash bucket's chain. */
01129         *oldPrevPtr = currBucket->link;
01130 
01131         /* link into new hashbucket chain */
01132         *prevBucketPtr = currBucket;
01133         currBucket->link = NULL;
01134     }
01135 
01136     /* copy new key into record */
01137     currBucket->hashvalue = newhashvalue;
01138     hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
01139 
01140     /* rest of record is untouched */
01141 
01142     return true;
01143 }
01144 
01145 /*
01146  * create a new entry if possible
01147  */
01148 static HASHBUCKET
01149 get_hash_entry(HTAB *hashp)
01150 {
01151     /* use volatile pointer to prevent code rearrangement */
01152     volatile HASHHDR *hctlv = hashp->hctl;
01153     HASHBUCKET  newElement;
01154 
01155     for (;;)
01156     {
01157         /* if partitioned, must lock to touch nentries and freeList */
01158         if (IS_PARTITIONED(hctlv))
01159             SpinLockAcquire(&hctlv->mutex);
01160 
01161         /* try to get an entry from the freelist */
01162         newElement = hctlv->freeList;
01163         if (newElement != NULL)
01164             break;
01165 
01166         /* no free elements.  allocate another chunk of buckets */
01167         if (IS_PARTITIONED(hctlv))
01168             SpinLockRelease(&hctlv->mutex);
01169 
01170         if (!element_alloc(hashp, hctlv->nelem_alloc))
01171         {
01172             /* out of memory */
01173             return NULL;
01174         }
01175     }
01176 
01177     /* remove entry from freelist, bump nentries */
01178     hctlv->freeList = newElement->link;
01179     hctlv->nentries++;
01180 
01181     if (IS_PARTITIONED(hctlv))
01182         SpinLockRelease(&hctlv->mutex);
01183 
01184     return newElement;
01185 }
01186 
01187 /*
01188  * hash_get_num_entries -- get the number of entries in a hashtable
01189  */
01190 long
01191 hash_get_num_entries(HTAB *hashp)
01192 {
01193     /*
01194      * We currently don't bother with the mutex; it's only sensible to call
01195      * this function if you've got lock on all partitions of the table.
01196      */
01197     return hashp->hctl->nentries;
01198 }
01199 
01200 /*
01201  * hash_seq_init/_search/_term
01202  *          Sequentially search through hash table and return
01203  *          all the elements one by one, return NULL when no more.
01204  *
01205  * hash_seq_term should be called if and only if the scan is abandoned before
01206  * completion; if hash_seq_search returns NULL then it has already done the
01207  * end-of-scan cleanup.
01208  *
01209  * NOTE: caller may delete the returned element before continuing the scan.
01210  * However, deleting any other element while the scan is in progress is
01211  * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
01212  * if elements are added to the table while the scan is in progress, it is
01213  * unspecified whether they will be visited by the scan or not.
01214  *
01215  * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
01216  * worry about hash_seq_term cleanup, if the hashtable is first locked against
01217  * further insertions by calling hash_freeze.  This is used by nodeAgg.c,
01218  * wherein it is inconvenient to track whether a scan is still open, and
01219  * there's no possibility of further insertions after readout has begun.
01220  *
01221  * NOTE: to use this with a partitioned hashtable, caller had better hold
01222  * at least shared lock on all partitions of the table throughout the scan!
01223  * We can cope with insertions or deletions by our own backend, but *not*
01224  * with concurrent insertions or deletions by another.
01225  */
01226 void
01227 hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
01228 {
01229     status->hashp = hashp;
01230     status->curBucket = 0;
01231     status->curEntry = NULL;
01232     if (!hashp->frozen)
01233         register_seq_scan(hashp);
01234 }
01235 
01236 void *
01237 hash_seq_search(HASH_SEQ_STATUS *status)
01238 {
01239     HTAB       *hashp;
01240     HASHHDR    *hctl;
01241     uint32      max_bucket;
01242     long        ssize;
01243     long        segment_num;
01244     long        segment_ndx;
01245     HASHSEGMENT segp;
01246     uint32      curBucket;
01247     HASHELEMENT *curElem;
01248 
01249     if ((curElem = status->curEntry) != NULL)
01250     {
01251         /* Continuing scan of curBucket... */
01252         status->curEntry = curElem->link;
01253         if (status->curEntry == NULL)   /* end of this bucket */
01254             ++status->curBucket;
01255         return (void *) ELEMENTKEY(curElem);
01256     }
01257 
01258     /*
01259      * Search for next nonempty bucket starting at curBucket.
01260      */
01261     curBucket = status->curBucket;
01262     hashp = status->hashp;
01263     hctl = hashp->hctl;
01264     ssize = hashp->ssize;
01265     max_bucket = hctl->max_bucket;
01266 
01267     if (curBucket > max_bucket)
01268     {
01269         hash_seq_term(status);
01270         return NULL;            /* search is done */
01271     }
01272 
01273     /*
01274      * first find the right segment in the table directory.
01275      */
01276     segment_num = curBucket >> hashp->sshift;
01277     segment_ndx = MOD(curBucket, ssize);
01278 
01279     segp = hashp->dir[segment_num];
01280 
01281     /*
01282      * Pick up the first item in this bucket's chain.  If chain is not empty
01283      * we can begin searching it.  Otherwise we have to advance to find the
01284      * next nonempty bucket.  We try to optimize that case since searching a
01285      * near-empty hashtable has to iterate this loop a lot.
01286      */
01287     while ((curElem = segp[segment_ndx]) == NULL)
01288     {
01289         /* empty bucket, advance to next */
01290         if (++curBucket > max_bucket)
01291         {
01292             status->curBucket = curBucket;
01293             hash_seq_term(status);
01294             return NULL;        /* search is done */
01295         }
01296         if (++segment_ndx >= ssize)
01297         {
01298             segment_num++;
01299             segment_ndx = 0;
01300             segp = hashp->dir[segment_num];
01301         }
01302     }
01303 
01304     /* Begin scan of curBucket... */
01305     status->curEntry = curElem->link;
01306     if (status->curEntry == NULL)       /* end of this bucket */
01307         ++curBucket;
01308     status->curBucket = curBucket;
01309     return (void *) ELEMENTKEY(curElem);
01310 }
01311 
01312 void
01313 hash_seq_term(HASH_SEQ_STATUS *status)
01314 {
01315     if (!status->hashp->frozen)
01316         deregister_seq_scan(status->hashp);
01317 }
01318 
01319 /*
01320  * hash_freeze
01321  *          Freeze a hashtable against future insertions (deletions are
01322  *          still allowed)
01323  *
01324  * The reason for doing this is that by preventing any more bucket splits,
01325  * we no longer need to worry about registering hash_seq_search scans,
01326  * and thus caller need not be careful about ensuring hash_seq_term gets
01327  * called at the right times.
01328  *
01329  * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
01330  * with active scans (since hash_seq_term would then do the wrong thing).
01331  */
01332 void
01333 hash_freeze(HTAB *hashp)
01334 {
01335     if (hashp->isshared)
01336         elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
01337     if (!hashp->frozen && has_seq_scans(hashp))
01338         elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
01339              hashp->tabname);
01340     hashp->frozen = true;
01341 }
01342 
01343 
01344 /********************************* UTILITIES ************************/
01345 
01346 /*
01347  * Expand the table by adding one more hash bucket.
01348  */
01349 static bool
01350 expand_table(HTAB *hashp)
01351 {
01352     HASHHDR    *hctl = hashp->hctl;
01353     HASHSEGMENT old_seg,
01354                 new_seg;
01355     long        old_bucket,
01356                 new_bucket;
01357     long        new_segnum,
01358                 new_segndx;
01359     long        old_segnum,
01360                 old_segndx;
01361     HASHBUCKET *oldlink,
01362                *newlink;
01363     HASHBUCKET  currElement,
01364                 nextElement;
01365 
01366     Assert(!IS_PARTITIONED(hctl));
01367 
01368 #ifdef HASH_STATISTICS
01369     hash_expansions++;
01370 #endif
01371 
01372     new_bucket = hctl->max_bucket + 1;
01373     new_segnum = new_bucket >> hashp->sshift;
01374     new_segndx = MOD(new_bucket, hashp->ssize);
01375 
01376     if (new_segnum >= hctl->nsegs)
01377     {
01378         /* Allocate new segment if necessary -- could fail if dir full */
01379         if (new_segnum >= hctl->dsize)
01380             if (!dir_realloc(hashp))
01381                 return false;
01382         if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
01383             return false;
01384         hctl->nsegs++;
01385     }
01386 
01387     /* OK, we created a new bucket */
01388     hctl->max_bucket++;
01389 
01390     /*
01391      * *Before* changing masks, find old bucket corresponding to same hash
01392      * values; values in that bucket may need to be relocated to new bucket.
01393      * Note that new_bucket is certainly larger than low_mask at this point,
01394      * so we can skip the first step of the regular hash mask calc.
01395      */
01396     old_bucket = (new_bucket & hctl->low_mask);
01397 
01398     /*
01399      * If we crossed a power of 2, readjust masks.
01400      */
01401     if ((uint32) new_bucket > hctl->high_mask)
01402     {
01403         hctl->low_mask = hctl->high_mask;
01404         hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
01405     }
01406 
01407     /*
01408      * Relocate records to the new bucket.  NOTE: because of the way the hash
01409      * masking is done in calc_bucket, only one old bucket can need to be
01410      * split at this point.  With a different way of reducing the hash value,
01411      * that might not be true!
01412      */
01413     old_segnum = old_bucket >> hashp->sshift;
01414     old_segndx = MOD(old_bucket, hashp->ssize);
01415 
01416     old_seg = hashp->dir[old_segnum];
01417     new_seg = hashp->dir[new_segnum];
01418 
01419     oldlink = &old_seg[old_segndx];
01420     newlink = &new_seg[new_segndx];
01421 
01422     for (currElement = *oldlink;
01423          currElement != NULL;
01424          currElement = nextElement)
01425     {
01426         nextElement = currElement->link;
01427         if ((long) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
01428         {
01429             *oldlink = currElement;
01430             oldlink = &currElement->link;
01431         }
01432         else
01433         {
01434             *newlink = currElement;
01435             newlink = &currElement->link;
01436         }
01437     }
01438     /* don't forget to terminate the rebuilt hash chains... */
01439     *oldlink = NULL;
01440     *newlink = NULL;
01441 
01442     return true;
01443 }
01444 
01445 
01446 static bool
01447 dir_realloc(HTAB *hashp)
01448 {
01449     HASHSEGMENT *p;
01450     HASHSEGMENT *old_p;
01451     long        new_dsize;
01452     long        old_dirsize;
01453     long        new_dirsize;
01454 
01455     if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
01456         return false;
01457 
01458     /* Reallocate directory */
01459     new_dsize = hashp->hctl->dsize << 1;
01460     old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
01461     new_dirsize = new_dsize * sizeof(HASHSEGMENT);
01462 
01463     old_p = hashp->dir;
01464     CurrentDynaHashCxt = hashp->hcxt;
01465     p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize);
01466 
01467     if (p != NULL)
01468     {
01469         memcpy(p, old_p, old_dirsize);
01470         MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
01471         hashp->dir = p;
01472         hashp->hctl->dsize = new_dsize;
01473 
01474         /* XXX assume the allocator is palloc, so we know how to free */
01475         Assert(hashp->alloc == DynaHashAlloc);
01476         pfree(old_p);
01477 
01478         return true;
01479     }
01480 
01481     return false;
01482 }
01483 
01484 
01485 static HASHSEGMENT
01486 seg_alloc(HTAB *hashp)
01487 {
01488     HASHSEGMENT segp;
01489 
01490     CurrentDynaHashCxt = hashp->hcxt;
01491     segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
01492 
01493     if (!segp)
01494         return NULL;
01495 
01496     MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
01497 
01498     return segp;
01499 }
01500 
01501 /*
01502  * allocate some new elements and link them into the free list
01503  */
01504 static bool
01505 element_alloc(HTAB *hashp, int nelem)
01506 {
01507     /* use volatile pointer to prevent code rearrangement */
01508     volatile HASHHDR *hctlv = hashp->hctl;
01509     Size        elementSize;
01510     HASHELEMENT *firstElement;
01511     HASHELEMENT *tmpElement;
01512     HASHELEMENT *prevElement;
01513     int         i;
01514 
01515     if (hashp->isfixed)
01516         return false;
01517 
01518     /* Each element has a HASHELEMENT header plus user data. */
01519     elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize);
01520 
01521     CurrentDynaHashCxt = hashp->hcxt;
01522     firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
01523 
01524     if (!firstElement)
01525         return false;
01526 
01527     /* prepare to link all the new entries into the freelist */
01528     prevElement = NULL;
01529     tmpElement = firstElement;
01530     for (i = 0; i < nelem; i++)
01531     {
01532         tmpElement->link = prevElement;
01533         prevElement = tmpElement;
01534         tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
01535     }
01536 
01537     /* if partitioned, must lock to touch freeList */
01538     if (IS_PARTITIONED(hctlv))
01539         SpinLockAcquire(&hctlv->mutex);
01540 
01541     /* freelist could be nonempty if two backends did this concurrently */
01542     firstElement->link = hctlv->freeList;
01543     hctlv->freeList = prevElement;
01544 
01545     if (IS_PARTITIONED(hctlv))
01546         SpinLockRelease(&hctlv->mutex);
01547 
01548     return true;
01549 }
01550 
01551 /* complain when we have detected a corrupted hashtable */
01552 static void
01553 hash_corrupted(HTAB *hashp)
01554 {
01555     /*
01556      * If the corruption is in a shared hashtable, we'd better force a
01557      * systemwide restart.  Otherwise, just shut down this one backend.
01558      */
01559     if (hashp->isshared)
01560         elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
01561     else
01562         elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
01563 }
01564 
01565 /* calculate ceil(log base 2) of num */
01566 int
01567 my_log2(long num)
01568 {
01569     int         i;
01570     long        limit;
01571 
01572     /* guard against too-large input, which would put us into infinite loop */
01573     if (num > LONG_MAX / 2)
01574         num = LONG_MAX / 2;
01575 
01576     for (i = 0, limit = 1; limit < num; i++, limit <<= 1)
01577         ;
01578     return i;
01579 }
01580 
01581 /* calculate first power of 2 >= num, bounded to what will fit in a long */
01582 static long
01583 next_pow2_long(long num)
01584 {
01585     /* my_log2's internal range check is sufficient */
01586     return 1L << my_log2(num);
01587 }
01588 
01589 /* calculate first power of 2 >= num, bounded to what will fit in an int */
01590 static int
01591 next_pow2_int(long num)
01592 {
01593     if (num > INT_MAX / 2)
01594         num = INT_MAX / 2;
01595     return 1 << my_log2(num);
01596 }
01597 
01598 
01599 /************************* SEQ SCAN TRACKING ************************/
01600 
01601 /*
01602  * We track active hash_seq_search scans here.  The need for this mechanism
01603  * comes from the fact that a scan will get confused if a bucket split occurs
01604  * while it's in progress: it might visit entries twice, or even miss some
01605  * entirely (if it's partway through the same bucket that splits).  Hence
01606  * we want to inhibit bucket splits if there are any active scans on the
01607  * table being inserted into.  This is a fairly rare case in current usage,
01608  * so just postponing the split until the next insertion seems sufficient.
01609  *
01610  * Given present usages of the function, only a few scans are likely to be
01611  * open concurrently; so a finite-size stack of open scans seems sufficient,
01612  * and we don't worry that linear search is too slow.  Note that we do
01613  * allow multiple scans of the same hashtable to be open concurrently.
01614  *
01615  * This mechanism can support concurrent scan and insertion in a shared
01616  * hashtable if it's the same backend doing both.  It would fail otherwise,
01617  * but locking reasons seem to preclude any such scenario anyway, so we don't
01618  * worry.
01619  *
01620  * This arrangement is reasonably robust if a transient hashtable is deleted
01621  * without notifying us.  The absolute worst case is we might inhibit splits
01622  * in another table created later at exactly the same address.  We will give
01623  * a warning at transaction end for reference leaks, so any bugs leading to
01624  * lack of notification should be easy to catch.
01625  */
01626 
01627 #define MAX_SEQ_SCANS 100
01628 
01629 static HTAB *seq_scan_tables[MAX_SEQ_SCANS];    /* tables being scanned */
01630 static int  seq_scan_level[MAX_SEQ_SCANS];      /* subtransaction nest level */
01631 static int  num_seq_scans = 0;
01632 
01633 
01634 /* Register a table as having an active hash_seq_search scan */
01635 static void
01636 register_seq_scan(HTAB *hashp)
01637 {
01638     if (num_seq_scans >= MAX_SEQ_SCANS)
01639         elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
01640              hashp->tabname);
01641     seq_scan_tables[num_seq_scans] = hashp;
01642     seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel();
01643     num_seq_scans++;
01644 }
01645 
01646 /* Deregister an active scan */
01647 static void
01648 deregister_seq_scan(HTAB *hashp)
01649 {
01650     int         i;
01651 
01652     /* Search backward since it's most likely at the stack top */
01653     for (i = num_seq_scans - 1; i >= 0; i--)
01654     {
01655         if (seq_scan_tables[i] == hashp)
01656         {
01657             seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
01658             seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
01659             num_seq_scans--;
01660             return;
01661         }
01662     }
01663     elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
01664          hashp->tabname);
01665 }
01666 
01667 /* Check if a table has any active scan */
01668 static bool
01669 has_seq_scans(HTAB *hashp)
01670 {
01671     int         i;
01672 
01673     for (i = 0; i < num_seq_scans; i++)
01674     {
01675         if (seq_scan_tables[i] == hashp)
01676             return true;
01677     }
01678     return false;
01679 }
01680 
01681 /* Clean up any open scans at end of transaction */
01682 void
01683 AtEOXact_HashTables(bool isCommit)
01684 {
01685     /*
01686      * During abort cleanup, open scans are expected; just silently clean 'em
01687      * out.  An open scan at commit means someone forgot a hash_seq_term()
01688      * call, so complain.
01689      *
01690      * Note: it's tempting to try to print the tabname here, but refrain for
01691      * fear of touching deallocated memory.  This isn't a user-facing message
01692      * anyway, so it needn't be pretty.
01693      */
01694     if (isCommit)
01695     {
01696         int         i;
01697 
01698         for (i = 0; i < num_seq_scans; i++)
01699         {
01700             elog(WARNING, "leaked hash_seq_search scan for hash table %p",
01701                  seq_scan_tables[i]);
01702         }
01703     }
01704     num_seq_scans = 0;
01705 }
01706 
01707 /* Clean up any open scans at end of subtransaction */
01708 void
01709 AtEOSubXact_HashTables(bool isCommit, int nestDepth)
01710 {
01711     int         i;
01712 
01713     /*
01714      * Search backward to make cleanup easy.  Note we must check all entries,
01715      * not only those at the end of the array, because deletion technique
01716      * doesn't keep them in order.
01717      */
01718     for (i = num_seq_scans - 1; i >= 0; i--)
01719     {
01720         if (seq_scan_level[i] >= nestDepth)
01721         {
01722             if (isCommit)
01723                 elog(WARNING, "leaked hash_seq_search scan for hash table %p",
01724                      seq_scan_tables[i]);
01725             seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
01726             seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
01727             num_seq_scans--;
01728         }
01729     }
01730 }