Header And Logo

PostgreSQL
| The world's most advanced open source database.

hashfunc.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * hashfunc.c
00004  *    Support functions for hash access method.
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  * Portions Copyright (c) 1994, Regents of the University of California
00008  *
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/access/hash/hashfunc.c
00012  *
00013  * NOTES
00014  *    These functions are stored in pg_amproc.  For each operator class
00015  *    defined for hash indexes, they compute the hash value of the argument.
00016  *
00017  *    Additional hash functions appear in /utils/adt/ files for various
00018  *    specialized datatypes.
00019  *
00020  *    It is expected that every bit of a hash function's 32-bit result is
00021  *    as random as every other; failure to ensure this is likely to lead
00022  *    to poor performance of hash joins, for example.  In most cases a hash
00023  *    function should use hash_any() or its variant hash_uint32().
00024  *-------------------------------------------------------------------------
00025  */
00026 
#include "postgres.h"

#include <math.h>

#include "access/hash.h"
00030 
00031 
00032 /* Note: this is used for both "char" and boolean datatypes */
00033 Datum
00034 hashchar(PG_FUNCTION_ARGS)
00035 {
00036     return hash_uint32((int32) PG_GETARG_CHAR(0));
00037 }
00038 
00039 Datum
00040 hashint2(PG_FUNCTION_ARGS)
00041 {
00042     return hash_uint32((int32) PG_GETARG_INT16(0));
00043 }
00044 
00045 Datum
00046 hashint4(PG_FUNCTION_ARGS)
00047 {
00048     return hash_uint32(PG_GETARG_INT32(0));
00049 }
00050 
00051 Datum
00052 hashint8(PG_FUNCTION_ARGS)
00053 {
00054     /*
00055      * The idea here is to produce a hash value compatible with the values
00056      * produced by hashint4 and hashint2 for logically equal inputs; this is
00057      * necessary to support cross-type hash joins across these input types.
00058      * Since all three types are signed, we can xor the high half of the int8
00059      * value if the sign is positive, or the complement of the high half when
00060      * the sign is negative.
00061      */
00062     int64       val = PG_GETARG_INT64(0);
00063     uint32      lohalf = (uint32) val;
00064     uint32      hihalf = (uint32) (val >> 32);
00065 
00066     lohalf ^= (val >= 0) ? hihalf : ~hihalf;
00067 
00068     return hash_uint32(lohalf);
00069 }
00070 
00071 Datum
00072 hashoid(PG_FUNCTION_ARGS)
00073 {
00074     return hash_uint32((uint32) PG_GETARG_OID(0));
00075 }
00076 
00077 Datum
00078 hashenum(PG_FUNCTION_ARGS)
00079 {
00080     return hash_uint32((uint32) PG_GETARG_OID(0));
00081 }
00082 
00083 Datum
00084 hashfloat4(PG_FUNCTION_ARGS)
00085 {
00086     float4      key = PG_GETARG_FLOAT4(0);
00087     float8      key8;
00088 
00089     /*
00090      * On IEEE-float machines, minus zero and zero have different bit patterns
00091      * but should compare as equal.  We must ensure that they have the same
00092      * hash value, which is most reliably done this way:
00093      */
00094     if (key == (float4) 0)
00095         PG_RETURN_UINT32(0);
00096 
00097     /*
00098      * To support cross-type hashing of float8 and float4, we want to return
00099      * the same hash value hashfloat8 would produce for an equal float8 value.
00100      * So, widen the value to float8 and hash that.  (We must do this rather
00101      * than have hashfloat8 try to narrow its value to float4; that could fail
00102      * on overflow.)
00103      */
00104     key8 = key;
00105 
00106     return hash_any((unsigned char *) &key8, sizeof(key8));
00107 }
00108 
00109 Datum
00110 hashfloat8(PG_FUNCTION_ARGS)
00111 {
00112     float8      key = PG_GETARG_FLOAT8(0);
00113 
00114     /*
00115      * On IEEE-float machines, minus zero and zero have different bit patterns
00116      * but should compare as equal.  We must ensure that they have the same
00117      * hash value, which is most reliably done this way:
00118      */
00119     if (key == (float8) 0)
00120         PG_RETURN_UINT32(0);
00121 
00122     return hash_any((unsigned char *) &key, sizeof(key));
00123 }
00124 
00125 Datum
00126 hashoidvector(PG_FUNCTION_ARGS)
00127 {
00128     oidvector  *key = (oidvector *) PG_GETARG_POINTER(0);
00129 
00130     return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid));
00131 }
00132 
00133 Datum
00134 hashint2vector(PG_FUNCTION_ARGS)
00135 {
00136     int2vector *key = (int2vector *) PG_GETARG_POINTER(0);
00137 
00138     return hash_any((unsigned char *) key->values, key->dim1 * sizeof(int16));
00139 }
00140 
00141 Datum
00142 hashname(PG_FUNCTION_ARGS)
00143 {
00144     char       *key = NameStr(*PG_GETARG_NAME(0));
00145     int         keylen = strlen(key);
00146 
00147     Assert(keylen < NAMEDATALEN);       /* else it's not truncated correctly */
00148 
00149     return hash_any((unsigned char *) key, keylen);
00150 }
00151 
00152 Datum
00153 hashtext(PG_FUNCTION_ARGS)
00154 {
00155     text       *key = PG_GETARG_TEXT_PP(0);
00156     Datum       result;
00157 
00158     /*
00159      * Note: this is currently identical in behavior to hashvarlena, but keep
00160      * it as a separate function in case we someday want to do something
00161      * different in non-C locales.  (See also hashbpchar, if so.)
00162      */
00163     result = hash_any((unsigned char *) VARDATA_ANY(key),
00164                       VARSIZE_ANY_EXHDR(key));
00165 
00166     /* Avoid leaking memory for toasted inputs */
00167     PG_FREE_IF_COPY(key, 0);
00168 
00169     return result;
00170 }
00171 
00172 /*
00173  * hashvarlena() can be used for any varlena datatype in which there are
00174  * no non-significant bits, ie, distinct bitpatterns never compare as equal.
00175  */
00176 Datum
00177 hashvarlena(PG_FUNCTION_ARGS)
00178 {
00179     struct varlena *key = PG_GETARG_VARLENA_PP(0);
00180     Datum       result;
00181 
00182     result = hash_any((unsigned char *) VARDATA_ANY(key),
00183                       VARSIZE_ANY_EXHDR(key));
00184 
00185     /* Avoid leaking memory for toasted inputs */
00186     PG_FREE_IF_COPY(key, 0);
00187 
00188     return result;
00189 }
00190 
00191 /*
00192  * This hash function was written by Bob Jenkins
00193  * (bob_jenkins@burtleburtle.net), and superficially adapted
00194  * for PostgreSQL by Neil Conway. For more information on this
00195  * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
00196  * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
00197  *
00198  * In the current code, we have adopted Bob's 2006 update of his hash
00199  * function to fetch the data a word at a time when it is suitably aligned.
00200  * This makes for a useful speedup, at the cost of having to maintain
00201  * four code paths (aligned vs unaligned, and little-endian vs big-endian).
00202  * It also uses two separate mixing functions mix() and final(), instead
00203  * of a slower multi-purpose function.
00204  */
00205 
00206 /* Get a bit mask of the bits set in non-uint32 aligned addresses */
00207 #define UINT32_ALIGN_MASK (sizeof(uint32) - 1)
00208 
00209 /* Rotate a uint32 value left by k bits - note multiple evaluation! */
00210 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
00211 
00212 /*----------
00213  * mix -- mix 3 32-bit values reversibly.
00214  *
00215  * This is reversible, so any information in (a,b,c) before mix() is
00216  * still in (a,b,c) after mix().
00217  *
00218  * If four pairs of (a,b,c) inputs are run through mix(), or through
00219  * mix() in reverse, there are at least 32 bits of the output that
00220  * are sometimes the same for one pair and different for another pair.
00221  * This was tested for:
00222  * * pairs that differed by one bit, by two bits, in any combination
00223  *   of top bits of (a,b,c), or in any combination of bottom bits of
00224  *   (a,b,c).
00225  * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
00226  *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
00227  *   is commonly produced by subtraction) look like a single 1-bit
00228  *   difference.
00229  * * the base values were pseudorandom, all zero but one bit set, or
00230  *   all zero plus a counter that starts at zero.
00231  *
00232  * This does not achieve avalanche.  There are input bits of (a,b,c)
00233  * that fail to affect some output bits of (a,b,c), especially of a.  The
00234  * most thoroughly mixed value is c, but it doesn't really even achieve
00235  * avalanche in c.
00236  *
00237  * This allows some parallelism.  Read-after-writes are good at doubling
00238  * the number of bits affected, so the goal of mixing pulls in the opposite
00239  * direction from the goal of parallelism.  I did what I could.  Rotates
00240  * seem to cost as much as shifts on every machine I could lay my hands on,
00241  * and rotates are much kinder to the top and bottom bits, so I used rotates.
00242  *----------
00243  */
00244 #define mix(a,b,c) \
00245 { \
00246   a -= c;  a ^= rot(c, 4);  c += b; \
00247   b -= a;  b ^= rot(a, 6);  a += c; \
00248   c -= b;  c ^= rot(b, 8);  b += a; \
00249   a -= c;  a ^= rot(c,16);  c += b; \
00250   b -= a;  b ^= rot(a,19);  a += c; \
00251   c -= b;  c ^= rot(b, 4);  b += a; \
00252 }
00253 
00254 /*----------
00255  * final -- final mixing of 3 32-bit values (a,b,c) into c
00256  *
00257  * Pairs of (a,b,c) values differing in only a few bits will usually
00258  * produce values of c that look totally different.  This was tested for
00259  * * pairs that differed by one bit, by two bits, in any combination
00260  *   of top bits of (a,b,c), or in any combination of bottom bits of
00261  *   (a,b,c).
00262  * * "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
00263  *   the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
00264  *   is commonly produced by subtraction) look like a single 1-bit
00265  *   difference.
00266  * * the base values were pseudorandom, all zero but one bit set, or
00267  *   all zero plus a counter that starts at zero.
00268  *
00269  * The use of separate functions for mix() and final() allow for a
00270  * substantial performance increase since final() does not need to
00271  * do well in reverse, but is does need to affect all output bits.
00272  * mix(), on the other hand, does not need to affect all output
00273  * bits (affecting 32 bits is enough).  The original hash function had
00274  * a single mixing operation that had to satisfy both sets of requirements
00275  * and was slower as a result.
00276  *----------
00277  */
00278 #define final(a,b,c) \
00279 { \
00280   c ^= b; c -= rot(b,14); \
00281   a ^= c; a -= rot(c,11); \
00282   b ^= a; b -= rot(a,25); \
00283   c ^= b; c -= rot(b,16); \
00284   a ^= c; a -= rot(c, 4); \
00285   b ^= a; b -= rot(a,14); \
00286   c ^= b; c -= rot(b,24); \
00287 }
00288 
00289 /*
00290  * hash_any() -- hash a variable-length key into a 32-bit value
00291  *      k       : the key (the unaligned variable-length array of bytes)
00292  *      len     : the length of the key, counting by bytes
00293  *
00294  * Returns a uint32 value.  Every bit of the key affects every bit of
00295  * the return value.  Every 1-bit and 2-bit delta achieves avalanche.
00296  * About 6*len+35 instructions. The best hash table sizes are powers
00297  * of 2.  There is no need to do mod a prime (mod is sooo slow!).
00298  * If you need less than 32 bits, use a bitmask.
00299  *
00300  * Note: we could easily change this function to return a 64-bit hash value
00301  * by using the final values of both b and c.  b is perhaps a little less
00302  * well mixed than c, however.
00303  */
00304 Datum
00305 hash_any(register const unsigned char *k, register int keylen)
00306 {
00307     register uint32 a,
00308                 b,
00309                 c,
00310                 len;
00311 
00312     /* Set up the internal state */
00313     len = keylen;
00314     a = b = c = 0x9e3779b9 + len + 3923095;
00315 
00316     /* If the source pointer is word-aligned, we use word-wide fetches */
00317     if (((intptr_t) k & UINT32_ALIGN_MASK) == 0)
00318     {
00319         /* Code path for aligned source data */
00320         register const uint32 *ka = (const uint32 *) k;
00321 
00322         /* handle most of the key */
00323         while (len >= 12)
00324         {
00325             a += ka[0];
00326             b += ka[1];
00327             c += ka[2];
00328             mix(a, b, c);
00329             ka += 3;
00330             len -= 12;
00331         }
00332 
00333         /* handle the last 11 bytes */
00334         k = (const unsigned char *) ka;
00335 #ifdef WORDS_BIGENDIAN
00336         switch (len)
00337         {
00338             case 11:
00339                 c += ((uint32) k[10] << 8);
00340                 /* fall through */
00341             case 10:
00342                 c += ((uint32) k[9] << 16);
00343                 /* fall through */
00344             case 9:
00345                 c += ((uint32) k[8] << 24);
00346                 /* the lowest byte of c is reserved for the length */
00347                 /* fall through */
00348             case 8:
00349                 b += ka[1];
00350                 a += ka[0];
00351                 break;
00352             case 7:
00353                 b += ((uint32) k[6] << 8);
00354                 /* fall through */
00355             case 6:
00356                 b += ((uint32) k[5] << 16);
00357                 /* fall through */
00358             case 5:
00359                 b += ((uint32) k[4] << 24);
00360                 /* fall through */
00361             case 4:
00362                 a += ka[0];
00363                 break;
00364             case 3:
00365                 a += ((uint32) k[2] << 8);
00366                 /* fall through */
00367             case 2:
00368                 a += ((uint32) k[1] << 16);
00369                 /* fall through */
00370             case 1:
00371                 a += ((uint32) k[0] << 24);
00372                 /* case 0: nothing left to add */
00373         }
00374 #else                           /* !WORDS_BIGENDIAN */
00375         switch (len)
00376         {
00377             case 11:
00378                 c += ((uint32) k[10] << 24);
00379                 /* fall through */
00380             case 10:
00381                 c += ((uint32) k[9] << 16);
00382                 /* fall through */
00383             case 9:
00384                 c += ((uint32) k[8] << 8);
00385                 /* the lowest byte of c is reserved for the length */
00386                 /* fall through */
00387             case 8:
00388                 b += ka[1];
00389                 a += ka[0];
00390                 break;
00391             case 7:
00392                 b += ((uint32) k[6] << 16);
00393                 /* fall through */
00394             case 6:
00395                 b += ((uint32) k[5] << 8);
00396                 /* fall through */
00397             case 5:
00398                 b += k[4];
00399                 /* fall through */
00400             case 4:
00401                 a += ka[0];
00402                 break;
00403             case 3:
00404                 a += ((uint32) k[2] << 16);
00405                 /* fall through */
00406             case 2:
00407                 a += ((uint32) k[1] << 8);
00408                 /* fall through */
00409             case 1:
00410                 a += k[0];
00411                 /* case 0: nothing left to add */
00412         }
00413 #endif   /* WORDS_BIGENDIAN */
00414     }
00415     else
00416     {
00417         /* Code path for non-aligned source data */
00418 
00419         /* handle most of the key */
00420         while (len >= 12)
00421         {
00422 #ifdef WORDS_BIGENDIAN
00423             a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
00424             b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
00425             c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
00426 #else                           /* !WORDS_BIGENDIAN */
00427             a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
00428             b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
00429             c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
00430 #endif   /* WORDS_BIGENDIAN */
00431             mix(a, b, c);
00432             k += 12;
00433             len -= 12;
00434         }
00435 
00436         /* handle the last 11 bytes */
00437 #ifdef WORDS_BIGENDIAN
00438         switch (len)            /* all the case statements fall through */
00439         {
00440             case 11:
00441                 c += ((uint32) k[10] << 8);
00442             case 10:
00443                 c += ((uint32) k[9] << 16);
00444             case 9:
00445                 c += ((uint32) k[8] << 24);
00446                 /* the lowest byte of c is reserved for the length */
00447             case 8:
00448                 b += k[7];
00449             case 7:
00450                 b += ((uint32) k[6] << 8);
00451             case 6:
00452                 b += ((uint32) k[5] << 16);
00453             case 5:
00454                 b += ((uint32) k[4] << 24);
00455             case 4:
00456                 a += k[3];
00457             case 3:
00458                 a += ((uint32) k[2] << 8);
00459             case 2:
00460                 a += ((uint32) k[1] << 16);
00461             case 1:
00462                 a += ((uint32) k[0] << 24);
00463                 /* case 0: nothing left to add */
00464         }
00465 #else                           /* !WORDS_BIGENDIAN */
00466         switch (len)            /* all the case statements fall through */
00467         {
00468             case 11:
00469                 c += ((uint32) k[10] << 24);
00470             case 10:
00471                 c += ((uint32) k[9] << 16);
00472             case 9:
00473                 c += ((uint32) k[8] << 8);
00474                 /* the lowest byte of c is reserved for the length */
00475             case 8:
00476                 b += ((uint32) k[7] << 24);
00477             case 7:
00478                 b += ((uint32) k[6] << 16);
00479             case 6:
00480                 b += ((uint32) k[5] << 8);
00481             case 5:
00482                 b += k[4];
00483             case 4:
00484                 a += ((uint32) k[3] << 24);
00485             case 3:
00486                 a += ((uint32) k[2] << 16);
00487             case 2:
00488                 a += ((uint32) k[1] << 8);
00489             case 1:
00490                 a += k[0];
00491                 /* case 0: nothing left to add */
00492         }
00493 #endif   /* WORDS_BIGENDIAN */
00494     }
00495 
00496     final(a, b, c);
00497 
00498     /* report the result */
00499     return UInt32GetDatum(c);
00500 }
00501 
00502 /*
00503  * hash_uint32() -- hash a 32-bit value
00504  *
00505  * This has the same result as
00506  *      hash_any(&k, sizeof(uint32))
00507  * but is faster and doesn't force the caller to store k into memory.
00508  */
00509 Datum
00510 hash_uint32(uint32 k)
00511 {
00512     register uint32 a,
00513                 b,
00514                 c;
00515 
00516     a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
00517     a += k;
00518 
00519     final(a, b, c);
00520 
00521     /* report the result */
00522     return UInt32GetDatum(c);
00523 }