Header And Logo

PostgreSQL
| The world's most advanced open source database.

trgm_gist.c

Go to the documentation of this file.
00001 /*
00002  * contrib/pg_trgm/trgm_gist.c
00003  */
00004 #include "postgres.h"
00005 
00006 #include "trgm.h"
00007 
00008 #include "access/skey.h"
00009 
00010 
00011 typedef struct
00012 {
00013     /* most recent inputs to gtrgm_consistent */
00014     StrategyNumber strategy;
00015     text       *query;
00016     /* extracted trigrams for query */
00017     TRGM       *trigrams;
00018     /* if a regex operator, the extracted graph */
00019     TrgmPackedGraph *graph;
00020 
00021     /*
00022      * The "query" and "trigrams" are stored in the same palloc block as this
00023      * cache struct, at MAXALIGN'ed offsets.  The graph however isn't.
00024      */
00025 } gtrgm_consistent_cache;
00026 
00027 #define GETENTRY(vec,pos) ((TRGM *) DatumGetPointer((vec)->vector[(pos)].key))
00028 
00029 
00030 PG_FUNCTION_INFO_V1(gtrgm_in);
00031 Datum       gtrgm_in(PG_FUNCTION_ARGS);
00032 
00033 PG_FUNCTION_INFO_V1(gtrgm_out);
00034 Datum       gtrgm_out(PG_FUNCTION_ARGS);
00035 
00036 PG_FUNCTION_INFO_V1(gtrgm_compress);
00037 Datum       gtrgm_compress(PG_FUNCTION_ARGS);
00038 
00039 PG_FUNCTION_INFO_V1(gtrgm_decompress);
00040 Datum       gtrgm_decompress(PG_FUNCTION_ARGS);
00041 
00042 PG_FUNCTION_INFO_V1(gtrgm_consistent);
00043 Datum       gtrgm_consistent(PG_FUNCTION_ARGS);
00044 
00045 PG_FUNCTION_INFO_V1(gtrgm_distance);
00046 Datum       gtrgm_distance(PG_FUNCTION_ARGS);
00047 
00048 PG_FUNCTION_INFO_V1(gtrgm_union);
00049 Datum       gtrgm_union(PG_FUNCTION_ARGS);
00050 
00051 PG_FUNCTION_INFO_V1(gtrgm_same);
00052 Datum       gtrgm_same(PG_FUNCTION_ARGS);
00053 
00054 PG_FUNCTION_INFO_V1(gtrgm_penalty);
00055 Datum       gtrgm_penalty(PG_FUNCTION_ARGS);
00056 
00057 PG_FUNCTION_INFO_V1(gtrgm_picksplit);
00058 Datum       gtrgm_picksplit(PG_FUNCTION_ARGS);
00059 
00060 /* Number of one-bits in an unsigned byte */
00061 static const uint8 number_of_ones[256] = {
00062     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
00063     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
00064     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
00065     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00066     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
00067     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00068     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00069     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
00070     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
00071     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00072     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00073     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
00074     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
00075     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
00076     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
00077     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
00078 };
00079 
00080 
00081 Datum
00082 gtrgm_in(PG_FUNCTION_ARGS)
00083 {
00084     elog(ERROR, "not implemented");
00085     PG_RETURN_DATUM(0);
00086 }
00087 
00088 Datum
00089 gtrgm_out(PG_FUNCTION_ARGS)
00090 {
00091     elog(ERROR, "not implemented");
00092     PG_RETURN_DATUM(0);
00093 }
00094 
00095 static void
00096 makesign(BITVECP sign, TRGM *a)
00097 {
00098     int32       k,
00099                 len = ARRNELEM(a);
00100     trgm       *ptr = GETARR(a);
00101     int32       tmp = 0;
00102 
00103     MemSet((void *) sign, 0, sizeof(BITVEC));
00104     SETBIT(sign, SIGLENBIT);    /* set last unused bit */
00105     for (k = 0; k < len; k++)
00106     {
00107         CPTRGM(((char *) &tmp), ptr + k);
00108         HASH(sign, tmp);
00109     }
00110 }
00111 
00112 Datum
00113 gtrgm_compress(PG_FUNCTION_ARGS)
00114 {
00115     GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
00116     GISTENTRY  *retval = entry;
00117 
00118     if (entry->leafkey)
00119     {                           /* trgm */
00120         TRGM       *res;
00121         text       *val = DatumGetTextP(entry->key);
00122 
00123         res = generate_trgm(VARDATA(val), VARSIZE(val) - VARHDRSZ);
00124         retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
00125         gistentryinit(*retval, PointerGetDatum(res),
00126                       entry->rel, entry->page,
00127                       entry->offset, FALSE);
00128     }
00129     else if (ISSIGNKEY(DatumGetPointer(entry->key)) &&
00130              !ISALLTRUE(DatumGetPointer(entry->key)))
00131     {
00132         int32       i,
00133                     len;
00134         TRGM       *res;
00135         BITVECP     sign = GETSIGN(DatumGetPointer(entry->key));
00136 
00137         LOOPBYTE
00138         {
00139             if ((sign[i] & 0xff) != 0xff)
00140                 PG_RETURN_POINTER(retval);
00141         }
00142 
00143         len = CALCGTSIZE(SIGNKEY | ALLISTRUE, 0);
00144         res = (TRGM *) palloc(len);
00145         SET_VARSIZE(res, len);
00146         res->flag = SIGNKEY | ALLISTRUE;
00147 
00148         retval = (GISTENTRY *) palloc(sizeof(GISTENTRY));
00149         gistentryinit(*retval, PointerGetDatum(res),
00150                       entry->rel, entry->page,
00151                       entry->offset, FALSE);
00152     }
00153     PG_RETURN_POINTER(retval);
00154 }
00155 
00156 Datum
00157 gtrgm_decompress(PG_FUNCTION_ARGS)
00158 {
00159     GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
00160     GISTENTRY  *retval;
00161     text       *key;
00162 
00163     key = DatumGetTextP(entry->key);
00164 
00165     if (key != (text *) DatumGetPointer(entry->key))
00166     {
00167         /* need to pass back the decompressed item */
00168         retval = palloc(sizeof(GISTENTRY));
00169         gistentryinit(*retval, PointerGetDatum(key),
00170                       entry->rel, entry->page, entry->offset, entry->leafkey);
00171         PG_RETURN_POINTER(retval);
00172     }
00173     else
00174     {
00175         /* we can return the entry as-is */
00176         PG_RETURN_POINTER(entry);
00177     }
00178 }
00179 
00180 static int32
00181 cnt_sml_sign_common(TRGM *qtrg, BITVECP sign)
00182 {
00183     int32       count = 0;
00184     int32       k,
00185                 len = ARRNELEM(qtrg);
00186     trgm       *ptr = GETARR(qtrg);
00187     int32       tmp = 0;
00188 
00189     for (k = 0; k < len; k++)
00190     {
00191         CPTRGM(((char *) &tmp), ptr + k);
00192         count += GETBIT(sign, HASHVAL(tmp));
00193     }
00194 
00195     return count;
00196 }
00197 
00198 Datum
00199 gtrgm_consistent(PG_FUNCTION_ARGS)
00200 {
00201     GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
00202     text       *query = PG_GETARG_TEXT_P(1);
00203     StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
00204 
00205     /* Oid      subtype = PG_GETARG_OID(3); */
00206     bool       *recheck = (bool *) PG_GETARG_POINTER(4);
00207     TRGM       *key = (TRGM *) DatumGetPointer(entry->key);
00208     TRGM       *qtrg;
00209     bool        res;
00210     Size        querysize = VARSIZE(query);
00211     gtrgm_consistent_cache *cache;
00212 
00213     /*
00214      * We keep the extracted trigrams in cache, because trigram extraction is
00215      * relatively CPU-expensive.  When trying to reuse a cached value, check
00216      * strategy number not just query itself, because trigram extraction
00217      * depends on strategy.
00218      *
00219      * The cached structure is a single palloc chunk containing the
00220      * gtrgm_consistent_cache header, then the input query (starting at a
00221      * MAXALIGN boundary), then the TRGM value (also starting at a MAXALIGN
00222      * boundary).  However we don't try to include the regex graph (if any) in
00223      * that struct.  (XXX currently, this approach can leak regex graphs
00224      * across index rescans.  Not clear if that's worth fixing.)
00225      */
00226     cache = (gtrgm_consistent_cache *) fcinfo->flinfo->fn_extra;
00227     if (cache == NULL ||
00228         cache->strategy != strategy ||
00229         VARSIZE(cache->query) != querysize ||
00230         memcmp((char *) cache->query, (char *) query, querysize) != 0)
00231     {
00232         gtrgm_consistent_cache *newcache;
00233         TrgmPackedGraph *graph = NULL;
00234         Size        qtrgsize;
00235 
00236         switch (strategy)
00237         {
00238             case SimilarityStrategyNumber:
00239                 qtrg = generate_trgm(VARDATA(query),
00240                                      querysize - VARHDRSZ);
00241                 break;
00242             case ILikeStrategyNumber:
00243 #ifndef IGNORECASE
00244                 elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
00245 #endif
00246                 /* FALL THRU */
00247             case LikeStrategyNumber:
00248                 qtrg = generate_wildcard_trgm(VARDATA(query),
00249                                               querysize - VARHDRSZ);
00250                 break;
00251             case RegExpICaseStrategyNumber:
00252 #ifndef IGNORECASE
00253                 elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
00254 #endif
00255                 /* FALL THRU */
00256             case RegExpStrategyNumber:
00257                 qtrg = createTrgmNFA(query, PG_GET_COLLATION(),
00258                                      &graph, fcinfo->flinfo->fn_mcxt);
00259                 /* just in case an empty array is returned ... */
00260                 if (qtrg && ARRNELEM(qtrg) <= 0)
00261                 {
00262                     pfree(qtrg);
00263                     qtrg = NULL;
00264                 }
00265                 break;
00266             default:
00267                 elog(ERROR, "unrecognized strategy number: %d", strategy);
00268                 qtrg = NULL;    /* keep compiler quiet */
00269                 break;
00270         }
00271 
00272         qtrgsize = qtrg ? VARSIZE(qtrg) : 0;
00273 
00274         newcache = (gtrgm_consistent_cache *)
00275             MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
00276                                MAXALIGN(sizeof(gtrgm_consistent_cache)) +
00277                                MAXALIGN(querysize) +
00278                                qtrgsize);
00279 
00280         newcache->strategy = strategy;
00281         newcache->query = (text *)
00282             ((char *) newcache + MAXALIGN(sizeof(gtrgm_consistent_cache)));
00283         memcpy((char *) newcache->query, (char *) query, querysize);
00284         if (qtrg)
00285         {
00286             newcache->trigrams = (TRGM *)
00287                 ((char *) newcache->query + MAXALIGN(querysize));
00288             memcpy((char *) newcache->trigrams, (char *) qtrg, qtrgsize);
00289             /* release qtrg in case it was made in fn_mcxt */
00290             pfree(qtrg);
00291         }
00292         else
00293             newcache->trigrams = NULL;
00294         newcache->graph = graph;
00295 
00296         if (cache)
00297             pfree(cache);
00298         fcinfo->flinfo->fn_extra = (void *) newcache;
00299         cache = newcache;
00300     }
00301 
00302     qtrg = cache->trigrams;
00303 
00304     switch (strategy)
00305     {
00306         case SimilarityStrategyNumber:
00307             /* Similarity search is exact */
00308             *recheck = false;
00309 
00310             if (GIST_LEAF(entry))
00311             {                   /* all leafs contains orig trgm */
00312                 float4      tmpsml = cnt_sml(key, qtrg);
00313 
00314                 /* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
00315                 res = (*(int *) &tmpsml == *(int *) &trgm_limit || tmpsml > trgm_limit) ? true : false;
00316             }
00317             else if (ISALLTRUE(key))
00318             {                   /* non-leaf contains signature */
00319                 res = true;
00320             }
00321             else
00322             {                   /* non-leaf contains signature */
00323                 int32       count = cnt_sml_sign_common(qtrg, GETSIGN(key));
00324                 int32       len = ARRNELEM(qtrg);
00325 
00326                 if (len == 0)
00327                     res = false;
00328                 else
00329                     res = (((((float8) count) / ((float8) len))) >= trgm_limit) ? true : false;
00330             }
00331             break;
00332         case ILikeStrategyNumber:
00333 #ifndef IGNORECASE
00334             elog(ERROR, "cannot handle ~~* with case-sensitive trigrams");
00335 #endif
00336             /* FALL THRU */
00337         case LikeStrategyNumber:
00338             /* Wildcard search is inexact */
00339             *recheck = true;
00340 
00341             /*
00342              * Check if all the extracted trigrams can be present in child
00343              * nodes.
00344              */
00345             if (GIST_LEAF(entry))
00346             {                   /* all leafs contains orig trgm */
00347                 res = trgm_contained_by(qtrg, key);
00348             }
00349             else if (ISALLTRUE(key))
00350             {                   /* non-leaf contains signature */
00351                 res = true;
00352             }
00353             else
00354             {                   /* non-leaf contains signature */
00355                 int32       k,
00356                             tmp = 0,
00357                             len = ARRNELEM(qtrg);
00358                 trgm       *ptr = GETARR(qtrg);
00359                 BITVECP     sign = GETSIGN(key);
00360 
00361                 res = true;
00362                 for (k = 0; k < len; k++)
00363                 {
00364                     CPTRGM(((char *) &tmp), ptr + k);
00365                     if (!GETBIT(sign, HASHVAL(tmp)))
00366                     {
00367                         res = false;
00368                         break;
00369                     }
00370                 }
00371             }
00372             break;
00373         case RegExpICaseStrategyNumber:
00374 #ifndef IGNORECASE
00375             elog(ERROR, "cannot handle ~* with case-sensitive trigrams");
00376 #endif
00377             /* FALL THRU */
00378         case RegExpStrategyNumber:
00379             /* Regexp search is inexact */
00380             *recheck = true;
00381 
00382             /* Check regex match as much as we can with available info */
00383             if (qtrg)
00384             {
00385                 if (GIST_LEAF(entry))
00386                 {               /* all leafs contains orig trgm */
00387                     bool       *check;
00388 
00389                     check = trgm_presence_map(qtrg, key);
00390                     res = trigramsMatchGraph(cache->graph, check);
00391                     pfree(check);
00392                 }
00393                 else if (ISALLTRUE(key))
00394                 {               /* non-leaf contains signature */
00395                     res = true;
00396                 }
00397                 else
00398                 {               /* non-leaf contains signature */
00399                     int32       k,
00400                                 tmp = 0,
00401                                 len = ARRNELEM(qtrg);
00402                     trgm       *ptr = GETARR(qtrg);
00403                     BITVECP     sign = GETSIGN(key);
00404                     bool       *check;
00405 
00406                     /*
00407                      * GETBIT() tests may give false positives, due to limited
00408                      * size of the sign array.  But since trigramsMatchGraph()
00409                      * implements a monotone boolean function, false positives
00410                      * in the check array can't lead to false negative answer.
00411                      * So we can apply trigramsMatchGraph despite uncertainty,
00412                      * and that usefully improves the quality of the search.
00413                      */
00414                     check = (bool *) palloc(len * sizeof(bool));
00415                     for (k = 0; k < len; k++)
00416                     {
00417                         CPTRGM(((char *) &tmp), ptr + k);
00418                         check[k] = GETBIT(sign, HASHVAL(tmp));
00419                     }
00420                     res = trigramsMatchGraph(cache->graph, check);
00421                     pfree(check);
00422                 }
00423             }
00424             else
00425             {
00426                 /* trigram-free query must be rechecked everywhere */
00427                 res = true;
00428             }
00429             break;
00430         default:
00431             elog(ERROR, "unrecognized strategy number: %d", strategy);
00432             res = false;        /* keep compiler quiet */
00433             break;
00434     }
00435 
00436     PG_RETURN_BOOL(res);
00437 }
00438 
00439 Datum
00440 gtrgm_distance(PG_FUNCTION_ARGS)
00441 {
00442     GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
00443     text       *query = PG_GETARG_TEXT_P(1);
00444     StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
00445 
00446     /* Oid      subtype = PG_GETARG_OID(3); */
00447     TRGM       *key = (TRGM *) DatumGetPointer(entry->key);
00448     TRGM       *qtrg;
00449     float8      res;
00450     Size        querysize = VARSIZE(query);
00451     char       *cache = (char *) fcinfo->flinfo->fn_extra;
00452 
00453     /*
00454      * Cache the generated trigrams across multiple calls with the same query.
00455      */
00456     if (cache == NULL ||
00457         VARSIZE(cache) != querysize ||
00458         memcmp(cache, query, querysize) != 0)
00459     {
00460         char       *newcache;
00461 
00462         qtrg = generate_trgm(VARDATA(query), querysize - VARHDRSZ);
00463 
00464         newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
00465                                       MAXALIGN(querysize) +
00466                                       VARSIZE(qtrg));
00467 
00468         memcpy(newcache, query, querysize);
00469         memcpy(newcache + MAXALIGN(querysize), qtrg, VARSIZE(qtrg));
00470 
00471         if (cache)
00472             pfree(cache);
00473         fcinfo->flinfo->fn_extra = newcache;
00474         cache = newcache;
00475     }
00476 
00477     qtrg = (TRGM *) (cache + MAXALIGN(querysize));
00478 
00479     switch (strategy)
00480     {
00481         case DistanceStrategyNumber:
00482             if (GIST_LEAF(entry))
00483             {                   /* all leafs contains orig trgm */
00484                 res = 1.0 - cnt_sml(key, qtrg);
00485             }
00486             else if (ISALLTRUE(key))
00487             {                   /* all leafs contains orig trgm */
00488                 res = 0.0;
00489             }
00490             else
00491             {                   /* non-leaf contains signature */
00492                 int32       count = cnt_sml_sign_common(qtrg, GETSIGN(key));
00493                 int32       len = ARRNELEM(qtrg);
00494 
00495                 res = (len == 0) ? -1.0 : 1.0 - ((float8) count) / ((float8) len);
00496             }
00497             break;
00498         default:
00499             elog(ERROR, "unrecognized strategy number: %d", strategy);
00500             res = 0;            /* keep compiler quiet */
00501             break;
00502     }
00503 
00504     PG_RETURN_FLOAT8(res);
00505 }
00506 
00507 static int32
00508 unionkey(BITVECP sbase, TRGM *add)
00509 {
00510     int32       i;
00511 
00512     if (ISSIGNKEY(add))
00513     {
00514         BITVECP     sadd = GETSIGN(add);
00515 
00516         if (ISALLTRUE(add))
00517             return 1;
00518 
00519         LOOPBYTE
00520             sbase[i] |= sadd[i];
00521     }
00522     else
00523     {
00524         trgm       *ptr = GETARR(add);
00525         int32       tmp = 0;
00526 
00527         for (i = 0; i < ARRNELEM(add); i++)
00528         {
00529             CPTRGM(((char *) &tmp), ptr + i);
00530             HASH(sbase, tmp);
00531         }
00532     }
00533     return 0;
00534 }
00535 
00536 
00537 Datum
00538 gtrgm_union(PG_FUNCTION_ARGS)
00539 {
00540     GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
00541     int32       len = entryvec->n;
00542     int        *size = (int *) PG_GETARG_POINTER(1);
00543     BITVEC      base;
00544     int32       i;
00545     int32       flag = 0;
00546     TRGM       *result;
00547 
00548     MemSet((void *) base, 0, sizeof(BITVEC));
00549     for (i = 0; i < len; i++)
00550     {
00551         if (unionkey(base, GETENTRY(entryvec, i)))
00552         {
00553             flag = ALLISTRUE;
00554             break;
00555         }
00556     }
00557 
00558     flag |= SIGNKEY;
00559     len = CALCGTSIZE(flag, 0);
00560     result = (TRGM *) palloc(len);
00561     SET_VARSIZE(result, len);
00562     result->flag = flag;
00563     if (!ISALLTRUE(result))
00564         memcpy((void *) GETSIGN(result), (void *) base, sizeof(BITVEC));
00565     *size = len;
00566 
00567     PG_RETURN_POINTER(result);
00568 }
00569 
00570 Datum
00571 gtrgm_same(PG_FUNCTION_ARGS)
00572 {
00573     TRGM       *a = (TRGM *) PG_GETARG_POINTER(0);
00574     TRGM       *b = (TRGM *) PG_GETARG_POINTER(1);
00575     bool       *result = (bool *) PG_GETARG_POINTER(2);
00576 
00577     if (ISSIGNKEY(a))
00578     {                           /* then b also ISSIGNKEY */
00579         if (ISALLTRUE(a) && ISALLTRUE(b))
00580             *result = true;
00581         else if (ISALLTRUE(a))
00582             *result = false;
00583         else if (ISALLTRUE(b))
00584             *result = false;
00585         else
00586         {
00587             int32       i;
00588             BITVECP     sa = GETSIGN(a),
00589                         sb = GETSIGN(b);
00590 
00591             *result = true;
00592             LOOPBYTE
00593             {
00594                 if (sa[i] != sb[i])
00595                 {
00596                     *result = false;
00597                     break;
00598                 }
00599             }
00600         }
00601     }
00602     else
00603     {                           /* a and b ISARRKEY */
00604         int32       lena = ARRNELEM(a),
00605                     lenb = ARRNELEM(b);
00606 
00607         if (lena != lenb)
00608             *result = false;
00609         else
00610         {
00611             trgm       *ptra = GETARR(a),
00612                        *ptrb = GETARR(b);
00613             int32       i;
00614 
00615             *result = true;
00616             for (i = 0; i < lena; i++)
00617                 if (CMPTRGM(ptra + i, ptrb + i))
00618                 {
00619                     *result = false;
00620                     break;
00621                 }
00622         }
00623     }
00624 
00625     PG_RETURN_POINTER(result);
00626 }
00627 
00628 static int32
00629 sizebitvec(BITVECP sign)
00630 {
00631     int32       size = 0,
00632                 i;
00633 
00634     LOOPBYTE
00635         size += number_of_ones[(unsigned char) sign[i]];
00636     return size;
00637 }
00638 
00639 static int
00640 hemdistsign(BITVECP a, BITVECP b)
00641 {
00642     int         i,
00643                 diff,
00644                 dist = 0;
00645 
00646     LOOPBYTE
00647     {
00648         diff = (unsigned char) (a[i] ^ b[i]);
00649         dist += number_of_ones[diff];
00650     }
00651     return dist;
00652 }
00653 
00654 static int
00655 hemdist(TRGM *a, TRGM *b)
00656 {
00657     if (ISALLTRUE(a))
00658     {
00659         if (ISALLTRUE(b))
00660             return 0;
00661         else
00662             return SIGLENBIT - sizebitvec(GETSIGN(b));
00663     }
00664     else if (ISALLTRUE(b))
00665         return SIGLENBIT - sizebitvec(GETSIGN(a));
00666 
00667     return hemdistsign(GETSIGN(a), GETSIGN(b));
00668 }
00669 
00670 Datum
00671 gtrgm_penalty(PG_FUNCTION_ARGS)
00672 {
00673     GISTENTRY  *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); /* always ISSIGNKEY */
00674     GISTENTRY  *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
00675     float      *penalty = (float *) PG_GETARG_POINTER(2);
00676     TRGM       *origval = (TRGM *) DatumGetPointer(origentry->key);
00677     TRGM       *newval = (TRGM *) DatumGetPointer(newentry->key);
00678     BITVECP     orig = GETSIGN(origval);
00679 
00680     *penalty = 0.0;
00681 
00682     if (ISARRKEY(newval))
00683     {
00684         char       *cache = (char *) fcinfo->flinfo->fn_extra;
00685         TRGM       *cachedVal = (TRGM *) (cache + MAXALIGN(sizeof(BITVEC)));
00686         Size        newvalsize = VARSIZE(newval);
00687         BITVECP     sign;
00688 
00689         /*
00690          * Cache the sign data across multiple calls with the same newval.
00691          */
00692         if (cache == NULL ||
00693             VARSIZE(cachedVal) != newvalsize ||
00694             memcmp(cachedVal, newval, newvalsize) != 0)
00695         {
00696             char       *newcache;
00697 
00698             newcache = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
00699                                           MAXALIGN(sizeof(BITVEC)) +
00700                                           newvalsize);
00701 
00702             makesign((BITVECP) newcache, newval);
00703 
00704             cachedVal = (TRGM *) (newcache + MAXALIGN(sizeof(BITVEC)));
00705             memcpy(cachedVal, newval, newvalsize);
00706 
00707             if (cache)
00708                 pfree(cache);
00709             fcinfo->flinfo->fn_extra = newcache;
00710             cache = newcache;
00711         }
00712 
00713         sign = (BITVECP) cache;
00714 
00715         if (ISALLTRUE(origval))
00716             *penalty = ((float) (SIGLENBIT - sizebitvec(sign))) / (float) (SIGLENBIT + 1);
00717         else
00718             *penalty = hemdistsign(sign, orig);
00719     }
00720     else
00721         *penalty = hemdist(origval, newval);
00722     PG_RETURN_POINTER(penalty);
00723 }
00724 
00725 typedef struct
00726 {
00727     bool        allistrue;
00728     BITVEC      sign;
00729 } CACHESIGN;
00730 
00731 static void
00732 fillcache(CACHESIGN *item, TRGM *key)
00733 {
00734     item->allistrue = false;
00735     if (ISARRKEY(key))
00736         makesign(item->sign, key);
00737     else if (ISALLTRUE(key))
00738         item->allistrue = true;
00739     else
00740         memcpy((void *) item->sign, (void *) GETSIGN(key), sizeof(BITVEC));
00741 }
00742 
00743 #define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) )
00744 typedef struct
00745 {
00746     OffsetNumber pos;
00747     int32       cost;
00748 } SPLITCOST;
00749 
00750 static int
00751 comparecost(const void *a, const void *b)
00752 {
00753     if (((const SPLITCOST *) a)->cost == ((const SPLITCOST *) b)->cost)
00754         return 0;
00755     else
00756         return (((const SPLITCOST *) a)->cost > ((const SPLITCOST *) b)->cost) ? 1 : -1;
00757 }
00758 
00759 
00760 static int
00761 hemdistcache(CACHESIGN *a, CACHESIGN *b)
00762 {
00763     if (a->allistrue)
00764     {
00765         if (b->allistrue)
00766             return 0;
00767         else
00768             return SIGLENBIT - sizebitvec(b->sign);
00769     }
00770     else if (b->allistrue)
00771         return SIGLENBIT - sizebitvec(a->sign);
00772 
00773     return hemdistsign(a->sign, b->sign);
00774 }
00775 
00776 Datum
00777 gtrgm_picksplit(PG_FUNCTION_ARGS)
00778 {
00779     GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
00780     OffsetNumber maxoff = entryvec->n - 2;
00781     GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
00782     OffsetNumber k,
00783                 j;
00784     TRGM       *datum_l,
00785                *datum_r;
00786     BITVECP     union_l,
00787                 union_r;
00788     int32       size_alpha,
00789                 size_beta;
00790     int32       size_waste,
00791                 waste = -1;
00792     int32       nbytes;
00793     OffsetNumber seed_1 = 0,
00794                 seed_2 = 0;
00795     OffsetNumber *left,
00796                *right;
00797     BITVECP     ptr;
00798     int         i;
00799     CACHESIGN  *cache;
00800     SPLITCOST  *costvector;
00801 
00802     /* cache the sign data for each existing item */
00803     cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 2));
00804     for (k = FirstOffsetNumber; k <= maxoff; k = OffsetNumberNext(k))
00805         fillcache(&cache[k], GETENTRY(entryvec, k));
00806 
00807     /* now find the two furthest-apart items */
00808     for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k))
00809     {
00810         for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j))
00811         {
00812             size_waste = hemdistcache(&(cache[j]), &(cache[k]));
00813             if (size_waste > waste)
00814             {
00815                 waste = size_waste;
00816                 seed_1 = k;
00817                 seed_2 = j;
00818             }
00819         }
00820     }
00821 
00822     /* just in case we didn't make a selection ... */
00823     if (seed_1 == 0 || seed_2 == 0)
00824     {
00825         seed_1 = 1;
00826         seed_2 = 2;
00827     }
00828 
00829     /* initialize the result vectors */
00830     nbytes = (maxoff + 2) * sizeof(OffsetNumber);
00831     v->spl_left = left = (OffsetNumber *) palloc(nbytes);
00832     v->spl_right = right = (OffsetNumber *) palloc(nbytes);
00833     v->spl_nleft = 0;
00834     v->spl_nright = 0;
00835 
00836     /* form initial .. */
00837     if (cache[seed_1].allistrue)
00838     {
00839         datum_l = (TRGM *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
00840         SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
00841         datum_l->flag = SIGNKEY | ALLISTRUE;
00842     }
00843     else
00844     {
00845         datum_l = (TRGM *) palloc(CALCGTSIZE(SIGNKEY, 0));
00846         SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY, 0));
00847         datum_l->flag = SIGNKEY;
00848         memcpy((void *) GETSIGN(datum_l), (void *) cache[seed_1].sign, sizeof(BITVEC));
00849     }
00850     if (cache[seed_2].allistrue)
00851     {
00852         datum_r = (TRGM *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
00853         SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0));
00854         datum_r->flag = SIGNKEY | ALLISTRUE;
00855     }
00856     else
00857     {
00858         datum_r = (TRGM *) palloc(CALCGTSIZE(SIGNKEY, 0));
00859         SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY, 0));
00860         datum_r->flag = SIGNKEY;
00861         memcpy((void *) GETSIGN(datum_r), (void *) cache[seed_2].sign, sizeof(BITVEC));
00862     }
00863 
00864     union_l = GETSIGN(datum_l);
00865     union_r = GETSIGN(datum_r);
00866     maxoff = OffsetNumberNext(maxoff);
00867     fillcache(&cache[maxoff], GETENTRY(entryvec, maxoff));
00868     /* sort before ... */
00869     costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff);
00870     for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j))
00871     {
00872         costvector[j - 1].pos = j;
00873         size_alpha = hemdistcache(&(cache[seed_1]), &(cache[j]));
00874         size_beta = hemdistcache(&(cache[seed_2]), &(cache[j]));
00875         costvector[j - 1].cost = abs(size_alpha - size_beta);
00876     }
00877     qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost);
00878 
00879     for (k = 0; k < maxoff; k++)
00880     {
00881         j = costvector[k].pos;
00882         if (j == seed_1)
00883         {
00884             *left++ = j;
00885             v->spl_nleft++;
00886             continue;
00887         }
00888         else if (j == seed_2)
00889         {
00890             *right++ = j;
00891             v->spl_nright++;
00892             continue;
00893         }
00894 
00895         if (ISALLTRUE(datum_l) || cache[j].allistrue)
00896         {
00897             if (ISALLTRUE(datum_l) && cache[j].allistrue)
00898                 size_alpha = 0;
00899             else
00900                 size_alpha = SIGLENBIT - sizebitvec(
00901                                                     (cache[j].allistrue) ? GETSIGN(datum_l) : GETSIGN(cache[j].sign)
00902                     );
00903         }
00904         else
00905             size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l));
00906 
00907         if (ISALLTRUE(datum_r) || cache[j].allistrue)
00908         {
00909             if (ISALLTRUE(datum_r) && cache[j].allistrue)
00910                 size_beta = 0;
00911             else
00912                 size_beta = SIGLENBIT - sizebitvec(
00913                                                    (cache[j].allistrue) ? GETSIGN(datum_r) : GETSIGN(cache[j].sign)
00914                     );
00915         }
00916         else
00917             size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r));
00918 
00919         if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.1))
00920         {
00921             if (ISALLTRUE(datum_l) || cache[j].allistrue)
00922             {
00923                 if (!ISALLTRUE(datum_l))
00924                     MemSet((void *) GETSIGN(datum_l), 0xff, sizeof(BITVEC));
00925             }
00926             else
00927             {
00928                 ptr = cache[j].sign;
00929                 LOOPBYTE
00930                     union_l[i] |= ptr[i];
00931             }
00932             *left++ = j;
00933             v->spl_nleft++;
00934         }
00935         else
00936         {
00937             if (ISALLTRUE(datum_r) || cache[j].allistrue)
00938             {
00939                 if (!ISALLTRUE(datum_r))
00940                     MemSet((void *) GETSIGN(datum_r), 0xff, sizeof(BITVEC));
00941             }
00942             else
00943             {
00944                 ptr = cache[j].sign;
00945                 LOOPBYTE
00946                     union_r[i] |= ptr[i];
00947             }
00948             *right++ = j;
00949             v->spl_nright++;
00950         }
00951     }
00952 
00953     *right = *left = FirstOffsetNumber;
00954     v->spl_ldatum = PointerGetDatum(datum_l);
00955     v->spl_rdatum = PointerGetDatum(datum_r);
00956 
00957     PG_RETURN_POINTER(v);
00958 }