Header And Logo

PostgreSQL
| The world's most advanced open source database.

rangetypes_selfuncs.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * rangetypes_selfuncs.c
00004  *    Functions for selectivity estimation of range operators
00005  *
00006  * Estimates are based on histograms of lower and upper bounds, and the
00007  * fraction of empty ranges.
00008  *
00009  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00010  * Portions Copyright (c) 1994, Regents of the University of California
00011  *
00012  *
00013  * IDENTIFICATION
00014  *    src/backend/utils/adt/rangetypes_selfuncs.c
00015  *
00016  *-------------------------------------------------------------------------
00017  */
00018 #include "postgres.h"
00019 
00020 #include "access/htup_details.h"
00021 #include "catalog/pg_operator.h"
00022 #include "catalog/pg_statistic.h"
00023 #include "utils/builtins.h"
00024 #include "utils/lsyscache.h"
00025 #include "utils/rangetypes.h"
00026 #include "utils/selfuncs.h"
00027 #include "utils/typcache.h"
00028 
00029 static double calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
00030               RangeType *constval, Oid operator);
00031 static double default_range_selectivity(Oid operator);
00032 static double calc_hist_selectivity(TypeCacheEntry *typcache,
00033                       VariableStatData *vardata, RangeType *constval,
00034                       Oid operator);
00035 static double calc_hist_selectivity_scalar(TypeCacheEntry *typcache,
00036                              RangeBound *constbound,
00037                              RangeBound *hist, int hist_nvalues,
00038                              bool equal);
00039 static int rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value,
00040                RangeBound *hist, int hist_length, bool equal);
00041 static float8 get_position(TypeCacheEntry *typcache, RangeBound *value,
00042              RangeBound *hist1, RangeBound *hist2);
00043 static float8 get_len_position(double value, double hist1, double hist2);
00044 static float8 get_distance(TypeCacheEntry *typcache, RangeBound *bound1,
00045                                                             RangeBound *bound2);
00046 static int length_hist_bsearch(Datum *length_hist_values,
00047                     int length_hist_nvalues, double value, bool equal);
00048 static double calc_length_hist_frac(Datum *length_hist_values,
00049                                     int length_hist_nvalues, double length1, double length2, bool equal);
00050 static double calc_hist_selectivity_contained(TypeCacheEntry *typcache,
00051                                 RangeBound *lower, RangeBound *upper,
00052                                 RangeBound *hist_lower, int hist_nvalues,
00053                             Datum *length_hist_values, int length_hist_nvalues);
00054 static double calc_hist_selectivity_contains(TypeCacheEntry *typcache,
00055                                RangeBound *lower, RangeBound *upper,
00056                                RangeBound *hist_lower, int hist_nvalues,
00057                             Datum *length_hist_values, int length_hist_nvalues);
00058 
00059 /*
00060  * Returns a default selectivity estimate for given operator, when we don't
00061  * have statistics or cannot use them for some reason.
00062  */
00063 static double
00064 default_range_selectivity(Oid operator)
00065 {
00066     switch (operator)
00067     {
00068         case OID_RANGE_OVERLAP_OP:
00069             return 0.01;
00070 
00071         case OID_RANGE_CONTAINS_OP:
00072         case OID_RANGE_CONTAINED_OP:
00073             return 0.005;
00074 
00075         case OID_RANGE_CONTAINS_ELEM_OP:
00076             /*
00077              * "range @> elem" is more or less identical to a scalar
00078              * inequality "A >= b AND A <= c".
00079              */
00080             return DEFAULT_RANGE_INEQ_SEL;
00081 
00082         case OID_RANGE_LESS_OP:
00083         case OID_RANGE_LESS_EQUAL_OP:
00084         case OID_RANGE_GREATER_OP:
00085         case OID_RANGE_GREATER_EQUAL_OP:
00086         case OID_RANGE_LEFT_OP:
00087         case OID_RANGE_RIGHT_OP:
00088             /* these are similar to regular scalar inequalities */
00089             return DEFAULT_INEQ_SEL;
00090 
00091         default:
00092             /* all range operators should be handled above, but just in case */
00093             return 0.01;
00094     }
00095 }
00096 
00097 /*
00098  * rangesel -- restriction selectivity for range operators
00099  */
00100 Datum
00101 rangesel(PG_FUNCTION_ARGS)
00102 {
00103     PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
00104     Oid         operator = PG_GETARG_OID(1);
00105     List       *args = (List *) PG_GETARG_POINTER(2);
00106     int         varRelid = PG_GETARG_INT32(3);
00107     VariableStatData vardata;
00108     Node       *other;
00109     bool        varonleft;
00110     Selectivity selec;
00111     TypeCacheEntry *typcache;
00112     RangeType  *constrange = NULL;
00113 
00114     /*
00115      * If expression is not (variable op something) or (something op
00116      * variable), then punt and return a default estimate.
00117      */
00118     if (!get_restriction_variable(root, args, varRelid,
00119                                   &vardata, &other, &varonleft))
00120         PG_RETURN_FLOAT8(default_range_selectivity(operator));
00121 
00122     /*
00123      * Can't do anything useful if the something is not a constant, either.
00124      */
00125     if (!IsA(other, Const))
00126     {
00127         ReleaseVariableStats(vardata);
00128         PG_RETURN_FLOAT8(default_range_selectivity(operator));
00129     }
00130 
00131     /*
00132      * All the range operators are strict, so we can cope with a NULL constant
00133      * right away.
00134      */
00135     if (((Const *) other)->constisnull)
00136     {
00137         ReleaseVariableStats(vardata);
00138         PG_RETURN_FLOAT8(0.0);
00139     }
00140 
00141     /*
00142      * If var is on the right, commute the operator, so that we can assume the
00143      * var is on the left in what follows.
00144      */
00145     if (!varonleft)
00146     {
00147         /* we have other Op var, commute to make var Op other */
00148         operator = get_commutator(operator);
00149         if (!operator)
00150         {
00151             /* Use default selectivity (should we raise an error instead?) */
00152             ReleaseVariableStats(vardata);
00153             PG_RETURN_FLOAT8(default_range_selectivity(operator));
00154         }
00155     }
00156 
00157     /*
00158      * OK, there's a Var and a Const we're dealing with here.  We need the
00159      * Const to be of same range type as the column, else we can't do anything
00160      * useful. (Such cases will likely fail at runtime, but here we'd rather
00161      * just return a default estimate.)
00162      *
00163      * If the operator is "range @> element", the constant should be of the
00164      * element type of the range column. Convert it to a range that includes
00165      * only that single point, so that we don't need special handling for
00166      * that in what follows.
00167      */
00168     if (operator == OID_RANGE_CONTAINS_ELEM_OP)
00169     {
00170         typcache = range_get_typcache(fcinfo, vardata.vartype);
00171 
00172         if (((Const *) other)->consttype == typcache->rngelemtype->type_id)
00173         {
00174             RangeBound lower, upper;
00175             lower.inclusive = true;
00176             lower.val = ((Const *) other)->constvalue;
00177             lower.infinite = false;
00178             lower.lower = true;
00179             upper.inclusive = true;
00180             upper.val = ((Const *) other)->constvalue;
00181             upper.infinite = false;
00182             upper.lower = false;
00183             constrange = range_serialize(typcache, &lower, &upper, false);
00184         }
00185     }
00186     else
00187     {
00188         typcache = range_get_typcache(fcinfo, ((Const *) other)->consttype);
00189 
00190         if (((Const *) other)->consttype == vardata.vartype)
00191             constrange = DatumGetRangeType(((Const *) other)->constvalue);
00192     }
00193 
00194     /*
00195      * If we got a valid constant on one side of the operator, proceed to
00196      * estimate using statistics. Otherwise punt and return a default
00197      * constant estimate.
00198      */
00199     if (constrange)
00200         selec = calc_rangesel(typcache, &vardata, constrange, operator);
00201     else
00202         selec = default_range_selectivity(operator);
00203 
00204     ReleaseVariableStats(vardata);
00205 
00206     CLAMP_PROBABILITY(selec);
00207 
00208     PG_RETURN_FLOAT8((float8) selec);
00209 }
00210 
00211 static double
00212 calc_rangesel(TypeCacheEntry *typcache, VariableStatData *vardata,
00213               RangeType *constval, Oid operator)
00214 {
00215     double      hist_selec;
00216     double      selec;
00217     float4      empty_frac, null_frac;
00218 
00219     /*
00220      * First look up the fraction of NULLs and empty ranges from pg_statistic.
00221      */
00222     if (HeapTupleIsValid(vardata->statsTuple))
00223     {
00224         Form_pg_statistic stats;
00225         float4     *numbers;
00226         int         nnumbers;
00227 
00228         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
00229         null_frac = stats->stanullfrac;
00230 
00231         /* Try to get fraction of empty ranges */
00232         if (get_attstatsslot(vardata->statsTuple,
00233                              vardata->atttype, vardata->atttypmod,
00234                              STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM, InvalidOid,
00235                              NULL,
00236                              NULL, NULL,
00237                              &numbers, &nnumbers))
00238         {
00239             if (nnumbers != 1)
00240                 elog(ERROR, "invalid empty fraction statistic"); /* shouldn't happen */
00241             empty_frac = numbers[0];
00242         }
00243         else
00244         {
00245             /* No empty fraction statistic. Assume no empty ranges. */
00246             empty_frac = 0.0;
00247         }
00248     }
00249     else
00250     {
00251         /*
00252          * No stats are available. Follow through the calculations below
00253          * anyway, assuming no NULLs and no empty ranges. This still allows
00254          * us to give a better-than-nothing estimate based on whether the
00255          * constant is an empty range or not.
00256          */
00257         null_frac = 0.0;
00258         empty_frac = 0.0;
00259     }
00260 
00261     if (RangeIsEmpty(constval))
00262     {
00263         /*
00264          * An empty range matches all ranges, all empty ranges, or nothing,
00265          * depending on the operator
00266          */
00267         switch (operator)
00268         {
00269             case OID_RANGE_OVERLAP_OP:
00270             case OID_RANGE_OVERLAPS_LEFT_OP:
00271             case OID_RANGE_OVERLAPS_RIGHT_OP:
00272             case OID_RANGE_LEFT_OP:
00273             case OID_RANGE_RIGHT_OP:
00274                 /* these return false if either argument is empty */
00275                 selec = 0.0;
00276                 break;
00277 
00278             case OID_RANGE_CONTAINED_OP:
00279             case OID_RANGE_LESS_EQUAL_OP:
00280             case OID_RANGE_GREATER_EQUAL_OP:
00281                 /*
00282                  * these return true when both args are empty, false if only
00283                  * one is empty
00284                  */
00285                 selec = empty_frac;
00286                 break;
00287 
00288             case OID_RANGE_CONTAINS_OP:
00289                 /* everything contains an empty range */
00290                 selec = 1.0;
00291                 break;
00292 
00293             case OID_RANGE_CONTAINS_ELEM_OP:
00294             default:
00295                 elog(ERROR, "unexpected operator %u", operator);
00296                 selec = 0.0; /* keep compiler quiet */
00297                 break;
00298         }
00299     }
00300     else
00301     {
00302         /*
00303          * Calculate selectivity using bound histograms. If that fails for
00304          * some reason, e.g no histogram in pg_statistic, use the default
00305          * constant estimate for the fraction of non-empty values. This is
00306          * still somewhat better than just returning the default estimate,
00307          * because this still takes into account the fraction of empty and
00308          * NULL tuples, if we had statistics for them.
00309          */
00310         hist_selec = calc_hist_selectivity(typcache, vardata, constval,
00311                                            operator);
00312         if (hist_selec < 0.0)
00313             hist_selec = default_range_selectivity(operator);
00314 
00315         /*
00316          * Now merge the results for the empty ranges and histogram
00317          * calculations, realizing that the histogram covers only the
00318          * non-null, non-empty values.
00319          */
00320         if (operator == OID_RANGE_CONTAINED_OP)
00321         {
00322             /* empty is contained by anything non-empty */
00323             selec = (1.0 - empty_frac) * hist_selec + empty_frac;
00324         }
00325         else
00326         {
00327             /* with any other operator, empty Op non-empty matches nothing */
00328             selec = (1.0 - empty_frac) * hist_selec;
00329         }
00330     }
00331 
00332     /* all range operators are strict */
00333     selec *= (1.0 - null_frac);
00334 
00335     /* result should be in range, but make sure... */
00336     CLAMP_PROBABILITY(selec);
00337 
00338     return selec;
00339 }
00340 
00341 /*
00342  * Calculate range operator selectivity using histograms of range bounds.
00343  *
00344  * This estimate is for the portion of values that are not empty and not
00345  * NULL.
00346  */
00347 static double
00348 calc_hist_selectivity(TypeCacheEntry *typcache, VariableStatData *vardata,
00349                       RangeType *constval, Oid operator)
00350 {
00351     Datum      *hist_values;
00352     int         nhist;
00353     Datum      *length_hist_values;
00354     int         length_nhist;
00355     RangeBound *hist_lower;
00356     RangeBound *hist_upper;
00357     int         i;
00358     RangeBound  const_lower;
00359     RangeBound  const_upper;
00360     bool        empty;
00361     double      hist_selec;
00362 
00363     /* Try to get histogram of ranges */
00364     if (!(HeapTupleIsValid(vardata->statsTuple) &&
00365           get_attstatsslot(vardata->statsTuple,
00366                            vardata->atttype, vardata->atttypmod,
00367                            STATISTIC_KIND_BOUNDS_HISTOGRAM, InvalidOid,
00368                            NULL,
00369                            &hist_values, &nhist,
00370                            NULL, NULL)))
00371         return -1.0;
00372 
00373     /*
00374      * Convert histogram of ranges into histograms of its lower and upper
00375      * bounds.
00376      */
00377     hist_lower = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
00378     hist_upper = (RangeBound *) palloc(sizeof(RangeBound) * nhist);
00379     for (i = 0; i < nhist; i++)
00380     {
00381         range_deserialize(typcache, DatumGetRangeType(hist_values[i]),
00382                           &hist_lower[i], &hist_upper[i], &empty);
00383         /* The histogram should not contain any empty ranges */
00384         if (empty)
00385             elog(ERROR, "bounds histogram contains an empty range");
00386     }
00387 
00388     /* @> and @< also need a histogram of range lengths */
00389     if (operator == OID_RANGE_CONTAINS_OP ||
00390         operator == OID_RANGE_CONTAINED_OP)
00391     {
00392         if (!(HeapTupleIsValid(vardata->statsTuple) &&
00393               get_attstatsslot(vardata->statsTuple,
00394                                vardata->atttype, vardata->atttypmod,
00395                                STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM,
00396                                InvalidOid,
00397                                NULL,
00398                                &length_hist_values, &length_nhist,
00399                                NULL, NULL)))
00400             return -1.0;
00401 
00402         /* check that it's a histogram, not just a dummy entry */
00403         if (length_nhist < 2)
00404             return -1.0;
00405     }
00406 
00407     /* Extract the bounds of the constant value. */
00408     range_deserialize(typcache, constval, &const_lower, &const_upper, &empty);
00409     Assert (!empty);
00410 
00411     /*
00412      * Calculate selectivity comparing the lower or upper bound of the
00413      * constant with the histogram of lower or upper bounds.
00414      */
00415     switch (operator)
00416     {
00417         case OID_RANGE_LESS_OP:
00418             /*
00419              * The regular b-tree comparison operators (<, <=, >, >=) compare
00420              * the lower bounds first, and the upper bounds for values with
00421              * equal lower bounds. Estimate that by comparing the lower bounds
00422              * only. This gives a fairly accurate estimate assuming there
00423              * aren't many rows with a lower bound equal to the constant's
00424              * lower bound.
00425              */
00426             hist_selec =
00427                 calc_hist_selectivity_scalar(typcache, &const_lower,
00428                                              hist_lower, nhist, false);
00429             break;
00430 
00431         case OID_RANGE_LESS_EQUAL_OP:
00432             hist_selec =
00433                 calc_hist_selectivity_scalar(typcache, &const_lower,
00434                                              hist_lower, nhist, true);
00435             break;
00436 
00437         case OID_RANGE_GREATER_OP:
00438             hist_selec =
00439                 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
00440                                                  hist_lower, nhist, true);
00441             break;
00442 
00443         case OID_RANGE_GREATER_EQUAL_OP:
00444             hist_selec =
00445                 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
00446                                                  hist_lower, nhist, false);
00447             break;
00448 
00449         case OID_RANGE_LEFT_OP:
00450             /* var << const when upper(var) < lower(const) */
00451             hist_selec =
00452                 calc_hist_selectivity_scalar(typcache, &const_lower,
00453                                              hist_upper, nhist, false);
00454             break;
00455 
00456         case OID_RANGE_RIGHT_OP:
00457             /* var >> const when lower(var) > upper(const) */
00458             hist_selec =
00459                 1 - calc_hist_selectivity_scalar(typcache, &const_upper,
00460                                                  hist_lower, nhist, true);
00461             break;
00462 
00463         case OID_RANGE_OVERLAPS_RIGHT_OP:
00464             /* compare lower bounds */
00465             hist_selec =
00466                 1 - calc_hist_selectivity_scalar(typcache, &const_lower,
00467                                                  hist_lower, nhist, false);
00468             break;
00469 
00470         case OID_RANGE_OVERLAPS_LEFT_OP:
00471             /* compare upper bounds */
00472             hist_selec =
00473                 calc_hist_selectivity_scalar(typcache, &const_upper,
00474                                              hist_upper, nhist, true);
00475             break;
00476 
00477         case OID_RANGE_OVERLAP_OP:
00478         case OID_RANGE_CONTAINS_ELEM_OP:
00479             /*
00480              * A && B <=> NOT (A << B OR A >> B).
00481              *
00482              * Since A << B and A >> B are mutually exclusive events we can sum
00483              * their probabilities to find probability of (A << B OR A >> B).
00484              *
00485              * "range @> elem" is equivalent to "range && [elem,elem]". The
00486              * caller already constructed the singular range from the element
00487              * constant, so just treat it the same as &&.
00488              */
00489             hist_selec =
00490                 calc_hist_selectivity_scalar(typcache, &const_lower, hist_upper,
00491                                              nhist, false);
00492             hist_selec +=
00493                 (1.0 - calc_hist_selectivity_scalar(typcache, &const_upper, hist_lower,
00494                                                   nhist, true));
00495             hist_selec = 1.0 - hist_selec;
00496             break;
00497 
00498         case OID_RANGE_CONTAINS_OP:
00499             hist_selec =
00500                 calc_hist_selectivity_contains(typcache, &const_lower,
00501                                             &const_upper, hist_lower, nhist,
00502                                             length_hist_values, length_nhist);
00503             break;
00504 
00505         case OID_RANGE_CONTAINED_OP:
00506             if (const_lower.infinite)
00507             {
00508                 /*
00509                  * Lower bound no longer matters. Just estimate the fraction
00510                  * with an upper bound <= const uppert bound
00511                  */
00512                 hist_selec =
00513                     calc_hist_selectivity_scalar(typcache, &const_upper,
00514                                                  hist_upper, nhist, true);
00515             }
00516             else if (const_upper.infinite)
00517             {
00518                 hist_selec =
00519                     1.0 - calc_hist_selectivity_scalar(typcache, &const_lower,
00520                                                        hist_lower, nhist, false);
00521             }
00522             else
00523             {
00524                 hist_selec =
00525                     calc_hist_selectivity_contained(typcache, &const_lower,
00526                                                     &const_upper, hist_lower, nhist,
00527                                                     length_hist_values, length_nhist);
00528             }
00529             break;
00530 
00531         default:
00532             elog(ERROR, "unknown range operator %u", operator);
00533             hist_selec = -1.0; /* keep compiler quiet */
00534             break;
00535     }
00536 
00537     return hist_selec;
00538 }
00539 
00540 
00541 /*
00542  * Look up the fraction of values less than (or equal, if 'equal' argument
00543  * is true) a given const in a histogram of range bounds.
00544  */
00545 static double
00546 calc_hist_selectivity_scalar(TypeCacheEntry *typcache, RangeBound *constbound,
00547                              RangeBound *hist, int hist_nvalues, bool equal)
00548 {
00549     Selectivity selec;
00550     int         index;
00551 
00552     /*
00553      * Find the histogram bin the given constant falls into. Estimate
00554      * selectivity as the number of preceding whole bins.
00555      */
00556     index = rbound_bsearch(typcache, constbound, hist, hist_nvalues, equal);
00557     selec = (Selectivity) (Max(index, 0)) / (Selectivity) (hist_nvalues - 1);
00558 
00559     /* Adjust using linear interpolation within the bin */
00560     if (index >= 0 && index < hist_nvalues - 1)
00561         selec += get_position(typcache, constbound, &hist[index],
00562                         &hist[index + 1]) / (Selectivity) (hist_nvalues - 1);
00563 
00564     return selec;
00565 }
00566 
00567 /*
00568  * Binary search on an array of range bounds. Returns greatest index of range
00569  * bound in array which is less(less or equal) than given range bound. If all
00570  * range bounds in array are greater or equal(greater) than given range bound,
00571  * return -1. When "equal" flag is set conditions in brackets are used.
00572  *
00573  * This function is used in scalar operators selectivity estimation. Another
00574  * goal of this function is to found an histogram bin where to stop
00575  * interpolation of portion of bounds which are less or equal to given bound.
00576  */
00577 static int
00578 rbound_bsearch(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist,
00579               int hist_length, bool equal)
00580 {
00581     int         lower = -1,
00582                 upper = hist_length - 1,
00583                 cmp,
00584                 middle;
00585 
00586     while (lower < upper)
00587     {
00588         middle = (lower + upper + 1) / 2;
00589         cmp = range_cmp_bounds(typcache, &hist[middle], value);
00590 
00591         if (cmp < 0 || (equal && cmp == 0))
00592             lower = middle;
00593         else
00594             upper = middle - 1;
00595     }
00596     return lower;
00597 }
00598 
00599 
00600 /*
00601  * Binary search on length histogram. Returns greatest index of range length in
00602  * histogram which is less than (less than or equal) the given length value. If
00603  * all lengths in the histogram are greater than (greater than or equal) the
00604  * given length, returns -1.
00605  */
00606 static int
00607 length_hist_bsearch(Datum *length_hist_values, int length_hist_nvalues,
00608                     double value, bool equal)
00609 {
00610     int         lower = -1,
00611                 upper = length_hist_nvalues - 1,
00612                 middle;
00613 
00614     while (lower < upper)
00615     {
00616         double middleval;
00617 
00618         middle = (lower + upper + 1) / 2;
00619 
00620         middleval = DatumGetFloat8(length_hist_values[middle]);
00621         if (middleval < value || (equal && middleval <= value))
00622             lower = middle;
00623         else
00624             upper = middle - 1;
00625     }
00626     return lower;
00627 }
00628 
00629 /*
00630  * Get relative position of value in histogram bin in [0,1] range.
00631  */
00632 static float8
00633 get_position(TypeCacheEntry *typcache, RangeBound *value, RangeBound *hist1,
00634              RangeBound *hist2)
00635 {
00636     bool        has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
00637     float8      position;
00638 
00639     if (!hist1->infinite && !hist2->infinite)
00640     {
00641         float8      bin_width;
00642 
00643         /*
00644          * Both bounds are finite. Assuming the subtype's comparison function
00645          * works sanely, the value must be finite, too, because it lies
00646          * somewhere between the bounds. If it doesn't, just return something.
00647          */
00648         if (value->infinite)
00649             return 0.5;
00650 
00651         /* Can't interpolate without subdiff function */
00652         if (!has_subdiff)
00653             return 0.5;
00654 
00655         /* Calculate relative position using subdiff function. */
00656         bin_width = DatumGetFloat8(FunctionCall2Coll(
00657                                                 &typcache->rng_subdiff_finfo,
00658                                                      typcache->rng_collation,
00659                                                      hist2->val,
00660                                                      hist1->val));
00661         if (bin_width <= 0.0)
00662             return 0.5;     /* zero width bin */
00663 
00664         position = DatumGetFloat8(FunctionCall2Coll(
00665                                                 &typcache->rng_subdiff_finfo,
00666                                                     typcache->rng_collation,
00667                                                     value->val,
00668                                                     hist1->val))
00669             / bin_width;
00670 
00671         /* Relative position must be in [0,1] range */
00672         position = Max(position, 0.0);
00673         position = Min(position, 1.0);
00674         return position;
00675     }
00676     else if (hist1->infinite && !hist2->infinite)
00677     {
00678         /*
00679          * Lower bin boundary is -infinite, upper is finite. If the value is
00680          * -infinite, return 0.0 to indicate it's equal to the lower bound.
00681          * Otherwise return 1.0 to indicate it's infinitely far from the lower
00682          * bound.
00683          */
00684         return ((value->infinite && value->lower) ? 0.0 : 1.0);
00685     }
00686     else if (!hist1->infinite && hist2->infinite)
00687     {
00688         /* same as above, but in reverse */
00689         return ((value->infinite && !value->lower) ? 1.0 : 0.0);
00690     }
00691     else
00692     {
00693         /*
00694          * If both bin boundaries are infinite, they should be equal to each
00695          * other, and the value should also be infinite and equal to both
00696          * bounds. (But don't Assert that, to avoid crashing if a user creates
00697          * a datatype with a broken comparison function).
00698          *
00699          * Assume the value to lie in the middle of the infinite bounds.
00700          */
00701         return 0.5;
00702     }
00703 }
00704 
00705 
00706 /*
00707  * Get relative position of value in a length histogram bin in [0,1] range.
00708  */
00709 static double
00710 get_len_position(double value, double hist1, double hist2)
00711 {
00712     if (!is_infinite(hist1) && !is_infinite(hist2))
00713     {
00714         /*
00715          * Both bounds are finite. The value should be finite too, because it
00716          * lies somewhere between the bounds. If it doesn't, just return
00717          * something.
00718          */
00719         if (is_infinite(value))
00720             return 0.5;
00721 
00722         return 1.0 - (hist2 - value) / (hist2 - hist1);
00723     }
00724     else if (is_infinite(hist1) && !is_infinite(hist2))
00725     {
00726         /*
00727          * Lower bin boundary is -infinite, upper is finite.
00728          * Return 1.0 to indicate the value is infinitely far from the lower
00729          * bound.
00730          */
00731         return 1.0;
00732     }
00733     else if (is_infinite(hist1) && is_infinite(hist2))
00734     {
00735         /* same as above, but in reverse */
00736         return 0.0;
00737     }
00738     else
00739     {
00740         /*
00741          * If both bin boundaries are infinite, they should be equal to each
00742          * other, and the value should also be infinite and equal to both
00743          * bounds. (But don't Assert that, to avoid crashing unnecessarily
00744          * if the caller messes up)
00745          *
00746          * Assume the value to lie in the middle of the infinite bounds.
00747          */
00748         return 0.5;
00749     }
00750 }
00751 
00752 /*
00753  * Measure distance between two range bounds.
00754  */
00755 static float8
00756 get_distance(TypeCacheEntry *typcache, RangeBound *bound1, RangeBound *bound2)
00757 {
00758     bool    has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
00759 
00760     if (!bound1->infinite && !bound2->infinite)
00761     {
00762         /*
00763          * No bounds are infinite, use subdiff function or return default
00764          * value of 1.0 if no subdiff is available.
00765          */
00766         if (has_subdiff)
00767             return
00768                 DatumGetFloat8(FunctionCall2Coll(&typcache->rng_subdiff_finfo,
00769                                                  typcache->rng_collation,
00770                                                  bound2->val,
00771                                                  bound1->val));
00772         else
00773             return 1.0;
00774     }
00775     else if (bound1->infinite && bound2->infinite)
00776     {
00777         /* Both bounds are infinite */
00778         if (bound1->lower == bound2->lower)
00779             return 0.0;
00780         else
00781             return get_float8_infinity();
00782     }
00783     else
00784     {
00785         /* One bound is infinite, another is not */
00786         return get_float8_infinity();
00787     }
00788 }
00789 
00790 /*
00791  * Calculate the average of function P(x), in the interval [length1, length2],
00792  * where P(x) is the fraction of tuples with length < x (or length <= x if
00793  * 'equal' is true).
00794  */
00795 static double
00796 calc_length_hist_frac(Datum *length_hist_values, int length_hist_nvalues,
00797                       double length1, double length2, bool equal)
00798 {
00799     double      frac;
00800     double      A, B, PA, PB;
00801     double      pos;
00802     int         i;
00803     double      area;
00804 
00805     Assert(length2 >= length1);
00806 
00807     if (length2 < 0.0)
00808         return 0.0; /* shouldn't happen, but doesn't hurt to check */
00809 
00810     /* All lengths in the table are <= infinite. */
00811     if (is_infinite(length2) && equal)
00812         return 1.0;
00813 
00814     /*----------
00815      * The average of a function between A and B can be calculated by the
00816      * formula:
00817      *
00818      *          B
00819      *    1     /
00820      * -------  | P(x)dx
00821      *  B - A   /
00822      *          A
00823      *
00824      * The geometrical interpretation of the integral is the area under the
00825      * graph of P(x). P(x) is defined by the length histogram. We calculate
00826      * the area in a piecewise fashion, iterating through the length histogram
00827      * bins. Each bin is a trapezoid:
00828      *
00829      *       P(x2)
00830      *        /|
00831      *       / |
00832      * P(x1)/  |
00833      *     |   |
00834      *     |   |
00835      *  ---+---+--
00836      *     x1  x2
00837      *
00838      * where x1 and x2 are the boundaries of the current histogram, and P(x1)
00839      * and P(x1) are the cumulative fraction of tuples at the boundaries.
00840      *
00841      * The area of each trapezoid is 1/2 * (P(x2) + P(x1)) * (x2 - x1)
00842      *
00843      * The first bin contains the lower bound passed by the caller, so we
00844      * use linear interpolation between the previous and next histogram bin
00845      * boundary to calculate P(x1). Likewise for the last bin: we use linear
00846      * interpolation to calculate P(x2). For the bins in between, x1 and x2
00847      * lie on histogram bin boundaries, so P(x1) and P(x2) are simply:
00848      * P(x1) =    (bin index) / (number of bins)
00849      * P(x2) = (bin index + 1 / (number of bins)
00850      */
00851 
00852     /* First bin, the one that contains lower bound */
00853     i = length_hist_bsearch(length_hist_values, length_hist_nvalues, length1, equal);
00854     if (i >= length_hist_nvalues - 1)
00855         return 1.0;
00856 
00857     if (i < 0)
00858     {
00859         i = 0;
00860         pos = 0.0;
00861     }
00862     else
00863     {
00864         /* interpolate length1's position in the bin */
00865         pos = get_len_position(length1,
00866                                DatumGetFloat8(length_hist_values[i]),
00867                                DatumGetFloat8(length_hist_values[i + 1]));
00868     }
00869     PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
00870     B = length1;
00871 
00872     /*
00873      * In the degenerate case that length1 == length2, simply return P(length1).
00874      * This is not merely an optimization: if length1 == length2, we'd divide
00875      * by zero later on.
00876      */
00877     if (length2 == length1)
00878         return PB;
00879 
00880     /*
00881      * Loop through all the bins, until we hit the last bin, the one that
00882      * contains the upper bound. (if lower and upper bounds are in the same
00883      * bin, this falls out immediately)
00884      */
00885     area = 0.0;
00886     for (; i < length_hist_nvalues - 1; i++)
00887     {
00888         double bin_upper = DatumGetFloat8(length_hist_values[i + 1]);
00889 
00890         /* check if we've reached the last bin */
00891         if (!(bin_upper < length2 || (equal && bin_upper <= length2)))
00892             break;
00893 
00894         /* the upper bound of previous bin is the lower bound of this bin */
00895         A = B; PA = PB;
00896 
00897         B = bin_upper;
00898         PB = (double) i / (double) (length_hist_nvalues - 1);
00899 
00900         /*
00901          * Add the area of this trapezoid to the total. The point of the
00902          * if-check is to avoid NaN, in the corner case that PA == PB == 0, and
00903          * B - A == Inf. The area of a zero-height trapezoid (PA == PB == 0) is
00904          * zero, regardless of the width (B - A).
00905          */
00906         if (PA > 0 || PB > 0)
00907             area += 0.5 * (PB + PA) * (B - A);
00908     }
00909 
00910     /* Last bin */
00911     A = B; PA = PB;
00912 
00913     B = length2; /* last bin ends at the query upper bound */
00914     if (i >= length_hist_nvalues - 1)
00915         pos = 0.0;
00916     else
00917     {
00918         if (DatumGetFloat8(length_hist_values[i]) == DatumGetFloat8(length_hist_values[i + 1]))
00919             pos = 0.0;
00920         else
00921             pos = get_len_position(length2, DatumGetFloat8(length_hist_values[i]), DatumGetFloat8(length_hist_values[i + 1]));
00922     }
00923     PB = (((double) i) + pos) / (double) (length_hist_nvalues - 1);
00924 
00925     if (PA > 0 || PB > 0)
00926         area += 0.5 * (PB + PA) * (B - A);
00927 
00928     /*
00929      * Ok, we have calculated the area, ie. the integral. Divide by width to
00930      * get the requested average.
00931      *
00932      * Avoid NaN arising from infinite / infinite. This happens at least if
00933      * length2 is infinite. It's not clear what the correct value would be in
00934      * that case, so 0.5 seems as good as any value.
00935      */
00936     if (is_infinite(area) && is_infinite(length2))
00937         frac = 0.5;
00938     else
00939         frac = area / (length2 - length1);
00940 
00941     return frac;
00942 }
00943 
00944 /*
00945  * Calculate selectivity of "var <@ const" operator, ie. estimate the fraction
00946  * of ranges that fall within the constant lower and upper bounds. This uses
00947  * the histograms of range lower bounds and range lengths, on the assumption
00948  * that the range lengths are independent of the lower bounds.
00949  *
00950  * The caller has already checked that constant lower and upper bounds are
00951  * finite.
00952  */
00953 static double
00954 calc_hist_selectivity_contained(TypeCacheEntry *typcache,
00955                                 RangeBound *lower, RangeBound *upper,
00956                                 RangeBound *hist_lower, int hist_nvalues,
00957                                 Datum *length_hist_values, int length_hist_nvalues)
00958 {
00959     int         i,
00960                 upper_index;
00961     float8      prev_dist;
00962     double      bin_width;
00963     double      upper_bin_width;
00964     double      sum_frac;
00965 
00966     /*
00967      * Begin by finding the bin containing the upper bound, in the lower bound
00968      * histogram. Any range with a lower bound > constant upper bound can't
00969      * match, ie. there are no matches in bins greater than upper_index.
00970      */
00971     upper->inclusive = !upper->inclusive;
00972     upper->lower = true;
00973     upper_index = rbound_bsearch(typcache, upper, hist_lower, hist_nvalues,
00974                                  false);
00975 
00976     /*
00977      * Calculate upper_bin_width, ie. the fraction of the (upper_index,
00978      * upper_index + 1) bin which is greater than upper bound of query range
00979      * using linear interpolation of subdiff function.
00980      */
00981     if (upper_index >= 0 && upper_index < hist_nvalues - 1)
00982         upper_bin_width = get_position(typcache, upper,
00983                                        &hist_lower[upper_index],
00984                                        &hist_lower[upper_index + 1]);
00985     else
00986         upper_bin_width = 0.0;
00987 
00988     /*
00989      * In the loop, dist and prev_dist are the distance of the "current" bin's
00990      * lower and upper bounds from the constant upper bound.
00991      *
00992      * bin_width represents the width of the current bin. Normally it is 1.0,
00993      * meaning a full width bin, but can be less in the corner cases: start
00994      * and end of the loop. We start with bin_width = upper_bin_width, because
00995      * we begin at the bin containing the upper bound.
00996      */
00997     prev_dist = 0.0;
00998     bin_width = upper_bin_width;
00999 
01000     sum_frac = 0.0;
01001     for (i = upper_index; i >= 0; i--)
01002     {
01003         double      dist;
01004         double      length_hist_frac;
01005         bool        final_bin = false;
01006 
01007         /*
01008          * dist -- distance from upper bound of query range to lower bound of
01009          * the current bin in the lower bound histogram. Or to the lower bound
01010          * of the constant range, if this is the final bin, containing the
01011          * constant lower bound.
01012          */
01013         if (range_cmp_bounds(typcache, &hist_lower[i], lower) < 0)
01014         {
01015             dist = get_distance(typcache, lower, upper);
01016             /*
01017              * Subtract from bin_width the portion of this bin that we want
01018              * to ignore.
01019              */
01020             bin_width -= get_position(typcache, lower, &hist_lower[i],
01021                                       &hist_lower[i + 1]);
01022             if (bin_width < 0.0)
01023                 bin_width = 0.0;
01024             final_bin = true;
01025         }
01026         else
01027             dist = get_distance(typcache, &hist_lower[i], upper);
01028 
01029         /*
01030          * Estimate the fraction of tuples in this bin that are narrow enough
01031          * to not exceed the distance to the upper bound of the query range.
01032          */
01033         length_hist_frac = calc_length_hist_frac(length_hist_values,
01034                                                  length_hist_nvalues,
01035                                                  prev_dist, dist, true);
01036 
01037         /*
01038          * Add the fraction of tuples in this bin, with a suitable length,
01039          * to the total.
01040          */
01041         sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
01042 
01043         if (final_bin)
01044             break;
01045 
01046         bin_width = 1.0;
01047         prev_dist = dist;
01048     }
01049 
01050     return sum_frac;
01051 }
01052 
01053 /*
01054  * Calculate selectivity of "var @> const" operator, ie. estimate the fraction
01055  * of ranges that contain the constant lower and upper bounds. This uses
01056  * the histograms of range lower bounds and range lengths, on the assumption
01057  * that the range lengths are independent of the lower bounds.
01058  *
01059  * Note, this is "var @> const", ie. estimate the fraction of ranges that
01060  * contain the constant lower and upper bounds.
01061  */
01062 static double
01063 calc_hist_selectivity_contains(TypeCacheEntry *typcache,
01064                                RangeBound *lower, RangeBound *upper,
01065                                RangeBound *hist_lower, int hist_nvalues,
01066                                Datum *length_hist_values, int length_hist_nvalues)
01067 {
01068     int         i,
01069                 lower_index;
01070     double      bin_width,
01071                 lower_bin_width;
01072     double      sum_frac;
01073     float8      prev_dist;
01074 
01075     /* Find the bin containing the lower bound of query range. */
01076     lower_index = rbound_bsearch(typcache, lower, hist_lower, hist_nvalues,
01077                                  true);
01078 
01079     /*
01080      * Calculate lower_bin_width, ie. the fraction of the of (lower_index,
01081      * lower_index + 1) bin which is greater than lower bound of query range
01082      * using linear interpolation of subdiff function.
01083      */
01084     if (lower_index >= 0 && lower_index < hist_nvalues - 1)
01085         lower_bin_width = get_position(typcache, lower, &hist_lower[lower_index],
01086                                   &hist_lower[lower_index + 1]);
01087     else
01088         lower_bin_width = 0.0;
01089 
01090     /*
01091      * Loop through all the lower bound bins, smaller than the query lower
01092      * bound. In the loop, dist and prev_dist are the distance of the "current"
01093      * bin's lower and upper bounds from the constant upper bound. We begin
01094      * from query lower bound, and walk backwards, so the first bin's upper
01095      * bound is the query lower bound, and its distance to the query upper
01096      * bound is the length of the query range.
01097      *
01098      * bin_width represents the width of the current bin. Normally it is 1.0,
01099      * meaning a full width bin, except for the first bin, which is only
01100      * counted up to the constant lower bound.
01101      */
01102     prev_dist = get_distance(typcache, lower, upper);
01103     sum_frac = 0.0;
01104     bin_width = lower_bin_width;
01105     for (i = lower_index; i >= 0; i--)
01106     {
01107         float8      dist;
01108         double      length_hist_frac;
01109 
01110         /*
01111          * dist -- distance from upper bound of query range to current
01112          * value of lower bound histogram or lower bound of query range (if
01113          * we've reach it).
01114          */
01115         dist = get_distance(typcache, &hist_lower[i], upper);
01116 
01117         /*
01118          * Get average fraction of length histogram which covers intervals
01119          * longer than (or equal to) distance to upper bound of query range.
01120          */
01121         length_hist_frac =
01122             1.0 - calc_length_hist_frac(length_hist_values,
01123                                         length_hist_nvalues,
01124                                         prev_dist, dist, false);
01125 
01126         sum_frac += length_hist_frac * bin_width / (double) (hist_nvalues - 1);
01127 
01128         bin_width = 1.0;
01129         prev_dist = dist;
01130     }
01131 
01132     return sum_frac;
01133 }