Header And Logo

PostgreSQL
| The world's most advanced open source database.

rangetypes_typanalyze.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * ragetypes_typanalyze.c
00004  *    Functions for gathering statistics from range columns
00005  *
00006  * For a range type column, histograms of lower and upper bounds, and
00007  * the fraction of NULL and empty ranges are collected.
00008  *
00009  * Both histograms have the same length, and they are combined into a
00010  * single array of ranges. This has the same shape as the histogram that
00011  * std_typanalyze would collect, but the values are different. Each range
00012  * in the array is a valid range, even though the lower and upper bounds
00013  * come from different tuples. In theory, the standard scalar selectivity
00014  * functions could be used with the combined histogram.
00015  *
00016  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00017  * Portions Copyright (c) 1994, Regents of the University of California
00018  *
00019  *
00020  * IDENTIFICATION
00021  *    src/backend/utils/adt/rangetypes_typanalyze.c
00022  *
00023  *-------------------------------------------------------------------------
00024  */
00025 #include "postgres.h"
00026 
00027 #include "catalog/pg_operator.h"
00028 #include "commands/vacuum.h"
00029 #include "utils/builtins.h"
00030 #include "utils/rangetypes.h"
00031 
00032 static int float8_qsort_cmp(const void *a1, const void *a2);
00033 static int range_bound_qsort_cmp(const void *a1, const void *a2, void *arg);
00034 static void compute_range_stats(VacAttrStats *stats,
00035            AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
00036 
00037 /*
00038  * range_typanalyze -- typanalyze function for range columns
00039  */
00040 Datum
00041 range_typanalyze(PG_FUNCTION_ARGS)
00042 {
00043     VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
00044     TypeCacheEntry *typcache;
00045     Form_pg_attribute attr = stats->attr;
00046 
00047     /* Get information about range type */
00048     typcache = range_get_typcache(fcinfo, stats->attrtypid);
00049 
00050     if (attr->attstattarget < 0)
00051         attr->attstattarget = default_statistics_target;
00052 
00053     stats->compute_stats = compute_range_stats;
00054     stats->extra_data = typcache;
00055     /* same as in std_typanalyze */
00056     stats->minrows = 300 * attr->attstattarget;
00057 
00058     PG_RETURN_BOOL(true);
00059 }
00060 
00061 /*
00062  * Comparison function for sorting float8s, used for range lengths.
00063  */
00064 static int
00065 float8_qsort_cmp(const void *a1, const void *a2)
00066 {
00067     const float8 *f1 = (const float8 *) a1;
00068     const float8 *f2 = (const float8 *) a2;
00069 
00070     if (*f1 < *f2)
00071         return -1;
00072     else if (*f1 == *f2)
00073         return 0;
00074     else
00075         return 1;
00076 }
00077 
00078 /*
00079  * Comparison function for sorting RangeBounds.
00080  */
00081 static int
00082 range_bound_qsort_cmp(const void *a1, const void *a2, void *arg)
00083 {
00084     RangeBound *b1 = (RangeBound *)a1;
00085     RangeBound *b2 = (RangeBound *)a2;
00086     TypeCacheEntry *typcache = (TypeCacheEntry *)arg;
00087 
00088     return range_cmp_bounds(typcache, b1, b2);
00089 }
00090 
00091 /*
00092  * compute_range_stats() -- compute statistics for a range column
00093  */
00094 static void
00095 compute_range_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
00096                     int samplerows, double totalrows)
00097 {
00098     TypeCacheEntry *typcache = (TypeCacheEntry *) stats->extra_data;
00099     bool        has_subdiff = OidIsValid(typcache->rng_subdiff_finfo.fn_oid);
00100     int         null_cnt = 0;
00101     int         non_null_cnt = 0;
00102     int         non_empty_cnt = 0;
00103     int         empty_cnt = 0;
00104     int         range_no;
00105     int         slot_idx;
00106     int         num_bins = stats->attr->attstattarget;
00107     int         num_hist;
00108     float8     *lengths;
00109     RangeBound *lowers, *uppers;
00110     double      total_width = 0;
00111 
00112     /* Allocate memory to hold range bounds and lengths of the sample ranges. */
00113     lowers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
00114     uppers = (RangeBound *) palloc(sizeof(RangeBound) * samplerows);
00115     lengths = (float8 *) palloc(sizeof(float8) * samplerows);
00116 
00117     /* Loop over the sample ranges. */
00118     for (range_no = 0; range_no < samplerows; range_no++)
00119     {
00120         Datum       value;
00121         bool        isnull,
00122                     empty;
00123         RangeType  *range;
00124         RangeBound  lower,
00125                     upper;
00126         float8      length;
00127 
00128         vacuum_delay_point();
00129 
00130         value = fetchfunc(stats, range_no, &isnull);
00131         if (isnull)
00132         {
00133             /* range is null, just count that */
00134             null_cnt++;
00135             continue;
00136         }
00137 
00138         /*
00139          * XXX: should we ignore wide values, like std_typanalyze does, to
00140          * avoid bloating the statistics table?
00141          */
00142         total_width += VARSIZE_ANY(DatumGetPointer(value));
00143 
00144         /* Get range and deserialize it for further analysis. */
00145         range = DatumGetRangeType(value);
00146         range_deserialize(typcache, range, &lower, &upper, &empty);
00147 
00148         if (!empty)
00149         {
00150             /* Remember bounds and length for further usage in histograms */
00151             lowers[non_empty_cnt] = lower;
00152             uppers[non_empty_cnt] = upper;
00153 
00154             if (lower.infinite || upper.infinite)
00155             {
00156                 /* Length of any kind of an infinite range is infinite */
00157                 length = get_float8_infinity();
00158             }
00159             else if (has_subdiff)
00160             {
00161                 /*
00162                  * For an ordinary range, use subdiff function between upper
00163                  * and lower bound values.
00164                  */
00165                 length = DatumGetFloat8(FunctionCall2Coll(
00166                                             &typcache->rng_subdiff_finfo,
00167                                             typcache->rng_collation,
00168                                             upper.val, lower.val));
00169             }
00170             else
00171             {
00172                 /* Use default value of 1.0 if no subdiff is available. */
00173                 length = 1.0;
00174             }
00175             lengths[non_empty_cnt] = length;
00176 
00177             non_empty_cnt++;
00178         }
00179         else
00180             empty_cnt++;
00181 
00182         non_null_cnt++;
00183     }
00184 
00185     slot_idx = 0;
00186 
00187     /* We can only compute real stats if we found some non-null values. */
00188     if (non_null_cnt > 0)
00189     {
00190         Datum      *bound_hist_values;
00191         Datum      *length_hist_values;
00192         int         pos,
00193                     posfrac,
00194                     delta,
00195                     deltafrac,
00196                     i;
00197         MemoryContext old_cxt;
00198         float4     *emptyfrac;
00199 
00200         stats->stats_valid = true;
00201         /* Do the simple null-frac and width stats */
00202         stats->stanullfrac = (double) null_cnt / (double) samplerows;
00203         stats->stawidth = total_width / (double) non_null_cnt;
00204         stats->stadistinct = -1.0;
00205 
00206         /* Must copy the target values into anl_context */
00207         old_cxt = MemoryContextSwitchTo(stats->anl_context);
00208 
00209         /*
00210          * Generate a bounds histogram slot entry if there are at least two
00211          * values.
00212          */
00213         if (non_empty_cnt >= 2)
00214         {
00215             /* Sort bound values */
00216             qsort_arg(lowers, non_empty_cnt, sizeof(RangeBound),
00217                       range_bound_qsort_cmp, typcache);
00218             qsort_arg(uppers, non_empty_cnt, sizeof(RangeBound),
00219                       range_bound_qsort_cmp, typcache);
00220 
00221             num_hist = non_empty_cnt;
00222             if (num_hist > num_bins)
00223                 num_hist = num_bins + 1;
00224 
00225             bound_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
00226 
00227             /*
00228              * The object of this loop is to construct ranges from first and
00229              * last entries in lowers[] and uppers[] along with evenly-spaced
00230              * values in between. So the i'th value is a range of
00231              * lowers[(i * (nvals - 1)) / (num_hist - 1)] and
00232              * uppers[(i * (nvals - 1)) / (num_hist - 1)]. But computing that
00233              * subscript directly risks integer overflow when the stats target
00234              * is more than a couple thousand.  Instead we add
00235              * (nvals - 1) / (num_hist - 1) to pos at each step, tracking the
00236              * integral and fractional parts of the sum separately.
00237              */
00238             delta = (non_empty_cnt - 1) / (num_hist - 1);
00239             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
00240             pos = posfrac = 0;
00241 
00242             for (i = 0; i < num_hist; i++)
00243             {
00244                 bound_hist_values[i] = PointerGetDatum(range_serialize(
00245                                 typcache, &lowers[pos], &uppers[pos], false));
00246                 pos += delta;
00247                 posfrac += deltafrac;
00248                 if (posfrac >= (num_hist - 1))
00249                 {
00250                     /* fractional part exceeds 1, carry to integer part */
00251                     pos++;
00252                     posfrac -= (num_hist - 1);
00253                 }
00254             }
00255 
00256             stats->stakind[slot_idx] = STATISTIC_KIND_BOUNDS_HISTOGRAM;
00257             stats->stavalues[slot_idx] = bound_hist_values;
00258             stats->numvalues[slot_idx] = num_hist;
00259             slot_idx++;
00260         }
00261 
00262         /*
00263          * Generate a length histogram slot entry if there are at least two
00264          * values.
00265          */
00266         if (non_empty_cnt >= 2)
00267         {
00268             /*
00269              * Ascending sort of range lengths for further filling of
00270              * histogram
00271              */
00272             qsort(lengths, non_empty_cnt, sizeof(float8), float8_qsort_cmp);
00273 
00274             num_hist = non_empty_cnt;
00275             if (num_hist > num_bins)
00276                 num_hist = num_bins + 1;
00277 
00278             length_hist_values = (Datum *) palloc(num_hist * sizeof(Datum));
00279 
00280             /*
00281              * The object of this loop is to copy the first and last lengths[]
00282              * entries along with evenly-spaced values in between. So the i'th
00283              * value is lengths[(i * (nvals - 1)) / (num_hist - 1)]. But
00284              * computing that subscript directly risks integer overflow when the
00285              * stats target is more than a couple thousand.  Instead we add
00286              * (nvals - 1) / (num_hist - 1) to pos at each step, tracking the
00287              * integral and fractional parts of the sum separately.
00288              */
00289             delta = (non_empty_cnt - 1) / (num_hist - 1);
00290             deltafrac = (non_empty_cnt - 1) % (num_hist - 1);
00291             pos = posfrac = 0;
00292 
00293             for (i = 0; i < num_hist; i++)
00294             {
00295                 length_hist_values[i] = Float8GetDatum(lengths[pos]);
00296                 pos += delta;
00297                 posfrac += deltafrac;
00298                 if (posfrac >= (num_hist - 1))
00299                 {
00300                     /* fractional part exceeds 1, carry to integer part */
00301                     pos++;
00302                     posfrac -= (num_hist - 1);
00303                 }
00304             }
00305         }
00306         else
00307         {
00308             /*
00309              * Even when we don't create the histogram, store an empty array
00310              * to mean "no histogram". We can't just leave stavalues NULL,
00311              * because get_attstatsslot() errors if you ask for stavalues, and
00312              * it's NULL. We'll still store the empty fraction in stanumbers.
00313              */
00314             length_hist_values = palloc(0);
00315             num_hist = 0;
00316         }
00317         stats->staop[slot_idx] = Float8LessOperator;
00318         stats->stavalues[slot_idx] = length_hist_values;
00319         stats->numvalues[slot_idx] = num_hist;
00320         stats->statypid[slot_idx] = FLOAT8OID;
00321         stats->statyplen[slot_idx] = sizeof(float8);
00322 #ifdef USE_FLOAT8_BYVAL
00323         stats->statypbyval[slot_idx] = true;
00324 #else
00325         stats->statypbyval[slot_idx] = false;
00326 #endif
00327         stats->statypalign[slot_idx] = 'd';
00328 
00329         /* Store the fraction of empty ranges */
00330         emptyfrac = (float4 *) palloc(sizeof(float4));
00331         *emptyfrac = ((double) empty_cnt) / ((double) non_null_cnt);
00332         stats->stanumbers[slot_idx] = emptyfrac;
00333         stats->numnumbers[slot_idx] = 1;
00334 
00335         stats->stakind[slot_idx] = STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM;
00336         slot_idx++;
00337 
00338         MemoryContextSwitchTo(old_cxt);
00339     }
00340     else if (null_cnt > 0)
00341     {
00342         /* We found only nulls; assume the column is entirely null */
00343         stats->stats_valid = true;
00344         stats->stanullfrac = 1.0;
00345         stats->stawidth = 0;        /* "unknown" */
00346         stats->stadistinct = 0.0;   /* "unknown" */
00347     }
00348     /*
00349      * We don't need to bother cleaning up any of our temporary palloc's. The
00350      * hashtable should also go away, as it used a child memory context.
00351      */
00352 }