Header And Logo

PostgreSQL
| The world's most advanced open source database.

array_selfuncs.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * array_selfuncs.c
00004  *    Functions for selectivity estimation of array operators
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  * Portions Copyright (c) 1994, Regents of the University of California
00008  *
00009  *
00010  * IDENTIFICATION
00011  *    src/backend/utils/adt/array_selfuncs.c
00012  *
00013  *-------------------------------------------------------------------------
00014  */
00015 #include "postgres.h"
00016 
00017 #include <math.h>
00018 
00019 #include "access/htup_details.h"
00020 #include "catalog/pg_collation.h"
00021 #include "catalog/pg_operator.h"
00022 #include "catalog/pg_statistic.h"
00023 #include "optimizer/clauses.h"
00024 #include "utils/array.h"
00025 #include "utils/lsyscache.h"
00026 #include "utils/selfuncs.h"
00027 #include "utils/typcache.h"
00028 
00029 
00030 /* Default selectivity constant for "@>" and "<@" operators */
00031 #define DEFAULT_CONTAIN_SEL 0.005
00032 
00033 /* Default selectivity constant for "&&" operator */
00034 #define DEFAULT_OVERLAP_SEL 0.01
00035 
00036 /* Default selectivity for given operator */
00037 #define DEFAULT_SEL(operator) \
00038     ((operator) == OID_ARRAY_OVERLAP_OP ? \
00039         DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
00040 
00041 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
00042                   Oid elemtype, Oid operator);
00043 static Selectivity mcelem_array_selec(ArrayType *array,
00044                    TypeCacheEntry *typentry,
00045                    Datum *mcelem, int nmcelem,
00046                    float4 *numbers, int nnumbers,
00047                    float4 *hist, int nhist,
00048                    Oid operator, FmgrInfo *cmpfunc);
00049 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
00050                                    float4 *numbers, int nnumbers,
00051                                    Datum *array_data, int nitems,
00052                                    Oid operator, FmgrInfo *cmpfunc);
00053 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
00054                              float4 *numbers, int nnumbers,
00055                              Datum *array_data, int nitems,
00056                              float4 *hist, int nhist,
00057                              Oid operator, FmgrInfo *cmpfunc);
00058 static float *calc_hist(const float4 *hist, int nhist, int n);
00059 static float *calc_distr(const float *p, int n, int m, float rest);
00060 static int  floor_log2(uint32 n);
00061 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
00062                  int *index, FmgrInfo *cmpfunc);
00063 static int  element_compare(const void *key1, const void *key2, void *arg);
00064 static int  float_compare_desc(const void *key1, const void *key2);
00065 
00066 
00067 /*
00068  * scalararraysel_containment
00069  *      Estimate selectivity of ScalarArrayOpExpr via array containment.
00070  *
00071  * scalararraysel() has already verified that the operator of a
00072  * ScalarArrayOpExpr is the array element type's default equality or
00073  * inequality operator.  If we have const =/<> ANY/ALL (array_var)
00074  * then we can estimate the selectivity as though this were an array
00075  * containment operator, array_var op ARRAY[const].
00076  *
00077  * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
00078  */
00079 Selectivity
00080 scalararraysel_containment(PlannerInfo *root,
00081                            Node *leftop, Node *rightop,
00082                            Oid elemtype, bool isEquality, bool useOr,
00083                            int varRelid)
00084 {
00085     Selectivity selec;
00086     VariableStatData vardata;
00087     Datum       constval;
00088     TypeCacheEntry *typentry;
00089     FmgrInfo   *cmpfunc;
00090 
00091     /*
00092      * rightop must be a variable, else punt.
00093      */
00094     examine_variable(root, rightop, varRelid, &vardata);
00095     if (!vardata.rel)
00096     {
00097         ReleaseVariableStats(vardata);
00098         return -1.0;
00099     }
00100 
00101     /*
00102      * Aggressively reduce leftop to a constant, if possible.
00103      */
00104     leftop = estimate_expression_value(root, leftop);
00105     if (!IsA(leftop, Const))
00106     {
00107         ReleaseVariableStats(vardata);
00108         return -1.0;
00109     }
00110     if (((Const *) leftop)->constisnull)
00111     {
00112         /* qual can't succeed if null on left */
00113         ReleaseVariableStats(vardata);
00114         return (Selectivity) 0.0;
00115     }
00116     constval = ((Const *) leftop)->constvalue;
00117 
00118     /* Get element type's default comparison function */
00119     typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
00120     if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
00121     {
00122         ReleaseVariableStats(vardata);
00123         return -1.0;
00124     }
00125     cmpfunc = &typentry->cmp_proc_finfo;
00126 
00127     /*
00128      * If the operator is <>, swap ANY/ALL, then invert the result later.
00129      */
00130     if (!isEquality)
00131         useOr = !useOr;
00132 
00133     /* Get array element stats for var, if available */
00134     if (HeapTupleIsValid(vardata.statsTuple))
00135     {
00136         Form_pg_statistic stats;
00137         Datum      *values;
00138         int         nvalues;
00139         float4     *numbers;
00140         int         nnumbers;
00141         float4     *hist;
00142         int         nhist;
00143 
00144         stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
00145 
00146         /* MCELEM will be an array of same type as element */
00147         if (get_attstatsslot(vardata.statsTuple,
00148                              elemtype, vardata.atttypmod,
00149                              STATISTIC_KIND_MCELEM, InvalidOid,
00150                              NULL,
00151                              &values, &nvalues,
00152                              &numbers, &nnumbers))
00153         {
00154             /* For ALL case, also get histogram of distinct-element counts */
00155             if (useOr ||
00156                 !get_attstatsslot(vardata.statsTuple,
00157                                   elemtype, vardata.atttypmod,
00158                                   STATISTIC_KIND_DECHIST, InvalidOid,
00159                                   NULL,
00160                                   NULL, NULL,
00161                                   &hist, &nhist))
00162             {
00163                 hist = NULL;
00164                 nhist = 0;
00165             }
00166 
00167             /*
00168              * For = ANY, estimate as var @> ARRAY[const].
00169              *
00170              * For = ALL, estimate as var <@ ARRAY[const].
00171              */
00172             if (useOr)
00173                 selec = mcelem_array_contain_overlap_selec(values, nvalues,
00174                                                            numbers, nnumbers,
00175                                                            &constval, 1,
00176                                                        OID_ARRAY_CONTAINS_OP,
00177                                                            cmpfunc);
00178             else
00179                 selec = mcelem_array_contained_selec(values, nvalues,
00180                                                      numbers, nnumbers,
00181                                                      &constval, 1,
00182                                                      hist, nhist,
00183                                                      OID_ARRAY_CONTAINED_OP,
00184                                                      cmpfunc);
00185 
00186             if (hist)
00187                 free_attstatsslot(elemtype, NULL, 0, hist, nhist);
00188             free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
00189         }
00190         else
00191         {
00192             /* No most-common-elements info, so do without */
00193             if (useOr)
00194                 selec = mcelem_array_contain_overlap_selec(NULL, 0,
00195                                                            NULL, 0,
00196                                                            &constval, 1,
00197                                                        OID_ARRAY_CONTAINS_OP,
00198                                                            cmpfunc);
00199             else
00200                 selec = mcelem_array_contained_selec(NULL, 0,
00201                                                      NULL, 0,
00202                                                      &constval, 1,
00203                                                      NULL, 0,
00204                                                      OID_ARRAY_CONTAINED_OP,
00205                                                      cmpfunc);
00206         }
00207 
00208         /*
00209          * MCE stats count only non-null rows, so adjust for null rows.
00210          */
00211         selec *= (1.0 - stats->stanullfrac);
00212     }
00213     else
00214     {
00215         /* No stats at all, so do without */
00216         if (useOr)
00217             selec = mcelem_array_contain_overlap_selec(NULL, 0,
00218                                                        NULL, 0,
00219                                                        &constval, 1,
00220                                                        OID_ARRAY_CONTAINS_OP,
00221                                                        cmpfunc);
00222         else
00223             selec = mcelem_array_contained_selec(NULL, 0,
00224                                                  NULL, 0,
00225                                                  &constval, 1,
00226                                                  NULL, 0,
00227                                                  OID_ARRAY_CONTAINED_OP,
00228                                                  cmpfunc);
00229         /* we assume no nulls here, so no stanullfrac correction */
00230     }
00231 
00232     ReleaseVariableStats(vardata);
00233 
00234     /*
00235      * If the operator is <>, invert the results.
00236      */
00237     if (!isEquality)
00238         selec = 1.0 - selec;
00239 
00240     CLAMP_PROBABILITY(selec);
00241 
00242     return selec;
00243 }
00244 
00245 /*
00246  * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
00247  */
00248 Datum
00249 arraycontsel(PG_FUNCTION_ARGS)
00250 {
00251     PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
00252     Oid         operator = PG_GETARG_OID(1);
00253     List       *args = (List *) PG_GETARG_POINTER(2);
00254     int         varRelid = PG_GETARG_INT32(3);
00255     VariableStatData vardata;
00256     Node       *other;
00257     bool        varonleft;
00258     Selectivity selec;
00259     Oid         element_typeid;
00260 
00261     /*
00262      * If expression is not (variable op something) or (something op
00263      * variable), then punt and return a default estimate.
00264      */
00265     if (!get_restriction_variable(root, args, varRelid,
00266                                   &vardata, &other, &varonleft))
00267         PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
00268 
00269     /*
00270      * Can't do anything useful if the something is not a constant, either.
00271      */
00272     if (!IsA(other, Const))
00273     {
00274         ReleaseVariableStats(vardata);
00275         PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
00276     }
00277 
00278     /*
00279      * The "&&", "@>" and "<@" operators are strict, so we can cope with a
00280      * NULL constant right away.
00281      */
00282     if (((Const *) other)->constisnull)
00283     {
00284         ReleaseVariableStats(vardata);
00285         PG_RETURN_FLOAT8(0.0);
00286     }
00287 
00288     /*
00289      * If var is on the right, commute the operator, so that we can assume the
00290      * var is on the left in what follows.
00291      */
00292     if (!varonleft)
00293     {
00294         if (operator == OID_ARRAY_CONTAINS_OP)
00295             operator = OID_ARRAY_CONTAINED_OP;
00296         else if (operator == OID_ARRAY_CONTAINED_OP)
00297             operator = OID_ARRAY_CONTAINS_OP;
00298     }
00299 
00300     /*
00301      * OK, there's a Var and a Const we're dealing with here.  We need the
00302      * Const to be a array with same element type as column, else we can't do
00303      * anything useful.  (Such cases will likely fail at runtime, but here
00304      * we'd rather just return a default estimate.)
00305      */
00306     element_typeid = get_base_element_type(((Const *) other)->consttype);
00307     if (element_typeid != InvalidOid &&
00308         element_typeid == get_base_element_type(vardata.vartype))
00309     {
00310         selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
00311                                   element_typeid, operator);
00312     }
00313     else
00314     {
00315         selec = DEFAULT_SEL(operator);
00316     }
00317 
00318     ReleaseVariableStats(vardata);
00319 
00320     CLAMP_PROBABILITY(selec);
00321 
00322     PG_RETURN_FLOAT8((float8) selec);
00323 }
00324 
00325 /*
00326  * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
00327  */
00328 Datum
00329 arraycontjoinsel(PG_FUNCTION_ARGS)
00330 {
00331     /* For the moment this is just a stub */
00332     Oid         operator = PG_GETARG_OID(1);
00333 
00334     PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
00335 }
00336 
00337 /*
00338  * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
00339  * or "arraycolumn <@ const" based on the statistics
00340  *
00341  * This function is mainly responsible for extracting the pg_statistic data
00342  * to be used; we then pass the problem on to mcelem_array_selec().
00343  */
00344 static Selectivity
00345 calc_arraycontsel(VariableStatData *vardata, Datum constval,
00346                   Oid elemtype, Oid operator)
00347 {
00348     Selectivity selec;
00349     TypeCacheEntry *typentry;
00350     FmgrInfo   *cmpfunc;
00351     ArrayType  *array;
00352 
00353     /* Get element type's default comparison function */
00354     typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
00355     if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
00356         return DEFAULT_SEL(operator);
00357     cmpfunc = &typentry->cmp_proc_finfo;
00358 
00359     /*
00360      * The caller made sure the const is a array with same element type, so
00361      * get it now
00362      */
00363     array = DatumGetArrayTypeP(constval);
00364 
00365     if (HeapTupleIsValid(vardata->statsTuple))
00366     {
00367         Form_pg_statistic stats;
00368         Datum      *values;
00369         int         nvalues;
00370         float4     *numbers;
00371         int         nnumbers;
00372         float4     *hist;
00373         int         nhist;
00374 
00375         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
00376 
00377         /* MCELEM will be an array of same type as column */
00378         if (get_attstatsslot(vardata->statsTuple,
00379                              elemtype, vardata->atttypmod,
00380                              STATISTIC_KIND_MCELEM, InvalidOid,
00381                              NULL,
00382                              &values, &nvalues,
00383                              &numbers, &nnumbers))
00384         {
00385             /*
00386              * For "array <@ const" case we also need histogram of distinct
00387              * element counts.
00388              */
00389             if (operator != OID_ARRAY_CONTAINED_OP ||
00390                 !get_attstatsslot(vardata->statsTuple,
00391                                   elemtype, vardata->atttypmod,
00392                                   STATISTIC_KIND_DECHIST, InvalidOid,
00393                                   NULL,
00394                                   NULL, NULL,
00395                                   &hist, &nhist))
00396             {
00397                 hist = NULL;
00398                 nhist = 0;
00399             }
00400 
00401             /* Use the most-common-elements slot for the array Var. */
00402             selec = mcelem_array_selec(array, typentry,
00403                                        values, nvalues,
00404                                        numbers, nnumbers,
00405                                        hist, nhist,
00406                                        operator, cmpfunc);
00407 
00408             if (hist)
00409                 free_attstatsslot(elemtype, NULL, 0, hist, nhist);
00410             free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
00411         }
00412         else
00413         {
00414             /* No most-common-elements info, so do without */
00415             selec = mcelem_array_selec(array, typentry,
00416                                        NULL, 0, NULL, 0, NULL, 0,
00417                                        operator, cmpfunc);
00418         }
00419 
00420         /*
00421          * MCE stats count only non-null rows, so adjust for null rows.
00422          */
00423         selec *= (1.0 - stats->stanullfrac);
00424     }
00425     else
00426     {
00427         /* No stats at all, so do without */
00428         selec = mcelem_array_selec(array, typentry,
00429                                    NULL, 0, NULL, 0, NULL, 0,
00430                                    operator, cmpfunc);
00431         /* we assume no nulls here, so no stanullfrac correction */
00432     }
00433 
00434     /* If constant was toasted, release the copy we made */
00435     if (PointerGetDatum(array) != constval)
00436         pfree(array);
00437 
00438     return selec;
00439 }
00440 
00441 /*
00442  * Array selectivity estimation based on most common elements statistics
00443  *
00444  * This function just deconstructs and sorts the array constant's contents,
00445  * and then passes the problem on to mcelem_array_contain_overlap_selec or
00446  * mcelem_array_contained_selec depending on the operator.
00447  */
00448 static Selectivity
00449 mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry,
00450                    Datum *mcelem, int nmcelem,
00451                    float4 *numbers, int nnumbers,
00452                    float4 *hist, int nhist,
00453                    Oid operator, FmgrInfo *cmpfunc)
00454 {
00455     Selectivity selec;
00456     int         num_elems;
00457     Datum      *elem_values;
00458     bool       *elem_nulls;
00459     bool        null_present;
00460     int         nonnull_nitems;
00461     int         i;
00462 
00463     /*
00464      * Prepare constant array data for sorting.  Sorting lets us find unique
00465      * elements and efficiently merge with the MCELEM array.
00466      */
00467     deconstruct_array(array,
00468                       typentry->type_id,
00469                       typentry->typlen,
00470                       typentry->typbyval,
00471                       typentry->typalign,
00472                       &elem_values, &elem_nulls, &num_elems);
00473 
00474     /* Collapse out any null elements */
00475     nonnull_nitems = 0;
00476     null_present = false;
00477     for (i = 0; i < num_elems; i++)
00478     {
00479         if (elem_nulls[i])
00480             null_present = true;
00481         else
00482             elem_values[nonnull_nitems++] = elem_values[i];
00483     }
00484 
00485     /*
00486      * Query "column @> '{anything, null}'" matches nothing.  For the other
00487      * two operators, presence of a null in the constant can be ignored.
00488      */
00489     if (null_present && operator == OID_ARRAY_CONTAINS_OP)
00490     {
00491         pfree(elem_values);
00492         pfree(elem_nulls);
00493         return (Selectivity) 0.0;
00494     }
00495 
00496     /* Sort extracted elements using their default comparison function. */
00497     qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
00498               element_compare, cmpfunc);
00499 
00500     /* Separate cases according to operator */
00501     if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
00502         selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
00503                                                    numbers, nnumbers,
00504                                                  elem_values, nonnull_nitems,
00505                                                    operator, cmpfunc);
00506     else if (operator == OID_ARRAY_CONTAINED_OP)
00507         selec = mcelem_array_contained_selec(mcelem, nmcelem,
00508                                              numbers, nnumbers,
00509                                              elem_values, nonnull_nitems,
00510                                              hist, nhist,
00511                                              operator, cmpfunc);
00512     else
00513     {
00514         elog(ERROR, "arraycontsel called for unrecognized operator %u",
00515              operator);
00516         selec = 0.0;            /* keep compiler quiet */
00517     }
00518 
00519     pfree(elem_values);
00520     pfree(elem_nulls);
00521     return selec;
00522 }
00523 
00524 /*
00525  * Estimate selectivity of "column @> const" and "column && const" based on
00526  * most common element statistics.  This estimation assumes element
00527  * occurrences are independent.
00528  *
00529  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
00530  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
00531  * not available.  array_data (of length nitems) is the constant's elements.
00532  *
00533  * Both the mcelem and array_data arrays are assumed presorted according
00534  * to the element type's cmpfunc.  Null elements are not present.
00535  *
00536  * TODO: this estimate probably could be improved by using the distinct
00537  * elements count histogram.  For example, excepting the special case of
00538  * "column @> '{}'", we can multiply the calculated selectivity by the
00539  * fraction of nonempty arrays in the column.
00540  */
00541 static Selectivity
00542 mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
00543                                    float4 *numbers, int nnumbers,
00544                                    Datum *array_data, int nitems,
00545                                    Oid operator, FmgrInfo *cmpfunc)
00546 {
00547     Selectivity selec,
00548                 elem_selec;
00549     int         mcelem_index,
00550                 i;
00551     bool        use_bsearch;
00552     float4      minfreq;
00553 
00554     /*
00555      * There should be three more Numbers than Values, because the last three
00556      * cells should hold minimal and maximal frequency among the non-null
00557      * elements, and then the frequency of null elements.  Ignore the Numbers
00558      * if not right.
00559      */
00560     if (nnumbers != nmcelem + 3)
00561     {
00562         numbers = NULL;
00563         nnumbers = 0;
00564     }
00565 
00566     if (numbers)
00567     {
00568         /* Grab the lowest observed frequency */
00569         minfreq = numbers[nmcelem];
00570     }
00571     else
00572     {
00573         /* Without statistics make some default assumptions */
00574         minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
00575     }
00576 
00577     /* Decide whether it is faster to use binary search or not. */
00578     if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
00579         use_bsearch = true;
00580     else
00581         use_bsearch = false;
00582 
00583     if (operator == OID_ARRAY_CONTAINS_OP)
00584     {
00585         /*
00586          * Initial selectivity for "column @> const" query is 1.0, and it will
00587          * be decreased with each element of constant array.
00588          */
00589         selec = 1.0;
00590     }
00591     else
00592     {
00593         /*
00594          * Initial selectivity for "column && const" query is 0.0, and it will
00595          * be increased with each element of constant array.
00596          */
00597         selec = 0.0;
00598     }
00599 
00600     /* Scan mcelem and array in parallel. */
00601     mcelem_index = 0;
00602     for (i = 0; i < nitems; i++)
00603     {
00604         bool        match = false;
00605 
00606         /* Ignore any duplicates in the array data. */
00607         if (i > 0 &&
00608             element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
00609             continue;
00610 
00611         /* Find the smallest MCELEM >= this array item. */
00612         if (use_bsearch)
00613         {
00614             match = find_next_mcelem(mcelem, nmcelem, array_data[i],
00615                                      &mcelem_index, cmpfunc);
00616         }
00617         else
00618         {
00619             while (mcelem_index < nmcelem)
00620             {
00621                 int         cmp = element_compare(&mcelem[mcelem_index],
00622                                                   &array_data[i],
00623                                                   cmpfunc);
00624 
00625                 if (cmp < 0)
00626                     mcelem_index++;
00627                 else
00628                 {
00629                     if (cmp == 0)
00630                         match = true;   /* mcelem is found */
00631                     break;
00632                 }
00633             }
00634         }
00635 
00636         if (match && numbers)
00637         {
00638             /* MCELEM matches the array item; use its frequency. */
00639             elem_selec = numbers[mcelem_index];
00640             mcelem_index++;
00641         }
00642         else
00643         {
00644             /*
00645              * The element is not in MCELEM.  Punt, but assume that the
00646              * selectivity cannot be more than minfreq / 2.
00647              */
00648             elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
00649         }
00650 
00651         /*
00652          * Update overall selectivity using the current element's selectivity
00653          * and an assumption of element occurrence independence.
00654          */
00655         if (operator == OID_ARRAY_CONTAINS_OP)
00656             selec *= elem_selec;
00657         else
00658             selec = selec + elem_selec - selec * elem_selec;
00659 
00660         /* Clamp intermediate results to stay sane despite roundoff error */
00661         CLAMP_PROBABILITY(selec);
00662     }
00663 
00664     return selec;
00665 }
00666 
00667 /*
00668  * Estimate selectivity of "column <@ const" based on most common element
00669  * statistics.
00670  *
00671  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
00672  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
00673  * not available.  array_data (of length nitems) is the constant's elements.
00674  * hist (of length nhist) is from the array column's DECHIST statistics slot,
00675  * or is NULL/0 if those stats are not available.
00676  *
00677  * Both the mcelem and array_data arrays are assumed presorted according
00678  * to the element type's cmpfunc.  Null elements are not present.
00679  *
00680  * Independent element occurrence would imply a particular distribution of
00681  * distinct element counts among matching rows.  Real data usually falsifies
00682  * that assumption.  For example, in a set of 11-element integer arrays having
00683  * elements in the range [0..10], element occurrences are typically not
00684  * independent.  If they were, a sufficiently-large set would include all
00685  * distinct element counts 0 through 11.  We correct for this using the
00686  * histogram of distinct element counts.
00687  *
00688  * In the "column @> const" and "column && const" cases, we usually have a
00689  * "const" with low number of elements (otherwise we have selectivity close
00690  * to 0 or 1 respectively).  That's why the effect of dependence related
00691  * to distinct element count distribution is negligible there.  In the
00692  * "column <@ const" case, number of elements is usually high (otherwise we
00693  * have selectivity close to 0).  That's why we should do a correction with
00694  * the array distinct element count distribution here.
00695  *
00696  * Using the histogram of distinct element counts produces a different
00697  * distribution law than independent occurrences of elements.  This
00698  * distribution law can be described as follows:
00699  *
00700  * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
00701  *    (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
00702  *
00703  * where:
00704  * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
00705  *      (1 - occurrence, 0 - no occurrence) in row
00706  * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
00707  *      (scalar values in [0..1]) according to collected statistics
00708  * m = o1 + o2 + ... + on = total number of distinct elements in row
00709  * hist[m] - histogram data for occurrence of m elements.
00710  * ind[m] - probability of m occurrences from n events assuming their
00711  *    probabilities to be equal to frequencies of array elements.
00712  *
00713  * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
00714  * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
00715  */
00716 static Selectivity
00717 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
00718                              float4 *numbers, int nnumbers,
00719                              Datum *array_data, int nitems,
00720                              float4 *hist, int nhist,
00721                              Oid operator, FmgrInfo *cmpfunc)
00722 {
00723     int         mcelem_index,
00724                 i,
00725                 unique_nitems = 0;
00726     float       selec,
00727                 minfreq,
00728                 nullelem_freq;
00729     float      *dist,
00730                *mcelem_dist,
00731                *hist_part;
00732     float       avg_count,
00733                 mult,
00734                 rest;
00735     float      *elem_selec;
00736 
00737     /*
00738      * There should be three more Numbers than Values in the MCELEM slot,
00739      * because the last three cells should hold minimal and maximal frequency
00740      * among the non-null elements, and then the frequency of null elements.
00741      * Punt if not right, because we can't do much without the element freqs.
00742      */
00743     if (numbers == NULL || nnumbers != nmcelem + 3)
00744         return DEFAULT_CONTAIN_SEL;
00745 
00746     /* Can't do much without a count histogram, either */
00747     if (hist == NULL || nhist < 3)
00748         return DEFAULT_CONTAIN_SEL;
00749 
00750     /*
00751      * Grab some of the summary statistics that compute_array_stats() stores:
00752      * lowest frequency, frequency of null elements, and average distinct
00753      * element count.
00754      */
00755     minfreq = numbers[nmcelem];
00756     nullelem_freq = numbers[nmcelem + 2];
00757     avg_count = hist[nhist - 1];
00758 
00759     /*
00760      * "rest" will be the sum of the frequencies of all elements not
00761      * represented in MCELEM.  The average distinct element count is the sum
00762      * of the frequencies of *all* elements.  Begin with that; we will proceed
00763      * to subtract the MCELEM frequencies.
00764      */
00765     rest = avg_count;
00766 
00767     /*
00768      * mult is a multiplier representing estimate of probability that each
00769      * mcelem that is not present in constant doesn't occur.
00770      */
00771     mult = 1.0f;
00772 
00773     /*
00774      * elem_selec is array of estimated frequencies for elements in the
00775      * constant.
00776      */
00777     elem_selec = (float *) palloc(sizeof(float) * nitems);
00778 
00779     /* Scan mcelem and array in parallel. */
00780     mcelem_index = 0;
00781     for (i = 0; i < nitems; i++)
00782     {
00783         bool        match = false;
00784 
00785         /* Ignore any duplicates in the array data. */
00786         if (i > 0 &&
00787             element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
00788             continue;
00789 
00790         /*
00791          * Iterate over MCELEM until we find an entry greater than or equal to
00792          * this element of the constant.  Update "rest" and "mult" for mcelem
00793          * entries skipped over.
00794          */
00795         while (mcelem_index < nmcelem)
00796         {
00797             int         cmp = element_compare(&mcelem[mcelem_index],
00798                                               &array_data[i],
00799                                               cmpfunc);
00800 
00801             if (cmp < 0)
00802             {
00803                 mult *= (1.0f - numbers[mcelem_index]);
00804                 rest -= numbers[mcelem_index];
00805                 mcelem_index++;
00806             }
00807             else
00808             {
00809                 if (cmp == 0)
00810                     match = true;       /* mcelem is found */
00811                 break;
00812             }
00813         }
00814 
00815         if (match)
00816         {
00817             /* MCELEM matches the array item. */
00818             elem_selec[unique_nitems] = numbers[mcelem_index];
00819             /* "rest" is decremented for all mcelems, matched or not */
00820             rest -= numbers[mcelem_index];
00821             mcelem_index++;
00822         }
00823         else
00824         {
00825             /*
00826              * The element is not in MCELEM.  Punt, but assume that the
00827              * selectivity cannot be more than minfreq / 2.
00828              */
00829             elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
00830                                             minfreq / 2);
00831         }
00832 
00833         unique_nitems++;
00834     }
00835 
00836     /*
00837      * If we handled all constant elements without exhausting the MCELEM
00838      * array, finish walking it to complete calculation of "rest" and "mult".
00839      */
00840     while (mcelem_index < nmcelem)
00841     {
00842         mult *= (1.0f - numbers[mcelem_index]);
00843         rest -= numbers[mcelem_index];
00844         mcelem_index++;
00845     }
00846 
00847     /*
00848      * The presence of many distinct rare elements materially decreases
00849      * selectivity.  Use the Poisson distribution to estimate the probability
00850      * of a column value having zero occurrences of such elements.  See above
00851      * for the definition of "rest".
00852      */
00853     mult *= exp(-rest);
00854 
00855     /*----------
00856      * Using the distinct element count histogram requires
00857      *      O(unique_nitems * (nmcelem + unique_nitems))
00858      * operations.  Beyond a certain computational cost threshold, it's
00859      * reasonable to sacrifice accuracy for decreased planning time.  We limit
00860      * the number of operations to EFFORT * nmcelem; since nmcelem is limited
00861      * by the column's statistics target, the work done is user-controllable.
00862      *
00863      * If the number of operations would be too large, we can reduce it
00864      * without losing all accuracy by reducing unique_nitems and considering
00865      * only the most-common elements of the constant array.  To make the
00866      * results exactly match what we would have gotten with only those
00867      * elements to start with, we'd have to remove any discarded elements'
00868      * frequencies from "mult", but since this is only an approximation
00869      * anyway, we don't bother with that.  Therefore it's sufficient to qsort
00870      * elem_selec[] and take the largest elements.  (They will no longer match
00871      * up with the elements of array_data[], but we don't care.)
00872      *----------
00873      */
00874 #define EFFORT 100
00875 
00876     if ((nmcelem + unique_nitems) > 0 &&
00877         unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
00878     {
00879         /*
00880          * Use the quadratic formula to solve for largest allowable N.  We
00881          * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
00882          */
00883         double      b = (double) nmcelem;
00884         int         n;
00885 
00886         n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
00887 
00888         /* Sort, then take just the first n elements */
00889         qsort(elem_selec, unique_nitems, sizeof(float),
00890               float_compare_desc);
00891         unique_nitems = n;
00892     }
00893 
00894     /*
00895      * Calculate probabilities of each distinct element count for both mcelems
00896      * and constant elements.  At this point, assume independent element
00897      * occurrence.
00898      */
00899     dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
00900     mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
00901 
00902     /* ignore hist[nhist-1], which is the average not a histogram member */
00903     hist_part = calc_hist(hist, nhist - 1, unique_nitems);
00904 
00905     selec = 0.0f;
00906     for (i = 0; i <= unique_nitems; i++)
00907     {
00908         /*
00909          * mult * dist[i] / mcelem_dist[i] gives us probability of qual
00910          * matching from assumption of independent element occurrence with the
00911          * condition that distinct element count = i.
00912          */
00913         if (mcelem_dist[i] > 0)
00914             selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
00915     }
00916 
00917     pfree(dist);
00918     pfree(mcelem_dist);
00919     pfree(hist_part);
00920     pfree(elem_selec);
00921 
00922     /* Take into account occurrence of NULL element. */
00923     selec *= (1.0f - nullelem_freq);
00924 
00925     CLAMP_PROBABILITY(selec);
00926 
00927     return selec;
00928 }
00929 
00930 /*
00931  * Calculate the first n distinct element count probabilities from a
00932  * histogram of distinct element counts.
00933  *
00934  * Returns a palloc'd array of n+1 entries, with array[k] being the
00935  * probability of element count k, k in [0..n].
00936  *
00937  * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
00938  * (nhist - 1)) probability to each value in (a,b) and an additional half of
00939  * that to a and b themselves.
00940  */
00941 static float *
00942 calc_hist(const float4 *hist, int nhist, int n)
00943 {
00944     float      *hist_part;
00945     int         k,
00946                 i = 0;
00947     float       prev_interval = 0,
00948                 next_interval;
00949     float       frac;
00950 
00951     hist_part = (float *) palloc((n + 1) * sizeof(float));
00952 
00953     /*
00954      * frac is a probability contribution for each interval between histogram
00955      * values.  We have nhist - 1 intervals, so contribution of each one will
00956      * be 1 / (nhist - 1).
00957      */
00958     frac = 1.0f / ((float) (nhist - 1));
00959 
00960     for (k = 0; k <= n; k++)
00961     {
00962         int         count = 0;
00963 
00964         /*
00965          * Count the histogram boundaries equal to k.  (Although the histogram
00966          * should theoretically contain only exact integers, entries are
00967          * floats so there could be roundoff error in large values.  Treat any
00968          * fractional value as equal to the next larger k.)
00969          */
00970         while (i < nhist && hist[i] <= k)
00971         {
00972             count++;
00973             i++;
00974         }
00975 
00976         if (count > 0)
00977         {
00978             /* k is an exact bound for at least one histogram box. */
00979             float       val;
00980 
00981             /* Find length between current histogram value and the next one */
00982             if (i < nhist)
00983                 next_interval = hist[i] - hist[i - 1];
00984             else
00985                 next_interval = 0;
00986 
00987             /*
00988              * count - 1 histogram boxes contain k exclusively.  They
00989              * contribute a total of (count - 1) * frac probability.  Also
00990              * factor in the partial histogram boxes on either side.
00991              */
00992             val = (float) (count - 1);
00993             if (next_interval > 0)
00994                 val += 0.5f / next_interval;
00995             if (prev_interval > 0)
00996                 val += 0.5f / prev_interval;
00997             hist_part[k] = frac * val;
00998 
00999             prev_interval = next_interval;
01000         }
01001         else
01002         {
01003             /* k does not appear as an exact histogram bound. */
01004             if (prev_interval > 0)
01005                 hist_part[k] = frac / prev_interval;
01006             else
01007                 hist_part[k] = 0.0f;
01008         }
01009     }
01010 
01011     return hist_part;
01012 }
01013 
01014 /*
01015  * Consider n independent events with probabilities p[].  This function
01016  * calculates probabilities of exact k of events occurrence for k in [0..m].
01017  * Returns a palloc'd array of size m+1.
01018  *
01019  * "rest" is the sum of the probabilities of all low-probability events not
01020  * included in p.
01021  *
01022  * Imagine matrix M of size (n + 1) x (m + 1).  Element M[i,j] denotes the
01023  * probability that exactly j of first i events occur.  Obviously M[0,0] = 1.
01024  * For any constant j, each increment of i increases the probability iff the
01025  * event occurs.  So, by the law of total probability:
01026  *  M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
01027  *      for i > 0, j > 0.
01028  *  M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
01029  */
01030 static float *
01031 calc_distr(const float *p, int n, int m, float rest)
01032 {
01033     float      *row,
01034                *prev_row,
01035                *tmp;
01036     int         i,
01037                 j;
01038 
01039     /*
01040      * Since we return only the last row of the matrix and need only the
01041      * current and previous row for calculations, allocate two rows.
01042      */
01043     row = (float *) palloc((m + 1) * sizeof(float));
01044     prev_row = (float *) palloc((m + 1) * sizeof(float));
01045 
01046     /* M[0,0] = 1 */
01047     row[0] = 1.0f;
01048     for (i = 1; i <= n; i++)
01049     {
01050         float       t = p[i - 1];
01051 
01052         /* Swap rows */
01053         tmp = row;
01054         row = prev_row;
01055         prev_row = tmp;
01056 
01057         /* Calculate next row */
01058         for (j = 0; j <= i && j <= m; j++)
01059         {
01060             float       val = 0.0f;
01061 
01062             if (j < i)
01063                 val += prev_row[j] * (1.0f - t);
01064             if (j > 0)
01065                 val += prev_row[j - 1] * t;
01066             row[j] = val;
01067         }
01068     }
01069 
01070     /*
01071      * The presence of many distinct rare (not in "p") elements materially
01072      * decreases selectivity.  Model their collective occurrence with the
01073      * Poisson distribution.
01074      */
01075     if (rest > DEFAULT_CONTAIN_SEL)
01076     {
01077         float       t;
01078 
01079         /* Swap rows */
01080         tmp = row;
01081         row = prev_row;
01082         prev_row = tmp;
01083 
01084         for (i = 0; i <= m; i++)
01085             row[i] = 0.0f;
01086 
01087         /* Value of Poisson distribution for 0 occurrences */
01088         t = exp(-rest);
01089 
01090         /*
01091          * Calculate convolution of previously computed distribution and the
01092          * Poisson distribution.
01093          */
01094         for (i = 0; i <= m; i++)
01095         {
01096             for (j = 0; j <= m - i; j++)
01097                 row[j + i] += prev_row[j] * t;
01098 
01099             /* Get Poisson distribution value for (i + 1) occurrences */
01100             t *= rest / (float) (i + 1);
01101         }
01102     }
01103 
01104     pfree(prev_row);
01105     return row;
01106 }
01107 
01108 /* Fast function for floor value of 2 based logarithm calculation. */
01109 static int
01110 floor_log2(uint32 n)
01111 {
01112     int         logval = 0;
01113 
01114     if (n == 0)
01115         return -1;
01116     if (n >= (1 << 16))
01117     {
01118         n >>= 16;
01119         logval += 16;
01120     }
01121     if (n >= (1 << 8))
01122     {
01123         n >>= 8;
01124         logval += 8;
01125     }
01126     if (n >= (1 << 4))
01127     {
01128         n >>= 4;
01129         logval += 4;
01130     }
01131     if (n >= (1 << 2))
01132     {
01133         n >>= 2;
01134         logval += 2;
01135     }
01136     if (n >= (1 << 1))
01137     {
01138         logval += 1;
01139     }
01140     return logval;
01141 }
01142 
01143 /*
01144  * find_next_mcelem binary-searches a most common elements array, starting
01145  * from *index, for the first member >= value.  It saves the position of the
01146  * match into *index and returns true if it's an exact match.  (Note: we
01147  * assume the mcelem elements are distinct so there can't be more than one
01148  * exact match.)
01149  */
01150 static bool
01151 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
01152                  FmgrInfo *cmpfunc)
01153 {
01154     int         l = *index,
01155                 r = nmcelem - 1,
01156                 i,
01157                 res;
01158 
01159     while (l <= r)
01160     {
01161         i = (l + r) / 2;
01162         res = element_compare(&mcelem[i], &value, cmpfunc);
01163         if (res == 0)
01164         {
01165             *index = i;
01166             return true;
01167         }
01168         else if (res < 0)
01169             l = i + 1;
01170         else
01171             r = i - 1;
01172     }
01173     *index = l;
01174     return false;
01175 }
01176 
01177 /*
01178  * Comparison function for elements.
01179  *
01180  * We use the element type's default btree opclass, and the default collation
01181  * if the type is collation-sensitive.
01182  *
01183  * XXX consider using SortSupport infrastructure
01184  */
01185 static int
01186 element_compare(const void *key1, const void *key2, void *arg)
01187 {
01188     Datum       d1 = *((const Datum *) key1);
01189     Datum       d2 = *((const Datum *) key2);
01190     FmgrInfo   *cmpfunc = (FmgrInfo *) arg;
01191     Datum       c;
01192 
01193     c = FunctionCall2Coll(cmpfunc, DEFAULT_COLLATION_OID, d1, d2);
01194     return DatumGetInt32(c);
01195 }
01196 
01197 /*
01198  * Comparison function for sorting floats into descending order.
01199  */
01200 static int
01201 float_compare_desc(const void *key1, const void *key2)
01202 {
01203     float       d1 = *((const float *) key1);
01204     float       d2 = *((const float *) key2);
01205 
01206     if (d1 > d2)
01207         return -1;
01208     else if (d1 < d2)
01209         return 1;
01210     else
01211         return 0;
01212 }