Header And Logo

PostgreSQL
| The world's most advanced open source database.

ts_selfuncs.c

Go to the documentation of this file.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * ts_selfuncs.c
00004  *    Selectivity estimation functions for text search operators.
00005  *
00006  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00007  *
00008  *
00009  * IDENTIFICATION
00010  *    src/backend/tsearch/ts_selfuncs.c
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 #include "postgres.h"
00015 
00016 #include "access/htup_details.h"
00017 #include "catalog/pg_statistic.h"
00018 #include "catalog/pg_type.h"
00019 #include "miscadmin.h"
00020 #include "nodes/nodes.h"
00021 #include "tsearch/ts_type.h"
00022 #include "utils/lsyscache.h"
00023 #include "utils/selfuncs.h"
00024 #include "utils/syscache.h"
00025 
00026 
00027 /*
00028  * The default text search selectivity is chosen to be small enough to
00029  * encourage indexscans for typical table densities.  See selfuncs.h and
00030  * DEFAULT_EQ_SEL for details.
00031  */
00032 #define DEFAULT_TS_MATCH_SEL 0.005
00033 
00034 /* lookup table type for binary searching through MCELEMs */
00035 typedef struct
00036 {
00037     text       *element;
00038     float4      frequency;
00039 } TextFreq;
00040 
00041 /* type of keys for bsearch'ing through an array of TextFreqs */
00042 typedef struct
00043 {
00044     char       *lexeme;
00045     int         length;
00046 } LexemeKey;
00047 
00048 static Selectivity tsquerysel(VariableStatData *vardata, Datum constval);
00049 static Selectivity mcelem_tsquery_selec(TSQuery query,
00050                      Datum *mcelem, int nmcelem,
00051                      float4 *numbers, int nnumbers);
00052 static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
00053                   TextFreq *lookup, int length, float4 minfreq);
00054 static int  compare_lexeme_textfreq(const void *e1, const void *e2);
00055 
00056 #define tsquery_opr_selec_no_stats(query) \
00057     tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
00058 
00059 
00060 /*
00061  *  tsmatchsel -- Selectivity of "@@"
00062  *
00063  * restriction selectivity function for tsvector @@ tsquery and
00064  * tsquery @@ tsvector
00065  */
00066 Datum
00067 tsmatchsel(PG_FUNCTION_ARGS)
00068 {
00069     PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
00070 
00071 #ifdef NOT_USED
00072     Oid         operator = PG_GETARG_OID(1);
00073 #endif
00074     List       *args = (List *) PG_GETARG_POINTER(2);
00075     int         varRelid = PG_GETARG_INT32(3);
00076     VariableStatData vardata;
00077     Node       *other;
00078     bool        varonleft;
00079     Selectivity selec;
00080 
00081     /*
00082      * If expression is not variable = something or something = variable, then
00083      * punt and return a default estimate.
00084      */
00085     if (!get_restriction_variable(root, args, varRelid,
00086                                   &vardata, &other, &varonleft))
00087         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
00088 
00089     /*
00090      * Can't do anything useful if the something is not a constant, either.
00091      */
00092     if (!IsA(other, Const))
00093     {
00094         ReleaseVariableStats(vardata);
00095         PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
00096     }
00097 
00098     /*
00099      * The "@@" operator is strict, so we can cope with NULL right away
00100      */
00101     if (((Const *) other)->constisnull)
00102     {
00103         ReleaseVariableStats(vardata);
00104         PG_RETURN_FLOAT8(0.0);
00105     }
00106 
00107     /*
00108      * OK, there's a Var and a Const we're dealing with here.  We need the
00109      * Const to be a TSQuery, else we can't do anything useful.  We have to
00110      * check this because the Var might be the TSQuery not the TSVector.
00111      */
00112     if (((Const *) other)->consttype == TSQUERYOID)
00113     {
00114         /* tsvector @@ tsquery or the other way around */
00115         Assert(vardata.vartype == TSVECTOROID);
00116 
00117         selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
00118     }
00119     else
00120     {
00121         /* If we can't see the query structure, must punt */
00122         selec = DEFAULT_TS_MATCH_SEL;
00123     }
00124 
00125     ReleaseVariableStats(vardata);
00126 
00127     CLAMP_PROBABILITY(selec);
00128 
00129     PG_RETURN_FLOAT8((float8) selec);
00130 }
00131 
00132 
00133 /*
00134  *  tsmatchjoinsel -- join selectivity of "@@"
00135  *
00136  * join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
00137  */
00138 Datum
00139 tsmatchjoinsel(PG_FUNCTION_ARGS)
00140 {
00141     /* for the moment we just punt */
00142     PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
00143 }
00144 
00145 
00146 /*
00147  * @@ selectivity for tsvector var vs tsquery constant
00148  */
00149 static Selectivity
00150 tsquerysel(VariableStatData *vardata, Datum constval)
00151 {
00152     Selectivity selec;
00153     TSQuery     query;
00154 
00155     /* The caller made sure the const is a TSQuery, so get it now */
00156     query = DatumGetTSQuery(constval);
00157 
00158     /* Empty query matches nothing */
00159     if (query->size == 0)
00160         return (Selectivity) 0.0;
00161 
00162     if (HeapTupleIsValid(vardata->statsTuple))
00163     {
00164         Form_pg_statistic stats;
00165         Datum      *values;
00166         int         nvalues;
00167         float4     *numbers;
00168         int         nnumbers;
00169 
00170         stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
00171 
00172         /* MCELEM will be an array of TEXT elements for a tsvector column */
00173         if (get_attstatsslot(vardata->statsTuple,
00174                              TEXTOID, -1,
00175                              STATISTIC_KIND_MCELEM, InvalidOid,
00176                              NULL,
00177                              &values, &nvalues,
00178                              &numbers, &nnumbers))
00179         {
00180             /*
00181              * There is a most-common-elements slot for the tsvector Var, so
00182              * use that.
00183              */
00184             selec = mcelem_tsquery_selec(query, values, nvalues,
00185                                          numbers, nnumbers);
00186             free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers);
00187         }
00188         else
00189         {
00190             /* No most-common-elements info, so do without */
00191             selec = tsquery_opr_selec_no_stats(query);
00192         }
00193 
00194         /*
00195          * MCE stats count only non-null rows, so adjust for null rows.
00196          */
00197         selec *= (1.0 - stats->stanullfrac);
00198     }
00199     else
00200     {
00201         /* No stats at all, so do without */
00202         selec = tsquery_opr_selec_no_stats(query);
00203         /* we assume no nulls here, so no stanullfrac correction */
00204     }
00205 
00206     return selec;
00207 }
00208 
00209 /*
00210  * Extract data from the pg_statistic arrays into useful format.
00211  */
00212 static Selectivity
00213 mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
00214                      float4 *numbers, int nnumbers)
00215 {
00216     float4      minfreq;
00217     TextFreq   *lookup;
00218     Selectivity selec;
00219     int         i;
00220 
00221     /*
00222      * There should be two more Numbers than Values, because the last two
00223      * cells are taken for minimal and maximal frequency.  Punt if not.
00224      *
00225      * (Note: the MCELEM statistics slot definition allows for a third extra
00226      * number containing the frequency of nulls, but we're not expecting that
00227      * to appear for a tsvector column.)
00228      */
00229     if (nnumbers != nmcelem + 2)
00230         return tsquery_opr_selec_no_stats(query);
00231 
00232     /*
00233      * Transpose the data into a single array so we can use bsearch().
00234      */
00235     lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem);
00236     for (i = 0; i < nmcelem; i++)
00237     {
00238         /*
00239          * The text Datums came from an array, so it cannot be compressed or
00240          * stored out-of-line -- it's safe to use VARSIZE_ANY*.
00241          */
00242         Assert(!VARATT_IS_COMPRESSED(mcelem[i]) && !VARATT_IS_EXTERNAL(mcelem[i]));
00243         lookup[i].element = (text *) DatumGetPointer(mcelem[i]);
00244         lookup[i].frequency = numbers[i];
00245     }
00246 
00247     /*
00248      * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
00249      * the one before the last cell of the Numbers array. See ts_typanalyze.c
00250      */
00251     minfreq = numbers[nnumbers - 2];
00252 
00253     selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup,
00254                               nmcelem, minfreq);
00255 
00256     pfree(lookup);
00257 
00258     return selec;
00259 }
00260 
00261 /*
00262  * Traverse the tsquery in preorder, calculating selectivity as:
00263  *
00264  *   selec(left_oper) * selec(right_oper) in AND nodes,
00265  *
00266  *   selec(left_oper) + selec(right_oper) -
00267  *      selec(left_oper) * selec(right_oper) in OR nodes,
00268  *
00269  *   1 - select(oper) in NOT nodes
00270  *
00271  *   histogram-based estimation in prefix VAL nodes
00272  *
00273  *   freq[val] in exact VAL nodes, if the value is in MCELEM
00274  *   min(freq[MCELEM]) / 2 in VAL nodes, if it is not
00275  *
00276  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
00277  * binary search for determining freq[MCELEM].
00278  *
00279  * If we don't have stats for the tsvector, we still use this logic,
00280  * except we use default estimates for VAL nodes.  This case is signaled
00281  * by lookup == NULL.
00282  */
00283 static Selectivity
00284 tsquery_opr_selec(QueryItem *item, char *operand,
00285                   TextFreq *lookup, int length, float4 minfreq)
00286 {
00287     Selectivity selec;
00288 
00289     /* since this function recurses, it could be driven to stack overflow */
00290     check_stack_depth();
00291 
00292     if (item->type == QI_VAL)
00293     {
00294         QueryOperand *oper = (QueryOperand *) item;
00295         LexemeKey   key;
00296 
00297         /*
00298          * Prepare the key for bsearch().
00299          */
00300         key.lexeme = operand + oper->distance;
00301         key.length = oper->length;
00302 
00303         if (oper->prefix)
00304         {
00305             /* Prefix match, ie the query item is lexeme:* */
00306             Selectivity matched,
00307                         allmces;
00308             int         i,
00309                         n_matched;
00310 
00311             /*
00312              * Our strategy is to scan through the MCELEM list and combine the
00313              * frequencies of the ones that match the prefix.  We then
00314              * extrapolate the fraction of matching MCELEMs to the remaining
00315              * rows, assuming that the MCELEMs are representative of the whole
00316              * lexeme population in this respect.  (Compare
00317              * histogram_selectivity().)  Note that these are most common
00318              * elements not most common values, so they're not mutually
00319              * exclusive.  We treat occurrences as independent events.
00320              *
00321              * This is only a good plan if we have a pretty fair number of
00322              * MCELEMs available; we set the threshold at 100.  If no stats or
00323              * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
00324              */
00325             if (lookup == NULL || length < 100)
00326                 return (Selectivity) (DEFAULT_TS_MATCH_SEL * 4);
00327 
00328             matched = allmces = 0;
00329             n_matched = 0;
00330             for (i = 0; i < length; i++)
00331             {
00332                 TextFreq   *t = lookup + i;
00333                 int         tlen = VARSIZE_ANY_EXHDR(t->element);
00334 
00335                 if (tlen >= key.length &&
00336                     strncmp(key.lexeme, VARDATA_ANY(t->element),
00337                             key.length) == 0)
00338                 {
00339                     matched += t->frequency - matched * t->frequency;
00340                     n_matched++;
00341                 }
00342                 allmces += t->frequency - allmces * t->frequency;
00343             }
00344 
00345             /* Clamp to ensure sanity in the face of roundoff error */
00346             CLAMP_PROBABILITY(matched);
00347             CLAMP_PROBABILITY(allmces);
00348 
00349             selec = matched + (1.0 - allmces) * ((double) n_matched / length);
00350 
00351             /*
00352              * In any case, never believe that a prefix match has selectivity
00353              * less than we would assign for a non-MCELEM lexeme.  This
00354              * preserves the property that "word:*" should be estimated to
00355              * match at least as many rows as "word" would be.
00356              */
00357             selec = Max(Min(DEFAULT_TS_MATCH_SEL, minfreq / 2), selec);
00358         }
00359         else
00360         {
00361             /* Regular exact lexeme match */
00362             TextFreq   *searchres;
00363 
00364             /* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
00365             if (lookup == NULL)
00366                 return (Selectivity) DEFAULT_TS_MATCH_SEL;
00367 
00368             searchres = (TextFreq *) bsearch(&key, lookup, length,
00369                                              sizeof(TextFreq),
00370                                              compare_lexeme_textfreq);
00371 
00372             if (searchres)
00373             {
00374                 /*
00375                  * The element is in MCELEM.  Return precise selectivity (or
00376                  * at least as precise as ANALYZE could find out).
00377                  */
00378                 selec = searchres->frequency;
00379             }
00380             else
00381             {
00382                 /*
00383                  * The element is not in MCELEM.  Punt, but assume that the
00384                  * selectivity cannot be more than minfreq / 2.
00385                  */
00386                 selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
00387             }
00388         }
00389     }
00390     else
00391     {
00392         /* Current TSQuery node is an operator */
00393         Selectivity s1,
00394                     s2;
00395 
00396         switch (item->qoperator.oper)
00397         {
00398             case OP_NOT:
00399                 selec = 1.0 - tsquery_opr_selec(item + 1, operand,
00400                                                 lookup, length, minfreq);
00401                 break;
00402 
00403             case OP_AND:
00404                 s1 = tsquery_opr_selec(item + 1, operand,
00405                                        lookup, length, minfreq);
00406                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
00407                                        lookup, length, minfreq);
00408                 selec = s1 * s2;
00409                 break;
00410 
00411             case OP_OR:
00412                 s1 = tsquery_opr_selec(item + 1, operand,
00413                                        lookup, length, minfreq);
00414                 s2 = tsquery_opr_selec(item + item->qoperator.left, operand,
00415                                        lookup, length, minfreq);
00416                 selec = s1 + s2 - s1 * s2;
00417                 break;
00418 
00419             default:
00420                 elog(ERROR, "unrecognized operator: %d", item->qoperator.oper);
00421                 selec = 0;      /* keep compiler quiet */
00422                 break;
00423         }
00424     }
00425 
00426     /* Clamp intermediate results to stay sane despite roundoff error */
00427     CLAMP_PROBABILITY(selec);
00428 
00429     return selec;
00430 }
00431 
00432 /*
00433  * bsearch() comparator for a lexeme (non-NULL terminated string with length)
00434  * and a TextFreq. Use length, then byte-for-byte comparison, because that's
00435  * how ANALYZE code sorted data before storing it in a statistic tuple.
00436  * See ts_typanalyze.c for details.
00437  */
00438 static int
00439 compare_lexeme_textfreq(const void *e1, const void *e2)
00440 {
00441     const LexemeKey *key = (const LexemeKey *) e1;
00442     const TextFreq *t = (const TextFreq *) e2;
00443     int         len1,
00444                 len2;
00445 
00446     len1 = key->length;
00447     len2 = VARSIZE_ANY_EXHDR(t->element);
00448 
00449     /* Compare lengths first, possibly avoiding a strncmp call */
00450     if (len1 > len2)
00451         return 1;
00452     else if (len1 < len2)
00453         return -1;
00454 
00455     /* Fall back on byte-for-byte comparison */
00456     return strncmp(key->lexeme, VARDATA_ANY(t->element), len1);
00457 }