ts_typanalyze.c

/*-------------------------------------------------------------------------
 *
 * ts_typanalyze.c
 *    functions for gathering statistics from tsvector columns
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *    src/backend/tsearch/ts_typanalyze.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/hash.h"
#include "catalog/pg_operator.h"
#include "commands/vacuum.h"
#include "tsearch/ts_type.h"
#include "utils/builtins.h"


/* A hash key for lexemes */
typedef struct
{
    char       *lexeme;         /* lexeme (not NULL terminated!) */
    int         length;         /* its length in bytes */
} LexemeHashKey;

/* A hash table entry for the Lossy Counting algorithm */
typedef struct
{
    LexemeHashKey key;          /* This is 'e' from the LC algorithm. */
    int         frequency;      /* This is 'f'. */
    int         delta;          /* And this is 'delta'. */
} TrackItem;

static void compute_tsvector_stats(VacAttrStats *stats,
                       AnalyzeAttrFetchFunc fetchfunc,
                       int samplerows,
                       double totalrows);
static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
static uint32 lexeme_hash(const void *key, Size keysize);
static int  lexeme_match(const void *key1, const void *key2, Size keysize);
static int  lexeme_compare(const void *key1, const void *key2);
static int  trackitem_compare_frequencies_desc(const void *e1, const void *e2);
static int  trackitem_compare_lexemes(const void *e1, const void *e2);


/*
 *  ts_typanalyze -- a custom typanalyze function for tsvector columns
 */
Datum
ts_typanalyze(PG_FUNCTION_ARGS)
{
    VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
    Form_pg_attribute attr = stats->attr;

    /* If the attstattarget column is negative, use the default value */
    /* NB: it is okay to scribble on stats->attr since it's a copy */
    if (attr->attstattarget < 0)
        attr->attstattarget = default_statistics_target;

    stats->compute_stats = compute_tsvector_stats;
    /* see comment about the choice of minrows in commands/analyze.c */
    stats->minrows = 300 * attr->attstattarget;

    PG_RETURN_BOOL(true);
}
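
/*
 * Note: this function is reached through the typanalyze hook: pg_type's
 * entry for tsvector names ts_typanalyze as the type's typanalyze function,
 * so a plain ANALYZE of a table with a tsvector column arrives here instead
 * of in the generic std_typanalyze() code path.
 */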

/*
 *  compute_tsvector_stats() -- compute statistics for a tsvector column
 *
 *  This function computes statistics useful for determining the selectivity
 *  of @@ operations, along with the fraction of non-null rows and the
 *  average width.
 *
 *  Instead of finding the most common values, as we do for most datatypes,
 *  we're looking for the most common lexemes. This is more useful, because
 *  there will most probably not be any two rows with the same tsvector, and
 *  thus the notion of an MCV is a bit bogus with this datatype. With a list
 *  of the most common lexemes we can do a better job at figuring out @@
 *  selectivity.
 *
 *  For the same reasons we assume that tsvector columns are unique when
 *  determining the number of distinct values.
 *
 *  The algorithm used is Lossy Counting, as proposed in the paper "Approximate
 *  frequency counts over data streams" by G. S. Manku and R. Motwani, in
 *  Proceedings of the 28th International Conference on Very Large Data Bases,
 *  Hong Kong, China, August 2002, section 4.2. The paper is available at
 *  http://www.vldb.org/conf/2002/S10P03.pdf
 *
 *  The Lossy Counting (aka LC) algorithm goes like this:
 *  Let s be the threshold frequency for an item (the minimum frequency we
 *  are interested in) and epsilon the error margin for the frequency. Let D
 *  be a set of triples (e, f, delta), where e is an element value, f is that
 *  element's frequency (actually, its current occurrence count) and delta is
 *  the maximum error in f. We start with D empty and process the elements in
 *  batches of size w. (The batch size is also known as "bucket size" and is
 *  equal to 1/epsilon.) Let the current batch number be b_current, starting
 *  with 1. For each element e we either increment its f count, if it's
 *  already in D, or insert a new triple into D with values (e, 1, b_current
 *  - 1). After processing each batch we prune D, by removing from it all
 *  elements with f + delta <= b_current.  After the algorithm finishes we
 *  suppress all elements from D that do not satisfy f >= (s - epsilon) * N,
 *  where N is the total number of elements in the input.  We emit the
 *  remaining elements with estimated frequency f/N.  The LC paper proves
 *  that this algorithm finds all elements with true frequency at least s,
 *  and that no frequency is overestimated or is underestimated by more than
 *  epsilon.  Furthermore, given reasonable assumptions about the input
 *  distribution, the required table size is no more than about 7 times w.
 *  (A standalone, illustrative sketch of this loop appears after this
 *  function, below.)
 *
 *  We set s to be the estimated frequency of the K'th word in a natural
 *  language's frequency table, where K is the target number of entries in
 *  the MCELEM array plus an arbitrary constant, meant to reflect the fact
 *  that the most common words in any language would usually be stopwords,
 *  so we will not actually see them in the input.  We assume that the
 *  distribution of word frequencies (including the stopwords) follows Zipf's
 *  law with an exponent of 1.
 *
 *  Assuming a Zipfian distribution, the frequency of the K'th word is equal
 *  to 1/(K * H(W)), where H(n) is the harmonic number 1 + 1/2 + ... + 1/n
 *  and W is the number of words in the language.  Taking W as one million,
 *  we get roughly 0.07/K.  Assuming the top 10 words are stopwords gives
 *  s = 0.07/(K + 10).  We set epsilon = s/10, which gives bucket width
 *  w = (K + 10)/0.007 and a maximum expected hashtable size of about
 *  1000 * (K + 10).
 *
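 *  For instance, with the default statistics target of 100, the code below
 *  aims for K = 1000 MCELEM entries, so s = 0.07/1010 =~ 6.9e-5,
 *  epsilon = s/10 =~ 6.9e-6, the bucket width w = 1010/0.007 =~ 144286
 *  lexemes, and the expected hashtable size is about 1000 * 1010, i.e.
 *  roughly a million entries.
 *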
 *  Note: in the above discussion, s, epsilon, and f/N are in terms of a
 *  lexeme's frequency as a fraction of all lexemes seen in the input.
 *  However, what we actually want to store in the finished pg_statistic
 *  entry is each lexeme's frequency as a fraction of all rows that it occurs
 *  in.  Assuming that the input tsvectors are correctly constructed, no
 *  lexeme occurs more than once per tsvector, so the final count f is a
 *  correct estimate of the number of input tsvectors it occurs in, and we
 *  need only change the divisor from N to nonnull_cnt to get the number we
 *  want.
 */
static void
compute_tsvector_stats(VacAttrStats *stats,
                       AnalyzeAttrFetchFunc fetchfunc,
                       int samplerows,
                       double totalrows)
{
    int         num_mcelem;
    int         null_cnt = 0;
    double      total_width = 0;

    /* This is D from the LC algorithm. */
    HTAB       *lexemes_tab;
    HASHCTL     hash_ctl;
    HASH_SEQ_STATUS scan_status;

    /* This is the current bucket number from the LC algorithm */
    int         b_current;

    /* This is 'w' from the LC algorithm */
    int         bucket_width;
    int         vector_no,
                lexeme_no;
    LexemeHashKey hash_key;
    TrackItem  *item;

    /*
     * We want statistics_target * 10 lexemes in the MCELEM array.  This
     * multiplier is pretty arbitrary, but is meant to reflect the fact that
     * the number of individual lexeme values tracked in pg_statistic ought to
     * be more than the number of values for a simple scalar column.
     */
    num_mcelem = stats->attr->attstattarget * 10;

    /*
     * We set bucket width equal to (num_mcelem + 10) / 0.007 as per the
     * comment above.
     */
    bucket_width = (num_mcelem + 10) * 1000 / 7;

    /*
     * Create the hashtable. It will be in local memory, so we don't need to
     * worry about overflowing the initial size. Also we don't need to pay any
     * attention to locking and memory management.
     */
    MemSet(&hash_ctl, 0, sizeof(hash_ctl));
    hash_ctl.keysize = sizeof(LexemeHashKey);
    hash_ctl.entrysize = sizeof(TrackItem);
    hash_ctl.hash = lexeme_hash;
    hash_ctl.match = lexeme_match;
    hash_ctl.hcxt = CurrentMemoryContext;
    lexemes_tab = hash_create("Analyzed lexemes table",
                              num_mcelem,
                              &hash_ctl,
                    HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
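
    /*
     * Note: since we passed HASH_CONTEXT with hcxt set, the table's storage
     * lives in a child context of CurrentMemoryContext, which is why the
     * cleanup comment at the bottom of this function can simply let the
     * hashtable go away with the analyze context instead of calling
     * hash_destroy().
     */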

    /* Initialize counters. */
    b_current = 1;
    lexeme_no = 0;

    /* Loop over the tsvectors. */
    for (vector_no = 0; vector_no < samplerows; vector_no++)
    {
        Datum       value;
        bool        isnull;
        TSVector    vector;
        WordEntry  *curentryptr;
        char       *lexemesptr;
        int         j;

        vacuum_delay_point();

        value = fetchfunc(stats, vector_no, &isnull);

        /*
         * Check for null/nonnull.
         */
        if (isnull)
        {
            null_cnt++;
            continue;
        }

        /*
         * Add up widths for average-width calculation.  Since it's a
         * tsvector, we know it's varlena.  As in the regular
         * compute_minimal_stats function, we use the toasted width for this
         * calculation.
         */
        total_width += VARSIZE_ANY(DatumGetPointer(value));

        /*
         * Now detoast the tsvector if needed.
         */
        vector = DatumGetTSVector(value);

        /*
         * We loop through the lexemes in the tsvector and add them to our
         * tracking hashtable.  Note: the hashtable entries will point into
         * the (detoasted) tsvector value, therefore we cannot free that
         * storage until we're done.
         */
        lexemesptr = STRPTR(vector);
        curentryptr = ARRPTR(vector);
        for (j = 0; j < vector->size; j++)
        {
            bool        found;

            /* Construct a hash key */
            hash_key.lexeme = lexemesptr + curentryptr->pos;
            hash_key.length = curentryptr->len;

            /* Lookup current lexeme in hashtable, adding it if new */
            item = (TrackItem *) hash_search(lexemes_tab,
                                             (const void *) &hash_key,
                                             HASH_ENTER, &found);

            if (found)
            {
                /* The lexeme is already on the tracking list */
                item->frequency++;
            }
            else
            {
                /* Initialize new tracking list element */
                item->frequency = 1;
                item->delta = b_current - 1;
            }

            /* lexeme_no is the number of elements processed (i.e., N) */
            lexeme_no++;

            /* We prune the D structure after processing each bucket */
            if (lexeme_no % bucket_width == 0)
            {
                prune_lexemes_hashtable(lexemes_tab, b_current);
                b_current++;
            }

            /* Advance to the next WordEntry in the tsvector */
            curentryptr++;
        }
    }

    /* We can only compute real stats if we found some non-null values. */
    if (null_cnt < samplerows)
    {
        int         nonnull_cnt = samplerows - null_cnt;
        int         i;
        TrackItem **sort_table;
        int         track_len;
        int         cutoff_freq;
        int         minfreq,
                    maxfreq;

        stats->stats_valid = true;
        /* Do the simple null-frac and average width stats */
        stats->stanullfrac = (double) null_cnt / (double) samplerows;
        stats->stawidth = total_width / (double) nonnull_cnt;

        /* Assume it's a unique column (see notes above) */
        stats->stadistinct = -1.0;

        /*
         * Construct an array of the interesting hashtable items, that is,
         * those meeting the cutoff frequency (s - epsilon)*N.  Also identify
         * the minimum and maximum frequencies among these items.
         *
         * Since s = 10 * epsilon and bucket_width = 1/epsilon, the cutoff
         * frequency is (s - epsilon) * N = 9 * epsilon * N = 9 * N /
         * bucket_width.
         */
        cutoff_freq = 9 * lexeme_no / bucket_width;

        i = hash_get_num_entries(lexemes_tab);  /* surely enough space */
        sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);

        hash_seq_init(&scan_status, lexemes_tab);
        track_len = 0;
        minfreq = lexeme_no;
        maxfreq = 0;
        while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
        {
            if (item->frequency > cutoff_freq)
            {
                sort_table[track_len++] = item;
                minfreq = Min(minfreq, item->frequency);
                maxfreq = Max(maxfreq, item->frequency);
            }
        }
        Assert(track_len <= i);

        /* emit some statistics for debug purposes */
        elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, "
             "# lexemes = %d, hashtable size = %d, usable entries = %d",
             num_mcelem, bucket_width, lexeme_no, i, track_len);

        /*
         * If we obtained more lexemes than we really want, get rid of those
         * with the lowest frequencies.  The easiest way is to qsort the array
         * into descending frequency order and truncate the array.
         */
        if (num_mcelem < track_len)
        {
            qsort(sort_table, track_len, sizeof(TrackItem *),
                  trackitem_compare_frequencies_desc);
            /* reset minfreq to the smallest frequency we're keeping */
            minfreq = sort_table[num_mcelem - 1]->frequency;
        }
        else
            num_mcelem = track_len;

        /* Generate MCELEM slot entry */
        if (num_mcelem > 0)
        {
            MemoryContext old_context;
            Datum      *mcelem_values;
            float4     *mcelem_freqs;

            /*
             * We want to store statistics sorted on the lexeme value using
             * first length, then byte-for-byte comparison. The reason for
             * doing length comparison first is that we don't care about the
             * ordering so long as it's consistent, and comparing lengths
             * first gives us a chance to avoid a strncmp() call.
             *
             * This is different from what we do with scalar statistics --
             * they get sorted on frequencies. The rationale is that we
             * usually search through most common elements looking for a
             * specific value, so we can grab its frequency.  When values are
             * presorted we can employ binary search for that.  See
             * ts_selfuncs.c for a real usage scenario.
             */
            qsort(sort_table, num_mcelem, sizeof(TrackItem *),
                  trackitem_compare_lexemes);

            /* Must copy the target values into anl_context */
            old_context = MemoryContextSwitchTo(stats->anl_context);

            /*
             * We sorted statistics on the lexeme value, but we want to be
             * able to find out the minimal and maximal frequency without
             * going through all the values.  We keep those two extra
             * frequencies in two extra cells in mcelem_freqs.  For example,
             * if the per-lexeme frequencies in lexeme order are {0.3, 0.5,
             * 0.2}, mcelem_freqs ends up holding {0.3, 0.5, 0.2, 0.2, 0.5}.
             *
             * (Note: the MCELEM statistics slot definition allows for a third
             * extra number containing the frequency of nulls, but we don't
             * create that for a tsvector column, since null elements aren't
             * possible.)
             */
            mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
            mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));

            /*
             * See comments above about use of nonnull_cnt as the divisor for
             * the final frequency estimates.
             */
            for (i = 0; i < num_mcelem; i++)
            {
                TrackItem  *item = sort_table[i];

                mcelem_values[i] =
                    PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
                                                          item->key.length));
                mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt;
            }
            mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
            mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt;
            MemoryContextSwitchTo(old_context);

            stats->stakind[0] = STATISTIC_KIND_MCELEM;
            stats->staop[0] = TextEqualOperator;
            stats->stanumbers[0] = mcelem_freqs;
            /* See above comment about the two extra frequency cells */
            stats->numnumbers[0] = num_mcelem + 2;
            stats->stavalues[0] = mcelem_values;
            stats->numvalues[0] = num_mcelem;
            /* We are storing text values */
            stats->statypid[0] = TEXTOID;
            stats->statyplen[0] = -1;   /* typlen, -1 for varlena */
            stats->statypbyval[0] = false;
            stats->statypalign[0] = 'i';
        }
    }
    else
    {
        /* We found only nulls; assume the column is entirely null */
        stats->stats_valid = true;
        stats->stanullfrac = 1.0;
        stats->stawidth = 0;    /* "unknown" */
        stats->stadistinct = 0.0;       /* "unknown" */
    }

    /*
     * We don't need to bother cleaning up any of our temporary palloc's. The
     * hashtable should also go away, as it used a child memory context.
     */
}
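
/*
 * Illustrative aside, not part of PostgreSQL: a minimal, self-contained
 * sketch of the Lossy Counting loop described above compute_tsvector_stats,
 * counting ints from a toy stream with a linear-scan array standing in for
 * the dynahash table.  Everything in this block (DemoItem, demo_prune, the
 * LC_DEMO_STANDALONE guard) is invented for the demonstration; extract it
 * and compile it on its own to watch the frequent element survive pruning
 * while the rare ones are shed.
 */
#ifdef LC_DEMO_STANDALONE

#include <stdio.h>

#define MAX_TRACKED 1024

typedef struct
{
    int         value;          /* 'e' from the LC paper */
    int         frequency;      /* 'f' */
    int         delta;          /* maximum error in 'f' */
} DemoItem;

static DemoItem demo_tab[MAX_TRACKED];
static int  demo_len = 0;

/* End-of-bucket prune: drop every item with f + delta <= b_current. */
static void
demo_prune(int b_current)
{
    int         i,
                j = 0;

    for (i = 0; i < demo_len; i++)
    {
        if (demo_tab[i].frequency + demo_tab[i].delta > b_current)
            demo_tab[j++] = demo_tab[i];
    }
    demo_len = j;
}

int
main(void)
{
    /* A toy input stream; 7 is the one genuinely frequent element. */
    int         stream[] = {7, 1, 7, 2, 7, 3, 7, 7, 4, 7, 5, 7, 7, 6, 7};
    int         n = (int) (sizeof(stream) / sizeof(stream[0]));
    int         bucket_width = 4;   /* w = 1/epsilon; tiny, for the demo */
    int         b_current = 1;
    int         element_no = 0;
    int         i,
                k;

    for (i = 0; i < n; i++)
    {
        int         found = 0;

        /* Lookup-or-insert, as hash_search(HASH_ENTER) does above */
        for (k = 0; k < demo_len; k++)
        {
            if (demo_tab[k].value == stream[i])
            {
                demo_tab[k].frequency++;
                found = 1;
                break;
            }
        }
        if (!found && demo_len < MAX_TRACKED)
        {
            demo_tab[demo_len].value = stream[i];
            demo_tab[demo_len].frequency = 1;
            demo_tab[demo_len].delta = b_current - 1;
            demo_len++;
        }

        element_no++;
        if (element_no % bucket_width == 0)
        {
            demo_prune(b_current);
            b_current++;
        }
    }

    /* Survivors; a real caller would also apply the (s - epsilon) * N cutoff */
    for (k = 0; k < demo_len; k++)
        printf("value %d: f = %d, delta = %d\n",
               demo_tab[k].value, demo_tab[k].frequency, demo_tab[k].delta);

    return 0;
}
#endif   /* LC_DEMO_STANDALONE */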

/*
 *  A function to prune the D structure from the Lossy Counting algorithm.
 *  Consult compute_tsvector_stats() for the full explanation.
 */
static void
prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current)
{
    HASH_SEQ_STATUS scan_status;
    TrackItem  *item;

    hash_seq_init(&scan_status, lexemes_tab);
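
    /*
     * Note: dynahash explicitly allows deleting the entry just returned by
     * hash_seq_search() while the sequential scan is in progress, so this
     * prune-in-place loop is safe.
     */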
    while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
    {
        if (item->frequency + item->delta <= b_current)
        {
            if (hash_search(lexemes_tab, (const void *) &item->key,
                            HASH_REMOVE, NULL) == NULL)
                elog(ERROR, "hash table corrupted");
        }
    }
}

/*
 * The hash function for lexemes.  They are strings, but not NULL-terminated,
 * so we need a special hash function.
 */
static uint32
lexeme_hash(const void *key, Size keysize)
{
    const LexemeHashKey *l = (const LexemeHashKey *) key;

    return DatumGetUInt32(hash_any((const unsigned char *) l->lexeme,
                                   l->length));
}

/*
 *  Matching function for lexemes, to be used in hashtable lookups.
 */
static int
lexeme_match(const void *key1, const void *key2, Size keysize)
{
    /* The keysize parameter is superfluous; the keys store their lengths */
    return lexeme_compare(key1, key2);
}

/*
 *  Comparison function for lexemes: length first, then byte-by-byte.  The
 *  resulting order is not alphabetical, but all we need is consistency.
 */
static int
lexeme_compare(const void *key1, const void *key2)
{
    const LexemeHashKey *d1 = (const LexemeHashKey *) key1;
    const LexemeHashKey *d2 = (const LexemeHashKey *) key2;

    /* First, compare by length */
    if (d1->length > d2->length)
        return 1;
    else if (d1->length < d2->length)
        return -1;
    /* Lengths are equal, do a byte-by-byte comparison */
    return strncmp(d1->lexeme, d2->lexeme, d1->length);
}

/*
 *  qsort() comparator for sorting TrackItems on frequencies (descending sort)
 */
static int
trackitem_compare_frequencies_desc(const void *e1, const void *e2)
{
    const TrackItem *const * t1 = (const TrackItem *const *) e1;
    const TrackItem *const * t2 = (const TrackItem *const *) e2;

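    /*
     * Descending order.  Plain subtraction is safe here: both frequencies
     * are nonnegative sample counts, so the difference cannot overflow.
     */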
    return (*t2)->frequency - (*t1)->frequency;
}

/*
 *  qsort() comparator for sorting TrackItems on lexemes
 */
static int
trackitem_compare_lexemes(const void *e1, const void *e2)
{
    const TrackItem *const * t1 = (const TrackItem *const *) e1;
    const TrackItem *const * t2 = (const TrackItem *const *) e2;

    return lexeme_compare(&(*t1)->key, &(*t2)->key);
}