00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "postgres.h"
00015
00016 #include "access/hash.h"
00017 #include "catalog/pg_operator.h"
00018 #include "commands/vacuum.h"
00019 #include "tsearch/ts_type.h"
00020 #include "utils/builtins.h"
00021
00022
00023
00024 typedef struct
00025 {
00026 char *lexeme;
00027 int length;
00028 } LexemeHashKey;
00029
00030
00031 typedef struct
00032 {
00033 LexemeHashKey key;
00034 int frequency;
00035 int delta;
00036 } TrackItem;
00037
00038 static void compute_tsvector_stats(VacAttrStats *stats,
00039 AnalyzeAttrFetchFunc fetchfunc,
00040 int samplerows,
00041 double totalrows);
00042 static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current);
00043 static uint32 lexeme_hash(const void *key, Size keysize);
00044 static int lexeme_match(const void *key1, const void *key2, Size keysize);
00045 static int lexeme_compare(const void *key1, const void *key2);
00046 static int trackitem_compare_frequencies_desc(const void *e1, const void *e2);
00047 static int trackitem_compare_lexemes(const void *e1, const void *e2);
00048
00049
00050
00051
00052
00053 Datum
00054 ts_typanalyze(PG_FUNCTION_ARGS)
00055 {
00056 VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
00057 Form_pg_attribute attr = stats->attr;
00058
00059
00060
00061 if (attr->attstattarget < 0)
00062 attr->attstattarget = default_statistics_target;
00063
00064 stats->compute_stats = compute_tsvector_stats;
00065
00066 stats->minrows = 300 * attr->attstattarget;
00067
00068 PG_RETURN_BOOL(true);
00069 }
00070
00071
00072
00073
00074
00075
00076
00077
00078
00079
00080
00081
00082
00083
00084
00085
00086
00087
00088
00089
00090
00091
00092
00093
00094
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138 static void
00139 compute_tsvector_stats(VacAttrStats *stats,
00140 AnalyzeAttrFetchFunc fetchfunc,
00141 int samplerows,
00142 double totalrows)
00143 {
00144 int num_mcelem;
00145 int null_cnt = 0;
00146 double total_width = 0;
00147
00148
00149 HTAB *lexemes_tab;
00150 HASHCTL hash_ctl;
00151 HASH_SEQ_STATUS scan_status;
00152
00153
00154 int b_current;
00155
00156
00157 int bucket_width;
00158 int vector_no,
00159 lexeme_no;
00160 LexemeHashKey hash_key;
00161 TrackItem *item;
00162
00163
00164
00165
00166
00167
00168
00169 num_mcelem = stats->attr->attstattarget * 10;
00170
00171
00172
00173
00174
00175 bucket_width = (num_mcelem + 10) * 1000 / 7;
00176
00177
00178
00179
00180
00181
00182 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
00183 hash_ctl.keysize = sizeof(LexemeHashKey);
00184 hash_ctl.entrysize = sizeof(TrackItem);
00185 hash_ctl.hash = lexeme_hash;
00186 hash_ctl.match = lexeme_match;
00187 hash_ctl.hcxt = CurrentMemoryContext;
00188 lexemes_tab = hash_create("Analyzed lexemes table",
00189 num_mcelem,
00190 &hash_ctl,
00191 HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
00192
00193
00194 b_current = 1;
00195 lexeme_no = 0;
00196
00197
00198 for (vector_no = 0; vector_no < samplerows; vector_no++)
00199 {
00200 Datum value;
00201 bool isnull;
00202 TSVector vector;
00203 WordEntry *curentryptr;
00204 char *lexemesptr;
00205 int j;
00206
00207 vacuum_delay_point();
00208
00209 value = fetchfunc(stats, vector_no, &isnull);
00210
00211
00212
00213
00214 if (isnull)
00215 {
00216 null_cnt++;
00217 continue;
00218 }
00219
00220
00221
00222
00223
00224
00225
00226 total_width += VARSIZE_ANY(DatumGetPointer(value));
00227
00228
00229
00230
00231 vector = DatumGetTSVector(value);
00232
00233
00234
00235
00236
00237
00238
00239 lexemesptr = STRPTR(vector);
00240 curentryptr = ARRPTR(vector);
00241 for (j = 0; j < vector->size; j++)
00242 {
00243 bool found;
00244
00245
00246 hash_key.lexeme = lexemesptr + curentryptr->pos;
00247 hash_key.length = curentryptr->len;
00248
00249
00250 item = (TrackItem *) hash_search(lexemes_tab,
00251 (const void *) &hash_key,
00252 HASH_ENTER, &found);
00253
00254 if (found)
00255 {
00256
00257 item->frequency++;
00258 }
00259 else
00260 {
00261
00262 item->frequency = 1;
00263 item->delta = b_current - 1;
00264 }
00265
00266
00267 lexeme_no++;
00268
00269
00270 if (lexeme_no % bucket_width == 0)
00271 {
00272 prune_lexemes_hashtable(lexemes_tab, b_current);
00273 b_current++;
00274 }
00275
00276
00277 curentryptr++;
00278 }
00279 }
00280
00281
00282 if (null_cnt < samplerows)
00283 {
00284 int nonnull_cnt = samplerows - null_cnt;
00285 int i;
00286 TrackItem **sort_table;
00287 int track_len;
00288 int cutoff_freq;
00289 int minfreq,
00290 maxfreq;
00291
00292 stats->stats_valid = true;
00293
00294 stats->stanullfrac = (double) null_cnt / (double) samplerows;
00295 stats->stawidth = total_width / (double) nonnull_cnt;
00296
00297
00298 stats->stadistinct = -1.0;
00299
00300
00301
00302
00303
00304
00305
00306
00307
00308 cutoff_freq = 9 * lexeme_no / bucket_width;
00309
00310 i = hash_get_num_entries(lexemes_tab);
00311 sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);
00312
00313 hash_seq_init(&scan_status, lexemes_tab);
00314 track_len = 0;
00315 minfreq = lexeme_no;
00316 maxfreq = 0;
00317 while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
00318 {
00319 if (item->frequency > cutoff_freq)
00320 {
00321 sort_table[track_len++] = item;
00322 minfreq = Min(minfreq, item->frequency);
00323 maxfreq = Max(maxfreq, item->frequency);
00324 }
00325 }
00326 Assert(track_len <= i);
00327
00328
00329 elog(DEBUG3, "tsvector_stats: target # mces = %d, bucket width = %d, "
00330 "# lexemes = %d, hashtable size = %d, usable entries = %d",
00331 num_mcelem, bucket_width, lexeme_no, i, track_len);
00332
00333
00334
00335
00336
00337
00338 if (num_mcelem < track_len)
00339 {
00340 qsort(sort_table, track_len, sizeof(TrackItem *),
00341 trackitem_compare_frequencies_desc);
00342
00343 minfreq = sort_table[num_mcelem - 1]->frequency;
00344 }
00345 else
00346 num_mcelem = track_len;
00347
00348
00349 if (num_mcelem > 0)
00350 {
00351 MemoryContext old_context;
00352 Datum *mcelem_values;
00353 float4 *mcelem_freqs;
00354
00355
00356
00357
00358
00359
00360
00361
00362
00363
00364
00365
00366
00367
00368
00369 qsort(sort_table, num_mcelem, sizeof(TrackItem *),
00370 trackitem_compare_lexemes);
00371
00372
00373 old_context = MemoryContextSwitchTo(stats->anl_context);
00374
00375
00376
00377
00378
00379
00380
00381
00382
00383
00384
00385
00386 mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
00387 mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4));
00388
00389
00390
00391
00392
00393 for (i = 0; i < num_mcelem; i++)
00394 {
00395 TrackItem *item = sort_table[i];
00396
00397 mcelem_values[i] =
00398 PointerGetDatum(cstring_to_text_with_len(item->key.lexeme,
00399 item->key.length));
00400 mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt;
00401 }
00402 mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
00403 mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt;
00404 MemoryContextSwitchTo(old_context);
00405
00406 stats->stakind[0] = STATISTIC_KIND_MCELEM;
00407 stats->staop[0] = TextEqualOperator;
00408 stats->stanumbers[0] = mcelem_freqs;
00409
00410 stats->numnumbers[0] = num_mcelem + 2;
00411 stats->stavalues[0] = mcelem_values;
00412 stats->numvalues[0] = num_mcelem;
00413
00414 stats->statypid[0] = TEXTOID;
00415 stats->statyplen[0] = -1;
00416 stats->statypbyval[0] = false;
00417 stats->statypalign[0] = 'i';
00418 }
00419 }
00420 else
00421 {
00422
00423 stats->stats_valid = true;
00424 stats->stanullfrac = 1.0;
00425 stats->stawidth = 0;
00426 stats->stadistinct = 0.0;
00427 }
00428
00429
00430
00431
00432
00433 }
00434
00435
00436
00437
00438
00439 static void
00440 prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current)
00441 {
00442 HASH_SEQ_STATUS scan_status;
00443 TrackItem *item;
00444
00445 hash_seq_init(&scan_status, lexemes_tab);
00446 while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
00447 {
00448 if (item->frequency + item->delta <= b_current)
00449 {
00450 if (hash_search(lexemes_tab, (const void *) &item->key,
00451 HASH_REMOVE, NULL) == NULL)
00452 elog(ERROR, "hash table corrupted");
00453 }
00454 }
00455 }
00456
00457
00458
00459
00460
00461 static uint32
00462 lexeme_hash(const void *key, Size keysize)
00463 {
00464 const LexemeHashKey *l = (const LexemeHashKey *) key;
00465
00466 return DatumGetUInt32(hash_any((const unsigned char *) l->lexeme,
00467 l->length));
00468 }
00469
00470
00471
00472
00473 static int
00474 lexeme_match(const void *key1, const void *key2, Size keysize)
00475 {
00476
00477 return lexeme_compare(key1, key2);
00478 }
00479
00480
00481
00482
00483 static int
00484 lexeme_compare(const void *key1, const void *key2)
00485 {
00486 const LexemeHashKey *d1 = (const LexemeHashKey *) key1;
00487 const LexemeHashKey *d2 = (const LexemeHashKey *) key2;
00488
00489
00490 if (d1->length > d2->length)
00491 return 1;
00492 else if (d1->length < d2->length)
00493 return -1;
00494
00495 return strncmp(d1->lexeme, d2->lexeme, d1->length);
00496 }
00497
00498
00499
00500
00501 static int
00502 trackitem_compare_frequencies_desc(const void *e1, const void *e2)
00503 {
00504 const TrackItem *const * t1 = (const TrackItem *const *) e1;
00505 const TrackItem *const * t2 = (const TrackItem *const *) e2;
00506
00507 return (*t2)->frequency - (*t1)->frequency;
00508 }
00509
00510
00511
00512
00513 static int
00514 trackitem_compare_lexemes(const void *e1, const void *e2)
00515 {
00516 const TrackItem *const * t1 = (const TrackItem *const *) e1;
00517 const TrackItem *const * t2 = (const TrackItem *const *) e2;
00518
00519 return lexeme_compare(&(*t1)->key, &(*t2)->key);
00520 }