00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #include "postgres.h"
00016
00017 #include "access/tuptoaster.h"
00018 #include "catalog/pg_collation.h"
00019 #include "commands/vacuum.h"
00020 #include "utils/array.h"
00021 #include "utils/datum.h"
00022 #include "utils/lsyscache.h"
00023 #include "utils/typcache.h"
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033 #define ARRAY_WIDTH_THRESHOLD 0x10000
00034
00035
00036 typedef struct
00037 {
00038
00039 Oid type_id;
00040 Oid eq_opr;
00041 bool typbyval;
00042 int16 typlen;
00043 char typalign;
00044
00045
00046
00047
00048
00049
00050 FmgrInfo *cmp;
00051 FmgrInfo *hash;
00052
00053
00054 AnalyzeAttrComputeStatsFunc std_compute_stats;
00055 void *std_extra_data;
00056 } ArrayAnalyzeExtraData;
00057
00058
00059
00060
00061
00062
00063
00064 static ArrayAnalyzeExtraData *array_extra_data;
00065
00066
00067 typedef struct
00068 {
00069 Datum key;
00070 int frequency;
00071 int delta;
00072 int last_container;
00073 } TrackItem;
00074
00075
00076 typedef struct
00077 {
00078 int count;
00079 int frequency;
00080 } DECountItem;
00081
00082 static void compute_array_stats(VacAttrStats *stats,
00083 AnalyzeAttrFetchFunc fetchfunc, int samplerows, double totalrows);
00084 static void prune_element_hashtable(HTAB *elements_tab, int b_current);
00085 static uint32 element_hash(const void *key, Size keysize);
00086 static int element_match(const void *key1, const void *key2, Size keysize);
00087 static int element_compare(const void *key1, const void *key2);
00088 static int trackitem_compare_frequencies_desc(const void *e1, const void *e2);
00089 static int trackitem_compare_element(const void *e1, const void *e2);
00090 static int countitem_compare_count(const void *e1, const void *e2);
00091
00092
00093
00094
00095
00096 Datum
00097 array_typanalyze(PG_FUNCTION_ARGS)
00098 {
00099 VacAttrStats *stats = (VacAttrStats *) PG_GETARG_POINTER(0);
00100 Oid element_typeid;
00101 TypeCacheEntry *typentry;
00102 ArrayAnalyzeExtraData *extra_data;
00103
00104
00105
00106
00107
00108 if (!std_typanalyze(stats))
00109 PG_RETURN_BOOL(false);
00110
00111
00112
00113
00114 element_typeid = get_base_element_type(stats->attrtypid);
00115 if (!OidIsValid(element_typeid))
00116 elog(ERROR, "array_typanalyze was invoked for non-array type %u",
00117 stats->attrtypid);
00118
00119
00120
00121
00122
00123 typentry = lookup_type_cache(element_typeid,
00124 TYPECACHE_EQ_OPR |
00125 TYPECACHE_CMP_PROC_FINFO |
00126 TYPECACHE_HASH_PROC_FINFO);
00127
00128 if (!OidIsValid(typentry->eq_opr) ||
00129 !OidIsValid(typentry->cmp_proc_finfo.fn_oid) ||
00130 !OidIsValid(typentry->hash_proc_finfo.fn_oid))
00131 PG_RETURN_BOOL(true);
00132
00133
00134 extra_data = (ArrayAnalyzeExtraData *) palloc(sizeof(ArrayAnalyzeExtraData));
00135 extra_data->type_id = typentry->type_id;
00136 extra_data->eq_opr = typentry->eq_opr;
00137 extra_data->typbyval = typentry->typbyval;
00138 extra_data->typlen = typentry->typlen;
00139 extra_data->typalign = typentry->typalign;
00140 extra_data->cmp = &typentry->cmp_proc_finfo;
00141 extra_data->hash = &typentry->hash_proc_finfo;
00142
00143
00144 extra_data->std_compute_stats = stats->compute_stats;
00145 extra_data->std_extra_data = stats->extra_data;
00146
00147
00148 stats->compute_stats = compute_array_stats;
00149 stats->extra_data = extra_data;
00150
00151
00152
00153
00154
00155
00156 PG_RETURN_BOOL(true);
00157 }
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213 static void
00214 compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
00215 int samplerows, double totalrows)
00216 {
00217 ArrayAnalyzeExtraData *extra_data;
00218 int num_mcelem;
00219 int null_cnt = 0;
00220 int null_elem_cnt = 0;
00221 int analyzed_rows = 0;
00222
00223
00224 HTAB *elements_tab;
00225 HASHCTL elem_hash_ctl;
00226 HASH_SEQ_STATUS scan_status;
00227
00228
00229 int b_current;
00230
00231
00232 int bucket_width;
00233 int array_no;
00234 int64 element_no;
00235 TrackItem *item;
00236 int slot_idx;
00237 HTAB *count_tab;
00238 HASHCTL count_hash_ctl;
00239 DECountItem *count_item;
00240
00241 extra_data = (ArrayAnalyzeExtraData *) stats->extra_data;
00242
00243
00244
00245
00246
00247
00248 stats->extra_data = extra_data->std_extra_data;
00249 (*extra_data->std_compute_stats) (stats, fetchfunc, samplerows, totalrows);
00250 stats->extra_data = extra_data;
00251
00252
00253
00254
00255
00256
00257 array_extra_data = extra_data;
00258
00259
00260
00261
00262
00263
00264
00265 num_mcelem = stats->attr->attstattarget * 10;
00266
00267
00268
00269
00270
00271 bucket_width = num_mcelem * 1000 / 7;
00272
00273
00274
00275
00276
00277
00278 MemSet(&elem_hash_ctl, 0, sizeof(elem_hash_ctl));
00279 elem_hash_ctl.keysize = sizeof(Datum);
00280 elem_hash_ctl.entrysize = sizeof(TrackItem);
00281 elem_hash_ctl.hash = element_hash;
00282 elem_hash_ctl.match = element_match;
00283 elem_hash_ctl.hcxt = CurrentMemoryContext;
00284 elements_tab = hash_create("Analyzed elements table",
00285 num_mcelem,
00286 &elem_hash_ctl,
00287 HASH_ELEM | HASH_FUNCTION | HASH_COMPARE | HASH_CONTEXT);
00288
00289
00290 MemSet(&count_hash_ctl, 0, sizeof(count_hash_ctl));
00291 count_hash_ctl.keysize = sizeof(int);
00292 count_hash_ctl.entrysize = sizeof(DECountItem);
00293 count_hash_ctl.hash = tag_hash;
00294 count_hash_ctl.hcxt = CurrentMemoryContext;
00295 count_tab = hash_create("Array distinct element count table",
00296 64,
00297 &count_hash_ctl,
00298 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
00299
00300
00301 b_current = 1;
00302 element_no = 0;
00303
00304
00305 for (array_no = 0; array_no < samplerows; array_no++)
00306 {
00307 Datum value;
00308 bool isnull;
00309 ArrayType *array;
00310 int num_elems;
00311 Datum *elem_values;
00312 bool *elem_nulls;
00313 bool null_present;
00314 int j;
00315 int64 prev_element_no = element_no;
00316 int distinct_count;
00317 bool count_item_found;
00318
00319 vacuum_delay_point();
00320
00321 value = fetchfunc(stats, array_no, &isnull);
00322 if (isnull)
00323 {
00324
00325 null_cnt++;
00326 continue;
00327 }
00328
00329
00330 if (toast_raw_datum_size(value) > ARRAY_WIDTH_THRESHOLD)
00331 continue;
00332 else
00333 analyzed_rows++;
00334
00335
00336
00337
00338 array = DatumGetArrayTypeP(value);
00339
00340 Assert(ARR_ELEMTYPE(array) == extra_data->type_id);
00341 deconstruct_array(array,
00342 extra_data->type_id,
00343 extra_data->typlen,
00344 extra_data->typbyval,
00345 extra_data->typalign,
00346 &elem_values, &elem_nulls, &num_elems);
00347
00348
00349
00350
00351
00352 null_present = false;
00353 for (j = 0; j < num_elems; j++)
00354 {
00355 Datum elem_value;
00356 bool found;
00357
00358
00359 if (elem_nulls[j])
00360 {
00361 null_present = true;
00362 continue;
00363 }
00364
00365
00366 elem_value = elem_values[j];
00367 item = (TrackItem *) hash_search(elements_tab,
00368 (const void *) &elem_value,
00369 HASH_ENTER, &found);
00370
00371 if (found)
00372 {
00373
00374
00375
00376
00377
00378
00379 if (item->last_container == array_no)
00380 continue;
00381
00382 item->frequency++;
00383 item->last_container = array_no;
00384 }
00385 else
00386 {
00387
00388
00389
00390
00391
00392
00393
00394
00395
00396 item->key = datumCopy(elem_value,
00397 extra_data->typbyval,
00398 extra_data->typlen);
00399
00400 item->frequency = 1;
00401 item->delta = b_current - 1;
00402 item->last_container = array_no;
00403 }
00404
00405
00406 element_no++;
00407
00408
00409 if (element_no % bucket_width == 0)
00410 {
00411 prune_element_hashtable(elements_tab, b_current);
00412 b_current++;
00413 }
00414 }
00415
00416
00417 if (null_present)
00418 null_elem_cnt++;
00419
00420
00421 distinct_count = (int) (element_no - prev_element_no);
00422 count_item = (DECountItem *) hash_search(count_tab, &distinct_count,
00423 HASH_ENTER,
00424 &count_item_found);
00425
00426 if (count_item_found)
00427 count_item->frequency++;
00428 else
00429 count_item->frequency = 1;
00430
00431
00432 if (PointerGetDatum(array) != value)
00433 pfree(array);
00434 pfree(elem_values);
00435 pfree(elem_nulls);
00436 }
00437
00438
00439 slot_idx = 0;
00440 while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0)
00441 slot_idx++;
00442 if (slot_idx > STATISTIC_NUM_SLOTS - 2)
00443 elog(ERROR, "insufficient pg_statistic slots for array stats");
00444
00445
00446 if (analyzed_rows > 0)
00447 {
00448 int nonnull_cnt = analyzed_rows;
00449 int count_items_count;
00450 int i;
00451 TrackItem **sort_table;
00452 int track_len;
00453 int64 cutoff_freq;
00454 int64 minfreq,
00455 maxfreq;
00456
00457
00458
00459
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472 cutoff_freq = 9 * element_no / bucket_width;
00473
00474 i = hash_get_num_entries(elements_tab);
00475 sort_table = (TrackItem **) palloc(sizeof(TrackItem *) * i);
00476
00477 hash_seq_init(&scan_status, elements_tab);
00478 track_len = 0;
00479 minfreq = element_no;
00480 maxfreq = 0;
00481 while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
00482 {
00483 if (item->frequency > cutoff_freq)
00484 {
00485 sort_table[track_len++] = item;
00486 minfreq = Min(minfreq, item->frequency);
00487 maxfreq = Max(maxfreq, item->frequency);
00488 }
00489 }
00490 Assert(track_len <= i);
00491
00492
00493 elog(DEBUG3, "compute_array_stats: target # mces = %d, "
00494 "bucket width = %d, "
00495 "# elements = " INT64_FORMAT ", hashtable size = %d, "
00496 "usable entries = %d",
00497 num_mcelem, bucket_width, element_no, i, track_len);
00498
00499
00500
00501
00502
00503
00504 if (num_mcelem < track_len)
00505 {
00506 qsort(sort_table, track_len, sizeof(TrackItem *),
00507 trackitem_compare_frequencies_desc);
00508
00509 minfreq = sort_table[num_mcelem - 1]->frequency;
00510 }
00511 else
00512 num_mcelem = track_len;
00513
00514
00515 if (num_mcelem > 0)
00516 {
00517 MemoryContext old_context;
00518 Datum *mcelem_values;
00519 float4 *mcelem_freqs;
00520
00521
00522
00523
00524
00525
00526 qsort(sort_table, num_mcelem, sizeof(TrackItem *),
00527 trackitem_compare_element);
00528
00529
00530 old_context = MemoryContextSwitchTo(stats->anl_context);
00531
00532
00533
00534
00535
00536
00537
00538 mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum));
00539 mcelem_freqs = (float4 *) palloc((num_mcelem + 3) * sizeof(float4));
00540
00541
00542
00543
00544
00545 for (i = 0; i < num_mcelem; i++)
00546 {
00547 TrackItem *item = sort_table[i];
00548
00549 mcelem_values[i] = datumCopy(item->key,
00550 extra_data->typbyval,
00551 extra_data->typlen);
00552 mcelem_freqs[i] = (double) item->frequency /
00553 (double) nonnull_cnt;
00554 }
00555 mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt;
00556 mcelem_freqs[i++] = (double) maxfreq / (double) nonnull_cnt;
00557 mcelem_freqs[i++] = (double) null_elem_cnt / (double) nonnull_cnt;
00558
00559 MemoryContextSwitchTo(old_context);
00560
00561 stats->stakind[slot_idx] = STATISTIC_KIND_MCELEM;
00562 stats->staop[slot_idx] = extra_data->eq_opr;
00563 stats->stanumbers[slot_idx] = mcelem_freqs;
00564
00565 stats->numnumbers[slot_idx] = num_mcelem + 3;
00566 stats->stavalues[slot_idx] = mcelem_values;
00567 stats->numvalues[slot_idx] = num_mcelem;
00568
00569 stats->statypid[slot_idx] = extra_data->type_id;
00570 stats->statyplen[slot_idx] = extra_data->typlen;
00571 stats->statypbyval[slot_idx] = extra_data->typbyval;
00572 stats->statypalign[slot_idx] = extra_data->typalign;
00573 slot_idx++;
00574 }
00575
00576
00577 count_items_count = hash_get_num_entries(count_tab);
00578 if (count_items_count > 0)
00579 {
00580 int num_hist = stats->attr->attstattarget;
00581 DECountItem **sorted_count_items;
00582 int j;
00583 int delta;
00584 int64 frac;
00585 float4 *hist;
00586
00587
00588 num_hist = Max(num_hist, 2);
00589
00590
00591
00592
00593
00594 sorted_count_items = (DECountItem **)
00595 palloc(sizeof(DECountItem *) * count_items_count);
00596 hash_seq_init(&scan_status, count_tab);
00597 j = 0;
00598 while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL)
00599 {
00600 sorted_count_items[j++] = count_item;
00601 }
00602 qsort(sorted_count_items, count_items_count,
00603 sizeof(DECountItem *), countitem_compare_count);
00604
00605
00606
00607
00608
00609 hist = (float4 *)
00610 MemoryContextAlloc(stats->anl_context,
00611 sizeof(float4) * (num_hist + 1));
00612 hist[num_hist] = (double) element_no / (double) nonnull_cnt;
00613
00614
00615
00616
00617
00618
00619
00620
00621
00622
00623
00624
00625
00626
00627
00628
00629
00630
00631
00632
00633
00634
00635
00636
00637
00638
00639
00640
00641
00642
00643
00644
00645 delta = analyzed_rows - 1;
00646 j = 0;
00647
00648 frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1);
00649 for (i = 0; i < num_hist; i++)
00650 {
00651 while (frac <= 0)
00652 {
00653
00654 j++;
00655 frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1);
00656 }
00657 hist[i] = sorted_count_items[j]->count;
00658 frac -= delta;
00659 }
00660 Assert(j == count_items_count - 1);
00661
00662 stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST;
00663 stats->staop[slot_idx] = extra_data->eq_opr;
00664 stats->stanumbers[slot_idx] = hist;
00665 stats->numnumbers[slot_idx] = num_hist + 1;
00666 slot_idx++;
00667 }
00668 }
00669
00670
00671
00672
00673
00674 }
00675
00676
00677
00678
00679
00680 static void
00681 prune_element_hashtable(HTAB *elements_tab, int b_current)
00682 {
00683 HASH_SEQ_STATUS scan_status;
00684 TrackItem *item;
00685
00686 hash_seq_init(&scan_status, elements_tab);
00687 while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
00688 {
00689 if (item->frequency + item->delta <= b_current)
00690 {
00691 Datum value = item->key;
00692
00693 if (hash_search(elements_tab, (const void *) &item->key,
00694 HASH_REMOVE, NULL) == NULL)
00695 elog(ERROR, "hash table corrupted");
00696
00697 if (!array_extra_data->typbyval)
00698 pfree(DatumGetPointer(value));
00699 }
00700 }
00701 }
00702
00703
00704
00705
00706
00707
00708
00709 static uint32
00710 element_hash(const void *key, Size keysize)
00711 {
00712 Datum d = *((const Datum *) key);
00713 Datum h;
00714
00715 h = FunctionCall1Coll(array_extra_data->hash, DEFAULT_COLLATION_OID, d);
00716 return DatumGetUInt32(h);
00717 }
00718
00719
00720
00721
00722 static int
00723 element_match(const void *key1, const void *key2, Size keysize)
00724 {
00725
00726 return element_compare(key1, key2);
00727 }
00728
00729
00730
00731
00732
00733
00734
00735
00736
00737 static int
00738 element_compare(const void *key1, const void *key2)
00739 {
00740 Datum d1 = *((const Datum *) key1);
00741 Datum d2 = *((const Datum *) key2);
00742 Datum c;
00743
00744 c = FunctionCall2Coll(array_extra_data->cmp, DEFAULT_COLLATION_OID, d1, d2);
00745 return DatumGetInt32(c);
00746 }
00747
00748
00749
00750
00751 static int
00752 trackitem_compare_frequencies_desc(const void *e1, const void *e2)
00753 {
00754 const TrackItem *const * t1 = (const TrackItem *const *) e1;
00755 const TrackItem *const * t2 = (const TrackItem *const *) e2;
00756
00757 return (*t2)->frequency - (*t1)->frequency;
00758 }
00759
00760
00761
00762
00763 static int
00764 trackitem_compare_element(const void *e1, const void *e2)
00765 {
00766 const TrackItem *const * t1 = (const TrackItem *const *) e1;
00767 const TrackItem *const * t2 = (const TrackItem *const *) e2;
00768
00769 return element_compare(&(*t1)->key, &(*t2)->key);
00770 }
00771
00772
00773
00774
00775 static int
00776 countitem_compare_count(const void *e1, const void *e2)
00777 {
00778 const DECountItem *const * t1 = (const DECountItem *const *) e1;
00779 const DECountItem *const * t2 = (const DECountItem *const *) e2;
00780
00781 if ((*t1)->count < (*t2)->count)
00782 return -1;
00783 else if ((*t1)->count == (*t2)->count)
00784 return 0;
00785 else
00786 return 1;
00787 }