00001 /*------------------------------------------------------------------------- 00002 * 00003 * pg_statistic.h 00004 * definition of the system "statistic" relation (pg_statistic) 00005 * along with the relation's initial contents. 00006 * 00007 * 00008 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group 00009 * Portions Copyright (c) 1994, Regents of the University of California 00010 * 00011 * src/include/catalog/pg_statistic.h 00012 * 00013 * NOTES 00014 * the genbki.pl script reads this file and generates .bki 00015 * information from the DATA() statements. 00016 * 00017 *------------------------------------------------------------------------- 00018 */ 00019 #ifndef PG_STATISTIC_H 00020 #define PG_STATISTIC_H 00021 00022 #include "catalog/genbki.h" 00023 00024 /* ---------------- 00025 * pg_statistic definition. cpp turns this into 00026 * typedef struct FormData_pg_statistic 00027 * ---------------- 00028 */ 00029 #define StatisticRelationId 2619 00030 00031 CATALOG(pg_statistic,2619) BKI_WITHOUT_OIDS 00032 { 00033 /* These fields form the unique key for the entry: */ 00034 Oid starelid; /* relation containing attribute */ 00035 int16 staattnum; /* attribute (column) stats are for */ 00036 bool stainherit; /* true if inheritance children are included */ 00037 00038 /* the fraction of the column's entries that are NULL: */ 00039 float4 stanullfrac; 00040 00041 /* 00042 * stawidth is the average width in bytes of non-null entries. For 00043 * fixed-width datatypes this is of course the same as the typlen, but for 00044 * var-width types it is more useful. Note that this is the average width 00045 * of the data as actually stored, post-TOASTing (eg, for a 00046 * moved-out-of-line value, only the size of the pointer object is 00047 * counted). This is the appropriate definition for the primary use of 00048 * the statistic, which is to estimate sizes of in-memory hash tables of 00049 * tuples. 00050 */ 00051 int32 stawidth; 00052 00053 /* ---------------- 00054 * stadistinct indicates the (approximate) number of distinct non-null 00055 * data values in the column. The interpretation is: 00056 * 0 unknown or not computed 00057 * > 0 actual number of distinct values 00058 * < 0 negative of multiplier for number of rows 00059 * The special negative case allows us to cope with columns that are 00060 * unique (stadistinct = -1) or nearly so (for example, a column in 00061 * which values appear about twice on the average could be represented 00062 * by stadistinct = -0.5). Because the number-of-rows statistic in 00063 * pg_class may be updated more frequently than pg_statistic is, it's 00064 * important to be able to describe such situations as a multiple of 00065 * the number of rows, rather than a fixed number of distinct values. 00066 * But in other cases a fixed number is correct (eg, a boolean column). 00067 * ---------------- 00068 */ 00069 float4 stadistinct; 00070 00071 /* ---------------- 00072 * To allow keeping statistics on different kinds of datatypes, 00073 * we do not hard-wire any particular meaning for the remaining 00074 * statistical fields. Instead, we provide several "slots" in which 00075 * statistical data can be placed. Each slot includes: 00076 * kind integer code identifying kind of data (see below) 00077 * op OID of associated operator, if needed 00078 * numbers float4 array (for statistical values) 00079 * values anyarray (for representations of data values) 00080 * The ID and operator fields are never NULL; they are zeroes in an 00081 * unused slot. The numbers and values fields are NULL in an unused 00082 * slot, and might also be NULL in a used slot if the slot kind has 00083 * no need for one or the other. 00084 * ---------------- 00085 */ 00086 00087 int16 stakind1; 00088 int16 stakind2; 00089 int16 stakind3; 00090 int16 stakind4; 00091 int16 stakind5; 00092 00093 Oid staop1; 00094 Oid staop2; 00095 Oid staop3; 00096 Oid staop4; 00097 Oid staop5; 00098 00099 #ifdef CATALOG_VARLEN /* variable-length fields start here */ 00100 float4 stanumbers1[1]; 00101 float4 stanumbers2[1]; 00102 float4 stanumbers3[1]; 00103 float4 stanumbers4[1]; 00104 float4 stanumbers5[1]; 00105 00106 /* 00107 * Values in these arrays are values of the column's data type, or of some 00108 * related type such as an array element type. We presently have to cheat 00109 * quite a bit to allow polymorphic arrays of this kind, but perhaps 00110 * someday it'll be a less bogus facility. 00111 */ 00112 anyarray stavalues1; 00113 anyarray stavalues2; 00114 anyarray stavalues3; 00115 anyarray stavalues4; 00116 anyarray stavalues5; 00117 #endif 00118 } FormData_pg_statistic; 00119 00120 #define STATISTIC_NUM_SLOTS 5 00121 00122 00123 /* ---------------- 00124 * Form_pg_statistic corresponds to a pointer to a tuple with 00125 * the format of pg_statistic relation. 00126 * ---------------- 00127 */ 00128 typedef FormData_pg_statistic *Form_pg_statistic; 00129 00130 /* ---------------- 00131 * compiler constants for pg_statistic 00132 * ---------------- 00133 */ 00134 #define Natts_pg_statistic 26 00135 #define Anum_pg_statistic_starelid 1 00136 #define Anum_pg_statistic_staattnum 2 00137 #define Anum_pg_statistic_stainherit 3 00138 #define Anum_pg_statistic_stanullfrac 4 00139 #define Anum_pg_statistic_stawidth 5 00140 #define Anum_pg_statistic_stadistinct 6 00141 #define Anum_pg_statistic_stakind1 7 00142 #define Anum_pg_statistic_stakind2 8 00143 #define Anum_pg_statistic_stakind3 9 00144 #define Anum_pg_statistic_stakind4 10 00145 #define Anum_pg_statistic_stakind5 11 00146 #define Anum_pg_statistic_staop1 12 00147 #define Anum_pg_statistic_staop2 13 00148 #define Anum_pg_statistic_staop3 14 00149 #define Anum_pg_statistic_staop4 15 00150 #define Anum_pg_statistic_staop5 16 00151 #define Anum_pg_statistic_stanumbers1 17 00152 #define Anum_pg_statistic_stanumbers2 18 00153 #define Anum_pg_statistic_stanumbers3 19 00154 #define Anum_pg_statistic_stanumbers4 20 00155 #define Anum_pg_statistic_stanumbers5 21 00156 #define Anum_pg_statistic_stavalues1 22 00157 #define Anum_pg_statistic_stavalues2 23 00158 #define Anum_pg_statistic_stavalues3 24 00159 #define Anum_pg_statistic_stavalues4 25 00160 #define Anum_pg_statistic_stavalues5 26 00161 00162 /* 00163 * Currently, five statistical slot "kinds" are defined by core PostgreSQL, 00164 * as documented below. Additional "kinds" will probably appear in 00165 * future to help cope with non-scalar datatypes. Also, custom data types 00166 * can define their own "kind" codes by mutual agreement between a custom 00167 * typanalyze routine and the selectivity estimation functions of the type's 00168 * operators. 00169 * 00170 * Code reading the pg_statistic relation should not assume that a particular 00171 * data "kind" will appear in any particular slot. Instead, search the 00172 * stakind fields to see if the desired data is available. (The standard 00173 * function get_attstatsslot() may be used for this.) 00174 */ 00175 00176 /* 00177 * The present allocation of "kind" codes is: 00178 * 00179 * 1-99: reserved for assignment by the core PostgreSQL project 00180 * (values in this range will be documented in this file) 00181 * 100-199: reserved for assignment by the PostGIS project 00182 * (values to be documented in PostGIS documentation) 00183 * 200-299: reserved for assignment by the ESRI ST_Geometry project 00184 * (values to be documented in ESRI ST_Geometry documentation) 00185 * 300-9999: reserved for future public assignments 00186 * 00187 * For private use you may choose a "kind" code at random in the range 00188 * 10000-30000. However, for code that is to be widely disseminated it is 00189 * better to obtain a publicly defined "kind" code by request from the 00190 * PostgreSQL Global Development Group. 00191 */ 00192 00193 /* 00194 * In a "most common values" slot, staop is the OID of the "=" operator 00195 * used to decide whether values are the same or not. stavalues contains 00196 * the K most common non-null values appearing in the column, and stanumbers 00197 * contains their frequencies (fractions of total row count). The values 00198 * shall be ordered in decreasing frequency. Note that since the arrays are 00199 * variable-size, K may be chosen by the statistics collector. Values should 00200 * not appear in MCV unless they have been observed to occur more than once; 00201 * a unique column will have no MCV slot. 00202 */ 00203 #define STATISTIC_KIND_MCV 1 00204 00205 /* 00206 * A "histogram" slot describes the distribution of scalar data. staop is 00207 * the OID of the "<" operator that describes the sort ordering. (In theory, 00208 * more than one histogram could appear, if a datatype has more than one 00209 * useful sort operator.) stavalues contains M (>=2) non-null values that 00210 * divide the non-null column data values into M-1 bins of approximately equal 00211 * population. The first stavalues item is the MIN and the last is the MAX. 00212 * stanumbers is not used and should be NULL. IMPORTANT POINT: if an MCV 00213 * slot is also provided, then the histogram describes the data distribution 00214 * *after removing the values listed in MCV* (thus, it's a "compressed 00215 * histogram" in the technical parlance). This allows a more accurate 00216 * representation of the distribution of a column with some very-common 00217 * values. In a column with only a few distinct values, it's possible that 00218 * the MCV list describes the entire data population; in this case the 00219 * histogram reduces to empty and should be omitted. 00220 */ 00221 #define STATISTIC_KIND_HISTOGRAM 2 00222 00223 /* 00224 * A "correlation" slot describes the correlation between the physical order 00225 * of table tuples and the ordering of data values of this column, as seen 00226 * by the "<" operator identified by staop. (As with the histogram, more 00227 * than one entry could theoretically appear.) stavalues is not used and 00228 * should be NULL. stanumbers contains a single entry, the correlation 00229 * coefficient between the sequence of data values and the sequence of 00230 * their actual tuple positions. The coefficient ranges from +1 to -1. 00231 */ 00232 #define STATISTIC_KIND_CORRELATION 3 00233 00234 /* 00235 * A "most common elements" slot is similar to a "most common values" slot, 00236 * except that it stores the most common non-null *elements* of the column 00237 * values. This is useful when the column datatype is an array or some other 00238 * type with identifiable elements (for instance, tsvector). staop contains 00239 * the equality operator appropriate to the element type. stavalues contains 00240 * the most common element values, and stanumbers their frequencies. Unlike 00241 * MCV slots, frequencies are measured as the fraction of non-null rows the 00242 * element value appears in, not the frequency of all rows. Also unlike 00243 * MCV slots, the values are sorted into the element type's default order 00244 * (to support binary search for a particular value). Since this puts the 00245 * minimum and maximum frequencies at unpredictable spots in stanumbers, 00246 * there are two extra members of stanumbers, holding copies of the minimum 00247 * and maximum frequencies. Optionally, there can be a third extra member, 00248 * which holds the frequency of null elements (expressed in the same terms: 00249 * the fraction of non-null rows that contain at least one null element). If 00250 * this member is omitted, the column is presumed to contain no null elements. 00251 * 00252 * Note: in current usage for tsvector columns, the stavalues elements are of 00253 * type text, even though their representation within tsvector is not 00254 * exactly text. 00255 */ 00256 #define STATISTIC_KIND_MCELEM 4 00257 00258 /* 00259 * A "distinct elements count histogram" slot describes the distribution of 00260 * the number of distinct element values present in each row of an array-type 00261 * column. Only non-null rows are considered, and only non-null elements. 00262 * staop contains the equality operator appropriate to the element type. 00263 * stavalues is not used and should be NULL. The last member of stanumbers is 00264 * the average count of distinct element values over all non-null rows. The 00265 * preceding M (>=2) members form a histogram that divides the population of 00266 * distinct-elements counts into M-1 bins of approximately equal population. 00267 * The first of these is the minimum observed count, and the last the maximum. 00268 */ 00269 #define STATISTIC_KIND_DECHIST 5 00270 00271 /* 00272 * A "length histogram" slot describes the distribution of range lengths in 00273 * rows of a range-type column. stanumbers contains a single entry, the 00274 * fraction of empty ranges. stavalues is a histogram of non-empty lengths, in 00275 * a format similar to STATISTIC_KIND_HISTOGRAM: it contains M (>=2) range 00276 * values that divide the column data values into M-1 bins of approximately 00277 * equal population. The lengths are stores as float8s, as measured by the 00278 * range type's subdiff function. Only non-null rows are considered. 00279 */ 00280 #define STATISTIC_KIND_RANGE_LENGTH_HISTOGRAM 6 00281 00282 /* 00283 * A "bounds histogram" slot is similar to STATISTIC_KIND_HISTOGRAM, but for 00284 * a range-type column. stavalues contains M (>=2) range values that divide 00285 * the column data values into M-1 bins of approximately equal population. 00286 * Unlike a regular scalar histogram, this is actually two histograms combined 00287 * into a single array, with the lower bounds of each value forming a 00288 * histogram of lower bounds, and the upper bounds a histogram of upper 00289 * bounds. Only non-NULL, non-empty ranges are included. 00290 */ 00291 #define STATISTIC_KIND_BOUNDS_HISTOGRAM 7 00292 00293 #endif /* PG_STATISTIC_H */