PostgreSQL — The world's most advanced open source database.

hashjoin.h — annotated source listing from the PostgreSQL documentation.
00001 /*-------------------------------------------------------------------------
00002  *
00003  * hashjoin.h
00004  *    internal structures for hash joins
00005  *
00006  *
00007  * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
00008  * Portions Copyright (c) 1994, Regents of the University of California
00009  *
00010  * src/include/executor/hashjoin.h
00011  *
00012  *-------------------------------------------------------------------------
00013  */
00014 #ifndef HASHJOIN_H
00015 #define HASHJOIN_H
00016 
00017 #include "nodes/execnodes.h"
00018 #include "storage/buffile.h"
00019 
00020 /* ----------------------------------------------------------------
00021  *              hash-join hash table structures
00022  *
00023  * Each active hashjoin has a HashJoinTable control block, which is
00024  * palloc'd in the executor's per-query context.  All other storage needed
00025  * for the hashjoin is kept in private memory contexts, two for each hashjoin.
00026  * This makes it easy and fast to release the storage when we don't need it
00027  * anymore.  (Exception: data associated with the temp files lives in the
00028  * per-query context too, since we always call buffile.c in that context.)
00029  *
00030  * The hashtable contexts are made children of the per-query context, ensuring
00031  * that they will be discarded at end of statement even if the join is
00032  * aborted early by an error.  (Likewise, any temporary files we make will
00033  * be cleaned up by the virtual file manager in event of an error.)
00034  *
00035  * Storage that should live through the entire join is allocated from the
00036  * "hashCxt", while storage that is only wanted for the current batch is
00037  * allocated in the "batchCxt".  By resetting the batchCxt at the end of
00038  * each batch, we free all the per-batch storage reliably and without tedium.
00039  *
00040  * During first scan of inner relation, we get its tuples from executor.
00041  * If nbatch > 1 then tuples that don't belong in first batch get saved
00042  * into inner-batch temp files. The same statements apply for the
00043  * first scan of the outer relation, except we write tuples to outer-batch
00044  * temp files.  After finishing the first scan, we do the following for
00045  * each remaining batch:
00046  *  1. Read tuples from inner batch file, load into hash buckets.
00047  *  2. Read tuples from outer batch file, match to hash buckets and output.
00048  *
00049  * It is possible to increase nbatch on the fly if the in-memory hash table
00050  * gets too big.  The hash-value-to-batch computation is arranged so that this
00051  * can only cause a tuple to go into a later batch than previously thought,
00052  * never into an earlier batch.  When we increase nbatch, we rescan the hash
00053  * table and dump out any tuples that are now of a later batch to the correct
00054  * inner batch file.  Subsequently, while reading either inner or outer batch
00055  * files, we might find tuples that no longer belong to the current batch;
00056  * if so, we just dump them out to the correct batch file.
00057  * ----------------------------------------------------------------
00058  */
00059 
00060 /* these are in nodes/execnodes.h: */
00061 /* typedef struct HashJoinTupleData *HashJoinTuple; */
00062 /* typedef struct HashJoinTableData *HashJoinTable; */
00063 
00064 typedef struct HashJoinTupleData
00065 {
00066     struct HashJoinTupleData *next;     /* link to next tuple in same bucket */
00067     uint32      hashvalue;      /* tuple's hash code */
00068     /* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
00069 }   HashJoinTupleData;
00070 
00071 #define HJTUPLE_OVERHEAD  MAXALIGN(sizeof(HashJoinTupleData))
00072 #define HJTUPLE_MINTUPLE(hjtup)  \
00073     ((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
00074 
00075 /*
00076  * If the outer relation's distribution is sufficiently nonuniform, we attempt
00077  * to optimize the join by treating the hash values corresponding to the outer
00078  * relation's MCVs specially.  Inner relation tuples matching these hash
00079  * values go into the "skew" hashtable instead of the main hashtable, and
00080  * outer relation tuples with these hash values are matched against that
00081  * table instead of the main one.  Thus, tuples with these hash values are
00082  * effectively handled as part of the first batch and will never go to disk.
00083  * The skew hashtable is limited to SKEW_WORK_MEM_PERCENT of the total memory
00084  * allowed for the join; while building the hashtables, we decrease the number
00085  * of MCVs being specially treated if needed to stay under this limit.
00086  *
00087  * Note: you might wonder why we look at the outer relation stats for this,
00088  * rather than the inner.  One reason is that the outer relation is typically
00089  * bigger, so we get more I/O savings by optimizing for its most common values.
00090  * Also, for similarly-sized relations, the planner prefers to put the more
00091  * uniformly distributed relation on the inside, so we're more likely to find
00092  * interesting skew in the outer relation.
00093  */
00094 typedef struct HashSkewBucket
00095 {
00096     uint32      hashvalue;      /* common hash value */
00097     HashJoinTuple tuples;       /* linked list of inner-relation tuples */
00098 } HashSkewBucket;
00099 
00100 #define SKEW_BUCKET_OVERHEAD  MAXALIGN(sizeof(HashSkewBucket))
00101 #define INVALID_SKEW_BUCKET_NO  (-1)
00102 #define SKEW_WORK_MEM_PERCENT  2
00103 #define SKEW_MIN_OUTER_FRACTION  0.01
00104 
00105 
00106 typedef struct HashJoinTableData
00107 {
00108     int         nbuckets;       /* # buckets in the in-memory hash table */
00109     int         log2_nbuckets;  /* its log2 (nbuckets must be a power of 2) */
00110 
00111     /* buckets[i] is head of list of tuples in i'th in-memory bucket */
00112     struct HashJoinTupleData **buckets;
00113     /* buckets array is per-batch storage, as are all the tuples */
00114 
00115     bool        keepNulls;      /* true to store unmatchable NULL tuples */
00116 
00117     bool        skewEnabled;    /* are we using skew optimization? */
00118     HashSkewBucket **skewBucket;    /* hashtable of skew buckets */
00119     int         skewBucketLen;  /* size of skewBucket array (a power of 2!) */
00120     int         nSkewBuckets;   /* number of active skew buckets */
00121     int        *skewBucketNums; /* array indexes of active skew buckets */
00122 
00123     int         nbatch;         /* number of batches */
00124     int         curbatch;       /* current batch #; 0 during 1st pass */
00125 
00126     int         nbatch_original;    /* nbatch when we started inner scan */
00127     int         nbatch_outstart;    /* nbatch when we started outer scan */
00128 
00129     bool        growEnabled;    /* flag to shut off nbatch increases */
00130 
00131     double      totalTuples;    /* # tuples obtained from inner plan */
00132 
00133     /*
00134      * These arrays are allocated for the life of the hash join, but only if
00135      * nbatch > 1.  A file is opened only when we first write a tuple into it
00136      * (otherwise its pointer remains NULL).  Note that the zero'th array
00137      * elements never get used, since we will process rather than dump out any
00138      * tuples of batch zero.
00139      */
00140     BufFile   **innerBatchFile; /* buffered virtual temp file per batch */
00141     BufFile   **outerBatchFile; /* buffered virtual temp file per batch */
00142 
00143     /*
00144      * Info about the datatype-specific hash functions for the datatypes being
00145      * hashed. These are arrays of the same length as the number of hash join
00146      * clauses (hash keys).
00147      */
00148     FmgrInfo   *outer_hashfunctions;    /* lookup data for hash functions */
00149     FmgrInfo   *inner_hashfunctions;    /* lookup data for hash functions */
00150     bool       *hashStrict;     /* is each hash join operator strict? */
00151 
00152     Size        spaceUsed;      /* memory space currently used by tuples */
00153     Size        spaceAllowed;   /* upper limit for space used */
00154     Size        spacePeak;      /* peak space used */
00155     Size        spaceUsedSkew;  /* skew hash table's current space usage */
00156     Size        spaceAllowedSkew;       /* upper limit for skew hashtable */
00157 
00158     MemoryContext hashCxt;      /* context for whole-hash-join storage */
00159     MemoryContext batchCxt;     /* context for this-batch-only storage */
00160 }   HashJoinTableData;
00161 
00162 #endif   /* HASHJOIN_H */