LLVM API Documentation

OnDiskHashTable.h
Go to the documentation of this file.
00001 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 ///
00010 /// \file
00011 /// \brief Defines facilities for reading and writing on-disk hash tables.
00012 ///
00013 //===----------------------------------------------------------------------===//
00014 #ifndef LLVM_SUPPORT_ONDISKHASHTABLE_H
00015 #define LLVM_SUPPORT_ONDISKHASHTABLE_H
00016 
00017 #include "llvm/Support/Allocator.h"
00018 #include "llvm/Support/AlignOf.h"
00019 #include "llvm/Support/DataTypes.h"
00020 #include "llvm/Support/EndianStream.h"
00021 #include "llvm/Support/Host.h"
00022 #include "llvm/Support/MathExtras.h"
00023 #include "llvm/Support/raw_ostream.h"
00024 #include <cassert>
00025 #include <cstdlib>
00026 
00027 namespace llvm {
00028 
00029 /// \brief Generates an on disk hash table.
00030 ///
00031 /// This needs an \c Info that handles storing values into the hash table's
00032 /// payload and computes the hash for a given key. This should provide the
00033 /// following interface:
00034 ///
00035 /// \code
00036 /// class ExampleInfo {
00037 /// public:
00038 ///   typedef ExampleKey key_type;   // Must be copy constructible
00039 ///   typedef ExampleKey &key_type_ref;
00040 ///   typedef ExampleData data_type; // Must be copy constructible
00041 ///   typedef ExampleData &data_type_ref;
00042 ///   typedef uint32_t hash_value_type; // The type the hash function returns.
00043 ///   typedef uint32_t offset_type; // The type for offsets into the table.
00044 ///
00045 ///   /// Calculate the hash for Key
00046 ///   static hash_value_type ComputeHash(key_type_ref Key);
00047 ///   /// Return the lengths, in bytes, of the given Key/Data pair.
00048 ///   static std::pair<offset_type, offset_type>
00049 ///   EmitKeyDataLength(raw_ostream &Out, key_type_ref Key, data_type_ref Data);
00050 ///   /// Write Key to Out.  KeyLen is the length from EmitKeyDataLength.
00051 ///   static void EmitKey(raw_ostream &Out, key_type_ref Key,
00052 ///                       offset_type KeyLen);
00053 ///   /// Write Data to Out.  DataLen is the length from EmitKeyDataLength.
00054 ///   static void EmitData(raw_ostream &Out, key_type_ref Key,
00055 ///                        data_type_ref Data, offset_type DataLen);
00056 /// };
00057 /// \endcode
00058 template <typename Info> class OnDiskChainedHashTableGenerator {
00059   /// \brief A single item in the hash table.
00060   class Item {
00061   public:
00062     typename Info::key_type Key;
00063     typename Info::data_type Data;
00064     Item *Next;
00065     const typename Info::hash_value_type Hash;
00066 
00067     Item(typename Info::key_type_ref Key, typename Info::data_type_ref Data,
00068          Info &InfoObj)
00069         : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {}
00070   };
00071 
00072   typedef typename Info::offset_type offset_type;
00073   offset_type NumBuckets;
00074   offset_type NumEntries;
00075   llvm::SpecificBumpPtrAllocator<Item> BA;
00076 
00077   /// \brief A linked list of values in a particular hash bucket.
00078   class Bucket {
00079   public:
00080     offset_type Off;
00081     Item *Head;
00082     unsigned Length;
00083 
00084     Bucket() {}
00085   };
00086 
00087   Bucket *Buckets;
00088 
00089 private:
00090   /// \brief Insert an item into the appropriate hash bucket.
00091   void insert(Bucket *Buckets, size_t Size, Item *E) {
00092     Bucket &B = Buckets[E->Hash & (Size - 1)];
00093     E->Next = B.Head;
00094     ++B.Length;
00095     B.Head = E;
00096   }
00097 
00098   /// \brief Resize the hash table, moving the old entries into the new buckets.
00099   void resize(size_t NewSize) {
00100     Bucket *NewBuckets = (Bucket *)std::calloc(NewSize, sizeof(Bucket));
00101     // Populate NewBuckets with the old entries.
00102     for (size_t I = 0; I < NumBuckets; ++I)
00103       for (Item *E = Buckets[I].Head; E;) {
00104         Item *N = E->Next;
00105         E->Next = nullptr;
00106         insert(NewBuckets, NewSize, E);
00107         E = N;
00108       }
00109 
00110     free(Buckets);
00111     NumBuckets = NewSize;
00112     Buckets = NewBuckets;
00113   }
00114 
00115 public:
00116   /// \brief Insert an entry into the table.
00117   void insert(typename Info::key_type_ref Key,
00118               typename Info::data_type_ref Data) {
00119     Info InfoObj;
00120     insert(Key, Data, InfoObj);
00121   }
00122 
00123   /// \brief Insert an entry into the table.
00124   ///
00125   /// Uses the provided Info instead of a stack allocated one.
00126   void insert(typename Info::key_type_ref Key,
00127               typename Info::data_type_ref Data, Info &InfoObj) {
00128 
00129     ++NumEntries;
00130     if (4 * NumEntries >= 3 * NumBuckets)
00131       resize(NumBuckets * 2);
00132     insert(Buckets, NumBuckets, new (BA.Allocate()) Item(Key, Data, InfoObj));
00133   }
00134 
00135   /// \brief Emit the table to Out, which must not be at offset 0.
00136   offset_type Emit(raw_ostream &Out) {
00137     Info InfoObj;
00138     return Emit(Out, InfoObj);
00139   }
00140 
00141   /// \brief Emit the table to Out, which must not be at offset 0.
00142   ///
00143   /// Uses the provided Info instead of a stack allocated one.
00144   offset_type Emit(raw_ostream &Out, Info &InfoObj) {
00145     using namespace llvm::support;
00146     endian::Writer<little> LE(Out);
00147 
00148     // Emit the payload of the table.
00149     for (offset_type I = 0; I < NumBuckets; ++I) {
00150       Bucket &B = Buckets[I];
00151       if (!B.Head)
00152         continue;
00153 
00154       // Store the offset for the data of this bucket.
00155       B.Off = Out.tell();
00156       assert(B.Off && "Cannot write a bucket at offset 0. Please add padding.");
00157 
00158       // Write out the number of items in the bucket.
00159       LE.write<uint16_t>(B.Length);
00160       assert(B.Length != 0 && "Bucket has a head but zero length?");
00161 
00162       // Write out the entries in the bucket.
00163       for (Item *I = B.Head; I; I = I->Next) {
00164         LE.write<typename Info::hash_value_type>(I->Hash);
00165         const std::pair<offset_type, offset_type> &Len =
00166             InfoObj.EmitKeyDataLength(Out, I->Key, I->Data);
00167         InfoObj.EmitKey(Out, I->Key, Len.first);
00168         InfoObj.EmitData(Out, I->Key, I->Data, Len.second);
00169       }
00170     }
00171 
00172     // Pad with zeros so that we can start the hashtable at an aligned address.
00173     offset_type TableOff = Out.tell();
00174     uint64_t N = llvm::OffsetToAlignment(TableOff, alignOf<offset_type>());
00175     TableOff += N;
00176     while (N--)
00177       LE.write<uint8_t>(0);
00178 
00179     // Emit the hashtable itself.
00180     LE.write<offset_type>(NumBuckets);
00181     LE.write<offset_type>(NumEntries);
00182     for (offset_type I = 0; I < NumBuckets; ++I)
00183       LE.write<offset_type>(Buckets[I].Off);
00184 
00185     return TableOff;
00186   }
00187 
00188   OnDiskChainedHashTableGenerator() {
00189     NumEntries = 0;
00190     NumBuckets = 64;
00191     // Note that we do not need to run the constructors of the individual
00192     // Bucket objects since 'calloc' returns bytes that are all 0.
00193     Buckets = (Bucket *)std::calloc(NumBuckets, sizeof(Bucket));
00194   }
00195 
00196   ~OnDiskChainedHashTableGenerator() { std::free(Buckets); }
00197 };
00198 
00199 /// \brief Provides lookup on an on disk hash table.
00200 ///
00201 /// This needs an \c Info that handles reading values from the hash table's
00202 /// payload and computes the hash for a given key. This should provide the
00203 /// following interface:
00204 ///
00205 /// \code
00206 /// class ExampleLookupInfo {
00207 /// public:
00208 ///   typedef ExampleData data_type;
00209 ///   typedef ExampleInternalKey internal_key_type; // The stored key type.
00210 ///   typedef ExampleKey external_key_type; // The type to pass to find().
00211 ///   typedef uint32_t hash_value_type; // The type the hash function returns.
00212 ///   typedef uint32_t offset_type; // The type for offsets into the table.
00213 ///
00214 ///   /// Compare two keys for equality.
00215 ///   static bool EqualKey(internal_key_type &Key1, internal_key_type &Key2);
00216 ///   /// Calculate the hash for the given key.
00217 ///   static hash_value_type ComputeHash(internal_key_type &IKey);
00218 ///   /// Translate from the semantic type of a key in the hash table to the
00219 ///   /// type that is actually stored and used for hashing and comparisons.
00220 ///   /// The internal and external types are often the same, in which case this
00221 ///   /// can simply return the passed in value.
00222 ///   static const internal_key_type &GetInternalKey(external_key_type &EKey);
00223 ///   /// Read the key and data length from Buffer, leaving it pointing at the
00224 ///   /// following byte.
00225 ///   static std::pair<offset_type, offset_type>
00226 ///   ReadKeyDataLength(const unsigned char *&Buffer);
00227 ///   /// Read the key from Buffer, given the KeyLen as reported from
00228 ///   /// ReadKeyDataLength.
00229 ///   const internal_key_type &ReadKey(const unsigned char *Buffer,
00230 ///                                    offset_type KeyLen);
00231 ///   /// Read the data for Key from Buffer, given the DataLen as reported from
00232 ///   /// ReadKeyDataLength.
00233 ///   data_type ReadData(StringRef Key, const unsigned char *Buffer,
00234 ///                      offset_type DataLen);
00235 /// };
00236 /// \endcode
00237 template <typename Info> class OnDiskChainedHashTable {
00238   const typename Info::offset_type NumBuckets;
00239   const typename Info::offset_type NumEntries;
00240   const unsigned char *const Buckets;
00241   const unsigned char *const Base;
00242   Info InfoObj;
00243 
00244 public:
00245   typedef typename Info::internal_key_type internal_key_type;
00246   typedef typename Info::external_key_type external_key_type;
00247   typedef typename Info::data_type         data_type;
00248   typedef typename Info::hash_value_type   hash_value_type;
00249   typedef typename Info::offset_type       offset_type;
00250 
00251   OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
00252                          const unsigned char *Buckets,
00253                          const unsigned char *Base,
00254                          const Info &InfoObj = Info())
00255       : NumBuckets(NumBuckets), NumEntries(NumEntries), Buckets(Buckets),
00256         Base(Base), InfoObj(InfoObj) {
00257     assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
00258            "'buckets' must have a 4-byte alignment");
00259   }
00260 
00261   offset_type getNumBuckets() const { return NumBuckets; }
00262   offset_type getNumEntries() const { return NumEntries; }
00263   const unsigned char *getBase() const { return Base; }
00264   const unsigned char *getBuckets() const { return Buckets; }
00265 
00266   bool isEmpty() const { return NumEntries == 0; }
00267 
00268   class iterator {
00269     internal_key_type Key;
00270     const unsigned char *const Data;
00271     const offset_type Len;
00272     Info *InfoObj;
00273 
00274   public:
00275     iterator() : Data(nullptr), Len(0) {}
00276     iterator(const internal_key_type K, const unsigned char *D, offset_type L,
00277              Info *InfoObj)
00278         : Key(K), Data(D), Len(L), InfoObj(InfoObj) {}
00279 
00280     data_type operator*() const { return InfoObj->ReadData(Key, Data, Len); }
00281     bool operator==(const iterator &X) const { return X.Data == Data; }
00282     bool operator!=(const iterator &X) const { return X.Data != Data; }
00283   };
00284 
00285   /// \brief Look up the stored data for a particular key.
00286   iterator find(const external_key_type &EKey, Info *InfoPtr = 0) {
00287     if (!InfoPtr)
00288       InfoPtr = &InfoObj;
00289 
00290     using namespace llvm::support;
00291     const internal_key_type &IKey = InfoObj.GetInternalKey(EKey);
00292     hash_value_type KeyHash = InfoObj.ComputeHash(IKey);
00293 
00294     // Each bucket is just an offset into the hash table file.
00295     offset_type Idx = KeyHash & (NumBuckets - 1);
00296     const unsigned char *Bucket = Buckets + sizeof(offset_type) * Idx;
00297 
00298     offset_type Offset = endian::readNext<offset_type, little, aligned>(Bucket);
00299     if (Offset == 0)
00300       return iterator(); // Empty bucket.
00301     const unsigned char *Items = Base + Offset;
00302 
00303     // 'Items' starts with a 16-bit unsigned integer representing the
00304     // number of items in this bucket.
00305     unsigned Len = endian::readNext<uint16_t, little, unaligned>(Items);
00306 
00307     for (unsigned i = 0; i < Len; ++i) {
00308       // Read the hash.
00309       hash_value_type ItemHash =
00310           endian::readNext<hash_value_type, little, unaligned>(Items);
00311 
00312       // Determine the length of the key and the data.
00313       const std::pair<offset_type, offset_type> &L =
00314           Info::ReadKeyDataLength(Items);
00315       offset_type ItemLen = L.first + L.second;
00316 
00317       // Compare the hashes.  If they are not the same, skip the entry entirely.
00318       if (ItemHash != KeyHash) {
00319         Items += ItemLen;
00320         continue;
00321       }
00322 
00323       // Read the key.
00324       const internal_key_type &X =
00325           InfoPtr->ReadKey((const unsigned char *const)Items, L.first);
00326 
00327       // If the key doesn't match just skip reading the value.
00328       if (!InfoPtr->EqualKey(X, IKey)) {
00329         Items += ItemLen;
00330         continue;
00331       }
00332 
00333       // The key matches!
00334       return iterator(X, Items + L.first, L.second, InfoPtr);
00335     }
00336 
00337     return iterator();
00338   }
00339 
00340   iterator end() const { return iterator(); }
00341 
00342   Info &getInfoObj() { return InfoObj; }
00343 
00344   /// \brief Create the hash table.
00345   ///
00346   /// \param Buckets is the beginning of the hash table itself, which follows
00347   /// the payload of entire structure. This is the value returned by
00348   /// OnDiskHashTableGenerator::Emit.
00349   ///
00350   /// \param Base is the point from which all offsets into the structure are
00351   /// based. This is offset 0 in the stream that was used when Emitting the
00352   /// table.
00353   static OnDiskChainedHashTable *Create(const unsigned char *Buckets,
00354                                         const unsigned char *const Base,
00355                                         const Info &InfoObj = Info()) {
00356     using namespace llvm::support;
00357     assert(Buckets > Base);
00358     assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
00359            "buckets should be 4-byte aligned.");
00360 
00361     offset_type NumBuckets =
00362         endian::readNext<offset_type, little, aligned>(Buckets);
00363     offset_type NumEntries =
00364         endian::readNext<offset_type, little, aligned>(Buckets);
00365     return new OnDiskChainedHashTable<Info>(NumBuckets, NumEntries, Buckets,
00366                                             Base, InfoObj);
00367   }
00368 };
00369 
00370 /// \brief Provides lookup and iteration over an on disk hash table.
00371 ///
00372 /// \copydetails llvm::OnDiskChainedHashTable
00373 template <typename Info>
00374 class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> {
00375   const unsigned char *Payload;
00376 
00377 public:
00378   typedef OnDiskChainedHashTable<Info>          base_type;
00379   typedef typename base_type::internal_key_type internal_key_type;
00380   typedef typename base_type::external_key_type external_key_type;
00381   typedef typename base_type::data_type         data_type;
00382   typedef typename base_type::hash_value_type   hash_value_type;
00383   typedef typename base_type::offset_type       offset_type;
00384 
00385   OnDiskIterableChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
00386                                  const unsigned char *Buckets,
00387                                  const unsigned char *Payload,
00388                                  const unsigned char *Base,
00389                                  const Info &InfoObj = Info())
00390       : base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj),
00391         Payload(Payload) {}
00392 
00393   /// \brief Iterates over all of the keys in the table.
00394   class key_iterator {
00395     const unsigned char *Ptr;
00396     offset_type NumItemsInBucketLeft;
00397     offset_type NumEntriesLeft;
00398     Info *InfoObj;
00399 
00400   public:
00401     typedef external_key_type value_type;
00402 
00403     key_iterator(const unsigned char *const Ptr, offset_type NumEntries,
00404                  Info *InfoObj)
00405         : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
00406           InfoObj(InfoObj) {}
00407     key_iterator()
00408         : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
00409           InfoObj(0) {}
00410 
00411     friend bool operator==(const key_iterator &X, const key_iterator &Y) {
00412       return X.NumEntriesLeft == Y.NumEntriesLeft;
00413     }
00414     friend bool operator!=(const key_iterator &X, const key_iterator &Y) {
00415       return X.NumEntriesLeft != Y.NumEntriesLeft;
00416     }
00417 
00418     key_iterator &operator++() { // Preincrement
00419       using namespace llvm::support;
00420       if (!NumItemsInBucketLeft) {
00421         // 'Items' starts with a 16-bit unsigned integer representing the
00422         // number of items in this bucket.
00423         NumItemsInBucketLeft =
00424             endian::readNext<uint16_t, little, unaligned>(Ptr);
00425       }
00426       Ptr += sizeof(hash_value_type); // Skip the hash.
00427       // Determine the length of the key and the data.
00428       const std::pair<offset_type, offset_type> &L =
00429           Info::ReadKeyDataLength(Ptr);
00430       Ptr += L.first + L.second;
00431       assert(NumItemsInBucketLeft);
00432       --NumItemsInBucketLeft;
00433       assert(NumEntriesLeft);
00434       --NumEntriesLeft;
00435       return *this;
00436     }
00437     key_iterator operator++(int) { // Postincrement
00438       key_iterator tmp = *this; ++*this; return tmp;
00439     }
00440 
00441     value_type operator*() const {
00442       const unsigned char *LocalPtr = Ptr;
00443       if (!NumItemsInBucketLeft)
00444         LocalPtr += 2; // number of items in bucket
00445       LocalPtr += sizeof(hash_value_type); // Skip the hash.
00446 
00447       // Determine the length of the key and the data.
00448       const std::pair<offset_type, offset_type> &L =
00449           Info::ReadKeyDataLength(LocalPtr);
00450 
00451       // Read the key.
00452       const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
00453       return InfoObj->GetExternalKey(Key);
00454     }
00455   };
00456 
00457   key_iterator key_begin() {
00458     return key_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
00459   }
00460   key_iterator key_end() { return key_iterator(); }
00461 
00462   iterator_range<key_iterator> keys() {
00463     return make_range(key_begin(), key_end());
00464   }
00465 
00466   /// \brief Iterates over all the entries in the table, returning the data.
00467   class data_iterator {
00468     const unsigned char *Ptr;
00469     offset_type NumItemsInBucketLeft;
00470     offset_type NumEntriesLeft;
00471     Info *InfoObj;
00472 
00473   public:
00474     typedef data_type value_type;
00475 
00476     data_iterator(const unsigned char *const Ptr, offset_type NumEntries,
00477                   Info *InfoObj)
00478         : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
00479           InfoObj(InfoObj) {}
00480     data_iterator()
00481         : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
00482           InfoObj(nullptr) {}
00483 
00484     bool operator==(const data_iterator &X) const {
00485       return X.NumEntriesLeft == NumEntriesLeft;
00486     }
00487     bool operator!=(const data_iterator &X) const {
00488       return X.NumEntriesLeft != NumEntriesLeft;
00489     }
00490 
00491     data_iterator &operator++() { // Preincrement
00492       using namespace llvm::support;
00493       if (!NumItemsInBucketLeft) {
00494         // 'Items' starts with a 16-bit unsigned integer representing the
00495         // number of items in this bucket.
00496         NumItemsInBucketLeft =
00497             endian::readNext<uint16_t, little, unaligned>(Ptr);
00498       }
00499       Ptr += sizeof(hash_value_type); // Skip the hash.
00500       // Determine the length of the key and the data.
00501       const std::pair<offset_type, offset_type> &L =
00502           Info::ReadKeyDataLength(Ptr);
00503       Ptr += L.first + L.second;
00504       assert(NumItemsInBucketLeft);
00505       --NumItemsInBucketLeft;
00506       assert(NumEntriesLeft);
00507       --NumEntriesLeft;
00508       return *this;
00509     }
00510     data_iterator operator++(int) { // Postincrement
00511       data_iterator tmp = *this; ++*this; return tmp;
00512     }
00513 
00514     value_type operator*() const {
00515       const unsigned char *LocalPtr = Ptr;
00516       if (!NumItemsInBucketLeft)
00517         LocalPtr += 2; // number of items in bucket
00518       LocalPtr += sizeof(hash_value_type); // Skip the hash.
00519 
00520       // Determine the length of the key and the data.
00521       const std::pair<offset_type, offset_type> &L =
00522           Info::ReadKeyDataLength(LocalPtr);
00523 
00524       // Read the key.
00525       const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
00526       return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
00527     }
00528   };
00529 
00530   data_iterator data_begin() {
00531     return data_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
00532   }
00533   data_iterator data_end() { return data_iterator(); }
00534 
00535   iterator_range<data_iterator> data() {
00536     return make_range(data_begin(), data_end());
00537   }
00538 
00539   /// \brief Create the hash table.
00540   ///
00541   /// \param Buckets is the beginning of the hash table itself, which follows
00542   /// the payload of entire structure. This is the value returned by
00543   /// OnDiskHashTableGenerator::Emit.
00544   ///
00545   /// \param Payload is the beginning of the data contained in the table.  This
00546   /// is Base plus any padding or header data that was stored, ie, the offset
00547   /// that the stream was at when calling Emit.
00548   ///
00549   /// \param Base is the point from which all offsets into the structure are
00550   /// based. This is offset 0 in the stream that was used when Emitting the
00551   /// table.
00552   static OnDiskIterableChainedHashTable *
00553   Create(const unsigned char *Buckets, const unsigned char *const Payload,
00554          const unsigned char *const Base, const Info &InfoObj = Info()) {
00555     using namespace llvm::support;
00556     assert(Buckets > Base);
00557     assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
00558            "buckets should be 4-byte aligned.");
00559 
00560     offset_type NumBuckets =
00561         endian::readNext<offset_type, little, aligned>(Buckets);
00562     offset_type NumEntries =
00563         endian::readNext<offset_type, little, aligned>(Buckets);
00564     return new OnDiskIterableChainedHashTable<Info>(
00565         NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj);
00566   }
00567 };
00568 
00569 } // end namespace llvm
00570 
00571 #endif