GraphLab: Distributed Graph-Parallel API  2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
hopscotch_table.hpp
1 #ifndef GRAPHLAB_UTIL_HOPSCOTCH_TABLE_HPP
2 #define GRAPHLAB_UTIL_HOPSCOTCH_TABLE_HPP
3 
4 #include <vector>
5 #include <utility>
6 #include <algorithm>
7 #include <functional>
8 #include <iterator>
9 
10 #include <graphlab/parallel/pthread_tools.hpp>
11 #include <graphlab/parallel/atomic.hpp>
12 
13 #include <boost/functional/hash.hpp>
14 #define _HOPSCOTCH_TABLE_DEFAULT_HASH boost::hash<T>
15 
16 
17 namespace graphlab {
18 
19 
20 /**
21  * This defines a hash table where each entry stores a
22  * fixed data type T. The data type T should be <b>small</b>
23  * and should preferably fit in a couple of words.
24  * This hash table is not resizeable. Use the hopscotch_map
25  * For a more general purpose table.
26  *
27  * Safe access is guaranteed if you restrict to the functions suffixed with
28  * _sync.
29  *
30  * \tparam T The data type stored in the hash table
31  * \tparam Synchronized Defaults to True. If True, locking is used to ensure
32  * safe reads and writes to the hash table.
33  * Even under "Synchronized", the only operations
34  * which are safe for parallel access are all functions
35  * suffixed with "sync".
36  * \tparam Hash The hash functor type. Defaults to std::hash<T> if C++11 is
37  * available. Otherwise defaults to boost::hash<T>
38  * \tparam KeyEqual The functor used to identify object equality. Defaults to
39  * std::equal_to<T>
40  */
41 template <typename T,
42  bool Synchronized = true,
43  typename Hash = _HOPSCOTCH_TABLE_DEFAULT_HASH,
44  typename KeyEqual = std::equal_to<T> >
46  public:
47  /// The data type stored in the table
48  typedef T value_type;
49  typedef size_t size_type;
50  /// The type of the hasher object
51  typedef Hash hasher;
52  /// The type of the equality tester
53  typedef KeyEqual equality_function;
54  /// A pointer to the data type stored in the table
55  typedef value_type* pointer;
56  /// A reference to the data type stored in the table
58  /// A constant pointer to the data type stored in the table
59  typedef const value_type* const_pointer;
60  /// A constant reference to the data type stored in the table
61  typedef const value_type& const_reference;
62 
63  private:
64  /// The actual contents of the hash table
65  struct element {
66  bool hasdata: 1; /// Whether this entry has data.
67  uint32_t field: 31; /// The hopscotch bitfield. Only 31 bits are useable
68  T elem; /// User data
69  element():hasdata(false), field(0) { }
70  };
71 
72  std::vector<element> data;
73  std::vector<simple_spinlock> locks;
74 
75  hasher hashfun;
76  equality_function equalfun;
77  atomic<size_t> numel;
78  size_t mask;
79 
80  /// Returns the next power of 2 of a value
81  static uint64_t next_powerof2(uint64_t val) {
82  --val;
83  val = val | (val >> 1);
84  val = val | (val >> 2);
85  val = val | (val >> 4);
86  val = val | (val >> 8);
87  val = val | (val >> 16);
88  val = val | (val >> 32);
89  return val + 1;
90  }
91 
92  /** Computes the hash of the data. And perturbs it
93  * using either CRC32 or Jenkin's 32-bit mix
94  */
95  size_t compute_hash(const value_type& d) const {
96  size_t state = hashfun(d);
97 #ifdef HAS_BUILTIN_CRC32
98  return __builtin_ia32_crc32di(0, state);
99 #else
100  /*
101  * Bob Jenkin's 32 bit integer mix function from
102  * http://home.comcast.net/~bretm/hash/3.html
103  */
104  state += (state << 12);
105  state ^= (state >> 22);
106  state += (state << 4);
107  state ^= (state >> 9);
108  state += (state << 10);
109  state ^= (state >> 2);
110  state += (state << 7);
111  state ^= (state >> 12);
112  return state;
113 #endif
114  }
115 
116 
117  /// Returns the lock ID associated with a given array index
118  static size_t associated_lock_id(size_t idx) {
119  return idx / 32;
120  }
121 
122 
123 
124  public:
125  /**
126  * Constructs a hopscotch table of a given length.
127  *
128  * \param len This rounded up to the next power of 2 will be used as
129  * the length of the table. This table is not resizeable.
130  * \param hashfun The hasher functor. Defaults to Hash()
131  * \param equalfun A functor used to test for equality. Defaults to KeyEqual()
132  */
133  hopscotch_table(size_t len,
134  Hash hashfun = Hash(),
135  KeyEqual equalfun = KeyEqual()):
136  data(next_powerof2(len) + 32),
137  locks(Synchronized ? data.size() / 32 + 1 : 0),
138  hashfun(hashfun),
139  equalfun(equalfun),
140  numel(0),
141  mask(data.size() - 32 - 1) {
142  }
143 
144  /// Returns the hash function used by the hash table
146  return hashfun;
147  }
148 
149  /// Returns the equality function used by the hash table
151  return equalfun;
152  }
153 
154  /**
155  * A const iterator which allows iteration over the hash table
156  * entries. Insertions may disrupt the iterator order. Deletions
157  * invalidate the iterator.
158  */
159  struct const_iterator {
160  typedef std::forward_iterator_tag iterator_category;
161  typedef const typename hopscotch_table::value_type value_type;
162  typedef size_t difference_type;
163  typedef value_type* pointer;
164  typedef value_type& reference;
165 
166  friend class hopscotch_table;
167 
168  const hopscotch_table* ptr;
169  typename std::vector<element>::const_iterator iter;
170 
171  const_iterator():ptr(NULL) {}
172 
173  const_iterator operator++() {
174  ++iter;
175  while(iter != ptr->data.end() && !iter->hasdata) {
176  ++iter;
177  }
178  return *this;
179  }
180 
181  const_iterator operator++(int) {
182  iterator cur = *this;
183  ++(*this);
184  return cur;
185  }
186 
187 
188  reference operator*() {
189  return iter->elem;
190  }
191 
192  pointer operator->() {
193  return &(iter->elem);
194  }
195 
196  bool operator==(const const_iterator it) const {
197  return ptr == it.ptr && iter == it.iter;
198  }
199 
200  bool operator!=(const const_iterator iter) const {
201  return !((*this) == iter);
202  }
203 
204 
205  private:
206  const_iterator(const hopscotch_table* table,
207  typename std::vector<element>::const_iterator iter):
208  ptr(table), iter(iter) { }
209  };
210 
211 
212 
213  /**
214  * A const iterator which allows iteration over the hash table
215  * entries. Insertions may disrupt the iterator order. Deletions
216  * invalidate the iterator.
217  */
218  struct iterator {
219  typedef std::forward_iterator_tag iterator_category;
220  typedef typename hopscotch_table::value_type value_type;
221  typedef size_t difference_type;
222  typedef value_type* pointer;
223  typedef value_type& reference;
224 
225  friend class hopscotch_table;
226 
227  hopscotch_table* ptr;
228  typename std::vector<element>::iterator iter;
229 
230  iterator():ptr(NULL) {}
231 
232 
233  operator const_iterator() const {
234  const_iterator it(ptr, iter);
235  return it;
236  }
237 
238  iterator operator++() {
239  ++iter;
240  while(iter != ptr->data.end() && !iter->hasdata) {
241  ++iter;
242  }
243  return *this;
244  }
245 
246  iterator operator++(int) {
247  iterator cur = *this;
248  ++(*this);
249  return cur;
250  }
251 
252 
253  reference operator*() {
254  return iter->elem;
255  }
256 
257  pointer operator->() {
258  return &(iter->elem);
259  }
260 
261  bool operator==(const iterator it) const {
262  return ptr == it.ptr && iter == it.iter;
263  }
264 
265  bool operator!=(const iterator iter) const {
266  return !((*this) == iter);
267  }
268 
269 
270  private:
271  iterator(hopscotch_table* table,
272  typename std::vector<element>::iterator iter):
273  ptr(table), iter(iter) { }
274  };
275 
276  /** Standard insert iterator. Writing into this iterator
277  * will cause insertions to occur. It is however, recommended
278  * that the insert() operation be used instead of the insert_iterator
279  * since the insert_iterator silently fails on insert failure.
280  */
282  hopscotch_table* cmap;
283  typedef std::forward_iterator_tag iterator_category;
284  typedef typename hopscotch_table::value_type value_type;
285 
286  insert_iterator(hopscotch_table* c):cmap(c) {}
287 
288  insert_iterator operator++() {
289  return (*this);
290  }
291  insert_iterator operator++(int) {
292  return (*this);
293  }
294 
295  insert_iterator& operator*() {
296  return *this;
297  }
298  insert_iterator& operator=(const insert_iterator& i) {
299  cmap = i.cmap;
300  return *this;
301  }
302 
303  insert_iterator& operator=(const value_type& v) {
304  cmap->insert(v);
305  return *this;
306  }
307  };
308 
309  private:
310  /**
311  * Searches for a target entry and overwrites if it exists
312  */
313  template <bool IsSynchronized>
314  iterator try_find_and_overwrite(const value_type& newdata,
315  size_t target,
316  bool overwrite) {
317  // find the next empty entry
318  size_t lockid = associated_lock_id(target);
319 
320  if (IsSynchronized) {
321  locks[lockid + 1].lock();
322  locks[lockid].lock();
323  }
324  iterator iter = find_impl(newdata, target);
325  if (iter != end() && overwrite) {
326  iter.iter->elem = newdata;
327  }
328  if (IsSynchronized) {
329  locks[lockid].unlock();
330  locks[lockid + 1].unlock();
331  }
332  return iter;
333  }
334 
335  /** Insert logic. If IsSynchronized is set, locks are used
336  * If overwrite is set, it will additionally check for existance
337  * of the entry and overwrite if it exists.
338  * Iterator is not going to be necessarily valid under parallel access.
339  */
340  template <bool IsSynchronized>
341  iterator insert_impl(const value_type& newdata, bool overwrite = true) {
342  // find the next empty entry
343  size_t target = compute_hash(newdata) & mask;
344 
345  iterator ret = try_find_and_overwrite<IsSynchronized>(newdata,
346  target,
347  overwrite);
348  if (ret != end()) return ret;
349 
350  // search for a place to stick it into
351  bool found = false;
352  size_t shift_target = target;
353  // let max range is 31 * 20
354  size_t limit = std::min(data.size(), target + 31 * 20);
355  size_t lockid = 0;
356  for (;shift_target < limit; shift_target++) {
357  if (data[shift_target].hasdata == false) {
358  lockid = associated_lock_id(shift_target);
359  if (IsSynchronized) locks[lockid].lock();
360  // double check
361  if (data[shift_target].hasdata == false) {
362  // yup still true.
363  // we got an empty value.
364  // quit the search
365  found = true;
366  break;
367  } else {
368  // nope. not empty anymore
369  // unlock and continue
370  if (IsSynchronized) locks[lockid].unlock();
371  }
372  }
373  }
374 
375  if (!found) {
376  // failed to find a place to put this value.
377  return iterator(this, data.end());
378  }
379 
380  // while the shift target is out of range
381  while(shift_target - target >= 31) {
382  // search backwards
383  // we would like to jump as far as possible
384  // find an hash entry whose field placed something
385  // between here and the shift target
386  // and move it to the shift target.
387  // for i = 31 to 1
388  found = false;
389  // lock one before the current lockid if available
390  if (IsSynchronized && lockid > 0) locks[lockid - 1].lock();
391 
392  for (size_t i = 30; i >= 1; --i) {
393  size_t r;
394  if (data[shift_target - i].field) {
395  r = __builtin_ctz(data[shift_target - i].field);
396  if (r <= i) {
397  // shift
398  size_t new_shift_target = shift_target - i + r;
399  assert(data[new_shift_target].hasdata);
400  data[shift_target].elem = data[new_shift_target].elem;
401  data[shift_target].hasdata = true;
402  data[new_shift_target].hasdata = false;
403  data[new_shift_target].elem = T();
404 
405  // unset the bit for r and set the bit for i
406  data[shift_target - i].field =
407  (data[shift_target - i].field & ~((uint32_t)1 << r))
408  | ((uint32_t)1 << i);
409  shift_target = new_shift_target;
410  found = true;
411  break;
412  }
413  }
414  }
415 
416  if (!found) {
417  // release all the locks acquired
418  if (IsSynchronized) {
419  locks[lockid].unlock();
420  if (lockid > 0) locks[lockid - 1].unlock();
421  }
422  return iterator(this, data.end());
423  }
424  else {
425 
426  if (IsSynchronized) {
427  // ok. depending on how far we went. we need to
428  // unlock one of lockid or lockid - 1
429  size_t newlockid = associated_lock_id(shift_target);
430  assert(newlockid == lockid || newlockid == lockid - 1);
431  if (newlockid == lockid) {
432  if (lockid > 0) locks[lockid - 1].unlock();
433  }
434  else if (newlockid == lockid - 1) {
435  locks[lockid].unlock();
436  }
437  lockid = newlockid;
438  }
439  }
440  }
441  // insert and return
442  // we need to lock ID - 1 so as to ensure intersection with the hash target
443  if (IsSynchronized && lockid > 0) locks[lockid - 1].lock();
444  data[shift_target].elem = newdata;
445  data[target].field |= (1 << (shift_target - target));
446  data[shift_target].hasdata = true;
447  if (IsSynchronized) {
448  ++numel;
449  if (lockid > 0) locks[lockid - 1].unlock();
450  locks[lockid].unlock();
451  }
452  else {
453  ++numel.value;
454  }
455  return iterator(this, data.begin() + shift_target);
456  }
457 
458 
459  /**
460  * Searches for an entry and returns an iterator to the entry.
461  * The hash entry of the key is provided.
462  * KeyEqual will be used to identify if an entry matches the request.
463  * return end() on failure.
464  */
465  const_iterator find_impl(const value_type& key, size_t target) const {
466  uint32_t field = data[target].field;
467  while (field > 0) {
468  int r = __builtin_ctz(field);
469  if (data[target + r].hasdata &&
470  key_eq()(data[target + r].elem, key)) {
471  return const_iterator(this, data.begin() + target + r);
472  }
473  else {
474  // mask out the current bit and try again.
475  field &= ~(((uint32_t)1 << r));
476  }
477  }
478  return const_iterator(this, data.end());
479  }
480 
481 
482  iterator find_impl(const value_type& key, size_t target) {
483  const_iterator iter = ((const hopscotch_table*)(this))->find_impl(key, target);
484  return iterator(this, data.begin() + (iter.iter - data.begin()));
485  }
486 
487  public:
488  /**
489  * Inserts an entry into the array.
490  * Returns an iterator to the just inserted data on success.
491  * If the entry already exists, it will be overwritten.
492  * Returns end() on failure.
493  */
494  iterator insert(const value_type& newdata) {
495  return insert_impl<false>(newdata);
496  }
497 
498  /**
499  * Inserts an entry into the array.
500  * Returns an iterator to the just inserted data on success.
501  * This function check if the entry already exists, if it does,
502  * do nothing
503  * Returns end() on failure.
504  */
506  return insert_impl<false>(newdata, false);
507  }
508 
509 
510 
511  /**
512  * Searches for an entry and returns an iterator to the entry.
513  * KeyEqual will be used to identify if an entry matches the request.
514  * return end() on failure.
515  */
516  const_iterator find(const value_type& key) const {
517  size_t target = compute_hash(key) & mask;
518  return find_impl(key, target);
519  }
520 
521  /**
522  * Searches for an entry and returns an iterator to the entry.
523  * KeyEqual will be used to identify if an entry matches the request.
524  * return end() on failure.
525  */
526  iterator find(const value_type& key) {
527  const_iterator iter = ((const hopscotch_table*)(this))->find(key);
528  return iterator(this, data.begin() + (iter.iter - data.begin()));
529  }
530 
531 
532  void clear() {
533  for (size_t i = 0;i < data.size(); ++i) {
534  data[i].hasdata = false;
535  data[i].field = 0;
536  data[i].elem = value_type();
537  }
538  numel = 0;
539  }
540 
541  /**
542  * Erases an entry pointed to by an iterator.
543  */
544  bool erase(iterator iter) {
545  if (iter.iter == data.end()) return false;
546  assert(iter.iter->hasdata);
547  size_t target = compute_hash(iter.iter->elem) & mask;
548  size_t offset = iter.iter - (data.begin() + target);
549  assert(offset < 31);
550  --numel.value;
551  iter.iter->hasdata = false;
552  iter.iter->elem = value_type();
553  data[target].field &= ~((uint32_t)1 << offset);
554  return true;
555  }
556 
557  /// Erases a entry matching a given value.
558  bool erase(const value_type& key) {
559  return erase(find(key));
560  }
561 
562  /// Returns an iterator to the start of the table
564  // find the first which is not empty
565  typename std::vector<element>::iterator iter = data.begin();
566  while (iter != data.end() && !iter->hasdata) {
567  ++iter;
568  }
569  return iterator(this, iter);
570  }
571 
572  /// Returns an iterator to the start of the table
574  // find the first which is not empty
575  typename std::vector<element>::iterator iter = data.begin();
576  while (iter != data.end() && !iter->hasdata) {
577  ++iter;
578  }
579  return const_iterator(this, iter);
580  }
581 
582  /// Returns an iterator to the end of the table
584  return iterator(this, data.end());
585  }
586 
587  /// Returns an iterator to the end of the table
588  const_iterator end() const {
589  return const_iterator(this, data.end());
590  }
591 
592  /// Returns 1 if the table contains a given element. 0 otherwise.
593  size_t count(const value_type& v) const {
594  return find(v) != end();
595  }
596 
597  /// Returns true if the table contains a given element. false otherwise.
598  bool contains(const value_type& v) const {
599  return find(v) != end();
600  }
601 
602  /// Returns the number of elements in the table
603  size_t size() const {
604  return numel;
605  }
606 
607  /// Returns the capacity of the table
608  size_t capacity() const {
609  return data.size();
610  }
611 
612  float load_factor() const {
613  return float(size()) / capacity();
614  }
615 
616  // now for the safe accessors
617 
618  /// Returns the size of the hash table. Safe under parallel access.
619  size_t size_sync() const {
620  return numel;
621  }
622 
623  /// Returns the capacity of the table
624  size_t capacity_sync() const {
625  return data.size();
626  }
627 
628  float load_factor_sync() const {
629  return float(size_sync()) / capacity_sync();
630  }
631 
632  hopscotch_table& operator=(const hopscotch_table& other) {
633  data = other.data;
634  locks.resize(other.locks.size());
635  hashfun = other.hashfun;
636  equalfun = other.equalfun;
637  numel = other.numel;
638  mask = other.mask;
639  return *this;
640  }
641 
642 
643  /** Inserts an element into the hash table. Safe under parallel access.
644  * if t already exists, it will be overwritten
645  */
646  bool put_sync(const T& t) {
647  // since data is not resizeable,
648  // data.end() is always valid.
649  return insert_impl<Synchronized>(t).iter != data.end();
650  }
651 
652 
653  /** Inserts an element into the hash table. Safe under parallel access.
654  * if t already exists, nothing will happen
655  */
656  bool put_do_not_overwrite_sync(const T& t) {
657  // since data is not resizeable,
658  // data.end() is always valid.
659  return insert_impl<Synchronized>(t, false).iter != data.end();
660  }
661 
662 
663  /** If the argument is found in the hash table,
664  * return {true, V} where V is the hash table content matching the argument.
665  * Otherwise {false, T()} is returned.
666  * KeyEqual() is used to compare entries.
667  * Safe under parallel access.
668  */
669  std::pair<bool, T> get_sync(const T& t) const {
670  // fast path. Try to get it without locking
671  const_iterator iter = find(t);
672  if (iter != end()) {
673  // take a snapshot of the data
674  element e = *(iter.iter);
675  if (e.hasdata && key_eq()(e.elem, t)) {
676  return std::make_pair(true, e.elem);
677  }
678  }
679 
680  if (!Synchronized) {
681  return std::make_pair(false, T());
682  }
683 
684  // slow path. lock
685  size_t target = compute_hash(t) & mask;
686  size_t lockid = associated_lock_id(target);
687  locks[lockid + 1].lock();
688  locks[lockid].lock();
689  iter = find_impl(t, target);
690  if (iter != end()) {
691  element e = *(iter.iter);
692  locks[lockid].unlock();
693  locks[lockid + 1].unlock();
694  assert(e.hasdata && key_eq()(e.elem, t));
695  return std::make_pair(true, e.elem);
696  }
697  else {
698  locks[lockid].unlock();
699  locks[lockid + 1].unlock();
700  return std::make_pair(false, T());
701  }
702  }
703 
704  /**
705  * Returns true if the erasure was successful.
706  * If false, the entry was not found in the hash table
707  */
708  bool erase_sync(const T& t) {
709  if (!Synchronized) {
710  return erase(t);
711  }
712 
713  // we need to find it first
714  size_t target = compute_hash(t) & mask;
715  size_t lockid = associated_lock_id(target);
716  // acquire locks around the target
717  locks[lockid + 1].lock();
718  locks[lockid].lock();
719  iterator iter = find(t);
720  if (iter == end()) {
721  locks[lockid].unlock();
722  locks[lockid + 1].unlock();
723  return false;
724  }
725  // now lets erase it
726  assert(iter.iter->hasdata);
727  size_t offset = iter.iter - (data.begin() + target);
728  assert(offset < 31);
729  --numel;
730  iter.iter->hasdata = false;
731  iter.iter->elem = value_type();
732  data[target].field &= ~((uint32_t)1 << offset);
733  locks[lockid + 1].unlock();
734  locks[lockid].unlock();
735  return true;
736  }
737 };
738 
739 } // graphlab
740 #endif