GraphLab: Distributed Graph-Parallel API  2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
cuckoo_set_pow2.hpp
1 /**
2  * Copyright (c) 2009 Carnegie Mellon University.
3  * All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing,
12  * software distributed under the License is distributed on an "AS
13  * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
14  * express or implied. See the License for the specific language
15  * governing permissions and limitations under the License.
16  *
17  * For more about this software visit:
18  *
19  * http://www.graphlab.ml.cmu.edu
20  *
21  */
22 
23 #ifndef GRAPHLAB_UTIL_CUCKOO_SET_POW2_HPP
24 #define GRAPHLAB_UTIL_CUCKOO_SET_POW2_HPP
25 
26 #include <vector>
27 #include <iterator>
28 #include <boost/random.hpp>
29 #include <boost/unordered_map.hpp>
30 #include <ctime>
31 #include <graphlab/serialization/serialization_includes.hpp>
32 namespace graphlab {
33 
34 
35 
36  /**
37  * A cuckoo hash map which requires the user to
38  * provide an "illegal" value thus avoiding the need
39  * for a seperate bitmap. More or less similar
40  * interface as boost::unordered_map, not necessarily
41  * entirely STL compliant.
42  */
43  template <typename Key,
44  size_t CuckooK = 3,
45  typename IndexType = size_t,
46  typename Hash = boost::hash<Key>,
47  typename Pred = std::equal_to<Key> >
49 
50  public:
51  // public typedefs
52  typedef Key key_type;
53  typedef Key value_type;
54  typedef Hash hasher;
55  typedef Pred key_equal;
56  typedef IndexType index_type;
57  typedef value_type* pointer;
58  typedef value_type& reference;
59  typedef const value_type* const_pointer;
60  typedef const value_type& const_reference;
61 
62  private:
63  // internal typedefs
64  typedef key_type non_const_value_type;
65  typedef value_type* map_container_type;
66  typedef value_type* map_container_iterator;
67  typedef const value_type* map_container_const_iterator;
68  typedef std::vector<Key> stash_container_type;
69 
70  key_type illegalkey;
71  index_type numel;
72  index_type maxstash;
73  map_container_type data;
74  size_t datalen;
75  stash_container_type stash;
76  boost::rand48 drng;
77  boost::uniform_int<index_type> kranddist;
78  hasher hashfun;
79  key_equal keyeq;
80  index_type mask;
81 
82  map_container_iterator data_begin() {
83  return data;
84  }
85 
86  map_container_iterator data_end() {
87  return data + datalen;
88  }
89 
90  map_container_const_iterator data_begin() const {
91  return data;
92  }
93 
94  map_container_const_iterator data_end() const {
95  return data + datalen;
96  }
97 
98 
99  // bypass the const key_type with a placement new
100  void replace_in_vector(map_container_iterator iter,
101  const key_type& key) {
102  // delete
103  iter->~value_type();
104  // placement new
105  new(iter) value_type(key);
106  }
107 
108  void destroy_all() {
109  if (data != NULL) {
110  // call ze destructors
111  for(size_t i = 0; i < datalen; ++i) {
112  data[i].~value_type();
113  }
114  free(data);
115  }
116  stash.clear();
117  data = NULL;
118  datalen = 0;
119  numel = 0;
120  }
121 
122  public:
123  struct insert_iterator{
124  cuckoo_set_pow2* cmap;
125  typedef std::forward_iterator_tag iterator_category;
126  typedef typename cuckoo_set_pow2::value_type value_type;
127 
128  insert_iterator(cuckoo_set_pow2* c):cmap(c) {}
129 
130  insert_iterator operator++() {
131  return (*this);
132  }
133  insert_iterator operator++(int) {
134  return (*this);
135  }
136 
137  insert_iterator& operator*() {
138  return *this;
139  }
140  insert_iterator& operator=(const insert_iterator& i) {
141  cmap = i.cmap;
142  return *this;
143  }
144 
145  insert_iterator& operator=(const value_type& v) {
146  cmap->insert(v);
147  return *this;
148  }
149  };
150 
151  struct const_iterator {
152  const cuckoo_set_pow2* cmap;
153  bool in_stash;
154  typename cuckoo_set_pow2::map_container_const_iterator vec_iter;
155  typename cuckoo_set_pow2::stash_container_type::const_iterator stash_iter;
156 
157  typedef std::forward_iterator_tag iterator_category;
158  typedef typename cuckoo_set_pow2::value_type value_type;
159  typedef size_t difference_type;
160  typedef const value_type& reference;
161  typedef const value_type* pointer;
162  friend class cuckoo_set_pow2;
163 
164  const_iterator(): cmap(NULL), in_stash(false) {}
165 
166  const_iterator operator++() {
167  if (!in_stash) {
168  ++vec_iter;
169  // we are in the main vector. try to advance the
170  // iterator until I hit another data element
171  while(vec_iter != cmap->data_end() &&
172  cmap->key_eq()(*vec_iter, cmap->illegal_key())) ++vec_iter;
173  if (vec_iter == cmap->data_end()) {
174  in_stash = true;
175  stash_iter = cmap->stash.begin();
176  }
177  }
178  else if (in_stash) {
179  if (stash_iter != cmap->stash.end()) ++stash_iter;
180  }
181  return *this;
182  }
183 
184  const_iterator operator++(int) {
185  const_iterator cur = *this;
186  ++(*this);
187  return cur;
188  }
189 
190 
191  reference operator*() {
192  if (!in_stash) return *vec_iter;
193  else return *stash_iter;
194  }
195 
196  bool operator==(const const_iterator iter) const {
197  return in_stash == iter.in_stash &&
198  (in_stash==false ?
199  vec_iter == iter.vec_iter :
200  stash_iter == iter.stash_iter);
201  }
202 
203  bool operator!=(const const_iterator iter) const {
204  return !((*this) == iter);
205  }
206 
207  private:
208  const_iterator(const cuckoo_set_pow2* cmap, typename cuckoo_set_pow2::map_container_const_iterator vec_iter):
209  cmap(cmap), in_stash(false), vec_iter(vec_iter), stash_iter(cmap->stash.begin()) { }
210 
211  const_iterator(const cuckoo_set_pow2* cmap, typename cuckoo_set_pow2::stash_container_type::const_iterator stash_iter):
212  cmap(cmap), in_stash(true), vec_iter(cmap->data_begin()), stash_iter(stash_iter) { }
213 
214  };
215 
216 
217  struct iterator {
218  cuckoo_set_pow2* cmap;
219  bool in_stash;
220  typename cuckoo_set_pow2::map_container_iterator vec_iter;
221  typename cuckoo_set_pow2::stash_container_type::iterator stash_iter;
222 
223  typedef std::forward_iterator_tag iterator_category;
224  typedef typename cuckoo_set_pow2::value_type value_type;
225  typedef size_t difference_type;
226  typedef value_type& reference;
227  typedef value_type* pointer;
228  friend class cuckoo_set_pow2;
229 
230  iterator(): cmap(NULL), in_stash(false) {}
231 
232 
233  operator const_iterator() const {
234  const_iterator iter;
235  iter.cmap = cmap;
236  iter.in_stash = in_stash;
237  iter.vec_iter = vec_iter;
238  iter.stash_iter = stash_iter;
239  return iter;
240  }
241 
242  iterator operator++() {
243  if (!in_stash) {
244  ++vec_iter;
245  // we are in the main vector. try to advance the
246  // iterator until I hit another data element
247  while(vec_iter != cmap->data_end() &&
248  cmap->key_eq()(*vec_iter, cmap->illegal_key())) ++vec_iter;
249  if (vec_iter == cmap->data_end()) {
250  in_stash = true;
251  stash_iter = cmap->stash.begin();
252  }
253  }
254  else if (in_stash) {
255  if (stash_iter != cmap->stash.end()) ++stash_iter;
256  }
257  return *this;
258  }
259 
260  iterator operator++(int) {
261  iterator cur = *this;
262  ++(*this);
263  return cur;
264  }
265 
266 
267  reference operator*() {
268  if (!in_stash) return *vec_iter;
269  else return *stash_iter;
270  }
271 
272  bool operator==(const iterator iter) const {
273  return in_stash == iter.in_stash &&
274  (in_stash==false ?
275  vec_iter == iter.vec_iter :
276  stash_iter == iter.stash_iter);
277  }
278 
279  bool operator!=(const iterator iter) const {
280  return !((*this) == iter);
281  }
282 
283 
284  private:
285  iterator(cuckoo_set_pow2* cmap,
286  typename cuckoo_set_pow2::map_container_iterator vec_iter):
287  cmap(cmap), in_stash(false), vec_iter(vec_iter) { }
288 
289  iterator(cuckoo_set_pow2* cmap,
290  typename cuckoo_set_pow2::stash_container_type::iterator stash_iter):
291  cmap(cmap), in_stash(true), stash_iter(stash_iter) { }
292 
293  };
294 
295 
296  private:
297 
298  // the primary inserting logic.
299  // this assumes that the data is not already in the array.
300  // caller must check before performing the insert
301  iterator do_insert(const value_type& v_) {
302  non_const_value_type v = v_;
303  if (stash.size() > maxstash) {
304  // resize
305  reserve(datalen * 2);
306  }
307 
308  index_type insertpos = (index_type)(-1); // tracks where the current
309  // inserted value went
310  ++numel;
311 
312  // take a random walk down the tree
313  for (int i = 0;i < 100; ++i) {
314  // first see if one of the hashes will work
315  index_type idx = 0;
316  bool found = false;
317  size_t hash_of_k = hashfun(v);
318  for (size_t j = 0; j < CuckooK; ++j) {
319  idx = compute_hash(hash_of_k, j);
320  if (keyeq(data[idx], illegalkey)) {
321  found = true;
322  break;
323  }
324  }
325  if (!found) idx = compute_hash(hash_of_k, kranddist(drng));
326  // if insertpos is -1, v holds the current value. and we
327  // are inserting it into idx
328  // if insertpos is idx, we are bumping v again. and v will hold the
329  // current value once more. so revert
330  // insertpos to -1
331  if (insertpos == (index_type)(-1)) insertpos = idx;
332  else if (insertpos == idx) insertpos = (index_type)(-1);
333  // there is room here
334  if (found || keyeq(data[idx], illegalkey)) {
335  replace_in_vector(data_begin() + idx, v);
336  // success!
337  return iterator(this, data_begin() + insertpos);
338  }
339  // failed to insert!
340  // try again!
341 
342  non_const_value_type tmp = data[idx];
343  replace_in_vector(data_begin() + idx, v);
344  v = tmp;
345  }
346  // ok. tried and failed 100 times.
347  //stick it in the stash
348 
349  typename stash_container_type::iterator stashiter = stash.insert(stash.end(), v);
350  // if insertpos is -1, current value went into stash
351  if (insertpos == (index_type)(-1)) {
352  return iterator(this, stashiter);
353  }
354  else {
355  return iterator(this, data_begin() + insertpos);
356  }
357  }
358  public:
359 
360  cuckoo_set_pow2(key_type illegalkey,
361  index_type stashsize = 8,
362  index_type reserve_size = 128,
363  hasher const& h = hasher(),
364  key_equal const& k = key_equal()):
365  illegalkey(illegalkey),
366  numel(0),maxstash(stashsize),
367  data(NULL), datalen(0),
368  drng(time(NULL)),
369  kranddist(0, CuckooK - 1), hashfun(h), keyeq(k), mask(reserve_size - 1) {
370  reserve(reserve_size);
371  }
372 
373  cuckoo_set_pow2(const cuckoo_set_pow2& other):
374  illegalkey(other.illegalkey),
375  numel(0), maxstash(other.maxstash),
376  data(NULL), datalen(0),
377  drng(time(NULL)), kranddist(0, CuckooK - 1),
378  hashfun(other.hashfun), keyeq(other.keyeq), mask(0) {
379  data = NULL;
380  (*this) = other;
381  }
382 
383 
384  const key_type& illegal_key() const {
385  return illegalkey;
386  }
387 
388  ~cuckoo_set_pow2() {
389  destroy_all();
390  }
391 
392  cuckoo_set_pow2& operator=(const cuckoo_set_pow2& other) {
393  if (&other == this) return *this;
394  if (other.numel == 0 && numel == 0) return *this;
395  else if (other.numel == 0) {
396  for (size_t i = 0;i < datalen; ++i) data[i] = illegalkey;
397  stash.clear();
398  numel = 0;
399  return *this;
400  }
401  else {
402  destroy_all();
403 
404  // copy the data
405  data = (map_container_type)malloc(sizeof(value_type) * other.datalen);
406  datalen = other.datalen;
407  std::uninitialized_copy(other.data_begin(), other.data_end(), data_begin());
408  // copy the stash
409  stash = other.stash;
410  numel = other.numel;
411  hashfun = other.hashfun;
412  keyeq = other.keyeq;
413  mask = other.mask;
414  }
415  return *this;
416  }
417 
418  index_type size() const {
419  return numel;
420  }
421 
422  iterator begin() {
423  iterator iter;
424  iter.cmap = this;
425  iter.in_stash = false;
426  iter.vec_iter = data_begin();
427 
428  while(iter.vec_iter != data_end() &&
429  keyeq(*(iter.vec_iter), illegalkey)) ++iter.vec_iter;
430 
431  if (iter.vec_iter == data_end()) {
432  iter.in_stash = true;
433  iter.stash_iter = stash.begin();
434  }
435  return iter;
436  }
437 
438  iterator end() {
439  return iterator(this, stash.end());
440  }
441 
442 
443  const_iterator begin() const {
444  const_iterator iter;
445  iter.cmap = this;
446  iter.in_stash = false;
447  iter.vec_iter = data_begin();
448 
449  while(iter.vec_iter != data_end() &&
450  keyeq(*(iter.vec_iter), illegalkey)) ++iter.vec_iter;
451 
452 
453  if (iter.vec_iter == data_end()) {
454  iter.in_stash = true;
455  iter.stash_iter = stash.begin();
456  }
457 
458  return iter;
459  }
460 
461  const_iterator end() const {
462  return const_iterator(this, stash.end());
463 
464  }
465 
466  /*
467  * Bob Jenkin's 32 bit integer mix function from
468  * http://home.comcast.net/~bretm/hash/3.html
469  */
470  static size_t mix(size_t state) {
471  state += (state << 12);
472  state ^= (state >> 22);
473  state += (state << 4);
474  state ^= (state >> 9);
475  state += (state << 10);
476  state ^= (state >> 2);
477  state += (state << 7);
478  state ^= (state >> 12);
479  return state;
480  }
481 
482  index_type compute_hash(size_t k , const uint32_t seed) const {
483  // a bunch of random numbers
484 #if (__SIZEOF_PTRDIFF_T__ == 8)
485  static const size_t a[8] = {0x6306AA9DFC13C8E7,
486  0xA8CD7FBCA2A9FFD4,
487  0x40D341EB597ECDDC,
488  0x99CFA1168AF8DA7E,
489  0x7C55BCC3AF531D42,
490  0x1BC49DB0842A21DD,
491  0x2181F03B1DEE299F,
492  0xD524D92CBFEC63E9};
493 #else
494  static const size_t a[8] = {0xFC13C8E7,
495  0xA2A9FFD4,
496  0x597ECDDC,
497  0x8AF8DA7E,
498  0xAF531D42,
499  0x842A21DD,
500  0x1DEE299F,
501  0xBFEC63E9};
502 #endif
503  index_type s = mix(a[seed] ^ k);
504  return s & mask;
505  }
506 
507  void rehash() {
508  if (numel == 0) return;
509  stash_container_type stmp;
510  stmp.swap(stash);
511  // effectively, stmp elements are deleted
512  numel -= stmp.size();
513  for (size_t i = 0;i < datalen; ++i) {
514  // if there is an element here. erase it and reinsert
515  if (!keyeq(data[i], illegalkey)) {
516  if (count(data[i])) continue;
517  non_const_value_type v = data[i];
518  replace_in_vector(data_begin() + i, illegalkey);
519  numel--;
520  //erase(iterator(this, data_begin() + i));
521  insert(v);
522  }
523  }
524  typename stash_container_type::const_iterator iter = stmp.begin();
525  while(iter != stmp.end()) {
526  insert(*iter);
527  ++iter;
528  }
529  }
530 
531  static uint64_t next_powerof2(uint64_t val) {
532  --val;
533  val = val | (val >> 1);
534  val = val | (val >> 2);
535  val = val | (val >> 4);
536  val = val | (val >> 8);
537  val = val | (val >> 16);
538  val = val | (val >> 32);
539  return val + 1;
540  }
541 
542 
543  void reserve(size_t newlen) {
544  newlen = next_powerof2(newlen);
545  if (newlen <= datalen) return;
546 
547  mask = newlen - 1;
548  //data.reserve(newlen);
549  //data.resize(newlen, std::make_pair<Key, Value>(illegalkey, Value()));
550  data = (map_container_type)realloc(data, newlen * sizeof(value_type));
551  std::uninitialized_fill(data_end(), data+newlen, non_const_value_type(illegalkey));
552  datalen = newlen;
553  rehash();
554  }
555 
556  std::pair<iterator, bool> insert(const value_type& v_) {
557  iterator i = find(v_);
558  if (i != end()) return std::make_pair(i, false);
559  else return std::make_pair(do_insert(v_), true);
560  }
561 
562  iterator insert(const_iterator hint, value_type const& v) {
563  return insert(v).first;
564  }
565 
566  iterator find(key_type const& k) {
567  size_t hash_of_k = hashfun(k);
568  for (uint32_t i = 0;i < CuckooK; ++i) {
569  index_type idx = compute_hash(hash_of_k, i);
570  if (keyeq(data[idx], k)) return iterator(this, data_begin() + idx);
571  }
572  return iterator(this, std::find(stash.begin(), stash.end(), k));
573  }
574 
575  const_iterator find(key_type const& k) const {
576  size_t hash_of_k = hashfun(k);
577  for (uint32_t i = 0;i < CuckooK; ++i) {
578  index_type idx = compute_hash(hash_of_k, i);
579  if (keyeq(data[idx], k)) return const_iterator(this, data_begin() + idx);
580  }
581  return const_iterator(this, std::find(stash.begin(), stash.end(), k));
582  }
583 
584  size_t count(key_type const& k) const {
585  size_t hash_of_k = hashfun(k);
586  for (uint32_t i = 0;i < CuckooK; ++i) {
587  index_type idx = compute_hash(hash_of_k, i);
588  if (keyeq(data[idx], k)) return 1;
589  }
590  for (size_t i = 0; i < stash.size(); ++i) {
591  if (stash[i] == k) return 1;
592  }
593  return 0;
594  }
595 
596 
597  void erase(iterator iter) {
598  if (iter.in_stash == false) {
599  if (!keyeq(*(iter.vec_iter), illegalkey)) {
600 
601  replace_in_vector(&(*(iter.vec_iter)), illegalkey);
602 
603  --numel;
604  }
605  }
606  else if (iter.stash_iter != stash.end()) {
607  --numel;
608  stash.erase(iter.stash_iter);
609  }
610  }
611 
612  void erase(key_type const& k) {
613  iterator iter = find(k);
614  if (iter != end()) erase(iter);
615  }
616 
617  void swap(cuckoo_set_pow2& other) {
618  std::swap(illegalkey, other.illegalkey);
619  std::swap(numel, other.numel);
620  std::swap(maxstash, other.maxstash);
621  std::swap(data, other.data);
622  std::swap(datalen, other.datalen);
623  std::swap(stash, other.stash);
624  std::swap(drng, other.drng);
625  std::swap(kranddist, other.kranddist);
626  std::swap(hashfun, other.hashfun);
627  std::swap(keyeq, other.keyeq);
628  std::swap(mask, other.mask);
629  }
630 
631  key_equal key_eq() const {
632  return keyeq;
633  }
634 
635  void clear() {
636  destroy_all();
637  reserve(4);
638  }
639 
640 
641  float load_factor() const {
642  return (float)numel / (datalen + stash.size());
643  }
644 
645  void save(oarchive &oarc) const {
646  oarc << size_t(numel);
647  serialize_iterator(oarc, begin(), end(), numel);
648  }
649 
650 
651  void load(iarchive &iarc) {
652  for (size_t i = 0;i < datalen; ++i) data[i] = illegalkey;
653  stash.clear();
654  numel = 0;
655  size_t tmpnumel;
656  iarc >> tmpnumel;
657  reserve(tmpnumel * 1.5);
658  //std::cout << tmpnumel << ", " << illegalkey << std::endl;
659  deserialize_iterator<iarchive, non_const_value_type>
660  (iarc, insert_iterator(this));
661  ASSERT_EQ(numel, tmpnumel);
662  // for(size_t i = 0; i < tmpnumel; ++i) {
663  // non_const_value_type pair;
664  // iarc >> pair;
665  // operator[](pair.first) = pair.second;
666  // }
667  }
668 
669  }; // end of cuckoo_set_pow2
670 
671 }; // end of graphlab namespace
672 
673 #endif