GraphLab: Distributed Graph-Parallel API  2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
cuckoo_map_pow2.hpp
1 /**
2  * Copyright (c) 2009 Carnegie Mellon University.
3  * All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing,
12  * software distributed under the License is distributed on an "AS
13  * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
14  * express or implied. See the License for the specific language
15  * governing permissions and limitations under the License.
16  *
17  * For more about this software visit:
18  *
19  * http://www.graphlab.ml.cmu.edu
20  *
21  */
22 
23 #ifndef GRAPHLAB_UTIL_CUCKOO_MAP_POW2_HPP
24 #define GRAPHLAB_UTIL_CUCKOO_MAP_POW2_HPP
25 
26 #include <vector>
27 #include <iterator>
28 #include <boost/random.hpp>
29 #include <boost/unordered_map.hpp>
30 #include <ctime>
31 #include <graphlab/serialization/serialization_includes.hpp>
32 namespace graphlab {
33 
34 
35 
36  /**
37  * A cuckoo hash map which requires the user to
38  * provide an "illegal" value thus avoiding the need
39  * for a seperate bitmap. More or less similar
40  * interface as boost::unordered_map, not necessarily
41  * entirely STL compliant.
42  */
43  template <typename Key, typename Value,
44  size_t CuckooK = 3,
45  typename IndexType = size_t,
46  typename Hash = boost::hash<Key>,
47  typename Pred = std::equal_to<Key> >
49 
50  public:
51  // public typedefs
52  typedef Key key_type;
53  typedef std::pair<Key const, Value> value_type;
54  typedef Value mapped_type;
55  typedef Hash hasher;
56  typedef Pred key_equal;
57  typedef IndexType index_type;
58  typedef value_type* pointer;
59  typedef value_type& reference;
60  typedef const value_type* const_pointer;
61  typedef const value_type& const_reference;
62 
63  private:
64  // internal typedefs
65  typedef std::pair<key_type, mapped_type> non_const_value_type;
66  typedef value_type* map_container_type;
67  typedef value_type* map_container_iterator;
68  typedef const value_type* map_container_const_iterator;
69  typedef boost::unordered_map<Key, Value, Hash, Pred> stash_container_type;
70 
71  key_type illegalkey;
72  index_type numel;
73  index_type maxstash;
74  map_container_type data;
75  size_t datalen;
76  stash_container_type stash;
77  boost::rand48 drng;
78  boost::uniform_int<index_type> kranddist;
79  hasher hashfun;
80  key_equal keyeq;
81  index_type mask;
82 
83  map_container_iterator data_begin() {
84  return data;
85  }
86 
87  map_container_iterator data_end() {
88  return data + datalen;
89  }
90 
91  map_container_const_iterator data_begin() const {
92  return data;
93  }
94 
95  map_container_const_iterator data_end() const {
96  return data + datalen;
97  }
98 
99 
100  // bypass the const key_type with a placement new
101  void replace_in_vector(map_container_iterator iter,
102  const key_type& key,
103  const mapped_type& val) {
104  // delete
105  iter->~value_type();
106  // placement new
107  new(iter) value_type(key, val);
108  }
109 
110  void destroy_all() {
111  // call ze destructors
112  for(size_t i = 0; i < datalen; ++i) {
113  data[i].~value_type();
114  }
115  free(data);
116  stash.clear();
117  data = NULL;
118  datalen = 0;
119  numel = 0;
120  }
121 
122  public:
123  struct insert_iterator{
124  cuckoo_map_pow2* cmap;
125  typedef std::forward_iterator_tag iterator_category;
126  typedef typename cuckoo_map_pow2::value_type value_type;
127 
128  insert_iterator(cuckoo_map_pow2* c):cmap(c) {}
129 
130  insert_iterator operator++() {
131  return (*this);
132  }
133  insert_iterator operator++(int) {
134  return (*this);
135  }
136 
137  insert_iterator& operator*() {
138  return *this;
139  }
140  insert_iterator& operator=(const insert_iterator& i) {
141  cmap = i.cmap;
142  return *this;
143  }
144 
145  insert_iterator& operator=(const value_type& v) {
146  cmap->insert(v);
147  return *this;
148  }
149  };
150 
151  struct const_iterator {
152  const cuckoo_map_pow2* cmap;
153  bool in_stash;
154  typename cuckoo_map_pow2::map_container_const_iterator vec_iter;
155  typename cuckoo_map_pow2::stash_container_type::const_iterator stash_iter;
156 
157  typedef std::forward_iterator_tag iterator_category;
158  typedef typename cuckoo_map_pow2::value_type value_type;
159  typedef size_t difference_type;
160  typedef const value_type* pointer;
161  typedef const value_type& reference;
162 
163  friend class cuckoo_map_pow2;
164 
165  const_iterator(): cmap(NULL), in_stash(false) {}
166 
167  const_iterator operator++() {
168  if (!in_stash) {
169  ++vec_iter;
170  // we are in the main vector. try to advance the
171  // iterator until I hit another data element
172  while(vec_iter != cmap->data_end() &&
173  cmap->key_eq()(vec_iter->first, cmap->illegal_key())) ++vec_iter;
174  if (vec_iter == cmap->data_end()) {
175  in_stash = true;
176  stash_iter = cmap->stash.begin();
177  }
178  }
179  else if (in_stash) {
180  if (stash_iter != cmap->stash.end()) ++stash_iter;
181  }
182  return *this;
183  }
184 
185  const_iterator operator++(int) {
186  const_iterator cur = *this;
187  ++(*this);
188  return cur;
189  }
190 
191 
192  reference operator*() {
193  if (!in_stash) return *vec_iter;
194  else return *stash_iter;
195  }
196 
197  pointer operator->() {
198  if (!in_stash) return &(*vec_iter);
199  else return &(*stash_iter);
200  }
201 
202  bool operator==(const const_iterator iter) const {
203  return in_stash == iter.in_stash &&
204  (in_stash==false ?
205  vec_iter == iter.vec_iter :
206  stash_iter == iter.stash_iter);
207  }
208 
209  bool operator!=(const const_iterator iter) const {
210  return !((*this) == iter);
211  }
212 
213  private:
214  const_iterator(const cuckoo_map_pow2* cmap, typename cuckoo_map_pow2::map_container_const_iterator vec_iter):
215  cmap(cmap), in_stash(false), vec_iter(vec_iter), stash_iter(cmap->stash.begin()) { }
216 
217  const_iterator(const cuckoo_map_pow2* cmap, typename cuckoo_map_pow2::stash_container_type::const_iterator stash_iter):
218  cmap(cmap), in_stash(true), vec_iter(cmap->data_begin()), stash_iter(stash_iter) { }
219 
220  };
221 
222 
223  struct iterator {
224  cuckoo_map_pow2* cmap;
225  bool in_stash;
226  typename cuckoo_map_pow2::map_container_iterator vec_iter;
227  typename cuckoo_map_pow2::stash_container_type::iterator stash_iter;
228 
229  typedef std::forward_iterator_tag iterator_category;
230  typedef typename cuckoo_map_pow2::value_type value_type;
231  typedef size_t difference_type;
232  typedef value_type* pointer;
233  typedef value_type& reference;
234 
235  friend class cuckoo_map_pow2;
236 
237  iterator(): cmap(NULL), in_stash(false) {}
238 
239 
240  operator const_iterator() const {
241  const_iterator iter;
242  iter.cmap = cmap;
243  iter.in_stash = in_stash;
244  iter.vec_iter = vec_iter;
245  iter.stash_iter = stash_iter;
246  return iter;
247  }
248 
249  iterator operator++() {
250  if (!in_stash) {
251  ++vec_iter;
252  // we are in the main vector. try to advance the
253  // iterator until I hit another data element
254  while(vec_iter != cmap->data_end() &&
255  cmap->key_eq()(vec_iter->first, cmap->illegal_key())) ++vec_iter;
256  if (vec_iter == cmap->data_end()) {
257  in_stash = true;
258  stash_iter = cmap->stash.begin();
259  }
260  }
261  else if (in_stash) {
262  if (stash_iter != cmap->stash.end()) ++stash_iter;
263  }
264  return *this;
265  }
266 
267  iterator operator++(int) {
268  iterator cur = *this;
269  ++(*this);
270  return cur;
271  }
272 
273 
274  reference operator*() {
275  if (!in_stash) return *vec_iter;
276  else return *stash_iter;
277  }
278 
279  pointer operator->() {
280  if (!in_stash) return &(*vec_iter);
281  else return &(*stash_iter);
282  }
283 
284  bool operator==(const iterator iter) const {
285  return in_stash == iter.in_stash &&
286  (in_stash==false ?
287  vec_iter == iter.vec_iter :
288  stash_iter == iter.stash_iter);
289  }
290 
291  bool operator!=(const iterator iter) const {
292  return !((*this) == iter);
293  }
294 
295 
296  private:
297  iterator(cuckoo_map_pow2* cmap,
298  typename cuckoo_map_pow2::map_container_iterator vec_iter):
299  cmap(cmap), in_stash(false), vec_iter(vec_iter) { }
300 
301  iterator(cuckoo_map_pow2* cmap,
302  typename cuckoo_map_pow2::stash_container_type::iterator stash_iter):
303  cmap(cmap), in_stash(true), stash_iter(stash_iter) { }
304 
305  };
306 
307 
308  private:
309 
310  // the primary inserting logic.
311  // this assumes that the data is not already in the array.
312  // caller must check before performing the insert
313  iterator do_insert(const value_type& v_) {
314  non_const_value_type v(v_.first, v_.second);
315  if (stash.size() > maxstash) {
316  // resize
317  reserve(datalen * 2);
318  }
319 
320  index_type insertpos = (index_type)(-1); // tracks where the current
321  // inserted value went
322  ++numel;
323 
324  // take a random walk down the tree
325  for (int i = 0;i < 100; ++i) {
326  // first see if one of the hashes will work
327  index_type idx = 0;
328  bool found = false;
329  size_t hash_of_k = hashfun(v.first);
330  for (size_t j = 0; j < CuckooK; ++j) {
331  idx = compute_hash(hash_of_k, j);
332  if (keyeq(data[idx].first, illegalkey)) {
333  found = true;
334  break;
335  }
336  }
337  if (!found) idx = compute_hash(hash_of_k, kranddist(drng));
338  // if insertpos is -1, v holds the current value. and we
339  // are inserting it into idx
340  // if insertpos is idx, we are bumping v again. and v will hold the
341  // current value once more. so revert
342  // insertpos to -1
343  if (insertpos == (index_type)(-1)) insertpos = idx;
344  else if (insertpos == idx) insertpos = (index_type)(-1);
345  // there is room here
346  if (found || keyeq(data[idx].first, illegalkey)) {
347  replace_in_vector(data_begin() + idx, v.first, v.second);
348  // success!
349  return iterator(this, data_begin() + insertpos);
350  }
351  // failed to insert!
352  // try again!
353 
354  non_const_value_type tmp = data[idx];
355  replace_in_vector(data_begin() + idx, v.first, v.second);
356  v = tmp;
357  }
358  // ok. tried and failed 100 times.
359  //stick it in the stash
360 
361  typename stash_container_type::iterator stashiter = stash.insert(v).first;
362  // if insertpos is -1, current value went into stash
363  if (insertpos == (index_type)(-1)) {
364  return iterator(this, stashiter);
365  }
366  else {
367  return iterator(this, data_begin() + insertpos);
368  }
369  }
370  public:
371 
372  cuckoo_map_pow2(key_type illegalkey,
373  index_type stashsize = 8,
374  hasher const& h = hasher(),
375  key_equal const& k = key_equal()):
376  illegalkey(illegalkey),
377  numel(0),maxstash(stashsize),
378  data(NULL), datalen(0),
379  drng(time(NULL)),
380  kranddist(0, CuckooK - 1), hashfun(h), keyeq(k), mask(127) {
381  stash.max_load_factor(1.0);
382  reserve(128);
383  }
384 
385  const key_type& illegal_key() const {
386  return illegalkey;
387  }
388 
389  ~cuckoo_map_pow2() {
390  destroy_all();
391  }
392 
393  cuckoo_map_pow2& operator=(const cuckoo_map_pow2& other) {
394  destroy_all();
395  // copy the data
396  data = (map_container_type)malloc(sizeof(value_type) * other.datalen);
397  datalen = other.datalen;
398  std::uninitialized_copy(other.data_begin(), other.data_end(), data_begin());
399 
400  // copy the stash
401  stash = other.stash;
402 
403  // copy all the other extra stuff
404  illegalkey = other.illegalkey;
405  numel = other.numel;
406  hashfun = other.hashfun;
407  keyeq = other.keyeq;
408  mask = other.mask;
409  return *this;
410  }
411 
412  index_type size() {
413  return numel;
414  }
415 
416  iterator begin() {
417  iterator iter;
418  iter.cmap = this;
419  iter.in_stash = false;
420  iter.vec_iter = data_begin();
421 
422  while(iter.vec_iter != data_end() &&
423  keyeq(iter.vec_iter->first, illegalkey)) ++iter.vec_iter;
424 
425  if (iter.vec_iter == data_end()) {
426  iter.in_stash = true;
427  iter.stash_iter = stash.begin();
428  }
429  return iter;
430  }
431 
432  iterator end() {
433  return iterator(this, stash.end());
434  }
435 
436 
437  const_iterator begin() const {
438  const_iterator iter;
439  iter.cmap = this;
440  iter.in_stash = false;
441  iter.vec_iter = data_begin();
442 
443  while(iter.vec_iter != data_end() &&
444  keyeq(iter.vec_iter->first, illegalkey)) ++iter.vec_iter;
445 
446  if (iter.vec_iter == data_end()) {
447  iter.in_stash = true;
448  iter.stash_iter = stash.begin();
449  }
450 
451  return iter;
452  }
453 
454  const_iterator end() const {
455  return const_iterator(this, stash.end());
456 
457  }
458 
459  /*
460  * Bob Jenkin's 32 bit integer mix function from
461  * http://home.comcast.net/~bretm/hash/3.html
462  */
463  static size_t mix(size_t state) {
464  state += (state << 12);
465  state ^= (state >> 22);
466  state += (state << 4);
467  state ^= (state >> 9);
468  state += (state << 10);
469  state ^= (state >> 2);
470  state += (state << 7);
471  state ^= (state >> 12);
472  return state;
473  }
474 
475  index_type compute_hash(size_t k , const uint32_t seed) const {
476  // a bunch of random numbers
477 #if (__SIZEOF_PTRDIFF_T__ == 8)
478  static const size_t a[8] = {0x6306AA9DFC13C8E7,
479  0xA8CD7FBCA2A9FFD4,
480  0x40D341EB597ECDDC,
481  0x99CFA1168AF8DA7E,
482  0x7C55BCC3AF531D42,
483  0x1BC49DB0842A21DD,
484  0x2181F03B1DEE299F,
485  0xD524D92CBFEC63E9};
486 #else
487  static const size_t a[8] = {0xFC13C8E7,
488  0xA2A9FFD4,
489  0x597ECDDC,
490  0x8AF8DA7E,
491  0xAF531D42,
492  0x842A21DD,
493  0x1DEE299F,
494  0xBFEC63E9};
495 #endif
496  index_type s = mix(a[seed] ^ k);
497  return s & mask;
498  }
499 
500  void rehash() {
501  stash_container_type stmp;
502  stmp.swap(stash);
503  // effectively, stmp elements are deleted
504  numel -= stmp.size();
505  for (size_t i = 0;i < datalen; ++i) {
506  // if there is an element here. erase it and reinsert
507  if (!keyeq(data[i].first, illegalkey)) {
508  if (count(data[i].first)) continue;
509  non_const_value_type v = data[i];
510  replace_in_vector(data_begin() + i, illegalkey, mapped_type());
511  numel--;
512  //erase(iterator(this, data_begin() + i));
513  insert(v);
514  }
515  }
516  typename stash_container_type::const_iterator iter = stmp.begin();
517  while(iter != stmp.end()) {
518  insert(*iter);
519  ++iter;
520  }
521  }
522 
523  static uint64_t next_powerof2(uint64_t val) {
524  --val;
525  val = val | (val >> 1);
526  val = val | (val >> 2);
527  val = val | (val >> 4);
528  val = val | (val >> 8);
529  val = val | (val >> 16);
530  val = val | (val >> 32);
531  return val + 1;
532  }
533 
534 
535  void reserve(size_t newlen) {
536  newlen = next_powerof2(newlen);
537  mask = newlen - 1;
538  //data.reserve(newlen);
539  //data.resize(newlen, std::make_pair<Key, Value>(illegalkey, Value()));
540  data = (map_container_type)realloc(data, newlen * sizeof(value_type));
541  std::uninitialized_fill(data_end(), data+newlen, non_const_value_type(illegalkey, mapped_type()));
542  datalen = newlen;
543  rehash();
544  }
545 
546  std::pair<iterator, bool> insert(const value_type& v_) {
547  iterator i = find(v_.first);
548  if (i != end()) return std::make_pair(i, false);
549  else return std::make_pair(do_insert(v_), true);
550  }
551 
552  iterator insert(const_iterator hint, value_type const& v) {
553  return insert(v).first;
554  }
555 
556  iterator find(key_type const& k) {
557  size_t hash_of_k = hashfun(k);
558  for (uint32_t i = 0;i < CuckooK; ++i) {
559  index_type idx = compute_hash(hash_of_k, i);
560  if (keyeq(data[idx].first, k)) return iterator(this, data_begin() + idx);
561  }
562  return iterator(this, stash.find(k));
563  }
564 
565  const_iterator find(key_type const& k) const {
566  size_t hash_of_k = hashfun(k);
567  for (uint32_t i = 0;i < CuckooK; ++i) {
568  index_type idx = compute_hash(hash_of_k, i);
569  if (keyeq(data[idx].first, k)) return const_iterator(this, data_begin() + idx);
570  }
571  return const_iterator(this, stash.find(k));
572  }
573 
574  size_t count(key_type const& k) const {
575  size_t hash_of_k = hashfun(k);
576  for (uint32_t i = 0;i < CuckooK; ++i) {
577  index_type idx = compute_hash(hash_of_k, i);
578  if (keyeq(data[idx].first, k)) return true;
579  }
580  return stash.count(k);
581  }
582 
583 
584  void erase(iterator iter) {
585  if (iter.in_stash == false) {
586  if (!keyeq(iter.vec_iter->first, illegalkey)) {
587 
588  replace_in_vector(&(*(iter.vec_iter)), illegalkey, mapped_type());
589 
590  --numel;
591  }
592  }
593  else if (iter.stash_iter != stash.end()) {
594  --numel;
595  stash.erase(iter.stash_iter);
596  }
597  }
598 
599  void erase(key_type const& k) {
600  iterator iter = find(k);
601  if (iter != end()) erase(iter);
602  }
603 
604  void swap(cuckoo_map_pow2& other) {
605  std::swap(illegalkey, other.illegalkey);
606  std::swap(numel, other.numel);
607  std::swap(maxstash, other.maxstash);
608  std::swap(data, other.data);
609  std::swap(datalen, other.datalen);
610  std::swap(stash, other.stash);
611  std::swap(drng, other.drng);
612  std::swap(kranddist, other.kranddist);
613  std::swap(hashfun, other.hashfun);
614  std::swap(keyeq, other.keyeq);
615  std::swap(mask, other.mask);
616  }
617 
618  mapped_type& operator[](const key_type& i) {
619  iterator iter = find(i);
620  value_type tmp(i, mapped_type());
621  if (iter == end()) iter = do_insert(tmp);
622  return iter->second;
623  }
624 
625  key_equal key_eq() const {
626  return keyeq;
627  }
628 
629  void clear() {
630  destroy_all();
631  reserve(128);
632  }
633 
634 
635  float load_factor() const {
636  return (float)numel / (datalen + stash.size());
637  }
638 
639  void save(oarchive &oarc) const {
640  oarc << numel << illegalkey;
641  serialize_iterator(oarc, begin(), end(), numel);
642  }
643 
644 
645  void load(iarchive &iarc) {
646  clear();
647  index_type tmpnumel = 0;
648  iarc >> tmpnumel >> illegalkey;
649  //std::cout << tmpnumel << ", " << illegalkey << std::endl;
650  reserve(tmpnumel * 1.5);
651  deserialize_iterator<iarchive, non_const_value_type>
652  (iarc, insert_iterator(this));
653  // for(size_t i = 0; i < tmpnumel; ++i) {
654  // non_const_value_type pair;
655  // iarc >> pair;
656  // operator[](pair.first) = pair.second;
657  // }
658  }
659 
660  }; // end of cuckoo_map_pow2
661 
662 }; // end of graphlab namespace
663 
664 #endif