GraphLab: Distributed Graph-Parallel API  2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
cuckoo_map.hpp
1 /**
2  * Copyright (c) 2009 Carnegie Mellon University.
3  * All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing,
12  * software distributed under the License is distributed on an "AS
13  * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
14  * express or implied. See the License for the specific language
15  * governing permissions and limitations under the License.
16  *
17  * For more about this software visit:
18  *
19  * http://www.graphlab.ml.cmu.edu
20  *
21  */
22 
23 #ifndef GRAPHLAB_UTIL_CUCKOO_MAP_HPP
24 #define GRAPHLAB_UTIL_CUCKOO_MAP_HPP
25 
26 #include <vector>
27 #include <iterator>
28 #include <boost/random.hpp>
29 #include <boost/unordered_map.hpp>
30 #include <ctime>
31 #include <graphlab/serialization/serialization_includes.hpp>
32 
33 namespace graphlab {
34 
35 
36 
37 /**
38  * A cuckoo hash map which requires the user to
39  * provide an "illegal" value thus avoiding the need
40  * for a seperate bitmap. More or less similar
41  * interface as boost::unordered_map, not necessarily
42  * entirely STL compliant.
43  */
44 template <typename Key, typename Value,
45  size_t CuckooK = 3,
46  typename IndexType = size_t,
47  typename Hash = boost::hash<Key>,
48  typename Pred = std::equal_to<Key> >
49 class cuckoo_map {
50 
51 public:
52  // public typedefs
53  typedef Key key_type;
54  typedef std::pair<Key const, Value> value_type;
55  typedef Value mapped_type;
56  typedef Hash hasher;
57  typedef Pred key_equal;
58  typedef IndexType index_type;
59  typedef value_type* pointer;
60  typedef value_type& reference;
61  typedef const value_type* const_pointer;
62  typedef const value_type& const_reference;
63 private:
64  // internal typedefs
65  typedef std::pair<key_type, mapped_type> non_const_value_type;
66  typedef value_type* map_container_type;
67  typedef value_type* map_container_iterator;
68  typedef const value_type* map_container_const_iterator;
69  typedef boost::unordered_map<Key, Value, Hash, Pred> stash_container_type;
70 
71  key_type illegalkey;
72  index_type numel;
73  index_type maxstash;
74  map_container_type data;
75  size_t datalen;
76  stash_container_type stash;
77  boost::rand48 drng;
78  boost::uniform_int<index_type> kranddist;
79  hasher hashfun;
80  key_equal keyeq;
81 
82  map_container_iterator data_begin() {
83  return data;
84  }
85 
86  map_container_iterator data_end() {
87  return data + datalen;
88  }
89 
90  map_container_const_iterator data_begin() const {
91  return data;
92  }
93 
94  map_container_const_iterator data_end() const {
95  return data + datalen;
96  }
97  // bypass the const key_type with a placement new
98  void replace_in_vector(map_container_iterator iter,
99  const key_type& key,
100  const mapped_type& val) {
101  // delete
102  iter->~value_type();
103  // placement new
104  new(iter) value_type(key, val);
105  }
106 
107  void destroy_all() {
108  // call ze destructors
109  for(size_t i = 0; i < datalen; ++i) {
110  data[i].~value_type();
111  }
112  free(data);
113  stash.clear();
114  data = NULL;
115  datalen = 0;
116  numel = 0;
117  }
118 
119 public:
120  struct insert_iterator{
121  cuckoo_map* cmap;
122  typedef std::forward_iterator_tag iterator_category;
123  typedef typename cuckoo_map::value_type value_type;
124 
125  insert_iterator(cuckoo_map* c):cmap(c) {}
126 
127  insert_iterator operator++() {
128  return (*this);
129  }
130  insert_iterator operator++(int) {
131  return (*this);
132  }
133 
134  insert_iterator& operator*() {
135  return *this;
136  }
137  insert_iterator& operator=(const insert_iterator& i) {
138  cmap = i.cmap;
139  return *this;
140  }
141 
142  insert_iterator& operator=(const value_type& v) {
143  cmap->insert(v);
144  return *this;
145  }
146  };
147 
148  struct const_iterator {
149  const cuckoo_map* cmap;
150  bool in_stash;
151  typename cuckoo_map::map_container_const_iterator vec_iter;
152  typename cuckoo_map::stash_container_type::const_iterator stash_iter;
153 
154  typedef std::forward_iterator_tag iterator_category;
155  typedef typename cuckoo_map::value_type value_type;
156  typedef size_t difference_type;
157  typedef const value_type* pointer;
158  typedef const value_type& reference;
159 
160  friend class cuckoo_map;
161 
162  const_iterator(): cmap(NULL), in_stash(false) {}
163 
164  const_iterator operator++() {
165  if (!in_stash) {
166  ++vec_iter;
167  // we are in the main vector. try to advance the
168  // iterator until I hit another data element
169  while(vec_iter != cmap->data_end() &&
170  cmap->key_eq()(vec_iter->first, cmap->illegal_key())) ++vec_iter;
171  if (vec_iter == cmap->data_end()) {
172  in_stash = true;
173  stash_iter = cmap->stash.begin();
174  }
175  }
176  else if (in_stash) {
177  if (stash_iter != cmap->stash.end()) ++stash_iter;
178  }
179  return *this;
180  }
181 
182  const_iterator operator++(int) {
183  const_iterator cur = *this;
184  ++(*this);
185  return cur;
186  }
187 
188 
189  reference operator*() {
190  if (!in_stash) return *vec_iter;
191  else return *stash_iter;
192  }
193 
194  pointer operator->() {
195  if (!in_stash) return &(*vec_iter);
196  else return &(*stash_iter);
197  }
198 
199  bool operator==(const const_iterator iter) const {
200  return in_stash == iter.in_stash &&
201  (in_stash==false ?
202  vec_iter == iter.vec_iter :
203  stash_iter == iter.stash_iter);
204  }
205 
206  bool operator!=(const const_iterator iter) const {
207  return !((*this) == iter);
208  }
209 
210  private:
211  const_iterator(const cuckoo_map* cmap, typename cuckoo_map::map_container_const_iterator vec_iter):
212  cmap(cmap), in_stash(false), vec_iter(vec_iter), stash_iter(cmap->stash.begin()) { }
213 
214  const_iterator(const cuckoo_map* cmap, typename cuckoo_map::stash_container_type::const_iterator stash_iter):
215  cmap(cmap), in_stash(true), vec_iter(cmap->data_begin()), stash_iter(stash_iter) { }
216  };
217 
218 
219  struct iterator {
220  cuckoo_map* cmap;
221  bool in_stash;
222  typename cuckoo_map::map_container_iterator vec_iter;
223  typename cuckoo_map::stash_container_type::iterator stash_iter;
224 
225  typedef std::forward_iterator_tag iterator_category;
226  typedef typename cuckoo_map::value_type value_type;
227  typedef size_t difference_type;
228  typedef value_type* pointer;
229  typedef value_type& reference;
230 
231  friend class cuckoo_map;
232 
233  iterator(): cmap(NULL), in_stash(false) {}
234 
235 
236  operator const_iterator() const {
237  const_iterator iter;
238  iter.cmap = cmap;
239  iter.in_stash = in_stash;
240  iter.vec_iter = vec_iter;
241  iter.stash_iter = stash_iter;
242  return iter;
243  }
244 
245  iterator operator++() {
246  if (!in_stash) {
247  ++vec_iter;
248  // we are in the main vector. try to advance the
249  // iterator until I hit another data element
250  while(vec_iter != cmap->data_end() &&
251  cmap->key_eq()(vec_iter->first, cmap->illegal_key())) ++vec_iter;
252  if (vec_iter == cmap->data_end()) {
253  in_stash = true;
254  stash_iter = cmap->stash.begin();
255  }
256  }
257  else if (in_stash) {
258  if (stash_iter != cmap->stash.end()) ++stash_iter;
259  }
260  return *this;
261  }
262 
263  iterator operator++(int) {
264  iterator cur = *this;
265  ++(*this);
266  return cur;
267  }
268 
269 
270  reference operator*() {
271  if (!in_stash) return *vec_iter;
272  else return *stash_iter;
273  }
274 
275  pointer operator->() {
276  if (!in_stash) return &(*vec_iter);
277  else return &(*stash_iter);
278  }
279 
280  bool operator==(const iterator iter) const {
281  return in_stash == iter.in_stash &&
282  (in_stash==false ?
283  vec_iter == iter.vec_iter :
284  stash_iter == iter.stash_iter);
285  }
286 
287  bool operator!=(const iterator iter) const {
288  return !((*this) == iter);
289  }
290 
291 
292  private:
293  iterator(cuckoo_map* cmap, typename cuckoo_map::map_container_iterator vec_iter):
294  cmap(cmap), in_stash(false), vec_iter(vec_iter) { }
295 
296  iterator(cuckoo_map* cmap, typename cuckoo_map::stash_container_type::iterator stash_iter):
297  cmap(cmap), in_stash(true), stash_iter(stash_iter) { }
298 
299  };
300 
301 private:
302 
303  // the primary inserting logic.
304  // this assumes that the data is not already in the array.
305  // caller must check before performing the insert
306  iterator do_insert(const value_type& v_) {
307  non_const_value_type v(v_.first, v_.second);
308  if (stash.size() > maxstash) {
309  // resize
310  reserve(datalen * 1.5);
311  }
312 
313  index_type insertpos = (index_type)(-1); // tracks where the current
314  // inserted value went
315  ++numel;
316 
317  // take a random walk down the tree
318  for (int i = 0;i < 100; ++i) {
319  // first see if one of the hashes will work
320  index_type idx = 0;
321  bool found = false;
322  size_t hash_of_k = hashfun(v.first);
323  for (size_t j = 0; j < CuckooK; ++j) {
324  idx = compute_hash(hash_of_k, j);
325  if (keyeq(data[idx].first, illegalkey)) {
326  found = true;
327  break;
328  }
329  }
330  if (!found) idx = compute_hash(hash_of_k, kranddist(drng));
331  // if insertpos is -1, v holds the current value. and we
332  // are inserting it into idx
333  // if insertpos is idx, we are bumping v again. and v will hold the
334  // current value once more. so revert
335  // insertpos to -1
336  if (insertpos == (index_type)(-1)) insertpos = idx;
337  else if (insertpos == idx) insertpos = (index_type)(-1);
338  // there is room here
339  if (found || keyeq(data[idx].first, illegalkey)) {
340  replace_in_vector(data_begin() + idx, v.first, v.second);
341  // success!
342  return iterator(this, data_begin() + insertpos);
343  }
344  // failed to insert!
345  // try again!
346 
347  non_const_value_type tmp = data[idx];
348  replace_in_vector(data_begin() + idx, v.first, v.second);
349  v = tmp;
350  }
351  // ok. tried and failed 100 times.
352  //stick it in the stash
353 
354  typename stash_container_type::iterator stashiter = stash.insert(v).first;
355  // if insertpos is -1, current value went into stash
356  if (insertpos == (index_type)(-1)) {
357  return iterator(this, stashiter);
358  }
359  else {
360  return iterator(this, data_begin() + insertpos);
361  }
362  }
363 
364 public:
365 
366  cuckoo_map(key_type illegalkey,
367  index_type stashsize = 8,
368  hasher const& h = hasher(),
369  key_equal const& k = key_equal()):
370  illegalkey(illegalkey),
371  numel(0),maxstash(stashsize),
372  data(NULL), datalen(0),
373  drng(time(NULL)),
374  kranddist(0, CuckooK - 1), hashfun(h), keyeq(k) {
375  stash.max_load_factor(1.0);
376  reserve(128);
377  }
378 
379 
380 
381  cuckoo_map& operator=(const cuckoo_map& other) {
382  destroy_all();
383  // copy the data
384  data = (map_container_type)malloc(sizeof(value_type) * other.datalen);
385  datalen = other.datalen;
386  std::uninitialized_copy(other.data_begin(), other.data_end(), data_begin());
387 
388  // copy the stash
389  stash = other.stash;
390 
391  // copy all the other extra stuff
392  illegalkey = other.illegalkey;
393  numel = other.numel;
394  hashfun = other.hashfun;
395  keyeq = other.keyeq;
396  return *this;
397  }
398 
399  const key_type& illegal_key() const {
400  return illegalkey;
401  }
402 
403  ~cuckoo_map() {
404  destroy_all();
405  }
406 
407  index_type size() {
408  return numel;
409  }
410 
411  iterator begin() {
412  iterator iter;
413  iter.cmap = this;
414  iter.in_stash = false;
415  iter.vec_iter = data_begin();
416 
417  while(iter.vec_iter != data_end() &&
418  keyeq(iter.vec_iter->first, illegalkey)) ++iter.vec_iter;
419 
420 
421  if (iter.vec_iter == data_end()) {
422  iter.in_stash = true;
423  iter.stash_iter = stash.begin();
424  }
425 
426  return iter;
427  }
428 
429  iterator end() {
430  return iterator(this, stash.end());
431  }
432 
433 
434  const_iterator begin() const {
435  const_iterator iter;
436  iter.cmap = this;
437  iter.in_stash = false;
438  iter.vec_iter = data_begin();
439 
440  while(iter.vec_iter != data_end() &&
441  keyeq(iter.vec_iter->first, illegalkey)) ++iter.vec_iter;
442 
443  if (iter.vec_iter == data_end()) {
444  iter.in_stash = true;
445  iter.stash_iter = stash.begin();
446  }
447 
448  return iter;
449  }
450 
451  const_iterator end() const {
452  return const_iterator(this, stash.end());
453 
454  }
455 
456  /*
457  * Bob Jenkin's 32 bit integer mix function from
458  * http://home.comcast.net/~bretm/hash/3.html
459  */
460  static size_t mix(size_t state) {
461  state += (state << 12);
462  state ^= (state >> 22);
463  state += (state << 4);
464  state ^= (state >> 9);
465  state += (state << 10);
466  state ^= (state >> 2);
467  state += (state << 7);
468  state ^= (state >> 12);
469  return state;
470  }
471 
472  index_type compute_hash(size_t k , const uint32_t seed) const {
473  // a bunch of random numbers
474 #if (__SIZEOF_PTRDIFF_T__ == 8)
475  static const size_t a[8] = {0x6306AA9DFC13C8E7,
476  0xA8CD7FBCA2A9FFD4,
477  0x40D341EB597ECDDC,
478  0x99CFA1168AF8DA7E,
479  0x7C55BCC3AF531D42,
480  0x1BC49DB0842A21DD,
481  0x2181F03B1DEE299F,
482  0xD524D92CBFEC63E9};
483 #else
484  static const size_t a[8] = {0xFC13C8E7,
485  0xA2A9FFD4,
486  0x597ECDDC,
487  0x8AF8DA7E,
488  0xAF531D42,
489  0x842A21DD,
490  0x1DEE299F,
491  0xBFEC63E9};
492 #endif
493 
494  index_type s = mix(a[seed] ^ k);
495  return s % datalen;
496  }
497 
498  void rehash() {
499  stash_container_type stmp;
500  stmp.swap(stash);
501  // effectively, stmp elements are deleted
502  numel -= stmp.size();
503  for (size_t i = 0;i < datalen; ++i) {
504  // if there is an element here. erase it and reinsert
505  if (!keyeq(data[i].first, illegalkey)) {
506  if (count(data[i].first)) continue;
507  non_const_value_type v = data[i];
508  replace_in_vector(data_begin() + i, illegalkey, mapped_type());
509  numel--;
510  //erase(iterator(this, data_begin() + i));
511  insert(v);
512  }
513  }
514  typename stash_container_type::const_iterator iter = stmp.begin();
515  while(iter != stmp.end()) {
516  insert(*iter);
517  ++iter;
518  }
519  }
520 
521 
522 
523  void reserve(size_t newlen) {
524  //data.reserve(newlen);
525  //data.resize(newlen, std::make_pair<Key, Value>(illegalkey, Value()));
526  data = (map_container_type)realloc(data, newlen * sizeof(value_type));
527  std::uninitialized_fill(data_end(), data+newlen, non_const_value_type(illegalkey, mapped_type()));
528  datalen = newlen;
529  rehash();
530  }
531 
532  std::pair<iterator, bool> insert(const value_type& v_) {
533  iterator i = find(v_.first);
534  if (i != end()) return std::make_pair(i, false);
535  else return std::make_pair(do_insert(v_), true);
536  }
537 
538 
539 
540  iterator insert(const_iterator hint, value_type const& v) {
541  return insert(v).first;
542  }
543 
544  iterator find(key_type const& k) {
545  size_t hash_of_k = hashfun(k);
546  for (uint32_t i = 0;i < CuckooK; ++i) {
547  index_type idx = compute_hash(hash_of_k, i);
548  if (keyeq(data[idx].first, k)) return iterator(this, data_begin() + idx);
549  }
550  return iterator(this, stash.find(k));
551  }
552 
553  const_iterator find(key_type const& k) const {
554  size_t hash_of_k = hashfun(k);
555  for (uint32_t i = 0;i < CuckooK; ++i) {
556  index_type idx = compute_hash(hash_of_k, i);
557  if (keyeq(data[idx].first, k)) return const_iterator(this, data_begin() + idx);
558  }
559  return const_iterator(this, stash.find(k));
560  }
561 
562  size_t count(key_type const& k) const {
563  size_t hash_of_k = hashfun(k);
564  for (uint32_t i = 0;i < CuckooK; ++i) {
565  index_type idx = compute_hash(hash_of_k, i);
566  if (keyeq(data[idx].first, k)) return true;
567  }
568  return stash.count(k);
569  }
570 
571 
572  void erase(iterator iter) {
573  if (iter.in_stash == false) {
574  if (!keyeq(iter.vec_iter->first, illegalkey)) {
575 
576  replace_in_vector(&(*(iter.vec_iter)), illegalkey, mapped_type());
577 
578  --numel;
579  }
580  }
581  else if (iter.stash_iter != stash.end()) {
582  --numel;
583  stash.erase(iter.stash_iter);
584  }
585  }
586 
587  void erase(key_type const& k) {
588  iterator iter = find(k);
589  if (iter != end()) erase(iter);
590  }
591 
592  void swap(cuckoo_map& other) {
593  std::swap(illegalkey, other.illegalkey);
594  std::swap(numel, other.numel);
595  std::swap(maxstash, other.maxstash);
596  std::swap(data, other.data);
597  std::swap(datalen, other.datalen);
598  std::swap(stash, other.stash);
599  std::swap(drng, other.drng);
600  std::swap(kranddist, other.kranddist);
601  std::swap(hashfun, other.hashfun);
602  std::swap(keyeq, other.keyeq);
603  }
604 
605  mapped_type& operator[](const key_type& i) {
606  iterator iter = find(i);
607  value_type tmp(i, mapped_type());
608  if (iter == end()) iter = do_insert(tmp);
609  return iter->second;
610  }
611 
612  key_equal key_eq() const {
613  return keyeq;
614  }
615 
616  void clear() {
617  destroy_all();
618  reserve(128);
619  }
620 
621 
622  float load_factor() const {
623  return (float)numel / (datalen + stash.size());
624  }
625 
626  void save(oarchive &oarc) const {
627  oarc << numel << illegalkey;
628  serialize_iterator(oarc, begin(), end(), numel);
629  }
630 
631 
632  void load(iarchive &iarc) {
633  clear();
634  size_t tmpnumel = 0;
635  iarc >> tmpnumel >> illegalkey;
636  reserve(tmpnumel * 1.5);
637  deserialize_iterator<iarchive, non_const_value_type>
638  (iarc, insert_iterator(this));
639  }
640 
641 };
642 
643 }
644 
645 #endif