GraphLab: Distributed Graph-Parallel API  2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
graph_storage.hpp
1 /**
2  * Copyright (c) 2011 Carnegie Mellon University.
3  * All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing,
12  * software distributed under the License is distributed on an "AS
13  * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
14  * express or implied. See the License for the specific language
15  * governing permissions and limitations under the License.
16  *
17  * For more about this software visit:
18  *
19  * http://www.graphlab.ml.cmu.edu
20  *
21  */
22 
23 /* *
24  * Author: Haijie Gu ([email protected])
25  * Date: 11/10/2011
26  *
27  * CSR+CSC implementation of a graph storage.
28  * */
29 
30 #ifndef GRAPHLAB_GRAPH_STORAGE_HPP
31 #define GRAPHLAB_GRAPH_STORAGE_HPP
32 
33 
34 #ifndef __NO_OPENMP__
35 #include <omp.h>
36 #endif
37 
38 
39 #include <cmath>
40 
41 #include <string>
42 #include <list>
43 #include <vector>
44 #include <set>
45 #include <map>
46 
47 #include <queue>
48 #include <algorithm>
49 #include <functional>
50 #include <fstream>
51 
52 #include <boost/version.hpp>
53 #include <boost/bind.hpp>
54 #include <boost/unordered_set.hpp>
55 #include <boost/iterator.hpp>
56 #include <boost/iterator/counting_iterator.hpp>
57 #include <boost/iterator/iterator_facade.hpp>
58 
60 #include <graphlab/logger/assertions.hpp>
61 
62 #include <graphlab/serialization/iarchive.hpp>
63 #include <graphlab/serialization/oarchive.hpp>
64 
65 #include <graphlab/util/random.hpp>
66 #include <graphlab/util/generics/shuffle.hpp>
67 #include <graphlab/graph/graph_basic_types.hpp>
68 
69 
70 #include <graphlab/parallel/atomic.hpp>
71 
72 #include <graphlab/macros_def.hpp>
73 
74 namespace graphlab {
75 
76 
77 
78  template<typename VertexData, typename EdgeData>
79  class json_parser;
80 
81 
82 
83  template<typename VertexData, typename EdgeData>
84  class graph_storage {
85  public:
88 
89  /** The type of the edge data stored in the graph. */
90  typedef EdgeData edge_data_type;
91 
92  /** The type of the vertex data stored in the graph. */
93  typedef VertexData vertex_data_type;
94 
95  /** \internal
96  * \brief The type representing a range of edges. Edges
97  * have consecutive internal ids.
98  * */
99  typedef std::pair<size_t, size_t> edge_range_type;
100 
101  friend class json_parser<VertexData, EdgeData>;
102 
103 
104  /* ----------------------------------------------------------------------------- */
105  /* helper data field and structures: edge_data_list, class edge, class edge_list */
106  /* ----------------------------------------------------------------------------- */
107  public:
108  // Edge class for temporary storage. Will be finalized into the CSR+CSC form.
109  class edge_info {
110  public:
111  std::vector<EdgeData> data;
112  std::vector<lvid_type> source_arr;
113  std::vector<lvid_type> target_arr;
114  public:
115  edge_info () {}
116  void reserve_edge_space(size_t n) {
117  data.reserve(n);
118  source_arr.reserve(n);
119  target_arr.reserve(n);
120  }
121  // \brief Add an edge to the temporary storage.
122  void add_edge(lvid_type source, lvid_type target, EdgeData _data) {
123  data.push_back(_data);
124  source_arr.push_back(source);
125  target_arr.push_back(target);
126  }
127  // \brief Add edges in block to the temporary storage.
128  void add_block_edges(const std::vector<lvid_type>& src_arr,
129  const std::vector<lvid_type>& dst_arr,
130  const std::vector<EdgeData>& edata_arr) {
131  data.insert(data.end(), edata_arr.begin(), edata_arr.end());
132  source_arr.insert(source_arr.end(), src_arr.begin(), src_arr.end());
133  target_arr.insert(target_arr.end(), dst_arr.begin(), dst_arr.end());
134  }
135  // \brief Remove all contents in the storage.
136  void clear() {
137  std::vector<EdgeData>().swap(data);
138  std::vector<lvid_type>().swap(source_arr);
139  std::vector<lvid_type>().swap(target_arr);
140  }
141  // \brief Return the size of the storage.
142  size_t size() const {
143  return source_arr.size();
144  }
145  // \brief Return the estimated memory footprint used.
146  size_t estimate_sizeof() const {
147  return data.capacity()*sizeof(EdgeData) +
148  source_arr.capacity()*sizeof(lvid_type)*2 +
149  sizeof(data) + sizeof(source_arr)*2 + sizeof(edge_info);
150  }
151  }; // end of class edge_info.
152 
153  // A class of edge information. Used as value type of the edge_list.
154  class edge_type {
155  public:
156  /** \brief Creates an empty edge type. */
157  edge_type () : _source(-1), _target(-1), _edge_id(-1),
158  _dir(NO_EDGES), _empty(true) { }
159  /** \brief Creates an edge of given source id, target id, edge
160  * id and direction enum. \internal edge id is used to locate
161  * the edge data in the edge_data array. edge_dir type is
162  * defined in graph_basic.hpp. **/
163  edge_type (const lvid_type _source,
164  const lvid_type _target,
165  const edge_id_type _eid, edge_dir_type _dir) :
166  _source(_source), _target(_target), _edge_id(_eid),
167  _dir(_dir), _empty(false) {
168  if (_dir != OUT_EDGES) std::swap(this->_source, this->_target);
169  }
170  public:
171  /** \brief Returns the source vertex id of the edge. */
172  inline lvid_type source() const {
173  return _source;
174  }
175  /** \brief Returns the target vertex id of the edge. */
176  inline lvid_type target() const {
177  return _target;
178  }
179  /** \brief Returns the direction of the edge. */
180  inline edge_dir_type get_dir() const {
181  return _dir;
182  }
183  /** \brief Returns whether this is an empty edge. */
184  inline bool empty() const { return _empty; }
185  // Data fields.
186  private:
187  lvid_type _source;
188  lvid_type _target;
189  edge_id_type _edge_id;
190  edge_dir_type _dir;
191  bool _empty;
192 
193  friend class graph_storage;
194  }; // end of class edge_type.
195 
196  // Internal iterator on edge_types.
197  class edge_iterator {
198  public:
199  typedef std::random_access_iterator_tag iterator_category;
200  typedef edge_type value_type;
201  typedef ssize_t difference_type;
202  typedef edge_type* pointer;
203  typedef edge_type reference;
204 
205  public:
206  // Cosntructors
207  /** \brief Creates an empty iterator. */
208  edge_iterator () : offset(-1), empty(true) { }
209  /** \brief Creates an iterator at a specific edge.
210  * The edge location is defined by the follows:
211  * A center vertex id, an offset to the center, the direction and the
212  * pointer to the start of edge id array. */
213  edge_iterator (lvid_type _center, size_t _offset,
214  edge_dir_type _itype, const lvid_type* _vid_arr) :
215  center(_center), offset(_offset), itype(_itype), vid_arr(_vid_arr),
216  empty(false) { }
217  /** \brief Returns the value of the iterator. An empty iterator always returns empty edge type*/
218  inline edge_type operator*() const {
219  // ASSERT_TRUE(!empty);
220  return make_value();
221  }
222 #if BOOST_VERSION < 105000
223  typedef boost::detail::
224  operator_arrow_result<edge_type, edge_type, edge_type*> arrow_type;
225  inline typename arrow_type::type operator->() const {
226  return arrow_type::make(make_value());
227  }
228 #else
229  typedef typename boost::detail::
230  operator_arrow_dispatch<edge_type, edge_type*>::result_type arrow_type;
231  inline arrow_type operator->() const {
232  return arrow_type(make_value());
233  }
234 #endif
235 
236  /** \brief Returns if two iterators point to the same edge. */
237  inline bool operator==(const edge_iterator& it) const {
238  return (empty && it.empty) ||
239  (empty == it.empty && itype == it.itype && center == it.center &&
240  offset == it.offset);
241  }
242 
243  /** \brief Returns if two iterators don't point to the same edge. */
244  inline bool operator!=(const edge_iterator& it) const {
245  return !(*this == it);
246  }
247 
248  /** \brief Increases the iterator. */
249  inline edge_iterator& operator++() {
250  //ASSERT_TRUE(!empty);
251  ++offset;
252  return *this;
253  }
254 
255  /** \brief Increases the iterator. */
256  inline edge_iterator operator++(int) {
257  //ASSERT_TRUE(!empty);
258  const edge_iterator copy(*this);
259  operator++();
260  return copy;
261  }
262 
263  /** \brief Computes the difference of two iterators. */
264  inline ssize_t operator-(const edge_iterator& it) const {
265  return offset - it.offset;
266  }
267 
268  /** \brief Returns a new iterator whose value is increased by i difference units. */
269  inline edge_iterator operator+(difference_type i) const {
270  return edge_iterator(center, offset+i, itype, vid_arr);
271  }
272 
273  /** \brief Increases the iterator by i difference units. */
274  inline edge_iterator& operator+=(difference_type i) {
275  offset+=i;
276  return *this;
277  }
278 
279  /** \brief Generate the return value of the iterator. */
280  inline edge_type make_value() const {
281  return empty ? edge_type() : edge_type(center, vid_arr[offset], offset, itype);
282  }
283 
284  private:
285  lvid_type center;
286  size_t offset;
287  edge_dir_type itype;
288  const lvid_type* vid_arr;
289  bool empty;
290  }; // end of class edge_iterator.
291 
292  /** Represents an iteratable list of edge_types. */
293  class edge_list {
294  public:
295  typedef edge_iterator iterator;
296  typedef edge_iterator const_iterator;
297  typedef edge_type value_type;
298  private:
299  edge_iterator begin_iter, end_iter;
300  public:
301  /** Cosntructs an edge_list with begin and end. */
302  edge_list(const edge_iterator begin_iter = edge_iterator(),
303  const edge_iterator end_iter = edge_iterator()) :
304  begin_iter(begin_iter), end_iter(end_iter) { }
305  inline size_t size() const { return end_iter - begin_iter;}
306  inline edge_type operator[](size_t i) const {return *(begin_iter + i);}
307  iterator begin() const { return begin_iter; }
308  iterator end() const { return end_iter; }
309  bool empty() const { return size() == 0; }
310  }; // end of class edge_list.
311 
312 
313  public:
314  // CONSTRUCTORS ============================================================>
315  graph_storage() : use_skip_list(false) { }
316 
317  // METHODS =================================================================>
318 
319  /** \internal \brief Set graph storage to use skip list.
320  * Skip list is used to jump between ...
321  * */
322  void set_use_skip_list (bool x) { use_skip_list = x;}
323 
324  /** \brief Returns the number of edges in the graph. */
325  size_t edge_size() const { return num_edges; }
326 
327  /** \brief Returns the number of vertices in the graph. */
328  size_t vertices_size() const { return num_vertices; }
329 
330  /** \brief Returns the number of in edges of the vertex. */
331  size_t num_in_edges (const lvid_type v) const {
332  if (v >= num_vertices)
333  return 0;
334 
335  ASSERT_LT(v, num_vertices);
336 
337  size_t begin = CSC_dst[v];
338  if (begin >= num_edges) return 0;
339  // Search is the next valid src vertex after v.
340  size_t search = use_skip_list ?
341  nextValid(CSC_dst_skip, v, true) : nextValid(CSC_dst, v, false);
342  size_t end = (search >= num_vertices) ? num_edges: CSC_dst[search];
343  return (end-begin);
344  }
345 
346  /** \brief Returns the number of out edges of the vertex. */
347  size_t num_out_edges (const lvid_type v) const {
348  if (v >= num_vertices)
349  return 0;
350 
351  ASSERT_LT(v, num_vertices);
352  size_t begin = CSR_src[v];
353  if (begin >= num_edges) return 0;
354 
355  size_t search = use_skip_list ? nextValid(CSR_src_skip, v, true) : nextValid(CSR_src, v, false);
356  size_t end = (search >= num_vertices) ? num_edges: CSR_src[search];
357  return (end-begin);
358  }
359 
360  /** \brief Returns the edge id of the edge.
361  * Edges are assigned with consecutive int ids,
362  * ordered first by source and then by target.
363  * */
364  edge_id_type edge_id(const edge_type& edge) const {
365  ASSERT_FALSE(edge.empty());
366  return edge.get_dir() == OUT_EDGES ?
367  edge._edge_id :
368  c2r_map[edge._edge_id];
369  }
370 
371  /** \brief Returns the reference of edge data of an edge. */
372  edge_data_type& edge_data(lvid_type source, lvid_type target) {
373  ASSERT_LT(source, num_vertices);
374  ASSERT_LT(target, num_vertices);
375  edge_type ans = find(source, target);
376  return edge_data(ans);
377  }
378 
379  /** \brief Returns the constant reference of edge data of an edge. */
380  const edge_data_type& edge_data(lvid_type source,
381  lvid_type target) const {
382  ASSERT_LT(source, num_vertices);
383  ASSERT_LT(target, num_vertices);
384  edge_type ans = find(source, target);
385  return edge_data(ans);
386  }
387 
388  /** \brief Returns the reference of edge data of an edge. */
389  edge_data_type& edge_data(edge_type edge) {
390  ASSERT_FALSE(edge.empty());
391  return edge_data_list[edge.get_dir() == OUT_EDGES ?
392  edge._edge_id :
393  c2r_map[edge._edge_id]];
394  }
395 
396  /** \brief Returns the constant reference of edge data of an edge. */
397  const edge_data_type& edge_data(edge_type edge) const {
398  ASSERT_FALSE(edge.empty());
399  return edge_data_list[edge.get_dir() == OUT_EDGES ?
400  edge._edge_id :
401  c2r_map[edge._edge_id]];
402  }
403 
404  /** \brief Returns a list of in edges of a vertex. */
405  edge_list in_edges(const lvid_type v) const {
406  if (v >= num_vertices)
407  return edge_list();
408 
409  std::pair<bool, edge_range_type> rangePair = inEdgeRange(v);
410  if (rangePair.first) {
411  edge_range_type range = rangePair.second;
412  edge_dir_type dir = IN_EDGES;
413 
414  edge_iterator begin (v, range.first, dir, &(CSC_src[0]));
415  edge_iterator end (v, range.second+1, dir, &(CSC_src[0]));
416  // std::cout << "in range (" << range.first << "," <<
417  // range.second << ")" << std::endl; std::cout << "in edge
418  // size: " << end-begin << std::endl;
419  return edge_list(begin, end);
420  } else { return edge_list(); }
421  }
422 
423  /** \brief Returns a list of out edges of a vertex. */
424  edge_list out_edges(const lvid_type v) const {
425  if (v >= num_vertices)
426  return edge_list();
427 
428  std::pair<bool, edge_range_type> rangePair = outEdgeRange(v);
429  if (rangePair.first) {
430  edge_range_type range = rangePair.second;
431  edge_iterator begin (v, range.first, OUT_EDGES, &(CSR_dst[0]));
432  edge_iterator end (v, range.second+1, OUT_EDGES, &(CSR_dst[0]));
433  // std::cout << "out range (" << range.first << "," <<
434  // range.second << ")" << std::endl; std::cout << "out_edge
435  // size: " << end-begin << std::endl;
436  return edge_list(begin, end);
437  } else { return edge_list(); }
438  }
439 
440  /** \brief Returns an edge type of a given source target pair. */
441  edge_type find (const lvid_type src,
442  const lvid_type dst) const {
443  // DEBUG printf("Find: %u, %u \n", src, dst);
444 
445  // Get the out edge range of the src, as well as the in edge
446  // range of the dst.
447  // Search CSR or CSC, whichever has less candidates.
448  std::pair<bool, edge_range_type> dstRangePair = outEdgeRange(src);
449  std::pair<bool, edge_range_type> srcRangePair = inEdgeRange(dst);
450  if( srcRangePair.first && dstRangePair.first) {
451  // The edge may exist.
452  edge_range_type srcRange = srcRangePair.second;
453  edge_range_type dstRange = dstRangePair.second;
454 
455  if ((srcRange.second - srcRange.first) <
456  (dstRange.second - dstRange.first)) {
457  // Out edge candidate size is smaller, search CSC.
458  size_t efind = binary_search(CSC_src, srcRange.first,
459  srcRange.second, src);
460  return efind >= num_edges ?
461  edge_type() : edge_type(dst, src, efind, IN_EDGES);
462  } else {
463  // In edge candidate size is smaller, search CSR.
464  size_t efind = binary_search(CSR_dst, dstRange.first,
465  dstRange.second, dst);
466  return efind >= num_edges ?
467  edge_type() : edge_type(src, dst, efind, OUT_EDGES);
468  }
469  } else {
470  return edge_type();
471  }
472  } // end of find.
473 
474  /** \brief Finalize the graph storage.
475  * Construct the CSC, CSR, by sorting edges to maximize the
476  * efficiency of graphlab.
477  * This function takes O(|V|log(degree)) time and will
478  * fail if there are any duplicate edges.
479  *
480  * Assumption:
481  * _num_of_v == 1 + max(max_element(edges.source_arr), max_element(edges.target_arr))
482  */
483  void finalize(size_t _num_of_v, edge_info &edges) {
484 #ifdef DEBUG_GRAPH
485  logstream(LOG_DEBUG) << "Graph2 finalize starts." << std::endl;
486 #endif
487  num_vertices = _num_of_v;
488  num_edges = edges.size();
489 
490  // Permute_index, alias of c2r_map. Confusing but efficient.
491  std::vector<edge_id_type>& permute_index = c2r_map;
492  permute_index.reserve(num_edges);
493 
494  // Counter_index.
495  std::vector<atomic<int> > counter_array(num_vertices+1, 0);
496  permute_index.assign(num_edges, 0);
497 
498  // Sort edges by source;
499  // Begin of counting sort.
500 #ifdef DEBUG_GRAPH
501  logstream(LOG_DEBUG) << "Graph2 finalize: Sort by source vertex" << std::endl;
502 #endif
503  counting_sort(edges.source_arr, counter_array, permute_index);
504 
505  // Parallel sort target for each source= x interval: counter_array[x] - counter_array[x+1];
506 #ifdef _OPENMP
507 #pragma omp parallel for
508 #else
509  logstream(LOG_DEBUG) << "Graph2 finalize: Parallel sort is disabled." << std::endl;
510 #endif
511 
512  for (ssize_t j = 0; j < ssize_t(num_vertices); ++j) {
513  if (counter_array[j] < counter_array[j+1]) {
514  std::sort(permute_index.begin()+counter_array[j],
515  permute_index.begin()+counter_array[j+1],
516  cmp_by_any_functor<lvid_type> (edges.target_arr));
517  }
518  }
519  // End of counting sort.
520 
521  // Inplace permute of edge_data, edge_src, edge_target array.
522  // Modified from src/graphlab/util/generics/shuffle.hpp.
523 #ifdef DEBUG_GRAPH
524  logstream(LOG_DEBUG) << "Graph2 finalize: Inplace permute by source vertex" << std::endl;
525 #endif
526  lvid_type swap_src; lvid_type swap_target;
527  for (size_t i = 0; i < permute_index.size(); ++i) {
528  if (i != permute_index[i]) {
529  // Reserve the ith entry;
530  size_t j = i;
531  EdgeData swap_data = edges.data[i];
532  swap_src = edges.source_arr[i];
533  swap_target = edges.target_arr[i];
534  // Begin swap cycle:
535  while (j != permute_index[j]) {
536  size_t next = permute_index[j];
537  if (next != i) {
538  edges.data[j] = edges.data[next];
539  edges.source_arr[j] = edges.source_arr[next];
540  edges.target_arr[j] = edges.target_arr[next];
541  permute_index[j] = j;
542  j = next;
543  } else {
544  // end of cycle
545  edges.data[j] = swap_data;
546  edges.source_arr[j] = swap_src;
547  edges.target_arr[j] = swap_target;
548  permute_index[j] = j;
549  break;
550  }
551  }
552  }
553  }
554 
555  bool duplicate_edge_warn = false;
556 
557  // Construct CSR_src:
558 #ifdef DEBUG_GRAPH
559  logstream(LOG_DEBUG)<< "Graph2 finalize: build CSR_src..." << std::endl;
560 #endif
561  CSR_src.reserve(num_vertices);
562  if (use_skip_list) {
563  CSR_src_skip.reserve(num_vertices);
564  }
565  size_t lastSrc = -1;
566  lvid_type old_src = -1;
567  lvid_type old_dst = -1;
568  // Iterate over the edges.
569  for (size_t it = 0; it < num_edges; ++it) {
570  lvid_type src = edges.source_arr[it];
571  lvid_type dst = edges.target_arr[it];
572  // Check duplicate edge.
573  if (src == old_src && dst == old_dst) {
574  if (!duplicate_edge_warn)
575  logstream(LOG_WARNING)
576  << "Duplicate edge "
577  << it << ":(" << src << ", " << dst << ") "
578  << "found! Graphlab does not support graphs "
579  << "with duplicate edges. This error will be reported only once." << std::endl;
580  duplicate_edge_warn = true;
581  continue;
582  } else {
583  old_src = src;
584  old_dst = dst;
585  }
586  // Fill in CSR_src and CSR_src_skip.
587  if (src != lastSrc) {
588  for (size_t j = (lastSrc+1); j < src; ++j) {
589  CSR_src.push_back(-1);
590  if (use_skip_list)
591  CSR_src_skip.push_back(src-lastSrc-1);
592  }
593  CSR_src.push_back(it);
594  if (use_skip_list)
595  CSR_src_skip.push_back(0);
596  lastSrc = src;
597  }
598  }
599  // Fill in the remaining row index list.
600  for( size_t j = (lastSrc +1); j < num_vertices; ++j) {
601  CSR_src.push_back(-1);
602  if (use_skip_list)
603  CSR_src_skip.push_back(num_vertices-lastSrc-1);
604  }
605  ASSERT_EQ(CSR_src.size(), num_vertices);
606  if (use_skip_list)
607  ASSERT_EQ(CSR_src_skip.size(), num_vertices);
608 
609  // End of building CSR
610 
611 
612  // Begin building CSC
613  // Directed graph need both CSC and CSR
614  // Construct c2r_map, sort the ids according to column first order.
615  // Begin of counting sort.
616 #ifdef DEBUG_GRAPH
617  logstream(LOG_DEBUG) << "Graph2 finalize: Sort by source vertex" << std::endl;
618 #endif
619  counting_sort(edges.target_arr, counter_array, permute_index);
620 #ifdef _OPENMP
621 #pragma omp parallel for
622 #endif
623  for (ssize_t i = 0; i < ssize_t(num_vertices); ++i) {
624  if (counter_array[i] < counter_array[i+1]) {
625  std::sort(permute_index.begin()+counter_array[i],
626  permute_index.begin() + counter_array[i+1],
627  cmp_by_any_functor<lvid_type>(edges.source_arr));
628  }
629  }
630  // End of counting sort.
631 
632 #ifdef AVOID_OUTOFPLACE_PERMUTE
633 #ifdef DEBUG_GRAPH
634  logstream(LOG_DEBUG) << "Graph2 finalize: Inplace permute by target vertex" << std::endl;
635 #endif
636  inplace_shuffle(edges.source_arr.begin(), edges.source_arr.end(), permute_index);
637  /// YUCHENG to JAY: why is this here? It looks like a copy and paste from above
638 // counting_sort(edges.target_arr, counter_array, permute_index);
639 // for (ssize_t i = 0; i < ssize_t(num_vertices); ++i) {
640 // if (counter_array[i] < counter_array[i+1]) {
641 // std::sort(permute_index.begin()+counter_array[i],
642 // permute_index.begin() + counter_array[i+1],
643 // cmp_by_any_functor<lvid_type>(edges.source_arr));
644 // }
645 // }
646 #else
647 #ifdef DEBUG_GRAPH
648  logstream(LOG_DEBUG) << "Graph2 finalize: Outofplace permute by target vertex" << std::endl;
649 #endif
650  outofplace_shuffle(edges.source_arr, permute_index);
651 #endif
652 
653  /* DEBUG
654  printf("c2r_map: \n");
655  foreach(edge_id_type e, c2r_map)
656  std::cout << e << " ";
657  std::cout << std::endl;
658  */
659 
660 
661  // Construct CSC_dst:
662  CSC_dst.reserve(num_vertices);
663  if (use_skip_list) {
664  CSC_dst_skip.reserve(num_vertices);
665  }
666  size_t lastDst = -1;
667 #ifdef DEBUG_GRAPH
668  logstream(LOG_DEBUG) <<"Graph2 finalize: Build CSC_dst..." << std::endl;
669 #endif
670  // Iterate over the edges.
671  for (size_t it = 0; it < num_edges; ++it) {
672  lvid_type dst = edges.target_arr[c2r_map[it]];
673 
674  // Fill in CSC_dst and CSR_src_skip.
675  if (dst != lastDst) {
676  for (size_t j = (lastDst + 1); j < dst; ++j) {
677  CSC_dst.push_back(-1);
678  if (use_skip_list)
679  CSC_dst_skip.push_back(dst-lastDst-1);
680  }
681  CSC_dst.push_back(it);
682  if (use_skip_list)
683  CSC_dst_skip.push_back(0);
684  lastDst = dst;
685  }
686  }
687  // Fill in the remaining row index list.
688  for( size_t j = (lastDst +1); j < num_vertices; ++j) {
689  CSC_dst.push_back(-1);
690  if (use_skip_list)
691  CSC_dst_skip.push_back(num_vertices-lastDst-1);
692  }
693  ASSERT_EQ(CSC_dst.size(), num_vertices);
694  if (use_skip_list)
695  ASSERT_EQ(CSC_dst_skip.size(), num_vertices);
696 
697  // Swap edges.source with CSC_src
698  CSC_src.swap(edges.source_arr);
699  // End of building CSC
700 
701 
702  // Swap edges.target with CSR_dst
703  CSR_dst.swap(edges.target_arr);
704  // Swap edge data and perserve c2r_map.
705  edge_data_list.swap(edges.data);
706 #ifdef DEBGU_GRAPH
707  logstream(LOG_DEBUG) << "End of finalize." << std::endl;
708 #endif
709 
710  /* DEBUG */
711  // printf("CSR dst:\n");
712  // foreach(lvid_type i, CSR_dst)
713  // std::cout << i << " ";
714  // std::cout << std::endl;
715  // printf("CSR src:\n");
716  // foreach(size_t i, CSR_src)
717  // std::cout << i << " ";
718  // std::cout << std::endl;
719  // printf("CSR src skip:\n");
720  // foreach(size_t i, CSR_src_skip)
721  // std::cout << i << " ";
722  // std::cout << std::endl;
723 
724  /* DEBUG */
725  // printf("CSC dst:\n");
726  // foreach(lvid_type i, CSC_dst)
727  // std::cout << i << " ";
728  // std::cout << std::endl;
729  // printf("CSC src:\n");
730  // foreach(size_t i, CSC_src)
731  // std::cout << i << " ";
732  // std::cout << std::endl;
733  // printf("CSC dst skip:\n");
734  // foreach(size_t i, CSC_dst_skip)
735  // std::cout << i << " ";
736  // std::cout << std::endl;
737 
738  } // end of finalize.
739 
740  /** \brief Reset the storage. */
741  void clear() {
742  CSR_src.clear();
743  CSR_dst.clear();
744  CSC_src.clear();
745  CSC_dst.clear();
746  CSR_src_skip.clear();
747  CSC_dst_skip.clear();
748  c2r_map.clear();
749  edge_data_list.clear();
750  }
751 
752  /** \brief Reset the storage and free the reserved memory. */
753  void clear_reserve() {
754  std::vector<edge_id_type>().swap(CSR_src);
755  std::vector<lvid_type>().swap(CSR_dst);
756  std::vector<lvid_type>().swap(CSC_src);
757  std::vector<edge_id_type>().swap(CSC_dst);
758  std::vector<edge_id_type>().swap(c2r_map);
759  std::vector<EdgeData>().swap(edge_data_list);
760  std::vector<edge_id_type>().swap(CSR_src_skip);
761  std::vector<edge_id_type>().swap(CSC_dst_skip);
762  }
763 
764  size_t estimate_sizeof() const {
765  // const size_t word_size = sizeof(size_t);
766  const size_t vid_size = sizeof(lvid_type);
767  const size_t eid_size = sizeof(edge_id_type);
768  // Actual content size;
769  const size_t CSR_size = eid_size * CSR_src.capacity() +
770  vid_size * CSR_dst.capacity();
771  const size_t CSC_size = eid_size *CSC_dst.capacity() +
772  vid_size * CSC_src.capacity() + eid_size * c2r_map.capacity();
773  const size_t edata_size = sizeof(EdgeData) * edge_data_list.capacity();
774 
775  // Container size;
776  const size_t container_size = sizeof(CSR_src) + sizeof(CSR_dst) +
777  sizeof(CSC_src) + sizeof(CSC_dst) + sizeof(c2r_map) +
778  sizeof(edge_data_list);
779  // Skip list size:
780  const size_t skip_list_size = sizeof(CSR_src_skip) +
781  sizeof(CSC_dst_skip) + CSR_src_skip.capacity() * vid_size +
782  CSC_dst_skip.capacity() * vid_size;
783 
784  logstream(LOG_DEBUG) << "CSR size: "
785  << (double)CSR_size/(1024*1024)
786  << " CSC size: "
787  << (double)CSC_size/(1024*1024)
788  << " edata size: "
789  << (double)edata_size/(1024*1024)
790  << " skiplist size: "
791  << (double)(skip_list_size)/(1024*1024)
792  << " container size: "
793  << (double)container_size/(1024*1024)
794  << " \n Total size: "
795  << double(CSR_size + CSC_size + container_size + skip_list_size) << std::endl;
796 
797  return CSR_size + CSC_size + edata_size + container_size +
798  skip_list_size;
799  } // end of estimate_sizeof
800 
801  /** To be deprecated. */
802  // This is a log(V) operation.
803  // Do not use operations on edge id.
804  // Use edge_list instead.
805  lvid_type source(edge_id_type eid) const {
806  ASSERT_LT(eid, num_edges);
807  // Helper function: binary search the CSR_row;
808  return lookup_source(eid);
809  }
810 
811  // This is a log(V) operation.
812  // Do not use operations on edge id.
813  // Use edge_list instead.
814  lvid_type target(edge_id_type eid) const {
815  ASSERT_LT(eid, num_edges);
816  return CSR_dst[eid];
817  }
818 
819 
820 
821  // ------------- Private data storage ----------------
822  private:
823  /** Number of vertices in the storage (not counting singletons)*/
824  size_t num_vertices;
825  /** Number of edges in the storage. */
826  size_t num_edges;
827 
828  /** Array of edge data sorted by source vid. */
829  std::vector<EdgeData> edge_data_list;
830 
831  /** \internal
832  * Row index of CSR, corresponding to the source vertices. */
833  std::vector<edge_id_type> CSR_src;
834 
835  /**
836  * \internal
837  * Suppose CSR_src is: 1 x x 3 x x x x 5
838  * where x means no out edges.
839  * CSR_src_skip = 0 2 2 0 4 0 0 4 0
840  * is used to jump to the prev/next valid vertex in CSR_src.
841  * Optional.
842  */
843  std::vector<edge_id_type> CSR_src_skip;
844 
845  /** \internal
846  * Col index of CSR, corresponding to the target vertices. */
847  std::vector<lvid_type> CSR_dst;
848 
849  /** \internal
850  * Map the sort-by-col edge id to sort-by-row edge id */
851  std::vector<edge_id_type> c2r_map;
852 
853  /** \internal
854  * Row index of CSC, corresponding to the target vertices. */
855  std::vector<edge_id_type> CSC_dst;
856 
857  /*
858  * \internal
859  * Suppose CSC_dst is: 1 x x 3 x x x x 5
860  * where x means no in edges.
861  * CSC_dst_skip = 0 2 2 0 4 0 0 4 0
862  * is used to jump to the prev/next valid vertex in CSC_dst.
863  * Optional.
864  */
865  std::vector<edge_id_type> CSC_dst_skip;
866  /** \internal
867  * Col index of CSC, corresponding to the source vertices. */
868  std::vector<lvid_type> CSC_src;
869 
870  /** Graph storage traits. */
871  bool use_skip_list;
872 
873 
874  /****************************************************************************
875  * Internal Functions *
876  * ---------------------- *
877  * These functions functions and types provide internal access to the *
878  * underlying graph representation. They should not be used unless you *
879  * *really* know what you are doing. *
880  ****************************************************************************/
881  private:
882  /** \internal
883  * Returns the begin and end index of the in edge of vertex v. */
884  inline std::pair<bool, edge_range_type> inEdgeRange(lvid_type v) const {
885  ASSERT_LT(v, num_vertices);
886 
887  size_t col_start = CSC_dst[v];
888  if (col_start >= num_edges) {
889  // No inbound edges.
890  return std::make_pair(false, std::make_pair(0,0));
891  } else {
892 
893  // Find the start column of the next vertex.
894  lvid_type nextV = use_skip_list ? nextValid(CSC_dst_skip, v, true) :
895  nextValid(CSC_dst, v, false);
896  size_t col_end = (nextV < num_vertices) ? CSC_dst[nextV] : num_edges;
897  return std::make_pair(true, std::make_pair(col_start, col_end-1));
898  }
899  } // End of inEdgeRange;
900 
901  /** \internal
902  * Returns the begin and end index of the out edge of vertex v. */
903  inline std::pair<bool, edge_range_type> outEdgeRange(lvid_type v) const {
904  ASSERT_LT(v, num_vertices);
905  size_t row_start = CSR_src[v];
906  if (row_start >= num_edges) {
907  // No outbound edges.
908  return std::make_pair(false, std::make_pair(0,0));;
909  } else {
910  // Find the start column of the next vertex.
911  lvid_type nextV = use_skip_list ? nextValid(CSR_src_skip, v, true) :
912  nextValid(CSR_src, v, false);
913  size_t row_end = (nextV < num_vertices) ? CSR_src[nextV] :num_edges;
914 
915  return std::make_pair(true, std::make_pair(row_start, row_end-1));
916  }
917  } // End of outEdgeRange;
918 
919 
920  //-------------Private Helper functions------------
921  /** \internal
922  * Compare functor of any type*/
923  template <typename anyvalue>
924  struct cmp_by_any_functor {
925  const std::vector<anyvalue>& vec;
926  cmp_by_any_functor(const std::vector<anyvalue>& _vec) : vec(_vec) { }
927  bool operator()(size_t me, size_t other) const {
928  return (vec[me] < vec[other]);
929  }
930  };
931 
932  /** \internal
933  * Counting sort vector in ascending order and
934  * fill the counting array and permute index array. */
935  template <typename valuetype>
936  void counting_sort(const std::vector<valuetype>& value_array, std::vector< atomic<int> >& counter_array, std::vector<edge_id_type>& permute_index) {
937  counter_array.assign(counter_array.size(), 0);
938  permute_index.assign(permute_index.size(), 0);
939 #ifdef _OPENMP
940 #pragma omp parallel for
941 #endif
942  for (ssize_t i = 0; i < ssize_t(value_array.size()); ++i) {
943  size_t val = value_array[i];
944  counter_array[val].inc();
945  }
946 
947  for (size_t i = 1; i < counter_array.size(); ++i) {
948  counter_array[i] += counter_array[i-1];
949  }
950 #ifdef _OPENMP
951 #pragma omp parallel for
952 #endif
953  for (ssize_t i = 0; i < ssize_t(value_array.size()); ++i) {
954  size_t val = value_array[i];
955  permute_index[counter_array[val].dec()] = i;
956  }
957  }
958 
959  /** \internal
960  * Binary search vfind in a vector of lvid_type
961  * within range [start, end]. Returns (size_t)(-1) if not found. */
962  size_t binary_search(const std::vector<lvid_type>& vec,
963  size_t start, size_t end,
964  lvid_type vfind) const {
965  ASSERT_LT(vfind, num_vertices);
966  while(start <= end) {
967  size_t mid = (start+end)/2;
968  lvid_type vpoke = vec[mid];
969  if(vpoke == vfind) {
970  return mid;
971  } else if (vpoke > vfind) {
972  end = mid - 1;
973  } else {
974  start = mid + 1;
975  }
976  }
977  // Not found;
978  return -1;
979  }// End of binary_search
980 
981  /** \internal
982  * To be deprecated. */
983  // This is a log(V) operation.
984  // Do not use operations on edge id.
985  // Use edge_list instead.
986  lvid_type
987  lookup_source(edge_id_type eid) const {
988  // Binary search: find i such that CSR_src[i] <= eid, CSR_src[i+1] > eid;
989  ASSERT_LT(eid, num_edges);
990  size_t start = 0;
991  size_t end = num_vertices-1;
992  if (CSR_src[start] >= num_edges) start = use_skip_list ? nextValid(CSR_src_skip, start, true) : nextValid(CSR_src, start, false);
993  if (CSR_src[end] >= num_edges) end = use_skip_list ? prevValid(CSR_src_skip, end, true) : prevValid(CSR_src, end, false);
994 
995  // Keep the invariant that CSR_src[start] <= eid, CSR_src[end] > eid.
996  while (start <= end) {
997  if (CSR_src[end] <= eid)
998  return end;
999  if (CSR_src[start] == eid)
1000  return start;
1001  if (CSR_src[start] > eid)
1002  {
1003  ASSERT_LT(0, start);
1004  lvid_type ans = use_skip_list ? prevValid(CSR_src_skip, start, true) : prevValid(CSR_src, start, false);
1005  //lvid_type ans = start -1;
1006  ASSERT_LT(ans, num_vertices);
1007  return ans;
1008  }
1009 
1010  size_t mid = (start + end)/2;
1011  // mid may fall in to an invalid grid
1012  if (CSR_src[mid] >= num_edges) mid = prevValid(CSR_src, mid, false);
1013 
1014  if (CSR_src[mid] == eid)
1015  return mid;
1016 
1017  if (CSR_src[mid] > eid) {
1018  end = use_skip_list ? prevValid(CSR_src_skip, mid, true) : prevValid(CSR_src, mid, false);
1019  //end = mid-1;
1020  } else {
1021  //start = mid+1;
1022  start = use_skip_list ? nextValid(CSR_src_skip, mid, true) : nextValid(CSR_src, mid, false);
1023  }
1024  }
1025 
1026  ASSERT_TRUE(false);
1027  return start;
1028  }
1029 
1030  /** \internal
1031  * Returns the next valid vertex to the current vertex id in vertex_array.
1032  * Return num_vertices if there is no valid vertex next to the curent vertex.
1033  *
1034  * A vertex is valid if it has out edges (or in edges).
1035  *
1036  * If use_skip_list = false
1037  * vertex_array can be: CSR_src, or CSC_dst
1038  *
1039  * If use_skip_list = true
1040  * vertex_array can be: CSR_src_skip, or CSC_dst_skip
1041  * Uses skip array to find the next valid vertex in O(1).
1042  *
1043  * Can be applied on invalid vertex.
1044  * This function is useful in binary search where the middle is not
1045  * assumed to be valid. */
1046  inline lvid_type
1047  nextValid(const std::vector<edge_id_type>& vertex_array,
1048  lvid_type curv, bool use_skip_list) const {
1049 
1050  if (curv == num_vertices-1) return num_vertices;
1051 
1052  if (use_skip_list) {
1053  return (curv + 1 + vertex_array[curv+1]);
1054  } else {
1055  lvid_type search = curv+1;
1056  while (search < num_vertices && vertex_array[search] >= num_edges)
1057  ++search;
1058  return search;
1059  }
1060  }
1061 
1062  /** \internal
1063  * Symmetric function of nextValid.
1064  * Returns the previous valid vertex to the current vertex id in vertex_array.
1065  * Return num_vertices if there is no valid vertex previous to the curent vertex.
1066  */
1067  inline lvid_type
1068  prevValid(const std::vector<lvid_type>& vertex_array,
1069  lvid_type curv, bool use_skip_list) const {
1070  if (curv == 0) return -1;
1071 
1072  if (use_skip_list) {
1073  return (curv - 1 - vertex_array[curv-1]);
1074  } else {
1075  lvid_type search = curv-1;
1076  while (search >= 0 && vertex_array[search] >= num_edges)
1077  --search;
1078  return search;
1079  }
1080  }
1081 
1082  public:
1083 
1084  /** \internal
1085  * Returns a reference of CSR_src.*/
1086  const std::vector<lvid_type>& get_csr_src() const {
1087  return CSR_src;
1088  }
1089  /** \internal
1090  * Returns a reference of CSR_dst.*/
1091  const std::vector<edge_id_type>& get_csr_dst() const {
1092  return CSR_dst;
1093  }
1094  /** \internal
1095  * Returns a reference of CSC_src.*/
1096  const std::vector<edge_id_type>& get_csc_src() const {
1097  return CSC_src;
1098  }
1099  /** \internal
1100  * Returns a reference of CSC_dst.*/
1101  const std::vector<lvid_type>& get_csc_dst() const {
1102  return CSC_dst;
1103  }
1104  /** \internal
1105  * Returns a reference of edge_data_list.*/
1106  const std::vector<EdgeData>& get_edge_data() const {
1107  return edge_data_list;
1108  }
1109 
1110  /** \brief Load the graph from an archive */
1111  void load(iarchive& arc) {
1112  clear();
1113  arc >> use_skip_list
1114  >> num_vertices
1115  >> num_edges
1116  >> edge_data_list
1117  >> CSR_src
1118  >> CSR_dst
1119  >> CSC_src
1120  >> CSC_dst
1121  >> c2r_map
1122  >> CSR_src_skip
1123  >> CSC_dst_skip;
1124  }
1125 
1126  /** \brief Save the graph to an archive */
1127  void save(oarchive& arc) const {
1128  arc << use_skip_list
1129  << num_vertices
1130  << num_edges
1131  << edge_data_list
1132  << CSR_src
1133  << CSR_dst
1134  << CSC_src
1135  << CSC_dst
1136  << c2r_map
1137  << CSR_src_skip
1138  << CSC_dst_skip;
1139  }
1140 
1141  /** swap two graph storage*/
1142  void swap(graph_storage& other) {
1143  std::swap(use_skip_list, other.use_skip_list);
1144  std::swap(num_vertices, other.num_vertices);
1145  std::swap(num_edges, other.num_edges);
1146  std::swap(edge_data_list, other.edge_data_list);
1147  std::swap(CSR_src, other.CSR_src);
1148  std::swap(CSR_dst, other.CSR_dst);
1149  std::swap(CSC_src, other.CSC_src);
1150  std::swap(CSC_dst, other.CSC_dst);
1151  std::swap(c2r_map, other.c2r_map);
1152  std::swap(CSR_src_skip, other.CSR_src_skip);
1153  std::swap(CSC_dst_skip, other.CSC_dst_skip);
1154  }
1155 
1156  };// End of graph store;
1157 }// End of namespace;
1158 
1159 namespace std {
1160  template<typename VertexData, typename EdgeData>
1161  inline void swap(graphlab::graph_storage<VertexData,EdgeData>& a,
1162  graphlab::graph_storage<VertexData,EdgeData>& b) {
1163  a.swap(b);
1164  } // end of swap
1165 }; // end of std namespace
1166 
1167 
1168 #include <graphlab/macros_undef.hpp>
1169 #endif