30 #ifndef GRAPHLAB_GRAPH_STORAGE_HPP
31 #define GRAPHLAB_GRAPH_STORAGE_HPP
52 #include <boost/version.hpp>
53 #include <boost/bind.hpp>
54 #include <boost/unordered_set.hpp>
55 #include <boost/iterator.hpp>
56 #include <boost/iterator/counting_iterator.hpp>
57 #include <boost/iterator/iterator_facade.hpp>
60 #include <graphlab/logger/assertions.hpp>
62 #include <graphlab/serialization/iarchive.hpp>
63 #include <graphlab/serialization/oarchive.hpp>
65 #include <graphlab/util/random.hpp>
66 #include <graphlab/util/generics/shuffle.hpp>
67 #include <graphlab/graph/graph_basic_types.hpp>
70 #include <graphlab/parallel/atomic.hpp>
72 #include <graphlab/macros_def.hpp>
78 template<
typename VertexData,
typename EdgeData>
83 template<
typename VertexData,
typename EdgeData>
90 typedef EdgeData edge_data_type;
93 typedef VertexData vertex_data_type;
99 typedef std::pair<size_t, size_t> edge_range_type;
101 friend class json_parser<VertexData, EdgeData>;
111 std::vector<EdgeData> data;
112 std::vector<lvid_type> source_arr;
113 std::vector<lvid_type> target_arr;
116 void reserve_edge_space(
size_t n) {
118 source_arr.reserve(n);
119 target_arr.reserve(n);
123 data.push_back(_data);
124 source_arr.push_back(source);
125 target_arr.push_back(target);
128 void add_block_edges(
const std::vector<lvid_type>& src_arr,
129 const std::vector<lvid_type>& dst_arr,
130 const std::vector<EdgeData>& edata_arr) {
131 data.insert(data.end(), edata_arr.begin(), edata_arr.end());
132 source_arr.insert(source_arr.end(), src_arr.begin(), src_arr.end());
133 target_arr.insert(target_arr.end(), dst_arr.begin(), dst_arr.end());
137 std::vector<EdgeData>().swap(data);
138 std::vector<lvid_type>().swap(source_arr);
139 std::vector<lvid_type>().swap(target_arr);
142 size_t size()
const {
143 return source_arr.size();
146 size_t estimate_sizeof()
const {
147 return data.capacity()*
sizeof(EdgeData) +
148 source_arr.capacity()*
sizeof(
lvid_type)*2 +
149 sizeof(data) +
sizeof(source_arr)*2 +
sizeof(edge_info);
157 edge_type () : _source(-1), _target(-1), _edge_id(-1),
166 _source(_source), _target(_target), _edge_id(_eid),
167 _dir(_dir), _empty(false) {
168 if (_dir !=
OUT_EDGES) std::swap(this->_source, this->_target);
184 inline bool empty()
const {
return _empty; }
193 friend class graph_storage;
197 class edge_iterator {
199 typedef std::random_access_iterator_tag iterator_category;
200 typedef edge_type value_type;
201 typedef ssize_t difference_type;
202 typedef edge_type* pointer;
203 typedef edge_type reference;
208 edge_iterator () : offset(-1), empty(true) { }
213 edge_iterator (
lvid_type _center,
size_t _offset,
215 center(_center), offset(_offset), itype(_itype), vid_arr(_vid_arr),
218 inline edge_type operator*()
const {
222 #if BOOST_VERSION < 105000
223 typedef boost::detail::
224 operator_arrow_result<edge_type, edge_type, edge_type*> arrow_type;
225 inline typename arrow_type::type operator->()
const {
226 return arrow_type::make(make_value());
229 typedef typename boost::detail::
230 operator_arrow_dispatch<edge_type, edge_type*>::result_type arrow_type;
231 inline arrow_type operator->()
const {
232 return arrow_type(make_value());
237 inline bool operator==(
const edge_iterator& it)
const {
238 return (empty && it.empty) ||
239 (empty == it.empty && itype == it.itype && center == it.center &&
240 offset == it.offset);
244 inline bool operator!=(
const edge_iterator& it)
const {
245 return !(*
this == it);
249 inline edge_iterator& operator++() {
256 inline edge_iterator operator++(
int) {
258 const edge_iterator copy(*
this);
264 inline ssize_t operator-(
const edge_iterator& it)
const {
265 return offset - it.offset;
269 inline edge_iterator operator+(difference_type i)
const {
270 return edge_iterator(center, offset+i, itype, vid_arr);
274 inline edge_iterator& operator+=(difference_type i) {
280 inline edge_type make_value()
const {
281 return empty ? edge_type() : edge_type(center, vid_arr[offset], offset, itype);
295 typedef edge_iterator iterator;
296 typedef edge_iterator const_iterator;
297 typedef edge_type value_type;
299 edge_iterator begin_iter, end_iter;
302 edge_list(
const edge_iterator begin_iter = edge_iterator(),
303 const edge_iterator end_iter = edge_iterator()) :
304 begin_iter(begin_iter), end_iter(end_iter) { }
305 inline size_t size()
const {
return end_iter - begin_iter;}
306 inline edge_type operator[](
size_t i)
const {
return *(begin_iter + i);}
307 iterator begin()
const {
return begin_iter; }
308 iterator end()
const {
return end_iter; }
309 bool empty()
const {
return size() == 0; }
315 graph_storage() : use_skip_list(false) { }
322 void set_use_skip_list (
bool x) { use_skip_list = x;}
325 size_t edge_size()
const {
return num_edges; }
328 size_t vertices_size()
const {
return num_vertices; }
331 size_t num_in_edges (
const lvid_type v)
const {
332 if (v >= num_vertices)
335 ASSERT_LT(v, num_vertices);
337 size_t begin = CSC_dst[v];
338 if (begin >= num_edges)
return 0;
340 size_t search = use_skip_list ?
341 nextValid(CSC_dst_skip, v,
true) : nextValid(CSC_dst, v, false);
342 size_t end = (search >= num_vertices) ? num_edges: CSC_dst[search];
347 size_t num_out_edges (
const lvid_type v)
const {
348 if (v >= num_vertices)
351 ASSERT_LT(v, num_vertices);
352 size_t begin = CSR_src[v];
353 if (begin >= num_edges)
return 0;
355 size_t search = use_skip_list ? nextValid(CSR_src_skip, v,
true) : nextValid(CSR_src, v, false);
356 size_t end = (search >= num_vertices) ? num_edges: CSR_src[search];
365 ASSERT_FALSE(edge.empty());
368 c2r_map[edge._edge_id];
373 ASSERT_LT(source, num_vertices);
374 ASSERT_LT(target, num_vertices);
375 edge_type ans = find(source, target);
376 return edge_data(ans);
380 const edge_data_type& edge_data(
lvid_type source,
382 ASSERT_LT(source, num_vertices);
383 ASSERT_LT(target, num_vertices);
384 edge_type ans = find(source, target);
385 return edge_data(ans);
389 edge_data_type& edge_data(edge_type edge) {
390 ASSERT_FALSE(edge.empty());
391 return edge_data_list[edge.get_dir() ==
OUT_EDGES ?
393 c2r_map[edge._edge_id]];
397 const edge_data_type& edge_data(edge_type edge)
const {
398 ASSERT_FALSE(edge.empty());
399 return edge_data_list[edge.get_dir() ==
OUT_EDGES ?
401 c2r_map[edge._edge_id]];
405 edge_list in_edges(
const lvid_type v)
const {
406 if (v >= num_vertices)
409 std::pair<bool, edge_range_type> rangePair = inEdgeRange(v);
410 if (rangePair.first) {
411 edge_range_type range = rangePair.second;
414 edge_iterator begin (v, range.first, dir, &(CSC_src[0]));
415 edge_iterator end (v, range.second+1, dir, &(CSC_src[0]));
419 return edge_list(begin, end);
420 }
else {
return edge_list(); }
424 edge_list out_edges(
const lvid_type v)
const {
425 if (v >= num_vertices)
428 std::pair<bool, edge_range_type> rangePair = outEdgeRange(v);
429 if (rangePair.first) {
430 edge_range_type range = rangePair.second;
431 edge_iterator begin (v, range.first,
OUT_EDGES, &(CSR_dst[0]));
432 edge_iterator end (v, range.second+1,
OUT_EDGES, &(CSR_dst[0]));
436 return edge_list(begin, end);
437 }
else {
return edge_list(); }
448 std::pair<bool, edge_range_type> dstRangePair = outEdgeRange(src);
449 std::pair<bool, edge_range_type> srcRangePair = inEdgeRange(dst);
450 if( srcRangePair.first && dstRangePair.first) {
452 edge_range_type srcRange = srcRangePair.second;
453 edge_range_type dstRange = dstRangePair.second;
455 if ((srcRange.second - srcRange.first) <
456 (dstRange.second - dstRange.first)) {
458 size_t efind = binary_search(CSC_src, srcRange.first,
459 srcRange.second, src);
460 return efind >= num_edges ?
461 edge_type() : edge_type(dst, src, efind,
IN_EDGES);
464 size_t efind = binary_search(CSR_dst, dstRange.first,
465 dstRange.second, dst);
466 return efind >= num_edges ?
467 edge_type() : edge_type(src, dst, efind,
OUT_EDGES);
483 void finalize(
size_t _num_of_v, edge_info &edges) {
485 logstream(
LOG_DEBUG) <<
"Graph2 finalize starts." << std::endl;
487 num_vertices = _num_of_v;
488 num_edges = edges.size();
491 std::vector<edge_id_type>& permute_index = c2r_map;
492 permute_index.reserve(num_edges);
495 std::vector<atomic<int> > counter_array(num_vertices+1, 0);
496 permute_index.assign(num_edges, 0);
501 logstream(
LOG_DEBUG) <<
"Graph2 finalize: Sort by source vertex" << std::endl;
503 counting_sort(edges.source_arr, counter_array, permute_index);
507 #pragma omp parallel for
509 logstream(
LOG_DEBUG) <<
"Graph2 finalize: Parallel sort is disabled." << std::endl;
512 for (ssize_t j = 0; j < ssize_t(num_vertices); ++j) {
513 if (counter_array[j] < counter_array[j+1]) {
514 std::sort(permute_index.begin()+counter_array[j],
515 permute_index.begin()+counter_array[j+1],
516 cmp_by_any_functor<lvid_type> (edges.target_arr));
524 logstream(
LOG_DEBUG) <<
"Graph2 finalize: Inplace permute by source vertex" << std::endl;
527 for (
size_t i = 0; i < permute_index.size(); ++i) {
528 if (i != permute_index[i]) {
531 EdgeData swap_data = edges.data[i];
532 swap_src = edges.source_arr[i];
533 swap_target = edges.target_arr[i];
535 while (j != permute_index[j]) {
536 size_t next = permute_index[j];
538 edges.data[j] = edges.data[next];
539 edges.source_arr[j] = edges.source_arr[next];
540 edges.target_arr[j] = edges.target_arr[next];
541 permute_index[j] = j;
545 edges.data[j] = swap_data;
546 edges.source_arr[j] = swap_src;
547 edges.target_arr[j] = swap_target;
548 permute_index[j] = j;
555 bool duplicate_edge_warn =
false;
559 logstream(
LOG_DEBUG)<<
"Graph2 finalize: build CSR_src..." << std::endl;
561 CSR_src.reserve(num_vertices);
563 CSR_src_skip.reserve(num_vertices);
569 for (
size_t it = 0; it < num_edges; ++it) {
573 if (src == old_src && dst == old_dst) {
574 if (!duplicate_edge_warn)
577 << it <<
":(" << src <<
", " << dst <<
") "
578 <<
"found! Graphlab does not support graphs "
579 <<
"with duplicate edges. This error will be reported only once." << std::endl;
580 duplicate_edge_warn =
true;
587 if (src != lastSrc) {
588 for (
size_t j = (lastSrc+1); j < src; ++j) {
589 CSR_src.push_back(-1);
591 CSR_src_skip.push_back(src-lastSrc-1);
593 CSR_src.push_back(it);
595 CSR_src_skip.push_back(0);
600 for(
size_t j = (lastSrc +1); j < num_vertices; ++j) {
601 CSR_src.push_back(-1);
603 CSR_src_skip.push_back(num_vertices-lastSrc-1);
605 ASSERT_EQ(CSR_src.size(), num_vertices);
607 ASSERT_EQ(CSR_src_skip.size(), num_vertices);
617 logstream(
LOG_DEBUG) <<
"Graph2 finalize: Sort by source vertex" << std::endl;
619 counting_sort(edges.target_arr, counter_array, permute_index);
621 #pragma omp parallel for
623 for (ssize_t i = 0; i < ssize_t(num_vertices); ++i) {
624 if (counter_array[i] < counter_array[i+1]) {
625 std::sort(permute_index.begin()+counter_array[i],
626 permute_index.begin() + counter_array[i+1],
627 cmp_by_any_functor<lvid_type>(edges.source_arr));
632 #ifdef AVOID_OUTOFPLACE_PERMUTE
634 logstream(
LOG_DEBUG) <<
"Graph2 finalize: Inplace permute by target vertex" << std::endl;
636 inplace_shuffle(edges.source_arr.begin(), edges.source_arr.end(), permute_index);
648 logstream(
LOG_DEBUG) <<
"Graph2 finalize: Outofplace permute by target vertex" << std::endl;
662 CSC_dst.reserve(num_vertices);
664 CSC_dst_skip.reserve(num_vertices);
668 logstream(
LOG_DEBUG) <<
"Graph2 finalize: Build CSC_dst..." << std::endl;
671 for (
size_t it = 0; it < num_edges; ++it) {
672 lvid_type dst = edges.target_arr[c2r_map[it]];
675 if (dst != lastDst) {
676 for (
size_t j = (lastDst + 1); j < dst; ++j) {
677 CSC_dst.push_back(-1);
679 CSC_dst_skip.push_back(dst-lastDst-1);
681 CSC_dst.push_back(it);
683 CSC_dst_skip.push_back(0);
688 for(
size_t j = (lastDst +1); j < num_vertices; ++j) {
689 CSC_dst.push_back(-1);
691 CSC_dst_skip.push_back(num_vertices-lastDst-1);
693 ASSERT_EQ(CSC_dst.size(), num_vertices);
695 ASSERT_EQ(CSC_dst_skip.size(), num_vertices);
698 CSC_src.swap(edges.source_arr);
703 CSR_dst.swap(edges.target_arr);
705 edge_data_list.swap(edges.data);
707 logstream(
LOG_DEBUG) <<
"End of finalize." << std::endl;
746 CSR_src_skip.clear();
747 CSC_dst_skip.clear();
749 edge_data_list.clear();
753 void clear_reserve() {
754 std::vector<edge_id_type>().swap(CSR_src);
755 std::vector<lvid_type>().swap(CSR_dst);
756 std::vector<lvid_type>().swap(CSC_src);
757 std::vector<edge_id_type>().swap(CSC_dst);
758 std::vector<edge_id_type>().swap(c2r_map);
759 std::vector<EdgeData>().swap(edge_data_list);
760 std::vector<edge_id_type>().swap(CSR_src_skip);
761 std::vector<edge_id_type>().swap(CSC_dst_skip);
764 size_t estimate_sizeof()
const {
766 const size_t vid_size =
sizeof(
lvid_type);
769 const size_t CSR_size = eid_size * CSR_src.capacity() +
770 vid_size * CSR_dst.capacity();
771 const size_t CSC_size = eid_size *CSC_dst.capacity() +
772 vid_size * CSC_src.capacity() + eid_size * c2r_map.capacity();
773 const size_t edata_size =
sizeof(EdgeData) * edge_data_list.capacity();
776 const size_t container_size =
sizeof(CSR_src) +
sizeof(CSR_dst) +
777 sizeof(CSC_src) +
sizeof(CSC_dst) +
sizeof(c2r_map) +
778 sizeof(edge_data_list);
780 const size_t skip_list_size =
sizeof(CSR_src_skip) +
781 sizeof(CSC_dst_skip) + CSR_src_skip.capacity() * vid_size +
782 CSC_dst_skip.capacity() * vid_size;
785 << (double)CSR_size/(1024*1024)
787 << (double)CSC_size/(1024*1024)
789 << (double)edata_size/(1024*1024)
790 <<
" skiplist size: "
791 << (double)(skip_list_size)/(1024*1024)
792 <<
" container size: "
793 << (
double)container_size/(1024*1024)
794 <<
" \n Total size: "
795 <<
double(CSR_size + CSC_size + container_size + skip_list_size) << std::endl;
797 return CSR_size + CSC_size + edata_size + container_size +
806 ASSERT_LT(eid, num_edges);
808 return lookup_source(eid);
815 ASSERT_LT(eid, num_edges);
829 std::vector<EdgeData> edge_data_list;
833 std::vector<edge_id_type> CSR_src;
843 std::vector<edge_id_type> CSR_src_skip;
847 std::vector<lvid_type> CSR_dst;
851 std::vector<edge_id_type> c2r_map;
855 std::vector<edge_id_type> CSC_dst;
865 std::vector<edge_id_type> CSC_dst_skip;
868 std::vector<lvid_type> CSC_src;
884 inline std::pair<bool, edge_range_type> inEdgeRange(
lvid_type v)
const {
885 ASSERT_LT(v, num_vertices);
887 size_t col_start = CSC_dst[v];
888 if (col_start >= num_edges) {
890 return std::make_pair(
false, std::make_pair(0,0));
894 lvid_type nextV = use_skip_list ? nextValid(CSC_dst_skip, v,
true) :
895 nextValid(CSC_dst, v, false);
896 size_t col_end = (nextV < num_vertices) ? CSC_dst[nextV] : num_edges;
897 return std::make_pair(
true, std::make_pair(col_start, col_end-1));
903 inline std::pair<bool, edge_range_type> outEdgeRange(
lvid_type v)
const {
904 ASSERT_LT(v, num_vertices);
905 size_t row_start = CSR_src[v];
906 if (row_start >= num_edges) {
908 return std::make_pair(
false, std::make_pair(0,0));;
911 lvid_type nextV = use_skip_list ? nextValid(CSR_src_skip, v,
true) :
912 nextValid(CSR_src, v, false);
913 size_t row_end = (nextV < num_vertices) ? CSR_src[nextV] :num_edges;
915 return std::make_pair(
true, std::make_pair(row_start, row_end-1));
923 template <
typename anyvalue>
924 struct cmp_by_any_functor {
925 const std::vector<anyvalue>& vec;
926 cmp_by_any_functor(
const std::vector<anyvalue>& _vec) : vec(_vec) { }
927 bool operator()(
size_t me,
size_t other)
const {
928 return (vec[me] < vec[other]);
935 template <
typename valuetype>
936 void counting_sort(
const std::vector<valuetype>& value_array, std::vector< atomic<int> >& counter_array, std::vector<edge_id_type>& permute_index) {
937 counter_array.assign(counter_array.size(), 0);
938 permute_index.assign(permute_index.size(), 0);
940 #pragma omp parallel for
942 for (ssize_t i = 0; i < ssize_t(value_array.size()); ++i) {
943 size_t val = value_array[i];
944 counter_array[val].inc();
947 for (
size_t i = 1; i < counter_array.size(); ++i) {
948 counter_array[i] += counter_array[i-1];
951 #pragma omp parallel for
953 for (ssize_t i = 0; i < ssize_t(value_array.size()); ++i) {
954 size_t val = value_array[i];
955 permute_index[counter_array[val].dec()] = i;
962 size_t binary_search(
const std::vector<lvid_type>& vec,
963 size_t start,
size_t end,
965 ASSERT_LT(vfind, num_vertices);
966 while(start <= end) {
967 size_t mid = (start+end)/2;
971 }
else if (vpoke > vfind) {
989 ASSERT_LT(eid, num_edges);
991 size_t end = num_vertices-1;
992 if (CSR_src[start] >= num_edges) start = use_skip_list ? nextValid(CSR_src_skip, start,
true) : nextValid(CSR_src, start, false);
993 if (CSR_src[end] >= num_edges) end = use_skip_list ? prevValid(CSR_src_skip, end,
true) : prevValid(CSR_src, end, false);
996 while (start <= end) {
997 if (CSR_src[end] <= eid)
999 if (CSR_src[start] == eid)
1001 if (CSR_src[start] > eid)
1003 ASSERT_LT(0, start);
1004 lvid_type ans = use_skip_list ? prevValid(CSR_src_skip, start,
true) : prevValid(CSR_src, start, false);
1006 ASSERT_LT(ans, num_vertices);
1010 size_t mid = (start + end)/2;
1012 if (CSR_src[mid] >= num_edges) mid = prevValid(CSR_src, mid,
false);
1014 if (CSR_src[mid] == eid)
1017 if (CSR_src[mid] > eid) {
1018 end = use_skip_list ? prevValid(CSR_src_skip, mid,
true) : prevValid(CSR_src, mid, false);
1022 start = use_skip_list ? nextValid(CSR_src_skip, mid,
true) : nextValid(CSR_src, mid, false);
1047 nextValid(
const std::vector<edge_id_type>& vertex_array,
1048 lvid_type curv,
bool use_skip_list)
const {
1050 if (curv == num_vertices-1)
return num_vertices;
1052 if (use_skip_list) {
1053 return (curv + 1 + vertex_array[curv+1]);
1056 while (search < num_vertices && vertex_array[search] >= num_edges)
1068 prevValid(
const std::vector<lvid_type>& vertex_array,
1069 lvid_type curv,
bool use_skip_list)
const {
1070 if (curv == 0)
return -1;
1072 if (use_skip_list) {
1073 return (curv - 1 - vertex_array[curv-1]);
1076 while (search >= 0 && vertex_array[search] >= num_edges)
1086 const std::vector<lvid_type>& get_csr_src()
const {
1091 const std::vector<edge_id_type>& get_csr_dst()
const {
1096 const std::vector<edge_id_type>& get_csc_src()
const {
1101 const std::vector<lvid_type>& get_csc_dst()
const {
1106 const std::vector<EdgeData>& get_edge_data()
const {
1107 return edge_data_list;
1111 void load(iarchive& arc) {
1113 arc >> use_skip_list
1127 void save(oarchive& arc)
const {
1128 arc << use_skip_list
1142 void swap(graph_storage& other) {
1143 std::swap(use_skip_list, other.use_skip_list);
1144 std::swap(num_vertices, other.num_vertices);
1145 std::swap(num_edges, other.num_edges);
1146 std::swap(edge_data_list, other.edge_data_list);
1147 std::swap(CSR_src, other.CSR_src);
1148 std::swap(CSR_dst, other.CSR_dst);
1149 std::swap(CSC_src, other.CSC_src);
1150 std::swap(CSC_dst, other.CSC_dst);
1151 std::swap(c2r_map, other.c2r_map);
1152 std::swap(CSR_src_skip, other.CSR_src_skip);
1153 std::swap(CSC_dst_skip, other.CSC_dst_skip);
1160 template<
typename VertexData,
typename EdgeData>
1161 inline void swap(graphlab::graph_storage<VertexData,EdgeData>& a,
1162 graphlab::graph_storage<VertexData,EdgeData>& b) {
1168 #include <graphlab/macros_undef.hpp>