GraphLab: Distributed Graph-Parallel API  2.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
builtin_parsers.hpp
1 /**
2  * Copyright (c) 2009 Carnegie Mellon University.
3  * All rights reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing,
12  * software distributed under the License is distributed on an "AS
13  * IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
14  * express or implied. See the License for the specific language
15  * governing permissions and limitations under the License.
16  *
17  * For more about this software visit:
18  *
19  * http://www.graphlab.ml.cmu.edu
20  *
21  */
22 #ifndef GRAPHLAB_GRAPH_BUILTIN_PARSERS_HPP
23 #define GRAPHLAB_GRAPH_BUILTIN_PARSERS_HPP
24 
25 #include <string>
26 #include <sstream>
27 #include <iostream>
28 
29 #if defined(__cplusplus) && __cplusplus >= 201103L
30 // do not include spirit
31 #else
32 #include <boost/config/warning_disable.hpp>
33 #include <boost/spirit/include/qi.hpp>
34 #include <boost/spirit/include/phoenix_core.hpp>
35 #include <boost/spirit/include/phoenix_operator.hpp>
36 #include <boost/spirit/include/phoenix_stl.hpp>
37 #endif
38 
39 #include <graphlab/util/stl_util.hpp>
41 #include <graphlab/serialization/serialization_includes.hpp>
42 
43 
44 
45 
46 namespace graphlab {
47 
48  namespace builtin_parsers {
49 
50  /**
51  * \brief Parse files in the Stanford Network Analysis Package format.
52  *
53  * example:
54  *
55  * # some comment
56  * # another comment
57  * 1 2
58  * 3 4
59  * 1 4
60  *
61  */
62  template <typename Graph>
63  bool snap_parser(Graph& graph, const std::string& srcfilename,
64  const std::string& str) {
65  if (str.empty()) return true;
66  else if (str[0] == '#') {
67  std::cout << str << std::endl;
68  } else {
69  size_t source, target;
70  char* targetptr;
71  source = strtoul(str.c_str(), &targetptr, 10);
72  if (targetptr == NULL) return false;
73  target = strtoul(targetptr, NULL, 10);
74  if(source != target) graph.add_edge(source, target);
75  }
76  return true;
77  } // end of snap parser
78 
79  /**
80  * \brief Parse files in the standard tsv format
81  *
82  * This is identical to the SNAP format but does not allow comments.
83  *
84  */
85  template <typename Graph>
86  bool tsv_parser(Graph& graph, const std::string& srcfilename,
87  const std::string& str) {
88  if (str.empty()) return true;
89  size_t source, target;
90  char* targetptr;
91  source = strtoul(str.c_str(), &targetptr, 10);
92  if (targetptr == NULL) return false;
93  target = strtoul(targetptr, NULL, 10);
94  if(source != target) graph.add_edge(source, target);
95  return true;
96  } // end of tsv parser
97 
98 
99 
100 #if defined(__cplusplus) && __cplusplus >= 201103L
101  // The spirit parser seems to have issues when compiling under
102  // C++11. Temporary workaround with a hard coded parser. TOFIX
103  template <typename Graph>
104  bool adj_parser(Graph& graph, const std::string& srcfilename,
105  const std::string& line) {
106  // If the line is empty simply skip it
107  if(line.empty()) return true;
108  std::stringstream strm(line);
109  vertex_id_type source;
110  size_t n;
111  strm >> source;
112  if (strm.fail()) return false;
113  strm >> n;
114  if (strm.fail()) return true;
115 
116  size_t nadded = 0;
117  while (strm.good()) {
118  vertex_id_type target;
119  strm >> target;
120  if (strm.fail()) break;
121  if (source != target) graph.add_edge(source, target);
122  ++nadded;
123  }
124  if (n != nadded) return false;
125  return true;
126  } // end of adj parser
127 
128 #else
129 
130  template <typename Graph>
131  bool adj_parser(Graph& graph, const std::string& srcfilename,
132  const std::string& line) {
133  // If the line is empty simply skip it
134  if(line.empty()) return true;
135  // We use the boost spirit parser which requires (too) many separate
136  // namespaces so to make things clear we shorten them here.
137  namespace qi = boost::spirit::qi;
138  namespace ascii = boost::spirit::ascii;
139  namespace phoenix = boost::phoenix;
140  vertex_id_type source(-1);
141  vertex_id_type ntargets(-1);
142  std::vector<vertex_id_type> targets;
143  const bool success = qi::phrase_parse
144  (line.begin(), line.end(),
145  // Begin grammar
146  (
147  qi::ulong_[phoenix::ref(source) = qi::_1] >> -qi::char_(",") >>
148  qi::ulong_[phoenix::ref(ntargets) = qi::_1] >> -qi::char_(",") >>
149  *(qi::ulong_[phoenix::push_back(phoenix::ref(targets), qi::_1)] % -qi::char_(","))
150  )
151  ,
152  // End grammar
153  ascii::space);
154  // Test to see if the boost parser was able to parse the line
155  if(!success || ntargets != targets.size()) {
156  logstream(LOG_ERROR) << "Parse error in vertex prior parser." << std::endl;
157  return false;
158  }
159  for(size_t i = 0; i < targets.size(); ++i) {
160  if(source != targets[i]) graph.add_edge(source, targets[i]);
161  }
162  return true;
163  } // end of adj parser
164 #endif
165 
166  template <typename Graph>
167  struct tsv_writer{
168  typedef typename Graph::vertex_type vertex_type;
169  typedef typename Graph::edge_type edge_type;
170  std::string save_vertex(vertex_type) { return ""; }
171  std::string save_edge(edge_type e) {
172  return tostr(e.source().id()) + "\t" + tostr(e.target().id()) + "\n";
173  }
174  };
175 
176 
177 
178 
179  template <typename Graph>
180  struct graphjrl_writer{
181  typedef typename Graph::vertex_type vertex_type;
182  typedef typename Graph::edge_type edge_type;
183 
184  /**
185  * \internal
186  * Replaces \255 with \255\1
187  * Replaces \\n with \255\0
188  */
189  static std::string escape_newline(charstream& strm) {
190  size_t ctr = 0;
191  char *ptr = strm->c_str();
192  size_t strmlen = strm->size();
193  for (size_t i = 0;i < strmlen; ++i) {
194  ctr += (ptr[i] == (char)255 || ptr[i] == '\n');
195  }
196 
197  std::string ret(ctr + strmlen, 0);
198 
199  size_t target = 0;
200  for (size_t i = 0;i < strmlen; ++i, ++ptr) {
201  if ((*ptr) == (char)255) {
202  ret[target] = (char)255;
203  ret[target + 1] = 1;
204  target += 2;
205  }
206  else if ((*ptr) == '\n') {
207  ret[target] = (char)255;
208  ret[target + 1] = 0;
209  target += 2;
210  }
211  else {
212  ret[target] = (*ptr);
213  ++target;
214  }
215  }
216  assert(target == ctr + strmlen);
217  return ret;
218  }
219 
220  /**
221  * \internal
222  * Replaces \255\1 with \255
223  * Replaces \255\0 with \\n
224  */
225  static std::string unescape_newline(const std::string& str) {
226  size_t ctr = 0;
227  // count the number of escapes
228  for (size_t i = 0;i < str.length(); ++i) {
229  ctr += (str[i] == (char)255);
230  }
231  // real length is string length - escapes
232  std::string ret(str.size() - ctr, 0);
233 
234  const char escapemap[2] = {'\n', (char)255};
235 
236  size_t target = 0;
237  for (size_t i = 0;i < str.length(); ++i, ++target) {
238  if (str[i] == (char)255) {
239  // escape character
240  ++i;
241  ASSERT_MSG(str[i] == 0 || str[i] == 1,
242  "Malformed escape sequence in graphjrl file.");
243  ret[target] = escapemap[(int)str[i]];
244  }
245  else {
246  ret[target] = str[i];
247  }
248  }
249  return ret;
250  }
251 
252  std::string save_vertex(vertex_type v) {
253  charstream strm(128);
254  oarchive oarc(strm);
255  oarc << char(0) << v.id() << v.data();
256  strm.flush();
257  return escape_newline(strm) + "\n";
258  }
259 
260  std::string save_edge(edge_type e) {
261  charstream strm(128);
262  oarchive oarc(strm);
263  oarc << char(1) << e.source().id() << e.target().id() << e.data();
264  strm.flush();
265  return escape_newline(strm) + "\n";
266  }
267  };
268 
269 
270 
271  template <typename Graph>
272  bool graphjrl_parser(Graph& graph, const std::string& srcfilename,
273  const std::string& str) {
274  std::string unescapedstr = graphjrl_writer<Graph>::unescape_newline(str);
275  boost::iostreams::stream<boost::iostreams::array_source>
276  istrm(unescapedstr.c_str(), unescapedstr.length());
277  iarchive iarc(istrm);
278 
279  char entrytype;
280  iarc >> entrytype;
281  if (entrytype == 0) {
282  typename Graph::vertex_id_type vid;
283  typename Graph::vertex_data_type vdata;
284  iarc >> vid >> vdata;
285  graph.add_vertex(vid, vdata);
286  }
287  else if (entrytype == 1) {
288  typename Graph::vertex_id_type srcvid, destvid;
289  typename Graph::edge_data_type edata;
290  iarc >> srcvid >> destvid >> edata;
291  graph.add_edge(srcvid, destvid, edata);
292  }
293  else {
294  return false;
295  }
296  return true;
297  }
298 
299  } // namespace builtin_parsers
300 } // namespace graphlab
301 
302 #endif