Caffe2 - Python API
A deep learning, cross-platform ML framework
memonger.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import networkx as nx
import collections
import time
import copy
from caffe2.python import workspace
from caffe2.proto import caffe2_pb2
import enum
import logging

log = logging.getLogger("memonger")
log.setLevel(logging.INFO)
LiveRange = collections.namedtuple('LiveRange', ["defined", "used", "size"])

def share_grad_blobs(net, losses, param_grads, namescope):
    '''
    Implements an optimization similar to Torch's shareGradInput(): for
    gradients that are passed between layers, share blobs between operators
    when possible. This yields significant memory savings with deep networks.

    Returns an optimized protobuf (assign to net._net).
    '''
    def is_grad_blob(b):
        name = str(b)
        # Note: also match the "_{namescope}" prefix, which is used for
        # auto-split gradients.
        return "_grad" in name and (name.startswith(namescope) or
            name.startswith("_" + namescope)) and name not in param_grads

    def is_grad_op(op):
        # TODO: something smarter
        for b in list(op.input) + list(op.output):
            if is_grad_blob(b):
                return True
        return False

    log.warning("NOTE: Executing memonger to optimize gradient memory")

    # Collect ops that have something to do with gradients
    if not namescope.endswith("/"):
        namescope += "/"

    netproto = copy.deepcopy(net.Proto())
    grad_ops = [op for op in netproto.op if is_grad_op(op)]
    return _compute_blob_recycling_for_dag(
        netproto, losses, grad_ops, is_grad_blob, namescope
    )

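# Illustrative usage sketch (not part of the original memonger.py), loosely
# modeled on the memonger unit tests: build a tiny model, add gradient
# operators, then let share_grad_blobs rewrite the intermediate gradient
# blobs. The model and blob names here are hypothetical.
def _example_share_grad_blobs():
    from caffe2.python import brew, model_helper

    m = model_helper.ModelHelper(name="example")
    fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2)
    fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2)
    fc2.Softmax([], "pred") \
        .LabelCrossEntropy(["label"], "xent") \
        .AveragedLoss([], "loss")
    m.AddGradientOperators(["loss"])

    # Intermediate *_grad blobs may now be renamed to shared buffers;
    # parameter gradients are excluded from sharing.
    m.net._net = share_grad_blobs(
        m.net, ["loss"], set(str(g) for g in m.param_to_grad.values()), "")
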

def optimize_inference_for_dag(net, input_blobs, namescope=""):
    netproto = copy.deepcopy(net.Proto())
    external_input = set(net.Proto().external_input)
    external_output = set(net.Proto().external_output)

    def is_activation_blob(b):
        return b not in external_input and b not in external_output

    seen_as_output = set()
    ops = list(net.Proto().op)

    # Sanity check: verify that all external inputs are properly accounted
    # for and that no gradient ops are included in 'net'
    for op in ops:
        for b in op.input:
            if is_activation_blob(b) and b not in seen_as_output:
                assert False, "{} not in external input".format(b)
        seen_as_output = seen_as_output.union(set(op.output))
        assert not op.is_gradient_op, \
            "You can only pass inference-only nets to optimize_inference_for_dag"

    return _compute_blob_recycling_for_dag(
        netproto, input_blobs, ops, is_activation_blob, namescope
    )

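# Illustrative sketch (not part of the original memonger.py): applying the
# inference optimizer to a small chain of ops. Names are hypothetical; with
# four intermediate activations, "h3" can reuse the buffer of "h1" once "h1"
# has been consumed.
def _example_optimize_inference():
    from caffe2.python import core

    net = core.Net("infer_example")
    data = net.AddExternalInput("data")
    h1 = net.Relu(data, "h1")
    h2 = net.Relu(h1, "h2")
    h3 = net.Relu(h2, "h3")
    out = net.Relu(h3, "out")
    net.AddExternalOutput(out)

    # Activation blobs may be renamed to shared __m{j}_ blobs; external
    # inputs and outputs are left untouched.
    net._net = optimize_inference_for_dag(net, ["data"])
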

def _compute_blob_recycling_for_dag(
    netproto, heads, ops, is_shareable, namescope
):
    '''
    Computes a blob recycling by traversing the computation DAG. The
    resulting model can be executed safely on a DAGNet.
    '''
    start_time = time.time()

    # Create mapping from blobs to ops
    blobs_to_ops = collections.defaultdict(lambda: [])
    blob_input_count = collections.defaultdict(lambda: 0)
    op_inputs = collections.defaultdict(lambda: 0)
    op_visit_count = collections.defaultdict(lambda: 0)

    for i, op in enumerate(ops):
        for inp in op.input:
            if is_shareable(inp) or inp in heads:
                # Ignore in-place transformation ops (self-cycles)
                if inp not in op.output:
                    blobs_to_ops[inp].append(i)
                    op_inputs[i] += 1

    # Traverse operators starting from the heads' blobs.
    # Keep tabs on when blobs are first and last seen, and also on when
    # operators have their inputs satisfied. Share blobs only within the
    # same branch, avoiding problems with parallel workers.
    output_blobs = set()
    mapping = {}

    def descend(op_idx, free_blobs):
        cur_op = ops[op_idx]
        new_free_blobs = set()
        for inp in cur_op.input:
            if is_shareable(inp):
                blob_input_count[inp] += 1
                if blob_input_count[inp] == len(blobs_to_ops[inp]):
                    actual_blob = inp if inp not in mapping else mapping[inp]
                    new_free_blobs.add(actual_blob)

        for outp in cur_op.output:
            if is_shareable(outp):
                if outp not in output_blobs:
                    # First time this blob is seen as an output, so it can
                    # be assigned to a free blob
                    for freeb in free_blobs:
                        mapping[outp] = freeb
                        free_blobs.remove(freeb)
                        break

                output_blobs.add(outp)

        free_blobs.update(new_free_blobs)

        first_branch = True
        for outp in cur_op.output:
            for inp_op_idx in blobs_to_ops[outp]:
                op_visit_count[inp_op_idx] += 1

                # Descend only if we have satisfied all inputs
                if op_visit_count[inp_op_idx] == op_inputs[inp_op_idx]:
                    free_blobs_fwd = free_blobs if first_branch else set()
                    first_branch = False
                    descend(inp_op_idx, free_blobs_fwd)

    # Start DFS from the heads (losses or inputs)
    for head_blob in heads:
        for op_idx in blobs_to_ops[head_blob]:
            descend(op_idx, set())

    # Rename the shared blobs
    shared_blobs = set(mapping.values())
    renamed = {}
    for j, b in enumerate(shared_blobs):
        renamed[b] = namescope + "__m{}_".format(j)

    # Final mapping
    for k, v in mapping.items():
        mapping[k] = renamed[v]

    # Add the originators
    mapping.update(renamed)
    log.info("Remapping {} blobs, using {} shared".format(
        len(mapping), len(renamed),
    ))
    log.debug("Assignments: {}".format(mapping))

    apply_assignments(netproto, mapping)
    log.info("Memonger optimization took {} secs".format(
        time.time() - start_time),
    )
    return netproto


def _find_source_nodes(g):
    ''' Return nodes without predecessors '''
    ret = []
    for cn in g:
        cur_pred = g.predecessors(cn)
        if not cur_pred:
            ret.append(cn)
    return ret


def _find_target_nodes(g):
    ''' Return nodes without successors '''
    ret = []
    for cn in g:
        cur_succ = g.successors(cn)
        if not cur_succ:
            ret.append(cn)
    return ret


def _add_single_target_ifneeded(g):
    targets = _find_target_nodes(g)
    assert len(targets) >= 1
    if len(targets) == 1:
        return g
    ret = copy.deepcopy(g)

    def _next_available_idx(g):
        ret = -1
        for cn in g:
            if cn > ret:
                ret = cn
        ret += 1
        return ret

    target_node_idx = _next_available_idx(g)
    ret.add_node(target_node_idx)
    for cn in targets:
        ret.add_edge(cn, target_node_idx)

    return ret


def _get_path(pred_list, dist_list):
    ''' Get the path from nx.bellman_ford()'s output '''

    # distances are negative
    assert all(dist_list[x] <= 0 for x in dist_list)
    # the node with the longest distance to the source is the target
    target = min(dist_list, key=lambda x: dist_list[x])

    ret = []
    cur = target
    while cur is not None:
        ret.append(cur)
        cur = pred_list[cur]
    return list(reversed(ret))


def _get_longest_paths(g, source_nodes):
    ''' Get the longest path for each node in 'source_nodes'.
    Found with bellman_ford() by setting every edge weight to -1.
    '''

    ng = copy.deepcopy(g)
    for u, v in ng.edges():
        ng[u][v]["weight"] = -1

    ret = {}
    for cn in source_nodes:
        pred, dist = nx.bellman_ford(ng, cn, weight="weight")
        path = _get_path(pred, dist)
        assert path[0] == cn
        assert len(path) - 1 == -dist[path[-1]]
        ret[cn] = path

    return ret

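# Worked example (illustrative, not part of the original memonger.py),
# assuming the networkx 1.x API used by this module: with all edge weights
# set to -1, Bellman-Ford's shortest paths are exactly the longest
# (most-hop) paths, since minimizing a sum of -1 weights maximizes hops.
def _example_longest_paths():
    g = nx.DiGraph()
    g.add_edges_from([(0, 1), (1, 3), (2, 3)])
    paths = _get_longest_paths(g, [0, 2])
    assert paths[0] == [0, 1, 3]  # source 0 reaches the sink in two hops
    assert paths[2] == [2, 3]     # source 2 reaches it in one
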

def _build_tree(paths):
    ''' Build a tree for the given paths based on common elements.
    The last elements of all paths are the same; that element becomes the
    root of the tree.
    '''
    assert all(cp[-1] == paths[0][-1] for cp in paths)
    g = nx.DiGraph()
    node_set = {y for x in paths for y in x}
    g.add_nodes_from(node_set)
    for cp in paths:
        for ce in zip(cp[0:-1], cp[1:]):
            g.add_edge(ce[1], ce[0])

    root = paths[0][-1]
    _compute_tree_height(g, root)

    return (g, root)


def _compute_tree_height(g, root):
    ''' Compute the height of the tree for all nodes.
    The height of a leaf is 0.
    '''
    def _get_height(root):
        children = g.successors(root)
        height = 0
        if children:
            child_heights = [_get_height(x) for x in children]
            height = max(child_heights) + 1
        g.node[root]["height"] = height
        return height

    _get_height(root)


def _sort_tree_leaves(g, root):
    ''' For each node, sort its child nodes based on their heights.
    Return the leaf nodes of the tree after sorting.
    '''
    def _get_height(root):
        return g.node[root]["height"]

    def _get_sorted_leaves(root):
        children = g.successors(root)
        if not children:
            return [root]
        child_heights = [_get_height(x) for x in children]
        order = sorted(range(len(children)), key=lambda x: child_heights[x])
        ret = []
        for co in order:
            cr = children[co]
            ret += _get_sorted_leaves(cr)

        return ret

    return _get_sorted_leaves(root)


def topological_sort_traversal_longest_path(g):
    ''' The graph 'g' may contain several source nodes (nodes without an
    incoming edge), which could appear in any order and still yield a valid
    topological sorting result. We would like to arrange these source nodes
    so that the average live spans of the computed blobs are shorter.
    The idea is to sort the source nodes by the length of their path to
    the target node, so that the one with the longer path is used first.
    This is done by:
    - Adding a single target node if there are multiple target nodes in 'g'.
    - Finding the longest path between each source and the target node.
    - Converting the longest paths to a tree, with the target node as the
      root and the source nodes as the leaves.
    - Sorting the nodes of the tree based on the height of the tree.
    '''
    gt = _add_single_target_ifneeded(g)
    source_nodes = _find_source_nodes(gt)
    lpaths = _get_longest_paths(gt, source_nodes)
    tree, root = _build_tree(list(lpaths.values()))
    sorted_sources = _sort_tree_leaves(tree, root)
    assert(sorted(sorted_sources) == sorted(source_nodes))

    ret = nx.topological_sort(g, sorted_sources)
    assert(len(ret) == len(g.node))
    return ret

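# Illustrative sketch (not part of the original memonger.py), again assuming
# networkx 1.x: sources 0 and 2 both feed sink 3, and the deeper source 0
# (path 0 -> 1 -> 3) ends up scheduled before the shallow source 2, which
# shortens the average live span of the blobs computed along the deep branch.
def _example_longest_path_ordering():
    g = nx.DiGraph()
    g.add_edges_from([(0, 1), (1, 3), (2, 3)])
    order = topological_sort_traversal_longest_path(g)
    assert order.index(0) < order.index(2)
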

def topological_sort_traversal(g):
    return nx.topological_sort(g)


def compute_ranges(linearized_ops, blob_sizes=None):
    if not blob_sizes:
        log.warning('Provide blob sizes to get more accurate assignments.')

    blobs = collections.defaultdict(
        lambda: LiveRange(defined=None, used=None, size=None))
    for i, op in enumerate(linearized_ops):
        for blob in op.input:
            used = blobs[blob].used
            if used is None:
                used = i
            else:
                used = max(used, i)
            blobs[blob] = blobs[blob]._replace(used=used)
            blob_size = blob_sizes[blob] if blob_sizes else None
            assert not blob_sizes or blob_size is not None
            blobs[blob] = blobs[blob]._replace(size=blob_size)
        for blob in op.output:
            defined = blobs[blob].defined
            if defined is None:
                defined = i
            else:
                defined = min(defined, i)
            blobs[blob] = blobs[blob]._replace(defined=defined)
            blob_size = blob_sizes[blob] if blob_sizes else None
            assert not blob_sizes or blob_size is not None
            blobs[blob] = blobs[blob]._replace(size=blob_size)

    return blobs

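# Illustrative sketch (not part of the original memonger.py): compute_ranges
# only reads op.input and op.output, so a namedtuple stands in for an
# OperatorDef here. Blob "b" is defined by op 0 and last used by op 2.
def _example_compute_ranges():
    FakeOp = collections.namedtuple('FakeOp', ['input', 'output'])
    ops = [
        FakeOp(input=["a"], output=["b"]),
        FakeOp(input=["b"], output=["c"]),
        FakeOp(input=["b", "c"], output=["d"]),
    ]
    ranges = compute_ranges(ops)
    assert ranges["b"] == LiveRange(defined=0, used=2, size=None)
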

def is_compatible(candidate_range, assignment, static_blobs):
    (name, range_) = assignment[-1]
    if name in static_blobs:
        return False
    if candidate_range.defined is None or range_.defined is None \
            or range_.used is None:
        return False
    return candidate_range.defined > range_.used


def compute_blob_assignments(assignments):
    blob_assignments = {}
    for assignment in assignments:
        if len(assignment) == 1:
            continue
        last_blob, _ = assignment[-1]
        for (blob, _) in assignment:
            blob_assignments[blob] = last_blob
    return blob_assignments


def _get_max_size(assignment):
    if not assignment:
        return 0
    ret = max([x[1].size for x in assignment])
    ret = 0 if ret is None else ret
    return ret


def get_memory_usage(assignments):
    ret = 0
    for cur in assignments:
        ret += _get_max_size(cur)
    return ret


def compute_assignments_greedy(ranges_sorted, init_assignments=None):
    assignments = init_assignments or []
    visited = {y[0] for x in assignments for y in x}

    for (name, range_) in ranges_sorted:
        if name in visited:
            continue
        assigned = False
        best_assignment = 0
        min_dist = float("inf")
        candidate_size = range_.size or 0
        for idx, assignment in enumerate(assignments):
            if is_compatible(range_, assignment, []):
                assigned = True
                dist = abs(_get_max_size(assignment) - candidate_size)
                if dist < min_dist:
                    min_dist = dist
                    best_assignment = idx
        if assigned:
            assignment = assignments[best_assignment]
            assignment.append((name, range_))
        else:
            assignments.append([(name, range_)])
    return assignments

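# Illustrative sketch (not part of the original memonger.py): three blobs
# with non-overlapping live ranges collapse into a single assignment, i.e.
# one shared buffer, since each is defined after the previous one is last
# used.
def _example_greedy_assignment():
    ranges_sorted = [
        ("a", LiveRange(defined=0, used=1, size=4)),
        ("b", LiveRange(defined=2, used=3, size=4)),
        ("c", LiveRange(defined=4, used=5, size=4)),
    ]
    assignments = compute_assignments_greedy(ranges_sorted)
    assert len(assignments) == 1 and len(assignments[0]) == 3
    assert get_memory_usage(assignments) == 4  # one 4-byte buffer in total
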

def _get_count(assignments):
    ''' Return the number of blobs in assignments '''
    if assignments:
        return sum([len(x) for x in assignments])
    return 0


def compute_assignments_dp(ranges_sorted, init_assignment, counter=None):
    ''' Compute the assignment for blobs in 'ranges_sorted' on top of
    'init_assignment' using dynamic programming + recursion.

    ranges_sorted: blobs sorted by 'used'
    init_assignment: assignment to start with; blobs in 'ranges_sorted'
    should not be used in 'init_assignment'

    Using f(b, k, init) to represent the best assignment for blobs b[0:k]
    given the initial assignment 'init', we have
        f(b, k, init) = f(b, j, init) +
                        find_best(b[j:k], f(b, j, init))
    where j is the index of the last best assignment that is independent of
    blob b[k - 1] (b[k - 1] is compatible with all assignments in
    f(b, j, init)), and find_best(b1, init1) gives the best assignment
    for blobs in 'b1' based on the initial assignment 'init1'; blobs
    b1[0:-1] should be incompatible with b1[-1]. f(b, len(b), []) gives
    the best assignment for blobs 'b'.

    For find_best(b, init), since b[0:-1] are not compatible with b[-1], we
    can reduce it to the smaller problem of finding the best assignment for
    b[0:-1]:
        find_best(b, init) = min {
            f(b[0:-1], len(b) - 1, init - x) + [x, b[-1]] for x in init, or
            f(b[0:-1], len(b) - 1, init) + [b[-1]]
        }
    where min{} gives the assignment with the minimum memory usage.
    '''

    def _get_compatible_prev(candidate_range, best_assignments, cur_idx):
        ''' Find the closest position k in best_assignments that is
        independent of candidate_range, i.e. candidate_range is compatible
        with all assignments in best_assignments[k].
        Return -1 if not found.
        '''
        def is_compatible_all(candidate_range, assignments):
            ''' Return True if compatible with all assignments in 'assignments' '''
            return all([is_compatible(candidate_range[1], x, []) for x in assignments])

        ii = cur_idx - 1
        while ii >= 0:
            cba = best_assignments[ii]
            if is_compatible_all(candidate_range, cba):
                return ii
            ii -= 1
        return -1

    def _find_best(ranges, init_assignment, prev_best_assignment, counter):
        ''' Find the best assignment for blobs 'ranges' given an initial
        assignment 'init_assignment'.

        Blobs in ranges[0:-1] should be incompatible with blob ranges[-1].
        'prev_best_assignment': the best assignment for blobs in ranges[:-1]

        By assigning ranges[-1] to each assignment k in 'init_assignment' or
        to a new assignment, the problem becomes the smaller problem of
        finding the best assignment for ranges[0:-1] given the initial
        assignment init_assignment[0:k, (k+1):-1].
        '''
        # Blob to check
        find_range = ranges[-1]
        # Blobs in ranges[0:-1] are incompatible with ranges[-1], so that we
        # can reduce it to a smaller problem.
        assert all(not is_compatible(x[1], [find_range], []) for x in ranges[0:-1])

        sz = len(init_assignment)
        best_candidates = []
        # Try to assign 'find_range' to each assignment in init_assignment
        for ii in range(sz):
            if not is_compatible(find_range[1], init_assignment[ii], []):
                continue
            cur_best = copy.deepcopy(init_assignment)
            cur_best[ii].append(find_range)
            if len(ranges) > 1:
                cur_best_tmp = [x for i, x in enumerate(cur_best) if i != ii]
                # reduce to a smaller dp problem
                cur_best_tmp = compute_assignments_dp(
                    ranges[:-1], cur_best_tmp, counter)
                cur_best = cur_best_tmp + [cur_best[ii]]
            best_candidates.append(cur_best)
        # Try to put 'find_range' in a new assignment
        best_candidates.append(prev_best_assignment + [[find_range]])

        ret = min(best_candidates, key=lambda x: get_memory_usage(x))
        return ret

    if not counter:
        counter = [0]
    counter[0] += 1

    if counter and counter[0] % 5000 == 0:
        rs = [ranges_sorted[0][1].defined, ranges_sorted[-1][1].used]
        log.info('Finding assignments {} ({} -> {})...'.format(
            counter[0], rs[0], rs[1]))

    init_assignment = init_assignment or []
    # best_assignments[k]: best assignments for the first (k+1) blobs,
    # ranges_sorted[0:(k+1)]
    best_assignments = []
    # Find the best assignment for blobs ranges_sorted[0:ii]
    for ii, cur_range in enumerate(ranges_sorted):
        # closest best_assignment that is independent of ranges_sorted[ii]
        prev_idx = _get_compatible_prev(cur_range, best_assignments, ii)
        prev_best = copy.deepcopy(init_assignment) if prev_idx < 0 else \
            copy.deepcopy(best_assignments[prev_idx])
        # Need to find the best assignment for blobs in 'ranges_part'
        ranges_part = ranges_sorted[(prev_idx + 1):(ii + 1)]
        cur_best = _find_best(
            ranges_part, prev_best,
            best_assignments[-1] if best_assignments else init_assignment,
            counter)
        assert _get_count(cur_best) == _get_count(prev_best) + len(ranges_part)
        best_assignments.append(copy.deepcopy(cur_best))

    assert len(best_assignments) == len(ranges_sorted)

    best = best_assignments[-1]

    return best

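# Illustrative sketch (not part of the original memonger.py): "b" and "a"
# overlap, so two buffers are needed; "c" fits after either one, and the DP
# chooses the grouping with the lower total memory, pairing the two size-8
# blobs so that only one extra 2-byte buffer is kept.
def _example_dp_assignment():
    ranges_sorted = [  # sorted by 'used'
        ("b", LiveRange(defined=1, used=2, size=2)),
        ("a", LiveRange(defined=0, used=3, size=8)),
        ("c", LiveRange(defined=4, used=5, size=8)),
    ]
    best = compute_assignments_dp(ranges_sorted, [])
    assert get_memory_usage(best) == 10  # max(2) + max(8, 8)
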

def get_updated_ranges(ranges, max_live=None):
    ''' Set LiveRange.defined = -1 if it is None
    Set LiveRange.used = max_live if it is None
    Set LiveRange.size = 1 if it is None
    '''

    def _get_max_live(ranges):
        max_live = max(x[1].used for x in ranges if x[1].used) + 1
        return max_live

    def _update_range(x, max_live, size):
        cx = x
        if x[1].defined is None:
            cx = (cx[0], cx[1]._replace(defined=-1))
        if x[1].used is None:
            cx = (cx[0], cx[1]._replace(used=max_live))
        if x[1].size is None:
            cx = (cx[0], cx[1]._replace(size=size))
        return cx

    if max_live is None:
        max_live = _get_max_live(ranges)
    ranges = [_update_range(x, max_live, 1) for x in ranges]

    return ranges


def compute_assignments(ranges, static_blobs, algo):
    '''
    algo: the method used to find assignments (AssignmentAlgorithm.GREEDY
    or AssignmentAlgorithm.DYNAMIC_PROGRAMMING).
    AssignmentAlgorithm.DYNAMIC_PROGRAMMING gives the optimal solution at
    the cost of more computation.
    AssignmentAlgorithm.GREEDY may be better when 'blob_sizes' is not
    provided.
    '''

    # Sort the ranges based on when they are last used.
    # If LiveRange.used is None, then the blob is never used and could
    # be consumed externally. Sort these to the end of the list as opposed
    # to the beginning so that they can be shared as well.
    ranges = sorted(
        list(ranges.items()),
        key=lambda p: (p[1].used is None, p[1].used),
    )
    # Update None values
    ranges = get_updated_ranges(ranges)

    # Sharable blobs
    ranges_sharable = [x for x in ranges if x[0] not in static_blobs]
    # Static blobs, not sharable
    ranges_static = [x for x in ranges if x[0] in static_blobs]

    log.info("Total sharable blobs {}".format(len(ranges_sharable)))

    best_assignment = []
    if algo == AssignmentAlgorithm.DYNAMIC_PROGRAMMING:
        best_assignment = compute_assignments_dp(ranges_sharable, [])
    elif algo == AssignmentAlgorithm.GREEDY:
        best_assignment = compute_assignments_greedy(ranges_sharable, [])
    else:
        assert False, "Invalid algo name {}".format(algo)
    best_assignment += [[x] for x in ranges_static]

    # verify_assignments(best_assignment)

    return best_assignment


def verify_assignments(assignments):
    for cur in assignments:
        for x, y in zip(cur[0:-1], cur[1:]):
            assert x[1].used < y[1].defined

def compute_interference_graph(ops):
    g = nx.DiGraph()
    for i, op in enumerate(ops):
        g.add_node(i, op=op)
    for i, parent_op in enumerate(ops):
        for j, child_op in enumerate(ops):
            if i == j:
                continue
            if any(output in child_op.input for output in parent_op.output):
                deps = set(child_op.input).intersection(parent_op.output)
                g.add_edge(i, j, deps=deps)
                assert nx.is_directed_acyclic_graph(g), child_op
    return g

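# Illustrative sketch (not part of the original memonger.py): an edge i -> j
# is added whenever op j consumes an output of op i, annotated with the
# connecting blobs. A namedtuple stands in for an OperatorDef here.
def _example_interference_graph():
    FakeOp = collections.namedtuple('FakeOp', ['input', 'output'])
    ops = [
        FakeOp(input=["data"], output=["h"]),
        FakeOp(input=["h"], output=["out"]),
    ]
    g = compute_interference_graph(ops)
    assert g[0][1]["deps"] == {"h"}
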

Optimization = collections.namedtuple(
    'Optimization', ['net', 'assignments', 'blob_assignments'])


def apply_assignments(net, blob_assignments):
    def canonical_name(blob):
        if blob not in blob_assignments:
            return blob
        return blob_assignments[blob]

    for op in net.op:
        # Descend into the subnets of a recurrent network
        if op.type.startswith('RecurrentNetwork'):
            apply_recurrent_blob_assignments(op, blob_assignments, canonical_name)

        for i, input_ in enumerate(op.input):
            op.input[i] = canonical_name(input_)
        for i, output in enumerate(op.output):
            op.output[i] = canonical_name(output)


def apply_recurrent_blob_assignments(op, blob_assignments, canonical_name):
    log.debug("Applying assignments to recurrent op: {}".format(op.type))
    import google.protobuf.text_format as protobuftx
    step_args = [a for a in op.arg if a.name.endswith("step_net")]
    for step_arg in step_args:
        step_proto = caffe2_pb2.NetDef()
        protobuftx.Merge(step_arg.s, step_proto)
        apply_assignments(step_proto, blob_assignments)
        for i, einp in enumerate(step_proto.external_input):
            if einp in blob_assignments:
                step_proto.external_input[i] = canonical_name(einp)
        step_arg.s = str(step_proto)
    # Store the renamings
    for blob, renamed in blob_assignments.items():
        if blob in list(op.input) + list(op.output):
            a = caffe2_pb2.Argument()
            a.name = blob + ".rename"
            a.s = str(renamed)
            op.arg.extend([a])


class AssignmentAlgorithm(enum.Enum):
    GREEDY = 0
    DYNAMIC_PROGRAMMING = 1


def optimize_interference(net, static_blobs,
                          ordering_function=topological_sort_traversal,
                          blob_sizes=None,
                          algo=AssignmentAlgorithm.GREEDY):
    """
    ordering_function: topological_sort_traversal or
    topological_sort_traversal_longest_path.
    topological_sort_traversal_longest_path gives better results but needs
    a bit more computation.
    algo: the method used to find assignments (AssignmentAlgorithm.GREEDY
    or AssignmentAlgorithm.DYNAMIC_PROGRAMMING).
    AssignmentAlgorithm.DYNAMIC_PROGRAMMING gives the optimal solution at
    the cost of more computation.
    AssignmentAlgorithm.GREEDY may be better when 'blob_sizes' is not
    provided.

    The optimization proceeds as follows:
    1) Use a BFS traversal of the execution graph to generate an
       ordering of the node executions.
    2) Generate use-def ranges for each `blob` in the BFS traversal
       order.
    3) Assign blobs to `canonical blobs`.
    4) Rename blobs to canonical blobs.
    """
    net = copy.deepcopy(net)
    g = compute_interference_graph(net.op)
    ordering = ordering_function(g)
    linearized_ops = [net.op[i] for i in ordering]

    # Reorder the ops in the net based on the computed linearized order.
    # If the graph has multiple topological orderings and if the NetDef's
    # ordering differs from the order used to compute ranges, then the
    # runtime might end up overwriting blobs before they are used.
    del net.op[:]
    net.op.extend(linearized_ops)

    ranges = compute_ranges(linearized_ops, blob_sizes)
    assignments = compute_assignments(ranges, static_blobs, algo)
    blob_assignments = compute_blob_assignments(assignments)
    apply_assignments(net, blob_assignments)
    return Optimization(
        net=net,
        blob_assignments=blob_assignments,
        assignments=assignments)

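# Illustrative sketch (not part of the original memonger.py): running the
# interference-based optimizer on a small inference chain. Names are
# hypothetical; "data" and "out" are declared static so they are never
# renamed, while the intermediate activations may share a canonical blob.
def _example_optimize_interference():
    from caffe2.python import core

    net = core.Net("opt_example")
    h1 = net.Relu("data", "h1")
    h2 = net.Relu(h1, "h2")
    h3 = net.Relu(h2, "h3")
    net.Relu(h3, "out")

    optimization = optimize_interference(
        net.Proto(), static_blobs=["data", "out"])
    # optimization.net is the rewritten NetDef; blob_assignments maps
    # original blob names to their canonical replacements.
    return optimization.blob_assignments
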

Statistics = collections.namedtuple(
    'Statistics', ['baseline_nbytes', 'optimized_nbytes'])


def compute_statistics(assignments):
    def blob_nbytes(blob):
        return workspace.FetchBlob(blob).nbytes
    blob_bytes = {
        blob: blob_nbytes(blob) for assignment in assignments
        for (blob, _) in assignment}
    baseline_nbytes = sum(v for _, v in blob_bytes.items())
    optimized_nbytes = sum(
        max(blob_bytes[blob] for (blob, _) in assignment)
        for assignment in assignments)
    return Statistics(
        baseline_nbytes=baseline_nbytes,
        optimized_nbytes=optimized_nbytes)

def collect_blob_sizes(net):
    ''' Collect blob sizes from the workspace '''
    def blob_nbytes(blob):
        return workspace.FetchBlob(blob).nbytes

    blobs = {}
    for op in net.op:
        for blob in op.input:
            blobs[blob] = blob_nbytes(blob)
        for blob in op.output:
            blobs[blob] = blob_nbytes(blob)

    return blobs