Caffe2 - Python API
A deep learning, cross-platform ML framework
muji.py
"""muji.py does multi-gpu training for caffe2 with no need to change the C++
side code. Everything is defined at the computation graph level.

Currently we only support the following use cases:
 - 2 gpus, where peer access is enabled between them.
 - 4 gpus, where peer access is enabled between all of them.
 - 8 gpus, where peer access is enabled in two groups,
   between {0, 1, 2, 3} and {4, 5, 6, 7}.
"""

from caffe2.python import core
from caffe2.proto import caffe2_pb2


def OnGPU(gpu_id):
    """A utility function that returns a device option protobuf of the
    specified gpu id.
    """
    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = caffe2_pb2.CUDA
    device_option.cuda_gpu_id = gpu_id
    return device_option

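# For example, OnGPU(1) returns a DeviceOption with device_type == CUDA and
# cuda_gpu_id == 1; passing it as the device_option keyword of an operator
# call, as the functions below do, pins that operator to gpu 1.
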
def OnCPU():
    """A utility function that returns a device option protobuf for the CPU."""
    device_option = caffe2_pb2.DeviceOption()
    device_option.device_type = caffe2_pb2.CPU
    return device_option


def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
    """The general Allreduce interface that dispatches to the specialized
    implementation for the given number of blobs.
    """
    if gpu_indices is None:
        gpu_indices = range(len(blobs))
    if len(gpu_indices) != len(blobs):
        raise RuntimeError(
            "gpu_indices length and blobs length mismatch: %d vs %d" %
            (len(gpu_indices), len(blobs))
        )
    if len(blobs) == 2:
        return Allreduce2(net, blobs, reduced_affix, gpu_indices)
    elif len(blobs) == 4:
        return Allreduce4(net, blobs, reduced_affix, gpu_indices)
    elif len(blobs) == 8:
        return Allreduce8(net, blobs, reduced_affix, gpu_indices)
    else:
        return AllreduceFallback(net, blobs, reduced_affix, gpu_indices)

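# A minimal usage sketch (hypothetical; the blob names are made up): given
# per-gpu gradient blobs "g_0" and "g_1" living on gpus 0 and 1, one might
# write
#
#     net = core.Net("allreduce_example")
#     g0_r, g1_r = Allreduce(net, ["g_0", "g_1"], gpu_indices=[0, 1])
#
# which dispatches to Allreduce2 and returns blobs "g_0_reduced" and
# "g_1_reduced", each holding the sum g_0 + g_1 on its own gpu.
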
def Allreduce2(net, blobs, reduced_affix, gpu_indices):
    """Allreduce for 2 gpus.

    Algorithm: 0r <- 0 + 1, 1r <- 0r, where r means "reduced"
    """
    a, b = blobs
    gpu_a, gpu_b = gpu_indices
    a_reduced = net.Add([a, b], a + reduced_affix, device_option=OnGPU(gpu_a))
    b_reduced = a_reduced.Copy(
        [],
        b + reduced_affix,
        device_option=OnGPU(gpu_b)
    )
    return a_reduced, b_reduced

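# Schematically, for blobs (a, b) on gpus (0, 1), Allreduce2 emits two ops:
#     Add(a, b) -> a_reduced          (on gpu 0, reading b via peer access)
#     Copy(a_reduced) -> b_reduced    (on gpu 1)
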
def Allreduce4(net, blobs, reduced_affix, gpu_indices):
    """Allreduce for 4 gpus.

    Algorithm: 2 level reduction.
        0r <- 0 + 1, 2r <- 2 + 3
        0r <- 0r + 2r
        2r <- 0r,
        1r <- 0r, 3r <- 2r
    """
    a, b, c, d = blobs
    gpu_a, gpu_b, gpu_c, gpu_d = gpu_indices
    # a_reduced <- a + b, c_reduced <- c + d
    a_reduced = net.Add(
        [a, b],
        str(a) + reduced_affix,
        device_option=OnGPU(gpu_a)
    )
    c_reduced = net.Add(
        [c, d],
        str(c) + reduced_affix,
        device_option=OnGPU(gpu_c)
    )
    # a_reduced <- a_reduced + c_reduced
    a_reduced = a_reduced.Add(c_reduced, a_reduced, device_option=OnGPU(gpu_a))
    # broadcast a_reduced to c_reduced
    c_reduced = a_reduced.Copy([], c_reduced, device_option=OnGPU(gpu_c))
    # broadcast to b and d
    b_reduced = a_reduced.Copy(
        [],
        str(b) + reduced_affix,
        device_option=OnGPU(gpu_b)
    )
    d_reduced = c_reduced.Copy(
        [],
        str(d) + reduced_affix,
        device_option=OnGPU(gpu_d)
    )
    return a_reduced, b_reduced, c_reduced, d_reduced

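# Note: Allreduce4 emits six operators in total: three Adds (two within the
# pairs, one across them) and three Copies for the broadcast, all relying on
# the all-pairs peer access assumed for the 4-gpu case.
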
def Allreduce8(net, blobs, reduced_affix, gpu_indices):
    """Allreduce for 8 gpus.

    Algorithm: 3 level reduction.
        0r <- 0 + 1, 2r <- 2 + 3, 4r <- 4 + 5, 6r <- 6 + 7
        0r <- 0r + 2r, 4r <- 4r + 6r
        0r <- 0r + 4r
        4r <- 0r
        2r <- 0r, 6r <- 4r
        1r <- 0r, 3r <- 2r, 5r <- 4r, 7r <- 6r
    """
    reduced = [None] * 8
    # Reduction level 1
    for i in [0, 2, 4, 6]:
        reduced[i] = net.Add(
            [blobs[i], blobs[i + 1]],
            blobs[i] + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    # Reduction level 2
    for i in [0, 4]:
        reduced[i] = net.Add(
            [reduced[i], reduced[i + 2]],
            str(blobs[i]) + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    # Reduction level 3: this involves a copy.
    reduced_4_copy = reduced[4].Copy(
        [],
        str(reduced[4]) + '_copy',
        device_option=OnGPU(gpu_indices[0])
    )
    reduced[0] = reduced[0].Add(
        reduced_4_copy,
        reduced[0],
        device_option=OnGPU(gpu_indices[0])
    )
    # Broadcast level 1
    reduced[4] = reduced[0].Copy(
        [],
        reduced[4],
        device_option=OnGPU(gpu_indices[4])
    )
    # Broadcast level 2
    for i in [2, 6]:
        reduced[i] = reduced[i - 2].Copy(
            [],
            reduced[i],
            device_option=OnGPU(gpu_indices[i])
        )
    # Broadcast level 3
    for i in [1, 3, 5, 7]:
        reduced[i] = reduced[i - 1].Copy(
            [],
            blobs[i] + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    return reduced

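# Note: gpus {0, 1, 2, 3} and {4, 5, 6, 7} form the two peer-access groups, so
# the only cross-group transfers in Allreduce8 are the explicit reduced[4]
# copy before the final Add (reduction level 3) and the reduced[0] ->
# reduced[4] broadcast copy (broadcast level 1); every other Add and Copy
# stays within a single group.
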
def AllreduceFallback(net, blobs, reduced_affix, gpu_indices):
    """A fallback option for Allreduce with no assumption on p2p.

    Algorithm: a flat operation on gpu 0
        0r <- 0
        0r <- 0r + i for i in gpu_indices[1:]
        ir <- 0r for i in gpu_indices[1:]
    """
    reduced = [None] * len(gpu_indices)
    # copy first
    reduced[0] = net.Copy(
        blobs[0],
        blobs[0] + reduced_affix,
        device_option=OnGPU(gpu_indices[0])
    )
    # do temp copy and add
    temp_name = reduced[0] + '_temp_copy'
    for i in range(1, len(gpu_indices)):
        temp = net.Copy(
            blobs[i],
            temp_name,
            device_option=OnGPU(gpu_indices[0])
        )
        reduced[0] = reduced[0].Add(
            temp,
            reduced[0],
            device_option=OnGPU(gpu_indices[0])
        )
    # Broadcast to everyone else
    for i in range(1, len(gpu_indices)):
        reduced[i] = net.Copy(
            reduced[0],
            blobs[i] + reduced_affix,
            device_option=OnGPU(gpu_indices[i])
        )
    return reduced
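
# An end-to-end sketch (hypothetical; requires a machine that actually has
# the gpus and the peer access pattern described in the module docstring):
#
#     from caffe2.python import core, workspace
#     from caffe2.python.muji import Allreduce, OnGPU
#
#     net = core.Net("allreduce_test")
#     blobs = [
#         net.ConstantFill([], "x_%d" % i, shape=[8], value=float(i),
#                          device_option=OnGPU(i))
#         for i in range(4)
#     ]
#     Allreduce(net, blobs)
#     workspace.RunNetOnce(net)
#     # each "x_i_reduced" blob now holds 0 + 1 + 2 + 3 == 6 in every element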