3 """muji.py does multi-gpu training for caffe2 with no need to change the c++ 4 side code. Everything is defined on the computation graph level. 6 Currently, here are the assumptions: we only support the following use cases: 7 - 2 gpus, where peer access is enabled between them. 8 - 4 gpus, where peer access are enabled between all of them. 9 - 8 gpus, where peer access are enabled in two groups, 10 between {1, 2, 3, 4} and {5, 6, 7, 8}. 13 from caffe2.python
import core
14 from caffe2.proto
import caffe2_pb2
18 """A utility function that returns a device option protobuf of the 21 device_option = caffe2_pb2.DeviceOption()
22 device_option.device_type = caffe2_pb2.CUDA
23 device_option.cuda_gpu_id = gpu_id
28 device_option = caffe2_pb2.DeviceOption()
29 device_option.device_type = caffe2_pb2.CPU
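
# A minimal usage sketch for the helpers above (illustrative, assuming the
# caffe2 python bindings are available; "param" is a made-up blob name):
#
#   net = core.Net("example")
#   net.ConstantFill([], "param", shape=[10], value=1.0,
#                    device_option=OnGPU(0))   # op pinned to gpu 0
#   net.ConstantFill([], "param_cpu", shape=[10], value=1.0,
#                    device_option=OnCPU())    # op pinned to the CPU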


def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
    """The general Allreduce interface that reroutes the function calls."""
    if gpu_indices is None:
        gpu_indices = range(len(blobs))
    if len(gpu_indices) != len(blobs):
        raise RuntimeError(
            "gpu_indices length and blobs length mismatch: %d vs %d" %
            (len(gpu_indices), len(blobs))
        )
    if len(blobs) == 2:
        return Allreduce2(net, blobs, reduced_affix, gpu_indices)
    elif len(blobs) == 4:
        return Allreduce4(net, blobs, reduced_affix, gpu_indices)
    elif len(blobs) == 8:
        return Allreduce8(net, blobs, reduced_affix, gpu_indices)
    else:
        return AllreduceFallback(net, blobs, reduced_affix, gpu_indices)
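
# Dispatch example (a sketch; "grad_gpu0"/"grad_gpu1" are illustrative names
# for gradient blobs that earlier operators produced on gpus 0 and 1):
#
#   a_r, b_r = Allreduce(net, ["grad_gpu0", "grad_gpu1"], gpu_indices=[0, 1])
#   # Two blobs routes to Allreduce2; each gpu ends up with an identical
#   # "_reduced" copy of the summed gradients.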
54 """Allreduce for 2 gpus. 56 Algorithm: 0r <- 0 + 1, 1r <- 0r, where r means "reduced" 59 gpu_a, gpu_b = gpu_indices
60 a_reduced = net.Add([a, b], a + reduced_affix, device_option=
OnGPU(gpu_a))
61 b_reduced = a_reduced.Copy(
64 device_option=
OnGPU(gpu_b)
66 return a_reduced, b_reduced
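
# At the operator level this adds just two ops (illustrative blob names):
#   Add(["a", "b"]) -> "a_reduced"    on gpu_a, reading "b" over p2p
#   Copy("a_reduced") -> "b_reduced"  on gpu_b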
70 """Allreduce for 4 gpus. 72 Algorithm: 2 level reduction. 73 0r <- 0 + 1, 2r <- 2 + 3 79 gpu_a, gpu_b, gpu_c, gpu_d = gpu_indices
83 str(a) + reduced_affix,
84 device_option=
OnGPU(gpu_a)
88 str(c) + reduced_affix,
89 device_option=
OnGPU(gpu_c)
92 a_reduced = a_reduced.Add(c_reduced, a_reduced, device_option=
OnGPU(gpu_a))
94 c_reduced = a_reduced.Copy([], c_reduced, device_option=
OnGPU(gpu_c))
96 b_reduced = a_reduced.Copy(
98 str(b) + reduced_affix,
99 device_option=
OnGPU(gpu_b)
101 d_reduced = c_reduced.Copy(
103 str(d) + reduced_affix,
104 device_option=
OnGPU(gpu_d)
106 return a_reduced, b_reduced, c_reduced, d_reduced
110 """Allreduce for 8 gpus. 112 Algorithm: 3 level reduction. 113 0r <- 0 + 1, 2r <- 2 + 3, 4r <- 4 + 5, 6r <- 6 + 7 114 0r <- 0r + 2r, 4r <- 4r + 6r 118 1r <- 0r, 3r <- 2r, 5r <- 4r, 7r <- 6r 122 for i
in [0, 2, 4, 6]:
123 reduced[i] = net.Add(
124 [blobs[i], blobs[i + 1]],
125 blobs[i] + reduced_affix,
126 device_option=
OnGPU(gpu_indices[i])
130 reduced[i] = net.Add(
131 [reduced[i], reduced[i + 2]],
132 str(blobs[i]) + reduced_affix,
133 device_option=
OnGPU(gpu_indices[i])
136 reduced_4_copy = reduced[4].Copy(
138 str(reduced[4]) +
'_copy',
139 device_option=
OnGPU(gpu_indices[0])
141 reduced[0] = reduced[0].Add(
144 device_option=
OnGPU(gpu_indices[0])
147 reduced[4] = reduced[0].Copy(
150 device_option=
OnGPU(gpu_indices[4])
154 reduced[i] = reduced[i - 2].Copy(
157 device_option=
OnGPU(gpu_indices[i])
160 for i
in [1, 3, 5, 7]:
161 reduced[i] = reduced[i - 1].Copy(
163 blobs[i] + reduced_affix,
164 device_option=
OnGPU(gpu_indices[i])
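
# Note how the communication pattern matches the p2p assumption in the module
# docstring: the first four and last four entries of gpu_indices form the two
# peer-access groups, and the boundary is crossed exactly once in each
# direction (reduced[4] is copied onto gpu 0 before the final Add, and the
# total is copied back in broadcast level 1).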
170 """A fallback option for Allreduce with no assumption on p2p. 172 Algorithm: a flat operation on gpu 0 174 0r <- 0r + i for i in gpu_indices[1:] 175 ir <- 0r for i in gpu_indices[1:] 177 reduced = [
None] * len(gpu_indices)
179 reduced[0] = net.Copy(
181 blobs[0] + reduced_affix,
182 device_option=
OnGPU(gpu_indices[0])
185 temp_name = reduced[0] +
'_temp_copy' 186 for i
in range(1, len(gpu_indices)):
190 device_option=
OnGPU(gpu_indices[0])
192 reduced[0] = reduced[0].Add(
195 device_option=
OnGPU(gpu_indices[0])
198 for i
in range(1, len(gpu_indices)):
199 reduced[i] = net.Copy(
201 blobs[i] + reduced_affix,
202 device_option=
OnGPU(gpu_indices[i])
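
# End-to-end sketch (illustrative names; 3 blobs is none of 2/4/8, so
# Allreduce dispatches to AllreduceFallback above):
#
#   from caffe2.python import core, workspace
#   net = core.Net("allreduce_test")
#   for i in range(3):
#       net.ConstantFill([], "g_%d" % i, shape=[16], value=float(i),
#                        device_option=OnGPU(i))
#   reduced = Allreduce(net, ["g_%d" % i for i in range(3)])
#   workspace.RunNetOnce(net)
#   # each "g_i_reduced" blob now holds the elementwise sum 0 + 1 + 2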