Caffe2 - C++ API
A deep learning, cross platform ML framework
common_gpu.h
1 #ifndef CAFFE2_CORE_COMMON_GPU_H_
2 #define CAFFE2_CORE_COMMON_GPU_H_
3 
4 #include <assert.h>
5 #include <cublas_v2.h>
6 #include <cuda.h>
7 #include <cuda_runtime.h>
8 #include <curand.h>
9 #include <driver_types.h> // cuda driver types
10 
11 #include "caffe2/core/logging.h"
12 #include "caffe2/core/common.h"
13 
14 // This is a macro defined for cuda fp16 support. In default, cuda fp16 is
15 // supported by NVCC 7.5, but it is also included in the Tegra X1 platform with
16 // a (custom?) NVCC 7.0. As a result, we would normally just check the cuda
17 // version here, but would also allow a use to pass in the flag
18 // CAFFE_HAS_CUDA_FP16 manually.
19 
20 #ifndef CAFFE_HAS_CUDA_FP16
21 #if CUDA_VERSION >= 7050
22 #define CAFFE_HAS_CUDA_FP16
23 #endif // CUDA_VERSION >= 7050
24 #endif // CAFFE_HAS_CUDA_FP16
25 
26 #ifdef CAFFE_HAS_CUDA_FP16
27 #include <cuda_fp16.h>
28 #endif
29 
33 #define CAFFE2_COMPILE_TIME_MAX_GPUS 16
34 
41 #define CAFFE2_CUDA_MAX_PEER_SIZE 8
42 
43 namespace caffe2 {
44 
48 inline int CudaVersion() { return CUDA_VERSION; }
49 
53 int NumCudaDevices();
54 
64 inline bool HasCudaGPU() { return NumCudaDevices() > 0; }
65 
73 void SetDefaultGPUID(const int deviceid);
74 
78 int GetDefaultGPUID();
79 
83 int GetCurrentGPUID();
84 
88 int GetGPUIDForPointer(const void* ptr);
89 
93 const cudaDeviceProp& GetDeviceProperty(const int device);
94 
98 void DeviceQuery(const int deviceid);
99 
107 bool GetCudaPeerAccessPattern(vector<vector<bool> >* pattern);
108 
112 const char* cublasGetErrorString(cublasStatus_t error);
113 
117 const char* curandGetErrorString(curandStatus_t error);
118 
119 // CUDA: various checks for different function calls.
120 #define CUDA_ENFORCE(condition) \
121  do { \
122  cudaError_t error = condition; \
123  CAFFE_ENFORCE_EQ( \
124  error, \
125  cudaSuccess, \
126  "Error at: ", \
127  __FILE__, \
128  ":", \
129  __LINE__, \
130  ": ", \
131  cudaGetErrorString(error)); \
132  } while (0)
133 #define CUDA_CHECK(condition) \
134  do { \
135  cudaError_t error = condition; \
136  CHECK(error == cudaSuccess) << cudaGetErrorString(error); \
137  } while (0)
138 
139 #define CUDA_DRIVERAPI_ENFORCE(condition) \
140  do { \
141  CUresult result = condition; \
142  if (result != CUDA_SUCCESS) { \
143  const char* msg; \
144  cuGetErrorName(result, &msg); \
145  CAFFE_THROW("Error at: ", __FILE__, ":", __LINE__, ": ", msg); \
146  } \
147  } while (0)
148 #define CUDA_DRIVERAPI_CHECK(condition) \
149  do { \
150  CUresult result = condition; \
151  if (result != CUDA_SUCCESS) { \
152  const char* msg; \
153  cuGetErrorName(result, &msg); \
154  LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \
155  << msg; \
156  } \
157  } while (0)
158 
159 #define CUBLAS_ENFORCE(condition) \
160  do { \
161  cublasStatus_t status = condition; \
162  CAFFE_ENFORCE_EQ( \
163  status, \
164  CUBLAS_STATUS_SUCCESS, \
165  "Error at: ", \
166  __FILE__, \
167  ":", \
168  __LINE__, \
169  ": ", \
170  ::caffe2::cublasGetErrorString(status)); \
171  } while (0)
172 #define CUBLAS_CHECK(condition) \
173  do { \
174  cublasStatus_t status = condition; \
175  CHECK(status == CUBLAS_STATUS_SUCCESS) \
176  << ::caffe2::cublasGetErrorString(status); \
177  } while (0)
178 
179 #define CURAND_ENFORCE(condition) \
180  do { \
181  curandStatus_t status = condition; \
182  CAFFE_ENFORCE_EQ( \
183  status, \
184  CURAND_STATUS_SUCCESS, \
185  "Error at: ", \
186  __FILE__, \
187  ":", \
188  __LINE__, \
189  ": ", \
190  ::caffe2::curandGetErrorString(status)); \
191  } while (0)
192 #define CURAND_CHECK(condition) \
193  do { \
194  curandStatus_t status = condition; \
195  CHECK(status == CURAND_STATUS_SUCCESS) \
196  << ::caffe2::curandGetErrorString(status); \
197  } while (0)
198 
199 #define CUDA_1D_KERNEL_LOOP(i, n) \
200  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
201  i < (n); \
202  i += blockDim.x * gridDim.x)
203 
204 // CUDA_KERNEL_ASSERT is a macro that wraps an assert() call inside cuda
205 // kernels. This is not supported by Apple platforms so we special case it.
206 // See http://docs.nvidia.com/cuda/cuda-c-programming-guide/#assertion
207 #ifdef __APPLE__
208 #define CUDA_KERNEL_ASSERT(...)
209 #else // __APPLE__
210 #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
211 #endif // __APPLE__
212 
213 // The following helper functions are here so that you can write a kernel call
214 // when you are not particularly interested in maxing out the kernels'
215 // performance. Usually, this will give you a reasonable speed, but if you
216 // really want to find the best performance, it is advised that you tune the
217 // size of the blocks and grids more reasonably.
218 // A legacy note: this is derived from the old good Caffe days, when I simply
219 // hard-coded the number of threads and wanted to keep backward compatibility
220 // for different computation capabilities.
221 // For more info on CUDA compute capabilities, visit the NVidia website at:
222 // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
223 
224 // The number of cuda threads to use. 512 is used for backward compatibility,
225 // and it is observed that setting it to 1024 usually does not bring much
226 // performance gain (which makes sense, because warp size being 32 means that
227 // blindly setting a huge block for a random kernel isn't optimal).
228 constexpr int CAFFE_CUDA_NUM_THREADS = 512;
229 // The maximum number of blocks to use in the default kernel call. We set it to
230 // 4096 which would work for compute capability 2.x (where 65536 is the limit).
231 // This number is very carelessly chosen. Ideally, one would like to look at
232 // the hardware at runtime, and pick the number of blocks that makes most
233 // sense for the specific runtime environment. This is a todo item.
234 constexpr int CAFFE_MAXIMUM_NUM_BLOCKS = 4096;
235 
239 inline int CAFFE_GET_BLOCKS(const int N) {
240  return std::min((N + CAFFE_CUDA_NUM_THREADS - 1) / CAFFE_CUDA_NUM_THREADS,
241  CAFFE_MAXIMUM_NUM_BLOCKS);
242 }
243 
244 class DeviceGuard {
245  public:
246  explicit DeviceGuard(int newDevice)
247  : previous_(GetCurrentGPUID()) {
248  if (previous_ != newDevice) {
249  CUDA_ENFORCE(cudaSetDevice(newDevice));
250  }
251  }
252 
253  ~DeviceGuard() noexcept {
254  CUDA_CHECK(cudaSetDevice(previous_));
255  }
256 
257  private:
258  int previous_;
259 };
260 
261 } // namespace caffe2
262 #endif // CAFFE2_CORE_COMMON_GPU_H_
void DeviceQuery(const int device)
Runs a device query function and prints out the results to LOG(INFO).
Definition: common_gpu.cc:124
void SetDefaultGPUID(const int deviceid)
Sets the default GPU id for Caffe2.
Definition: common_gpu.cc:80
int GetCurrentGPUID()
Gets the current GPU id.
Definition: common_gpu.cc:93
int CudaVersion()
A runtime function to report the cuda version that Caffe2 is built with.
Definition: common_gpu.h:48
bool HasCudaGPU()
Check if the current running session has a cuda gpu present.
Definition: common_gpu.h:64
bool GetCudaPeerAccessPattern(vector< vector< bool > > *pattern)
Return a peer access pattern by returning a matrix (in the format of a nested vector) of boolean valu...
Definition: common_gpu.cc:159
int NumCudaDevices()
Returns the number of devices.
Definition: common_gpu.cc:13
Simple registry implementation in Caffe2 that uses static variables to register object creators durin...
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:99
int GetDefaultGPUID()
Gets the default GPU id for Caffe2.
Definition: common_gpu.cc:91
const cudaDeviceProp & GetDeviceProperty(const int deviceid)
Gets the device property for the given device.
Definition: common_gpu.cc:105
const char * curandGetErrorString(curandStatus_t error)
Return a human readable curand error string.
Definition: common_gpu.cc:210
int CAFFE_GET_BLOCKS(const int N)
Compute the number of blocks needed to run N threads.
Definition: common_gpu.h:239
const char * cublasGetErrorString(cublasStatus_t error)
Return a human readable cublas error string.
Definition: common_gpu.cc:179