Caffe2 - C++ API
A deep learning, cross platform ML framework
common_cudnn.h
1 #ifndef CAFFE2_CORE_COMMON_CUDNN_H_
2 #define CAFFE2_CORE_COMMON_CUDNN_H_
3 
4 #include <array>
5 #include <mutex>
6 
7 #include <cudnn.h>
8 
9 #include "caffe2/core/common.h"
10 #include "caffe2/core/common_gpu.h"
11 #include "caffe2/core/context.h"
12 #include "caffe2/core/context_gpu.h"
13 #include "caffe2/core/logging.h"
14 #include "caffe2/core/types.h"
15 #include "caffe2/proto/caffe2.pb.h"
16 
17 static_assert(
18  CUDNN_VERSION >= 5000,
19  "Caffe2 requires cudnn version 5.0 or above.");
20 
21 #define CUDNN_VERSION_MIN(major, minor, patch) \
22  (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
23 
24 namespace caffe2 {
25 
26 namespace internal {
30 inline const char* cudnnGetErrorString(cudnnStatus_t status) {
31  switch (status) {
32  case CUDNN_STATUS_SUCCESS:
33  return "CUDNN_STATUS_SUCCESS";
34  case CUDNN_STATUS_NOT_INITIALIZED:
35  return "CUDNN_STATUS_NOT_INITIALIZED";
36  case CUDNN_STATUS_ALLOC_FAILED:
37  return "CUDNN_STATUS_ALLOC_FAILED";
38  case CUDNN_STATUS_BAD_PARAM:
39  return "CUDNN_STATUS_BAD_PARAM";
40  case CUDNN_STATUS_INTERNAL_ERROR:
41  return "CUDNN_STATUS_INTERNAL_ERROR";
42  case CUDNN_STATUS_INVALID_VALUE:
43  return "CUDNN_STATUS_INVALID_VALUE";
44  case CUDNN_STATUS_ARCH_MISMATCH:
45  return "CUDNN_STATUS_ARCH_MISMATCH";
46  case CUDNN_STATUS_MAPPING_ERROR:
47  return "CUDNN_STATUS_MAPPING_ERROR";
48  case CUDNN_STATUS_EXECUTION_FAILED:
49  return "CUDNN_STATUS_EXECUTION_FAILED";
50  case CUDNN_STATUS_NOT_SUPPORTED:
51  return "CUDNN_STATUS_NOT_SUPPORTED";
52  case CUDNN_STATUS_LICENSE_ERROR:
53  return "CUDNN_STATUS_LICENSE_ERROR";
54  default:
55  return "Unknown cudnn error number";
56  }
57 }
58 } // namespace internal
59 
// A macro that wraps around a cudnn statement so we can check if the cudnn
// execution finishes or not. On failure, throws via CAFFE_ENFORCE_EQ with
// the file/line and the decoded cudnn status string.
#define CUDNN_ENFORCE(condition)                          \
  do {                                                    \
    cudnnStatus_t status = condition;                     \
    CAFFE_ENFORCE_EQ(                                     \
        status,                                           \
        CUDNN_STATUS_SUCCESS,                             \
        ", Error at: ",                                   \
        __FILE__,                                         \
        ":",                                              \
        __LINE__,                                         \
        ": ",                                             \
        ::caffe2::internal::cudnnGetErrorString(status)); \
  } while (0)

// Non-throwing variant: aborts via CHECK on failure. Used from destructors
// below, where throwing is not an option.
#define CUDNN_CHECK(condition)                              \
  do {                                                      \
    cudnnStatus_t status = condition;                       \
    CHECK(status == CUDNN_STATUS_SUCCESS)                   \
        << ::caffe2::internal::cudnnGetErrorString(status); \
  } while (0)
81 
82 // report the version of cuDNN Caffe2 was compiled with
83 inline size_t cudnnCompiledVersion() {
84  return CUDNN_VERSION;
85 }
86 // report the runtime version of cuDNN
87 inline size_t cudnnRuntimeVersion() {
88  return cudnnGetVersion();
89 }
90 
91 // Check compatibility of compiled and runtime cuDNN versions
92 inline void CheckCuDNNVersions() {
93  // Version format is major*1000 + minor*100 + patch
94  // Major, minor and patch versions must all match
95  bool version_match = cudnnCompiledVersion() == cudnnRuntimeVersion();
96  CAFFE_ENFORCE(version_match,
97  "cuDNN compiled (", cudnnCompiledVersion(), ") and"
98  "runtime (", cudnnRuntimeVersion(), ") versions mismatch");
99 }
100 
/**
 * cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn
 * type in a template function. The class is specialized explicitly for
 * different data types below. (Primary template is declared only; using an
 * unspecialized type is a compile error.)
 */
template <typename T>
class cudnnTypeWrapper;
109 template <>
110 class cudnnTypeWrapper<float> {
111  public:
112  static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
113  typedef const float ScalingParamType;
114  typedef float BNParamType;
115  static ScalingParamType* kOne() {
116  static ScalingParamType v = 1.0;
117  return &v;
118  }
119  static const ScalingParamType* kZero() {
120  static ScalingParamType v = 0.0;
121  return &v;
122  }
123 };
124 
125 template <>
126 class cudnnTypeWrapper<double> {
127  public:
128  static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
129  typedef const double ScalingParamType;
130  typedef double BNParamType;
131  static ScalingParamType* kOne() {
132  static ScalingParamType v = 1.0;
133  return &v;
134  }
135  static ScalingParamType* kZero() {
136  static ScalingParamType v = 0.0;
137  return &v;
138  }
139 };
140 
141 template <>
142 class cudnnTypeWrapper<float16> {
143  public:
144  static const cudnnDataType_t type = CUDNN_DATA_HALF;
145  typedef const float ScalingParamType;
146  typedef float BNParamType;
147  static ScalingParamType* kOne() {
148  static ScalingParamType v = 1.0;
149  return &v;
150  }
151  static ScalingParamType* kZero() {
152  static ScalingParamType v = 0.0;
153  return &v;
154  }
155 };
156 
161 inline cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order) {
162  switch (order) {
163  case StorageOrder::NHWC:
164  return CUDNN_TENSOR_NHWC;
165  case StorageOrder::NCHW:
166  return CUDNN_TENSOR_NCHW;
167  default:
168  LOG(FATAL) << "Unknown cudnn equivalent for order: " << order;
169  }
170  // Just to suppress compiler warnings
171  return CUDNN_TENSOR_NCHW;
172 }
173 
180  public:
182  CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&desc_));
183  }
184  ~cudnnTensorDescWrapper() noexcept {
185  CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
186  }
187 
188  inline cudnnTensorDescriptor_t Descriptor(
189  const cudnnTensorFormat_t format,
190  const cudnnDataType_t type,
191  const vector<int>& dims,
192  bool* changed) {
193  if (type_ == type && format_ == format && dims_ == dims) {
194  // if not changed, simply return the current descriptor.
195  if (changed)
196  *changed = false;
197  return desc_;
198  }
199  CAFFE_ENFORCE_EQ(
200  dims.size(), 4, "Currently only 4-dimensional descriptor supported.");
201  format_ = format;
202  type_ = type;
203  dims_ = dims;
204  CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
205  desc_,
206  format,
207  type,
208  dims_[0],
209  (format == CUDNN_TENSOR_NCHW ? dims_[1] : dims_[3]),
210  (format == CUDNN_TENSOR_NCHW ? dims_[2] : dims_[1]),
211  (format == CUDNN_TENSOR_NCHW ? dims_[3] : dims_[2])));
212  if (changed)
213  *changed = true;
214  return desc_;
215  }
216 
217  template <typename T>
218  inline cudnnTensorDescriptor_t Descriptor(
219  const StorageOrder& order,
220  const vector<int>& dims) {
221  return Descriptor(
222  GetCudnnTensorFormat(order), cudnnTypeWrapper<T>::type, dims, nullptr);
223  }
224 
225  private:
226  cudnnTensorDescriptor_t desc_;
227  cudnnTensorFormat_t format_;
228  cudnnDataType_t type_;
229  vector<int> dims_;
230  DISABLE_COPY_AND_ASSIGN(cudnnTensorDescWrapper);
231 };
232 
234  public:
236  CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&desc_));
237  }
238  ~cudnnFilterDescWrapper() noexcept {
239  CUDNN_CHECK(cudnnDestroyFilterDescriptor(desc_));
240  }
241 
242  inline cudnnFilterDescriptor_t Descriptor(
243  const StorageOrder& order,
244  const cudnnDataType_t type,
245  const vector<int>& dims,
246  bool* changed) {
247  if (type_ == type && order_ == order && dims_ == dims) {
248  // if not changed, simply return the current descriptor.
249  if (changed)
250  *changed = false;
251  return desc_;
252  }
253  CAFFE_ENFORCE_EQ(
254  dims.size(), 4, "Currently only 4-dimensional descriptor supported.");
255  order_ = order;
256  type_ = type;
257  dims_ = dims;
258  CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
259  desc_,
260  type,
261  GetCudnnTensorFormat(order),
262  dims_[0],
263  // TODO - confirm that this is correct for NHWC
264  (order == StorageOrder::NCHW ? dims_[1] : dims_[3]),
265  (order == StorageOrder::NCHW ? dims_[2] : dims_[1]),
266  (order == StorageOrder::NCHW ? dims_[3] : dims_[2])));
267  if (changed)
268  *changed = true;
269  return desc_;
270  }
271 
272  template <typename T>
273  inline cudnnFilterDescriptor_t Descriptor(
274  const StorageOrder& order,
275  const vector<int>& dims) {
276  return Descriptor(order, cudnnTypeWrapper<T>::type, dims, nullptr);
277  }
278 
279  private:
280  cudnnFilterDescriptor_t desc_;
281  StorageOrder order_;
282  cudnnDataType_t type_;
283  vector<int> dims_;
284  DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper);
285 };
286 
287 class CuDNNWrapper;
293  friend class CuDNNWrapper;
294 
295  private:
296  CuDNNHandles() {
297  for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
298  cudnn_handle_[i] = nullptr;
299  }
300  }
301 
302  ~CuDNNHandles() noexcept {
303  for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
304  if (cudnn_handle_[i]) {
305  CUDNN_CHECK(cudnnDestroy(cudnn_handle_[i]));
306  }
307  }
308  }
309 
310  cudnnHandle_t cudnn_handle_[CAFFE2_COMPILE_TIME_MAX_GPUS];
311 };
312 
322  ~CuDNNWorkspace() noexcept {
323  if (data_) {
324  CUDAContext::Delete(data_);
325  }
326  }
327 
328  void* get(size_t nbytes) {
329  if (nbytes_ < nbytes) {
330  reset();
331  data_ = CUDAContext::New(nbytes);
332  nbytes_ = nbytes;
333  }
334  CAFFE_ENFORCE_GE(nbytes_, nbytes);
335  return data_;
336  }
337 
338  void reset() {
339  if (data_) {
340  CUDAContext::Delete(data_);
341  }
342  data_ = nullptr;
343  nbytes_ = 0;
344  }
345 
346  void* data_{nullptr};
347  size_t nbytes_{0};
348 };
349 
350 // CuDNNState is the owner of the CuDNNWorkspace, and serializes all
351 // executions of operations that use the state onto it's own stream
352 // (so multiple Net workers can reuse the same workspace from
353 // different threads and CUDA streams).
354 class CuDNNState {
355  public:
356  explicit CuDNNState(size_t gpu_id) : gpu_id_(gpu_id) {
357  DeviceGuard g(gpu_id_);
358  CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
359  CUDA_ENFORCE(cudaEventCreate(&before_));
360  CUDA_ENFORCE(cudaEventCreate(&after_));
361  CUDA_ENFORCE(cudaStreamCreate(&stream_));
362  CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, stream_));
363  }
364 
365  ~CuDNNState() noexcept {
366  DeviceGuard g(gpu_id_);
367  CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
368  CUDA_CHECK(cudaStreamDestroy(stream_));
369  CUDA_CHECK(cudaEventDestroy(after_));
370  CUDA_CHECK(cudaEventDestroy(before_));
371  }
372 
373  cudnnHandle_t& cudnn_handle() {
374  return cudnn_handle_;
375  }
376 
377  CuDNNWorkspace& workspace() {
378  return workspace_;
379  }
380 
381  template <typename F>
382  void execute(cudaStream_t stream, F&& f) {
383  CUDA_ENFORCE(cudaEventRecord(before_, stream));
384  CUDA_ENFORCE(cudaStreamWaitEvent(stream_, before_, 0));
385  f(this);
386  CUDA_ENFORCE(cudaEventRecord(after_, stream_));
387  CUDA_ENFORCE(cudaStreamWaitEvent(stream, after_, 0));
388  }
389 
390  private:
391  cudnnHandle_t cudnn_handle_{nullptr};
392  cudaEvent_t before_{nullptr};
393  cudaEvent_t after_{nullptr};
394  cudaStream_t stream_{nullptr};
395  CuDNNWorkspace workspace_;
396  size_t gpu_id_{0};
397  DISABLE_COPY_AND_ASSIGN(CuDNNState);
398 };
399 
410  public:
415  explicit CuDNNWrapper(CUDAContext* context) : context_(context) {}
416 
421  cudnnHandle_t& inline_cudnn_handle() {
422  int gpu_id = context_->cuda_gpu_id();
423  auto& cudnn_handle_ = tls_cudnn_handles_.cudnn_handle_[gpu_id];
424  if (cudnn_handle_) {
425  return cudnn_handle_;
426  } else {
427  context_->SwitchToDevice();
428  CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
429  CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, context_->cuda_stream()));
430  }
431  return cudnn_handle_;
432  }
433 
434  // Executes the closure F on the CuDNNState associated with state_idx
435  template <typename F>
436  void with_cudnn_state(size_t state_idx, F&& f) {
437  CAFFE_ENFORCE(
438  state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx");
439  auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx];
440 
441  DeviceGuard dg(context_->cuda_gpu_id());
442 
443  // We need to serialize execution on the CuDNNState as we can't
444  // allow multiple threads to race through the cudaEventRecord
445  // calls (so a worker thread might wait on another worker thread's
446  // execution)
447  std::lock_guard<std::mutex> g(sync_state.mutex);
448  if (!sync_state.state.get()) {
449  sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id()));
450  }
451  CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f);
452  }
453 
454  protected:
455  // Pointer to an external cuda context that the cudnn wrapper will use.
456  CUDAContext* context_;
457  static thread_local CuDNNHandles tls_cudnn_handles_;
458 
459  static constexpr size_t CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES = 4;
460 
462  std::mutex mutex;
463  std::unique_ptr<CuDNNState> state;
464  };
465 
466  using PerGPUCuDNNStates = std::array<
467  std::array<SyncedCuDNNState, CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES>,
468  CAFFE2_COMPILE_TIME_MAX_GPUS>;
469  static PerGPUCuDNNStates& cudnn_states();
470 
471  DISABLE_COPY_AND_ASSIGN(CuDNNWrapper);
472 };
473 
474 } // namespace caffe2
475 
476 #endif // CAFFE2_CORE_COMMON_CUDNN_H_
CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
Definition: common_cudnn.h:409
cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder &order)
A wrapper function to convert the Caffe storage order to cudnn storage order enum values...
Definition: common_cudnn.h:161
cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn type in a template function...
Definition: common_cudnn.h:107
cudnnHandle_t & inline_cudnn_handle()
Returns the inline cudnn handle that executes on the current thread's cuda_stream.
Definition: common_cudnn.h:421
Simple registry implementation in Caffe2 that uses static variables to register object creators durin...
cudnnTensorDescWrapper is the placeholder that wraps around a cudnnTensorDescriptor_t, allowing us to do descriptor change as-needed during runtime.
Definition: common_cudnn.h:179
CuDNNWrapper(CUDAContext *context)
Creates a cudnn wrapper associated with a CUDAContext object.
Definition: common_cudnn.h:415
CuDNNWorkspace is a wrapper around a raw cuda pointer that holds the cudnn scratch space...
Definition: common_cudnn.h:321
CuDNNHandles wraps around cudnnHandle_t so they can be properly destructed when threads exit...
Definition: common_cudnn.h:292