#ifndef CAFFE2_CORE_COMMON_CUDNN_H_
#define CAFFE2_CORE_COMMON_CUDNN_H_

#include <array>
#include <memory>
#include <mutex>

#include <cudnn.h>

#include "caffe2/core/common.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"

static_assert(
    CUDNN_VERSION >= 5000,
    "Caffe2 requires cudnn version 5.0 or above.");
#define CUDNN_VERSION_MIN(major, minor, patch) \
  (CUDNN_VERSION >= ((major) * 1000 + (minor) * 100 + (patch)))
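// Example (illustrative, not part of the original header): the macro gates
// compile-time paths on the linearly encoded version; CUDNN_VERSION_MIN(5, 1, 0)
// expands to (CUDNN_VERSION >= 5100):
//
//   #if CUDNN_VERSION_MIN(5, 1, 0)
//   // code that relies on cuDNN 5.1+ features
//   #endif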
namespace caffe2 {

namespace internal {
/**
 * A helper function to obtain cudnn error strings.
 */
inline const char* cudnnGetErrorString(cudnnStatus_t status) {
  switch (status) {
    case CUDNN_STATUS_SUCCESS:
      return "CUDNN_STATUS_SUCCESS";
    case CUDNN_STATUS_NOT_INITIALIZED:
      return "CUDNN_STATUS_NOT_INITIALIZED";
    case CUDNN_STATUS_ALLOC_FAILED:
      return "CUDNN_STATUS_ALLOC_FAILED";
    case CUDNN_STATUS_BAD_PARAM:
      return "CUDNN_STATUS_BAD_PARAM";
    case CUDNN_STATUS_INTERNAL_ERROR:
      return "CUDNN_STATUS_INTERNAL_ERROR";
    case CUDNN_STATUS_INVALID_VALUE:
      return "CUDNN_STATUS_INVALID_VALUE";
    case CUDNN_STATUS_ARCH_MISMATCH:
      return "CUDNN_STATUS_ARCH_MISMATCH";
    case CUDNN_STATUS_MAPPING_ERROR:
      return "CUDNN_STATUS_MAPPING_ERROR";
    case CUDNN_STATUS_EXECUTION_FAILED:
      return "CUDNN_STATUS_EXECUTION_FAILED";
    case CUDNN_STATUS_NOT_SUPPORTED:
      return "CUDNN_STATUS_NOT_SUPPORTED";
    case CUDNN_STATUS_LICENSE_ERROR:
      return "CUDNN_STATUS_LICENSE_ERROR";
    default:
      return "Unknown cudnn error number";
  }
}
}  // namespace internal
// A macro that wraps around a cudnn statement so we can check if the cudnn
// execution finishes or not.
#define CUDNN_ENFORCE(condition)                          \
  do {                                                    \
    cudnnStatus_t status = condition;                     \
    CAFFE_ENFORCE_EQ(                                     \
        status,                                           \
        CUDNN_STATUS_SUCCESS,                             \
        ", Error at: ",                                   \
        __FILE__,                                         \
        ":",                                              \
        __LINE__,                                         \
        ": ",                                             \
        ::caffe2::internal::cudnnGetErrorString(status)); \
  } while (0)
#define CUDNN_CHECK(condition)                              \
  do {                                                      \
    cudnnStatus_t status = condition;                       \
    CHECK(status == CUDNN_STATUS_SUCCESS)                   \
        << ::caffe2::internal::cudnnGetErrorString(status); \
  } while (0)
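// Example usage (illustrative only): wrap any call that returns a
// cudnnStatus_t so a failure aborts with the decoded error string:
//
//   cudnnTensorDescriptor_t desc;
//   CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&desc));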
// Report the version of cuDNN Caffe2 was compiled with.
inline size_t cudnnCompiledVersion() {
  return CUDNN_VERSION;
}

// Report the runtime version of cuDNN.
inline size_t cudnnRuntimeVersion() {
  return cudnnGetVersion();
}
// Check compatibility of compiled and runtime cuDNN versions.
inline void CheckCuDNNVersions() {
  // Version format is major*1000 + minor*100 + patch;
  // major, minor and patch versions must all match.
  bool version_match = cudnnCompiledVersion() == cudnnRuntimeVersion();
  CAFFE_ENFORCE(version_match,
                "cuDNN compiled (", cudnnCompiledVersion(), ") and "
                "runtime (", cudnnRuntimeVersion(), ") versions mismatch");
}
/**
 * cudnnTypeWrapper is a wrapper class that allows us to refer to the cudnn
 * type in a template function. It is specialized for each supported data
 * type below.
 */
template <typename T>
class cudnnTypeWrapper;

template <>
class cudnnTypeWrapper<float> {
 public:
  static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
  typedef const float ScalingParamType;
  typedef float BNParamType;
  static ScalingParamType* kOne() {
    static ScalingParamType v = 1.0;
    return &v;
  }
  static const ScalingParamType* kZero() {
    static ScalingParamType v = 0.0;
    return &v;
  }
};
template <>
class cudnnTypeWrapper<double> {
 public:
  static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
  typedef const double ScalingParamType;
  typedef double BNParamType;
  static ScalingParamType* kOne() {
    static ScalingParamType v = 1.0;
    return &v;
  }
  static ScalingParamType* kZero() {
    static ScalingParamType v = 0.0;
    return &v;
  }
};
template <>
class cudnnTypeWrapper<float16> {
 public:
  static const cudnnDataType_t type = CUDNN_DATA_HALF;
  typedef const float ScalingParamType;
  typedef float BNParamType;
  static ScalingParamType* kOne() {
    static ScalingParamType v = 1.0;
    return &v;
  }
  static ScalingParamType* kZero() {
    static ScalingParamType v = 0.0;
    return &v;
  }
};
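// Example (illustrative sketch): in a templated operator, cudnnTypeWrapper<T>
// supplies both the cudnn type enum and correctly-typed alpha/beta scaling
// pointers. The handle/desc/data names below are hypothetical:
//
//   template <typename T>
//   void ScaleInPlace(cudnnHandle_t handle,
//                     cudnnTensorDescriptor_t desc,
//                     void* data) {
//     CUDNN_ENFORCE(cudnnScaleTensor(
//         handle, desc, data, cudnnTypeWrapper<T>::kOne()));
//   }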
/**
 * A wrapper function to convert the Caffe storage order to cudnn storage
 * order enum values.
 */
inline cudnnTensorFormat_t GetCudnnTensorFormat(const StorageOrder& order) {
  switch (order) {
    case StorageOrder::NHWC:
      return CUDNN_TENSOR_NHWC;
    case StorageOrder::NCHW:
      return CUDNN_TENSOR_NCHW;
    default:
      LOG(FATAL) << "Unknown cudnn equivalent for order: " << order;
  }
  // Just to suppress compiler warnings.
  return CUDNN_TENSOR_NCHW;
}
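// For example, GetCudnnTensorFormat(StorageOrder::NHWC) yields
// CUDNN_TENSOR_NHWC, so a caffe2 StorageOrder can be passed straight
// through to cudnn descriptor setup.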
/**
 * cudnnTensorDescWrapper is the placeholder that wraps around a
 * cudnnTensorDescriptor_t, allowing us to do descriptor change as-needed
 * during runtime.
 */
class cudnnTensorDescWrapper {
 public:
  cudnnTensorDescWrapper() {
    CUDNN_ENFORCE(cudnnCreateTensorDescriptor(&desc_));
  }
  ~cudnnTensorDescWrapper() noexcept {
    CUDNN_CHECK(cudnnDestroyTensorDescriptor(desc_));
  }
  inline cudnnTensorDescriptor_t Descriptor(
      const cudnnTensorFormat_t format,
      const cudnnDataType_t type,
      const vector<int>& dims,
      bool* changed) {
    if (type_ == type && format_ == format && dims_ == dims) {
      // If not changed, simply return the current descriptor.
      if (changed) *changed = false;
      return desc_;
    }
    CAFFE_ENFORCE_EQ(
        dims.size(), 4,
        "Currently only 4-dimensional descriptor supported.");
    format_ = format;
    type_ = type;
    dims_ = dims;
    CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(
        desc_,
        format,
        type,
        dims_[0],
        (format == CUDNN_TENSOR_NCHW ? dims_[1] : dims_[3]),
        (format == CUDNN_TENSOR_NCHW ? dims_[2] : dims_[1]),
        (format == CUDNN_TENSOR_NCHW ? dims_[3] : dims_[2])));
    if (changed) *changed = true;
    return desc_;
  }
  template <typename T>
  inline cudnnTensorDescriptor_t Descriptor(
      const StorageOrder& order,
      const vector<int>& dims) {
    return Descriptor(
        GetCudnnTensorFormat(order), cudnnTypeWrapper<T>::type, dims, nullptr);
  }

 private:
  cudnnTensorDescriptor_t desc_;
  cudnnTensorFormat_t format_;
  cudnnDataType_t type_;
  vector<int> dims_;
  DISABLE_COPY_AND_ASSIGN(cudnnTensorDescWrapper);
};
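// Example usage (illustrative sketch; N, C, H, W are hypothetical ints):
//
//   cudnnTensorDescWrapper data_desc;
//   cudnnTensorDescriptor_t desc =
//       data_desc.Descriptor<float>(StorageOrder::NCHW, {N, C, H, W});
//
// Repeated calls with the same format/type/dims return the cached
// descriptor without calling into cudnn again.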
class cudnnFilterDescWrapper {
 public:
  cudnnFilterDescWrapper() {
    CUDNN_ENFORCE(cudnnCreateFilterDescriptor(&desc_));
  }
  ~cudnnFilterDescWrapper() noexcept {
    CUDNN_CHECK(cudnnDestroyFilterDescriptor(desc_));
  }
  inline cudnnFilterDescriptor_t Descriptor(
      const StorageOrder& order,
      const cudnnDataType_t type,
      const vector<int>& dims,
      bool* changed) {
    if (type_ == type && order_ == order && dims_ == dims) {
      // If not changed, simply return the current descriptor.
      if (changed) *changed = false;
      return desc_;
    }
    CAFFE_ENFORCE_EQ(
        dims.size(), 4,
        "Currently only 4-dimensional descriptor supported.");
    order_ = order;
    type_ = type;
    dims_ = dims;
    CUDNN_ENFORCE(cudnnSetFilter4dDescriptor(
        desc_,
        type,
        GetCudnnTensorFormat(order),
        dims_[0],
        (order == StorageOrder::NCHW ? dims_[1] : dims_[3]),
        (order == StorageOrder::NCHW ? dims_[2] : dims_[1]),
        (order == StorageOrder::NCHW ? dims_[3] : dims_[2])));
    if (changed) *changed = true;
    return desc_;
  }
  template <typename T>
  inline cudnnFilterDescriptor_t Descriptor(
      const StorageOrder& order,
      const vector<int>& dims) {
    return Descriptor(order, cudnnTypeWrapper<T>::type, dims, nullptr);
  }

 private:
  cudnnFilterDescriptor_t desc_;
  StorageOrder order_;
  cudnnDataType_t type_;
  vector<int> dims_;
  DISABLE_COPY_AND_ASSIGN(cudnnFilterDescWrapper);
};
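// Used the same way as cudnnTensorDescWrapper, e.g. (sketch; M, C, kH, kW
// are hypothetical filter dimensions):
//
//   cudnnFilterDescWrapper filter_desc;
//   cudnnFilterDescriptor_t fdesc =
//       filter_desc.Descriptor<float>(StorageOrder::NCHW, {M, C, kH, kW});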
class CuDNNWrapper;

/**
 * CuDNNHandles wraps around cudnnHandle_t so they can be properly destructed
 * when threads exit.
 */
class CuDNNHandles {
  friend class CuDNNWrapper;

 private:
  CuDNNHandles() {
    for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
      cudnn_handle_[i] = nullptr;
    }
  }

  ~CuDNNHandles() noexcept {
    for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
      if (cudnn_handle_[i]) {
        CUDNN_CHECK(cudnnDestroy(cudnn_handle_[i]));
      }
    }
  }

  cudnnHandle_t cudnn_handle_[CAFFE2_COMPILE_TIME_MAX_GPUS];
};
/**
 * CuDNNWorkspace is a wrapper around a raw cuda pointer that holds the cudnn
 * scratch space.
 */
struct CuDNNWorkspace {
  ~CuDNNWorkspace() noexcept {
    if (data_) {
      CUDAContext::Delete(data_);
    }
  }

  void* get(size_t nbytes) {
    if (nbytes_ < nbytes) {
      // Grow the buffer: free the old allocation and make a larger one.
      reset();
      data_ = CUDAContext::New(nbytes);
      nbytes_ = nbytes;
    }
    CAFFE_ENFORCE_GE(nbytes_, nbytes);
    return data_;
  }

  void reset() {
    if (data_) {
      CUDAContext::Delete(data_);
    }
    data_ = nullptr;
    nbytes_ = 0;
  }

 private:
  void* data_{nullptr};
  size_t nbytes_{0};
};
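// Example (sketch): the buffer only grows, so repeated requests that fit
// reuse the existing allocation instead of hitting the cuda allocator:
//
//   CuDNNWorkspace ws;
//   void* scratch = ws.get(1 << 20);  // allocates 1 MiB
//   scratch = ws.get(1 << 16);        // reuses the same 1 MiB buffer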
// CuDNNState bundles a cudnn handle with its own stream and scratch
// workspace, and uses events to serialize executions submitted to it.
class CuDNNState {
 public:
  explicit CuDNNState(size_t gpu_id) : gpu_id_(gpu_id) {
    DeviceGuard g(gpu_id_);
    CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
    CUDA_ENFORCE(cudaEventCreate(&before_));
    CUDA_ENFORCE(cudaEventCreate(&after_));
    CUDA_ENFORCE(cudaStreamCreate(&stream_));
    CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, stream_));
  }

  ~CuDNNState() noexcept {
    DeviceGuard g(gpu_id_);
    CUDNN_CHECK(cudnnDestroy(cudnn_handle_));
    CUDA_CHECK(cudaStreamDestroy(stream_));
    CUDA_CHECK(cudaEventDestroy(after_));
    CUDA_CHECK(cudaEventDestroy(before_));
  }

  cudnnHandle_t& cudnn_handle() {
    return cudnn_handle_;
  }

  CuDNNWorkspace& workspace() {
    return workspace_;
  }
  template <typename F>
  void execute(cudaStream_t stream, F&& f) {
    // Make the state's private stream wait on work already queued on the
    // caller's stream, run f, then make the caller's stream wait on us.
    CUDA_ENFORCE(cudaEventRecord(before_, stream));
    CUDA_ENFORCE(cudaStreamWaitEvent(stream_, before_, 0));
    f(this);
    CUDA_ENFORCE(cudaEventRecord(after_, stream_));
    CUDA_ENFORCE(cudaStreamWaitEvent(stream, after_, 0));
  }
 private:
  cudnnHandle_t cudnn_handle_{nullptr};
  cudaEvent_t before_{nullptr};
  cudaEvent_t after_{nullptr};
  cudaStream_t stream_{nullptr};
  CuDNNWorkspace workspace_;
  size_t gpu_id_{0};
  DISABLE_COPY_AND_ASSIGN(CuDNNState);
};
/**
 * CuDNNWrapper is a class that wraps the cudnn handles and cudnn workspaces.
 *
 * The wrapper ensures that for each thread and each gpu, there is one
 * identical cudnn handle, which is also associated with the thread-local
 * per-device cuda stream.
 */
class CuDNNWrapper {
 public:
  /**
   * Creates a cudnn wrapper associated with a CUDAContext object. Note that
   * the CUDAContext object should outlive the CuDNNWrapper.
   */
  explicit CuDNNWrapper(CUDAContext* context) : context_(context) {}

  /**
   * Returns the inline cudnn handle that executes on the current thread's
   * cuda_stream.
   */
  cudnnHandle_t& inline_cudnn_handle() {
    int gpu_id = context_->cuda_gpu_id();
    auto& cudnn_handle_ = tls_cudnn_handles_.cudnn_handle_[gpu_id];
    if (cudnn_handle_) {
      return cudnn_handle_;
    }
    // Lazily create the handle on first use and bind it to the context's
    // cuda stream.
    context_->SwitchToDevice();
    CUDNN_ENFORCE(cudnnCreate(&cudnn_handle_));
    CUDNN_ENFORCE(cudnnSetStream(cudnn_handle_, context_->cuda_stream()));
    return cudnn_handle_;
  }
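  // Example (illustrative sketch inside an operator; desc, data and value
  // are hypothetical names):
  //
  //   CuDNNWrapper wrapper(&context_);
  //   CUDNN_ENFORCE(cudnnSetTensor(
  //       wrapper.inline_cudnn_handle(), desc, data, &value));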
  // Executes the closure F on the CuDNNState associated with state_idx.
  template <typename F>
  void with_cudnn_state(size_t state_idx, F&& f) {
    CAFFE_ENFORCE(
        state_idx < CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES, "Invalid state_idx");
    auto& sync_state = cudnn_states()[context_->cuda_gpu_id()][state_idx];

    // Serialize access to the state: multiple threads must not race through
    // the cudaEventRecord calls in CuDNNState::execute.
    std::lock_guard<std::mutex> g(sync_state.mutex);
    if (!sync_state.state.get()) {
      sync_state.state.reset(new CuDNNState(context_->cuda_gpu_id()));
    }
    CHECK_NOTNULL(sync_state.state.get())->execute(context_->cuda_stream(), f);
  }
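  // Example (sketch): run cudnn work on the state's private stream; the
  // caller's stream is synchronized via events before and after:
  //
  //   wrapper.with_cudnn_state(0, [&](CuDNNState* state) {
  //     void* scratch = state->workspace().get(nbytes);  // nbytes: hypothetical
  //     // ... issue cudnn calls on state->cudnn_handle() ...
  //   });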
 protected:
  // Pointer to an external cuda context that the cudnn wrapper will use.
  CUDAContext* context_;
  static thread_local CuDNNHandles tls_cudnn_handles_;

  static constexpr size_t CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES = 4;

  struct SyncedCuDNNState {
    std::mutex mutex;
    std::unique_ptr<CuDNNState> state;
  };

  using PerGPUCuDNNStates = std::array<
      std::array<SyncedCuDNNState, CAFFE2_COMPILE_TIME_MAX_CUDNN_STATES>,
      CAFFE2_COMPILE_TIME_MAX_GPUS>;
  static PerGPUCuDNNStates& cudnn_states();

  DISABLE_COPY_AND_ASSIGN(CuDNNWrapper);
};

}  // namespace caffe2

#endif  // CAFFE2_CORE_COMMON_CUDNN_H_