Caffe2 - C++ API
A deep-learning, cross-platform ML framework
context_gpu.h
1 #ifndef CAFFE2_CORE_CONTEXT_GPU_H_
2 #define CAFFE2_CORE_CONTEXT_GPU_H_
3 
4 #include <ctime>
5 #include <mutex>
6 
7 #include "caffe2/core/common_gpu.h"
8 #include "caffe2/core/context.h"
9 #include "caffe2/core/tensor.h"
10 #include "caffe2/core/types.h"
11 #include "caffe2/proto/caffe2.pb.h"
12 #include "caffe2/core/logging.h"
13 
14 namespace caffe2 {
15 
/**
 * The memory pool strategies Caffe2 can use for CUDA device allocations.
 */
enum class CudaMemoryPoolType {
  NONE = 0,   // No pooling: raw allocation per request.
  CNMEM = 1,  // cnmem pool allocator.
  CUB = 2,    // CUB caching allocator.
};

/**
 * Gets the current memory pool type used by Caffe2.
 */
CudaMemoryPoolType GetCudaMemoryPoolType();
29 
  friend class CUDAContext;

 private:
  // NOTE(review): the struct header and the constructor's opening line of
  // ThreadLocalCUDAObjects are outside this view (dropped by extraction).
  // The loop below is the constructor body: it initializes one empty
  // vector of streams and one of cublas handles per possible GPU slot.
    for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
      cuda_streams_[i] = vector<cudaStream_t>();
      cublas_handles_[i] = vector<cublasHandle_t>();
    }
  }
48 
49  cudaStream_t GetStream(int gpu, int stream_id) {
50  vector<cudaStream_t> &gpu_streams = cuda_streams_[gpu];
51  if (gpu_streams.size() <= stream_id) {
52  gpu_streams.resize(stream_id + 1, nullptr);
53  }
54  if (!gpu_streams[stream_id]) {
55  DeviceGuard guard(gpu);
56  CUDA_ENFORCE(cudaStreamCreateWithFlags(
57  &gpu_streams[stream_id], cudaStreamNonBlocking));
58  }
59  return gpu_streams[stream_id];
60  }
61 
62  cublasHandle_t GetHandle(int gpu, int stream_id) {
63  DeviceGuard guard(gpu);
64  vector<cublasHandle_t> &gpu_handles = cublas_handles_[gpu];
65  if (gpu_handles.size() <= stream_id) {
66  gpu_handles.resize(stream_id + 1, nullptr);
67  }
68  if (!gpu_handles[stream_id]) {
69  CUBLAS_ENFORCE(cublasCreate(&gpu_handles[stream_id]));
70  // The default is CUBLAS_POINTER_MODE_HOST. You can override
71  // it after obtaining the cublas handle, but do that with
72  // caution.
73  CUBLAS_ENFORCE(cublasSetPointerMode(
74  gpu_handles[stream_id], CUBLAS_POINTER_MODE_HOST));
75  CUBLAS_ENFORCE(
76  cublasSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
77  }
78  return gpu_handles[stream_id];
79  }
80 
81  ~ThreadLocalCUDAObjects() noexcept {
82  for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
83  for (auto& handle : cublas_handles_[i]) {
84  if (handle) {
85  CUBLAS_CHECK(cublasDestroy(handle));
86  }
87  }
88  for (auto& stream : cuda_streams_[i]) {
89  if (stream) {
90  CUDA_CHECK(cudaStreamDestroy(stream));
91  }
92  }
93  }
94  }
  // Lazily-grown caches, indexed first by device id, then by stream id.
  vector<cudaStream_t> cuda_streams_[CAFFE2_COMPILE_TIME_MAX_GPUS];
  vector<cublasHandle_t> cublas_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
};
98 
/**
 * The CUDA context: holds the device id and stream id a thread operates
 * on, plus a lazily created curand generator. The streams and cublas
 * handles themselves live in thread-local storage (cuda_objects_) and are
 * shared by all CUDAContext instances on the same thread.
 */
class CUDAContext final {
 public:
  // The default cuda context constructor.
  explicit CUDAContext(const int gpu_id = -1);
  explicit CUDAContext(const DeviceOption& option);

  ~CUDAContext() {
    if (curand_generator_) {
      CURAND_ENFORCE(curandDestroyGenerator(curand_generator_));
    }
    // NOTE(review): CAFFE_ENFORCE may throw on failure; throwing from a
    // destructor risks std::terminate if another exception is in flight.
    CAFFE_ENFORCE(FinishDeviceComputation());
  }

  // Makes this context's GPU the current CUDA device and records which
  // stream id subsequent calls should use.
  inline void SwitchToDevice(int stream_id) {
    set_stream_id(stream_id);
    CUDA_ENFORCE(cudaSetDevice(gpu_id_));
  }
  inline void SwitchToDevice() {
    SwitchToDevice(0);
  }

  // Synchronizes the current stream and returns true iff no CUDA error
  // was recorded; on error, logs the message and returns false.
  bool FinishDeviceComputation() {
    cudaStreamSynchronize(cuda_objects_.GetStream(gpu_id_, stream_id_));
    cudaError_t error = cudaGetLastError();
    if (error == cudaSuccess) {
      return true;
    } else {
      LOG(ERROR) << "Encountered CUDA error: "
                 << cudaGetErrorString(error);
      return false;
    }
  }

  inline int cuda_gpu_id() const { return gpu_id_; }

  // The stream this context currently operates on (from the thread-local
  // cache).
  inline cudaStream_t cuda_stream() {
    return cuda_stream(gpu_id_, stream_id_);
  }

  inline cudaStream_t cuda_stream() const {
    return cuda_stream(gpu_id_, stream_id_);
  }

  static cudaStream_t cuda_stream(int gpu_id, int stream_id) {
    return cuda_objects_.GetStream(gpu_id, stream_id);
  }

  cublasHandle_t cublas_handle() {
    return cuda_objects_.GetHandle(gpu_id_, stream_id_);
  }

  // Lazily creates the curand generator (seeded from random_seed_), and
  // re-binds it to the current stream on every call.
  curandGenerator_t& curand_generator() {
    if (!curand_generator_) {
      DeviceGuard guard(gpu_id_);
      CURAND_ENFORCE(
          curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
      CURAND_ENFORCE(
          curandSetPseudoRandomGeneratorSeed(curand_generator_, random_seed_));
      CHECK_NOTNULL(curand_generator_);
    }
    CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
    return curand_generator_;
  }

  // Device memory allocation / deallocation; defined out of line.
  static void* New(size_t nbytes);

  static void Delete(void* data);

  // Get a mutex to lock out cudaMalloc / cudaFree calls when
  // NCCL kernels are being launched. Should remove threat of
  // deadlocks
  static std::mutex& mutex();

  // Asynchronous copy on this context's stream; the direction is inferred
  // by the driver (cudaMemcpyDefault). The host must synchronize (e.g.
  // FinishDeviceComputation) before reading dst.
  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
    CUDA_ENFORCE(cudaMemcpyAsync(
        dst,
        src,
        nbytes,
        cudaMemcpyDefault,
        cuda_objects_.GetStream(gpu_id_, stream_id_)));
  }

  // Typed convenience wrapper over CopyBytes for n elements of T.
  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T* src, T* dst) {
    CopyBytes<SrcContext, DstContext>(n * sizeof(T),
                                      static_cast<const void*>(src),
                                      static_cast<void*>(dst));
  }

  // Copies n items described by meta; only fundamental types (no custom
  // copy function) are supported, since the transfer is a raw byte copy.
  template <class SrcContext, class DstContext>
  inline void
  CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
    CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental types.");
    CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
  }

  void set_stream_id(int stream_id) {
    stream_id_ = stream_id;
  }

 protected:
  int gpu_id_;         // CUDA device this context targets.
  int stream_id_ = 0;  // Index into the per-device stream cache.
  int random_seed_;    // Seed used when the curand generator is created.
  curandGenerator_t curand_generator_{nullptr};
  // Per-thread cache of streams / cublas handles, shared across contexts.
  static thread_local ThreadLocalCUDAObjects cuda_objects_;
};
208 
209 // For the CPU context, we also allow a (probably expensive) function
210 // to copy the data from a cuda context. Inside the function, we create
211 // a temporary CUDAContext object to carry out the copy. From the caller's
212 // side, these functions are synchronous with respect to the host, similar
213 // to a normal CPUContext::CopyBytes<CPUContext, CPUContext> call.
214 template<>
215 inline void CPUContext::CopyBytes<CUDAContext, CPUContext>(
216  size_t nbytes, const void* src, void* dst) {
217  CUDAContext context(GetGPUIDForPointer(src));
218  context.CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
219 }
220 template<>
221 inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
222  size_t nbytes, const void* src, void* dst) {
223  CUDAContext context(GetGPUIDForPointer(dst));
224  context.CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
225 }
226 
  // Trivial construction/destruction; all state lives in the CUDA runtime.
  // NOTE(review): the enclosing class header (PinnedCPUAllocator) is
  // outside this view.
  PinnedCPUAllocator() {}
  ~PinnedCPUAllocator() {}
239  void* New(size_t nbytes) override {
240  void* data;
241  std::lock_guard<std::mutex> lock(CUDAContext::mutex());
242  CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
243  memset(data, 0, nbytes);
244  return data;
245  }
246  void Delete(void* data) override {
247  // Caffe2 uses a lazy way to figure out if one is actually going to use GPUs
248  // or not. If a CUDAContext::New() call is made, inside the CUDAContext
249  // function we will switch the cpu side allocator to a PinnedCPUAllocator.
250  // But, if one calls CPUContext::New() before any cuda allocations,
251  // PinnedCPUAllocator can still delete the corresponding memory.
252  std::lock_guard<std::mutex> lock(CUDAContext::mutex());
253  cudaError_t err = cudaFreeHost(data);
254  if (err == cudaErrorInvalidValue) {
255  free(data);
256  // Calling cudaGetLastError will reset the cuda error.
257  cudaGetLastError();
258  } else {
259  // For all other errors, still do a cuda check.
260  CUDA_ENFORCE(err);
261  }
262  }
263 };
264 
265 // For simplicity, we will typedef Tensor<CPUContext> to TensorCPU.
267 
268 } // namespace caffe2
269 
270 #endif // CAFFE2_CORE_CONTEXT_GPU_H_
TypedCopy copy() const
Returns the typed copy function pointer for individual items.
Definition: typeid.h:133
A struct to host thread-local cuda objects.
Definition: context_gpu.h:39
TypeMeta is a thin class that allows us to store the type of a container such as a blob...
Definition: typeid.h:66
CudaMemoryPoolType GetCudaMemoryPoolType()
Gets the current memory pool type used by Caffe2.
Tensor is the basic class in Caffe2 that stores a contiguous memory with its shape information...
Definition: tensor.h:73
Simple registry implementation in Caffe2 that uses static variables to register object creators durin...
int GetGPUIDForPointer(const void *ptr)
Gets the GPU id that the current pointer is located at.
Definition: common_gpu.cc:99
The CPU Context, representing the bare minimum of what a Context class in Caffe2 should implement...
Definition: context.h:105
const size_t & itemsize() const
Returns the size of the item.
Definition: typeid.h:121
An allocator that does the CPU memory allocation with pinned memory.
Definition: context_gpu.h:236