#ifndef CAFFE2_CORE_CONTEXT_GPU_H_
#define CAFFE2_CORE_CONTEXT_GPU_H_

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/core/logging.h"

enum class CudaMemoryPoolType {
  // ... (enumerators not included in this listing)
};
// A struct to host thread-local cuda objects.
struct ThreadLocalCUDAObjects {
  ThreadLocalCUDAObjects() {
    for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
      cuda_streams_[i] = vector<cudaStream_t>();
      cublas_handles_[i] = vector<cublasHandle_t>();
    }
  }
  cudaStream_t GetStream(int gpu, int stream_id) {
    vector<cudaStream_t>& gpu_streams = cuda_streams_[gpu];
    if (gpu_streams.size() <= stream_id) {
      gpu_streams.resize(stream_id + 1, nullptr);
    }
    if (!gpu_streams[stream_id]) {
      CUDA_ENFORCE(cudaStreamCreateWithFlags(
          &gpu_streams[stream_id], cudaStreamNonBlocking));
    }
    return gpu_streams[stream_id];
  }
  cublasHandle_t GetHandle(int gpu, int stream_id) {
    vector<cublasHandle_t>& gpu_handles = cublas_handles_[gpu];
    if (gpu_handles.size() <= stream_id) {
      gpu_handles.resize(stream_id + 1, nullptr);
    }
    if (!gpu_handles[stream_id]) {
      CUBLAS_ENFORCE(cublasCreate(&gpu_handles[stream_id]));
      CUBLAS_ENFORCE(cublasSetPointerMode(
          gpu_handles[stream_id], CUBLAS_POINTER_MODE_HOST));
      CUBLAS_ENFORCE(cublasSetStream(
          gpu_handles[stream_id], GetStream(gpu, stream_id)));
    }
    return gpu_handles[stream_id];
  }
  ~ThreadLocalCUDAObjects() {
    for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_GPUS; ++i) {
      for (auto& handle : cublas_handles_[i]) {
        if (handle) {
          CUBLAS_CHECK(cublasDestroy(handle));
        }
      }
      for (auto& stream : cuda_streams_[i]) {
        if (stream) {
          CUDA_CHECK(cudaStreamDestroy(stream));
        }
      }
    }
  }

  vector<cudaStream_t> cuda_streams_[CAFFE2_COMPILE_TIME_MAX_GPUS];
  vector<cublasHandle_t> cublas_handles_[CAFFE2_COMPILE_TIME_MAX_GPUS];
};
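// A minimal usage sketch (illustrative, not part of the upstream header),
// assuming the struct is used directly; in Caffe2 it is reached through
// CUDAContext. The first request for a given (gpu, stream_id) pair creates
// the stream or handle; later requests reuse the cached object.
//
//   ThreadLocalCUDAObjects objects;
//   cudaStream_t s = objects.GetStream(/*gpu=*/0, /*stream_id=*/0);  // created here
//   cudaStream_t s2 = objects.GetStream(0, 0);                       // same stream as s
//   cublasHandle_t h = objects.GetHandle(0, /*stream_id=*/2);
//   // h is bound to GetStream(0, 2), so cuBLAS calls made through h are
//   // ordered on that stream.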
class CUDAContext final {
 public:
  // ... (constructors not included in this listing)

  ~CUDAContext() {
    if (curand_generator_) {
      CURAND_ENFORCE(curandDestroyGenerator(curand_generator_));
    }
    CAFFE_ENFORCE(FinishDeviceComputation());
  }
  inline void SwitchToDevice(int stream_id) {
    set_stream_id(stream_id);
    CUDA_ENFORCE(cudaSetDevice(gpu_id_));
  }

  inline void SwitchToDevice() {
    SwitchToDevice(0);
  }
  bool FinishDeviceComputation() {
    cudaStreamSynchronize(cuda_objects_.GetStream(gpu_id_, stream_id_));
    cudaError_t error = cudaGetLastError();
    if (error == cudaSuccess) {
      return true;
    } else {
      LOG(ERROR) << "Encountered CUDA error: " << cudaGetErrorString(error);
      return false;
    }
  }
  inline int cuda_gpu_id() const { return gpu_id_; }

  inline cudaStream_t cuda_stream() {
    return cuda_stream(gpu_id_, stream_id_);
  }

  inline cudaStream_t cuda_stream() const {
    return cuda_stream(gpu_id_, stream_id_);
  }

  static cudaStream_t cuda_stream(int gpu_id, int stream_id) {
    return cuda_objects_.GetStream(gpu_id, stream_id);
  }

  cublasHandle_t cublas_handle() {
    return cuda_objects_.GetHandle(gpu_id_, stream_id_);
  }
  curandGenerator_t& curand_generator() {
    if (!curand_generator_) {
      CURAND_ENFORCE(curandCreateGenerator(
          &curand_generator_, CURAND_RNG_PSEUDO_DEFAULT));
      CURAND_ENFORCE(curandSetPseudoRandomGeneratorSeed(
          curand_generator_, random_seed_));
      CHECK_NOTNULL(curand_generator_);
    }
    CURAND_ENFORCE(curandSetStream(curand_generator_, cuda_stream()));
    return curand_generator_;
  }
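  // A rough usage sketch (illustrative, not part of the upstream header) of
  // how an operator-style caller might drive this context; `my_kernel`,
  // `grid`, and `block` are placeholders, not Caffe2 APIs.
  //
  //   CUDAContext context(/* gpu id; constructor not shown above */);
  //   context.SwitchToDevice();              // cudaSetDevice + stream 0
  //   my_kernel<<<grid, block, 0, context.cuda_stream()>>>(/* args */);
  //   cublasHandle_t handle = context.cublas_handle();   // bound to the same stream
  //   context.FinishDeviceComputation();     // synchronize and surface errors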
  static void* New(size_t nbytes);

  static void Delete(void* data);

  static std::mutex& mutex();
  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
    CUDA_ENFORCE(cudaMemcpyAsync(
        dst,
        src,
        nbytes,
        cudaMemcpyDefault,
        cuda_objects_.GetStream(gpu_id_, stream_id_)));
  }
  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T* src, T* dst) {
    CopyBytes<SrcContext, DstContext>(
        n * sizeof(T),
        static_cast<const void*>(src),
        static_cast<void*>(dst));
  }
  template <class SrcContext, class DstContext>
  inline void CopyItems(
      const TypeMeta& meta, size_t n, const void* src, void* dst) {
    CAFFE_ENFORCE(!meta.copy(), "CUDAContext requires fundamental types.");
    CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
  }
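  // A minimal sketch (illustrative, not part of the upstream header) of the
  // templated copy helpers, assuming `d_src` points to `n` floats on this
  // context's GPU and `h_dst` is a host buffer of the same size:
  //
  //   context.Copy<float, CUDAContext, CPUContext>(n, d_src, h_dst);
  //   // equivalent to:
  //   context.CopyBytes<CUDAContext, CPUContext>(n * sizeof(float), d_src, h_dst);
  //
  // Both enqueue cudaMemcpyAsync on this context's stream, so h_dst is only
  // safe to read after FinishDeviceComputation() (or another stream sync).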
 protected:
  void set_stream_id(int stream_id) {
    stream_id_ = stream_id;
  }

  int gpu_id_;
  int stream_id_ = 0;
  int random_seed_;
  curandGenerator_t curand_generator_{nullptr};
  static thread_local ThreadLocalCUDAObjects cuda_objects_;
};
template <>
inline void CPUContext::CopyBytes<CUDAContext, CPUContext>(
    size_t nbytes, const void* src, void* dst) {
  CUDAContext context(GetGPUIDForPointer(src));
  context.CopyBytes<CUDAContext, CPUContext>(nbytes, src, dst);
}
template <>
inline void CPUContext::CopyBytes<CPUContext, CUDAContext>(
    size_t nbytes, const void* src, void* dst) {
  CUDAContext context(GetGPUIDForPointer(dst));
  context.CopyBytes<CPUContext, CUDAContext>(nbytes, src, dst);
}
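// A minimal sketch (illustrative, not part of the upstream header) of how the
// specializations above are reached from CPU-side code; `device_buf` and
// `host_buf` are hypothetical pointers of at least `nbytes` bytes:
//
//   CPUContext cpu_context;
//   // Device-to-host copy; a temporary CUDAContext does the actual work.
//   cpu_context.CopyBytes<CUDAContext, CPUContext>(nbytes, device_buf, host_buf);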
// An allocator that does the CPU memory allocation with pinned memory.
struct PinnedCPUAllocator final : CPUAllocator {
  void* New(size_t nbytes) override {
    void* data;
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    CUDA_ENFORCE(cudaMallocHost(&data, nbytes));
    memset(data, 0, nbytes);
    return data;
  }
  void Delete(void* data) override {
    std::lock_guard<std::mutex> lock(CUDAContext::mutex());
    cudaError_t err = cudaFreeHost(data);
    if (err == cudaErrorInvalidValue) {
      // The pointer was not a pinned allocation: fall back to a regular
      // free() and clear the sticky CUDA error.
      free(data);
      cudaGetLastError();
    } else {
      // For any other error, surface it through the normal CUDA check.
      CUDA_CHECK(err);
    }
  }
};
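// A direct, hypothetical usage sketch (illustrative, not part of the upstream
// header). Pinned host memory is what lets cudaMemcpyAsync overlap with host
// work instead of silently falling back to a synchronous copy.
//
//   PinnedCPUAllocator allocator;
//   void* host_buf = allocator.New(1 << 20);   // zero-filled, pinned allocation
//   // ... use host_buf as a staging buffer for async copies ...
//   allocator.Delete(host_buf);                // cudaFreeHost (or free() fallback)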
#endif  // CAFFE2_CORE_CONTEXT_GPU_H_