// NVRTC_CHECK(condition): stores `condition` in an nvrtcResult; if it is not
// NVRTC_SUCCESS, emits a LOG(FATAL) message containing __FILE__, __LINE__ and
// nvrtcGetErrorString(result).
1 #ifndef CAFFE2_CUDA_RTC_COMMON_RTC_H_ 2 #define CAFFE2_CUDA_RTC_COMMON_RTC_H_ 7 #define NVRTC_CHECK(condition) \ 9 nvrtcResult result = condition; \ 10 if (result != NVRTC_SUCCESS) { \ 11 LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \ 12 << nvrtcGetErrorString(result); \ 18 template <
typename Derived>
// Unloads module_ through the CUDA driver API.
// NOTE(review): presumably the destructor body, guarded by module_loaded_ —
// the enclosing lines are not shown; confirm.
24 CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_));
// Compile: builds the kernel described by Derived with NVRTC and loads the
// resulting PTX as a CUDA driver-API module.
//   args - passed on to Derived::GetSource and Derived::KernelName.
// Visible steps: obtain source and kernel name from Derived, create and
// compile an NVRTC program, copy out its PTX, load it with
// cuModuleLoadDataEx, and look up the kernel with cuModuleGetFunction.
// NVRTC calls are wrapped in NVRTC_CHECK and driver calls in
// CUDA_DRIVERAPI_ENFORCE.
31 template <
typename... Args>
32 void Compile(Args... args) {
// Static dispatch: `this` is cast to Derived* to fetch the kernel source text
// and the kernel's entry-point name.
33 string src =
static_cast<Derived*
>(
this)->GetSource(args...);
34 string name =
static_cast<Derived*
>(
this)->KernelName(args...);
// Both strings are logged at verbosity level 1.
35 VLOG(1) <<
"function name: " << name;
36 VLOG(1) <<
"function src:\n" << src;
// Create the NVRTC program from src. Per the NVRTC API the trailing arguments
// are: program name (nullptr), header count (0), header sources (nullptr) and
// include names (nullptr).
39 NVRTC_CHECK(nvrtcCreateProgram(
40 &prog, src.c_str(),
nullptr, 0,
nullptr,
nullptr));
// Compile options; the target architecture is hard-coded to compute_35.
// NOTE(review): the remaining options and the arguments passed to
// nvrtcCompileProgram are not shown here — confirm the option count matches
// the array.
44 const char *nvrtc_opts[] = {
"--gpu-architecture=compute_35",
46 nvrtcResult compile_result = nvrtcCompileProgram(
// On compile failure: query the program log size, read the log into a buffer
// of that size, then LOG(FATAL) with the NVRTC error string.
48 if (compile_result != NVRTC_SUCCESS) {
50 NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size));
51 vector<char> nvrtc_log(log_size);
52 NVRTC_CHECK(nvrtcGetProgramLog(prog, nvrtc_log.data()));
53 LOG(FATAL) <<
"Compilation failure for nvrtc(" 54 << nvrtcGetErrorString(compile_result) <<
"): \n" 58 NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size));
// nvrtc_ptx is sized from ptx_size and receives the generated PTX; the NVRTC
// program is destroyed once the PTX has been copied out.
59 vector<char> nvrtc_ptx(ptx_size);
60 NVRTC_CHECK(nvrtcGetPTX(prog, nvrtc_ptx.data()));
61 NVRTC_CHECK(nvrtcDestroyProgram(&prog));
// Unload module_.
// NOTE(review): presumably guarded by module_loaded_ so that a module from an
// earlier Compile call is released before loading the new one — the guard is
// not shown; confirm.
64 CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_));
// Load the PTX image into module_ with no JIT options (option count 0, null
// option and value arrays), then record that a module is loaded.
66 CUDA_DRIVERAPI_ENFORCE(
67 cuModuleLoadDataEx(&module_, nvrtc_ptx.data(), 0, 0, 0));
68 module_loaded_ =
true;
// Resolve the kernel entry point by name and store the handle in kernel_.
69 CUDA_DRIVERAPI_ENFORCE(
70 cuModuleGetFunction(&kernel_, module_, name.c_str()));
// Launch: launches kernel_ via cuLaunchKernel.
//   gx, gy, gz - grid dimensions.
//   bx, by, bz - block dimensions.
//   shared_mem - passed as cuLaunchKernel's dynamic shared-memory byte count.
//   stream     - stream the launch is issued on.
//   args       - kernel arguments; the pack's declaration is not shown here.
// Precondition: module_loaded_ must be set (Compile sets it after loading a
// module).
73 template <
typename... Args>
74 void Launch(
unsigned int gx,
unsigned int gy,
unsigned int gz,
75 unsigned int bx,
unsigned int by,
unsigned int bz,
76 unsigned int shared_mem, cudaStream_t stream,
// Condition and message for the precondition check.
// NOTE(review): the enforcing macro itself is not shown — confirm which one.
79 module_loaded_,
"Cannot call Launch before a module is loaded.");
// cuLaunchKernel's kernelParams is an array of pointers, one per kernel
// argument, so take the address of each element of the pack.
// NOTE(review): with an empty pack this initializer would produce a
// zero-length array — confirm callers always pass at least one argument.
80 void * args_voidp[] = {&args...};
// The final 0 is cuLaunchKernel's `extra` parameter (unused on this path).
81 CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
82 kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, args_voidp, 0));
// LaunchEx: launches kernel_ via cuLaunchKernel, supplying kernel arguments
// through the `extra` parameter instead of a kernelParams array.
//   gx, gy, gz - grid dimensions.
//   bx, by, bz - block dimensions.
//   shared_mem - passed as cuLaunchKernel's dynamic shared-memory byte count.
//   stream     - stream the launch is issued on.
//   extra      - forwarded as cuLaunchKernel's last argument; its declaration
//                is not shown here.
// Precondition: module_loaded_ must be set (Compile sets it after loading a
// module).
85 void LaunchEx(
unsigned int gx,
unsigned int gy,
unsigned int gz,
86 unsigned int bx,
unsigned int by,
unsigned int bz,
87 unsigned int shared_mem, cudaStream_t stream,
// Condition and message for the precondition check.
// NOTE(review): the message says "Launch" although this is LaunchEx, and the
// enforcing macro itself is not shown — confirm both are intended.
90 module_loaded_,
"Cannot call Launch before a module is loaded.");
// kernelParams is nullptr here; arguments travel via `extra`.
91 CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel(
92 kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream,
nullptr, extra));
// GetUniqueName: builds a kernel name consisting of the prefix
// "_cuda_kernel_" followed by `len` (20) letters drawn from `alpha` using
// rand().
// NOTE(review): because the suffix comes from rand(), names are not guaranteed
// to be unique and depend on how the process seeds rand(). The return
// statement is not shown here; presumably the stream's contents are returned —
// confirm.
102 inline string GetUniqueName() {
103 static constexpr
int len = 20;
// Candidate characters: the 26 lowercase followed by the 26 uppercase ASCII
// letters.
104 static const char alpha[] =
105 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
107 std::stringstream ss;
108 ss <<
"_cuda_kernel_";
109 for (
int i = 0; i < len; ++i) {
// sizeof(alpha) - 1 excludes the terminating NUL, so the index stays within
// the 52 letters.
110 ss << alpha[rand() % (
sizeof(alpha) - 1)];
117 #endif // CAFFE2_CUDA_RTC_COMMON_RTC_H_ Simple registry implementation in Caffe2 that uses static variables to register object creators durin...