1 #include "caffe2/core/common_gpu.h" 2 #include "caffe2/core/context_gpu.h" 3 #include "caffe2/core/operator.h" 4 #include "caffe2/cuda_rtc/common_rtc.h" 8 class ElementwiseRTCFunction
9 :
public CudaRTCFunction<ElementwiseRTCFunction> {
11 ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
13 template <
typename... Args>
14 string KernelName(Args... args) {
return name_; }
16 template <
typename... Args>
17 string GetSource(Args... args);
24 string ElementwiseRTCFunction::GetSource(
25 int input_size,
int output_size,
26 const string command_string) {
28 ss <<
"extern \"C\" __global__ void " << name_ <<
29 "(const size_t nthreads, \n";
31 int remain_params = input_size + output_size;
32 for (
int i = 0; i < input_size; ++i) {
33 ss <<
"const float* in" << i
34 << ((remain_params--) ?
", \n" :
"");
36 for (
int i = 0; i < output_size; ++i) {
37 ss <<
"float* out" << i
38 << ((remain_params--) ?
", \n" :
"");
41 "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n" 42 "index < nthreads; index += blockDim.x * gridDim.x) {\n" 43 << command_string <<
"\n" 76 const string src = OperatorBase::GetSingleArgument<string>(
78 CAFFE_ENFORCE(src.size(),
"Op should have a non-zero source code size.");
79 func_.Compile(InputSize(), OutputSize(), src);
83 bool RunOnDevice()
override {
84 static_assert(
sizeof(
void*) ==
sizeof(
size_t),
85 "The argbuffer relies on the assumption that void* and " 86 "size_t have the same size.");
87 vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
88 size_t* argBuffer = argBuffer_vec.data();
90 Input(0).size() < std::numeric_limits<int>::max(),
91 "The kernel function currently only supports int index.");
92 argBuffer[0] = Input(0).size();
93 void** ptr_buffer =
reinterpret_cast<void**
>(argBuffer + 1);
94 for (
int i = 0; i < InputSize(); ++i) {
95 ptr_buffer[i] =
const_cast<float*
>(Input(i).data<
float>());
97 for (
int i = 0; i < OutputSize(); ++i) {
98 Output(i)->ResizeLike(Input(0));
99 ptr_buffer[i + InputSize()] = Output(i)->mutable_data<
float>();
101 size_t argBufferSize =
sizeof(argBuffer);
103 CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
104 CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
108 CAFFE_CUDA_NUM_THREADS, 1, 1,
109 0, context_.cuda_stream(), config);
114 ElementwiseRTCFunction func_;
118 REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC,
ElementwiseRTCOp);
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Simple registry implementation in Caffe2 that uses static variables to register object creators durin...
A GPU operator that can generate limited elementwise operations.
int CAFFE_GET_BLOCKS(const int N)
Compute the number of blocks needed to run N threads.