Caffe2 - C++ API
A deep learning, cross platform ML framework
elemenntwise_rtc_gpu.cc
1 #include "caffe2/core/common_gpu.h"
2 #include "caffe2/core/context_gpu.h"
3 #include "caffe2/core/operator.h"
4 #include "caffe2/cuda_rtc/common_rtc.h"
5 
6 namespace caffe2 {
7 namespace {
8 class ElementwiseRTCFunction
9  : public CudaRTCFunction<ElementwiseRTCFunction> {
10  public:
11  ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
12 
13  template <typename... Args>
14  string KernelName(Args... args) { return name_; }
15 
16  template <typename... Args>
17  string GetSource(Args... args);
18 
19  private:
20  string name_;
21 };
22 
23 template<>
24 string ElementwiseRTCFunction::GetSource(
25  int input_size, int output_size,
26  const string command_string) {
27  std::stringstream ss;
28  ss << "extern \"C\" __global__ void " << name_ <<
29  "(const size_t nthreads, \n";
30  // Insert the parameter list.
31  int remain_params = input_size + output_size;
32  for (int i = 0; i < input_size; ++i) {
33  ss << "const float* in" << i
34  << ((remain_params--) ? ", \n" : "");
35  }
36  for (int i = 0; i < output_size; ++i) {
37  ss << "float* out" << i
38  << ((remain_params--) ? ", \n" : "");
39  }
40  ss << ") {\n"
41  "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n"
42  "index < nthreads; index += blockDim.x * gridDim.x) {\n"
43  << command_string << "\n"
44  << "}\n}";
45  return ss.str();
46 }
47 } // namespace
48 
72 class ElementwiseRTCOp final : public Operator<CUDAContext> {
73  public:
74  ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws)
75  : Operator<CUDAContext>(operator_def, ws) {
76  const string src = OperatorBase::GetSingleArgument<string>(
77  "rtc_src", "");
78  CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
79  func_.Compile(InputSize(), OutputSize(), src);
80  }
81  ~ElementwiseRTCOp() {}
82 
83  bool RunOnDevice() override {
84  static_assert(sizeof(void*) == sizeof(size_t),
85  "The argbuffer relies on the assumption that void* and "
86  "size_t have the same size.");
87  vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
88  size_t* argBuffer = argBuffer_vec.data();
89  CAFFE_ENFORCE(
90  Input(0).size() < std::numeric_limits<int>::max(),
91  "The kernel function currently only supports int index.");
92  argBuffer[0] = Input(0).size();
93  void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1);
94  for (int i = 0; i < InputSize(); ++i) {
95  ptr_buffer[i] = const_cast<float*>(Input(i).data<float>());
96  }
97  for (int i = 0; i < OutputSize(); ++i) {
98  Output(i)->ResizeLike(Input(0));
99  ptr_buffer[i + InputSize()] = Output(i)->mutable_data<float>();
100  }
101  size_t argBufferSize = sizeof(argBuffer);
102  void* config[] = {
103  CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
104  CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
105  CU_LAUNCH_PARAM_END
106  };
107  func_.LaunchEx(CAFFE_GET_BLOCKS(Input(0).size()), 1, 1,
108  CAFFE_CUDA_NUM_THREADS, 1, 1,
109  0, context_.cuda_stream(), config);
110  return true;
111  }
112 
113  private:
114  ElementwiseRTCFunction func_;
115 };
116 
117 namespace {
118 REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
119 }
120 
121 } // namespace caffe2
Workspace is a class that holds all the related objects created during runtime: (1) all blobs...
Definition: workspace.h:53
Simple registry implementation in Caffe2 that uses static variables to register object creators durin...
A GPU operator that can generate limited elementwise operations.
int CAFFE_GET_BLOCKS(const int N)
Compute the number of blocks needed to run N threads.
Definition: common_gpu.h:239