Caffe2 - C++ API
A deep learning, cross-platform ML framework
pool_op_rtc_gpu.cc
#include <cstdio>

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/pool_op.h"
#include "caffe2/cuda_rtc/common_rtc.h"

namespace caffe2 {
namespace {
class AveragePool {};
class MaxPool {};
} // namespace

namespace {

// The max pool forward function, with parameters written in const int.
const char kMaxPoolForwardNCHWSource[] = R"(
extern "C"
__global__ void %s(const float* bottom_data, float* top_data) {
  const int nthreads = %d;
  const int channels = %d;
  const int height = %d;
  const int width = %d;
  const int pooled_height = %d;
  const int pooled_width = %d;
  const int kernel_h = %d;
  const int kernel_w = %d;
  const int stride_h = %d;
  const int stride_w = %d;
  const int pad_t = %d;
  const int pad_l = %d;
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < nthreads; index += blockDim.x * gridDim.x) {
    int pw = index %% pooled_width;
    int ph = (index / pooled_width) %% pooled_height;
    int c = (index / (pooled_width * pooled_height)) %% channels;
    int n = index / (pooled_width * pooled_height * channels);
    int hstart = ph * stride_h - pad_t;
    int wstart = pw * stride_w - pad_l;
    int hend = min(hstart + kernel_h, height);
    int wend = min(wstart + kernel_w, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    float maxval = -1.0e37f;
    const float* bdata_offset = bottom_data + n * channels * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        maxval = fmaxf(
            bdata_offset[c * height * width + h * width + w], maxval);
      }
    }
    top_data[index] = maxval;
  }
}
)";

// The max pool backward function, with parameters written in const int.
const char kMaxPoolBackwardNCHWSource[] = R"(
extern "C"
__global__ void %s(
    const float* const bottom_data, const float* const top_data,
    const float* const top_diff, float* const bottom_diff) {
  const int nthreads = %d;
  const int num = %d;
  const int channels = %d;
  const int height = %d;
  const int width = %d;
  const int pooled_height = %d;
  const int pooled_width = %d;
  const int kernel_h = %d;
  const int kernel_w = %d;
  const int stride_h = %d;
  const int stride_w = %d;
  const int pad_t = %d;
  const int pad_l = %d;
  for (int index = blockIdx.x * blockDim.x + threadIdx.x;
       index < nthreads; index += blockDim.x * gridDim.x) {
    const int w = index %% width + pad_l;
    const int h = (index / width) %% height + pad_t;
    const int c = (index / width / height) %% channels;
    const int n = index / width / height / channels;
    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
    const int phend = min(h / stride_h + 1, pooled_height);
    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
    const int pwend = min(w / stride_w + 1, pooled_width);
    const int top_offset =
        (n * channels + c) * pooled_height * pooled_width;
    bottom_diff[index] = 0;
    for (int ph = phstart; ph < phend; ++ph) {
      for (int pw = pwstart; pw < pwend; ++pw) {
        int top_local_offset = top_offset + ph * pooled_width + pw;
        if (bottom_data[index] == top_data[top_local_offset]) {
          bottom_diff[index] += top_diff[top_local_offset];
        }
      }
    }
  }
}
)";

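// Illustration: the GetSource() specializations below fill the %s/%d
// placeholders with snprintf, so the %% escapes above collapse to single %
// (modulo) operators in the generated code. Assuming a hypothetical 1x1x4x4
// NCHW input pooled with a 2x2 kernel, stride 2, and no padding, the rendered
// forward kernel would begin roughly like:
//
//   extern "C"
//   __global__ void <unique_kernel_name>(const float* bottom_data,
//                                        float* top_data) {
//     const int nthreads = 4;   // output_size = 1 * 1 * 2 * 2
//     const int channels = 1;
//     const int height = 4;
//     const int width = 4;
//     const int pooled_height = 2;
//     const int pooled_width = 2;
//     const int kernel_h = 2;
//     ...
//   }
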
class MaxPoolRTCFunction : public CudaRTCFunction<MaxPoolRTCFunction> {
 public:
  MaxPoolRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

  template <typename... Args>
  string KernelName(Args... args) { return name_; }

  template <typename... Args>
  string GetSource(Args... args);

 private:
  string name_;
};

class MaxPoolGradientRTCFunction
    : public CudaRTCFunction<MaxPoolGradientRTCFunction> {
 public:
  MaxPoolGradientRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

  template <typename... Args>
  string KernelName(Args... args) { return name_; }

  template <typename... Args>
  string GetSource(Args... args);

 private:
  string name_;
};


template <>
string MaxPoolRTCFunction::GetSource(
    const int output_size,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l) {
  char buffer[65536];
  int nbytes = snprintf(
      buffer, 65536, kMaxPoolForwardNCHWSource, name_.c_str(), output_size,
      channels, height, width, pooled_height, pooled_width, kernel_h, kernel_w,
      stride_h, stride_w, pad_t, pad_l);
  DCHECK_GE(nbytes, 0);
  DCHECK_LT(nbytes, 65536);
  return string(buffer);
}

template <>
string MaxPoolGradientRTCFunction::GetSource(
    const int output_size,
    const int num,
    const int channels,
    const int height,
    const int width,
    const int pooled_height,
    const int pooled_width,
    const int kernel_h,
    const int kernel_w,
    const int stride_h,
    const int stride_w,
    const int pad_t,
    const int pad_l) {
  char buffer[65536];
  int nbytes = snprintf(
      buffer, 65536, kMaxPoolBackwardNCHWSource, name_.c_str(), output_size,
      num, channels, height, width, pooled_height, pooled_width, kernel_h,
      kernel_w, stride_h, stride_w, pad_t, pad_l);
  DCHECK_GE(nbytes, 0);
  DCHECK_LT(nbytes, 65536);
  return string(buffer);
}

} // namespace

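// The operators below tie these pieces together: whenever the input shape
// changes, GetSource() renders the kernel text for the current dimensions,
// func_.Compile() (inherited from the CudaRTCFunction base in common_rtc.h)
// builds it at runtime with NVRTC, and func_.Launch() dispatches the compiled
// kernel on the operator's CUDA stream using the usual CAFFE_GET_BLOCKS /
// CAFFE_CUDA_NUM_THREADS launch configuration.
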
class MaxPoolRTCOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  MaxPoolRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CUDAContext>(operator_def, ws) {
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Currently only NCHW is supported.");
  }
  ~MaxPoolRTCOp() {}

  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto* Y = Output(0);
    ConvPoolOpBase::SetOutputSize(X, Y, X.dim32(1));

    if (input_dims_ != X.dims()) {
      // recompile
      VLOG(1) << "MaxPool RTC recompiling";
      CAFFE_ENFORCE_LT(Y->size(), std::numeric_limits<int>::max());
      func_.Compile(
          static_cast<int>(Y->size()),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          Y->dim32(2),
          Y->dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l());
      input_dims_ = X.dims();
    }
    // Carry out the pooling computation.
    func_.Launch(CAFFE_GET_BLOCKS(Y->size()), 1, 1, CAFFE_CUDA_NUM_THREADS,
                 1, 1, 0, context_.cuda_stream(),
                 X.data<float>(), Y->mutable_data<float>());
    return true;
  }

  bool RunOnDeviceWithOrderNHWC() override {
    LOG(FATAL) << "Not implemented.";
    return false;
  }

 private:
  MaxPoolRTCFunction func_;
  vector<TIndex> input_dims_;
};

class MaxPoolGradientRTCOp final : public ConvPoolOpBase<CUDAContext> {
 public:
  MaxPoolGradientRTCOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CUDAContext>(operator_def, ws) {
    CAFFE_ENFORCE_EQ(
        order_, StorageOrder::NCHW, "Currently only NCHW is supported.");
  }
  ~MaxPoolGradientRTCOp() {}

  bool RunOnDeviceWithOrderNCHW() override {
    auto& X = Input(0);
    auto& Y = Input(1);
    auto& dY = Input(2);
    CAFFE_ENFORCE_EQ(dY.ndim(), 4);
    auto* dX = Output(0);
    dX->ResizeLike(X);
    ConvPoolOpBase<CUDAContext>::ComputePads({X.dim32(2), X.dim32(3)});
    if (input_dims_ != X.dims()) {
      VLOG(1) << "MaxPoolGradient RTC recompiling";
      CAFFE_ENFORCE_LT(X.size(), std::numeric_limits<int>::max());
      func_.Compile(
          static_cast<int>(X.size()),
          X.dim32(0),
          X.dim32(1),
          X.dim32(2),
          X.dim32(3),
          dY.dim32(2),
          dY.dim32(3),
          kernel_h(),
          kernel_w(),
          stride_h(),
          stride_w(),
          pad_t(),
          pad_l());
      input_dims_ = X.dims();
    }
    func_.Launch(CAFFE_GET_BLOCKS(X.size()), 1, 1, CAFFE_CUDA_NUM_THREADS, 1, 1,
                 0, context_.cuda_stream(),
                 X.data<float>(), Y.data<float>(), dY.data<float>(),
                 dX->mutable_data<float>());
    return true;
  }

  bool RunOnDeviceWithOrderNHWC() override {
    LOG(FATAL) << "Not implemented.";
    return false;
  }

 private:
  MaxPoolGradientRTCFunction func_;
  vector<TIndex> input_dims_;
};

namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPool, NVRTC, MaxPoolRTCOp);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(MaxPoolGradient, NVRTC,
                                   MaxPoolGradientRTCOp);
} // namespace
} // namespace caffe2
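
A minimal usage sketch (not from this file; the blob names, pooling arguments, and old-style proto helpers such as caffe2::CUDA and MakeArgument are assumptions for an early Caffe2 build): selecting these RTC kernels only requires requesting the NVRTC engine on a CUDA MaxPool operator.

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include "caffe2/utils/proto_utils.h"

// Hypothetical helper: builds and runs a MaxPool op that dispatches to
// MaxPoolRTCOp via the NVRTC engine. Assumes "X" is an NCHW float blob
// already present on the GPU in the given workspace.
void RunMaxPoolWithNVRTC(caffe2::Workspace* ws) {
  caffe2::OperatorDef def;
  def.set_type("MaxPool");
  def.set_engine("NVRTC");  // routes dispatch to MaxPoolRTCOp
  def.add_input("X");
  def.add_output("Y");
  // Assumes the old-style DeviceType enum in caffe2.proto.
  def.mutable_device_option()->set_device_type(caffe2::CUDA);
  def.add_arg()->CopyFrom(caffe2::MakeArgument<int>("kernel", 2));
  def.add_arg()->CopyFrom(caffe2::MakeArgument<int>("stride", 2));
  auto op = caffe2::CreateOperator(def, ws);
  // The first Run() triggers the NVRTC compile; later runs with the same
  // input shape reuse the compiled kernel.
  CAFFE_ENFORCE(op->Run());
}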
Referenced declarations:
- Workspace: a class that holds all the related objects created during runtime ((1) all blobs, ...). Definition: workspace.h:53
- Operator registration: a simple registry implementation in Caffe2 that uses static variables to register object creators during program initialization.
- int CAFFE_GET_BLOCKS(const int N): computes the number of blocks needed to run N threads. Definition: common_gpu.h:239