Caffe2 - Python API
A deep learning, cross platform ML framework
optimizer.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from collections import namedtuple
from caffe2.python import core
from caffe2.proto import caffe2_pb2

_OPTIMIZER_ITERATION_NAME = "optimizer_iteration"

class Optimizer(object):
    def __init__(self):
        AuxParams = namedtuple("AuxParams", ["local", "shared"])
        self._aux_params = AuxParams(local=[], shared=[])

    def __call__(self, net, param_init_net, param, grad):
        raise NotImplementedError()

    @staticmethod
    def build_lr(net, param_init_net, base_learning_rate,
                 learning_rate_blob="lr", policy="fixed",
                 iter_val=0, **kwargs):
        if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
            # Add training operators.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                iteration = param_init_net.ConstantFill(
                    [], _OPTIMIZER_ITERATION_NAME, shape=[1],
                    value=iter_val,
                    dtype=core.DataType.INT32)

                iter_mutex = param_init_net.CreateMutex(
                    [], ["iteration_mutex"])
                net.AtomicIter([iter_mutex, iteration], [iteration])
        else:
            iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)

        # There is one interesting thing here: since we are minimizing, we are
        # doing "descent" so the learning rate is set to be negative.
        lr = net.LearningRate(
            [iteration],
            learning_rate_blob,
            base_lr=-base_learning_rate,
            policy=policy,
            **kwargs
        )
        return lr, iteration

    @staticmethod
    def dedup(net, sparse_dedup_aggregator, grad):
        assert isinstance(grad, core.GradientSlice)
        if sparse_dedup_aggregator:
            return net.DeduplicateGradientSlices(
                grad, aggregator=sparse_dedup_aggregator)
        else:
            return grad

    def get_auxiliary_parameters(self):
        """Returns the auxiliary parameters of this optimizer.

        Returns:
            aux_params: A namedtuple, AuxParams.

            aux_params.local stores a list of blobs. Each blob is a local
            auxiliary parameter. A local auxiliary parameter is a parameter
            kept in parallel with a learning rate parameter. Take adagrad as
            an example; the local auxiliary parameter is the squared-sum
            parameter, because every learning rate has a squared sum
            associated with it.

            aux_params.shared also stores a list of blobs. Each blob is a
            shared auxiliary parameter. A shared auxiliary parameter is a
            parameter shared across all the learning rate parameters. Take
            adam as an example; the iteration parameter is shared, because
            all the learning rates share the same iteration counter.
        """
        return self._aux_params
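
    # Illustrative note (not part of the original source): for a hypothetical
    # parameter blob "fc_w", an AdagradOptimizer would typically report
    #     opt.get_auxiliary_parameters().local  -> [fc_w_squared_sum]
    #     opt.get_auxiliary_parameters().shared -> []
    # while an AdamOptimizer places fc_w_first_moment and fc_w_second_moment
    # in .local and the shared optimizer_iteration blob in .shared.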


class SgdOptimizer(Optimizer):
    def __init__(self, base_learning_rate=0.01, policy='fixed',
                 momentum=0.0, **kwargs):
        super(SgdOptimizer, self).__init__()
        self.base_learning_rate = base_learning_rate
        self.policy = policy
        self.momentum = momentum
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.base_learning_rate <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.base_learning_rate,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        ONE = param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
        self._aux_params.shared.append(ONE)

        if self.momentum > 0:
            momentum_data = param_init_net.ConstantFill(
                param, str(param) + "_momentum", value=0.)
            self._aux_params.local.append(momentum_data)

        if isinstance(grad, core.GradientSlice):
            assert self.momentum == 0., "Doesn't support momentum for sparse"
            net.ScatterWeightedSum(
                [param, ONE, grad.indices, grad.values, lr],
                param
            )
        else:
            if self.momentum > 0.:
                # MomentumSGD produces an adjusted gradient with the learning
                # rate already folded in, so the WeightedSum below applies it
                # with a coefficient of ONE instead of lr.
                net.MomentumSGD(
                    [grad, momentum_data, lr], [grad, momentum_data],
                    momentum=self.momentum,
                    nesterov=1)
                coeff = ONE
            else:
                coeff = lr

            net.WeightedSum(
                [param, ONE, grad, coeff],
                param
            )


class AdagradOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, policy="fixed",
                 sparse_dedup_aggregator=None, engine='', **kwargs):
        super(AdagradOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        param_squared_sum = param_init_net.ConstantFill(
            [param],
            str(param) + "_squared_sum",
            value=0.0
        )
        self._aux_params.local.append(param_squared_sum)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdagrad(
                [param, param_squared_sum, grad.indices, grad.values, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )
        else:
            net.Adagrad(
                [param, param_squared_sum, grad, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )


class FtrlOptimizer(Optimizer):
    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        super(FtrlOptimizer, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_ftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        self._aux_params.local.append(nz)
        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseFtrl(
                [param, nz, grad.indices, grad.values],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )
        else:
            net.Ftrl(
                [param, nz, grad],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )


class AdamOptimizer(Optimizer):
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 policy='fixed', sparse_dedup_aggregator=None,
                 engine='', **kwargs):
        super(AdamOptimizer, self).__init__()
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        lr, iteration = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        m1 = param_init_net.ConstantFill(
            [param],
            param + "_first_moment",
            value=0.0
        )
        m2 = param_init_net.ConstantFill(
            [param],
            param + "_second_moment",
            value=0.0
        )
        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(m1)
        self._aux_params.local.append(m2)
        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdam(
                [param, m1, m2, grad.indices, grad.values, lr, iteration],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon
            )
        else:
            net.Adam(
                [param, m1, m2, grad, lr, iteration],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon)

def build_sgd(model, base_learning_rate, **kwargs):
    sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        sgd_optimizer(model.net, model.param_init_net, param, grad)
    return sgd_optimizer


def build_ftrl(model, engine="SIMD", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator('Ftrl_ENGINE_SIMD')
        assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
    ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        ftrl_optimizer(model.net, model.param_init_net, param, grad)
    return ftrl_optimizer


def build_adagrad(model, base_learning_rate, parameters=None, **kwargs):
    adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
    param_to_grad = model.GetOptimizationPairs(parameters)

    for param, grad in param_to_grad.items():
        adagrad_optimizer(model.net, model.param_init_net, param, grad)
    return adagrad_optimizer


def build_adam(model, base_learning_rate, **kwargs):
    adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        adam_optimizer(model.net, model.param_init_net, param, grad)
    return adam_optimizer
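
Usage sketch (not part of the original file): a minimal example of attaching one of the builders above to a network. The layer layout, the blob names "data" and "label", and the hyperparameter values are hypothetical, and the sketch assumes a model object that exposes the GetOptimizationPairs() method the builders call.

from caffe2.python import brew, model_helper
from caffe2.python.optimizer import build_sgd

# Hypothetical two-layer classifier; "data" and "label" are assumed inputs.
model = model_helper.ModelHelper(name="example")
hidden = brew.fc(model, "data", "hidden", dim_in=16, dim_out=8)
pred = brew.fc(model, hidden, "pred", dim_in=8, dim_out=2)
softmax, loss = model.SoftmaxWithLoss([pred, "label"], ["softmax", "loss"])
model.AddGradientOperators([loss])

# Plain SGD with momentum and a step learning-rate policy; extra keyword
# arguments (stepsize, gamma) are forwarded to the LearningRate operator.
sgd = build_sgd(model, base_learning_rate=0.1, momentum=0.9,
                policy="step", stepsize=100, gamma=0.9)

# Auxiliary blobs created by the optimizer (per-parameter momentum buffers
# plus the shared "ONE" constant) are exposed via get_auxiliary_parameters().
print(sgd.get_auxiliary_parameters().local)
print(sgd.get_auxiliary_parameters().shared)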