from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from collections import namedtuple
from caffe2.python import core
from caffe2.proto import caffe2_pb2

_OPTIMIZER_ITERATION_NAME = "optimizer_iteration"

AuxParams = namedtuple("AuxParams", ["local", "shared"])


class Optimizer(object):
    def __call__(self, net, param_init_net, param, grad):
        raise NotImplementedError()

    @staticmethod
    def build_lr(net, param_init_net, base_learning_rate,
                 learning_rate_blob="lr", policy="fixed",
                 iter_val=0, **kwargs):
        if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
            # The iteration counter, its mutex and AtomicIter live on CPU.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                iteration = param_init_net.ConstantFill(
                    [], _OPTIMIZER_ITERATION_NAME, shape=[1],
                    value=iter_val,
                    dtype=core.DataType.INT32)
                iter_mutex = param_init_net.CreateMutex(
                    [], ["iteration_mutex"])
                net.AtomicIter([iter_mutex, iteration], [iteration])
        else:
            iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)

        # base_lr is negated because the update operators below add lr * grad
        # to the parameter and we are minimizing.
        lr = net.LearningRate(
            [iteration],
            learning_rate_blob,
            base_lr=-base_learning_rate,
            policy=policy,
            **kwargs
        )
        return lr, iteration
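
    # Hedged example (not in the original file): extra kwargs are forwarded to
    # the LearningRate op, so a decaying schedule can be requested as e.g.
    #   lr, _ = Optimizer.build_lr(net, param_init_net, 0.1,
    #                              policy="step", stepsize=10000, gamma=0.5)
    # (this assumes the "step" policy of Caffe2's LearningRate operator).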

    @staticmethod
    def dedup(net, sparse_dedup_aggregator, grad):
        assert isinstance(grad, core.GradientSlice)
        if sparse_dedup_aggregator:
            return net.DeduplicateGradientSlices(
                grad, aggregator=sparse_dedup_aggregator)
        else:
            return grad
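
    # Hedged note (not in the original file): grad here is a
    # core.GradientSlice (indices + values). With an aggregator such as "sum",
    # duplicate indices within a minibatch are merged before the sparse update
    # operators see them; with no aggregator the slice passes through as-is.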
60 """Returns a list of auxiliary parameters. 63 aux_params: A namedtuple, AuxParams. 65 aux_params.local stores a list of blobs. Each blob is a local 66 auxiliary parameter. A local auxiliary parameter is a parameter in 67 parallel to a learning rate parameter. Take adagrad as an example, 68 the local auxiliary parameter is the squared sum parameter, because 69 every learning rate has a squared sum associated with it. 71 aux_params.shared also stores a list of blobs. Each blob is a shared 72 auxiliary parameter. A shared auxiliary parameter is a parameter 73 that is shared across all the learning rate parameters. Take adam as 74 an example, the iteration parameter is a shared parameter, because 75 all the learning rates share the same iteration parameter. 81 def __init__(self, base_learning_rate=0.01, policy='fixed',
82 momentum=0.0, **kwargs):
83 super(SgdOptimizer, self).__init__()

    def __call__(self, net, param_init_net, param, grad):
        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.base_learning_rate,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy, **(self.init_kwargs))

        ONE = param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
        if self.momentum > 0:
            momentum_data = param_init_net.ConstantFill(
                param, str(param) + "_momentum", value=0.)

        if isinstance(grad, core.GradientSlice):
            assert self.momentum == 0., "Doesn't support momentum for sparse"
            net.ScatterWeightedSum(
                [param, ONE, grad.indices, grad.values, lr],
                param)
        else:
            if self.momentum > 0.:
                # MomentumSGD turns grad (in place) into the lr-scaled,
                # momentum-adjusted update, so the WeightedSum coefficient
                # becomes ONE instead of lr.
                net.MomentumSGD(
                    [grad, momentum_data, lr], [grad, momentum_data],
                    momentum=self.momentum)
                coeff = ONE
            else:
                coeff = lr
            net.WeightedSum(
                [param, ONE, grad, coeff],
                param)
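

# Hedged usage sketch (not part of the original module): applying SgdOptimizer
# directly to one parameter. "w_grad" is a hypothetical gradient blob that
# would normally be produced by net.AddGradientOperators(...).
def _example_sgd_usage():
    param_init_net = core.Net("example_init")
    net = core.Net("example_train")
    w = param_init_net.XavierFill([], "w", shape=[16, 8])
    w_grad = core.BlobReference("w_grad")
    SgdOptimizer(base_learning_rate=0.1, momentum=0.9)(
        net, param_init_net, w, w_grad)
    return net, param_init_net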


class AdagradOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, policy="fixed",
                 sparse_dedup_aggregator=None, engine='', **kwargs):
        super(AdagradOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy, **(self.init_kwargs))

        param_squared_sum = param_init_net.ConstantFill(
            [param],
            str(param) + "_squared_sum",
            value=0.0)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdagrad(
                [param, param_squared_sum, grad.indices, grad.values, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon, engine=self.engine)
        else:
            net.Adagrad(
                [param, param_squared_sum, grad, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon, engine=self.engine)
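

# Hedged reference sketch (not part of the original module): the element-wise
# rule that the Adagrad/SparseAdagrad operators above implement, in NumPy.
# Note that the lr produced by build_lr is already negated, so "+" descends.
def _adagrad_reference_step(param, squared_sum, grad, lr, epsilon=1e-4):
    import numpy as np
    squared_sum = squared_sum + np.square(grad)
    param = param + lr * grad / (np.sqrt(squared_sum) + epsilon)
    return param, squared_sum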


class FtrlOptimizer(Optimizer):
    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        super(FtrlOptimizer, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine

    def __call__(self, net, param_init_net, param, grad):
        # nz stores the FTRL n and z accumulators, stacked along an extra
        # trailing dimension of size 2.
        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_ftrl_nz",
            extra_shape=[2], value=0.0)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseFtrl(
                [param, nz, grad.indices, grad.values],
                [param, nz],
                engine=self.engine, alpha=self.alpha, beta=self.beta,
                lambda1=self.lambda1, lambda2=self.lambda2)
        else:
            net.Ftrl(
                [param, nz, grad],
                [param, nz],
                engine=self.engine, alpha=self.alpha, beta=self.beta,
                lambda1=self.lambda1, lambda2=self.lambda2)
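

# Hedged usage sketch (not part of the original module): FTRL with a sparse
# gradient. The indices/values blobs are hypothetical stand-ins for the
# GradientSlice that a sparse lookup layer's gradient would produce.
def _example_ftrl_sparse_usage():
    param_init_net = core.Net("ftrl_init")
    net = core.Net("ftrl_train")
    emb = param_init_net.XavierFill([], "emb", shape=[1000, 16])
    grad = core.GradientSlice(
        indices=core.BlobReference("emb_grad_indices"),
        values=core.BlobReference("emb_grad_values"))
    FtrlOptimizer(alpha=0.05, lambda1=0.001)(net, param_init_net, emb, grad)
    return net, param_init_net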


class AdamOptimizer(Optimizer):
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 policy='fixed', sparse_dedup_aggregator=None,
                 engine='', **kwargs):
        super(AdamOptimizer, self).__init__()
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        lr, iteration = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy, **(self.init_kwargs))

        m1 = param_init_net.ConstantFill(
            [param],
            param + "_first_moment",
            value=0.0)
        m2 = param_init_net.ConstantFill(
            [param],
            param + "_second_moment",
            value=0.0)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdam(
                [param, m1, m2, grad.indices, grad.values, lr, iteration],
                [param, m1, m2],
                beta1=self.beta1, beta2=self.beta2, epsilon=self.epsilon)
        else:
            net.Adam(
                [param, m1, m2, grad, lr, iteration],
                [param, m1, m2],
                beta1=self.beta1, beta2=self.beta2, epsilon=self.epsilon)
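

# Hedged reference sketch (not part of the original module): the textbook Adam
# update the operators above are named after, in NumPy. As in build_lr, lr is
# the negated learning rate, so "+" descends; the exact Caffe2 operator may
# differ in details such as how `iteration` enters the bias correction.
def _adam_reference_step(param, m1, m2, grad, lr, t,
                         beta1=0.9, beta2=0.999, epsilon=1e-8):
    import numpy as np
    m1 = beta1 * m1 + (1.0 - beta1) * grad
    m2 = beta2 * m2 + (1.0 - beta2) * np.square(grad)
    m1_hat = m1 / (1.0 - beta1 ** t)
    m2_hat = m2 / (1.0 - beta2 ** t)
    param = param + lr * m1_hat / (np.sqrt(m2_hat) + epsilon)
    return param, m1, m2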


def build_sgd(model, base_learning_rate, **kwargs):
    sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        sgd_optimizer(model.net, model.param_init_net, param, grad)
    return sgd_optimizer


def build_ftrl(model, engine="SIMD", **kwargs):
    ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        ftrl_optimizer(model.net, model.param_init_net, param, grad)
    return ftrl_optimizer


def build_adagrad(model, base_learning_rate, parameters=None, **kwargs):
    adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
    param_to_grad = model.GetOptimizationPairs(parameters)
    for param, grad in param_to_grad.items():
        adagrad_optimizer(model.net, model.param_init_net, param, grad)
    return adagrad_optimizer


def build_adam(model, base_learning_rate, **kwargs):
    adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        adam_optimizer(model.net, model.param_init_net, param, grad)
    return adam_optimizer
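

# Hedged end-to-end sketch (not part of the original module): the build_*
# helpers accept any model-helper-style object that exposes net,
# param_init_net and GetOptimizationPairs(); the toy stand-in below makes
# that contract explicit without depending on a particular model helper.
def _example_build_sgd_usage():
    class _ToyModel(object):
        def __init__(self):
            self.param_init_net = core.Net("toy_init")
            self.net = core.Net("toy_train")
            w = self.param_init_net.XavierFill([], "w", shape=[4, 4])
            # "w_grad" is a hypothetical gradient blob; a real model helper
            # would collect these pairs from AddGradientOperators.
            self._pairs = {w: core.BlobReference("w_grad")}

        def GetOptimizationPairs(self, parameters=None):
            return self._pairs

    return build_sgd(_ToyModel(), base_learning_rate=0.1, momentum=0.9)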