Caffe2 - Python API
A deep learning, cross-platform ML framework
train.py
## @package train
# Module caffe2.experiments.python.train
"""
Benchmark for an ads-based model.

To run a benchmark with full forward-backward-update, do e.g.

OMP_NUM_THREADS=8 _build/opt/caffe2/caffe2/fb/ads/train_cpu.lpar \
    --batchSize 100 \
    --hidden 128-64-32 \
    --loaderConfig /mnt/vol/gfsdataswarm-global/namespaces/ads/fblearner/users/ \
        dzhulgakov/caffe2/tests/test_direct_loader.config

For more details, run with --help.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# TODO(jiayq): breaks Caffe2, need to investigate
# from __future__ import unicode_literals

from caffe2.python import workspace, cnn, core
from caffe2.python import dyndep  # needed for InitOpsLibrary below
from caffe2.python.fb.models.mlp import (
    mlp,
    mlp_decomp,
    mlp_prune,
    sparse_mlp,
    debug_sparse_mlp,
    debug_sparse_mlp_decomposition,
    debug_sparse_mlp_prune,
)
from caffe2.python.fb.models.loss import BatchLRLoss
from caffe2.python.fb.metrics.metrics import LogScoreReweightedMeasurements
from caffe2.python.fb.executor.executor import Trainer
from caffe2.python.sgd import build_sgd
from caffe2.python import net_drawer
from caffe2.python import SparseTransformer

from collections import namedtuple
import os
import sys
import json
import subprocess
import logging

import numpy as np
from libfb import pyinit
import hiveio.par_init
import fblearner.nn.gen_conf as conf_utils

dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/data:reading_ops')

hiveio.par_init.install_class_path()

for h in logging.root.handlers:
    h.setFormatter(logging.Formatter(
        '%(levelname)s %(asctime)s : %(message)s'))
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

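# One batch of input, as produced by FakeData and NNLoaderData below:
# 'data', 'label', 'weight' and 'prod_pred' hold blob names (prod_pred may be
# None), and 'sparse_segments' is a list of dicts mapping 'eid'/'key'/'val'
# to blob names plus a 'size' for each sparse segment.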
InputData = namedtuple(
    'InputData', ['data', 'label', 'weight', 'prod_pred', 'sparse_segments'])


def FakeData(args, model):
    logger.info('Input dimension is %d', args.input_dim)

    workspace.FeedBlob('data', np.random.normal(
        size=(args.batchSize, args.input_dim)).astype(np.float32))
    workspace.FeedBlob('label', np.random.randint(
        2, size=args.batchSize).astype(np.int32))
    workspace.FeedBlob('weight', np.ones(args.batchSize).astype(np.float32))

    sparseBin = 100
    workspace.FeedBlob('eid', np.arange(args.batchSize).astype(np.int32))
    workspace.FeedBlob('key', np.random.randint(
        0, sparseBin, args.batchSize).astype(np.int64))
    workspace.FeedBlob('val', np.ones(args.batchSize).astype(np.float32))

    sparseSegments = [
        {
            'size': sparseBin,
            'eid': 'eid',
            'key': 'key',
            'val': 'val',
        },
    ]

    return InputData(data='data', label='label', weight='weight',
                     prod_pred=None, sparse_segments=sparseSegments)


def NNLoaderData(args, model):
    cfg = conf_utils.loadNNLoaderConfig(args.loaderConfig)
    loaderConfig = conf_utils.getLoaderConfigFromNNLoaderConfig(cfg)
    preperConfig = loaderConfig.preperConfig
    metaFile = preperConfig.metaFile
    assert metaFile, 'meta data not found'

    if type(loaderConfig).__name__ == 'LocalDirectLoader':
        loaderConfig.batchConfig.batchSize = args.batchSize
        logger.info('Batch size = %d', loaderConfig.batchConfig.batchSize)
    else:
        logger.info('Batch size unknown here; it will be determined '
                    'by the reader')

    logger.info('Parsing meta data %s', metaFile)
    cmd = 'cat "{}" | {}'.format(metaFile, args.meta2json)
    meta = json.loads(subprocess.check_output(cmd, shell=True))
    args.input_dim = len(meta['denseFeatureNames'])
    logger.info('Input dimension is %d', args.input_dim)

    fields = ['data', 'label', 'weight', 'prod_pred']

    sparseSegments = []
    if preperConfig.skipSparse or not preperConfig.sparseSegments.segments:
        logger.info('No sparse features found')
    else:
        segments = loaderConfig.preperConfig.sparseSegments.segments
        logger.info('Found %d sparse segments', len(segments))

        sparseFieldNames = ('eid', 'key', 'val', 'size')
        for i, segment in enumerate(segments):
            sparseData = ['{}_{}'.format(fn, i) for fn in sparseFieldNames[:3]]
            fields.extend(sparseData)

            size = max(sf.mod + sf.offset for sf in segment.inputs)
            sparseSegments.append(
                dict(zip(sparseFieldNames, sparseData + [size])))
            logger.info('Sparse segment %d: %s', i, sparseSegments[-1])

    loader = model.param_init_net.NNLoaderCreate(
        [], json_config=conf_utils.structToString(cfg))

    model.net.NNLoaderRead([loader], fields, add_sparse_bias=True)

    return InputData(*(fields[:4] + [sparseSegments]))


def sparse_transform(model):
    print("====================================================")
    print("                 Sparse Transformer                 ")
    print("====================================================")
    # Rebuild the net as a graph, run the Prune2Sparse pass over its
    # operators, and replace the net's op list with the transformed version.
    net_root, net_name2id, net_id2node = SparseTransformer.netbuilder(model)
    SparseTransformer.Prune2Sparse(
        net_root, net_id2node, net_name2id, model.net.Proto().op, model)
    op_list = SparseTransformer.net2list(net_root)
    del model.net.Proto().op[:]
    model.net.Proto().op.extend(op_list)


def train(model_gen, data_gen, args):
    model = cnn.CNNModelHelper("NCHW", name="mlp")
    input_data = data_gen(args, model)
    logger.info(input_data)
    batch_loss = model_gen(args, model, input_data)

    try:
        print(model.net.Proto())
        graph = net_drawer.GetPydotGraph(model.net.Proto().op, 'net', 'TB')
        netGraphFile = os.path.join(
            os.path.expanduser('~'), 'public_html/net.png')
        logger.info('Drawing network to %s', netGraphFile)
        graph.write(netGraphFile, format='png')
    except Exception as err:
        logger.error('Failed to draw net: %s', err)

    # Add gradients
    model.AddGradientOperators([batch_loss.loss])

    # The input data itself does not need a gradient; drop it if generated.
    if model.net.Proto().op[-1].output[-1] == 'data_grad':
        logger.info('Skipping grad for data')
        del model.net.Proto().op[-1].output[-1]

    # SGD with the "inv" learning rate policy:
    # lr = rateOfLearning / (1 + t * learnRateDecay) ** learnRatePower,
    # as described in the argument help strings below.
    build_sgd(model,
              base_learning_rate=args.rateOfLearning,
              policy="inv",
              gamma=args.learnRateDecay,
              power=args.learnRatePower)

    if args.seed:
        logger.info('Setting random seed to %d', args.seed)
        model.param_init_net._net.device_option.CopyFrom(
            core.DeviceOption(0, 0, random_seed=args.seed))

    if args.gpu:
        model.param_init_net.RunAllOnGPU()
        model.net.RunAllOnGPU()

    if args.net_type:
        model.net.Proto().type = args.net_type
        model.net.Proto().num_workers = args.num_workers

    trainer = Trainer(
        model,
        epoch_size=args.epochSize // args.batchSize,
        num_threads=args.numThreads,
        num_epochs=args.maxEpoch,
        reporter=LogScoreReweightedMeasurements(
            batch_loss, input_data.weight, args.negDownsampleRate,
            args.batchSize, args.last_n_stats))
    trainer.run(args.maxEpoch)

    # After training, apply the sparse transform and run the transformed net
    # once.
    print(model.net.Proto())
    sparse_transform(model)
    print(model.net.Proto())
    workspace.RunNetOnce(model.net)


def mlp_model(args, model, input_data):
    # Hidden layer sizes come from --hidden; the trailing 2 appends the
    # two-unit output layer consumed by BatchLRLoss for the binary label.
    hiddens = [int(s) for s in args.hidden.split('-')] + [2]
    sums = mlp(model, input_data.data, args.input_dim, hiddens)
    return BatchLRLoss(model, sums, input_data.label)


def mlp_decomp_model(args, model, input_data):
    hiddens = [int(s) for s in args.hidden.split('-')] + [2]
    sums = mlp_decomp(model, input_data.data, args.input_dim, hiddens)
    return BatchLRLoss(model, sums, input_data.label)


def mlp_prune_model(args, model, input_data):
    hiddens = [int(s) for s in args.hidden.split('-')] + [2]
    sums = mlp_prune(model, input_data.data, args.input_dim,
                     hiddens, prune_thres=args.prune_thres,
                     comp_lb=args.compress_lb)
    return BatchLRLoss(model, sums, input_data.label)


def sparse_mlp_model(args, model, input_data):
    hiddens = [int(s) for s in args.hidden.split('-')] + [2]
    sums = sparse_mlp(model, input_data.data, args.input_dim, hiddens,
                      input_data.sparse_segments)
    return BatchLRLoss(model, sums, input_data.label)


def debug_sparse_mlp_model(args, model, input_data):
    hiddens = [int(s) for s in args.hidden.split('-')] + [2]
    sums = debug_sparse_mlp(model, input_data.data, args.input_dim, hiddens,
                            input_data.sparse_segments)
    return BatchLRLoss(model, sums, input_data.label)


def debug_sparse_mlp_decomposition_model(args, model, input_data):
    hiddens = [int(s) for s in args.hidden.split('-')] + [2]
    sums = debug_sparse_mlp_decomposition(model, input_data.data,
                                          args.input_dim, hiddens,
                                          input_data.sparse_segments)
    return BatchLRLoss(model, sums, input_data.label)


def debug_sparse_mlp_prune_model(args, model, input_data):
    hiddens = [int(s) for s in args.hidden.split('-')] + [2]
    sums = debug_sparse_mlp_prune(model, input_data.data, args.input_dim,
                                  hiddens,
                                  input_data.sparse_segments)
    return BatchLRLoss(model, sums, input_data.label)


MODEL_TYPE_FUNCTIONS = {
    'mlp': mlp_model,
    'mlp_decomp': mlp_decomp_model,
    'mlp_prune': mlp_prune_model,
    'sparse_mlp': sparse_mlp_model,
    'debug_sparse_mlp': debug_sparse_mlp_model,
    'debug_sparse_mlp_decomposition': debug_sparse_mlp_decomposition_model,
    'debug_sparse_mlp_prune': debug_sparse_mlp_prune_model,
    # Add more model_type functions here.
}
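# A new -model_type is registered by adding an entry above. The following is
# only a sketch of the expected shape: 'my_mlp' and my_mlp_model are
# hypothetical names that do not exist in this file; every entry takes
# (args, model, input_data) and returns a BatchLRLoss, like the functions
# above.
#
#   def my_mlp_model(args, model, input_data):
#       hiddens = [int(s) for s in args.hidden.split('-')] + [2]
#       sums = mlp(model, input_data.data, args.input_dim, hiddens)
#       return BatchLRLoss(model, sums, input_data.label)
#
#   MODEL_TYPE_FUNCTIONS['my_mlp'] = my_mlp_model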


if __name__ == '__main__':
    # It's hard to init flags correctly, so append this one here.
    sys.argv.append('--caffe2_keep_on_shrink')

    # FbcodeArgumentParser calls initFacebook, which is necessary for NNLoader
    # initialization.
    parser = pyinit.FbcodeArgumentParser(description='Ads NN trainer')

    # Arguments starting with a single '-' are compatible with the Lua trainer.
    parser.add_argument("-batchSize", type=int, default=100,
                        help="The batch size of benchmark data.")
    parser.add_argument("-loaderConfig", type=str,
                        help="JSON file with NNLoader's config. If empty, "
                        "fake data is used.")
    parser.add_argument("-meta", type=str, help="Meta file (deprecated)")
    parser.add_argument("-hidden", type=str,
                        help="A dash-separated string specifying the "
                        "model dimensions without the output layer.")
    parser.add_argument("-epochSize", type=int, default=1000000,
                        help="Examples to process in one take.")
    parser.add_argument("-maxEpoch", type=int,
                        help="Limit on the number of epochs; if empty, "
                        "reads all data.")
    parser.add_argument("-negDownsampleRate", type=float, default=0.1,
                        help="Used to compute the bias term.")
    parser.add_argument("-rateOfLearning", type=float, default=0.02,
                        help="Learning rate, `lr/(1+t*d)^p`.")
    parser.add_argument("-learnRateDecay", type=float, default=1e-06,
                        help="d in `lr/(1+t*d)^p`.")
    parser.add_argument("-learnRatePower", type=float, default=0.5,
                        help="p in `lr/(1+t*d)^p`.")
    parser.add_argument("-numThreads", type=int,
                        help="If set, runs hogwild.")
    parser.add_argument("-model_type", type=str, default='mlp',
                        choices=MODEL_TYPE_FUNCTIONS.keys(),
                        help="The model to benchmark.")
    parser.add_argument("-seed", type=int, help="Random seed.")

    # Arguments for Lua compatibility which are not implemented yet.
    parser.add_argument("-output", help="not implemented")

    # Arguments starting with a double '--' are additions of this script.
    parser.add_argument("--input_dim", type=int, default=1500,
                        help="The input dimension of benchmark data.")
    parser.add_argument("--gpu", action="store_true",
                        help="If set, run on GPU.")
    parser.add_argument("--net_type", type=str,
                        help="Set the type of the network to run with.")
    parser.add_argument("--num_workers", type=int, default=4,
                        help="The number of workers, if the net type has "
                        "multiple workers.")
    parser.add_argument("--last_n_stats", type=int, default=0,
                        help="LastN reporting; big values can slow things "
                        "down.")
    parser.add_argument("--meta2json",
                        default='_bin/fblearner/nn/ads/meta2json.llar',
                        help="Path to the meta2json binary.")
    parser.add_argument("--prune_thres", type=float, default=0.00001,
                        help="The threshold for pruning the weights.")
    parser.add_argument("--compress_lb", type=float, default=0.05,
                        help="The lower bound of layer compression.")
    args = parser.parse_args()

    workspace.GlobalInit(['caffe2', '--caffe2_log_level=-1'])
    data_gen = NNLoaderData if args.loaderConfig else FakeData
    train(MODEL_TYPE_FUNCTIONS[args.model_type], data_gen, args)