Caffe2 - Python API
A deep learning, cross platform ML framework
seq2seq.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import collections
import logging
import math
import numpy as np
import random
import time
import sys

from itertools import izip

import caffe2.proto.caffe2_pb2 as caffe2_pb2
from caffe2.python import core, workspace, rnn_cell, data_parallel_model
from caffe2.python.examples import seq2seq_util

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

Batch = collections.namedtuple('Batch', [
    'encoder_inputs',
    'encoder_lengths',
    'decoder_inputs',
    'decoder_lengths',
    'targets',
    'target_weights',
])

_PAD_ID = 0
_GO_ID = 1
_EOS_ID = 2
EOS = '<EOS>'
UNK = '<UNK>'
GO = '<GO>'
PAD = '<PAD>'


def prepare_batch(batch):
    encoder_lengths = [len(entry[0]) for entry in batch]
    max_encoder_length = max(encoder_lengths)
    decoder_lengths = []
    max_decoder_length = max([len(entry[1]) for entry in batch])

    batch_encoder_inputs = []
    batch_decoder_inputs = []
    batch_targets = []
    batch_target_weights = []

    for source_seq, target_seq in batch:
        encoder_pads = (
            [_PAD_ID] * (max_encoder_length - len(source_seq))
        )
        batch_encoder_inputs.append(
            list(reversed(source_seq)) + encoder_pads
        )

        decoder_pads = (
            [_PAD_ID] * (max_decoder_length - len(target_seq))
        )
        target_seq_with_go_token = [_GO_ID] + target_seq
        decoder_lengths.append(len(target_seq_with_go_token))
        batch_decoder_inputs.append(target_seq_with_go_token + decoder_pads)

        target_seq_with_eos = target_seq + [_EOS_ID]
        targets = target_seq_with_eos + decoder_pads
        batch_targets.append(targets)

        if len(source_seq) + len(target_seq) == 0:
            target_weights = [0] * len(targets)
        else:
            target_weights = [
                1 if target != _PAD_ID else 0
                for target in targets
            ]
        batch_target_weights.append(target_weights)

    return Batch(
        encoder_inputs=np.array(
            batch_encoder_inputs,
            dtype=np.int32,
        ).transpose(),
        encoder_lengths=np.array(encoder_lengths, dtype=np.int32),
        decoder_inputs=np.array(
            batch_decoder_inputs,
            dtype=np.int32,
        ).transpose(),
        decoder_lengths=np.array(decoder_lengths, dtype=np.int32),
        targets=np.array(
            batch_targets,
            dtype=np.int32,
        ).transpose(),
        target_weights=np.array(
            batch_target_weights,
            dtype=np.float32,
        ).transpose(),
    )
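
# Illustrative example (hypothetical data, worked by hand): for
# batch = [([1, 2], [3]), ([4], [5, 6])], prepare_batch reverses and pads
# the source sequences, prepends GO / appends EOS on the target side, and
# returns time-major (seq_len x batch_size) arrays:
#   encoder_inputs  = [[2, 4], [1, 0]]
#   encoder_lengths = [2, 1]
#   decoder_inputs  = [[1, 1], [3, 5], [0, 6]]   (_GO_ID = 1, _PAD_ID = 0)
#   decoder_lengths = [2, 3]
#   targets         = [[3, 5], [2, 6], [0, 2]]   (_EOS_ID = 2)
#   target_weights  = [[1, 1], [1, 1], [0, 1]]   (padding positions get 0)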


class Seq2SeqModelCaffe2(object):

    def _build_model(
        self,
        init_params,
    ):
        model = seq2seq_util.ModelHelper(init_params=init_params)
        self._build_shared(model)
        self._build_embeddings(model)

        forward_model = seq2seq_util.ModelHelper(init_params=init_params)
        self._build_shared(forward_model)
        self._build_embeddings(forward_model)

        if self.num_gpus == 0:
            loss_blobs = self.model_build_fun(model)
            model.AddGradientOperators(loss_blobs)
            self.norm_clipped_grad_update(
                model,
                scope='norm_clipped_grad_update'
            )
            self.forward_model_build_fun(forward_model)

        else:
            assert (self.batch_size % self.num_gpus) == 0

            data_parallel_model.Parallelize_GPU(
                forward_model,
                input_builder_fun=lambda m: None,
                forward_pass_builder_fun=self.forward_model_build_fun,
                param_update_builder_fun=None,
                devices=range(self.num_gpus),
            )

            def clipped_grad_update_bound(model):
                self.norm_clipped_grad_update(
                    model,
                    scope='norm_clipped_grad_update',
                )

            data_parallel_model.Parallelize_GPU(
                model,
                input_builder_fun=lambda m: None,
                forward_pass_builder_fun=self.model_build_fun,
                param_update_builder_fun=clipped_grad_update_bound,
                devices=range(self.num_gpus),
            )
        self.norm_clipped_sparse_grad_update(
            model,
            scope='norm_clipped_sparse_grad_update',
        )
        self.model = model
        self.forward_net = forward_model.net

    def _build_embedding_encoder(
        self,
        model,
        inputs,
        input_lengths,
        vocab_size,
        embeddings,
        embedding_size,
        use_attention,
        num_gpus,
        forward_only=False,
    ):
        if num_gpus == 0:
            embedded_encoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_encoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_encoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_encoder_inputs_cpu'],
                )
            embedded_encoder_inputs = model.CopyCPUToGPU(
                embedded_encoder_inputs_cpu,
                'embedded_encoder_inputs',
            )

        if self.encoder_type == 'rnn':
            assert len(self.encoder_params['encoder_layer_configs']) == 1
            encoder_num_units = (
                self.encoder_params['encoder_layer_configs'][0]['num_units']
            )
            encoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                ['encoder_initial_cell_state'],
                shape=[encoder_num_units],
                value=0.0,
            )
            encoder_initial_hidden_state = (
                model.param_init_net.ConstantFill(
                    [],
                    'encoder_initial_hidden_state',
                    shape=[encoder_num_units],
                    value=0.0,
                )
            )
            # Choose corresponding rnn encoder function
            if self.encoder_params['use_bidirectional_encoder']:
                rnn_encoder_func = seq2seq_util.rnn_bidirectional_encoder
                encoder_output_dim = 2 * encoder_num_units
            else:
                rnn_encoder_func = seq2seq_util.rnn_unidirectional_encoder
                encoder_output_dim = encoder_num_units

            (
                encoder_outputs,
                final_encoder_hidden_state,
                final_encoder_cell_state,
            ) = rnn_encoder_func(
                model,
                embedded_encoder_inputs,
                input_lengths,
                encoder_initial_hidden_state,
                encoder_initial_cell_state,
                embedding_size,
                encoder_num_units,
                use_attention,
            )
            weighted_encoder_outputs = None
        else:
            raise ValueError('Unsupported encoder type {}'.format(
                self.encoder_type))

        return (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
            encoder_output_dim,
        )

    def output_projection(
        self,
        model,
        decoder_outputs,
        decoder_output_size,
        target_vocab_size,
        decoder_softmax_size,
    ):
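        # Projects the flattened decoder outputs, shaped
        # (seq_len * batch_size) x decoder_output_size, to
        # (seq_len * batch_size) x target_vocab_size logits; an optional
        # intermediate FC first shrinks the outputs to decoder_softmax_size.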
        if decoder_softmax_size is not None:
            decoder_outputs = model.FC(
                decoder_outputs,
                'decoder_outputs_scaled',
                dim_in=decoder_output_size,
                dim_out=decoder_softmax_size,
            )
            decoder_output_size = decoder_softmax_size

        output_projection_w = model.param_init_net.XavierFill(
            [],
            'output_projection_w',
            shape=[self.target_vocab_size, decoder_output_size],
        )

        output_projection_b = model.param_init_net.XavierFill(
            [],
            'output_projection_b',
            shape=[self.target_vocab_size],
        )
        model.params.extend([
            output_projection_w,
            output_projection_b,
        ])
        output_logits = model.net.FC(
            [
                decoder_outputs,
                output_projection_w,
                output_projection_b,
            ],
            ['output_logits'],
        )
        return output_logits

    def _build_shared(self, model):
        optimizer_params = self.model_params['optimizer_params']
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.learning_rate = model.AddParam(
                name='learning_rate',
                init_value=float(optimizer_params['learning_rate']),
                trainable=False,
            )
            self.global_step = model.AddParam(
                name='global_step',
                init_value=0,
                trainable=False,
            )
            self.start_time = model.AddParam(
                name='start_time',
                init_value=time.time(),
                trainable=False,
            )

    def _build_embeddings(self, model):
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            sqrt3 = math.sqrt(3)
            self.encoder_embeddings = model.param_init_net.UniformFill(
                [],
                'encoder_embeddings',
                shape=[
                    self.source_vocab_size,
                    self.model_params['encoder_embedding_size'],
                ],
                min=-sqrt3,
                max=sqrt3,
            )
            model.params.append(self.encoder_embeddings)
            self.decoder_embeddings = model.param_init_net.UniformFill(
                [],
                'decoder_embeddings',
                shape=[
                    self.target_vocab_size,
                    self.model_params['decoder_embedding_size'],
                ],
                min=-sqrt3,
                max=sqrt3,
            )
            model.params.append(self.decoder_embeddings)

    def model_build_fun(self, model, forward_only=False, loss_scale=None):
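        # Builds the full training graph: embeds source/target tokens, runs
        # the encoder, an (optionally attention-augmented) LSTM decoder, the
        # output projection, and a weighted cross-entropy loss averaged over
        # the batch.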
        encoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_inputs',
        )
        encoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_lengths',
        )
        decoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_inputs',
        )
        decoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_lengths',
        )
        targets = model.net.AddExternalInput(
            workspace.GetNameScope() + 'targets',
        )
        target_weights = model.net.AddExternalInput(
            workspace.GetNameScope() + 'target_weights',
        )
        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular']

        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
            encoder_output_dim,
        ) = self._build_embedding_encoder(
            model=model,
            inputs=encoder_inputs,
            input_lengths=encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=self.encoder_embeddings,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
            num_gpus=self.num_gpus,
            forward_only=forward_only,
        )

        assert len(self.model_params['decoder_layer_configs']) == 1
        decoder_num_units = (
            self.model_params['decoder_layer_configs'][0]['num_units']
        )

        if attention_type == 'none':
            decoder_initial_hidden_state = model.FC(
                final_encoder_hidden_state,
                'decoder_initial_hidden_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
            decoder_initial_cell_state = model.FC(
                final_encoder_cell_state,
                'decoder_initial_cell_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
        else:
            decoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_hidden_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            decoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_cell_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            initial_attention_weighted_encoder_context = (
                model.param_init_net.ConstantFill(
                    [],
                    'initial_attention_weighted_encoder_context',
                    shape=[encoder_output_dim],
                    value=0.0,
                )
            )

        if self.num_gpus == 0:
            embedded_decoder_inputs = model.net.Gather(
                [self.decoder_embeddings, decoder_inputs],
                ['embedded_decoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_decoder_inputs_cpu = model.net.Gather(
                    [self.decoder_embeddings, decoder_inputs],
                    ['embedded_decoder_inputs_cpu'],
                )
            embedded_decoder_inputs = model.CopyCPUToGPU(
                embedded_decoder_inputs_cpu,
                'embedded_decoder_inputs',
            )

        # seq_len x batch_size x decoder_embedding_size
        if attention_type == 'none':
            decoder_outputs, _, _, _ = rnn_cell.LSTM(
                model=model,
                input_blob=embedded_decoder_inputs,
                seq_lengths=decoder_lengths,
                initial_states=(
                    decoder_initial_hidden_state,
                    decoder_initial_cell_state,
                ),
                dim_in=self.model_params['decoder_embedding_size'],
                dim_out=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0],
            )
            decoder_output_size = decoder_num_units
        else:
            (
                decoder_outputs, _, _, _,
                attention_weighted_encoder_contexts, _
            ) = rnn_cell.LSTMWithAttention(
                model=model,
                decoder_inputs=embedded_decoder_inputs,
                decoder_input_lengths=decoder_lengths,
                initial_decoder_hidden_state=decoder_initial_hidden_state,
                initial_decoder_cell_state=decoder_initial_cell_state,
                initial_attention_weighted_encoder_context=(
                    initial_attention_weighted_encoder_context
                ),
                encoder_output_dim=encoder_output_dim,
                encoder_outputs=encoder_outputs,
                decoder_input_dim=self.model_params['decoder_embedding_size'],
                decoder_state_dim=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0, 4],
            )
            decoder_outputs, _ = model.net.Concat(
                [decoder_outputs, attention_weighted_encoder_contexts],
                [
                    'states_and_context_combination',
                    '_states_and_context_combination_concat_dims',
                ],
                axis=2,
            )
            decoder_output_size = decoder_num_units + encoder_output_dim

        # we do softmax over the whole sequence
        # (max_length in the batch * batch_size) x decoder embedding size
        # -1 because we don't know max_length yet
        decoder_outputs_flattened, _ = model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_size],
        )
        output_logits = self.output_projection(
            model=model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        targets, _ = model.net.Reshape(
            [targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )
        output_probs = model.net.Softmax(
            [output_logits],
            ['output_probs'],
            engine=('CUDNN' if self.num_gpus > 0 else None),
        )
        label_cross_entropy = model.net.LabelCrossEntropy(
            [output_probs, targets],
            ['label_cross_entropy'],
        )
        weighted_label_cross_entropy = model.net.Mul(
            [label_cross_entropy, target_weights],
            'weighted_label_cross_entropy',
        )
        total_loss_scalar = model.net.SumElements(
            [weighted_label_cross_entropy],
            'total_loss_scalar',
        )
        total_loss_scalar_weighted = model.net.Scale(
            [total_loss_scalar],
            'total_loss_scalar_weighted',
            scale=1.0 / self.batch_size,
        )
        return [total_loss_scalar_weighted]

    def forward_model_build_fun(self, model, loss_scale=None):
        return self.model_build_fun(
            model=model,
            forward_only=True,
            loss_scale=loss_scale
        )

    def _calc_norm_ratio(self, model, params, scope, ONE):
        with core.NameScope(scope):
            grad_squared_sums = []
            for i, param in enumerate(params):
                logger.info(param)
                grad = (
                    model.param_to_grad[param]
                    if not isinstance(
                        model.param_to_grad[param],
                        core.GradientSlice,
                    ) else model.param_to_grad[param].values
                )
                grad_squared = model.net.Sqr(
                    [grad],
                    'grad_{}_squared'.format(i),
                )
                grad_squared_sum = model.net.SumElements(
                    grad_squared,
                    'grad_{}_squared_sum'.format(i),
                )
                grad_squared_sums.append(grad_squared_sum)

            grad_squared_full_sum = model.net.Sum(
                grad_squared_sums,
                'grad_squared_full_sum',
            )
            global_norm = model.net.Pow(
                grad_squared_full_sum,
                'global_norm',
                exponent=0.5,
            )
            clip_norm = model.param_init_net.ConstantFill(
                [],
                'clip_norm',
                shape=[],
                value=float(self.model_params['max_gradient_norm']),
            )
            max_norm = model.net.Max(
                [global_norm, clip_norm],
                'max_norm',
            )
            norm_ratio = model.net.Div(
                [clip_norm, max_norm],
                'norm_ratio',
            )
            return norm_ratio

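    # _calc_norm_ratio above implements clipping by global norm: with
    # g = ||all gradients||_2 and c = max_gradient_norm, the ratio
    # c / max(g, c) equals 1 when g <= c and rescales the update down to
    # norm c otherwise. _apply_norm_ratio folds it into the learning rate.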
    def _apply_norm_ratio(
        self, norm_ratio, model, params, learning_rate, scope, ONE
    ):
        for param in params:
            param_grad = model.param_to_grad[param]
            nlr = model.net.Negative(
                [learning_rate],
                'negative_learning_rate',
            )
            with core.NameScope(scope):
                update_coeff = model.net.Mul(
                    [nlr, norm_ratio],
                    'update_coeff',
                    broadcast=1,
                )
                if isinstance(param_grad, core.GradientSlice):
                    param_grad_values = param_grad.values

                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad_values,
                            update_coeff,
                        ],
                        param,
                    )
                else:
                    model.net.WeightedSum(
                        [
                            param,
                            ONE,
                            param_grad,
                            update_coeff,
                        ],
                        param,
                    )

    def norm_clipped_grad_update(self, model, scope):
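        # Updates dense parameters only; parameters with GradientSlice
        # (sparse) gradients, such as the embeddings, are handled in
        # norm_clipped_sparse_grad_update below.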

        if self.num_gpus == 0:
            learning_rate = self.learning_rate
        else:
            learning_rate = model.CopyCPUToGPU(self.learning_rate, 'LR')

        params = []
        for param in model.GetParams(top_scope=True):
            if param in model.param_to_grad:
                if not isinstance(
                    model.param_to_grad[param],
                    core.GradientSlice,
                ):
                    params.append(param)

        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('Dense trainable variables: ')
        norm_ratio = self._calc_norm_ratio(model, params, scope, ONE)
        self._apply_norm_ratio(
            norm_ratio, model, params, learning_rate, scope, ONE
        )

    def norm_clipped_sparse_grad_update(self, model, scope):
        learning_rate = self.learning_rate

        params = []
        for param in model.GetParams(top_scope=True):
            if param in model.param_to_grad:
                if isinstance(
                    model.param_to_grad[param],
                    core.GradientSlice,
                ):
                    params.append(param)

        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('Sparse trainable variables: ')
        norm_ratio = self._calc_norm_ratio(model, params, scope, ONE)
        self._apply_norm_ratio(
            norm_ratio, model, params, learning_rate, scope, ONE
        )

    def total_loss_scalar(self):
        if self.num_gpus == 0:
            return workspace.FetchBlob('total_loss_scalar')
        else:
            total_loss = 0
            for i in range(self.num_gpus):
                name = 'gpu_{}/total_loss_scalar'.format(i)
                gpu_loss = workspace.FetchBlob(name)
                total_loss += gpu_loss
            return total_loss

    def _init_model(self):
        workspace.RunNetOnce(self.model.param_init_net)

        def create_net(net):
            workspace.CreateNet(
                net,
                input_blobs=map(str, net.external_inputs),
            )

        create_net(self.model.net)
        create_net(self.forward_net)

    def __init__(
        self,
        model_params,
        source_vocab_size,
        target_vocab_size,
        num_gpus=1,
        num_cpus=1,
    ):
        self.model_params = model_params
        self.encoder_type = 'rnn'
        self.encoder_params = model_params['encoder_type']
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.num_gpus = num_gpus
        self.num_cpus = num_cpus
        self.batch_size = model_params['batch_size']

        workspace.GlobalInit([
            'caffe2',
            # NOTE: modify log level for debugging purposes
            '--caffe2_log_level=0',
            # NOTE: modify log level for debugging purposes
            '--v=0',
            # Fail gracefully if one of the threads fails
            '--caffe2_handle_executor_threads_exceptions=1',
            '--caffe2_mkl_num_threads=' + str(self.num_cpus),
        ])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        workspace.ResetWorkspace()

    def initialize_from_scratch(self):
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Start')
        self._build_model(init_params=True)
        self._init_model()
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Finish')

    def get_current_step(self):
        return workspace.FetchBlob(self.global_step)[0]

    def inc_current_step(self):
        workspace.FeedBlob(
            self.global_step,
            np.array([self.get_current_step() + 1]),
        )

    def step(
        self,
        batch,
        forward_only
    ):
        if self.num_gpus < 1:
            batch_obj = prepare_batch(batch)
            for batch_obj_name, batch_obj_value in izip(
                Batch._fields,
                batch_obj,
            ):
                workspace.FeedBlob(batch_obj_name, batch_obj_value)
        else:
            for i in range(self.num_gpus):
                gpu_batch = batch[i::self.num_gpus]
                batch_obj = prepare_batch(gpu_batch)
                for batch_obj_name, batch_obj_value in izip(
                    Batch._fields,
                    batch_obj,
                ):
                    name = 'gpu_{}/{}'.format(i, batch_obj_name)
                    if batch_obj_name in ['encoder_inputs', 'decoder_inputs']:
                        dev = core.DeviceOption(caffe2_pb2.CPU)
                    else:
                        dev = core.DeviceOption(caffe2_pb2.CUDA, i)
                    workspace.FeedBlob(name, batch_obj_value, device_option=dev)

        if forward_only:
            workspace.RunNet(self.forward_net)
        else:
            workspace.RunNet(self.model.net)
            self.inc_current_step()

        return self.total_loss_scalar()


def gen_vocab(corpus, unk_threshold):
    vocab = collections.defaultdict(lambda: len(vocab))
    freqs = collections.defaultdict(lambda: 0)
    # Adding padding tokens to the vocabulary to maintain consistency with IDs
    vocab[PAD]
    vocab[GO]
    vocab[EOS]
    vocab[UNK]

    with open(corpus) as f:
        for sentence in f:
            tokens = sentence.strip().split()
            for token in tokens:
                freqs[token] += 1
    for token, freq in freqs.items():
        if freq > unk_threshold:
            # TODO: Add reverse lookup dict when it becomes necessary
            vocab[token]

    return vocab


def get_numberized_sentence(sentence, vocab):
    numerized_sentence = []
    for token in sentence.strip().split():
        if token in vocab:
            numerized_sentence.append(vocab[token])
        else:
            numerized_sentence.append(vocab[UNK])
    return numerized_sentence


def gen_batches(source_corpus, target_corpus, source_vocab, target_vocab,
                batch_size, max_length):
    with open(source_corpus) as source, open(target_corpus) as target:
        parallel_sentences = []
        for source_sentence, target_sentence in zip(source, target):
            numerized_source_sentence = get_numberized_sentence(
                source_sentence,
                source_vocab,
            )
            numerized_target_sentence = get_numberized_sentence(
                target_sentence,
                target_vocab,
            )
            if (
                len(numerized_source_sentence) > 0 and
                len(numerized_target_sentence) > 0 and
                (
                    max_length is None or (
                        len(numerized_source_sentence) <= max_length and
                        len(numerized_target_sentence) <= max_length
                    )
                )
            ):
                parallel_sentences.append((
                    numerized_source_sentence,
                    numerized_target_sentence,
                ))
    parallel_sentences.sort(key=lambda s_t: (len(s_t[0]), len(s_t[1])))

    batches, batch = [], []
    for sentence_pair in parallel_sentences:
        batch.append(sentence_pair)
        if len(batch) >= batch_size:
            batches.append(batch)
            batch = []
    if len(batch) > 0:
        while len(batch) < batch_size:
            batch.append(batch[-1])
        assert len(batch) == batch_size
        batches.append(batch)
    random.shuffle(batches)
    return batches
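
# Batches are built from length-sorted sentence pairs to keep padding small;
# a final short batch is filled out by repeating its last pair so every batch
# has exactly batch_size entries, and batch order is then shuffled.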


def run_seq2seq_model(args, model_params=None):
    source_vocab = gen_vocab(args.source_corpus, args.unk_threshold)
    target_vocab = gen_vocab(args.target_corpus, args.unk_threshold)
    logger.info('Source vocab size {}'.format(len(source_vocab)))
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    batches = gen_batches(args.source_corpus, args.target_corpus, source_vocab,
                          target_vocab, model_params['batch_size'],
                          args.max_length)
    logger.info('Number of training batches {}'.format(len(batches)))

    batches_eval = gen_batches(args.source_corpus_eval, args.target_corpus_eval,
                               source_vocab, target_vocab,
                               model_params['batch_size'], args.max_length)
    logger.info('Number of eval batches {}'.format(len(batches_eval)))

    with Seq2SeqModelCaffe2(
        model_params=model_params,
        source_vocab_size=len(source_vocab),
        target_vocab_size=len(target_vocab),
        num_gpus=args.num_gpus,
        num_cpus=20,
    ) as model_obj:
        model_obj.initialize_from_scratch()
        for i in range(args.epochs):
            logger.info('Epoch {}'.format(i))
            total_loss = 0
            for batch in batches:
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=False,
                )
            logger.info('\ttraining loss {}'.format(total_loss))
            total_loss = 0
            for batch in batches_eval:
                # Eval runs the forward net only, so no parameters are updated
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=True,
                )
            logger.info('\teval loss {}'.format(total_loss))


def run_seq2seq_rnn_unidirection_with_no_attention(args):
    run_seq2seq_model(args, model_params=dict(
        attention=('regular' if args.use_attention else 'none'),
        decoder_layer_configs=[
            dict(
                num_units=args.decoder_cell_num_units,
            ),
        ],
        encoder_type=dict(
            encoder_layer_configs=[
                dict(
                    num_units=args.encoder_cell_num_units,
                ),
            ],
            use_bidirectional_encoder=args.use_bidirectional_encoder,
        ),
        batch_size=args.batch_size,
        optimizer_params=dict(
            learning_rate=args.learning_rate,
        ),
        encoder_embedding_size=args.encoder_embedding_size,
        decoder_embedding_size=args.decoder_embedding_size,
        decoder_softmax_size=args.decoder_softmax_size,
        max_gradient_norm=args.max_gradient_norm,
    ))


def main():
    random.seed(31415)
    parser = argparse.ArgumentParser(
        description='Caffe2: Seq2Seq Training'
    )
    parser.add_argument('--source-corpus', type=str, default=None,
                        help='Path to source corpus in a text file format. Each '
                        'line in the file should contain a single sentence',
                        required=True)
    parser.add_argument('--target-corpus', type=str, default=None,
                        help='Path to target corpus in a text file format',
                        required=True)
    parser.add_argument('--max-length', type=int, default=None,
                        help='Maximal lengths of train and eval sentences')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='Training batch size')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of iterations over training data')
    parser.add_argument('--learning-rate', type=float, default=0.5,
                        help='Learning rate')
    parser.add_argument('--unk-threshold', type=int, default=50,
                        help='Threshold frequency under which token becomes '
                        'labeled unknown token')
    parser.add_argument('--max-gradient-norm', type=float, default=1.0,
                        help='Max global norm of gradients at the end of each '
                        'backward pass. We do clipping to match the number.')
    parser.add_argument('--use-bidirectional-encoder', action='store_true',
                        help='Set flag to use bidirectional recurrent network '
                        'in encoder')
    parser.add_argument('--use-attention', action='store_true',
                        help='Set flag to use seq2seq with attention model')
    parser.add_argument('--source-corpus-eval', type=str, default=None,
                        help='Path to source corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--target-corpus-eval', type=str, default=None,
                        help='Path to target corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--encoder-cell-num-units', type=int, default=256,
                        help='Number of cell units in the encoder layer')
    parser.add_argument('--decoder-cell-num-units', type=int, default=512,
                        help='Number of cell units in the decoder layer')
    parser.add_argument('--encoder-embedding-size', type=int, default=256,
                        help='Size of embedding in the encoder layer')
    parser.add_argument('--decoder-embedding-size', type=int, default=512,
                        help='Size of embedding in the decoder layer')
    parser.add_argument('--decoder-softmax-size', type=int, default=128,
                        help='Size of softmax layer in the decoder')
    parser.add_argument('--num-gpus', type=int, default=0,
                        help='Number of GPUs for data parallel model')

    args = parser.parse_args()

    run_seq2seq_rnn_unidirection_with_no_attention(args)


if __name__ == '__main__':
    main()
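
# Example invocation (hypothetical corpus paths; all flags are defined in
# main() above):
#
#   python seq2seq.py \
#       --source-corpus train.src --target-corpus train.tgt \
#       --source-corpus-eval valid.src --target-corpus-eval valid.tgt \
#       --batch-size 32 --epochs 10 \
#       --use-bidirectional-encoder --use-attention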