from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import collections
import logging
import math
import random
import sys
import time

import numpy as np
from itertools import izip  # Python 2 only; this script predates Python 3

import caffe2.proto.caffe2_pb2 as caffe2_pb2
from caffe2.python import core, workspace, rnn_cell, data_parallel_model
from caffe2.python.examples import seq2seq_util

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))

Batch = collections.namedtuple('Batch', [
    'encoder_inputs',
    'encoder_lengths',
    'decoder_inputs',
    'decoder_lengths',
    'targets',
    'target_weights',
])

# Reserved tokens; the IDs below follow the conventional assignment for this
# example (the exact values were not preserved in this excerpt).
_PAD_ID = 0
_GO_ID = 1
_EOS_ID = 2
_UNK_ID = 3
PAD = '<PAD>'
GO = '<GO>'
EOS = '<EOS>'
UNK = '<UNK>'


def prepare_batch(batch):
    encoder_lengths = [len(entry[0]) for entry in batch]
    max_encoder_length = max(encoder_lengths)
    decoder_lengths = []
    max_decoder_length = max([len(entry[1]) for entry in batch])

    batch_encoder_inputs = []
    batch_decoder_inputs = []
    batch_targets = []
    batch_target_weights = []

    for source_seq, target_seq in batch:
        encoder_pads = (
            [_PAD_ID] * (max_encoder_length - len(source_seq))
        )
        # Source sequences are fed to the encoder in reversed order
        batch_encoder_inputs.append(
            list(reversed(source_seq)) + encoder_pads
        )

        decoder_pads = (
            [_PAD_ID] * (max_decoder_length - len(target_seq))
        )
        target_seq_with_go_token = [_GO_ID] + target_seq
        decoder_lengths.append(len(target_seq_with_go_token))
        batch_decoder_inputs.append(target_seq_with_go_token + decoder_pads)

        target_seq_with_eos = target_seq + [_EOS_ID]
        targets = target_seq_with_eos + decoder_pads
        batch_targets.append(targets)

        if len(source_seq) + len(target_seq) == 0:
            target_weights = [0] * len(targets)
        else:
            target_weights = [
                1 if target != _PAD_ID else 0
                for target in targets
            ]
        batch_target_weights.append(target_weights)
    return Batch(
        encoder_inputs=np.array(
            batch_encoder_inputs, dtype=np.int32).transpose(),
        encoder_lengths=np.array(encoder_lengths, dtype=np.int32),
        decoder_inputs=np.array(
            batch_decoder_inputs, dtype=np.int32).transpose(),
        decoder_lengths=np.array(decoder_lengths, dtype=np.int32),
        targets=np.array(batch_targets, dtype=np.int32).transpose(),
        target_weights=np.array(
            batch_target_weights, dtype=np.float32).transpose(),
    )
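
# Illustrative example (not part of the original script), using the token IDs
# defined above (_PAD_ID=0, _GO_ID=1, _EOS_ID=2):
#
#   b = prepare_batch([([4, 5, 6], [7, 8]), ([9], [10])])
#   # b.encoder_inputs  -> shape (3, 2), time-major: column 0 is [6, 5, 4]
#   #                      (reversed source), column 1 is [9, 0, 0] (padded)
#   # b.decoder_inputs  -> every column starts with _GO_ID
#   # b.targets         -> each target ends with _EOS_ID before any padding
#   # b.target_weights  -> 1 for real tokens, 0 at _PAD_ID positions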


class Seq2SeqModelCaffe2(object):

    def _build_model(self, init_params):
        # NOTE: Seq2SeqModelHelper is assumed here -- a ModelHelper subclass
        # (as in caffe2.python.models.seq2seq) that provides AddParam();
        # its import is not shown in this excerpt.
        model = Seq2SeqModelHelper(init_params=init_params)
        self._build_shared(model)
        self._build_embeddings(model)

        forward_model = Seq2SeqModelHelper(init_params=init_params)
        self._build_shared(forward_model)
        self._build_embeddings(forward_model)

        if self.num_gpus == 0:
            loss_blobs = self.model_build_fun(model)
            model.AddGradientOperators(loss_blobs)
            self.norm_clipped_grad_update(
                model,
                scope='norm_clipped_grad_update',
            )
            self.forward_model_build_fun(forward_model)
        else:
            assert (self.batch_size % self.num_gpus) == 0

            data_parallel_model.Parallelize_GPU(
                forward_model,
                input_builder_fun=lambda m: None,
                forward_pass_builder_fun=self.forward_model_build_fun,
                param_update_builder_fun=None,
                devices=range(self.num_gpus),
            )

            def clipped_grad_update_bound(model):
                self.norm_clipped_grad_update(
                    model,
                    scope='norm_clipped_grad_update',
                )

            data_parallel_model.Parallelize_GPU(
                model,
                input_builder_fun=lambda m: None,
                forward_pass_builder_fun=self.model_build_fun,
                param_update_builder_fun=clipped_grad_update_bound,
                devices=range(self.num_gpus),
            )
        # Sparse (embedding) gradients get their own clipped update pass.
        self.norm_clipped_sparse_grad_update(
            model,
            scope='norm_clipped_sparse_grad_update',
        )
        self.model = model
        self.forward_net = forward_model.net

    def _build_embedding_encoder(
        self, model, inputs, input_lengths, vocab_size, embeddings,
        embedding_size, use_attention, num_gpus, forward_only=False,
    ):
        # Embedding lookup runs on CPU; on GPU runs the result is copied over.
        if num_gpus == 0:
            embedded_encoder_inputs = model.net.Gather(
                [embeddings, inputs],
                ['embedded_encoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_encoder_inputs_cpu = model.net.Gather(
                    [embeddings, inputs],
                    ['embedded_encoder_inputs_cpu'],
                )
            embedded_encoder_inputs = model.CopyCPUToGPU(
                embedded_encoder_inputs_cpu,
                'embedded_encoder_inputs',
            )
        if self.encoder_type == 'rnn':
            assert len(self.encoder_params['encoder_layer_configs']) == 1
            encoder_num_units = (
                self.encoder_params['encoder_layer_configs'][0]['num_units']
            )
            encoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                ['encoder_initial_cell_state'],
                shape=[encoder_num_units],
                value=0.0,
            )
            encoder_initial_hidden_state = (
                model.param_init_net.ConstantFill(
                    [],
                    'encoder_initial_hidden_state',
                    shape=[encoder_num_units],
                    value=0.0,
                )
            )
            # A bidirectional encoder doubles the output dimension.
            if self.encoder_params['use_bidirectional_encoder']:
                rnn_encoder_func = seq2seq_util.rnn_bidirectional_encoder
                encoder_output_dim = 2 * encoder_num_units
            else:
                rnn_encoder_func = seq2seq_util.rnn_unidirectional_encoder
                encoder_output_dim = encoder_num_units

            (
                encoder_outputs,
                final_encoder_hidden_state,
                final_encoder_cell_state,
            ) = rnn_encoder_func(
                model,
                embedded_encoder_inputs,
                input_lengths,
                encoder_initial_hidden_state,
                encoder_initial_cell_state,
                embedding_size,
                encoder_num_units,
                use_attention,
            )
            weighted_encoder_outputs = None
        else:
            raise ValueError('Unsupported encoder type {}'.format(
                self.encoder_type,
            ))

        return (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
            encoder_output_dim,
        )

    def output_projection(
        self, model, decoder_outputs, decoder_output_size, target_vocab_size,
        decoder_softmax_size,
    ):
        # Optional bottleneck before the (potentially large) softmax.
        if decoder_softmax_size is not None:
            decoder_outputs = model.FC(
                decoder_outputs,
                'decoder_outputs_scaled',
                dim_in=decoder_output_size,
                dim_out=decoder_softmax_size,
            )
            decoder_output_size = decoder_softmax_size

        output_projection_w = model.param_init_net.XavierFill(
            [],
            'output_projection_w',
            shape=[target_vocab_size, decoder_output_size],
        )
        output_projection_b = model.param_init_net.XavierFill(
            [],
            'output_projection_b',
            shape=[target_vocab_size],
        )
        model.params.extend([
            output_projection_w,
            output_projection_b,
        ])
        output_logits = model.net.FC(
            [
                decoder_outputs,
                output_projection_w,
                output_projection_b,
            ],
            ['output_logits'],
        )
        return output_logits
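
    # A minimal NumPy sketch (illustrative, with made-up shapes) of what the
    # projection above computes; Caffe2's FC op evaluates X * W^T + b:
    #
    #   import numpy as np
    #   TN, D, V = 12, 256, 50000        # flattened steps, dim, vocab size
    #   h = np.random.randn(TN, D)       # decoder_outputs (flattened)
    #   W = np.random.randn(V, D)        # output_projection_w
    #   b = np.random.randn(V)           # output_projection_b
    #   logits = h.dot(W.T) + b          # output_logits, shape (TN, V)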

    def _build_shared(self, model):
        optimizer_params = self.model_params['optimizer_params']
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            self.learning_rate = model.AddParam(
                name='learning_rate',
                init_value=float(optimizer_params['learning_rate']),
                trainable=False,
            )
            self.global_step = model.AddParam(
                name='global_step',
                init_value=0,
                trainable=False,
            )
            self.start_time = model.AddParam(
                name='start_time',
                init_value=time.time(),
                trainable=False,
            )

    def _build_embeddings(self, model):
        # Uniform(-sqrt(3), sqrt(3)) initialization gives unit variance.
        self.encoder_embeddings = model.param_init_net.UniformFill(
            [], 'encoder_embeddings',
            shape=[self.source_vocab_size,
                   self.model_params['encoder_embedding_size']],
            min=-math.sqrt(3), max=math.sqrt(3),
        )
        self.decoder_embeddings = model.param_init_net.UniformFill(
            [], 'decoder_embeddings',
            shape=[self.target_vocab_size,
                   self.model_params['decoder_embedding_size']],
            min=-math.sqrt(3), max=math.sqrt(3),
        )
        model.params.extend(
            [self.encoder_embeddings, self.decoder_embeddings])

    def model_build_fun(self, model, forward_only=False, loss_scale=None):
        encoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_inputs',
        )
        encoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_lengths',
        )
        decoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_inputs',
        )
        decoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_lengths',
        )
        targets = model.net.AddExternalInput(
            workspace.GetNameScope() + 'targets',
        )
        target_weights = model.net.AddExternalInput(
            workspace.GetNameScope() + 'target_weights',
        )

        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular']

        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
            encoder_output_dim,
        ) = self._build_embedding_encoder(
            model=model,
            inputs=encoder_inputs,
            input_lengths=encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=self.encoder_embeddings,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
            num_gpus=self.num_gpus,
            forward_only=forward_only,
        )

        assert len(self.model_params['decoder_layer_configs']) == 1
        decoder_num_units = (
            self.model_params['decoder_layer_configs'][0]['num_units']
        )
        # Without attention, the final encoder states are projected down to
        # the decoder dimension to seed the decoder.
        if attention_type == 'none':
            decoder_initial_hidden_state = model.FC(
                final_encoder_hidden_state,
                'decoder_initial_hidden_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
            decoder_initial_cell_state = model.FC(
                final_encoder_cell_state,
                'decoder_initial_cell_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
        else:
            decoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_hidden_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            decoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_cell_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            initial_attention_weighted_encoder_context = (
                model.param_init_net.ConstantFill(
                    [],
                    'initial_attention_weighted_encoder_context',
                    shape=[encoder_output_dim],
                    value=0.0,
                )
            )
        if self.num_gpus == 0:
            embedded_decoder_inputs = model.net.Gather(
                [self.decoder_embeddings, decoder_inputs],
                ['embedded_decoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_decoder_inputs_cpu = model.net.Gather(
                    [self.decoder_embeddings, decoder_inputs],
                    ['embedded_decoder_inputs_cpu'],
                )
            embedded_decoder_inputs = model.CopyCPUToGPU(
                embedded_decoder_inputs_cpu,
                'embedded_decoder_inputs',
            )
        # Plain LSTM decoder when attention is disabled.
        if attention_type == 'none':
            decoder_outputs, _, _, _ = rnn_cell.LSTM(
                model=model,
                input_blob=embedded_decoder_inputs,
                seq_lengths=decoder_lengths,
                initial_states=(
                    decoder_initial_hidden_state,
                    decoder_initial_cell_state,
                ),
                dim_in=self.model_params['decoder_embedding_size'],
                dim_out=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0],
            )
            decoder_output_size = decoder_num_units
        else:
            (
                decoder_outputs, _, _, _,
                attention_weighted_encoder_contexts, _,
            ) = rnn_cell.LSTMWithAttention(
                model=model,
                decoder_inputs=embedded_decoder_inputs,
                decoder_input_lengths=decoder_lengths,
                initial_decoder_hidden_state=decoder_initial_hidden_state,
                initial_decoder_cell_state=decoder_initial_cell_state,
                initial_attention_weighted_encoder_context=(
                    initial_attention_weighted_encoder_context
                ),
                encoder_output_dim=encoder_output_dim,
                encoder_outputs=encoder_outputs,
                decoder_input_dim=self.model_params['decoder_embedding_size'],
                decoder_state_dim=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0, 4],
            )
            # Feed both the decoder state and the attention context into
            # the output projection.
            decoder_outputs, _ = model.net.Concat(
                [decoder_outputs, attention_weighted_encoder_contexts],
                [
                    'states_and_context_combination',
                    '_states_and_context_combination_concat_dims',
                ],
                axis=2,
            )
            decoder_output_size = decoder_num_units + encoder_output_dim
        # Softmax is applied over the flattened (time x batch) dimension.
        decoder_outputs_flattened, _ = model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_size],
        )
        output_logits = self.output_projection(
            model=model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        targets, _ = model.net.Reshape(
            [targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )
        output_probs = model.net.Softmax(
            [output_logits],
            ['output_probs'],
            engine=('CUDNN' if self.num_gpus > 0 else None),
        )
        label_cross_entropy = model.net.LabelCrossEntropy(
            [output_probs, targets],
            ['label_cross_entropy'],
        )
        weighted_label_cross_entropy = model.net.Mul(
            [label_cross_entropy, target_weights],
            'weighted_label_cross_entropy',
        )
        total_loss_scalar = model.net.SumElements(
            [weighted_label_cross_entropy],
            'total_loss_scalar',
        )
        total_loss_scalar_weighted = model.net.Scale(
            total_loss_scalar,
            'total_loss_scalar_weighted',
            scale=1.0 / self.batch_size,
        )
        return [total_loss_scalar_weighted]

    def forward_model_build_fun(self, model, loss_scale=None):
        return self.model_build_fun(
            model=model,
            forward_only=True,
            loss_scale=loss_scale,
        )

    def _calc_norm_ratio(self, model, params, scope, ONE):
        with core.NameScope(scope):
            grad_squared_sums = []
            for i, param in enumerate(params):
                logger.info(param)
                grad = (
                    model.param_to_grad[param]
                    if not isinstance(
                        model.param_to_grad[param],
                        core.GradientSlice,
                    ) else model.param_to_grad[param].values
                )
                grad_squared = model.net.Sqr(
                    [grad],
                    'grad_{}_squared'.format(i),
                )
                grad_squared_sum = model.net.SumElements(
                    grad_squared,
                    'grad_{}_squared_sum'.format(i),
                )
                grad_squared_sums.append(grad_squared_sum)

            grad_squared_full_sum = model.net.Sum(
                grad_squared_sums,
                'grad_squared_full_sum',
            )
            global_norm = model.net.Pow(
                grad_squared_full_sum,
                'global_norm',
                exponent=0.5,
            )
            clip_norm = model.param_init_net.ConstantFill(
                [],
                'clip_norm',
                shape=[],
                value=float(self.model_params['max_gradient_norm']),
            )
            max_norm = model.net.Max(
                [global_norm, clip_norm],
                'max_norm',
            )
            norm_ratio = model.net.Div(
                [clip_norm, max_norm],
                'norm_ratio',
            )
            return norm_ratio
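
    # The graph above is standard global-norm gradient clipping. The same
    # arithmetic in plain NumPy (illustrative only):
    #
    #   import numpy as np
    #   grads = [np.array([3.0, 4.0]), np.array([12.0])]
    #   global_norm = np.sqrt(sum((g ** 2).sum() for g in grads))   # 13.0
    #   clip_norm = 1.0                  # model_params['max_gradient_norm']
    #   norm_ratio = clip_norm / max(global_norm, clip_norm)        # 1/13
    #   # _apply_norm_ratio later scales every gradient by norm_ratio, so
    #   # the effective global norm is min(global_norm, clip_norm).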

    def _apply_norm_ratio(
        self, norm_ratio, model, params, learning_rate, scope, ONE
    ):
        for param in params:
            param_grad = model.param_to_grad[param]
            nlr = model.net.Negative(
                [learning_rate],
                'negative_learning_rate',
            )
            with core.NameScope(scope):
                update_coeff = model.net.Mul(
                    [nlr, norm_ratio],
                    'update_coeff',
                    broadcast=1,
                )
            if isinstance(param_grad, core.GradientSlice):
                param_grad_values = param_grad.values
                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad_values,
                        update_coeff,
                    ],
                    param,
                )
            else:
                model.net.WeightedSum(
                    [param, ONE, param_grad, update_coeff],
                    param,
                )

    def norm_clipped_grad_update(self, model, scope):
        if self.num_gpus == 0:
            learning_rate = self.learning_rate
        else:
            learning_rate = model.CopyCPUToGPU(self.learning_rate, 'LR')

        # Collect only dense gradients; sparse ones are handled separately.
        params = []
        for param in model.GetParams(top_scope=True):
            if param in model.param_to_grad:
                if not isinstance(
                    model.param_to_grad[param],
                    core.GradientSlice,
                ):
                    params.append(param)

        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('Dense trainable variables: ')
        norm_ratio = self._calc_norm_ratio(model, params, scope, ONE)
        self._apply_norm_ratio(
            norm_ratio, model, params, learning_rate, scope, ONE
        )

    def norm_clipped_sparse_grad_update(self, model, scope):
        learning_rate = self.learning_rate

        params = []
        for param in model.GetParams(top_scope=True):
            if param in model.param_to_grad:
                if isinstance(
                    model.param_to_grad[param],
                    core.GradientSlice,
                ):
                    params.append(param)

        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('Sparse trainable variables: ')
        norm_ratio = self._calc_norm_ratio(model, params, scope, ONE)
        self._apply_norm_ratio(
            norm_ratio, model, params, learning_rate, scope, ONE
        )

    def total_loss_scalar(self):
        if self.num_gpus == 0:
            return workspace.FetchBlob('total_loss_scalar')
        else:
            total_loss = 0
            for i in range(self.num_gpus):
                name = 'gpu_{}/total_loss_scalar'.format(i)
                gpu_loss = workspace.FetchBlob(name)
                total_loss += gpu_loss
            return total_loss

    def _init_model(self):
        workspace.RunNetOnce(self.model.param_init_net)

        def create_net(net):
            workspace.CreateNet(
                net,
                input_blobs=map(str, net.external_inputs),
            )

        create_net(self.model.net)
        create_net(self.forward_net)

    def __init__(self, model_params, source_vocab_size, target_vocab_size,
                 num_gpus=1, num_cpus=1):
        self.model_params = model_params
        # Only the RNN encoder is supported; its config dict is stored under
        # the 'encoder_type' key of model_params (see
        # run_seq2seq_rnn_unidirection_with_no_attention below).
        self.encoder_type = 'rnn'
        self.encoder_params = model_params['encoder_type']
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.num_gpus = num_gpus
        self.num_cpus = num_cpus
        self.batch_size = model_params['batch_size']

        workspace.GlobalInit([
            'caffe2',
            # NOTE: raise this log level when debugging
            '--caffe2_log_level=0',
            # fail gracefully if one of the executor threads fails
            '--caffe2_handle_executor_threads_exceptions=1',
            '--caffe2_mkl_num_threads=' + str(self.num_cpus),
        ])

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        workspace.ResetWorkspace()

    def initialize_from_scratch(self):
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Start')
        self._build_model(init_params=True)
        self._init_model()
        logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Finish')

    def get_current_step(self):
        return workspace.FetchBlob(self.global_step)[0]

    def inc_current_step(self):
        workspace.FeedBlob(
            self.global_step,
            np.array([self.get_current_step() + 1]),
        )

    def step(self, batch, forward_only):
        if self.num_gpus < 1:
            batch_obj = prepare_batch(batch)
            for batch_obj_name, batch_obj_value in izip(
                Batch._fields,
                batch_obj,
            ):
                workspace.FeedBlob(batch_obj_name, batch_obj_value)
        else:
            for i in range(self.num_gpus):
                # Each GPU gets an interleaved slice of the batch.
                gpu_batch = batch[i::self.num_gpus]
                batch_obj = prepare_batch(gpu_batch)
                for batch_obj_name, batch_obj_value in izip(
                    Batch._fields,
                    batch_obj,
                ):
                    name = 'gpu_{}/{}'.format(i, batch_obj_name)
                    # Inputs feeding the CPU-side embedding Gather stay on
                    # CPU; everything else goes to the corresponding GPU.
                    if batch_obj_name in ['encoder_inputs', 'decoder_inputs']:
                        dev = core.DeviceOption(caffe2_pb2.CPU)
                    else:
                        dev = core.DeviceOption(caffe2_pb2.CUDA, i)
                    workspace.FeedBlob(name, batch_obj_value,
                                       device_option=dev)

        if forward_only:
            workspace.RunNet(self.forward_net)
        else:
            workspace.RunNet(self.model.net)
            self.inc_current_step()

        return self.total_loss_scalar()


def gen_vocab(corpus, unk_threshold):
    vocab = collections.defaultdict(lambda: len(vocab))
    freqs = collections.defaultdict(lambda: 0)
    # Insert the reserved tokens first so that they keep their fixed IDs.
    vocab[PAD]
    vocab[GO]
    vocab[EOS]
    vocab[UNK]

    with open(corpus) as f:
        for sentence in f:
            tokens = sentence.strip().split()
            for token in tokens:
                freqs[token] += 1
    for token, freq in freqs.items():
        if freq > unk_threshold:
            vocab[token]

    return vocab
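
# Illustrative example (hypothetical file): if corpus.txt contains
# "the cat sat\nthe dog sat\n", then with unk_threshold=1 only tokens seen
# more than once earn their own IDs after the four reserved tokens:
#
#   vocab = gen_vocab('corpus.txt', unk_threshold=1)
#   # -> {'<PAD>': 0, '<GO>': 1, '<EOS>': 2, '<UNK>': 3, 'the': 4, 'sat': 5}
#   # ('cat' and 'dog' stay unknown; exact IDs of corpus tokens depend on
#   # dict iteration order)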


def get_numberized_sentence(sentence, vocab):
    numerized_sentence = []
    for token in sentence.strip().split():
        if token in vocab:
            numerized_sentence.append(vocab[token])
        else:
            numerized_sentence.append(vocab[UNK])
    return numerized_sentence
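
# Continuing the example above:
#
#   get_numberized_sentence('the cat sat', vocab)
#   # -> [4, 3, 5]; 'cat' is out of vocabulary and maps to vocab[UNK] == 3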


def gen_batches(source_corpus, target_corpus, source_vocab, target_vocab,
                batch_size, max_length):
    with open(source_corpus) as source, open(target_corpus) as target:
        parallel_sentences = []
        for source_sentence, target_sentence in zip(source, target):
            numerized_source_sentence = get_numberized_sentence(
                source_sentence,
                source_vocab,
            )
            numerized_target_sentence = get_numberized_sentence(
                target_sentence,
                target_vocab,
            )
            if (
                len(numerized_source_sentence) > 0 and
                len(numerized_target_sentence) > 0 and
                (
                    max_length is None or (
                        len(numerized_source_sentence) <= max_length and
                        len(numerized_target_sentence) <= max_length
                    )
                )
            ):
                parallel_sentences.append((
                    numerized_source_sentence,
                    numerized_target_sentence,
                ))
    # Sorting by length keeps sentences of similar size in the same batch,
    # which minimizes padding.
    parallel_sentences.sort(key=lambda s_t: (len(s_t[0]), len(s_t[1])))

    batches, batch = [], []
    for sentence_pair in parallel_sentences:
        batch.append(sentence_pair)
        if len(batch) >= batch_size:
            batches.append(batch)
            batch = []
    if len(batch) > 0:
        # Pad the last, incomplete batch by repeating its final pair.
        while len(batch) < batch_size:
            batch.append(batch[-1])
        assert len(batch) == batch_size
        batches.append(batch)
    random.shuffle(batches)
    return batches
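
# Illustrative usage (hypothetical file names):
#
#   batches = gen_batches('train.src', 'train.tgt', source_vocab,
#                         target_vocab, batch_size=32, max_length=50)
#   # every batch is a list of exactly 32 (source_ids, target_ids) pairs,
#   # grouped by length to reduce padding; the final partial batch is
#   # filled by repeating its last pair, and batch order is shuffled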


def run_seq2seq_model(args, model_params=None):
    source_vocab = gen_vocab(args.source_corpus, args.unk_threshold)
    target_vocab = gen_vocab(args.target_corpus, args.unk_threshold)
    logger.info('Source vocab size {}'.format(len(source_vocab)))
    logger.info('Target vocab size {}'.format(len(target_vocab)))

    batches = gen_batches(args.source_corpus, args.target_corpus, source_vocab,
                          target_vocab, model_params['batch_size'],
                          args.max_length)
    logger.info('Number of training batches {}'.format(len(batches)))

    batches_eval = gen_batches(args.source_corpus_eval, args.target_corpus_eval,
                               source_vocab, target_vocab,
                               model_params['batch_size'], args.max_length)
    logger.info('Number of eval batches {}'.format(len(batches_eval)))
    with Seq2SeqModelCaffe2(
        model_params=model_params,
        source_vocab_size=len(source_vocab),
        target_vocab_size=len(target_vocab),
        num_gpus=args.num_gpus,
    ) as model_obj:
        model_obj.initialize_from_scratch()
        for i in range(args.epochs):
            logger.info('Epoch {}'.format(i))
            total_loss = 0
            for batch in batches:
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=False,
                )
            logger.info('\ttraining loss {}'.format(total_loss))
            total_loss = 0
            for batch in batches_eval:
                total_loss += model_obj.step(
                    batch=batch,
                    forward_only=True,
                )
            logger.info('\teval loss {}'.format(total_loss))


def run_seq2seq_rnn_unidirection_with_no_attention(args):
    run_seq2seq_model(args, model_params=dict(
        attention=('regular' if args.use_attention else 'none'),
        decoder_layer_configs=[
            dict(
                num_units=args.decoder_cell_num_units,
            ),
        ],
        encoder_type=dict(
            encoder_layer_configs=[
                dict(
                    num_units=args.encoder_cell_num_units,
                ),
            ],
            use_bidirectional_encoder=args.use_bidirectional_encoder,
        ),
        batch_size=args.batch_size,
        optimizer_params=dict(
            learning_rate=args.learning_rate,
        ),
        encoder_embedding_size=args.encoder_embedding_size,
        decoder_embedding_size=args.decoder_embedding_size,
        decoder_softmax_size=args.decoder_softmax_size,
        max_gradient_norm=args.max_gradient_norm,
    ))


def main():
    parser = argparse.ArgumentParser(
        description='Caffe2: Seq2Seq Training',
    )
    parser.add_argument('--source-corpus', type=str, default=None,
                        help='Path to source corpus in a text file format. '
                        'Each line in the file should contain a single '
                        'sentence')
    parser.add_argument('--target-corpus', type=str, default=None,
                        help='Path to target corpus in a text file format')
    parser.add_argument('--max-length', type=int, default=None,
                        help='Maximal lengths of train and eval sentences')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='Training batch size')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Number of iterations over training data')
    parser.add_argument('--learning-rate', type=float, default=0.5,
                        help='Learning rate')
    parser.add_argument('--unk-threshold', type=int, default=50,
                        help='Threshold frequency under which token becomes '
                        'labeled unknown token')
    parser.add_argument('--max-gradient-norm', type=float, default=1.0,
                        help='Max global norm of gradients at the end of each '
                        'backward pass. We do clipping to match the number.')
    parser.add_argument('--use-bidirectional-encoder', action='store_true',
                        help='Set flag to use bidirectional recurrent network '
                        'in the encoder')
    parser.add_argument('--use-attention', action='store_true',
                        help='Set flag to use seq2seq with attention model')
    parser.add_argument('--source-corpus-eval', type=str, default=None,
                        help='Path to source corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--target-corpus-eval', type=str, default=None,
                        help='Path to target corpus for evaluation in a text '
                        'file format', required=True)
    parser.add_argument('--encoder-cell-num-units', type=int, default=256,
                        help='Number of cell units in the encoder layer')
    parser.add_argument('--decoder-cell-num-units', type=int, default=512,
                        help='Number of cell units in the decoder layer')
    parser.add_argument('--encoder-embedding-size', type=int, default=256,
                        help='Size of embedding in the encoder layer')
    parser.add_argument('--decoder-embedding-size', type=int, default=512,
                        help='Size of embedding in the decoder layer')
    parser.add_argument('--decoder-softmax-size', type=int, default=128,
                        help='Size of softmax layer in the decoder')
    parser.add_argument('--num-gpus', type=int, default=0,
                        help='Number of GPUs for data parallel model')

    args = parser.parse_args()

    run_seq2seq_rnn_unidirection_with_no_attention(args)


if __name__ == '__main__':
    main()
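
# Example invocation (hypothetical corpus paths):
#
#   python seq2seq.py \
#       --source-corpus train.de --target-corpus train.en \
#       --source-corpus-eval dev.de --target-corpus-eval dev.en \
#       --batch-size 64 --epochs 5 --use-attention --num-gpus 1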