3 from __future__
import absolute_import
4 from __future__
import division
5 from __future__
import print_function
6 from __future__
import unicode_literals
11 from caffe2.python.attention
import (
13 apply_regular_attention,
14 apply_recurrent_attention,
16 from caffe2.python
import core, recurrent, workspace
17 from caffe2.python.cnn
import CNNModelHelper
22 Base class for writing recurrent / stateful operations. 24 One needs to implement 3 methods: _apply, prepare_input and get_state_names. 25 As a result base class will provide apply_over_sequence method, which 26 allows you to apply recurrent operations over a sequence of any length. 28 def __init__(self, name):
def scope(self, name):
    """Return ``name`` qualified by this cell's own name.

    Args:
        name: blob or parameter name to qualify.

    Returns:
        ``self.name + '/' + name`` when ``self.name`` is not None,
        otherwise ``name`` unchanged.
    """
    if self.name is None:
        # An unnamed cell contributes no prefix.
        return name
    return self.name + '/' + name
35 def apply_over_sequence(
41 outputs_with_grads=None,
44 step_model = CNNModelHelper(name=self.
name, param_model=model)
45 input_t, timestep = step_model.net.AddScopedExternalInputs(
49 states_prev = step_model.net.AddScopedExternalInputs(*[
55 seq_lengths=seq_lengths,
61 cell_net=step_model.net,
62 inputs=[(input_t, preprocessed_inputs)],
63 initial_cell_inputs=zip(states_prev, initial_states),
64 links=dict(zip(states_prev, states)),
69 if outputs_with_grads
is not None 70 else self.get_outputs_with_grads()
75 def apply(self, model, input_t, seq_lengths, states, timestep):
77 return self.
_apply(model, input_t, seq_lengths, states, timestep)
79 def _apply(self, model, input_t, seq_lengths, states, timestep):
81 A single step of a recurrent network. 83 model: CNNModelHelper object new operators would be added to 85 input_blob: single input with shape (1, batch_size, input_dim) 87 seq_lengths: blob containing sequence lengths which would be passed to 90 states: previous recurrent states 92 timestep: current recurrent iteration. Could be used together with 93 seq_lengths in order to determine, if some shorter sequences 94 in the batch have already ended. 96 raise NotImplementedError(
'Abstract method')
100 If some operations in _apply method depend only on the input, 101 not on recurrent states, they could be computed in advance. 103 model: CNNModelHelper object new operators would be added to 105 input_blob: either the whole input sequence with shape 106 (sequence_length, batch_size, input_dim) or a single input with shape 107 (1, batch_size, input_dim). 109 raise NotImplementedError(
'Abstract method')
113 Return the names of the recurrent states. 114 It's required by apply_over_sequence method in order to allocate 115 recurrent states for all steps with meaningful names. 117 raise NotImplementedError(
'Abstract method')
130 super(LSTMCell, self).__init__(name)
144 hidden_t_prev, cell_t_prev = states
147 self.
scope(
'gates_t'),
152 model.net.Sum([gates_t, input_t], gates_t)
153 hidden_t, cell_t = model.net.LSTMUnit(
164 model.net.AddExternalOutputs(hidden_t, cell_t)
167 return hidden_t, cell_t
169 def get_input_params(self):
171 'weights': self.
scope(
'i2h') +
'_w',
172 'biases': self.
scope(
'i2h') +
'_b',
175 def get_recurrent_params(self):
177 'weights': self.
scope(
'gates_t') +
'_w',
178 'biases': self.
scope(
'gates_t') +
'_b',
181 def prepare_input(self, model, input_blob):
def get_state_names(self):
    """Return the scoped names of the two recurrent states.

    Returns:
        A 2-tuple holding ``self.scope('hidden_t')`` and
        ``self.scope('cell_t')``, in that order.
    """
    return (self.scope('hidden_t'), self.scope('cell_t'))
193 def get_outputs_with_grads(self):
196 def get_output_size(self):
200 def LSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out,
201 scope, outputs_with_grads=(0,), return_params=
False,
202 memory_optimization=
False, forget_bias=0.0):
204 Adds a standard LSTM recurrent network operator to a model. 206 model: CNNModelHelper object new operators would be added to 208 input_blob: the input sequence in a format T x N x D 209 where T is sequence size, N - batch size and D - input dimension 211 seq_lengths: blob containing sequence lengths which would be passed to 214 initial_states: a tuple of (hidden_input_blob, cell_input_blob) 215 which are going to be inputs to the cell net on the first iteration 217 dim_in: input dimension 219 dim_out: output dimension 221 outputs_with_grads : position indices of output blobs which will receive 222 external error gradient during backpropagation 224 return_params: if True, will return a dictionary of parameters of the LSTM 226 memory_optimization: if enabled, the LSTM step is recomputed on backward step 227 so that we don't need to store forward activations for each 228 timestep. Saves memory with cost of computation. 233 forget_bias=forget_bias,
234 memory_optimization=memory_optimization,
237 result = cell.apply_over_sequence(
240 seq_lengths=seq_lengths,
241 initial_states=initial_states,
242 outputs_with_grads=outputs_with_grads,
245 result = list(result) + [{
246 'input': cell.get_input_params(),
247 'recurrent': cell.get_recurrent_params(),
def GetLSTMParamNames():
    """Return the LSTM gate parameter names, grouped by kind.

    Returns:
        A dict with two keys: ``'weights'`` maps to the list of weight
        parameter names and ``'biases'`` to the list of bias parameter
        names. Fresh lists are built on every call.
    """
    weight_params = [
        "input_gate_w",
        "forget_gate_w",
        "output_gate_w",
        "cell_w",
    ]
    bias_params = [
        "input_gate_b",
        "forget_gate_b",
        "output_gate_b",
        "cell_b",
    ]
    return {'weights': weight_params, 'biases': bias_params}
260 Set the parameters of LSTM based on predefined values 262 weight_params = GetLSTMParamNames()[
'weights']
263 bias_params = GetLSTMParamNames()[
'biases']
264 for input_type
in param_values.keys():
265 weight_values = [param_values[input_type][w].flatten()
for w
in weight_params]
267 for w
in weight_values:
268 wmat = np.append(wmat, w)
269 bias_values = [param_values[input_type][b].flatten()
for b
in bias_params]
271 for b
in bias_values:
272 bm = np.append(bm, b)
274 weights_blob = lstm_pblobs[input_type][
'weights']
275 bias_blob = lstm_pblobs[input_type][
'biases']
281 wmat.reshape(cur_weight.shape).astype(np.float32))
284 bm.reshape(cur_biases.shape).astype(np.float32))
287 def cudnn_LSTM(model, input_blob, initial_states, dim_in, dim_out,
288 scope, recurrent_params=None, input_params=None,
289 num_layers=1, return_params=False):
291 CuDNN version of LSTM for GPUs. 292 input_blob Blob containing the input. Will need to be available 293 when param_init_net is run, because the sequence lengths 294 and batch sizes will be inferred from the size of this 296 initial_states tuple of (hidden_init, cell_init) blobs 297 dim_in input dimensions 298 dim_out output/hidden dimension 299 scope namescope to apply 300 recurrent_params dict of blobs containing values for recurrent 301 gate weights, biases (if None, use random init values) 302 See GetLSTMParamNames() for format. 303 input_params dict of blobs containing values for input 304 gate weights, biases (if None, use random init values) 305 See GetLSTMParamNames() for format. 306 num_layers number of LSTM layers 307 return_params if True, returns (param_extract_net, param_mapping) 308 where param_extract_net is a net that when run, will 309 populate the blobs specified in param_mapping with the 310 current gate weights and biases (input/recurrent). 311 Useful for assigning the values back to non-cuDNN 315 weight_params = GetLSTMParamNames()[
'weights']
316 bias_params = GetLSTMParamNames()[
'biases']
318 input_weight_size = dim_out * dim_in
319 recurrent_weight_size = dim_out * dim_out
320 input_bias_size = dim_out
321 recurrent_bias_size = dim_out
323 def init(layer, pname, input_type):
324 if pname
in weight_params:
325 sz = input_weight_size
if input_type ==
'input' \
326 else recurrent_weight_size
327 elif pname
in bias_params:
328 sz = input_bias_size
if input_type ==
'input' \
329 else recurrent_bias_size
331 assert False,
"unknown parameter type {}".format(pname)
332 return model.param_init_net.UniformFill(
334 "lstm_init_{}_{}_{}".format(input_type, pname, layer),
338 total_sz = 4 * num_layers * (
339 input_weight_size + recurrent_weight_size + input_bias_size +
343 weights = model.param_init_net.UniformFill(
344 [],
"lstm_weight", shape=[total_sz])
346 model.params.append(weights)
347 model.weights.append(weights)
350 'hidden_size': dim_out,
354 'input_mode':
'linear',
355 'num_layers': num_layers,
359 param_extract_net =
core.Net(
"lstm_param_extractor")
360 param_extract_net.AddExternalInputs([input_blob, weights])
361 param_extract_mapping = {}
368 for input_type
in [
'input',
'recurrent']:
369 param_extract_mapping[input_type] = {}
370 p = recurrent_params
if input_type ==
'recurrent' else input_params
373 for pname
in weight_params + bias_params:
374 for j
in range(0, num_layers):
375 values = p[pname]
if pname
in p
else init(j, pname, input_type)
376 model.param_init_net.RecurrentParamSet(
377 [input_blob, weights, values],
380 input_type=input_type,
384 if pname
not in param_extract_mapping[input_type]:
385 param_extract_mapping[input_type][pname] = {}
386 b = param_extract_net.RecurrentParamGet(
387 [input_blob, weights],
388 [
"lstm_{}_{}_{}".format(input_type, pname, j)],
390 input_type=input_type,
394 param_extract_mapping[input_type][pname][j] = b
396 (hidden_input_blob, cell_input_blob) = initial_states
397 output, hidden_output, cell_output, rnn_scratch, dropout_states = \
399 [input_blob, cell_input_blob, cell_input_blob, weights],
400 [
"lstm_output",
"lstm_hidden_output",
"lstm_cell_output",
401 "lstm_rnn_scratch",
"lstm_dropout_states"],
402 seed=random.randint(0, 100000),
405 model.net.AddExternalOutputs(
406 hidden_output, cell_output, rnn_scratch, dropout_states)
409 param_extract = param_extract_net, param_extract_mapping
410 return output, hidden_output, cell_output, param_extract
412 return output, hidden_output, cell_output
425 weighted_encoder_outputs,
427 lstm_memory_optimization,
428 attention_memory_optimization,
430 super(LSTMWithAttentionCell, self).__init__(name)
437 assert attention_type
in [
438 AttentionType.Regular,
439 AttentionType.Recurrent,
456 attention_weighted_encoder_context_t_prev,
459 gates_concatenated_input_t, _ = model.net.Concat(
460 [hidden_t_prev, attention_weighted_encoder_context_t_prev],
462 self.
scope(
'gates_concatenated_input_t'),
463 self.
scope(
'_gates_concatenated_input_t_concat_dims'),
468 gates_concatenated_input_t,
469 self.
scope(
'gates_t'),
474 model.net.Sum([gates_t, input_t], gates_t)
476 hidden_t_intermediate, cell_t = model.net.LSTMUnit(
484 [
'hidden_t_intermediate', self.
scope(
'cell_t')],
488 attention_weighted_encoder_context_t,
489 self.attention_weights_3d,
491 ) = apply_recurrent_attention(
496 decoder_hidden_state_t=hidden_t_intermediate,
499 attention_weighted_encoder_context_t_prev=(
500 attention_weighted_encoder_context_t_prev
505 attention_weighted_encoder_context_t,
506 self.attention_weights_3d,
508 ) = apply_regular_attention(
513 decoder_hidden_state_t=hidden_t_intermediate,
517 hidden_t = model.Copy(hidden_t_intermediate, self.
scope(
'hidden_t'))
518 model.net.AddExternalOutputs(
521 attention_weighted_encoder_context_t,
528 return hidden_t, cell_t, attention_weighted_encoder_context_t
530 def get_attention_weights(self):
532 return self.attention_weights_3d
534 def prepare_input(self, model, input_blob):
538 self.
scope(
'encoder_outputs_transposed'),
544 self.
scope(
'weighted_encoder_outputs'),
558 def get_state_names(self):
560 self.
scope(
'hidden_t'),
561 self.
scope(
'cell_t'),
562 self.
scope(
'attention_weighted_encoder_context_t'),
565 def get_outputs_with_grads(self):
568 def get_output_size(self):
575 decoder_input_lengths,
576 initial_decoder_hidden_state,
577 initial_decoder_cell_state,
578 initial_attention_weighted_encoder_context,
584 attention_type=AttentionType.Regular,
585 outputs_with_grads=(0, 4),
586 weighted_encoder_outputs=
None,
587 lstm_memory_optimization=
False,
588 attention_memory_optimization=
False,
592 Adds an LSTM with attention mechanism to a model. 594 The implementation is based on https://arxiv.org/abs/1409.0473, with 595 a small difference in the order 596 how we compute new attention context and new hidden state, similarly to 597 https://arxiv.org/abs/1508.04025. 599 The model uses encoder-decoder naming conventions, 600 where the decoder is the sequence the op is iterating over, 601 while computing the attention context over the encoder. 603 model: CNNModelHelper object new operators would be added to 605 decoder_inputs: the input sequence in a format T x N x D 606 where T is sequence size, N - batch size and D - input dimension 608 decoder_input_lengths: blob containing sequence lengths 609 which would be passed to LSTMUnit operator 611 initial_decoder_hidden_state: initial hidden state of LSTM 613 initial_decoder_cell_state: initial cell state of LSTM 615 initial_attention_weighted_encoder_context: initial attention context 617 encoder_output_dim: dimension of encoder outputs 619 encoder_outputs: the sequence, on which we compute the attention context 622 decoder_input_dim: input dimension (last dimension on decoder_inputs) 624 decoder_state_dim: size of hidden states of LSTM 626 attention_type: One of: AttentionType.Regular, AttentionType.Recurrent. 627 Determines which type of attention mechanism to use. 629 outputs_with_grads : position indices of output blobs which will receive 630 external error gradient during backpropagation 632 weighted_encoder_outputs: encoder outputs to be used to compute attention 633 weights. In the basic case it's just linear transformation of 634 encoder outputs (that's the default, when weighted_encoder_outputs is None).
635 However, it can be something more complicated - like a separate 636 encoder network (for example, in case of convolutional encoder) 638 lstm_memory_optimization: recompute LSTM activations on backward pass, so 639 we don't need to store their values in forward passes 641 attention_memory_optimization: recompute attention for backward pass 644 encoder_output_dim=encoder_output_dim,
645 encoder_outputs=encoder_outputs,
646 decoder_input_dim=decoder_input_dim,
647 decoder_state_dim=decoder_state_dim,
649 attention_type=attention_type,
650 weighted_encoder_outputs=weighted_encoder_outputs,
651 forget_bias=forget_bias,
652 lstm_memory_optimization=lstm_memory_optimization,
653 attention_memory_optimization=attention_memory_optimization,
655 return cell.apply_over_sequence(
657 inputs=decoder_inputs,
658 seq_lengths=decoder_input_lengths,
660 initial_decoder_hidden_state,
661 initial_decoder_cell_state,
662 initial_attention_weighted_encoder_context,
664 outputs_with_grads=
None,
689 alpha = model.param_init_net.ConstantFill(
691 [self.
scope(
'alpha')],
695 beta1 = model.param_init_net.ConstantFill(
697 [self.
scope(
'beta1')],
701 beta2 = model.param_init_net.ConstantFill(
703 [self.
scope(
'beta2')],
707 b = model.param_init_net.ConstantFill(
713 model.params.extend([alpha, beta1, beta2, b])
716 alpha_tdash = model.net.Mul(
718 self.
scope(
'alpha_tdash')
721 alpha_tdash_rs, _ = model.net.Reshape(
723 [self.
scope(
'alpha_tdash_rs'), self.
scope(
'alpha_tdash_old_shape')],
726 alpha_t = model.net.Mul(
727 [alpha_tdash_rs, alpha],
728 self.
scope(
'alpha_t'),
734 prev_t_rs, _ = model.net.Reshape(
736 [self.
scope(
'prev_t_rs'), self.
scope(
'prev_t_old_shape')],
739 beta1_t = model.net.Mul(
741 self.
scope(
'beta1_t'),
747 input_t_rs, _ = model.net.Reshape(
749 [self.
scope(
'input_t_rs'), self.
scope(
'input_t_old_shape')],
752 beta2_t = model.net.Mul(
754 self.
scope(
'beta2_t'),
759 gates_tdash = model.net.Sum(
760 [alpha_t, beta1_t, beta2_t],
761 self.
scope(
'gates_tdash')
763 gates_t = model.net.Add(
765 self.
scope(
'gates_t'),
770 gates_t_rs, _ = model.net.Reshape(
772 [self.
scope(
'gates_t_rs'), self.
scope(
'gates_t_old_shape')],
776 hidden_t_intermediate, cell_t = model.net.LSTMUnit(
777 [hidden_t_prev, cell_t_prev, gates_t_rs, seq_lengths, timestep],
778 [self.
scope(
'hidden_t_intermediate'), self.
scope(
'cell_t')],
781 hidden_t = model.Copy(hidden_t_intermediate, self.
scope(
'hidden_t'))
782 model.net.AddExternalOutputs(
788 return hidden_t, cell_t
791 def MILSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out,
792 scope, outputs_with_grads=(0,), memory_optimization=
False,
795 Adds MI flavor of standard LSTM recurrent network operator to a model. 796 See https://arxiv.org/pdf/1606.06630.pdf 798 model: CNNModelHelper object new operators would be added to 800 input_blob: the input sequence in a format T x N x D 801 where T is sequence size, N - batch size and D - input dimension 803 seq_lengths: blob containing sequence lengths which would be passed to 806 initial_states: a tuple of (hidden_input_blob, cell_input_blob) 807 which are going to be inputs to the cell net on the first iteration 809 dim_in: input dimension 811 dim_out: output dimension 813 outputs_with_grads : position indices of output blobs which will receive 814 external error gradient during backpropagation 816 memory_optimization: if enabled, the LSTM step is recomputed on backward step 817 so that we don't need to store forward activations for each 818 timestep. Saves memory with cost of computation. 823 forget_bias=forget_bias,
824 memory_optimization=memory_optimization,
827 result = cell.apply_over_sequence(
830 seq_lengths=seq_lengths,
831 initial_states=initial_states,
832 outputs_with_grads=outputs_with_grads,
850 attention_weighted_encoder_context_t_prev,
853 gates_concatenated_input_t, _ = model.net.Concat(
854 [hidden_t_prev, attention_weighted_encoder_context_t_prev],
856 self.
scope(
'gates_concatenated_input_t'),
857 self.
scope(
'_gates_concatenated_input_t_concat_dims'),
864 gates_concatenated_input_t,
865 self.
scope(
'prev_t'),
871 alpha = model.param_init_net.ConstantFill(
873 [self.
scope(
'alpha')],
877 beta1 = model.param_init_net.ConstantFill(
879 [self.
scope(
'beta1')],
883 beta2 = model.param_init_net.ConstantFill(
885 [self.
scope(
'beta2')],
889 b = model.param_init_net.ConstantFill(
895 model.params.extend([alpha, beta1, beta2, b])
898 alpha_tdash = model.net.Mul(
900 self.
scope(
'alpha_tdash')
903 alpha_tdash_rs, _ = model.net.Reshape(
905 [self.
scope(
'alpha_tdash_rs'), self.
scope(
'alpha_tdash_old_shape')],
908 alpha_t = model.net.Mul(
909 [alpha_tdash_rs, alpha],
910 self.
scope(
'alpha_t'),
916 prev_t_rs, _ = model.net.Reshape(
918 [self.
scope(
'prev_t_rs'), self.
scope(
'prev_t_old_shape')],
921 beta1_t = model.net.Mul(
923 self.
scope(
'beta1_t'),
929 input_t_rs, _ = model.net.Reshape(
931 [self.
scope(
'input_t_rs'), self.
scope(
'input_t_old_shape')],
934 beta2_t = model.net.Mul(
936 self.
scope(
'beta2_t'),
941 gates_tdash = model.net.Sum(
942 [alpha_t, beta1_t, beta2_t],
943 self.
scope(
'gates_tdash')
945 gates_t = model.net.Add(
947 self.
scope(
'gates_t'),
952 gates_t_rs, _ = model.net.Reshape(
954 [self.
scope(
'gates_t_rs'), self.
scope(
'gates_t_old_shape')],
958 hidden_t_intermediate, cell_t = model.net.LSTMUnit(
959 [hidden_t_prev, cell_t_prev, gates_t_rs, seq_lengths, timestep],
960 [self.
scope(
'hidden_t_intermediate'), self.
scope(
'cell_t')],
965 attention_weighted_encoder_context_t,
966 self.attention_weights_3d,
969 apply_recurrent_attention(
974 decoder_hidden_state_t=hidden_t_intermediate,
977 attention_weighted_encoder_context_t_prev=(
978 attention_weighted_encoder_context_t_prev
984 attention_weighted_encoder_context_t,
985 self.attention_weights_3d,
988 apply_regular_attention(
993 decoder_hidden_state_t=hidden_t_intermediate,
998 hidden_t = model.Copy(hidden_t_intermediate, self.
scope(
'hidden_t'))
999 model.net.AddExternalOutputs(
1002 attention_weighted_encoder_context_t,
1004 return hidden_t, cell_t, attention_weighted_encoder_context_t
def LSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out, scope, outputs_with_grads=(0,), return_params=False, memory_optimization=False, forget_bias=0.0)
def InitFromLSTMParams(lstm_pblobs, param_values)
attention_memory_optimization
Module caffe2.python.scope.
def MILSTM(model, input_blob, seq_lengths, initial_states, dim_in, dim_out, scope, outputs_with_grads=(0,), memory_optimization=False, forget_bias=0.0)
def LSTMWithAttention(model, decoder_inputs, decoder_input_lengths, initial_decoder_hidden_state, initial_decoder_cell_state, initial_attention_weighted_encoder_context, encoder_output_dim, encoder_outputs, decoder_input_dim, decoder_state_dim, scope, attention_type=AttentionType.Regular, outputs_with_grads=(0, 4), weighted_encoder_outputs=None, lstm_memory_optimization=False, attention_memory_optimization=False, forget_bias=0.0)
def cudnn_LSTM(model, input_blob, initial_states, dim_in, dim_out, scope, recurrent_params=None, input_params=None, num_layers=1, return_params=False)
encoder_outputs_transposed
def FeedBlob(name, arr, device_option=None)
def prepare_input(self, model, input_blob)
def _apply(self, model, input_t, seq_lengths, states, timestep)
def recurrent_net(net, cell_net, inputs, initial_cell_inputs, links, timestep=None, scope=None, outputs_with_grads=(0,), recompute_blobs_on_backward=None)
def get_state_names(self)