# Shared default hyperparameters.
# The mapping is anchored so that presets can pull it in through a YAML merge
# key (`<<`). A merge is shallow, and any key a preset sets explicitly takes
# precedence over the value listed here.
extra_hyperparameters: &extra_hyperparameters
    lr_account_for_nbr_actor: False 
    weights_decay_lambda: 0.0
    weights_entropy_lambda: 0.0 #01  NOTE(review): "01" looks like the tail of a former 0.001 value — confirm it can be dropped.
    use_target_to_gather_data:    False

    ####################################
    # New hyperparameters:
    PER_compute_initial_priority: False
    #####################################
    
    # Sequence-replay defaults (presets in this file override most of them).
    sequence_replay_use_online_states: True
    sequence_replay_use_zero_initial_states: False
    sequence_replay_store_on_terminal: True
    
    r2d2_loss_masking: True
    r2d2_loss_masking_n_step_regularisation: True
    r2d2_bellman_target_SAD: False

    # Burn-in is enabled by default here with a length of 20; the presets
    # visible in this file all turn it off and set the length to 0.
    burn_in: True
    sequence_replay_unroll_length: 80
    sequence_replay_overlap_length: 40
    sequence_replay_burn_in_length: 20

    sequence_replay_PER_eta: 0.9

    # VDN is off by default; the agent presets re-enable it explicitly.
    vdn: False 
    vdn_nbr_players: 2

LargeMLP: &LargeMLP
        # Body selection: 'MLP' for phi, 'None' for the actor, 'LSTM-RNN' for the critic.
        phi_arch: 'MLP'
        actor_arch: 'None'
        critic_arch: 'LSTM-RNN'

        # Phi body. The commented keys are convolutional settings kept for reference.
        #phi_arch_channels: ['BN32', 'BN64', 'BN64']
        #phi_arch_kernels: [8, 4, 3]
        #phi_arch_strides: [4, 2, 1]
        #phi_arch_paddings: [1, 1, 1]
        phi_arch_feature_dim: 128
        phi_arch_hidden_units: [512, 256]

        # Extra inputs.
        # Each entry is a key living inside the 'infos' output of the OpenAI Gym
        # environment. `shape` is the expected shape of that extra input and
        # `target_location` is where the input should be stored in the framestate.
        # When `shape` contains a string, parsing of the shape infers where to
        # fetch the actual value.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']

            communication_channel:
                shape: [55]
                target_location: ['critic_body', 'extra_inputs']
            #other_agent_id:
            #    shape: [10]
            #    target_location: ['critic_body', 'extra_inputs']
            role_id:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            mode_id:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            previous_game_result:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            previous_game_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']

            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            # Unlike the entries above, legal_actions is routed to the head.
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        #critic_arch_feature_dim: 32
        critic_arch_hidden_units: [128, 128]

LargeMLP_SAD: &LargeMLP_SAD
        # Same layout as the LargeMLP preset, with the `sad` flag switched on
        # and two additional extra inputs (round_id and greedy_action).
        sad: True

        phi_arch: 'MLP' #-LSTM-RNN'
        actor_arch: 'None'
        critic_arch: 'LSTM-RNN'

        # Phi body. The commented keys are convolutional settings kept for reference.
        #phi_arch_channels: ['BN32', 'BN64', 'BN64']
        #phi_arch_channels: [32, 64, 64]
        #phi_arch_kernels: [8, 4, 3]
        #phi_arch_kernels: [3, 3, 3]
        #phi_arch_strides: [4, 2, 1]
        #phi_arch_strides: [2, 2, 1]
        #phi_arch_paddings: [1, 1, 1]
        phi_arch_feature_dim: 128
        phi_arch_hidden_units: [512, 256]

        # Extra inputs.
        # Each entry is a key living inside the 'infos' output of the OpenAI Gym
        # environment. `shape` is the expected shape of that extra input and
        # `target_location` is where the input should be stored in the framestate.
        # When `shape` contains a string, parsing of the shape infers where to
        # fetch the actual value.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']

            communication_channel:
                shape: [55]
                target_location: ['critic_body', 'extra_inputs']
            round_id:
                shape: [4]
                target_location: ['critic_body', 'extra_inputs']

            #other_agent_id:
            #    shape: [10]
            #    target_location: ['critic_body', 'extra_inputs']
            role_id:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            mode_id:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            previous_game_result:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            previous_game_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']

            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            # Unlike the other entries, legal_actions is routed to the head.
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

            ########################
            # WITH SAD:
            ########################
            greedy_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            ########################
            ########################

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        #critic_arch_feature_dim: 32
        critic_arch_hidden_units: [128, 128]


# Base R2D2 preset. The tags in the name mirror values set below:
# gradient_clip 0.5, batch_size 128, tau 4.0e-4, learning_rate 6.25e-5,
# unroll length 40 (L40), overlap 10 (O10), burn-in length 0 (B0).
r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        #observation_resize_dim: 21 #56
        
        dueling: True
        noisy: False 
        n_step: 3

        use_PER: True
        PER_alpha: 0.9
        PER_beta: 0.6

        replay_capacity: 5242880 # in terms of experience #1e6
        # NOTE(review): YAML 1.1 loaders such as PyYAML resolve exponent
        # literals without a dot and a signed exponent (e.g. 4e5) as strings,
        # not floats — confirm the consumer casts this value.
        min_capacity: 4e5 #in terms of experiences... #1e4
        replay_period: 1
        
        actor_models_update_steps_interval: 10 #considering only 1 actor's steps.

        discount: 0.999
        use_cuda: True
        gradient_clip: 0.5
        batch_size: 128
        tau: 4.0e-4
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5

        epsstart: 1.0
        epsend: 0.1
        epsdecay: 10000
        eps_greedy_alpha: 7.0

        # The keys below are also present in extra_hyperparameters; the values
        # written here win because explicit keys override merged ones.
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: False
        
        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True
        
        burn_in: False
        sequence_replay_unroll_length: 40
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0

        sequence_replay_PER_eta: 0.9

        # Pull in the network architecture and the shared defaults with ONE
        # merge key. A mapping must not contain the same key twice, so writing
        # `<<` on two separate lines is rejected by strict loaders; the
        # list form is the way the merge-key type combines several mappings.
        # The two merged mappings have no key in common, so their relative
        # order inside the list does not affect the result.
        <<: [*LargeMLP_SAD, *extra_hyperparameters]

# Scalar anchors for the task/communication dimensions, available for reuse
# through aliases elsewhere in this file (e.g. `*nbr_latents`).
max_sentence_length: &max_sentence_length 3
vocab_size: &vocab_size 6
# One more than vocab_size; presumably the extra slot is the EoS symbol — confirm.
vocab_eos_size: &vocab_eos_size 7
nbr_latents: &nbr_latents 3 

# Experiment definition: the list of tasks to run plus global run settings.
experiment:
    tasks: [{
        #'env-id': 'SymbolicBehaviourBenchmark-ReceptiveConstructiveTestEnv-2Shots-v0',
        'env-id': 'SymbolicBehaviourBenchmark-ReceptiveConstructiveTestEnv-v0',
        'env-config': {
            "nbr_communication_rounds": 1,
            # vocab_size, max_sentence_length and nbr_latents are taken from
            # the top-level anchors so the environment and the rest of the
            # file cannot drift apart (values are unchanged: 6, 3 and 3).
            "vocab_size": *vocab_size, #20,
            #"max_sentence_length": 4, #3
            "max_sentence_length": *max_sentence_length,
            "descriptive": True,
            #"nbr_latents": 4, #3
            "nbr_latents": *nbr_latents,
            "min_nbr_values_per_latent": 2,
            "max_nbr_values_per_latent": 3,
            "nbr_object_centric_samples": 1,
            "nbr_distractors": 0, # used to be 3, but the component-focused approach raises an exception.
            "use_communication_channel_permutations": True,
            "allow_listener_query": False,
            "provide_listener_feedback": True,
            "sampling_strategy": "component-focused-2shots",
        },
        
        # NOTE(review): the run-id tags 'Max5' and 'CompFocSampling1shots' do
        # not match max_nbr_values_per_latent (3) and sampling_strategy
        # ("component-focused-2shots") above — confirm which one is intended.
        #'run-id': 'Train-Reward1/venv128/V6-MSL3-NCR1-L3Min2Max5-Descr-Distr0-OC1-CommPerm-CompFocSampling2shots-ListFeedback',
        'run-id': 'Train-Reward1/venv128/V6-MSL3-NCR1-L3Min2Max5-Descr-Distr0-OC1-CommPerm-CompFocSampling1shots-ListFeedback',
        
        # The agent-id is identical to one of the keys under `agents:` in this file.
        #'agent-id': '3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L150_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        'agent-id': '3step_SAD_VDN_aID_0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPLSTM_GradClip5m0_r5p5Min1e3_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        
        'nbr_actor': 128,
        #'nbr_actor': 100,
        #'nbr_actor': 64,
        #'nbr_actor': 32,
        #'nbr_actor': 16,
        #'nbr_actor': 8,
        #'nbr_frame_skipping': 4,
        #'nbr_frame_stacking': 4,
        #'grayscale': True,
        #'single_life_episode': True, #False,
        #'nbr_max_random_steps': 30,
        #'sad': False, 
        'sad': True,
        #'vdn': False, 
        'vdn': True,
        #"otherplay": True,
        "otherplay": False,
        'clip_reward': False,
        'previous_reward_action': True,
        #'observation_resize_dim': (21,21), #(56,56),
        'observation_resize_dim': 56, #(56,56),
        #
        # 'None' is a quoted string here, not a YAML null.
        'reload': 'None',
        },
    ]
    experiment_id: 'r2d2_s2b_debug'
    benchmarking_episodes: 10
    benchmarking_interval: 1.0e4
    benchmarking_record_episode_interval: 'None' #1.0e1 #1.0e20
    #benchmarking_record_episode_interval: 1.0e20
    train_observation_budget: 1.0e7
    seed: 1
    
agents:
    # Parent agent preset. It starts from the base r2d2 preset (merge key) and
    # overrides it with the values below; the tags in the name mirror those
    # values (gamma997 -> discount 0.997, GradClip5m0 -> gradient_clip 5.0,
    # b32 -> batch_size 32, L100/O0/B0 -> unroll 100, overlap 0, burn-in 0).
    # It is anchored so that another agent entry can merge it in turn.
    SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone: &SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone
        <<: *r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        actor_models_update_steps_interval: 1 #considering only 1 actor's steps.

        vdn: True 
        vdn_nbr_players: 2

        batch_size: 32
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5
        discount: 0.997
        gradient_clip: 5.0 
        # ...not specified in r2d2 paper but in Ape-X,
        # and r2d2 paper says that missing hyper-param
        # are the same as ape-X
        
        # NOTE(review): YAML 1.1 loaders such as PyYAML resolve exponent
        # literals like 5e4 / 2e4 as strings — confirm the consumer casts them.
        replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6
        min_capacity: 2e4 #in terms of experiences... #1e4
        
        PER_compute_initial_priority: False
        # Unquoted None is the string "None" in YAML, not a null value.
        PER_beta_increase_interval: None #2e5
        
        double: True
        dueling: True 
        noisy: False
        n_step: 3
        tau: 4.0e-4
        
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: True
        
        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True
        r2d2_bellman_target_SAD: False 

        burn_in: False
        sequence_replay_unroll_length: 100
        sequence_replay_overlap_length: 0
        sequence_replay_burn_in_length: 0
        

        epsstart: 1.0
        epsend: 0.05
        epsdecay: 30000 #1000000
        
        # ape-X and r2d2 keep it constant over each actor 
        # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors)
        # with base_eps=0.4 and \alpha = 7...
        eps_greedy_alpha: 7.0
    
    #3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L150_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    3step_SAD_VDN_aID_0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPLSTM_GradClip5m0_r5p5Min1e3_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
        # Agent entry whose key equals the 'agent-id' of the experiment task.
        # It merges the parent agent preset and overrides it with the keys
        # below. Several keys (vdn, sad, learning_rate, n_step, burn_in, ...)
        # restate the value already inherited.
        # NOTE(review): the name tag 'r5p5Min1e3' does not match
        # replay_capacity (5e4) / min_capacity (4e4) below — confirm.
        <<: *SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone
        min_capacity: 4e4

        weights_entropy_lambda: 0.0 
        #weights_entropy_lambda: 0.1
        #weights_entropy_lambda: 0.001 #01
    
        #vdn: False 
        vdn: True 
        vdn_nbr_players: 2
        #sad: False 
        sad: True

        
        learning_rate: 6.25e-5
        #adam_eps: 1.5e-5
        #learning_rate: 1.0e-3
        #adam_eps: 1.0e-8
        adam_eps: 1.0e-12
        #adam_eps: 1.0e-15

        replay_capacity: 5e4 #5e5
        #replay_capacity: 1e5 
        #min_capacity: 3e4 #in terms of experiences... #1e4
        
        n_step: 3
        #n_step: 7

        #tau: 4.0e-4
        #tau: 1.0e-5
        
        #sequence_replay_overlap_length: 0
        #sequence_replay_overlap_length: 50
        
        batch_size: 128
        
        burn_in: False
        #burn_in: True

        # Active sequence-replay setting (matches the L20_O10_B0 name tags).
        sequence_replay_unroll_length: 20
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0
        # #sequence_replay_burn_in_length: 10
        
        # sequence_replay_unroll_length: 100
        # sequence_replay_overlap_length: 50
        # sequence_replay_burn_in_length: 0
        
        epsend: 0.4
        eps_greedy_alpha: 2.0
        
        # Architecture:
        # Only the "Arch2" group below is active; the other groups are
        # commented-out alternatives.
        #critic_arch: 'LSTM-RNN'
        #critic_arch_hidden_units: [512, 512]
        #critic_arch_hidden_units: [512]
        #use_relu_after_rnn: False 

        # normal arch:
        # critic_arch: 'MLP-LSTM-RNN'
        # use_relu_after_rnn: True 
        # #use_relu_after_rnn: False 
        # critic_arch_feature_dim: 512
        # critic_arch_hidden_units: [512]

        # Arch2:
        critic_arch: 'MLP-LSTM-RNN2'
        use_relu_after_rnn: False #True 
        use_residual_connection: True 
        critic_arch_linear_hidden_units: [1024,]
        critic_arch_feature_dim: 512
        critic_arch_hidden_units: [512, 512]

        # Arch DNC:
        # critic_arch: 'DNC'
        # critic_arch_feature_dim: 128
        # critic_arch_hidden_units: [128]
        #critic_arch_hidden_units: [128, 128]
        # DNC_sparse_K: 0

        # Arch 3:
        # critic_arch: 'MLP-LSTM-RNN2'
        # use_relu_after_rnn: True 
        # critic_arch_linear_hidden_units: [128]
        # critic_arch_feature_dim: 64
        # critic_arch_hidden_units: [64]

        #Arch 4:
        # critic_arch: 'MLP-LSTM-RNN2'
        # use_relu_after_rnn: True 
        # critic_arch_linear_hidden_units: [512, 256]
        # critic_arch_hidden_units: [256]
        # critic_arch_linear_post_hidden_units: [256]
        # critic_arch_feature_dim: 128

        # Modular model description: a set of named modules wired together
        # through stream ids, grouped into a 'torso' and a 'head' pipeline.
        ArchiModel:
            model_id: 'RL_LSTM_Model'
            # Template placeholders such as "{{hidden_dim}}" only appear in
            # comments below; the modules currently use literal values.
            hyperparameters:
                value_dim: 64
                key_dim: 64
                hidden_dim: 512
                output_dim: 256
                action_dim: 8
                # Anchored because both the concatenation output_dim and the
                # CoreLSTM state_dim reuse it.
                # NOTE(review): 245 = 32 + 32 + 1 + 171 + 2 + 2 + 2 + 2 + 1 if
                # each encoder outputs its last hidden size (32) and
                # previous_action is 171 wide (RLHead action_dim) — confirm.
                lstm_input_dim: &lstm_input_dim 245
            input_stream_ids:
                "inputs:obs" : "observations:obs"
                "inputs:legal_actions" : "frame_states:legal_actions"
            modules:
                # Fully-connected encoder for the observation stream.
                'ObsEncoderFCN':
                    type: FullyConnectedNetworkModule
                    state_dim: *nbr_latents #"{{hidden_dim}}"
                    hidden_units: [64, 32,] #64,] # 32] #["{{output_dim}}"]
                    non_linearities: ['ReLU']
                    dropout: 0.0
                    config: None
                    input_stream_ids:
                        input: "inputs:obs"
                    output_stream_ids:
                        input: "inputs:ObsEncoderFCN:processed_input"
                    use_cuda: True
                # Previous, recurrent-embedding version of the communication
                # encoder, kept for reference:
                #'CommEncoder':
                 #   type: EmbeddingRNNModule 
                 #   vocab_size: *vocab_size
                 #   feature_dim: None #32 
                 #   embedding_size: 32 
                 #   hidden_units: 32
                 #   num_layers: 1
                 #   gate: None #F.relu, 
                 #   dropout: 0.0 
                 #   rnn_fn: "GRU"
                 #   
                 #   padding_idx: 0 
                 #   #WARNING: it is very important that the grounded EoS symbol be the padding_idx...
                 #   
                 #   config: None
                 #   input_stream_ids:
                 #       input: "inputs:phi_body:extra_inputs:communication_channel"
                 #   output_stream_ids:
                 #       input: "inputs:CommEncoder:processed_input"
                 #   use_cuda: True
                # Fully-connected encoder for the multi-binary communication channel.
                'CommEncoder':
                    type: FullyConnectedNetworkModule
                    #state_dim: 30 
                    # NOTE(review): state_dim is 15 while this agent's
                    # extra_inputs_infos declares
                    # multi_binary_communication_channel with shape [21] — confirm.
                    state_dim: 15  #"{{hidden_dim}}"
                    hidden_units: [64, 32,] #64,] # 32] #["{{output_dim}}"]
                    non_linearities: ['ReLU']
                    dropout: 0.0
                    config: None
                    input_stream_ids:
                        input: "inputs:phi_body:extra_inputs:multi_binary_communication_channel"
                    output_stream_ids:
                        input: "inputs:CommEncoder:processed_input"
                    use_cuda: True
                # Concatenates both encoder outputs with the critic_body extra
                # inputs along the last dimension (dim -1).
                'ConcatenationOperation':
                    type: ConcatenationOperationModule
                    config:
                        'dim': -1
                        'use_cuda': True
                        'output_dim': *lstm_input_dim #245 
                    input_stream_ids:
                        input0: "inputs:ObsEncoderFCN:processed_input"
                        input1: "inputs:CommEncoder:processed_input"
                        input2: "inputs:critic_body:extra_inputs:previous_reward"
                        input3: "inputs:critic_body:extra_inputs:previous_action"
                        input4: "inputs:critic_body:extra_inputs:round_id"
                        input5: "inputs:critic_body:extra_inputs:role_id"
                        input6: "inputs:critic_body:extra_inputs:mode_id"
                        input7: "inputs:critic_body:extra_inputs:previous_game_result"
                        input8: "inputs:critic_body:extra_inputs:previous_game_reward"
                # Recurrent core fed by the concatenation output.
                'CoreLSTM':
                    type: LSTMModule
                    state_dim: *lstm_input_dim #130 #65 #"{{key_dim}+1}"
                    hidden_units: [512,] #["{{hidden_dim}}"]
                    non_linearities: [None]
                    config: None
                    input_stream_ids:
                        lstm_input: "modules:ConcatenationOperation:output"
                        lstm_hidden: "inputs:CoreLSTM:hidden"
                        lstm_cell: "inputs:CoreLSTM:cell"
                        iteration: "inputs:CoreLSTM:iteration"
                    use_cuda: True
                # Categorical RL head reading the CoreLSTM output and the action mask.
                'RLHead':
                    type: RLCategoricalHeadModule
                    # NOTE(review): 683 = 512 (CoreLSTM hidden_units) + 171
                    # (action_dim); presumably the LSTM output is concatenated
                    # with action_mask — confirm.
                    state_dim: 683 
                    action_dim: 171 #519 #"{{action_dim}}"
                    noisy: False
                    dueling: True
                    config: None
                    input_stream_ids: 
                        input0: "modules:CoreLSTM:output"
                        input8: "inputs:critic_body:extra_inputs:action_mask"
                        action: "inputs:action"
                        legal_actions: "inputs:head:extra_inputs:legal_actions"
                    use_cuda: True
            # Names exposed by the head pipeline and the RLHead stream each one maps to.
            output_mappings:
                head:
                    "a": "modules:RLHead:a"
                    #"greedy_action" : "modules:RLHead:greedy_action"
                    "ent" : "modules:RLHead:ent"
                    "legal_ent" : "modules:RLHead:legal_ent"
                    #"v" : "modules:RLHead:v"
                    "qa" : "modules:RLHead:qa"
                    #"log_pi_a" : "modules:RLHead:log_pi_a"
                    "log_a" : "modules:RLHead:log_a"
                    "unlegal_log_a" : "modules:RLHead:unlegal_log_a"
            
            input_mappings:
                    # Remaps the features to their original stream when using the head pipeline only:
                    head:
                        "inputs:obs": "modules:ConcatenationOperation:output"
            # Module execution order for each pipeline.
            pipelines:
                torso: 
                    - 'ObsEncoderFCN'
                    - 'CommEncoder' 
                    - 'ConcatenationOperation'
                head:
                    - 'CoreLSTM'
                    - 'RLHead'
            # Stream holding the features produced by the torso pipeline.
            features_id: 
                torso: "modules:ConcatenationOperation:output"
            
        # Extra inputs for this agent. Because a YAML merge is shallow, this
        # explicit key replaces, as a whole, the `extra_inputs_infos` mapping
        # inherited through the merge chain.
        # Each entry carries the expected `shape` of the input and the
        # `target_location` under which it is stored.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']

            # The communication channel is routed to phi_body here.
            multi_binary_communication_channel:
                shape: [21]
                target_location: ['phi_body', 'extra_inputs']
            #communication_channel:
            #    shape: [*max_sentence_length]
            #    target_location: ['phi_body', 'extra_inputs']
            round_id:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            #other_agent_id:
            #    shape: [10]
            #    target_location: ['critic_body', 'extra_inputs']
            role_id:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            mode_id:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            previous_game_result:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            previous_game_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']

            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

            ########################
            # WITH SAD:
            ########################
            #greedy_action:
            #    shape: [31] #[6223]
            #    target_location: ['critic_body', 'extra_inputs']
            ########################
            ########################
    
    
    
        
