# Shared hyperparameter defaults. The `&extra_hyperparameters` anchor lets
# other mappings in this file pull these values in through a merge key.
extra_hyperparameters: &extra_hyperparameters
    lr_account_for_nbr_actor: False
    weights_decay_lambda: 0.0
    weights_entropy_lambda: 0.0
    use_target_to_gather_data: False

    # -- Newer hyperparameter --
    PER_compute_initial_priority: False

    # -- Sequence replay: state handling --
    sequence_replay_use_online_states: True
    sequence_replay_use_zero_initial_states: False
    sequence_replay_store_on_terminal: True

    # -- R2D2 loss options --
    r2d2_loss_masking: True
    r2d2_loss_masking_n_step_regularisation: True
    r2d2_bellman_target_SAD: False

    # -- Sequence replay: burn-in and lengths --
    burn_in: True
    sequence_replay_unroll_length: 80
    sequence_replay_overlap_length: 40
    sequence_replay_burn_in_length: 20

    sequence_replay_PER_eta: 0.9

    # -- VDN --
    vdn: False
    vdn_nbr_players: 2

# Architecture preset exposed through the `&LargeMLP` anchor:
# 'MLP' phi body, actor set to 'None', 'LSTM-RNN' critic.
LargeMLP: &LargeMLP
    phi_arch: 'MLP'
    actor_arch: 'None'
    critic_arch: 'LSTM-RNN'

    # Phi body (the convolutional settings below are commented out):
    #phi_arch_channels: ['BN32', 'BN64', 'BN64']
    #phi_arch_kernels: [8, 4, 3]
    #phi_arch_strides: [4, 2, 1]
    #phi_arch_paddings: [1, 1, 1]
    phi_arch_feature_dim: 128
    phi_arch_hidden_units: [512, 256]

    # Dictionary of keys living inside the 'infos' output of the OpenAI Gym
    # environment. Each entry holds:
    #   shape:           the expected shape of the extra input;
    #   target_location: where the input should be stored in the framestate.
    # When a string is encountered while parsing the shape, it tells where to
    # fetch the actual value.
    extra_inputs_infos:
        previous_reward:
            shape: [1]
            target_location: ['critic_body', 'extra_inputs']
        previous_action:
            shape: ['task.action_dim']
            target_location: ['critic_body', 'extra_inputs']

        communication_channel:
            shape: [55]
            target_location: ['critic_body', 'extra_inputs']
        other_agent_id:
            shape: [10]
            target_location: ['critic_body', 'extra_inputs']
        role_id:
            shape: [2]
            target_location: ['critic_body', 'extra_inputs']
        mode_id:
            shape: [2]
            target_location: ['critic_body', 'extra_inputs']
        previous_game_result:
            shape: [2]
            target_location: ['critic_body', 'extra_inputs']
        previous_game_reward:
            shape: [1]
            target_location: ['critic_body', 'extra_inputs']

        action_mask:
            shape: ['task.action_dim']
            target_location: ['critic_body', 'extra_inputs']
        # The only entry routed to the head rather than the critic body.
        legal_actions:
            shape: ['task.action_dim']
            target_location: ['head', 'extra_inputs']

    # Actor architecture:
    actor_arch_hidden_units: []

    # Critic architecture:
    #critic_arch_feature_dim: 32
    critic_arch_hidden_units: [128, 128]

# Architecture preset exposed through the `&LargeMLP_SAD` anchor:
# 'MLP' phi body, actor set to 'None', 'LSTM-RNN' critic, with `sad` enabled
# and an additional 'greedy_action' extra input.
LargeMLP_SAD: &LargeMLP_SAD
    sad: True

    phi_arch: 'MLP'  # alternative noted in the original: 'MLP-LSTM-RNN'
    actor_arch: 'None'
    critic_arch: 'LSTM-RNN'

    # Phi body (the convolutional settings below are commented out):
    #phi_arch_channels: ['BN32', 'BN64', 'BN64']
    #phi_arch_channels: [32, 64, 64]
    #phi_arch_kernels: [8, 4, 3]
    #phi_arch_kernels: [3, 3, 3]
    #phi_arch_strides: [4, 2, 1]
    #phi_arch_strides: [2, 2, 1]
    #phi_arch_paddings: [1, 1, 1]
    phi_arch_feature_dim: 128
    phi_arch_hidden_units: [512, 256]

    # Dictionary of keys living inside the 'infos' output of the OpenAI Gym
    # environment. Each entry holds:
    #   shape:           the expected shape of the extra input;
    #   target_location: where the input should be stored in the framestate.
    # When a string is encountered while parsing the shape, it tells where to
    # fetch the actual value.
    extra_inputs_infos:
        previous_reward:
            shape: [1]
            target_location: ['critic_body', 'extra_inputs']
        previous_action:
            shape: ['task.action_dim']
            target_location: ['critic_body', 'extra_inputs']

        communication_channel:
            shape: [55]
            target_location: ['critic_body', 'extra_inputs']
        round_id:
            shape: [4]
            target_location: ['critic_body', 'extra_inputs']

        other_agent_id:
            shape: [10]
            target_location: ['critic_body', 'extra_inputs']
        role_id:
            shape: [2]
            target_location: ['critic_body', 'extra_inputs']
        mode_id:
            shape: [2]
            target_location: ['critic_body', 'extra_inputs']
        previous_game_result:
            shape: [2]
            target_location: ['critic_body', 'extra_inputs']
        previous_game_reward:
            shape: [1]
            target_location: ['critic_body', 'extra_inputs']

        action_mask:
            shape: ['task.action_dim']
            target_location: ['critic_body', 'extra_inputs']
        # The only entry routed to the head rather than the critic body.
        legal_actions:
            shape: ['task.action_dim']
            target_location: ['head', 'extra_inputs']

        # ---- WITH SAD ----
        greedy_action:
            shape: ['task.action_dim']
            target_location: ['critic_body', 'extra_inputs']

    # Actor architecture:
    actor_arch_hidden_units: []

    # Critic architecture:
    #critic_arch_feature_dim: 32
    critic_arch_hidden_units: [128, 128]


# R2D2 base configuration, exposed through the anchor of the same name.
# Keys written explicitly in this mapping take precedence over the values
# merged in from `*extra_hyperparameters` and `*LargeMLP_SAD` (see the merge
# key at the bottom).
r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        #observation_resize_dim: 21 #56

        dueling: True
        noisy: False
        n_step: 3

        use_PER: True
        PER_alpha: 0.9
        PER_beta: 0.6

        # NOTE(review): if this file is loaded with PyYAML, exponent literals
        # without a decimal point and a signed exponent (e.g. `4e5`) are read
        # as strings, not floats -- confirm the consumer casts them.
        replay_capacity: 5242880 # in terms of experience #1e6
        min_capacity: 4e5 #in terms of experiences... #1e4
        replay_period: 1

        actor_models_update_steps_interval: 10 #considering only 1 actor's steps.

        discount: 0.999
        use_cuda: True
        gradient_clip: 0.5
        batch_size: 128
        tau: 4.0e-4
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5

        epsstart: 1.0
        epsend: 0.1
        epsdecay: 10000
        eps_greedy_alpha: 7.0

        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: False

        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True

        burn_in: False
        sequence_replay_unroll_length: 40
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0

        sequence_replay_PER_eta: 0.9

        # One merge key carrying both aliases. A mapping must not contain the
        # same key twice, so two separate `<<` entries are rejected (or only
        # partially applied) by strict YAML loaders. The two merged mappings
        # have no key in common, so their relative order has no effect on the
        # resulting values.
        <<: [*extra_hyperparameters, *LargeMLP_SAD]


# Experiment definition: the list of tasks, then run-level settings.
experiment:
    tasks:
        - env-id: 'SymbolicBehaviourBenchmark-ReceptiveConstructiveTestEnv-v0'
          env-config:
              nbr_communication_rounds: 3
              vocab_size: 6
              max_sentence_length: 1
              nbr_latents: 3
              min_nbr_values_per_latent: 2
              max_nbr_values_per_latent: 5
              nbr_object_centric_samples: 1
              nbr_distractors: 3
              use_communication_channel_permutations: True
              allow_listener_query: False

          run-id: 'Train-Reward1/venv128/V6-MSL1-NCR3-L3Min2Max5-Distr3-OC1-CommPerm'

          # Same string as the key of the NTM entry declared under `agents:`.
          # Previous value, kept for reference:
          #agent-id: '3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L150_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone'
          agent-id: '3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPNTM_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone'

          # Commented-out alternatives: 100, 64, 32, 16, 8.
          nbr_actor: 128

          # Disabled task options, kept for reference:
          #nbr_frame_skipping: 4
          #nbr_frame_stacking: 4
          #grayscale: True
          #single_life_episode: True #False
          #nbr_max_random_steps: 30

          sad: True        # alternative: False
          vdn: True        # alternative: False
          otherplay: False # alternative: True
          clip_reward: False
          previous_reward_action: True
          # Commented-out alternative: (21,21); note on the current value: (56,56).
          observation_resize_dim: 56

          reload: 'None'

    experiment_id: 'r2d2_s2b_debug'
    benchmarking_episodes: 10
    # NOTE(review): if this file is loaded with PyYAML, `1.0e4` and `1.0e7`
    # (unsigned exponent) are read as strings -- confirm the consumer casts them.
    benchmarking_interval: 1.0e4
    benchmarking_record_episode_interval: 'None' #1.0e1 #1.0e20
    #benchmarking_record_episode_interval: 1.0e20
    train_observation_budget: 1.0e7
    seed: 1
    
agents:
    # Base SAD + VDN agent. It starts from the R2D2 base configuration (merge
    # key below) and overrides the keys listed here; its own anchor lets other
    # agent entries build on top of it.
    SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone: &SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone
        <<: *r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0

        # Counted in steps of a single actor.
        actor_models_update_steps_interval: 1

        # -- VDN --
        vdn: True
        vdn_nbr_players: 2

        # -- Optimisation --
        batch_size: 32
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5
        discount: 0.997
        gradient_clip: 5.0
        # ...not specified in the R2D2 paper but in Ape-X, and the R2D2 paper
        # says that missing hyperparameters are the same as in Ape-X.

        # -- Replay storage (sizes are in terms of experiences) --
        # Values noted previously for replay_capacity: 163840, 2e13*20, 5242880, 1e6.
        replay_capacity: 5e4
        # Value noted previously for min_capacity: 1e4.
        min_capacity: 2e4

        # -- Prioritised replay --
        PER_compute_initial_priority: False
        # The literal string 'None' (not a YAML null); value noted previously: 2e5.
        PER_beta_increase_interval: 'None'

        # -- Q-learning variants --
        double: True
        dueling: True
        noisy: False
        n_step: 3
        tau: 4.0e-4

        # -- Sequence replay: state handling --
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: True

        # -- R2D2 loss options --
        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True
        r2d2_bellman_target_SAD: False

        # -- Sequence replay: burn-in and lengths --
        burn_in: False
        sequence_replay_unroll_length: 100
        sequence_replay_overlap_length: 0
        sequence_replay_burn_in_length: 0

        # -- Exploration --
        epsstart: 1.0
        epsend: 0.05
        epsdecay: 30000 # value noted previously: 1000000

        # Ape-X and R2D2 keep epsilon constant for each actor, with a different
        # value per actor: eps_i = base_eps**(1 + alpha*i/nbr_actors),
        # with base_eps = 0.4 and alpha = 7...
        eps_greedy_alpha: 7.0
    
    # NTM-critic agent: starts from the base SAD + VDN agent (merge key below)
    # and overrides the keys listed here.
    # Previous key of this entry, kept for reference:
    #3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L150_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPNTM_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
        <<: *SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone

        # DEBUG value.
        # NOTE(review): if this file is loaded with PyYAML, `1e3` and `5e4`
        # are read as strings -- confirm the consumer casts them.
        min_capacity: 1e3
        #min_capacity: 3e4 #in terms of experiences... #1e4

        # Commented-out alternatives: 0.0, 0.1.
        weights_entropy_lambda: 0.001

        vdn: True   # alternative: False
        vdn_nbr_players: 2
        sad: True   # alternative: False

        # -- Optimisation --
        learning_rate: 6.25e-5
        #learning_rate: 1.0e-3
        # Commented-out alternatives for adam_eps: 1.5e-5, 1.0e-8, 1.0e-15.
        adam_eps: 1.0e-12

        # In terms of experiences.
        # Values noted previously: 163840, 2e13*20, 5242880, 1e6; alternative: 1e5.
        replay_capacity: 5e4

        n_step: 3
        #n_step: 7

        #tau: 4.0e-4
        #tau: 1.0e-5

        batch_size: 128

        # -- Sequence replay: burn-in and lengths --
        burn_in: False
        #burn_in: True
        sequence_replay_unroll_length: 20
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0
        #sequence_replay_burn_in_length: 10
        # Commented-out alternative set:
        #sequence_replay_unroll_length: 100
        #sequence_replay_overlap_length: 50   # also noted: 0
        #sequence_replay_burn_in_length: 0

        # -- Exploration --
        epsend: 0.4
        eps_greedy_alpha: 2.0

        # -- Architecture --
        # Commented-out alternatives, kept for reference.
        #
        # Plain LSTM:
        #critic_arch: 'LSTM-RNN'
        #critic_arch_hidden_units: [512, 512]
        #critic_arch_hidden_units: [512]
        #use_relu_after_rnn: False
        #
        # Normal arch:
        #critic_arch: 'MLP-LSTM-RNN'
        #use_relu_after_rnn: True   # alternative: False
        #critic_arch_feature_dim: 512
        #critic_arch_hidden_units: [512]
        #
        # Arch 2:
        #critic_arch: 'MLP-LSTM-RNN2'
        #use_relu_after_rnn: False  # alternative: True
        #use_residual_connection: True
        #critic_arch_linear_hidden_units: [512, 256]
        #critic_arch_feature_dim: 128
        #critic_arch_hidden_units: [128, 128]
        #
        # Arch 3:
        #critic_arch: 'MLP-LSTM-RNN2'
        #use_relu_after_rnn: True
        #critic_arch_linear_hidden_units: [128]
        #critic_arch_feature_dim: 64
        #critic_arch_hidden_units: [64]
        #
        # Arch 4:
        #critic_arch: 'MLP-LSTM-RNN2'
        #use_relu_after_rnn: True
        #critic_arch_linear_hidden_units: [512, 256]
        #critic_arch_hidden_units: [256]
        #critic_arch_linear_post_hidden_units: [256]
        #critic_arch_feature_dim: 128

        # Arch NTM (active):
        critic_arch: 'NTM'
        critic_arch_feature_dim: 128
        critic_arch_hidden_units: [128, 128]

        # Extra inputs for the NTM critic. This mapping replaces the
        # `extra_inputs_infos` inherited through the merge key.
        # An earlier, now removed, commented-out variant listed the same keys
        # and shapes with target_location ['critic_body', 'extra_inputs'] for
        # every entry except 'legal_actions'.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']

            # presumably vocab_size + 1 for the env-config in this file -- TODO confirm
            communication_channel:
                shape: [7]
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']
            round_id:
                shape: [4]
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']
            other_agent_id:
                shape: [10]
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']
            role_id:
                shape: [2]
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']
            mode_id:
                shape: [2]
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']
            previous_game_result:
                shape: [2]
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']
            previous_game_reward:
                shape: [1]
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']

            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']
            # The only entry routed to the head rather than the NTM controller.
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

            # ---- WITH SAD ----
            # Hard-coded size (value noted previously: 6223).
            # NOTE(review): the other action-sized inputs use 'task.action_dim'
            # -- confirm 31 is intended.
            greedy_action:
                shape: [31]
                target_location: ['critic_body', 'ntm', 'ntm_controller', 'extra_inputs']
    
    
    
    
