extra_hyperparameters: &extra_hyperparameters
    # Baseline hyperparameters, published through the `&extra_hyperparameters`
    # anchor so that other mappings can pull them in with a `<<` merge.
    # Keys written explicitly in a merging mapping take precedence over these.

    # ---- General ----
    lr_account_for_nbr_actor: false
    weights_decay_lambda: 0.0
    weights_entropy_lambda: 0.0  #01
    use_target_to_gather_data: false

    # ---- Newer hyperparameters ----
    PER_compute_initial_priority: false

    # ---- Sequence replay: state options ----
    sequence_replay_use_online_states: true
    sequence_replay_use_zero_initial_states: false
    sequence_replay_store_on_terminal: true

    # ---- R2D2 loss options ----
    r2d2_loss_masking: true
    r2d2_loss_masking_n_step_regularisation: true
    r2d2_bellman_target_SAD: false

    # ---- Sequence replay: unroll / overlap / burn-in lengths ----
    burn_in: true
    sequence_replay_unroll_length: 80
    sequence_replay_overlap_length: 40
    sequence_replay_burn_in_length: 20

    sequence_replay_PER_eta: 0.9

    # ---- VDN ----
    vdn: false
    vdn_nbr_players: 2

MLPLSTM: &MLPLSTM
        # Architecture preset: 'MLP' phi body, 'MLP-LSTM-RNN' critic,
        # actor architecture set to the string 'None'.
        phi_arch: 'MLP'
        actor_arch: 'None'
        critic_arch: 'MLP-LSTM-RNN'

        # Phi body (the commented keys are disabled alternatives kept for reference):
        #phi_arch_channels: [32, 64, 64]
        #phi_arch_kernels: [8, 4, 3]
        #phi_arch_strides: [4, 2, 1]
        #phi_arch_paddings: [1, 1, 1]
        phi_arch_feature_dim: 64
        phi_arch_hidden_units: ['BN256', 'BN128']

        # Extra inputs: each key names an entry living inside the 'infos'
        # output of the OpenAI Gym environment.
        #   shape:           expected shape of the extra input; when an element is
        #                    a string (e.g. 'task.action_dim'), the parser infers
        #                    where to fetch the actual value from.
        #   target_location: where the input should be stored in the frame state.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        critic_arch_feature_dim: 32
        critic_arch_hidden_units: [64]

SAD_MLPLSTM: &SAD_MLPLSTM
        # Architecture preset with `sad` enabled: 'MLP' phi body and
        # 'LSTM-RNN' critic; all extra inputs except 'legal_actions' are
        # routed to 'phi_body'.
        sad: true

        phi_arch: 'MLP'
        actor_arch: 'None'
        critic_arch: 'LSTM-RNN'

        # Phi body:
        phi_arch_feature_dim: 512
        phi_arch_hidden_units: []

        # Disabled variant, kept for reference: the same entries, but routed to
        # 'critic_body' instead of 'phi_body'.
        # extra_inputs_infos:
        #     previous_reward: {shape: [1], target_location: ['critic_body', 'extra_inputs']}
        #     previous_action: {shape: ['task.action_dim'], target_location: ['critic_body', 'extra_inputs']}
        #     action_mask:     {shape: ['task.action_dim'], target_location: ['critic_body', 'extra_inputs']}
        #     # WITH SAD:
        #     greedy_action:   {shape: ['task.action_dim'], target_location: ['critic_body', 'extra_inputs']}
        #     legal_actions:   {shape: ['task.action_dim'], target_location: ['head', 'extra_inputs']}

        # Extra inputs: each key names an entry living inside the 'infos'
        # output of the OpenAI Gym environment.
        #   shape:           expected shape of the extra input; when an element is
        #                    a string (e.g. 'task.action_dim'), the parser infers
        #                    where to fetch the actual value from.
        #   target_location: where the input should be stored in the frame state.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['phi_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['phi_body', 'extra_inputs']
            action_mask:
                shape: ['task.action_dim']
                target_location: ['phi_body', 'extra_inputs']
            # ---- WITH SAD: ----
            greedy_action:
                shape: ['task.action_dim']
                target_location: ['phi_body', 'extra_inputs']
            # -------------------
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        #critic_arch_feature_dim: 32
        critic_arch_hidden_units: [512, 512]

SAD_MLP: &SAD_MLP
        # Architecture preset with `sad` enabled: phi and actor architectures are
        # the string 'None', critic is 'MLP-MLP-RNN'.
        sad: true

        phi_arch: 'None'
        actor_arch: 'None'
        critic_arch: 'MLP-MLP-RNN'

        # Phi body:
        phi_arch_feature_dim: 512
        phi_arch_hidden_units: []

        # Extra inputs: each key names an entry living inside the 'infos'
        # output of the OpenAI Gym environment.
        #   shape:           expected shape of the extra input; when an element is
        #                    a string (e.g. 'task.action_dim'), the parser infers
        #                    where to fetch the actual value from.
        #   target_location: where the input should be stored in the frame state.
        # NOTE(review): `sad` is true here, yet no 'greedy_action' entry is
        # declared (the other SAD presets in this file declare one) - confirm
        # this omission is intended.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        critic_arch_feature_dim: 'None'
        critic_arch_hidden_units: [32]

SAD_LINEAR: &SAD_LINEAR
        # Architecture preset with `sad` enabled where phi, actor and critic
        # architectures are all the string 'None'; extra inputs other than
        # 'legal_actions' are routed to 'final_critic_layer'.
        sad: true

        phi_arch: 'None'
        actor_arch: 'None'
        critic_arch: 'None'

        # Phi body:
        phi_arch_feature_dim: 512
        phi_arch_hidden_units: []

        # Extra inputs: each key names an entry living inside the 'infos'
        # output of the OpenAI Gym environment.
        #   shape:           expected shape of the extra input; when an element is
        #                    a string (e.g. 'task.action_dim'), the parser infers
        #                    where to fetch the actual value from.
        #   target_location: where the input should be stored in the frame state.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['final_critic_layer', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['final_critic_layer', 'extra_inputs']
            action_mask:
                shape: ['task.action_dim']
                target_location: ['final_critic_layer', 'extra_inputs']
            # ---- WITH SAD: ----
            greedy_action:
                shape: ['task.action_dim']
                target_location: ['final_critic_layer', 'extra_inputs']
            # -------------------
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        critic_arch_feature_dim: 'None'
        critic_arch_hidden_units: [32]

r2d2_MLPLSTM_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20: &r2d2_MLPLSTM_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20
        # R2D2 preset: the keys below, plus everything merged in from the
        # MLPLSTM architecture preset and the shared extra hyperparameters.
        dueling: True
        noisy: False
        n_step: 1

        # Prioritised experience replay (disabled in this preset).
        use_PER: False
        PER_alpha: 0.6
        PER_beta: 1.0

        # NOTE(review): exponent literals without a dot or a signed exponent
        # (e.g. 1e6) resolve to strings under YAML 1.1 loaders such as PyYAML -
        # confirm that the consumer casts these values.
        replay_capacity: 1e6
        min_capacity: 1e3
        replay_period: 1
        # deprecated: actor_models_update_optimization_interval: 4
        actor_models_update_steps_interval: 400 #considering only 1 actor's steps.

        observation_resize_dim: 84
        discount: 0.99 #0.997
        use_cuda: True
        gradient_clip: 0.5
        batch_size: 32
        tau: 1.0e-2
        learning_rate: 2.5e-4
        adam_eps: 1.0e-8

        # Epsilon-greedy exploration schedule.
        epsstart: 1.0
        epsend: 0.01    #0.1
        epsdecay: 30000 #1000000
        eps_greedy_alpha: 0.0

        # A mapping may hold only one `<<` key: repeating it is a duplicate key,
        # which strict YAML loaders reject. Both sources are therefore merged
        # through a single list-valued merge key. The two sources share no key,
        # so the listing order has no effect on the merged values; keys written
        # explicitly above still override anything merged in.
        <<: [*extra_hyperparameters, *MLPLSTM]

r2d2_SAD_MLPLSTM_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_SAD_MLPLSTM_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        # R2D2 preset: the keys below, plus everything merged in from the
        # SAD_MLPLSTM architecture preset and the shared extra hyperparameters.
        dueling: True
        noisy: False
        n_step: 3

        # Prioritised experience replay.
        use_PER: True
        PER_alpha: 0.9
        PER_beta: 0.6

        # NOTE(review): exponent literals without a dot or a signed exponent
        # (e.g. 4e5) resolve to strings under YAML 1.1 loaders such as PyYAML -
        # confirm that the consumer casts these values.
        replay_capacity: 5242880 # in terms of experience #1e6
        min_capacity: 4e5 #in terms of experiences... #1e4
        replay_period: 1

        actor_models_update_steps_interval: 1 #considering only 1 actor's steps.

        discount: 0.999
        use_cuda: True
        gradient_clip: 0.5
        batch_size: 128
        tau: 4.0e-4
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5

        # Epsilon-greedy exploration schedule.
        epsstart: 1.0
        epsend: 0.1
        epsdecay: 10000
        eps_greedy_alpha: 7.0

        # The keys from here down to `sequence_replay_PER_eta` also exist in
        # `extra_hyperparameters`; being explicit, these values are the ones
        # that end up in the resulting mapping.
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: False
        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True

        burn_in: False
        sequence_replay_unroll_length: 40
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0

        sequence_replay_PER_eta: 0.9

        # A mapping may hold only one `<<` key: repeating it is a duplicate key,
        # which strict YAML loaders reject. Both sources are therefore merged
        # through a single list-valued merge key. The two sources share no key,
        # so the listing order has no effect on the merged values.
        <<: [*extra_hyperparameters, *SAD_MLPLSTM]

r2d2_SAD_MLP_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_SAD_MLP_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        # R2D2 preset: the keys below, plus everything merged in from the
        # SAD_MLP architecture preset and the shared extra hyperparameters.
        dueling: True
        noisy: False
        n_step: 3

        # Prioritised experience replay.
        use_PER: True
        PER_alpha: 0.9
        PER_beta: 0.6

        # NOTE(review): exponent literals without a dot or a signed exponent
        # (e.g. 4e5) resolve to strings under YAML 1.1 loaders such as PyYAML -
        # confirm that the consumer casts these values.
        replay_capacity: 5242880 # in terms of experience #1e6
        min_capacity: 4e5 #in terms of experiences... #1e4
        replay_period: 1

        actor_models_update_steps_interval: 10 #considering only 1 actor's steps.

        discount: 0.999
        use_cuda: True
        gradient_clip: 0.5
        batch_size: 128
        tau: 4.0e-4
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5

        # Epsilon-greedy exploration schedule.
        epsstart: 1.0
        epsend: 0.1
        epsdecay: 10000
        eps_greedy_alpha: 7.0

        # The keys from here down to `sequence_replay_PER_eta` also exist in
        # `extra_hyperparameters`; being explicit, these values are the ones
        # that end up in the resulting mapping.
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: False
        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True

        burn_in: False
        sequence_replay_unroll_length: 40
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0

        sequence_replay_PER_eta: 0.9

        # A mapping may hold only one `<<` key: repeating it is a duplicate key,
        # which strict YAML loaders reject. Both sources are therefore merged
        # through a single list-valued merge key. The two sources share no key,
        # so the listing order has no effect on the merged values.
        <<: [*extra_hyperparameters, *SAD_MLP]


r2d2_SAD_LINEAR_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_SAD_LINEAR_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        # R2D2 preset: the keys below, plus everything merged in from the
        # SAD_LINEAR architecture preset and the shared extra hyperparameters.
        dueling: True
        noisy: False
        n_step: 3

        # Prioritised experience replay.
        use_PER: True
        PER_alpha: 0.9
        PER_beta: 0.6

        # NOTE(review): exponent literals without a dot or a signed exponent
        # (e.g. 4e5) resolve to strings under YAML 1.1 loaders such as PyYAML -
        # confirm that the consumer casts these values.
        replay_capacity: 5242880 # in terms of experience #1e6
        min_capacity: 4e5 #in terms of experiences... #1e4
        replay_period: 1

        actor_models_update_steps_interval: 10 #considering only 1 actor's steps.

        discount: 0.999
        use_cuda: True
        gradient_clip: 0.5
        batch_size: 128
        tau: 4.0e-4
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5

        # Epsilon-greedy exploration schedule.
        epsstart: 1.0
        epsend: 0.1
        epsdecay: 10000
        eps_greedy_alpha: 7.0

        # The keys from here down to `sequence_replay_PER_eta` also exist in
        # `extra_hyperparameters`; being explicit, these values are the ones
        # that end up in the resulting mapping.
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: True
        sequence_replay_store_on_terminal: True
        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True

        burn_in: False
        sequence_replay_unroll_length: 40
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0

        sequence_replay_PER_eta: 0.9

        # A mapping may hold only one `<<` key: repeating it is a duplicate key,
        # which strict YAML loaders reject. Both sources are therefore merged
        # through a single list-valued merge key. The two sources share no key,
        # so the listing order has no effect on the merged values.
        <<: [*extra_hyperparameters, *SAD_LINEAR]


experiment:
    # `tasks` is a one-element list; commented-out lines are alternative
    # settings kept for reference.
    tasks:
        # ---- Environment ----
        #'env-id': 'TinyAbstractHanabi2P2C3A-v0',
        #'env-id': 'TinyAbstractHanabi2P2C3A-OHEObs-v0',
        #'env-id': 'EasyTinyAbstractHanabi2P2C3A-OHEObs-v0',
        #'env-id': 'Hanabi-VerySmall-v0',
        #'env-id': 'Hanabi-Small-v0',
        - 'env-id': 'Hanabi-Full-v0'

          # ---- Run identifier ----
          #'run-id': 'test/serial/debugLSTMGRU/debugTinyArch-WithOutBN/debugNStepBellmanTargetFunction/Env-NoPenaltyTimeLimit-WithTimePenalty/ScalingFN_EPS1m3/Seed1_venv32_r2d2_Obs84_EntropyReg0_WeightDecayReg0/',
          # SAD bellman target:
          #'run-id': 'test/serial/SELFPLAY/L2AUpdateInterval1p0/debugLSTMGRU/debugTinyArch-WithOutBN/debugNStepBellmanTargetFunction_SAD/AddingNonLinFCN+Dueling+ExtraInputOnInitialLayer/LayerInitFn/NOScalingFN_EPS1m3/Seed11_venv3160_r2d2_Obs84_EntropyReg0_WeightDecayReg0/',
          #'run-id': 'test/serial/SELFPLAY/L2AUpdateInterval1p0/debugLSTMGRU/debugTinyArch-WithOutBN/debugNStepBellmanTargetFunction_SAD/AddingNonLinFCN+Dueling+ExtraInputOnCriticLayer/LayerInitFn/NOScalingFN_EPS1m3/Seed11_venv3160_r2d2_Obs84_EntropyReg0_WeightDecayReg0/',
          #'run-id': 'test/serial/SELFPLAY/L2AUpdateInterval1p0/debugLSTMGRU/debugTinyArch-WithOutBN/debugNStepBellmanTargetFunction_SAD/AddingNonLinFCN+Dueling+ExtraInputOnInitLayer/LayerInitFn/NOScalingFN_EPS1m3/Seed11_venv160_r2d2_Obs84_EntropyReg0_WeightDecayReg0/',
          'run-id': 'test/serial/SELFPLAY/NOSAD/debugVDN23-debugExploEPS-debugProperVDNLoss+SumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv32_r2d2_Obs56_EntropyReg0_WeightDecayReg0/'

          # ---- Agent selection: the value is the name of an entry under `agents` ----
          #'agent-id': 'VDN_3step_r2d2_AdamLR6d25m5_EPS1m12_EPSAPEX1m0_4m1OVER3p5_Alpha2_gamma999_SAD_MLP_LSTM_GradClip5m0_r1p5Min1e3_a9m1_b6m1_over2e4_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_TermSt_NOZeroInitSt_OnlineSt_RegMaskLoss',
          # explo epsdecay: p5 -> p6
          'agent-id': 'VDN_3step_r2d2_AdamLR6d25m5_EPS1m12_EPSAPEX1m0_4m1OVER3p6_Alpha2_gamma999_SAD_MLP_LSTM_GradClip5m0_r1p5Min1e3_a9m1_b6m1_over2e4_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_TermSt_NOZeroInitSt_OnlineSt_RegMaskLoss'

          # ---- Per-task options ----
          #'nbr_actor': 160, #1600
          'nbr_actor': 32
          #'sad': True,
          'sad': false
          'vdn': true
          #'vdn': False,
          'vdn_nbr_players': 2
          'clip_reward': false
          'previous_reward_action': true
          #'observation_resize_dim': (84,84),

    experiment_id: 'r2d2_comaze_debug/r2d2_hanabi_debug'
    benchmarking_episodes: 10
    benchmarking_interval: 1.0e3
    benchmarking_record_episode_interval: 1.0e20
    train_observation_budget: 1.0e7 #640000 #1.0e10 #3.0e5 #1.0e7
    seed: 2

agents:
    # Base agent. Hanabi SAD implementation starts from stored RNN states.
    # It merges the r2d2 SAD MLP-LSTM preset and then overrides the keys below.
    SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_EPSAPEX1m0_1m1OVER1m3_Alpha2_gamma999_SAD_MLP_LSTM_GradClip5m0_r1p5Min2e4_a9m1_b6m1_overNone_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_TermSt_NOZeroInitSt_OnlineSt_RegMaskLoss: &SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_EPSAPEX1m0_1m1OVER1m3_Alpha2_gamma999_SAD_MLP_LSTM_GradClip5m0_r1p5Min2e4_a9m1_b6m1_overNone_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_TermSt_NOZeroInitSt_OnlineSt_RegMaskLoss
        <<: *r2d2_SAD_MLPLSTM_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        actor_models_update_steps_interval: 1 #considering only 1 actor's steps.

        # ---- SAD / VDN switches ----
        sad: true
        vdn: true
        vdn_nbr_players: 2

        # ---- Optimisation ----
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5
        discount: 0.999
        gradient_clip: 5.0

        # ---- Replay sizing ----
        batch_size: 128
        replay_capacity: 1e5 #2e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6
        min_capacity: 2e4 #2e3 #in terms of experiences... #1e4

        # ---- DQN variants ----
        double: true
        dueling: true
        noisy: false
        n_step: 3
        tau: 4.0e-4

        # ---- Prioritised experience replay ----
        use_PER: true
        PER_alpha: 0.9
        PER_beta: 0.6
        PER_compute_initial_priority: false
        # NOTE(review): a plain `None` is not YAML null; it loads as the string
        # 'None' - presumably what the consumer expects, confirm.
        PER_beta_increase_interval: None #2e5

        # ---- Sequence replay / R2D2 loss ----
        sequence_replay_use_online_states: true
        sequence_replay_use_zero_initial_states: false
        sequence_replay_store_on_terminal: true
        r2d2_loss_masking: true
        r2d2_loss_masking_n_step_regularisation: true
        r2d2_bellman_target_SAD: false

        burn_in: false
        sequence_replay_unroll_length: 20
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0

        # ---- Epsilon-greedy exploration ----
        epsstart: 1.0
        epsend: 0.1
        epsdecay: 1000 #30000 #1000000
        #eps_greedy_alpha: 7.0
        eps_greedy_alpha: 2.0

    # Previously used agent names, kept for reference:
    #3step_r2d2_AdamLR6d25m5_EPS1d5m5_EPSAPEX1m0_5m2OVER3p5_Alpha7_gamma999_SAD_MLP_LSTM_GradClip5m0_r1p5Min2e4_a9m1_b6m1_over2e4_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_TermSt_ZeroInitSt_OnlineSt_RegMaskLoss:
    # epsilon decrease:
    #3step_r2d2_AdamLR6d25m5_EPS1m12_EPSAPEX1m0_4m1OVER3p5_Alpha2_gamma999_SAD_MLP_LSTM_GradClip5m0_r1p5Min1e3_a9m1_b6m1_over2e4_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_TermSt_NOZeroInitSt_OnlineSt_RegMaskLoss:
    #VDN_3step_r2d2_AdamLR6d25m5_EPS1m12_EPSAPEX1m0_4m1OVER3p5_Alpha2_gamma999_SAD_MLP_LSTM_GradClip5m0_r1p5Min1e3_a9m1_b6m1_over2e4_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_TermSt_NOZeroInitSt_OnlineSt_RegMaskLoss:

    # Derived agent: merges the base agent above, then overrides the keys below.
    VDN_3step_r2d2_AdamLR6d25m5_EPS1m12_EPSAPEX1m0_4m1OVER3p6_Alpha2_gamma999_SAD_MLP_LSTM_GradClip5m0_r1p5Min1e3_a9m1_b6m1_over2e4_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_TermSt_NOZeroInitSt_OnlineSt_RegMaskLoss:
        <<: *SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_EPSAPEX1m0_1m1OVER1m3_Alpha2_gamma999_SAD_MLP_LSTM_GradClip5m0_r1p5Min2e4_a9m1_b6m1_overNone_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_TermSt_NOZeroInitSt_OnlineSt_RegMaskLoss
        # DEBUG:
        min_capacity: 1e3

        # ---- SAD / VDN switches ----
        #vdn: False
        vdn: true
        vdn_nbr_players: 2
        sad: false #True

        # ---- Optimisation ----
        learning_rate: 6.25e-5
        #adam_eps: 1.5e-5
        #learning_rate: 1.0e-3
        #adam_eps: 1.0e-8
        adam_eps: 1.0e-12
        #adam_eps: 1.0e-15

        n_step: 3
        #n_step: 7

        #tau: 4.0e-4
        #tau: 1.0e-5

        batch_size: 128

        # ---- Epsilon-greedy exploration ----
        # NOTE(review): this agent's name contains 'OVER3p6' while epsdecay is
        # 300000 - confirm which of the two is the intended decay horizon.
        epsend: 0.4
        epsdecay: 300000
        eps_greedy_alpha: 2.0

        # ---- Sequence replay lengths ----
        burn_in: false
        #burn_in: True
        sequence_replay_unroll_length: 20
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0
        #sequence_replay_burn_in_length: 10

        # Redeclared in full because a `<<` merge replaces whole top-level
        # values: this version leaves the SAD-only 'greedy_action' entry
        # disabled, consistent with `sad: false` above.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['phi_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['phi_body', 'extra_inputs']
            action_mask:
                shape: ['task.action_dim']
                target_location: ['phi_body', 'extra_inputs']
            # ---- WITH SAD (disabled): ----
            # greedy_action:
            #     shape: ['task.action_dim']
            #     target_location: ['phi_body', 'extra_inputs']
            # ------------------------------
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']
        
