# Shared defaults. The r2d2_* templates further down pull this mapping in
# through a YAML merge key (`<<`); any key a template sets itself takes
# precedence over the value given here.
# NOTE: both r2d2_* templates in this file set burn_in, r2d2_loss_masking*,
# sequence_replay_* and sequence_replay_PER_eta themselves, so for them the
# values of those keys below are shadowed.
extra_hyperparameters: &extra_hyperparameters
    # --- General ---
    lr_account_for_nbr_actor: False
    weights_decay_lambda: 0.0
    weights_entropy_lambda: 0.0  # 01
    use_target_to_gather_data: False

    # --- New hyperparameters ---
    PER_compute_initial_priority: False

    # --- Sequence replay: stored states ---
    sequence_replay_use_online_states: True
    sequence_replay_use_zero_initial_states: False
    sequence_replay_store_on_terminal: True

    # --- R2D2 loss ---
    r2d2_loss_masking: True
    r2d2_loss_masking_n_step_regularisation: True
    r2d2_bellman_target_SAD: False

    # --- Burn-in and sequence lengths ---
    burn_in: True
    sequence_replay_unroll_length: 80
    sequence_replay_overlap_length: 40
    sequence_replay_burn_in_length: 20

    sequence_replay_PER_eta: 0.9

    # --- VDN ---
    vdn: False
    vdn_nbr_players: 2

# Architecture template. It is merged (via `<<`) into the
# r2d2_LargeCNNLSTM_IQL_* template further down in this file.
LargeCNNMLP: &LargeCNNMLP
        phi_arch: 'CNN' #-LSTM-RNN'
        actor_arch: 'None'
        critic_arch: 'LSTM-RNN'

        # Phi body:
        phi_arch_channels: ['BN32', 'BN64', 'BN64']
        phi_arch_kernels: [8, 4, 3]
        phi_arch_strides: [4, 2, 1]
        phi_arch_paddings: [1, 1, 1]
        phi_arch_feature_dim: 512
        phi_arch_hidden_units: []

        # Dictionary of keys living inside the 'infos' of OpenAI Gym's output.
        # Each entry is a mapping: `shape` is the expected shape of the extra input,
        # and `target_location` is where the input should be stored in the framestate.
        # Parsing of the shape will infer where to fetch the value when encountering a string.
        # The order of the entries is kept exactly as it was.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            communication_channel:
                shape: [11]
                target_location: ['critic_body', 'extra_inputs']
            secret_goal_rule:
                shape: [8]
                target_location: ['critic_body', 'extra_inputs']
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        #critic_arch_feature_dim: 32
        critic_arch_hidden_units: [512, 512]

# Architecture template for the SAD setup. It is merged (via `<<`) into the
# r2d2_LargeCNNLSTM_SAD_* template further down in this file.
LargeCNNMLP_SAD: &LargeCNNMLP_SAD
        sad: True

        phi_arch: 'CNN' #-LSTM-RNN'
        actor_arch: 'None'
        critic_arch: 'LSTM-RNN'

        # Phi body (the commented-out lines are the previous values):
        #phi_arch_channels: ['BN32', 'BN64', 'BN64']
        #phi_arch_kernels: [8, 4, 3]
        #phi_arch_strides: [4, 2, 1]
        phi_arch_channels: [32, 64, 64]
        phi_arch_kernels: [3, 3, 3]
        phi_arch_strides: [2, 2, 1]
        phi_arch_paddings: [1, 1, 1]
        phi_arch_feature_dim: 512
        phi_arch_hidden_units: []

        # Dictionary of keys living inside the 'infos' of OpenAI Gym's output.
        # Each entry is a mapping: `shape` is the expected shape of the extra input,
        # and `target_location` is where the input should be stored in the framestate.
        # Parsing of the shape will infer where to fetch the value when encountering a string.
        # The order of the entries is kept exactly as it was.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            communication_channel:
                shape: [11]
                target_location: ['critic_body', 'extra_inputs']
            secret_goal_rule:
                shape: [8]
                target_location: ['critic_body', 'extra_inputs']
            ########################
            # WITH SAD:
            ########################
            greedy_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            ########################
            ########################
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        #critic_arch_feature_dim: 32
        critic_arch_hidden_units: [512, 512]


# R2D2 agent template combining the LargeCNNMLP architecture with the shared
# extra_hyperparameters defaults. Agents under `agents:` merge this template
# and override individual keys.
r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        observation_resize_dim: 56
        
        dueling: True
        noisy: False
        n_step: 3

        use_PER: True
        PER_alpha: 0.9
        PER_beta: 0.6

        replay_capacity: 5242880 # in terms of experience #1e6
        # NOTE(review): `4e5` has no '.' and no exponent sign, so YAML 1.1
        # resolvers (e.g. PyYAML) may load it as the string '4e5' rather than
        # a float -- confirm the consumer casts it.
        min_capacity: 4e5 #in terms of experiences... #1e4
        replay_period: 1
        
        actor_models_update_steps_interval: 10 #considering only 1 actor's steps.

        discount: 0.999
        use_cuda: True
        gradient_clip: 0.5
        batch_size: 128
        tau: 4.0e-4
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5

        epsstart: 1.0
        epsend: 0.1
        epsdecay: 10000
        eps_greedy_alpha: 7.0

        # The keys below are also present in extra_hyperparameters; the values
        # given here take precedence over the merged ones.
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: False
        
        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True
        
        
        burn_in: False
        sequence_replay_unroll_length: 40
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0

        sequence_replay_PER_eta: 0.9

        # A mapping may hold each key only once, so both sources are listed in
        # a single `<<` sequence instead of two separate `<<` entries (which
        # strict YAML loaders reject as a duplicate key).
        # Earlier entries of a merge sequence take priority on shared keys;
        # the two sources currently share none. Keys set explicitly above
        # always win over merged ones.
        <<: [*extra_hyperparameters, *LargeCNNMLP]

# R2D2 agent template combining the LargeCNNMLP_SAD architecture with the
# shared extra_hyperparameters defaults. Agents under `agents:` merge this
# template and override individual keys.
r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        observation_resize_dim: 21 #56
        
        dueling: True
        noisy: False
        n_step: 3

        use_PER: True
        PER_alpha: 0.9
        PER_beta: 0.6

        replay_capacity: 5242880 # in terms of experience #1e6
        # NOTE(review): `4e5` has no '.' and no exponent sign, so YAML 1.1
        # resolvers (e.g. PyYAML) may load it as the string '4e5' rather than
        # a float -- confirm the consumer casts it.
        min_capacity: 4e5 #in terms of experiences... #1e4
        replay_period: 1
        
        actor_models_update_steps_interval: 10 #considering only 1 actor's steps.

        discount: 0.999
        use_cuda: True
        gradient_clip: 0.5
        batch_size: 128
        tau: 4.0e-4
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5

        epsstart: 1.0
        epsend: 0.1
        epsdecay: 10000
        eps_greedy_alpha: 7.0

        # The keys below are also present in extra_hyperparameters; the values
        # given here take precedence over the merged ones.
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: False
        
        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True
        
        burn_in: False
        sequence_replay_unroll_length: 40
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0

        sequence_replay_PER_eta: 0.9

        # A mapping may hold each key only once, so both sources are listed in
        # a single `<<` sequence instead of two separate `<<` entries (which
        # strict YAML loaders reject as a duplicate key).
        # Earlier entries of a merge sequence take priority on shared keys;
        # the two sources currently share none. Keys set explicitly above
        # always win over merged ones.
        <<: [*extra_hyperparameters, *LargeCNNMLP_SAD]


# Experiment definition: the list of tasks plus experiment-wide settings.
experiment:
    # `tasks` is a flow sequence holding a single flow mapping. Commented-out
    # entries inside it are alternatives kept for reference; only one
    # 'env-id' line is active.
    tasks: [{
        #'env-id': 'CoMaze-7x7-Dense-v0',
        #'env-id': 'CoMaze-7x7-Dense-Level4-v0',
        #'env-id': 'CoMaze-7x7-Dense-Level5-v0',
        #'env-id': 'CoMaze-9x9-Dense-Level5-v0',
        #'env-id': 'CoMaze-9x9-Dense-Level5-EasySecrets-v0',
        #'env-id': 'CoMaze-9x9-Dense-Level5-HardSecrets-v0',
        'env-id': 'CoMaze-9x9-Dense-Level5-UniformSecrets-v0',
        #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-v0',
        #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-v0',
        #'env-id': 'CoMaze-7x7-Dense-SinglePlayerReward-Level4-FixedSecretGoalRules-v0',
        #'env-id': 'CoMaze-7x7-Dense-FixedActions-v0',
        #'env-id': 'CoMaze-7x7-Dense-SinglePlayer-v0',
        #'env-id': 'CoMaze-7x7-Dense-Easy-SinglePlayer-v0',
        
        #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv64_r2d2_Obs21_EntropyReg0_WeightDecayReg0/',
        #'run-id': 'serial/selfplay/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        # More actors? fails to learn:
        # maybe the trajectories are being replaced too fast in the replay-buffer:
        # the replay occurrence for each trajectory is reduced...
        # Reducing nbr actors? /2 should give 2*replay_ratio: does it have any effect? greater data-efficiency
        #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv32_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        # What about 16 actors then? /4 ==> 4*replay_ratio : great data-efficiency!!!
        #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv16_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        # What about 64 with fixed actions (multiplayer)?
        #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        

        #VDN: FixedAction
        # What about 64 with fixed actions (multiplayer)?
        #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/debugVDN23/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        
        #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/debugVDN23-debug1VDNLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        #'run-id': 'serial/selfplay/400MaxSteps/NOSAD/debugVDNLoss+SumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        
        # NOOP:
        #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv100_r2d2_Obs56_EntropyReg0_WeightDecayReg0',
        #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0',
        #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0',
        # OP:
        #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0',
        #'run-id': 'serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP-VocabOnly/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0',
        #
        # Active run-id for this task; the other 'run-id' lines around it are
        # commented-out identifiers kept for reference.
        'run-id': '100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-Vocab20/venv64',
               

        #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/400MaxSteps/NOSAD/debugNOVDNLoss+NOSumBeforeLoss-VDNExpHandling-RandomizedStartPlayerActionSet/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        
        # Reducing nbr actors? /2 should give 2*replay_ratio: does it have any effect? greater data-efficiency
        #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/VDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv32_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        # What about 16 actors then? ?
        #'run-id': 'serial/selfplay/EnvWithSymmetricalTimeBonuses/EnvWithBlackReachedGoals/NOSAD/VDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed2_venv16_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        
        # Observation space? Visibly better!
        #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/SeedRep1_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        
        #'run-id': 'serial/selfplay/testRecording/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        # Reducing the number of actors to increase experience replay time: /8 ==> replay-occurrences*8?
        #'run-id': 'serial/selfplay/NOSAD/NOVDN/NoBN/WithReLUonFC/NOScalingFN_EPS1m3/Seed1_venv8_r2d2_Obs56_EntropyReg0_WeightDecayReg0/',
        
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # Is the overlap useful? no
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # Is a different sequence length better?
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # More exploration?
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # Bigger batch-size?
        # Different exploration: not sufficient, but at least the max training reward remains high...
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        
        # Different archi with relu after rnns? not clear yet whether archi or relu...
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # The above arch is definitely learning, as peaks in mean episode length are occurring.
        # Let us try to make it learn faster then: increasing lr and decreasing eps:
        #'agent-id': '3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # What about only decreasing eps: the most important element so far!!!!!
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # normal archi but even lower eps: huge gains!!!!!
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        
        # Increasing nstep return? does not show specific improvement...
        #'agent-id': '7step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # burn-in? very sharp progress, but requires more update (since some values are not used in the loss...)
        # early catastrophic forgetting phenomenon...!
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_BURNIN_b128_L20_O10_B10_NOZeroInitSt_OnlineSt_StoreOnDone',

        # VDN:
        # normal arch:
        #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # with entropy weight 0.1:
        # Stable learning ! but cannot learn to communicate... 
        #'agent-id': '3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # + agent-id observation:
        #'agent-id': '3step_VDN_1m1Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        #'agent-id': '3step_VDN_1m3Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # Trying to learn to communicate by enlarging the seq len:
        #'agent-id': '3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        
        # SAD only:
        #'agent-id': '3step_SAD_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # seq 100:
        #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # seq 20:
        #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # arch 2:
        #'agent-id': '3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # + res con :
        #'agent-id': '3step_SAD_VDN_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # Biasing messes up with action policy entropy:
        # Active agent-id for this task.
        # NOTE(review): no entry with this exact name appears among the
        # `agents:` keys visible in this part of the file -- presumably
        # agent-id selects an entry there; confirm it is defined further down.
        'agent-id': '3step_SAD_VDN_aID_WD1m6_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSAPEX1m0_4m1OVER3p4_A2m0_g997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # Thus, the following increase in the lambda value aims to address this:
        #'agent-id': '3step_SAD_VDN_aID_1m0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        #SAD+VDN: plateauing at 8 in singleplayerreward
        #'agent-id': '3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        
        # + agent id: does not provide any help compared to SAD:
        #'agent-id': '3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_ovrN_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        
        #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # arch 4:
        #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM4_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m15_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        #'agent-id': '3step_VDN_r2d2_AdamLR6d25m5_EPS1m15_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau1m5_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',


        # New architecture with extra inputs via an FC network and more FC layers: higher peak than the normal arch, but the lr is visibly too high: the loss diverges...
        #'agent-id': '3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # with low eps and low lr, new arch:
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        
        #archi 3: very small eps, minimal fcn + extra input on fcn:
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM3_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        

        # Normal archi but with a relu:
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # normal archi but with a relu + only 1 rnn layer:
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',
        # normal archi but with only 1 rnn layer:
        #'agent-id': '3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone',

        # Per-task options. Commented-out lines are alternative values kept
        # for reference; exactly one 'nbr_actor' line is active.
        #'nbr_actor': 128,
        #'nbr_actor': 100,
        'nbr_actor': 64,
        #'nbr_actor': 32,
        #'nbr_actor': 16,
        #'nbr_actor': 8,
        #'nbr_frame_skipping': 4,
        #'nbr_frame_stacking': 4,
        #'grayscale': True,
        #'single_life_episode': True, #False,
        #'nbr_max_random_steps': 30,
        #'sad': False, 
        'sad': True,
        #'vdn': False, 
        'vdn': True,
        #"otherplay": True,
        "otherplay": False,
        'clip_reward': False,
        'previous_reward_action': True,
        # NOTE(review): the r2d2_LargeCNNLSTM_SAD_* template in this file sets
        # observation_resize_dim: 21 while this task sets 56 -- confirm which
        # of the two the consumer actually uses.
        #'observation_resize_dim': (21,21), #(56,56),
        'observation_resize_dim': 56, #(56,56),
        #
        # 'None' is a quoted string here, not a YAML null.
        'reload': 'None',
        # NOOP:
        #"reload": "/home/kevin/debug_ray/r2d2_comaze_debug/CoMaze-9x9-Dense-Level5-UniformSecrets-v0/serial/selfplay/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/NOOP/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone/TRAINING/PUBSUB/./3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone.agent",
        # OP: to be deprecated...
        #'reload': '/home/kevin/debug_ray/r2d2_comaze_debug/CoMaze-9x9-Dense-Level5-UniformSecrets-v0/serial/PUBSUBDEBUG/selfplay/REPDebugVecEnvInitRZeroed/100MaxSteps/PenalizeSecretGoalRuleBreaching-1-Reward1-NoGameOver-PRISM/Penalty+Vocab20/OP/debugPosList/debugSAD/debugVDNLoss+SumBeforeLoss-LossTermCombReg/NoBN/WithOUTReLUonRNN/NOScalingFN_EPS1m3/Seed2_venv64_r2d2_Obs56_EntropyReg0_WeightDecayReg0/TEST+DEBUGExpSumRLPolicy/NoMessEntropy+CorrectPosTargetEntropy+Masking+MeanEntReg/CorrectRuleBasedAgentInnerStateManag/3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone/GoalOrderingPred-NoDropout+RulesPrediction+BigArch/./3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone.agent',
        },
    ]
    # Experiment-wide settings (siblings of `tasks`).
    #experiment_id: 'r2d2_comaze_data'
    experiment_id: 'CoMaze_data/r2d2_comaze_data'
    benchmarking_episodes: 10
    # NOTE(review): `1.0e3` and `3.0e6` carry no sign on the exponent, so
    # YAML 1.1 resolvers (e.g. PyYAML) may load them as strings rather than
    # floats -- confirm the consumer casts them.
    benchmarking_interval: 1.0e3
    # 'None' is a quoted string here, not a YAML null.
    benchmarking_record_episode_interval: 'None' #1.0e1 #1.0e20
    #benchmarking_record_episode_interval: 1.0e20
    #train_observation_budget: 1.0e7
    train_observation_budget: 3.0e6
    seed: 1
    
agents:
    # Agent built on the r2d2_LargeCNNLSTM_IQL_* template: everything is
    # inherited through the merge key, and the keys set here take precedence
    # over the inherited values.
    SAD_IQL_paper_1step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate100Steps_EPSgreedyAPEX1m0_4m1OVER3p4_gamma997_LargeCNNLSTM_GradClip5m1_r2p4Min1e4_alpha9m1_beta6m1_over2e4_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L2_O1_B0:
        <<: *r2d2_LargeCNNLSTM_IQL_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0

        # Considering only 1 actor's steps.
        actor_models_update_steps_interval: 100

        # --- Optimiser ---
        batch_size: 128
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5
        discount: 0.997
        # Not specified in the R2D2 paper but in Ape-X, and the R2D2 paper
        # says that missing hyper-parameters are the same as in Ape-X.
        gradient_clip: 0.5

        # --- Replay buffer (both sizes in terms of experiences) ---
        # Previously tried capacities: 163840, 2e13*20, 5242880, 1e6.
        replay_capacity: 2e4
        min_capacity: 1e4  # 1e4

        PER_beta_increase_interval: 2e4

        # --- Q-learning variants ---
        double: True
        dueling: True
        noisy: False

        n_step: 1
        tau: 4.0e-4

        # --- Sequence replay ---
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: False

        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True

        burn_in: False
        sequence_replay_unroll_length: 2
        sequence_replay_overlap_length: 1
        sequence_replay_burn_in_length: 0

        # --- Exploration ---
        epsstart: 1.0
        epsend: 0.4
        epsdecay: 30000  # 1000000

        # Ape-X and R2D2 keep epsilon constant over each actor, with a
        # different value eps_i = base_eps**(1+\alpha*i/nbr_actors),
        # with base_eps=0.4 and \alpha = 7...
        eps_greedy_alpha: 7.0
    
    # Agent built on the r2d2_LargeCNNLSTM_SAD_* template: everything is
    # inherited through the merge key, and the keys set here take precedence
    # over the inherited values. The entry is anchored under its own name.
    SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone: &SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone
        <<: *r2d2_LargeCNNLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0

        # Considering only 1 actor's steps.
        actor_models_update_steps_interval: 1

        # --- VDN ---
        vdn: True
        vdn_nbr_players: 2

        # --- Optimiser ---
        batch_size: 32
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5
        discount: 0.997
        # Not specified in the R2D2 paper but in Ape-X, and the R2D2 paper
        # says that missing hyper-parameters are the same as in Ape-X.
        gradient_clip: 5.0

        # --- Replay buffer (both sizes in terms of experiences) ---
        # Previously tried capacities: 163840, 2e13*20, 5242880, 1e6.
        replay_capacity: 5e4
        min_capacity: 2e4  # 1e4

        PER_compute_initial_priority: False
        # The string 'None' (not a YAML null), written quoted like the other
        # 'None' values in this file. Previous value: 2e5.
        PER_beta_increase_interval: 'None'

        # --- Q-learning variants ---
        double: True
        dueling: True
        noisy: False
        n_step: 3
        tau: 4.0e-4

        # --- Sequence replay ---
        sequence_replay_use_online_states: True
        sequence_replay_use_zero_initial_states: False
        sequence_replay_store_on_terminal: True

        r2d2_loss_masking: True
        r2d2_loss_masking_n_step_regularisation: True
        r2d2_bellman_target_SAD: False

        burn_in: False
        sequence_replay_unroll_length: 100
        sequence_replay_overlap_length: 0
        sequence_replay_burn_in_length: 0

        # --- Exploration ---
        epsstart: 1.0
        # NOTE(review): the agent name encodes '4m2' where the sibling agent's
        # '4m1' goes with epsend 0.4; 0.05 here does not match that pattern --
        # confirm which value is intended.
        epsend: 0.05
        epsdecay: 30000  # 1000000

        # Ape-X and R2D2 keep epsilon constant over each actor, with a
        # different value eps_i = base_eps**(1+\alpha*i/nbr_actors),
        # with base_eps=0.4 and \alpha = 7...
        eps_greedy_alpha: 7.0
    
    # New archi with relu on rnn:
    #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # with higher lr and lower eps for adam opt:
    #3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # only lower eps:
    #3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # even lower eps: best version so far! BEST:
    #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    #burn-in?
    #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_BURNIN_b128_L20_O10_B10_NOZeroInitSt_OnlineSt_StoreOnDone:
    # increase nstep return?
    #7step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    
    # VDN:
    #3step_VDN_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    #3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # With agent id:
    #3step_VDN_1m3Ent_aID_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    
    #3step_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    #3step_SAD_VDN_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # With agent id:
    #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # arch 2:
    #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # arch 2 + res. con.:
    #3step_SAD_VDN_aID_NoEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    #3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2Res_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # replay buffer 5p4 -> 1p5
    3step_SAD_VDN_aID_WD1m6_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSAPEX1m0_4m1OVER3p4_A2m0_g997_LargeCNN_MLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # seq 100
    #3step_SAD_VDN_aID_NOEnt_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L100_O50_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    #3step_SAD_VDN_aID_1m1Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_ovrN_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:

    # arch 3 with very low eps: minimal arch
    #3step_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM3_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    
    # New architecture with more FC layers; the extra inputs are fed to those (rather than to the rnn):
    #3step_r2d2_AdamLR1m3_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # with low eps and low lr:
    #3step_r2d2_AdamLR6d25m5_EPS1m8_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN_MLPLSTM2_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # old archi with relu on rnn
    #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNNLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # old archi with relu on rnn + only one rnn layer:
    #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTMReLU_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    # old archi with only one rnn layer:
    #3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeCNN1xLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
        # Start from the anchored base configuration; keys written explicitly in this
        # mapping take precedence over the merged-in values.
        <<: *SAD_VDN_3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeCNNLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone
        # DEBUG override, disabled: `min_capacity` is set again further down in this
        # mapping (3e4, matching the `Min3e4` tag of the experiment key). YAML mapping
        # keys must be unique: lenient loaders silently keep the last occurrence,
        # strict loaders reject the duplicate. Re-enable only by replacing the later
        # entry, never alongside it.
        #min_capacity: 1e3
        
        # Regularisation coefficients (experiment-key tags: WD1m6, 1m3Ent).
        #weights_decay_lambda: 0.0001 #0.0
        weights_decay_lambda: 1.0e-6 #0.0
        
        #weights_entropy_lambda: 0.0 
        #weights_entropy_lambda: 1.0
        weights_entropy_lambda: 0.001 #01
    
        # Multi-agent switches (experiment-key tag: SAD_VDN).
        #vdn: False 
        vdn: True 
        vdn_nbr_players: 2
        #sad: False 
        sad: True

        
        # Adam optimiser settings (experiment-key tags: AdamLR6d25m5, EPS1m12).
        # Commented pairs are previously tried alternatives.
        learning_rate: 6.25e-5
        #adam_eps: 1.5e-5
        #learning_rate: 1.0e-3
        #adam_eps: 1.0e-8
        adam_eps: 1.0e-12
        #adam_eps: 1.0e-15

        # Replay buffer sizing (experiment-key tags: r1p5, Min3e4).
        # NOTE(review): `1e5` / `3e4` have no decimal point; YAML 1.1 resolvers
        # (e.g. PyYAML) load such literals as strings rather than floats --
        # confirm that the consumer casts them before use.
        #replay_capacity: 5e4 #163840 #2e13*20 #5242880 # in terms of experience #1e6
        replay_capacity: 1e5 
        min_capacity: 3e4 #in terms of experiences... #1e4
        
        # n-step return length (experiment-key tag: 3step).
        n_step: 3
        #n_step: 7

        # `tau` is not overridden here; the commented values are alternatives that were tried.
        #tau: 4.0e-4
        #tau: 1.0e-5
        
        #sequence_replay_overlap_length: 0
        #sequence_replay_overlap_length: 50
        
        # Batch size (experiment-key tag: b128).
        batch_size: 128
        
        # Burn-in disabled (experiment-key tag: NOBURNIN).
        burn_in: False
        #burn_in: True

        # Sequence replay geometry (experiment-key tag: L20_O10_B0):
        # unroll length 20, overlap 10, burn-in length 0.
        sequence_replay_unroll_length: 20
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0
        # #sequence_replay_burn_in_length: 10
        
        # Alternative geometry (L100_O50_B0), kept for reference:
        # sequence_replay_unroll_length: 100
        # sequence_replay_overlap_length: 50
        # sequence_replay_burn_in_length: 0
        
        # Exploration settings; presumably the `4m1` and `A2m0` tags of the
        # experiment key refer to these two values -- TODO confirm.
        epsend: 0.4
        eps_greedy_alpha: 2.0
        
        # Architecture:
        # Only the "Arch2" variant below is active; the other variants are kept
        # commented out for reference.
        #critic_arch: 'LSTM-RNN'
        #critic_arch_hidden_units: [512, 512]
        #critic_arch_hidden_units: [512]
        #use_relu_after_rnn: False 

        # normal arch:
        # critic_arch: 'MLP-LSTM-RNN'
        # use_relu_after_rnn: True 
        # #use_relu_after_rnn: False 
        # critic_arch_feature_dim: 512
        # critic_arch_hidden_units: [512]

        # Arch2 (active). Presumably this is what the `MLPLSTM2Res` tag of the
        # experiment key denotes ("Res" = use_residual_connection) -- TODO confirm.
        critic_arch: 'MLP-LSTM-RNN2'
        use_relu_after_rnn: False #True 
        use_residual_connection: True 
        critic_arch_linear_hidden_units: [512, 256]
        critic_arch_feature_dim: 128
        critic_arch_hidden_units: [128, 128]

        # Arch 3 (inactive): same critic_arch as Arch2 with smaller layer sizes.
        # critic_arch: 'MLP-LSTM-RNN2'
        # use_relu_after_rnn: True 
        # critic_arch_linear_hidden_units: [128]
        # critic_arch_feature_dim: 64
        # critic_arch_hidden_units: [64]

        # Arch 4 (inactive): adds critic_arch_linear_post_hidden_units.
        # critic_arch: 'MLP-LSTM-RNN2'
        # use_relu_after_rnn: True 
        # critic_arch_linear_hidden_units: [512, 256]
        # critic_arch_hidden_units: [256]
        # critic_arch_linear_post_hidden_units: [256]
        # critic_arch_feature_dim: 128

        # Extra inputs declared for this experiment. Each entry gives the input's
        # `shape` and the `target_location` path it is routed to. Every entry
        # targets ['critic_body', 'extra_inputs'] except 'legal_actions', which
        # targets ['head', 'extra_inputs'].
        # NOTE(review): the numeric shapes (21, 8, 2, 108) are hard-coded here;
        # presumably they must match the environment/task being run -- confirm
        # before reusing this configuration with a different task.
        extra_inputs_infos: {
            'previous_reward':{
                shape: [1,], 
                target_location: ['critic_body', 'extra_inputs']
            },
            'previous_action':{
                shape: ['task.action_dim',], 
                target_location: ['critic_body', 'extra_inputs']
            },
            'action_mask':{
                shape: ['task.action_dim',], 
                target_location: ['critic_body', 'extra_inputs']
            },
            'communication_channel':{
                # Commented shapes are sizes used with other setups.
                #shape: [4], 
                shape: [21], 
                #shape: [8], 
                #shape: [11,], 
                target_location: ['critic_body', 'extra_inputs']
            },
            'secret_goal_rule':{
                shape: [8,], 
                target_location: ['critic_body', 'extra_inputs']
            },
            ########################
            # WITH AGENT_ID:
            ########################
            # Shape [2,] equals vdn_nbr_players (2); presumably a per-player
            # identifier encoding -- TODO confirm.
            'agent_id':{
               shape: [2,], 
               target_location: ['critic_body', 'extra_inputs']
            },
            ########################
            ########################
            ########################
            # WITH SAD:
            ########################
            # Commented shapes are sizes used with other setups.
            'greedy_action':{
                #shape: [23], 
                shape: [108], 
                #shape: [43], 
                #shape: [58,], 
                target_location: ['critic_body', 'extra_inputs']
            },
            ########################
            ########################
            'legal_actions':{
                shape: ['task.action_dim',], 
                target_location: ['head', 'extra_inputs']
            },
               
        }

    
    
    
    
