HER_hyperparameters: &HER_hyperparameters
    # HER settings (presumably Hindsight Experience Replay -- confirm).
    # Exposed through the `&HER_hyperparameters` anchor, which
    # `extra_hyperparameters` merges in.
    use_HER: true
    HER_strategy: 'final-1'
    HER_soft_update: true  # alternative: false
    HER_target_clamping: true

    # Goal lookup keys (judging by the key names, read from the `info`
    # dict -- confirm against the consumer).
    # The extraction function is set to None if left unspecified:
    # HER_extract_goal_from_info_fn: "None"
    HER_achieved_goal_key_from_info: 'grid'
    HER_target_goal_key_from_info: 'target_grid'
    HER_filtering_fn: 'None'

    # Training schedule.
    nbr_training_iteration_per_cycle: 40  # HER: 40
    nbr_episode_per_cycle: 16  # HER: 16 DQN needs removal.

THER_extra_hyperparameters: &THER_extra_hyperparameters
    # THER-specific settings, exposed through the
    # `&THER_extra_hyperparameters` anchor that `extra_hyperparameters`
    # merges in.
    THER_use_THER: true
    THER_use_predictor: true
    THER_predictor_policy_shared_phi: false

    THER_max_sentence_length: 128
    # NOTE(review): 'go', 'get', 'fetch' and 'a' appear more than once.
    # The list is reproduced verbatim and in order (33 entries) because the
    # consumer may depend on positions/length -- confirm before de-duplicating.
    THER_vocabulary: [
        'key', 'ball',
        'red', 'green', 'blue', 'purple', 'yellow', 'grey',
        'verydark', 'dark', 'neutral', 'light', 'verylight',
        'tiny', 'small', 'medium', 'large', 'giant',
        'get', 'go', 'fetch', 'go', 'get',
        'a', 'fetch', 'a', 'you', 'must', 'fetch', 'a', 'to', 'the', 'box',
    ]

    THER_use_PER: false
    THER_PER_alpha: 0.6
    THER_PER_beta: 1.0

    THER_replay_capacity: 50000
    THER_test_replay_capacity: 500
    THER_min_capacity: 32  # alternative: 1e4
    # Training every episode: 40... instead of every successful episode...
    # OVERRIDDEN by the next parameter (THER_train_on_success).
    THER_replay_period: 40
    THER_train_on_success: true
    THER_nbr_training_iteration_per_update: 40
    THER_predictor_accuracy_threshold: 0.7
    THER_predictor_test_train_split_interval: 10

    # NOTE(review): `1e-4` has no decimal point, so loaders that follow the
    # YAML 1.1 float pattern (e.g. PyYAML) return it as a string rather than
    # a float -- confirm that the consumer casts it.
    THER_predictor_learning_rate: 1e-4
    THER_predictor_batch_size: 128
    THER_gradient_clip: 10.0
    THER_weights_decay_lambda: 1.0  # alternative: 1e-6
  
extra_hyperparameters: &extra_hyperparameters
    # Shared agent defaults, exposed through the `&extra_hyperparameters`
    # anchor. Keys written explicitly in this mapping take precedence over
    # the ones pulled in by the merge key at the bottom.
    lr_account_for_nbr_actor: false
    weights_decay_lambda: 0.0
    weights_entropy_lambda: 0.0  # alternative: 0.001
    use_target_to_gather_data: false

    ####################################
    # New hyperparameters:
    PER_compute_initial_priority: false
    #####################################

    sequence_replay_use_online_states: true
    sequence_replay_use_zero_initial_states: false
    sequence_replay_store_on_terminal: true

    r2d2_loss_masking: true
    r2d2_loss_masking_n_step_regularisation: true
    r2d2_bellman_target_SAD: false

    burn_in: true
    sequence_replay_unroll_length: 80
    sequence_replay_overlap_length: 40
    sequence_replay_burn_in_length: 20

    sequence_replay_PER_eta: 0.9

    vdn: false
    vdn_nbr_players: 2

    # Merge the HER and THER settings with a single `<<` key: a mapping may
    # not repeat a key, and strict loaders reject a second `<<`. The two
    # merged mappings have no key in common, so their order in the list does
    # not change the result.
    <<: [*HER_hyperparameters, *THER_extra_hyperparameters]

LargeMLP: &LargeMLP
        # Architecture preset exposed through the `&LargeMLP` anchor:
        # 'MLP' phi body, no actor body ('None'), 'LSTM-RNN' critic body.
        phi_arch: 'MLP'
        actor_arch: 'None'
        critic_arch: 'LSTM-RNN'

        # Phi body (commented-out options kept for reference):
        # phi_arch_channels: ['BN32', 'BN64', 'BN64']
        # phi_arch_kernels: [8, 4, 3]
        # phi_arch_strides: [4, 2, 1]
        # phi_arch_paddings: [1, 1, 1]
        phi_arch_feature_dim: 128
        phi_arch_hidden_units: [512, 256]

        # Dictionary of keys living inside the 'infos' output of OpenAI Gym.
        # Each entry gives the expected shape of the extra input and the
        # location where the input should be stored in the framestate.
        # When a shape contains a string, parsing of the shape infers where
        # to fetch the value from.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']

            mode_id:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            previous_game_result:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']

            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        # critic_arch_feature_dim: 32
        critic_arch_hidden_units: [128, 128]

LargeCNN_SAD: &LargeCNN_SAD
        # Architecture preset exposed through the `&LargeCNN_SAD` anchor:
        # 'CNN' phi body, no actor body ('None'), 'LSTM-RNN' critic body.
        sad: false

        phi_arch: 'CNN'  # alternative: 'CNN-LSTM-RNN'
        actor_arch: 'None'
        critic_arch: 'LSTM-RNN'

        # Phi body.
        # `phi_arch_channels` was previously declared twice in a row.
        # Duplicate keys are invalid YAML; last-wins loaders silently kept
        # only the second declaration and strict loaders reject the file.
        # The shadowed 'BN...' variant is therefore kept as an explicit
        # commented-out alternative and the effective value is unchanged.
        # phi_arch_channels: ['BN32', 'BN64', 'BN64']
        phi_arch_channels: [32, 64, 64]
        phi_arch_kernels: [8, 4, 3]
        # phi_arch_kernels: [3, 3, 3]
        phi_arch_strides: [4, 2, 1]
        # phi_arch_strides: [2, 2, 1]
        phi_arch_paddings: [1, 1, 1]
        phi_arch_feature_dim: 512

        # phi_arch_feature_dim: 128
        # phi_arch_hidden_units: [512, 256]

        # Dictionary of keys living inside the 'infos' output of OpenAI Gym.
        # Each entry gives the expected shape of the extra input and the
        # location where the input should be stored in the framestate.
        # When a shape contains a string, parsing of the shape infers where
        # to fetch the value from.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']

            round_id:
                shape: [4]
                target_location: ['critic_body', 'extra_inputs']

            mode_id:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']
            previous_game_result:
                shape: [2]
                target_location: ['critic_body', 'extra_inputs']

            action_mask:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']
            legal_actions:
                shape: ['task.action_dim']
                target_location: ['head', 'extra_inputs']

        # Actor architecture:
        actor_arch_hidden_units: []
        # Critic architecture:
        # critic_arch_feature_dim: 32
        critic_arch_hidden_units: [128, 128]


r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0: &r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        # Base r2d2 configuration that the entries under `agents` merge in.
        # Keys written explicitly here take precedence over the ones pulled
        # in by the merge key at the bottom.
        # NOTE(review): the name says "LargeMLPLSTM" but the architecture
        # merged below is `LargeCNN_SAD` -- confirm this is intended.

        # 'observation_resize_dim': 56, #(56,56),
        observation_resize_dim: 32  # (56,56),

        dueling: true
        noisy: false
        n_step: 3

        use_PER: true
        PER_alpha: 0.9
        PER_beta: 0.6

        replay_capacity: 5242880  # in terms of experience #1e6
        # NOTE(review): `4e5` has no decimal point, so loaders that follow
        # the YAML 1.1 float pattern (e.g. PyYAML) return it as a string --
        # confirm that the consumer casts it.
        min_capacity: 4e5  # in terms of experiences... #1e4
        replay_period: 1

        actor_models_update_steps_interval: 10  # considering only 1 actor's steps.

        discount: 0.999
        use_cuda: true
        gradient_clip: 0.5
        batch_size: 128
        tau: 4.0e-4
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5

        epsstart: 1.0
        epsend: 0.1
        epsdecay: 10000
        eps_greedy_alpha: 7.0

        sequence_replay_use_online_states: true
        sequence_replay_use_zero_initial_states: false
        sequence_replay_store_on_terminal: false

        r2d2_loss_masking: true
        r2d2_loss_masking_n_step_regularisation: true

        burn_in: false
        sequence_replay_unroll_length: 40
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0

        sequence_replay_PER_eta: 0.9

        # Merge the architecture preset and the shared defaults with a single
        # `<<` key: a mapping may not repeat a key, and strict loaders reject
        # a second `<<`. The two merged mappings have no key in common, so
        # their order in the list does not change the result.
        <<: [*LargeCNN_SAD, *extra_hyperparameters]


experiment:
    # One entry per task; this file declares a single task.
    tasks:
        # env-id alternative: 'IGLUSilentBuilder-v0'
        - env-id: 'IGLUGridworld-v0'
          env-config:
              # action_space: "discrete"
              action_space: 'walking'
              vector_state: true
              target_in_obs: true

          run-id: 'TestTHER-NOTaskCurriculumWrapper+SoftUpdate+NbrItPerCycle40+NbrEpPerCycle16/Discrete+1Actors+NOCurrFakeReset+MFEL200+NOFake+Obs32+NOSparsePosReward+NOInvertedGoalPredReward+BlockDenseAct+LowerCam10/'

          # agent-id must match a key of the top-level `agents` mapping.
          # agent-id: '1step_0Ent_r2d2_AdamLR3m4_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPDNC_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone'
          # agent-id: '3step_0Ent_r2d2_AdamLR3m4_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPDNC_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone'
          agent-id: '3step_0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPDNC_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone'

          # nbr_actor alternatives: 128, 100, 64, 32, 16
          nbr_actor: 1
          nbr_frame_skipping: 1
          nbr_frame_stacking: 4
          # grayscale: True
          # single_life_episode: True  # False
          # nbr_max_random_steps: 30
          sad: false
          vdn: false
          otherplay: false
          clip_reward: false
          previous_reward_action: true
          curriculum_fake_reset: false
          task_curriculum: false
          # The following must be an int for
          # task_curriculum to work...:
          max_fake_episode_length: 20  # alternatives: 'None', 100
          sparse_positive_reward: false
          block_dense_actions: true
          inverted_goal_predicated_reward: false
          use_THER: true
          use_HER_reward: true
          use_OHE: true
          # observation_resize_dim: (21,21)  # (56,56)
          # observation_resize_dim: 56  # (56,56)
          # observation_resize_dim: 56
          # This value is DEPRECATED: use the one in agent config.
          # Also, halving it does increase 2 fold the processing speed...
          #
          reload: 'None'  # '/home/kevin/debug_IGLU/newest_agent.agent'
          reload_model: 'None'  # '/home/kevin/debug_IGLU/newest_agent.agent'

    experiment_id: 'r2d2_iglu_debug'
    benchmarking_episodes: 1
    # NOTE(review): the three exponent literals below carry no sign after
    # `e`, so loaders that follow the YAML 1.1 float pattern (e.g. PyYAML)
    # return them as strings -- confirm that the consumer casts them.
    benchmarking_interval: 1.0e10
    benchmarking_record_episode_interval: 2.0e0  # 'None' #1.0e1 #1.0e20
    # benchmarking_record_episode_interval: 1.0e20
    train_observation_budget: 1.0e6
    seed: 1
    
agents:
    3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone: &3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone
        # Starts from the r2d2 base configuration; every key listed below
        # overrides the merged value of the same name.
        <<: *r2d2_LargeMLPLSTM_SAD_graclip5m1_b128_tau4m4_lr6p25m5_L40_O10_B0
        actor_models_update_steps_interval: 1  # considering only 1 actor's steps.

        vdn: false
        vdn_nbr_players: 2

        batch_size: 32
        learning_rate: 6.25e-5
        adam_eps: 1.5e-5
        discount: 0.997
        # gradient_clip is not specified in r2d2 paper but in Ape-X,
        # and r2d2 paper says that missing hyper-param
        # are the same as ape-X
        gradient_clip: 5.0

        replay_capacity: 5e4  # 163840 #2e13*20 #5242880 # in terms of experience #1e6
        min_capacity: 2e4  # in terms of experiences... #1e4

        PER_compute_initial_priority: false
        # Quoted to make explicit that this is the string 'None' (a bare
        # None is not YAML null); the parsed value is unchanged.
        PER_beta_increase_interval: 'None'  # alternative: 2e5

        double: true
        dueling: true
        noisy: false
        n_step: 3
        tau: 4.0e-4

        sequence_replay_use_online_states: true
        sequence_replay_use_zero_initial_states: false
        sequence_replay_store_on_terminal: true

        r2d2_loss_masking: true
        r2d2_loss_masking_n_step_regularisation: true
        r2d2_bellman_target_SAD: false

        burn_in: false
        sequence_replay_unroll_length: 100
        sequence_replay_overlap_length: 0
        sequence_replay_burn_in_length: 0

        epsstart: 1.0
        epsend: 0.05
        epsdecay: 30000  # alternative: 1000000

        # ape-X and r2d2 keep epsilon constant over each actor
        # with a different value eps_i = base_eps**(1+\alpha*i/nbr_actors)
        # with base_eps=0.4 and \alpha = 7...
        eps_greedy_alpha: 7.0
    
    #3step_SAD_VDN_aID_1m3Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPLSTM2Res_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L150_O0_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    3step_0Ent_r2d2_AdamLR6d25m5_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPDNC_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
    #3step_0Ent_r2d2_AdamLR3m4_EPS1m12_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m1OVER3p4_A2m0_gamma997_LargeMLPDNC_GradClip5m0_r1p5Min3e4_a9m1_b6m1_ovrN_e9m1_tau4m4_RepP1_NOBURNIN_b128_L20_O10_B0_NOZeroInitSt_OnlineSt_StoreOnDone:
        # Agent selected by `experiment.tasks[0].agent-id`. It starts from
        # the 3-step r2d2 agent above; every key listed below overrides the
        # merged value of the same name.
        <<: *3step_r2d2_AdamLR6d25m5_EPS1d5m5_L2AModelUpdate1Steps_EPSgreedyAPEX1m0_4m2OVER3p4_gamma997_LargeMLPLSTM_GradClip5m0_r5p4Min2e4_alpha9m1_beta6m1_overNone_eta9m1_tau4m4_RepP1_NOBURNIN_b32_L100_O0_B0_NOZeroInitSt_StoreOnDone

        # DEBUG:
        use_HER: true

        min_capacity: 1e3

        weights_entropy_lambda: 0.0
        # weights_entropy_lambda: 0.1
        # weights_entropy_lambda: 0.001 #01

        vdn: false
        vdn_nbr_players: 2
        sad: false

        discount: 0.997
        # discount: 0.95
        learning_rate: 6.25e-5
        # adam_eps: 1.5e-5
        # learning_rate: 3.0e-4
        # adam_eps: 1.0e-8
        adam_eps: 1.0e-12
        # adam_eps: 1.0e-15

        replay_capacity: 5e4  # 163840 #2e13*20 #5242880 # in terms of experience #1e6
        # replay_capacity: 1e5
        # min_capacity: 3e4 #in terms of experiences... #1e4

        # n_step: 1
        n_step: 3
        # n_step: 7

        # tau: 4.0e-4
        # tau: 1.0e-5

        # sequence_replay_overlap_length: 0
        # sequence_replay_overlap_length: 50

        batch_size: 128

        burn_in: false
        # burn_in: True

        sequence_replay_unroll_length: 20
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 0
        # #sequence_replay_burn_in_length: 10

        # sequence_replay_unroll_length: 100
        # sequence_replay_overlap_length: 50
        # sequence_replay_burn_in_length: 0

        epsend: 0.4
        eps_greedy_alpha: 2.0

        # Architecture:
        # critic_arch: 'LSTM-RNN'
        # critic_arch_hidden_units: [512, 512]
        # critic_arch_hidden_units: [512]
        # use_relu_after_rnn: False

        # normal arch:
        # critic_arch: 'MLP-LSTM-RNN'
        # use_relu_after_rnn: True
        # #use_relu_after_rnn: False
        # critic_arch_feature_dim: 512
        # critic_arch_hidden_units: [512]

        # Arch2:
        # critic_arch: 'MLP-LSTM-RNN2'
        # use_relu_after_rnn: False #True
        # use_residual_connection: True
        # critic_arch_linear_hidden_units: [512, 256]
        # critic_arch_feature_dim: 128
        # critic_arch_hidden_units: [128, 128]

        # Arch DNC:
        # critic_arch: 'DNC'
        # critic_arch_feature_dim: 128
        # critic_arch_hidden_units: [128]
        # critic_arch_hidden_units: [128, 128]
        # DNC_sparse_K: 0

        # Arch 3 (the one currently active):
        critic_arch: 'MLP-LSTM-RNN2'
        use_relu_after_rnn: false  # alternative: True
        critic_arch_linear_hidden_units: [256, 128]
        critic_arch_linear_post_hidden_units: [128]
        critic_arch_feature_dim: 128
        critic_arch_hidden_units: [128]

        # Arch 4:
        # critic_arch: 'MLP-LSTM-RNN2'
        # use_relu_after_rnn: True
        # critic_arch_linear_hidden_units: [512, 256]
        # critic_arch_hidden_units: [256]
        # critic_arch_linear_post_hidden_units: [256]
        # critic_arch_feature_dim: 128

        # Additional body; the 'dialog' extra input below targets it by name.
        extra_bodies:
            chat_body:
                arch: 'EmbeddingRNN'
                vocab_size: 4096
                hidden_units: [256]
                embedding_size: 64

        # Replaces (merge keys are shallow) the `extra_inputs_infos` mapping
        # inherited through the merge above.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['critic_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['critic_body', 'extra_inputs']

            # Alternative key name: 'chat'
            dialog:
                shape: [256]  # 256 tokens ....
                target_location: ['chat_body', 'extra_inputs']

            # grid_ohe:
            #     shape: [6534]  # 1089
            #     target_location: ['critic_body', 'extra_inputs']

            # target_grid_ohe:
            #     shape: [6534]  # 1089
            #     target_location: ['critic_body', 'extra_inputs']

            # chat:
            #     shape: [256]
            #     target_location: ['critic_body', 'extra_inputs']

            # target_grid:
            #     shape: [1089]
            #     target_location: ['critic_body', 'extra_inputs']

            # action_mask:
            #     shape: ['task.action_dim']
            #     target_location: ['critic_body', 'extra_inputs']
            # legal_actions:
            #     shape: ['task.action_dim']
            #     target_location: ['head', 'extra_inputs']
        

        # extra_inputs_infos: {
        #      'previous_reward':{
        #          shape: [1,], 
        #          target_location: ['critic_body', 'dnc', 'dnc_controller', 'extra_inputs']
        #      },
        #      'previous_action':{
        #          shape: ['task.action_dim',], 
        #          target_location: ['critic_body', 'dnc', 'dnc_controller', 'extra_inputs']
        #      },
           
        #      'round_id':{
        #          shape: [2,], 
        #          target_location: ['critic_body', 'dnc', 'dnc_controller', 'extra_inputs']
        #      },
        #      'mode_id':{
        #          shape: [2,], 
        #          target_location: ['critic_body', 'dnc', 'dnc_controller', 'extra_inputs']
        #      },
        #      'previous_game_result':{
        #          shape: [2,], 
        #          target_location: ['critic_body', 'dnc', 'dnc_controller', 'extra_inputs']
        #      },
        #      
        #      #'action_mask':{
        #      #    shape: ['task.action_dim',], 
        #      #    target_location: ['critic_body', 'dnc', 'dnc_controller', 'extra_inputs']
        #      #},
        #      #'legal_actions':{
        #      #    shape: ['task.action_dim',], 
        #      #    target_location: ['head', 'extra_inputs']
        #      #},
        #  }
        
