# Generic training-loop / HER hyperparameters. Exposed through the
# `extra_hyperparameters` anchor so that they can be merged into an agent
# configuration with `<<`.
extra_hyperparameters: &extra_hyperparameters
    lr_account_for_nbr_actor: false
    weights_decay_lambda: 1.0  # other value noted by the author: 1e-6
    use_target_to_gather_data: false
    goal_oriented: true
    goal_state_shared_arch: false
    goal_state_flattening: false  # other value noted by the author: true
    nbr_training_iteration_per_cycle: 40  # HER: 40
    nbr_episode_per_cycle: 16  # HER: 16 DQN needs removal.
    HER_use_latent: false  # other value noted by the author: true
    HER_target_clamping: false

# THER-specific hyperparameters (predictor, its replay storage and its
# training schedule). Exposed through the `THER_extra_hyperparameters` anchor
# so that they can be merged into an agent configuration with `<<`.
THER_extra_hyperparameters: &THER_extra_hyperparameters
    THER_use_THER: true
    THER_use_predictor: true
    THER_predictor_policy_shared_phi: false

    THER_max_sentence_length: 7
    # NOTE(review): some tokens occur more than once in this list ('go',
    # 'get', 'fetch', 'a', 'you'). The list is kept exactly as it was because
    # how the consumer indexes it is not visible here -- confirm whether the
    # repetitions are intended.
    THER_vocabulary: [
        'key', 'ball', 'red', 'green', 'blue', 'purple',
        'yellow', 'grey', 'verydark', 'dark', 'neutral', 'light', 'verylight',
        'tiny', 'small', 'medium', 'large', 'giant', 'get', 'go', 'fetch', 'go', 'get',
        'a', 'fetch', 'a', 'you', 'must', 'fetch', 'a', 'to', 'the', 'box',
        'then', 'after', 'you', 'and', 'pick', 'up', 'open', 'put', 'next', 'door',
        'on', 'your', 'left', 'right', 'in', 'front', 'of', 'behind',
    ]

    THER_use_PER: false
    THER_PER_alpha: 0.6
    THER_PER_beta: 1.0

    THER_replay_capacity: 50000
    THER_test_replay_capacity: 500

    THER_min_capacity: 32  # other value noted by the author: 1e4
    # Training (predictor) every X episodes... instead of every successful
    # episode... OVERRIDDEN by the next parameter...
    THER_replay_period: 40
    # Reduces the training speed from 10 it/s to 4 it/s...
    # If not training on success, once the accuracy threshold below is
    # reached, then back at 70 it/s, until the next training period...
    # So, training on success might ease the load/change of distribution, and
    # so the speed might come back faster to 70 it/s ?
    THER_train_on_success: true
    THER_nbr_training_iteration_per_update: 40
    THER_predictor_accuracy_threshold: 0.7
    # What data is used for training, or for testing: every X experience, we
    # add 1 test data.
    THER_predictor_test_train_split_interval: 10

    # Written with a decimal point on purpose: YAML 1.1 resolvers (e.g.
    # PyYAML) only recognise scientific notation as a float when the mantissa
    # contains a '.', so a bare `1e-4` would be loaded as the string '1e-4'.
    THER_predictor_learning_rate: 1.0e-4
    THER_predictor_batch_size: 128
    THER_gradient_clip: 10.0
    THER_weights_decay_lambda: 1.0  # other value noted by the author: 1e-6
    
# Architecture settings for the phi body, the goal phi body and the
# actor/critic heads. Exposed through the `LargeCNN` anchor.
#
# The values written 'None' are the literal string "None": an unquoted `None`
# is not YAML null, so quoting it only makes the loaded value explicit.
LargeCNN: &LargeCNN
    phi_arch: 'CNN-LSTM-RNN'
    goal_phi_arch: 'EmbedGRU'
    critic_arch: 'None'

    # Phi body.
    # NOTE(review): 'M' is a string entry among the channel counts; presumably
    # a layer marker interpreted by the consumer -- confirm.
    phi_arch_channels: [16, 'M', 32, 64]
    phi_arch_kernels: [2, 2, 2, 2]
    phi_arch_strides: [1, 1, 1, 1]
    phi_arch_paddings: [1, 1, 1, 1]

    phi_arch_feature_dim: 64  # LSTM inputs / CNN output dim: 64
    phi_arch_hidden_units: [64]  # LSTM hidden units: 64

    # Alternative (matching Predictor Decoder hidden size in shared arch):
    # phi_arch_feature_dim: 256  # LSTM inputs / CNN output dim
    # phi_arch_hidden_units: [256]  # LSTM hidden units

    # Actor architecture.
    actor_arch_hidden_units: []
    # Critic architecture.
    critic_arch_hidden_units: []

    # Goal phi body.
    goal_phi_arch_channels: 'None'
    goal_phi_arch_kernels: 'None'
    goal_phi_arch_strides: 'None'
    goal_phi_arch_paddings: 'None'
    goal_phi_arch_feature_dim: 'None'
    goal_phi_arch_hidden_units: [128]

    goal_phi_arch_embedding_size: 32

    # Goal critic architecture.
    goal_critic_arch_hidden_units: []


# Encoder/decoder architecture settings of the predictor. Exposed through the
# `Predictor` anchor.
#
# The values written 'None' are the literal string "None": an unquoted `None`
# is not YAML null, so quoting it only makes the loaded value explicit.
Predictor: &Predictor
    predictor_encoder_arch: 'CNN'
    predictor_decoder_arch: 'CaptionGRU'

    # Encoder.
    # NOTE(review): 'M' is a string entry among the channel counts; presumably
    # a layer marker interpreted by the consumer -- confirm.
    predictor_encoder_arch_channels: [16, 'M', 32, 256]
    predictor_encoder_arch_kernels: [2, 2, 2, 2]
    predictor_encoder_arch_strides: [1, 1, 1, 1]
    predictor_encoder_arch_paddings: [1, 1, 1, 1]
    predictor_encoder_arch_feature_dim: 256  # GRU inputs / CNN output dim
    # predictor_policy_shared_phi = True:
    predictor_encoder_arch_hidden_units: [256]  # GRU hidden units

    # Decoder.
    predictor_decoder_arch_channels: 'None'
    predictor_decoder_arch_kernels: 'None'
    predictor_decoder_arch_strides: 'None'
    predictor_decoder_arch_paddings: 'None'
    predictor_decoder_arch_feature_dim: 'None'
    # predictor_policy_shared_phi = True:
    predictor_decoder_arch_hidden_units: [64]
    # else:
    # predictor_decoder_arch_hidden_units: [256]

    predictor_decoder_embedding_size: 128


# Base agent configuration. Every entry under `agents` merges this mapping
# through the `THER_LargeCNN_Predictor` anchor and then overrides a few keys.
#
# The values written 'None' are the literal string "None": an unquoted `None`
# is not YAML null, so quoting it only makes the loaded value explicit.
THER_LargeCNN_Predictor: &THER_LargeCNN_Predictor
    double: false
    dueling: false
    noisy: false
    n_step: 1

    use_PER: false
    PER_alpha: 0.6
    PER_beta: 1.0

    replay_capacity: 50000
    min_capacity: 128  # other value noted by the author: 1e4
    replay_period: 240

    use_HER: true
    HER_strategy: 'final-1'  # other value noted by the author: 'future-4'

    observation_resize_dim: 'None'
    goal_resize_dim: 'None'

    discount: 0.98  # other value noted by the author: 0.99
    use_cuda: true
    gradient_clip: 1.0
    batch_size: 128  # other value noted by the author: 32
    tau: 1.0e-2  # THER paper: 1.0e-3
    # Other value noted by the author: 1.0e-5
    # (1e-4 predictor while 1e-5 network...)
    learning_rate: 1.0e-4
    adam_eps: 1.0e-8
    # NEED RMSProp optimizer...

    epsstart: 1.0
    epsend: 0.05
    epsdecay: 500000
    epsdecay_strategy: 'None'

    # A mapping may hold only ONE `<<` key: repeating it is a duplicate key,
    # which strict YAML loaders are expected to reject. Several sources are
    # therefore merged through a single `<<` holding a list. The four merged
    # mappings share no key names with each other or with the keys above, so
    # the merged result does not depend on the order of the list.
    <<:
        - *LargeCNN
        - *Predictor
        - *extra_hyperparameters
        - *THER_extra_hyperparameters

# Experiment description: the list of tasks to run plus the global
# benchmarking / recording / budget settings.
#
# Numbers in scientific notation are written as `<digits>.<digits>e+<exp>`:
# YAML 1.1 resolvers (e.g. PyYAML) require both the decimal point and an
# explicit exponent sign, otherwise values such as `1e2` or `1.0e10` are
# loaded as strings instead of floats.
experiment:
    tasks:
        - 'env-id': 'BabyAI-GoToLocal-v0'

          # Commented-out alternative run-ids:
          # 'run-id': 'B1M_EpPerCycle16_GSNotShared_final-1_lrPr1m4Net1m4/Seed4_venv_Max+Sk0_St4_ObsNone_ClipReward_Eps5p5End5m2_tau1000_GradClip1/Reward0p1_RelabellingTrajWhenFailure_WithEndGoalPrediction_AbsEps1e0_CELossOverWholeSentence_PredTrainEvery40Step_NoTargetClamping'
          # 'run-id': 'B1M_EpPerCycle16_GSNotShared_future-4_lrPr1m4Net1m4/Seed4_venv_Max+Sk0_St4_ObsNone_ClipReward_Eps5p5End5m2_tau1000_GradClip1_THER1p1/Reward0p1_RelabellingTrajWhenFailure_WithProperTerminal_WithEndGoalPrediction_AbsEps1e0_CELossUntilEoS_PredTrainEvery40Step_TrainOnSuccess_NoTargetClamping/SentL7_40MaxTrainPerUpdate_AccGoal70'
          # 'run-id': 'B1M_EpPerCycle16_GSNotShared_final-1_lrPr1m4Net1m4/Seed5_venv_Max+Sk0_St4_ObsNone_ClipReward_Eps5p5End5m2_tau1000_GradClip1_THER1p1/Reward0p1_RelabellingTrajWhenFailure_WithProperTerminal_WithEndGoalPrediction_AbsEps1e0_CELossUntilEoS_PredTrainEvery40Step_TrainOnSuccess_NoTargetClamping/SentL7_40MaxTrainPerUpdate_AccGoal70'
          # 'run-id': 'B1M/final-1/EpPerCycle16/lrPr1m4Net1m4/Seed1_venv_Max+Sk0_St4_ObsNone_ClipReward_Eps5p5End5m2_tau100_GradClip1_THER1p1/EnvReward0p1_PredicateEps1e0_NoTargetClamping/SentL7_40MaxTrainPerUpdate_AccGoal70_ProperTHERPredTrainPeriod'
          # 'run-id': 'B1M/final-1/EpPerCycle16/lrPr1m4Net1m4/Seed1_venv_Max+Sk0_St4_ObsNone_ClipReward_Eps5p5End5m2_tau100_GradClip1_THER1p1/EnvReward0p1_PredicateEps1e0_NoTargetClamping/SentL7_40MaxTrainPerUpdate_AccGoal70_ImProperTHERPredTrainPeriod'
          # 'run-id': 'B1M/final-1/EpPerCycle16/lrPr1m4Net1m4/Seed10_venv_Max+Sk0_St4_ObsNone_ClipReward_Eps5p5End5m2_tau100_GradClip1_THER1p1/EnvReward0p1_PredicateEps1e0_NoTargetClamping/SentL7_40MaxTrainPerUpdate_AccGoal70_THERPredTrainPeriodOnBufferedPeriodAndOnSuccess_MaxEntr1p0'
          # NOTE(review): the active run-id contains "ClipReward" while
          # 'clip_reward' below is false -- confirm the label is still accurate.
          'run-id': 'B1M/final-1/EpPerCycle16/lrPr1m4Net1m4/Seed100_venv_Max+Sk0_St4_ObsNone_ClipReward_Eps5p5End5m2_tau100_GradClip1_THER1p1/EnvReward0p1_PredicateEps1e0_NoTargetClamping/SentTrueL7_40MaxTrainPerUpdate_AccGoal70_THERPredTrainPeriodOnBufferedPeriodAndOnSuccess_MaxEntr1m1'

          # Commented-out alternative agent-ids:
          # 'agent-id': '1step_double_THER_LargeCNN'
          # 'agent-id': '1step_double_THER_LargeCNN_HER20TrainPerCycle_8EpPerCycle'
          # 'agent-id': '1step_double_THER_LargeCNN_HER40TrainPerCycle_8EpPerCycle'
          # 'agent-id': '1step_double_THER_LargeCNN_HER80TrainPerCycle_8EpPerCycle'
          # 'agent-id': '1step_double_THER_LargeCNN_HER40TrainPerCycle_16EpPerCycle'
          # 'agent-id': '1step_noisy_double_THER_LargeCNN_HER40TrainPerCycle_16EpPerCycle'
          # 'agent-id': '1step_prioritized_double_THER_LargeCNN_beta4m1_alpha_7m1_HER40TrainPerCycle_16EpPerCycle'
          # 'agent-id': '1step_prioritized_double_THER_LargeCNN_beta4m1_alpha_7m1_HER40TrainPerCycle_16EpPerCycle_NoPrioritizedPredTraining'
          # 'agent-id': '1step_double_THER_LargeCNN_HER40TrainPerCycle_16EpPerCycle_PredPiSharedPhi'
          # 'agent-id': '1step_double_THER_LargeCNN_HER40TrainPerCycle_32EpPerCycle'
          # 'agent-id': '1step_double_THER_LargeCNN_deepqnhyper'
          # 'agent-id': '1step_double_THER_LargeCNN_r5e4_THER_prioritized_beta4m1_alpha_7m1'
          # 'agent-id': '1step_prioritized_double_THER_LargeCNN_r1e5_beta4m1_alpha_7m1'
          # 'agent-id': '1step_noisy_double_THER_LargeCNN'
          # The active value matches a key of the top-level `agents` mapping.
          'agent-id': '1step_prioritized_double_THER_LargeCNN_beta4m1_alpha_7m1_HER40TrainPerCycle_16EpPerCycle_NoPrioritizedPredTraining_PredPiSharedPhi'

          'nbr_actor': 1
          'nbr_frame_skipping': 0
          'nbr_frame_stacking': 4
          'single_life_episode': false
          'nbr_max_random_steps': 0
          'clip_reward': false
          # Literal string "None" (an unquoted `None` is not YAML null).
          'observation_resize_dim': 'None'
          'goal_resize_dim': 'None'
    experiment_id: 'THER_Benchmark'
    benchmarking_episodes: 1
    benchmarking_interval: 1.0e+10
    benchmarking_record_episode_interval: 1.0e+8
    video_recording_episode_period: 1.0e+2
    train_observation_budget: 1.0e+6
    seed: 100

# Agent variants. Each entry merges the base `THER_LargeCNN_Predictor`
# configuration and overrides the keys listed under it. The trailing
# "# HER: N" comments are the author's reference values.
agents:
    1step_double_THER_LargeCNN:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1

    # COMPLETE FAILURE:
    1step_double_THER_LargeCNN_deepqnhyper:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        replay_period: 2

    1step_double_THER_LargeCNN_HER20TrainPerCycle_8EpPerCycle:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        nbr_training_iteration_per_cycle: 20  # HER: 40
        nbr_episode_per_cycle: 8  # HER: 16

    1step_double_THER_LargeCNN_HER40TrainPerCycle_8EpPerCycle:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        nbr_training_iteration_per_cycle: 40  # HER: 40
        nbr_episode_per_cycle: 8  # HER: 16

    1step_double_THER_LargeCNN_HER80TrainPerCycle_8EpPerCycle:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        nbr_training_iteration_per_cycle: 80  # HER: 40
        nbr_episode_per_cycle: 8  # HER: 16

    1step_double_THER_LargeCNN_HER40TrainPerCycle_16EpPerCycle:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        nbr_training_iteration_per_cycle: 40  # HER: 40
        nbr_episode_per_cycle: 16  # HER: 16

    1step_noisy_double_THER_LargeCNN_HER40TrainPerCycle_16EpPerCycle:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        noisy: true
        n_step: 1
        nbr_training_iteration_per_cycle: 40  # HER: 40
        nbr_episode_per_cycle: 16  # HER: 16

    1step_double_THER_LargeCNN_HER40TrainPerCycle_16EpPerCycle_PredPiSharedPhi:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        nbr_training_iteration_per_cycle: 40  # HER: 40
        nbr_episode_per_cycle: 16  # HER: 16
        THER_predictor_policy_shared_phi: true

    1step_prioritized_double_THER_LargeCNN_beta4m1_alpha_7m1_HER40TrainPerCycle_16EpPerCycle:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        use_PER: true
        PER_alpha: 0.7
        PER_beta: 0.4
        # replay_period: 2
        # batch_size: 16
        # Author's note on the two disabled overrides above: the paper uses
        # ratio = batch_size(=32) / replay_period(=4) = 8, but with the GPU
        # batch size as the bottleneck, batch_size=16 gives a better
        # batch-regularization-effect / speed trade-off on an NVIDIA 1080 Ti.
        # Expect ~90 it/sec without update and ~84 it/sec with updates,
        # whereas 32 / 4 yielded ~25 it/sec.
        THER_use_PER: true
        THER_PER_alpha: 0.7
        THER_PER_beta: 0.4
        nbr_training_iteration_per_cycle: 40  # HER: 40
        nbr_episode_per_cycle: 16  # HER: 16

    1step_prioritized_double_THER_LargeCNN_beta4m1_alpha_7m1_HER40TrainPerCycle_16EpPerCycle_NoPrioritizedPredTraining:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        use_PER: true
        PER_alpha: 0.7
        PER_beta: 0.4
        # replay_period: 2
        # batch_size: 16
        # Author's note on the two disabled overrides above: the paper uses
        # ratio = batch_size(=32) / replay_period(=4) = 8, but with the GPU
        # batch size as the bottleneck, batch_size=16 gives a better
        # batch-regularization-effect / speed trade-off on an NVIDIA 1080 Ti.
        # Expect ~90 it/sec without update and ~84 it/sec with updates,
        # whereas 32 / 4 yielded ~25 it/sec.
        THER_use_PER: false
        THER_PER_alpha: 0.7
        THER_PER_beta: 0.4
        nbr_training_iteration_per_cycle: 40  # HER: 40
        nbr_episode_per_cycle: 16  # HER: 16

    1step_prioritized_double_THER_LargeCNN_beta4m1_alpha_7m1_HER40TrainPerCycle_16EpPerCycle_NoPrioritizedPredTraining_PredPiSharedPhi:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        use_PER: true
        PER_alpha: 0.7
        PER_beta: 0.4
        # replay_period: 2
        # batch_size: 16
        # Author's note on the two disabled overrides above: the paper uses
        # ratio = batch_size(=32) / replay_period(=4) = 8, but with the GPU
        # batch size as the bottleneck, batch_size=16 gives a better
        # batch-regularization-effect / speed trade-off on an NVIDIA 1080 Ti.
        # Expect ~90 it/sec without update and ~84 it/sec with updates,
        # whereas 32 / 4 yielded ~25 it/sec.
        THER_use_PER: false
        THER_PER_alpha: 0.7
        THER_PER_beta: 0.4
        nbr_training_iteration_per_cycle: 40  # HER: 40
        nbr_episode_per_cycle: 16  # HER: 16
        THER_predictor_policy_shared_phi: true

    1step_double_THER_LargeCNN_HER40TrainPerCycle_32EpPerCycle:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        nbr_training_iteration_per_cycle: 40  # HER: 40
        nbr_episode_per_cycle: 32  # HER: 16

    1step_double_THER_LargeCNN_r5e4_THER_prioritized_beta4m1_alpha_7m1:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        use_PER: false
        PER_alpha: 0.7
        PER_beta: 0.4
        # replay_period: 2
        # batch_size: 16
        # Author's note on the two disabled overrides above: the paper uses
        # ratio = batch_size(=32) / replay_period(=4) = 8, but with the GPU
        # batch size as the bottleneck, batch_size=16 gives a better
        # batch-regularization-effect / speed trade-off on an NVIDIA 1080 Ti.
        # Expect ~90 it/sec without update and ~84 it/sec with updates,
        # whereas 32 / 4 yielded ~25 it/sec.
        THER_use_PER: true
        THER_PER_alpha: 0.7
        THER_PER_beta: 0.4

    # NOTE(review): the name mentions "r1e5", but `replay_capacity` is not
    # overridden here, so the base value (50000) applies -- confirm.
    1step_prioritized_double_THER_LargeCNN_r1e5_beta4m1_alpha_7m1:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        # noisy: true
        n_step: 1
        use_PER: true
        PER_alpha: 0.7
        PER_beta: 0.4
        # replay_period: 2
        # batch_size: 16
        # Author's note on the two disabled overrides above: the paper uses
        # ratio = batch_size(=32) / replay_period(=4) = 8, but with the GPU
        # batch size as the bottleneck, batch_size=16 gives a better
        # batch-regularization-effect / speed trade-off on an NVIDIA 1080 Ti.
        # Expect ~90 it/sec without update and ~84 it/sec with updates,
        # whereas 32 / 4 yielded ~25 it/sec.
        THER_use_PER: true
        THER_PER_alpha: 0.6
        THER_PER_beta: 1.0

    1step_noisy_double_THER_LargeCNN:
        <<: *THER_LargeCNN_Predictor
        double: true
        # dueling: true
        noisy: true
        n_step: 1
