# Shared hyperparameters, published through the `&extra_hyperparameters`
# anchor so that other mappings in this file can merge them in.
extra_hyperparameters: &extra_hyperparameters
    # Optimisation / regularisation switches.
    lr_account_for_nbr_actor: false
    weights_decay_lambda: 0.0
    weights_entropy_lambda: 0.0  # original trailing note: "01"
    use_target_to_gather_data: false

    # Sequence-replay settings (unroll / overlap / burn-in lengths).
    sequence_replay_use_zero_initial_states: false
    burn_in: true
    sequence_replay_unroll_length: 80
    sequence_replay_overlap_length: 40
    sequence_replay_burn_in_length: 20

    # Prioritisation mixing coefficient used with sequence replay.
    sequence_replay_PER_eta: 0.9
    

# Network architecture description, published through the `&LargeLSTMCNN`
# anchor so that agent templates can merge it in.
LargeLSTMCNN: &LargeLSTMCNN
        # Architecture selectors ('None' is the literal string, not a null).
        phi_arch: 'CNN-LSTM-RNN'
        actor_arch: 'None'
        critic_arch: 'None'

        # Phi body: the four per-layer lists below each hold three entries.
        phi_arch_channels: [32, 64, 64]
        phi_arch_kernels: [8, 4, 3]
        phi_arch_strides: [4, 2, 1]
        phi_arch_paddings: [1, 1, 1]
        phi_arch_feature_dim: 512
        phi_arch_hidden_units: [512]

        # Dictionary of keys living inside the 'infos' output of OpenAI Gym.
        # Each entry gives:
        #   shape:           expected shape of the extra input; a string item
        #                    tells the parser where to fetch the value from.
        #   target_location: where the input is stored in the frame state.
        extra_inputs_infos:
            previous_reward:
                shape: [1]
                target_location: ['phi_body', 'extra_inputs']
            previous_action:
                shape: ['task.action_dim']
                target_location: ['phi_body', 'extra_inputs']

        # Actor head: no hidden layers.
        actor_arch_hidden_units: []
        # Critic head: no hidden layers.
        critic_arch_hidden_units: []

        
# Base R2D2 agent template. Agent entries merge this anchor and override
# individual keys.
r2d2_LargeLSTMCNN_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20: &r2d2_LargeLSTMCNN_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20
        # Merge the architecture and the shared hyperparameters through ONE
        # merge key. Two separate `<<` entries in the same mapping are a
        # duplicate key, which strict loaders reject or resolve as last-wins.
        # The two anchors define disjoint keys, so list order does not matter
        # here; keys written explicitly below always override merged ones.
        <<: [*LargeLSTMCNN, *extra_hyperparameters]

        # Algorithm variants.
        dueling: false
        noisy: false
        n_step: 1

        # Prioritised experience replay.
        use_PER: false
        PER_alpha: 0.6
        PER_beta: 1.0

        # Replay buffer and update schedule.
        # NOTE(review): exponent literals without a '.' or exponent sign
        # (1e6, 1e3) are resolved as strings by YAML 1.1 loaders such as
        # PyYAML; confirm the consumer casts them before use.
        replay_capacity: 1e6
        min_capacity: 1e3
        replay_period: 1
        actor_models_update_optimization_interval: 4

        # Observation and optimisation settings.
        observation_resize_dim: 84
        discount: 0.99  # alternative tried: 0.997
        use_cuda: true
        gradient_clip: 0.5
        batch_size: 32
        tau: 1.0e-2
        learning_rate: 2.5e-4
        adam_eps: 1.0e-8

        # Epsilon-greedy exploration schedule.
        epsstart: 1.0
        epsend: 0.01     # alternative tried: 0.1
        epsdecay: 30000  # alternative tried: 1000000


# Experiment description: the list of tasks to run plus global run settings.
# Commented-out `env-id` / `run-id` / `agent-id` lines are kept as a log of
# earlier attempts; uncomment one (and comment the active one) to switch.
experiment:
    tasks:
        # Other environments tried:
        #   env-id: 'PongNoFrameskip-v4'
        #   env-id: 'BreakoutNoFrameskip-v4'
        - env-id: 'BattleZoneNoFrameskip-v4'

          # Earlier run identifiers:
          # run-id: 'Seed1_venv1_r2d2_Max+Sk4_St4_Obs84_Grayscale_RandNoOpStart30_SingleLife_ClipReward_Eps3p4End1m2_EntropyReg0_b16_RepP2_DEBUGNSTEP+ProperGammaLoss+CurrentActionLoss+DetachedRNNStates'
          # run-id: 'DEBUG_NOEXTRA_RNN_EXTRAINPUTS_InLossFn_Argmax_AlignedTargetActions_INM1Done_LossMasked/QReplayAction/CNN/ScalingFN/Seed1_venv1_r2d2_Max+Sk4_St4_Obs84_Grayscale_RandNoOpStart30_SingleLife_ClipReward_Eps3p4End1m3_EntropyReg0_WeightDecayReg0+ProperGammaLoss+CurrentActionLoss+Discount099+MinCap5e3/'
          # run-id: 'DEBUGGING/ActorUpdateInt4/QReplayAction/ScalingFN/Seed1_venv1_r2d2_Max+Sk4_St4_Obs84_Grayscale_RandNoOpStart30_SingleLife_ClipReward_Eps3p4End1m3_EntropyReg0_WeightDecayReg0+Discount0997/'
          # run-id: 'DEBUGGING/CrossingEpisodeBarriers/ProperTDs4Priori/PrioritizationAlphaFixed/ActorUpdateInt4/NonZeroInitialStateRNNOptim/QReplayAction/ScalingFN/Seed100_venv1_r2d2_Max+Sk4_St4_Obs84_Grayscale_RandNoOpStart30_SingleLife_ClipReward_Eps3p4End1m3_EntropyReg0_WeightDecayReg0+Discount0997/'
          run-id: 'DEBUGGING/ActorUpdateInt4/NonZeroInitialStateRNNOptim/ScalingFN/Seed100_venv1_r2d2_Max+Sk4_St4_Obs84_Grayscale_RandNoOpStart30_SingleLife_ClipReward_Eps3p4End1m3_EntropyReg0_WeightDecayReg0+Discount0997/'
          # Not learning because of QOnlineActions:
          # run-id: 'DEBUG6_Argmax_AlignedTargetActions_INM1Done_LossMasked/QOnlineAction/CNN/ScalingFN/Seed1_venv1_r2d2_Max+Sk4_St4_Obs84_Grayscale_RandNoOpStart30_SingleLife_ClipReward_Eps3p4End1m3_EntropyReg0_WeightDecayReg0+ProperGammaLoss+CurrentActionLoss+Discount099'

          # Earlier agent selections, with the author's observations:
          # agent-id: '5step_r2d2_LargeLSTMCNN_r1e5_beta4m1_tau1m3_RepP2_b16_L20_O10_B5'
          # agent-id: '5step_r2d2_LargeLSTMCNN_r1e5_beta4m1_tau1m3_RepP2_b2_L20_O10_B5'
          #
          # agent-id: '1step_r2d2_LargeLSTMCNN_r1e5_beta4m1_tau1m3_RepP2_b16_L2_O1_B1'
          # agent-id: '1step_r2d2_LargeLSTMCNN_r1e5_beta4m1_tau1m3_RepP2_b16_L10_O5_B5'
          # agent-id: '1step_r2d2_LargeLSTMCNN_r1e5_beta4m1_tau1m3_RepP2_NOBURNIN_b16_L2_O1_B0'
          # agent-id: '1step_r2d2_LargeLSTMCNN_r1e5_beta4m1_tau1m3_RepP32_BURNIN_b1_L80_O40_B40'
          #
          # agent-id: '1step_dueling_PER_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e6_alpha9m1_beta4m1_tau25m4_RepP64_BURNIN_b1_L80_O40_B40'
          #
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau25m4_RepP1_NOBURNIN_b16_L2_O1_B0'
          # agent-id: '1step_dueling_PER_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau25m4_RepP1_NOBURNIN_b16_L2_O1_B0'
          # Gamma of 0.99 fails to learn, compared to 0.997 that does:
          # agent-id: '1step_dueling_PER_r2d2_gamma999_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau25m4_RepP2_NOBURNIN_b16_L2_O1_B0'
          # Trying 0.999 to see if it leads to better data efficiency:
          # does not work with cuda benchmark seed100:
          # agent-id: '1step_dueling_PER_r2d2_gamma999_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau25m4_RepP2_NOBURNIN_b16_L2_O1_B0'
          # With cuda benchmark, 0.997 seems stable, but not data-efficient:
          # agent-id: '1step_dueling_PER_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau25m4_RepP2_NOBURNIN_b16_L2_O1_B0'
          # The following is slightly better without PER...
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau25m4_RepP2_NOBURNIN_b16_L2_O1_B0'
          # Better data-efficiency with smaller tau:
          # agent-id: '1step_noisy_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau1m3_RepP2_NOBURNIN_b16_L2_O1_B0'
          # Same data-efficiency with better stability, visibly:
          # agent-id: '1step_noisy_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP2_NOBURNIN_b16_L2_O1_B0'
          # Is the noisy aspect necessary? Nope, same results:
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP2_NOBURNIN_b16_L2_O1_B0'
          # Longer sequences: unstable:
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP2_NOBURNIN_b16_L8_O1_B0'
          # With longer replay period, adjusting batch size to yield an
          # effective minibatch size of 32: not learning...
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP8_NOBURNIN_b4_L8_O1_B0'
          # Reducing values:
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP4_NOBURNIN_b8_L4_O1_B0'
          # Comparing against using burn-in:
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP4_BURNIN_b8_L4_O1_B1'
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP4_BURNIN_b8_L4_O1_B2'
          # Doubling the burn-in and length resulted in a doubled
          # data-efficiency:
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP4_BURNIN_b8_L8_O4_B4'
          #
          # Decreasing the repetition is not helping:
          # agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP8_BURNIN_b8_L8_O4_B4'
          # What about using PER: apparently not useful...
          # agent-id: '1step_PER_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP4_BURNIN_b8_L8_O4_B4'
          # Fine-tuning the beta hyperparameter:
          # agent-id: '1step_PER_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_over1e5_tau5m4_RepP4_BURNIN_b8_L8_O4_B4'
          # Fine-tuning the beta hyperparameter: reducing alpha, beta, eta:
          # agent-id: '1step_PER_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha7m1_beta4m1_over1e4_eta9m1_tau5m4_RepP4_BURNIN_b8_L8_O4_B4'
          # agent-id: '1step_PER_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha7m1_beta4m1_over1e4_eta9m1_tau5m4_RepP4_BURNIN_b8_L16_O8_B8'
          # agent-id: '1step_PER_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha7m1_beta4m1_over1e4_eta9m1_tau5m4_RepP4_BURNIN_b8_L32_O16_B16'
          # Adding n-step=5 like in original R2D2:
          # agent-id: '5step_PER_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha7m1_beta4m1_over1e4_eta9m1_tau5m4_RepP4_BURNIN_b8_L8_O4_B4'
          #
          # agent-id: '1step_PER_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha6m1_beta4m1_over1e6_eta9m1_tau1m3_RepP4_BURNIN_b8_L8_O4_B4'
          # Active selection; must match a key under the top-level `agents`.
          agent-id: '1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_tau1m3_RepP4_NOBURNIN_b8_L8_O4_B4'
          #
          # agent-id: '20step_prioritized_double_dqn_LargeCNN_r1e5_beta4m1_tau1m3_RepP2_b16_L80_O40_B20'

          # Environment wrapping options.
          nbr_actor: 1  # alternative tried: 4
          nbr_frame_skipping: 4
          nbr_frame_stacking: 4
          grayscale: true
          single_life_episode: true
          nbr_max_random_steps: 30
          clip_reward: true
          previous_reward_action: true
          # YAML has no tuple literal: written as `(84,84)` inside the former
          # flow mapping, the comma split the value into the string '(84' and
          # a stray key '84)'. A two-element sequence carries the intended
          # pair.
          # NOTE(review): confirm the consumer accepts a list here (convert
          # it to a tuple on the Python side if one is required).
          observation_resize_dim: [84, 84]

    experiment_id: 'r2d2_debug'
    benchmarking_episodes: 1
    benchmarking_interval: 1.0e4
    benchmarking_record_episode_interval: 1.0e8
    train_observation_budget: 5.0e6  # alternatives tried: 3.0e5, 1.0e7
    seed: 100

agents:
    # Base template plus the overrides below (5-step returns, batch of 16).
    5step_r2d2_LargeLSTMCNN_r1e5_beta4m1_tau1m3_RepP2_b16_L80_O40_B20:
        <<: *r2d2_LargeLSTMCNN_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20

        # Learning targets.
        n_step: 5
        double: true
        tau: 1.0e-3
        # Not overridden here (template values apply):
        # dueling: true
        # noisy: true

        # Replay buffer.
        replay_capacity: 1e5
        use_PER: false
        PER_beta: 0.4

        # Update schedule.
        # The paper uses batch_size (32) / replay_period (4) = 8.
        # Author's note: on an NVIDIA 1080 Ti the GPU batch size is the
        # bottleneck, and batch_size=16 gave a better trade-off between the
        # batch-regularisation effect and speed: expect ~90 it/sec without
        # updates and ~84 it/sec with updates, whereas 32 / 4 yielded
        # ~25 it/sec.
        replay_period: 2
        batch_size: 16


    # Base template plus the overrides below (1-step, dueling, burn-in on,
    # short sequences).
    1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha9m1_beta4m1_tau5m4_RepP4_BURNIN_b8_L8_O4_B4:
        <<: *r2d2_LargeLSTMCNN_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20

        # Learning targets.
        n_step: 1
        double: true
        dueling: true
        noisy: false
        discount: 0.997
        gradient_clip: 0.0
        tau: 5.0e-4  # alternatives tried: 1.0e-3, 2.5e-3

        # Replay buffer.
        replay_capacity: 1e5
        min_capacity: 1e3
        use_PER: false
        PER_alpha: 0.9
        PER_beta: 0.4

        # Update schedule (here batch_size / replay_period = 8 / 4 = 2).
        # The paper uses batch_size (32) / replay_period (4) = 8.
        # Author's note, measured with batch_size=16 on an NVIDIA 1080 Ti:
        # the GPU batch size is the bottleneck and 16 gave a better trade-off
        # between the batch-regularisation effect and speed: expect
        # ~90 it/sec without updates and ~84 it/sec with updates, whereas
        # 32 / 4 yielded ~25 it/sec.
        replay_period: 4
        batch_size: 8

        # Sequence replay.
        burn_in: true
        sequence_replay_unroll_length: 8
        sequence_replay_overlap_length: 4
        sequence_replay_burn_in_length: 4

     
    # Base template plus the overrides below (1-step, dueling, PER off,
    # burn-in off).
    1step_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_tau1m3_RepP4_NOBURNIN_b8_L8_O4_B4:
        <<: *r2d2_LargeLSTMCNN_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20

        # Learning targets.
        n_step: 1
        double: true
        dueling: true
        noisy: false
        discount: 0.997
        gradient_clip: 0.0
        tau: 1.0e-3  # alternatives tried: 5.0e-4, 2.5e-3

        # Replay buffer and prioritisation settings (PER itself is disabled).
        replay_capacity: 1e5
        min_capacity: 1e3
        use_PER: false
        PER_alpha: 0.6  # alternative tried: 0.7
        PER_beta: 0.4
        PER_beta_increase_interval: 1e6  # alternative tried: 1e4
        sequence_replay_PER_eta: 0.9

        # Update schedule (here batch_size / replay_period = 8 / 4 = 2).
        # The paper uses batch_size (32) / replay_period (4) = 8.
        # Author's note, measured with batch_size=16 on an NVIDIA 1080 Ti:
        # the GPU batch size is the bottleneck and 16 gave a better trade-off
        # between the batch-regularisation effect and speed: expect
        # ~90 it/sec without updates and ~84 it/sec with updates, whereas
        # 32 / 4 yielded ~25 it/sec.
        replay_period: 4
        batch_size: 8

        # Sequence replay; the burn-in length is still set although
        # `burn_in` is off.
        burn_in: false
        sequence_replay_unroll_length: 8
        sequence_replay_overlap_length: 4
        sequence_replay_burn_in_length: 4



    # Base template plus the overrides below (1-step, dueling, PER on,
    # burn-in on).
    1step_PER_dueling_r2d2_gamma997_LargeLSTMCNN_NoGradClip_r1e5Min1e3_alpha6m1_beta4m1_over1e6_eta9m1_tau1m3_RepP4_BURNIN_b8_L8_O4_B4:
        <<: *r2d2_LargeLSTMCNN_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20

        # Learning targets.
        n_step: 1
        double: true
        dueling: true
        noisy: false
        discount: 0.997
        gradient_clip: 0.0
        tau: 1.0e-3  # alternatives tried: 5.0e-4, 2.5e-3

        # Replay buffer with prioritisation enabled.
        replay_capacity: 1e5
        min_capacity: 1e3
        use_PER: true
        PER_alpha: 0.6  # alternative tried: 0.7
        PER_beta: 0.4
        PER_beta_increase_interval: 1e6  # alternative tried: 1e4
        sequence_replay_PER_eta: 0.9

        # Update schedule (here batch_size / replay_period = 8 / 4 = 2).
        # The paper uses batch_size (32) / replay_period (4) = 8.
        # Author's note, measured with batch_size=16 on an NVIDIA 1080 Ti:
        # the GPU batch size is the bottleneck and 16 gave a better trade-off
        # between the batch-regularisation effect and speed: expect
        # ~90 it/sec without updates and ~84 it/sec with updates, whereas
        # 32 / 4 yielded ~25 it/sec.
        replay_period: 4
        batch_size: 8

        # Sequence replay.
        burn_in: true
        sequence_replay_unroll_length: 8
        sequence_replay_overlap_length: 4
        sequence_replay_burn_in_length: 4


    # Base template plus the overrides below (5-step returns, batch of 2,
    # sequences of 20).
    5step_r2d2_LargeLSTMCNN_r1e5_beta4m1_tau1m3_RepP2_b2_L20_O10_B5:
        <<: *r2d2_LargeLSTMCNN_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20

        # Learning targets.
        n_step: 5
        double: true
        tau: 1.0e-3
        # Not overridden here (template values apply):
        # dueling: true
        # noisy: true

        # Replay buffer.
        replay_capacity: 1e5
        use_PER: false
        PER_beta: 0.4

        # Update schedule (here batch_size / replay_period = 2 / 2 = 1).
        # The paper uses batch_size (32) / replay_period (4) = 8.
        # Author's note, measured with batch_size=16 on an NVIDIA 1080 Ti:
        # the GPU batch size is the bottleneck and 16 gave a better trade-off
        # between the batch-regularisation effect and speed: expect
        # ~90 it/sec without updates and ~84 it/sec with updates, whereas
        # 32 / 4 yielded ~25 it/sec.
        replay_period: 2
        batch_size: 2

        # Sequence replay.
        sequence_replay_unroll_length: 20
        sequence_replay_overlap_length: 10
        sequence_replay_burn_in_length: 5

    # Base template plus the overrides below (20-step returns, PER on,
    # batch of 16).
    20step_prioritized_double_r2d2_LargeLSTMCNN_r1e5_beta4m1_tau1m3_RepP2_b16_L80_O40_B20:
        <<: *r2d2_LargeLSTMCNN_obs84_graclip5m1_b32_tau1m2_lr25m5_L80_O40_B20

        # Learning targets.
        n_step: 20
        double: true
        tau: 1.0e-3
        # Not overridden here (template values apply):
        # dueling: true
        # noisy: true

        # Replay buffer with prioritisation enabled.
        replay_capacity: 1e5
        use_PER: true
        PER_beta: 0.4

        # Update schedule.
        # The paper uses batch_size (32) / replay_period (4) = 8.
        # Author's note: on an NVIDIA 1080 Ti the GPU batch size is the
        # bottleneck, and batch_size=16 gave a better trade-off between the
        # batch-regularisation effect and speed: expect ~90 it/sec without
        # updates and ~84 it/sec with updates, whereas 32 / 4 yielded
        # ~25 it/sec.
        replay_period: 2
        batch_size: 16
