BaselineMLP: &BaselineMLP
    # Network architecture settings, reusable through the `BaselineMLP`
    # anchor. Every "phi" slot below holds the string 'None'; only the
    # actor head and the critic head name an architecture ('MLP').

    # ----------------------------------------------------------------
    # Actor model
    # ----------------------------------------------------------------
    actor_phi_arch: 'None'
    actor_goal_phi_arch: 'None'
    actor_head_arch: 'MLP'

    # MLP settings of the actor phi body (empty list here).
    actor_phi_arch_hidden_units: []

    # Actor head.
    actor_head_arch_hidden_units: [256, 256]

    # ----------------------------------------------------------------
    # Critic model
    # ----------------------------------------------------------------
    critic_phi_arch: 'None'
    critic_action_phi_arch: 'None'
    critic_goal_phi_arch: 'None'
    critic_head_arch: 'MLP'

    # MLP settings of the critic phi body (empty list here).
    critic_phi_arch_hidden_units: []

    # MLP settings of the critic action phi body (empty list here).
    critic_action_phi_arch_hidden_units: []

    # Critic head.
    critic_head_arch_hidden_units: [256, 256]
        

extra_hyperparameters: &extra_hyperparameters
    # Additional switches, reusable through the `extra_hyperparameters`
    # anchor. Trailing notes record alternative values left by the author.
    lr_account_for_nbr_actor: false
    weights_decay_lambda_actor: 0.0  # alternative: 1e-6
    weights_decay_lambda_critic: 0.0  # alternative: 1e-6
    use_target_to_gather_data: false
    goal_oriented: false
    goal_state_shared_arch: false
    goal_state_flattening: false  # alternative: True
    nbr_training_iteration_per_cycle: 1  # HER: 40 ; SAC: == replay_period
    # nbr_episode_per_cycle: 16  # HER: 16 DQN needs removal.
    HER_use_latent: false  # alternative: True
    HER_target_clamping: false

SAC_BaselineMLP: &SAC_BaselineMLP
    # Base SAC configuration, reusable through the `SAC_BaselineMLP`
    # anchor. Keys written explicitly in this mapping take precedence over
    # the keys pulled in by the merge key at the end.
    #
    # NOTE(review): exponent literals written without a decimal point
    # (1e6, 10e3, 5e-3) may be loaded as strings by YAML 1.1 resolvers such
    # as PyYAML's. Their spelling is left untouched here; confirm that the
    # consumer converts them before rewriting them.
    noisy: false
    n_step: 1

    use_PER: false
    PER_alpha: 0.6
    PER_beta: 1.0

    replay_capacity: 1e6
    replay_period: 1  # must be similar to nbr_training_iteration_per_cycle

    use_HER: false
    HER_strategy: 'final-1'  # alternative: 'future-4'

    min_capacity: 10e3
    actor_start_delay: 10e3

    # Unlike TD3, no update delay...
    actor_update_delay: 1
    ensemble_qnet_nbr_models: 2
    entropy_regularization_coefficient_alpha: 0.2
    alpha_tuning: true

    # NOTE(review): a plain `None` is not YAML null; it loads as the string
    # 'None'. Presumably the consumer checks for that string -- confirm.
    observation_resize_dim: None
    goal_resize_dim: None

    discount: 0.99
    use_cuda: true
    gradient_clip: 0.0  # alternative: 1.0
    batch_size: 256  # alternatives: 64, 32
    tau: 5e-3
    critic_learning_rate: 3.0e-4  # 1.0e-5 ; 1e-4 predictor while 1e-5 network...
    actor_learning_rate: 3.0e-4  # 1.0e-5 ; 1e-4 predictor while 1e-5 network...
    adam_eps: 1.0e-8

    # Pull in the network architecture and the extra switches with a single
    # merge key: a mapping must not repeat a key, so two separate `<<`
    # entries are replaced by one entry holding both aliases. The two merged
    # mappings have no key in common, so their order has no effect.
    <<: [*BaselineMLP, *extra_hyperparameters]
        
experiment:
    # Tasks to run, one sequence item per task.
    tasks:
        # Alternative environments: 'HalfCheetahBulletEnv-v0', 'Ant-v2'.
        - env-id: 'HalfCheetah-v2'

          run-id: 'B1M/TestNoDynamicRequiredGradSetting+CriticMSE/EntropyCoefficientTuningLog02+FailureBasedDone/BaselineStd+LessStableFormula+SeparateCompute&UpdateZeroGrad+ActionSamplingFromEnv/XavierUniformWHOLE+Bias0/EntropyRegAlpha2m1/ActorStart10p3TorchRand/MinCapacity10e3/ReplayPeriod1XNbrTrainIt1_UpdatePerReplay1_Cap1p6_b256/CriticLr3m4_ActorLr3m4_Seed1_tau5m3_GradClip0/WeightDecayCritic0Actor0/MSELOSS/'
          # run-id: 'B1M/BaselineStd+PerDimLogProb+Log/NOINITHiddenLayerFanInLayerInit+BiasUniform1m1+LastLayerUniform3m3+ActorSame/EntropyRegAlpha2m1/ActorStart10p3TorchRand/MinCapacity10e3/ReplayPeriod1XNbrTrainIt1_UpdatePerReplay1_Cap1p6_b256/CriticLr3m4_ActorLr3m4_Seed100_tau5m3_GradClip0/WeightDecayCritic0Actor0/MSELOSS/'

          # Spelled like one of the keys of the top-level `agents` mapping.
          agent-id: '1step_SAC_BaselineMLP'
          # agent-id: '1step_PER_SAC_BaselineMLP'

          nbr_actor: 1
          # Disabled per-task options:
          # nbr_frame_skipping: 0
          # nbr_frame_stacking: 4
          # single_life_episode: False
          # nbr_max_random_steps: 0
          # clip_reward: False
          observation_resize_dim: None
          goal_resize_dim: None

    experiment_id: '/home/kevin/debug_TD3/debug_SAC/'
    benchmarking_episodes: 10
    benchmarking_interval: 10.0e3

    # Deprecated:
    benchmarking_record_episode_interval: 1.0e8

    video_recording_episode_period_training: 1e2
    video_recording_episode_period_benchmarking: 5e0
    train_observation_budget: 1.0e6
    seed: 1

agents:
    # Each agent starts from the `SAC_BaselineMLP` anchor and overrides a
    # few keys; the two entries below differ only in `use_PER`.
    #
    # Author's note on the disabled `replay_period` / `batch_size`
    # overrides:
    #   Paper: ratio = batch_size(=32) / replay_period(=4) = 8,
    #   but the GPU batch-size bottleneck gives a better trade-off between
    #   batch-regularization effect and speed with batch_size=16 on an
    #   NVIDIA 1080 Ti: expect ~90 it/sec without update and ~84 it/sec
    #   with updates, whereas 32 / 4 yielded ~25 it/sec.

    1step_SAC_BaselineMLP:
        <<: *SAC_BaselineMLP
        # noisy: True
        n_step: 1
        use_PER: false
        PER_alpha: 0.7
        PER_beta: 0.4
        # replay_period: 2
        # batch_size: 16
        # nbr_training_iteration_per_cycle: 40  # HER: 40
        # nbr_episode_per_cycle: 16  # HER: 16

    1step_PER_SAC_BaselineMLP:
        <<: *SAC_BaselineMLP
        # noisy: True
        n_step: 1
        use_PER: true
        PER_alpha: 0.7
        PER_beta: 0.4
        # replay_period: 2
        # batch_size: 16
        # nbr_training_iteration_per_cycle: 40  # HER: 40
        # nbr_episode_per_cycle: 16  # HER: 16
