# base
# Identifier of this configuration. How the consumer uses it (e.g. to select
# the agent class or to name output directories) is not visible in this file
# -- TODO confirm against the code that loads it.
name: pd_morl_const

# for RL
# Training hyperparameters consumed by the agent code (not visible from this
# file). Any meaning beyond the key names should be confirmed in the consumer.
norm_obs: true
norm_reward: false
discount_factor: 0.99
n_steps: 100
batch_size: 256
len_replay_buffer: 1000000
max_grad_norm: 1.0
soft_update_ratio: 0.995
# Learning rates carry an explicit decimal point on purpose: YAML 1.1 loaders
# (e.g. PyYAML) resolve a bare `3e-4` as the string "3e-4" instead of a float,
# whereas `3.0e-4` is a float under both YAML 1.1 and YAML 1.2.
actor_lr: 3.0e-4
critic_lr: 3.0e-4
n_actor_update_iters: 10
n_critic_update_iters: 10
n_HER_samples: 3
# Noise settings; presumably exploration noise and target-policy smoothing
# noise (scale and clip) -- TODO confirm against the agent implementation.
explore_action_noise_scale: 0.1
target_action_noise_scale: 0.2
target_action_noise_clip: 0.5
dir_angle_loss_coeff: 10.0

# for constraint
# One threshold per constraint (a single entry here); presumably the list
# length must match the number of cost signals -- TODO confirm.
con_thresholds: [0.025]
# Explicit decimal point so the value is a float under YAML 1.1 loaders too
# (PyYAML resolves a bare `1e-5` as the string "1e-5").
con_lambdas_lr: 1.0e-5

# for logging
# Names of the metrics to record, grouped under three keys. The grouping
# semantics (task-independent / per-reward / per-cost) are inferred from the
# key names only -- confirm against the logger.
logging:
    task_indep:
        - fps
        - eplen
        - entropy
        - actor_loss
        - reward_critic_loss
        - cost_critic_loss
    reward_dep:
        - reward_sum
    cost_dep:
        - cost_sum
        - con_lambdas
        - cost_values

# for model
# Network settings, one sub-mapping per network. Each has an `mlp` entry with
# the hidden-layer widths (`shape`) and the activation name; how these strings
# are resolved to layers is up to the consumer (not visible here).
model:
    # Policy network.
    actor:
        mlp:
            shape: [512, 512]
            activation: LeakyReLU
    # Critic for the reward signal(s).
    reward_critic:
        mlp:
            shape: [512, 512]
            activation: LeakyReLU
            normalization: layer_norm
        # NOTE(review): `-np.inf` / `np.inf` are plain YAML strings, not
        # floats; presumably the consumer evaluates them -- confirm before
        # replacing them with YAML's native `-.inf` / `.inf`.
        clip_range: [-np.inf, np.inf]
        n_critics: 2
        n_quantiles: 25
    # Critic for the cost signal(s); same settings as the reward critic.
    cost_critic:
        mlp:
            shape: [512, 512]
            activation: LeakyReLU
            normalization: layer_norm
        # NOTE(review): string-valued bounds, see the note on
        # reward_critic.clip_range in this stanza.
        clip_range: [-np.inf, np.inf]
        n_critics: 2
        n_quantiles: 25
    # Multiplier network; presumably the Lagrange multipliers for the
    # constraints -- TODO confirm.
    multipliers:
        mlp:
            # Single hidden layer of width 512.
            shape: [512]
            activation: LeakyReLU
        init_value: 1.0
        clip_range: [0.0, 1000.0]
