# Base settings.
# `name` identifies this configuration.
# NOTE(review): presumably used to label runs/outputs — confirm against the loader.
name: lp3_const

# Reinforcement-learning hyperparameters.
#
# Exponent-form floats are written with an explicit decimal point (3.0e-4,
# not 3e-4): YAML 1.1 loaders such as PyYAML only resolve a scalar to a float
# when it contains a ".", so a bare `3e-4` is loaded as the string "3e-4".
norm_obs: true
norm_reward: false
discount_factor: 0.99
len_replay_buffer: 100000
n_update_steps: 10000
n_steps: 1000
gae_coeff: 0.97
max_grad_norm: 1.0
critic_lr: 3.0e-4
n_critic_iters: 20
n_action_samples: 20
# NOTE(review): 50 equals n_critics * n_quantiles (2 * 25) of the critics
# under `model`; confirm whether these values must be kept in sync.
n_target_quantiles: 50

# Preference settings.
#
# The learning rate is written as 3.0e-4 (not 3e-4): YAML 1.1 loaders such as
# PyYAML only resolve a scalar to a float when it contains a ".", so a bare
# `3e-4` is loaded as the string "3e-4".
preference_lr: 3.0e-4
max_reward_preference: 0.01
n_prefer_samples: 10

# Constraint settings.
# Three threshold values.
# NOTE(review): presumably one per cost constraint, in the same order as the
# environment's cost signals — confirm against the consumer.
con_thresholds: [0.025, 0.025, 0.4]

# Trust-region update settings.
# NOTE(review): the key names suggest a KL-bounded step solved with conjugate
# gradient (with damping) followed by a decaying line search — confirm against
# the optimizer code.
max_kl: 0.001
damping_coeff: 0.01
num_conjugate: 10
line_decay: 0.8

# Logging: metric names grouped into three lists.
# NOTE(review): the group names suggest task-independent, reward-dependent and
# cost-dependent metrics — confirm against the logger.
logging:
    task_indep:
        - fps
        - eplen
        - objective
        - kl
        - beta
        - entropy
        - cost_critic_loss
        - reward_critic_loss
    reward_dep:
        - reward_sum
    cost_dep:
        - constraint
        - cost_preferences
        - cost_sum

# Model definitions: one entry per network.
# NOTE(review): `shape` presumably lists hidden-layer widths — confirm against
# the model builder.
model:
    # Policy network.
    actor:
        mlp:
            shape: [512, 512]
            activation: LeakyReLU
        last_activation: Tanh
        use_action_bound: true
        log_std_init: -2.0
        log_std_fix: true

    # Reward critic.
    reward_critic:
        mlp:
            shape: [512, 512]
            activation: LeakyReLU
            normalization: layer_norm
        # Both bounds are strings, not YAML floats (YAML's own spelling of
        # infinity is `.inf`); they are quoted to make that explicit.
        # NOTE(review): presumably evaluated by the consumer with `np` in
        # scope — confirm before changing the spelling.
        clip_range: ['-np.inf', 'np.inf']
        n_critics: 2
        n_quantiles: 25

    # Cost critic; same settings as the reward critic above.
    cost_critic:
        mlp:
            shape: [512, 512]
            activation: LeakyReLU
            normalization: layer_norm
        # String bounds, quoted for the same reason as in reward_critic.
        clip_range: ['-np.inf', 'np.inf']
        n_critics: 2
        n_quantiles: 25

    # Cost-preference network.
    cost_preferences:
        mlp:
            # One-element list.
            shape: [512]
            activation: LeakyReLU
        init_value: 0.01
        max_value: 0.1
