[COMMON]
; Common parameters shared across all algorithms
; Comma-separated list with two entries, no spaces around the comma.
; NOTE(review): presumably the hidden-layer widths of the network - confirm
; against the loader, which must split this string itself.
hidden_dims = 64,64
; NOTE(review): train_episodes / train_steps look like the number of training
; episodes and the step budget per episode - confirm against the training loop.
train_episodes = 10000
train_steps = 50
; NOTE(review): presumably the reward discount factor - confirm.
gamma = 0.99

[PPO_BASE]
; Base PPO parameters that are the same across all PPO configurations.
; NOTE(review): how these are combined with [COMMON] and the [PPO.*] sections
; is decided by the loader, which is not part of this file - confirm.
gae_lambda = 0.95
value_coef = 0.5
entropy_coef = 0.0
max_grad_norm = 0.5
target_kl = 0.01
mini_batch_size = 64
n_epochs = 10
normalize_advantages = true
; ppo_n_steps and clip_ratio are the two keys that also appear as
; commented-out per-configuration values in the [PPO.*] sections.
ppo_n_steps = 256
clip_ratio = 0.1

[PPO.performance.scalarized]
; PPO / performance / scalarized configuration.
; learning_rate is the only active key in this section.
learning_rate = 1e-4
; Disabled values kept for reference; they have no effect while commented out.
; NOTE(review): 512 differs from ppo_n_steps = 256 in [PPO_BASE] - confirm
; which value is intended for this configuration.
; ppo_n_steps = 512
; clip_ratio = 0.1

[PPO.gap.scalarized]
; PPO / gap / scalarized configuration.
; learning_rate is the only active key in this section.
learning_rate = 1e-4
; Disabled values kept for reference; both equal the [PPO_BASE] values
; (ppo_n_steps = 256, clip_ratio = 0.1).
; ppo_n_steps = 256
; clip_ratio = 0.1

[PPO.aptitude.scalarized]
; PPO / aptitude / scalarized configuration.
; learning_rate is the only active key in this section.
learning_rate = 1e-4
; Disabled values kept for reference; both equal the [PPO_BASE] values
; (ppo_n_steps = 256, clip_ratio = 0.1).
; ppo_n_steps = 256
; clip_ratio = 0.1

[PPO.aptitude_gap_performance.scalarized]
; PPO / aptitude_gap_performance / scalarized configuration.
; learning_rate is the only active key in this section.
learning_rate = 1e-4
; Disabled values kept for reference; both equal the [PPO_BASE] values
; (ppo_n_steps = 256, clip_ratio = 0.1).
; ppo_n_steps = 256
; clip_ratio = 0.1

[PPO.aptitude_gap_performance.pareto_buffer_rm]
; PPO / aptitude_gap_performance / pareto_buffer_rm configuration.
; learning_rate is the only active key in this section.
learning_rate = 1e-4
; Disabled values kept for reference; they have no effect while commented out.
; NOTE(review): 0.2 differs from clip_ratio = 0.1 in [PPO_BASE] - confirm
; which value is intended for this configuration.
; ppo_n_steps = 256
; clip_ratio = 0.2

[A2C_BASE]
; Base A2C parameters - TO BE TUNED
; Differs from [PPO_BASE] on two shared keys: gae_lambda is 1.0 here
; (0.95 for PPO) and normalize_advantages is false here (true for PPO).
gae_lambda = 1.0
value_coef = 0.5
entropy_coef = 0.0
max_grad_norm = 0.5
normalize_advantages = false
; NOTE(review): rms_prop_eps is presumably only read when use_rms_prop is
; true - confirm against the optimizer setup.
use_rms_prop = true
rms_prop_eps = 1e-5
; a2c_n_steps also appears as a commented-out per-configuration value
; in the [A2C.*] sections.
a2c_n_steps = 128

[A2C.performance.scalarized]
; A2C / performance / scalarized configuration.
; learning_rate is the only active key in this section.
learning_rate = 3e-4
; Disabled value kept for reference.
; NOTE(review): 256 differs from a2c_n_steps = 128 in [A2C_BASE] - confirm.
; a2c_n_steps = 256

[A2C.gap.scalarized]
; A2C / gap / scalarized configuration.
; learning_rate is the only active key in this section.
learning_rate = 3e-4
; Disabled value kept for reference.
; NOTE(review): 64 differs from a2c_n_steps = 128 in [A2C_BASE] - confirm.
; a2c_n_steps = 64

[A2C.aptitude.scalarized]
; A2C / aptitude / scalarized configuration.
; learning_rate is the only active key in this section.
learning_rate = 3e-4
; Disabled value kept for reference.
; NOTE(review): 64 differs from a2c_n_steps = 128 in [A2C_BASE] - confirm.
; a2c_n_steps = 64

[A2C.aptitude_gap_performance.scalarized]
; A2C / aptitude_gap_performance / scalarized configuration.
; learning_rate is the only active key in this section.
learning_rate = 3e-4
; Disabled value kept for reference; equals a2c_n_steps = 128 in [A2C_BASE].
; a2c_n_steps = 128

[A2C.aptitude_gap_performance.pareto_buffer_rm]
; A2C / aptitude_gap_performance / pareto_buffer_rm configuration.
; learning_rate is the only active key in this section.
learning_rate = 3e-4
; Disabled value kept for reference.
; NOTE(review): 256 differs from a2c_n_steps = 128 in [A2C_BASE] - confirm.
; a2c_n_steps = 256

[SARSA_BASE]
; Base SARSA parameters
; NOTE(review): presumably an epsilon-greedy schedule - starting value,
; decay factor, and lower bound - confirm how and when the decay is applied.
epsilon = 0.1
epsilon_decay = 0.995
epsilon_min = 0.01

[SARSA.performance.scalarized]
; SARSA / performance / scalarized configuration; sets learning_rate only.
learning_rate = 1e-4

[SARSA.gap.scalarized]
; SARSA / gap / scalarized configuration; sets learning_rate only.
learning_rate = 1e-4

[SARSA.aptitude.scalarized]
; SARSA / aptitude / scalarized configuration; sets learning_rate only.
learning_rate = 1e-4

[SARSA.aptitude_gap_performance.scalarized]
; SARSA / aptitude_gap_performance / scalarized configuration;
; sets learning_rate only.
learning_rate = 1e-4

[SARSA.aptitude_gap_performance.reward_machine]
; SARSA / aptitude_gap_performance / reward_machine configuration;
; sets learning_rate only.
; NOTE(review): the PPO and A2C counterparts of this section end in
; pareto_buffer_rm, not reward_machine - confirm the different suffix is
; intentional and matches the name the loader looks up.
learning_rate = 1e-4