# === Devices ===
# Device on which the environments are stepped (they run on CPU).
sampling_device: 'cpu'
# Device used for training; set to 'cuda' if using a GPU.
train_device: 'cpu'

# ================= Training termination / returns =================
# Total frame budget for the run. Written without a digit separator because
# YAML 1.2 parsers read `500_000` as a string rather than an integer.
# NOTE(review): the earlier comment said "≈ 2M steps total", which does not
# match this value — confirm the intended budget.
max_n_frames: 500000
# Discount factor; kept high for long horizons.
gamma: 0.99
# Enable generalized advantage estimation (GAE).
use_gae: true
# GAE lambda: bias-variance trade-off.
gae_lambda: 0.95

# ================= Rollout collection ==================
# Frames gathered per batch, i.e. the rollout length per update.
on_policy_collected_frames_per_batch: 2048
# Number of parallel environments per worker.
on_policy_n_envs_per_worker: 16

# ================ PPO-style update ==================
# Epochs per batch.
on_policy_n_minibatch_iters: 10
# Samples per minibatch.
on_policy_minibatch_size: 256
# PPO clipping epsilon.
clip_param: 0.2
# Optional: clip value-function updates.
value_clip_param: 0.2
# Entropy bonus coefficient.
entropy_coef: 0.01
# Weight of the value-function loss.
value_loss_coef: 0.5
# Adam step size. The mantissa carries an explicit ".0" because YAML 1.1
# loaders such as PyYAML resolve a bare `3e-4` as the string "3e-4".
learning_rate: 3.0e-4
# Gradient clipping threshold.
max_grad_norm: 0.5


# ============= LR / clip schedules ==============
# Booleans use canonical lowercase so every YAML schema resolves them as bool.
# Anneal the learning rate to 0 over training.
use_linear_lr_decay: true
# Anneal clip_param as well.
use_linear_clip_decay: true

# ================= Logging / eval =================
# Booleans use canonical lowercase so every YAML schema resolves them as bool.
evaluation: false
# Per the original note, this value does not matter for eval-only runs that
# set max_n_frames=0.
# NOTE(review): 128 is smaller than on_policy_collected_frames_per_batch
# (2048) — confirm this is a valid interval before enabling evaluation.
evaluation_interval: 128
evaluation_episodes: 20
render: false
loggers: [csv]
# NOTE(review): only the csv logger is listed above, so this directory is
# presumably unused unless a tensorboard logger is added — confirm.
tensorboard_logdir: tb

# --- info aggregation -------------------------------------------------
# Statistics to aggregate; any of: mean, min, max, sum, std.
info_stats: [mean, min, max]

# === Policy sharing ===
# Policy parameter sharing is turned off. Canonical lowercase boolean so every
# YAML schema resolves it as bool.
share_policy_params: false

# === Checkpointing ===
# 0 disables intermediate checkpoints.
checkpoint_interval: 0
# Save only the final model (canonical lowercase boolean).
checkpoint_at_end: true
# Keep only the final checkpoint.
keep_checkpoints_num: 1

