# PPO training hyperparameters.
# NOTE(review): this file only holds values; the exact meaning of each key is
# defined by the code that consumes it. Descriptions below are inferred from
# the key names and should be confirmed against the training script.
ppo:
  # Loop sizes (presumably outer iterations / optimisation epochs per iteration).
  n_iters: 4000
  n_epochs: 5
  # Presumably samples per minibatch and minibatches per batch — TODO confirm.
  minibatch_size: 500
  n_mini_batches: 10
  clip_epsilon: 0.2
  gamma: 0.99
  lmbda: 0.9
  # Scientific-notation values are written with an explicit mantissa dot
  # (`3.0e-4`, not `3e-4`): YAML 1.1 loaders such as PyYAML resolve the
  # dot-less form to a *string*, while the dotted form is a float everywhere.
  actor_lr: 3.0e-4
  min_actor_lr: 0
  critic_lr: 3.0e-4
  min_critic_lr: 2.0e-4
  # Presumably toggles learning-rate decay towards the min_*_lr values above.
  lr_scheduler_enabled: true
  max_grad_norm: 1.0
  entropy_eps: 1.0e-3
  normalise_advantage: false
  # Explicit null: no checkpoint is configured here.
  start_from_checkpoint: null
# Experiment logging, evaluation and checkpointing settings.
# NOTE(review): units of the *_interval keys are not visible in this file
# (presumably training iterations) — confirm against the consumer.
logging:
  type: wandb
  mode: online
  # Alternative value for `mode`, kept commented out for quick switching:
  # mode: disabled
  evaluation_interval: 50
  evaluation_episodes: 5
  checkpoint_interval: 200
# Policy / network selection.
policy:
  # Quoted so the value is always loaded as a string.
  version: "v2"
  # Presumably forwarded as keyword arguments to the critic's constructor;
  # the accepted keys are defined by the consuming code — TODO confirm.
  critic_kwargs:
    model_channels: 64