# PPO training hyperparameters.
# NOTE(review): the per-key notes below are inferred from the key names;
# confirm them against the code that consumes this file.
ppo:
  # Iteration and batching schedule.
  n_iters: 100
  n_epochs: 1
  minibatch_size: 500
  n_mini_batches: 2
  # Presumably the PPO surrogate clipping range — TODO confirm.
  clip_epsilon: 0.2
  # Presumably the discount factor and the GAE lambda — TODO confirm.
  gamma: 0.99
  lmbda: 0.9
  # Exponent-form floats carry an explicit decimal point (3.0e-4, not 3e-4):
  # YAML 1.1 loaders such as PyYAML load the dot-less form as a string,
  # not a float. The dotted form is a float under both YAML 1.1 and 1.2.
  actor_lr: 3.0e-4
  # NOTE(review): the actor floor is 0 while the critic floor is 2.0e-4;
  # confirm the asymmetry is intended.
  min_actor_lr: 0
  critic_lr: 3.0e-4
  min_critic_lr: 2.0e-4
  lr_scheduler_enabled: true
  max_grad_norm: 1.0
  entropy_eps: 1.0e-3
  normalise_advantage: false
  # Presumably a checkpoint to resume from; null leaves it unset — confirm.
  start_from_checkpoint: null
# Logging, evaluation and checkpointing settings.
# NOTE(review): the units of the *_interval keys are not visible in this
# file; presumably training iterations — confirm against the consumer.
logging:
  # Logger backend name and its mode; "disabled" presumably turns the
  # wandb backend off — TODO confirm.
  type: wandb
  mode: disabled
  evaluation_interval: 50
  evaluation_episodes: 2
  # NOTE(review): 999 is larger than ppo.n_iters (100). If this interval is
  # counted in training iterations, no periodic checkpoint is ever written
  # during a run — confirm this is intended.
  checkpoint_interval: 999
# Policy selection.
policy:
  # Version tag, kept as a quoted string literal.
  # NOTE(review): what the tag selects is not visible in this file.
  version: 'v1'