# PPO training hyperparameters.
# NOTE(review): the meanings noted below are inferred from the key names —
# confirm against the training script that consumes this file.
ppo:
  # Iteration and batching schedule.
  n_iters: 201
  n_epochs: 10
  minibatch_size: 400
  n_mini_batches: 10
  # Presumably the PPO clip range, discount factor and GAE lambda.
  clip_epsilon: 0.2
  gamma: 0.99
  lmbda: 0.9
  # Learning rates and their lower bounds.
  # Exponent values are written with an explicit decimal point (3.0e-4, not
  # 3e-4): YAML 1.1 loaders such as PyYAML resolve the dot-less form as a
  # string, not a float.
  actor_lr: 3.0e-4
  min_actor_lr: 0
  critic_lr: 3.0e-4
  min_critic_lr: 2.0e-4
  lr_scheduler_enabled: true
  max_grad_norm: 1.0
  entropy_eps: 1.0e-3
  normalise_advantage: false
  # null means no checkpoint is given; presumably a path otherwise.
  start_from_checkpoint: null
# Experiment logging, evaluation and checkpointing settings.
logging:
  # Logging backend and the mode it runs in.
  type: wandb
  mode: online
  # NOTE(review): the intervals are presumably counted in training
  # iterations (cf. ppo.n_iters) — confirm against the consumer.
  evaluation_interval: 25
  evaluation_episodes: 5
  checkpoint_interval: 200
# Size settings for the policy's actor and critic.
# NOTE(review): depth is presumably the number of hidden layers and
# hidden_size the width of each — confirm against the model builder.
policy:
  actor:
    depth: 2
    hidden_size: 64
  critic:
    depth: 3
    hidden_size: 128