# PGA-ME uses one PGA emitter and one iso-line emitter.
# The PGA emitter internally maintains a critic and updates it periodically.
# When asked for solutions, it performs several gradient ascent steps on sampled actors to improve them.
# It also maintains a "best" actor that is trained alongside the critic.
# This best actor is always returned as the first element in the batch.

# Top-level settings for the PGA-ME method: archive layout, QD-score offset,
# and the batch/noise parameters of the iso-line and PGA emitters.
method: "pga_me"
# Archive type: "grid" or "cvt".
archive: "grid"

# For grid archives, this is the granularity of discretization.
# For CVT archives, this is the size of the archive.
archive_dim: 100
# Has no effect for pga-me. Written as canonical lowercase `false` so every
# YAML schema resolves it as a boolean.
init_from_pretrained: false

# Interpolated from `env.min_score`, which is defined outside this file.
qd_score_offset: ${env.min_score}

# isoline
iso_sigma: 0.005
line_sigma: 0.05
isoline_batch_size: 32

# pga
# When the archive is empty, solutions are sampled from a Gaussian centered
# at x0 with std sigma0.
pga_sigma0: 0.05
# The first few asks output random solutions.
init_iters: 5
pga_batch_size: 64

# TD3 hyperparameters: critic training, target-network updates, and the
# gradient-ascent step used to improve policies.
td3:
  # presumably the capacity of the replay buffer sampled below — TODO confirm
  buffer_size: 5000000
  # Number of iterations to train the critics on each call to train_critics().
  train_critics_itrs: 512
  # Batch size for sampling from replay buffer.
  batch_size: 256
  # Batch size for calculating policy gradient.
  pg_batch_size: 256
  # Discount factor.
  discount: 0.99
  # Known as tau in TD3 paper.
  target_update_rate: 0.005
  # How often (in terms of training itrs) to update target networks / actor.
  target_update_freq: 5
  # Noise for smoothing the critic training - see section 5.3 of TD3 paper.
  # NOTE(review): the key says "variance", but TD3 reference implementations
  # use 0.2 as a noise standard deviation — confirm how the consumer reads it.
  smoothing_noise_variance: 0.2
  smoothing_noise_clip: 0.5
  # Learning rate for Adam optimizers for training critics and greedy actors.
  # Written with an explicit decimal point so YAML 1.1 loaders (e.g. PyYAML)
  # resolve it as a float instead of the string "3e-4".
  adam_learning_rate: 3.0e-4
  # Number of steps to take when improving a policy with gradient_ascent.
  gradient_steps: 10
  # Learning rate for Adam optimizer in gradient_ascent.
  # Explicit decimal point for the same float-resolution reason as above.
  gradient_learning_rate: 1.0e-3
  # Dimensions are interpolated from the `env` config, defined outside this file.
  state_dim: ${env.state_dim}
  action_dim: ${env.action_dim}


