# Benchmark configuration for SeekerCircleEnv: run settings, environment
# options, and per-algorithm hyperparameters.
SeekerCircleEnv:
  steps: 200000  # Number of training steps
  train_iters: 10  # Number of models per configuration
  n_eval_ep: 10  # Number of deployments/evaluations per trained model
  max_episode_steps: 100  # Max number of steps per episode
  # NOTE(review): `randomize` and `randomize_env` are two separate keys; how
  # they differ is not visible in this file -- confirm against the consumer.
  randomize: true

  randomize_env: true
  # NOTE(review): presumably a negative reward term -- confirm against the env.
  punishment: -0.1
  find_seeds: true

  # Required by the base benchmark class.
  # NOTE(review): presumably [lower bounds, upper bounds] per input -- confirm.
  u_space:
    - [-1, -1]
    - [1, 1]

  # Unused by this env; the base benchmark class still expects these keys.
  x_goal: [0, 0, 0, 0, 0, 0]
  x_halfspace: [0, 0, 0, 0, 0, 0]
  x_lim_low: [0, 0, 0, 0, 0, 0]
  x_lim_high: [0, 0, 0, 0, 0, 0]

  # [x, z]: indicates the dimensions of the noise vector; the entries get
  # multiplied by noise_bound.
  noise_vector: [1, 1]
  # Indicates the dimensions of the noise set; the entries get multiplied by
  # noise_bound.
  noise_set:
    - [1, 1]
    - [-1, 1]
    - [1, -1]
    - [-1, -1]

  # G: input zonotope generator matrix. It has to be of shape (n, m), where n
  # is the number of inputs and m the number of generators (one generator per
  # column). Only one `G` may be active at a time (duplicate keys are invalid
  # YAML); the other variants are kept commented out for reference.
  #
  # G: [[0.0125, 0.0015],
  #     [0.0125, -0.0015]]
  # G: [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
  #     [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]
  # G: [[1, 0],
  #     [0, 1]] # eye
  # G: [[1, 0],
  #     [1, 1]] # parallelogram
  # G: [[1, 1, 1],
  #     [1, -1, 0]] # hexagon (3 generators)
  G: [[1, 1, 1, 0],
      [1, -1, 0, 1]] # octagon: 4 pairwise non-parallel generators (hexagon variant plus the generator [0, 1])
  # G: [[0.125, 0.015],
  #     [0.125, -0.015]] # first variant times 10

  algorithms:
    # Hyperparameters for the DQN algorithm.
    # NOTE(review): key names look like Stable-Baselines3 DQN arguments;
    # `network_size` / `activation_fn` are presumably mapped to the policy
    # network by the benchmark code -- confirm against the consumer.
    DQN:
      learning_rate: 0.0001
      learning_starts: 100
      batch_size: 64
      gamma: 0.99999
      train_freq: 2
      gradient_steps: 4
      max_grad_norm: 100
      activation_fn: "tanh"
      network_size: 64
      target_update_interval: 1000
      exploration_final_eps: 0.004
      # NOTE(review): if this is a fraction of the total training steps, then
      # with `steps: 200000` epsilon is annealed over only ~6 steps -- confirm
      # this is intended.
      exploration_fraction: 0.00003
      exploration_initial_eps: 0.137

    # Hyperparameters for the TD3 algorithm.
    TD3:
      learning_rate: 0.002
      buffer_size: 100000
      batch_size: 512
      gamma: 0.98
      train_freq: 5
      gradient_steps: 10
      # Action-noise settings: noise kind and its standard deviation.
      noise_type: 'normal'
      noise_std: 0.12
      network_size: 64
      activation_fn: "relu"

    # Hyperparameters for the PPO algorithm.
    # Ray
    # NOTE(review): the "Ray" marker above is unexplained in this file;
    # presumably the values marked "hp tune" came from (or are targets of) a
    # Ray Tune hyperparameter search -- confirm.
    PPO:
      batch_size: 128  # hp tune
      n_steps: 256  # hp tune
      gamma: 0.98
      learning_rate: 0.0008249279223383465 # hp tune
      ent_coef: 1.6573870807142476e-07 # hp tune
      clip_range: 0.1
      n_epochs: 8 # hp tune
      gae_lambda: 0.9
      network_size: 32
      activation_fn: "relu"
      normalize_advantage: true
      # For some reason, reducing log_std_init drastically increases the
      # computation time.
      log_std_init: -0.010177571643527838

    # Hyperparameters for the SAC algorithm.
    SAC:
      learning_rate: 0.0003
      buffer_size: 500000
      batch_size: 512
      ent_coef: 0.1
      train_freq: 32
      gradient_steps: 32
      gamma: 0.98
      tau: 0.01
      learning_starts: 1000
      use_sde: true
      network_size: 64
      # Same value as the A2C section's log_std_init.
      log_std_init: -3.67

    # Hyperparameters for the A2C algorithm.
    A2C:
      ent_coef: 0.0
      max_grad_norm: 0.5
      n_steps: 64
      gae_lambda: 0.9
      vf_coef: 0.4
      gamma: 0.9
      use_rms_prop: true
      normalize_advantage: false
      learning_rate: 0.00004
      network_size: 64
      # Same value as the SAC section's log_std_init.
      log_std_init: -3.67
