# Experiment configuration for the SeekerCircleEnv benchmark.
SeekerCircleEnv:
  steps: 200000  # Number of training steps
  train_iters: 10  # Number of models per configuration
  n_eval_ep: 10  # Number of deployments/evaluations per trained model
  max_episode_steps: 100  # Max number of steps per episode
  randomize: true

  # NOTE(review): `randomize` (above) and `randomize_env` are both set; confirm
  # that the consumer reads both and that one is not a leftover duplicate.
  randomize_env: true
  punishment: -0.1  # presumably a reward penalty; confirm against the env
  find_seeds: true

  # Required by the base benchmark class.
  # presumably row 0 holds the lower and row 1 the upper input bounds;
  # TODO confirm against the base benchmark class.
  u_space:
    - [-1, -1]
    - [1, 1]

  # Not used by this env, but the base benchmark class requires these keys,
  # so each one is filled with an all-zero placeholder of length 6.
  x_goal: [0, 0, 0, 0, 0, 0]
  x_halfspace: [0, 0, 0, 0, 0, 0]
  x_lim_low: [0, 0, 0, 0, 0, 0]
  x_lim_high: [0, 0, 0, 0, 0, 0]

  # Noise template ordered [x, z]. Its length gives the number of dimensions
  # of the noise vector; the entries get multiplied by noise_bound
  # (noise_bound is not defined in this section).
  noise_vector: [1, 1]
  # Noise-set template: one row per sign combination of a 2-D vector. It gives
  # the dimensions of the noise set and also gets multiplied by noise_bound.
  noise_set:
    - [1, 1]
    - [-1, 1]
    - [1, -1]
    - [-1, -1]

  # G is the input zonotope generator matrix. It has to be of shape (n, m),
  # where n is the number of inputs and m the number of generators (so each
  # column is one generator). Exactly one variant below should be active; the
  # commented-out ones are kept for quick switching.
  # G: [[0.0125, 0.0015],
  #     [0.0125, -0.0015]]
  # G: [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
  #     [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]
  # G: [[1, 0],
  #     [0, 1]] # eye
  # G: [[1, 0],
  #     [1, 1]] # parallelogram
  # G: [[1, 1, 1],
  #     [1, -1, 0]] # hexagon (3 pairwise non-parallel generators)
  G: [[1, 1, 1, 0],
      [1, -1, 0, 1]] # octagon (hexagon variant plus a 4th generator [0, 1])
  # G: [[0.125, 0.015],
  #     [0.125, -0.015]] # first variant times 10

  # Per-algorithm hyperparameters, keyed by algorithm name.
  algorithms:
    # Hyperparameters for the DQN entry.
    DQN:
      learning_rate: 0.0001
      learning_starts: 100
      batch_size: 64
      gamma: 0.99999
      train_freq: 2
      gradient_steps: 4
      max_grad_norm: 100
      activation_fn: 'tanh'
      network_size: 64
      target_update_interval: 1000
      # NOTE(review): if exploration_fraction is the share of `steps` (200000)
      # over which epsilon is annealed from the initial to the final value,
      # the schedule ends after ~6 steps, i.e. before learning_starts (100).
      # Confirm this is intended.
      exploration_final_eps: 0.004
      exploration_fraction: 0.00003
      exploration_initial_eps: 0.137

    # Hyperparameters for the TD3 entry.
    TD3:
      learning_rate: 0.002
      buffer_size: 100000
      batch_size: 512
      gamma: 0.98
      train_freq: 5
      gradient_steps: 10
      # Action-noise settings: noise kind and its standard deviation.
      noise_type: 'normal'
      noise_std: 0.12
      network_size: 64
      activation_fn: 'relu'

    # Hyperparameters for the PPO entry.
    # Entries marked `hp tune` presumably come from (or are candidates for)
    # hyperparameter tuning -- TODO confirm which.
    PPO:
      batch_size: 8  # hp tune
      n_steps: 32  # hp tune
      gamma: 0.98
      learning_rate: 5.431623921337594e-05  # hp tune
      ent_coef: 4.708649348532251e-05  # hp tune
      clip_range: 0.1
      n_epochs: 4
      gae_lambda: 0.9
      network_size: 32
      activation_fn: 'relu'
      normalize_advantage: true
      log_std_init: -1.1833325098189942  # hp tune

    # Hyperparameters for the SAC entry.
    SAC:
      learning_rate: 0.0003
      buffer_size: 500000
      batch_size: 512
      # Fixed numeric entropy coefficient (not a string such as an "auto" mode).
      ent_coef: 0.1
      train_freq: 32
      gradient_steps: 32
      gamma: 0.98
      tau: 0.01
      learning_starts: 1000
      use_sde: true
      network_size: 64
      # Same value as A2C's log_std_init in this file.
      log_std_init: -3.67

    # Hyperparameters for the A2C entry.
    A2C:
      ent_coef: 0.0
      max_grad_norm: 0.5
      n_steps: 64
      gae_lambda: 0.9
      vf_coef: 0.4
      gamma: 0.9
      use_rms_prop: true
      normalize_advantage: false
      learning_rate: 0.00004
      network_size: 64
      # Same value as SAC's log_std_init in this file.
      log_std_init: -3.67
