# Boolean flag, currently off.
# NOTE(review): presumably toggles checkpointing in the consumer - confirm.
checkpoint: false
# Evaluation settings.
evaluation:
  # Quoted because the value contains a colon; it is a plain string.
  device: 'cuda:0'
  # Interpolation: takes its value from `scenario.n_evaluation_envs`.
  n_evaluation_envs: ${scenario.n_evaluation_envs}
  seed: 1
  n_rollouts: 1
  evaluate_success: false

# Scaling of the lr * nb_anchors (learning rate times number of anchors).
lr_scaling: 0.0

# Algorithm section: a class path (`classname`) plus its parameters (`params`).
algorithm:
  classname: crl.algorithms.SAC
  params:
    # Device strings are quoted because they contain a colon.
    learning_device: 'cuda:0'
    acquisition_device: 'cuda:0'
    buffer_device: 'cuda:0'

    # One optimizer entry per component, all torch.optim.Adam.
    # Learning rates carry an explicit decimal point (1.0e-3, not 1e-3) so
    # that stock YAML 1.1 loaders such as PyYAML resolve them as floats
    # instead of strings; the numeric values are unchanged.
    optimizer_policy:
      classname: torch.optim.Adam
      lr: 1.0e-3

    optimizer_q:
      classname: torch.optim.Adam
      lr: 3.0e-4

    optimizer_entropy:
      classname: torch.optim.Adam
      lr: 3.0e-4

    control_every_n_epochs: 1000
    n_control_rollouts: 1

    n_processes: 0

    reward_scaling: 1.0
    policy_update_delay: 2
    target_update_delay: 2
    time_limit: 0

    n_timesteps: 2
    batch_size: 256

    init_temperature: 2.0
    target_multiplier: 2.0

    clip_grad: 0.0
    # Interpolation: takes its value from `scenario.n_train_envs`.
    inner_epochs: ${scenario.n_train_envs}
    grad_updates_per_step: 0.5
    discount_factor: 0.99
    update_target_tau: 0.005
    # Referenced by `alpha_search.params.time_size` via interpolation.
    buffer_time_size: 2
    # Sizes are written as plain digits (no `_` separators) so that YAML 1.2
    # loaders also read them as integers: 1,280,000 and 12,800.
    buffer_size: 1280000
    initial_buffer_size: 12800

# Alpha-search section: a class path (`classname`) plus its parameters.
alpha_search:
  classname: crl.algorithms.AlphaSearch
  params:
    # Quoted because the value contains a colon; it is a plain string.
    device: 'cuda:0'
    n_estimations: 1024
    steps: 4096
    n_processes: 0
    improvement_threshold: 0.1
    # Interpolation of the algorithm's `buffer_time_size`.
    # NOTE(review): the path starts at `framework.params`, so this file is
    # presumably composed under that node - confirm against the root config.
    time_size: ${framework.params.algorithm.params.buffer_time_size}

    n_rollouts: 32
    n_validation_steps: 200

# Policy agent section: class path and its settings.
policy_agent:
  classname: crl.agents.SubspaceActionAgent
  hidden_size: 256
  # `nil` is not a YAML null keyword, so these load as the string "nil";
  # the quotes only make that explicit.
  # NOTE(review): presumably placeholders replaced at runtime - confirm.
  input_dimension: 'nil'
  output_dimension: 'nil'
  start_steps: 0

  n_initial_anchors: 1
  dist_type: flat
  refresh_rate: 1.0
  resampling_policy: false
  repeat_alpha: 100

# Critic agent section: class path and its settings.
critic_agent:
  classname: crl.agents.AlphaTwinCritics
  hidden_size: 256
  # NOTE(review): `nil` is not a YAML null keyword; these load as the string
  # "nil". Presumably placeholders replaced at runtime - confirm.
  obs_dimension: nil
  action_dimension: nil
  # NOTE(review): no value is visible on this line; if nothing follows it,
  # YAML loads the key as null - confirm that is intended.
  n_anchors: