# Framework configuration: names the class to use (`classname`) and the
# `params` mapping that accompanies it.
classname: crl.frameworks.Subspace
# `${seed}` is an interpolation placeholder (OmegaConf/Hydra-style syntax);
# presumably resolved against the root `seed` of the composed config.
seed: ${seed}
params:
  # Boolean flag, written in canonical lowercase.
  checkpoint: true
  # Evaluation settings.
  evaluation:
    device: cuda:0
    # Interpolated from the `scenario` section of the composed config.
    n_evaluation_envs: ${scenario.n_evaluation_envs}
    # Literal value, not an interpolation.
    seed: 1
    n_rollouts: 1
    evaluate_success: false
    oracle_rollouts: 0
    
  # Scaling of the learning rate with the number of anchors (lr * nb_anchors).
  # NOTE(review): 0.0 presumably disables this scaling - confirm in the
  # consumer of this key.
  lr_scaling: 0.0

  # Learning algorithm: class to use and its parameters.
  algorithm:
    classname: crl.algorithms.SAC
    params:
      # Device strings for the learning, acquisition and buffer roles; all
      # three name the same device here.
      learning_device: cuda:0
      acquisition_device: cuda:0
      buffer_device: cuda:0

      # One optimizer spec each for the policy, Q and entropy entries: a class
      # name plus a learning rate, identical across the three.
      # The learning rates carry an explicit mantissa fraction (3.0e-4) so
      # that every YAML loader types them as floats; a bare `3e-4` is loaded
      # as a string by PyYAML's default float resolver.
      optimizer_policy:
        classname: torch.optim.Adam
        lr: 3.0e-4

      optimizer_q:
        classname: torch.optim.Adam
        lr: 3.0e-4

      optimizer_entropy:
        classname: torch.optim.Adam
        lr: 3.0e-4

      # Remaining algorithm parameters: control rollouts, update schedule,
      # and replay-buffer sizing.
      # NOTE(review): meanings below are taken from the key names only -
      # confirm against crl.algorithms.SAC.
      control_every_n_epochs: 100
      n_control_rollouts: 1

      n_processes: 0

      reward_scaling: 1.0
      policy_update_delay: 2
      target_update_delay: 2
      time_limit: 0

      n_timesteps: 2
      batch_size: 256

      init_temperature: 2.0
      target_multiplier: 1.0

      clip_grad: 0.0
      # Interpolated from the `scenario` section of the composed config.
      inner_epochs: ${scenario.n_train_envs}
      grad_updates_per_step: 0.5
      discount_factor: 0.99
      update_target_tau: 0.005
      # Referenced by alpha_search.params.time_size through an interpolation.
      buffer_time_size: 2
      # Sizes are written as plain digits: `_` digit separators are a
      # YAML 1.1 extension, and YAML 1.2 core-schema loaders would read
      # `1_280_000` as a string instead of an integer.
      buffer_size: 1280000
      initial_buffer_size: 12800

  # Alpha search: class to use and its parameters.
  alpha_search:
    classname: crl.algorithms.AlphaSearch
    params:
      device: cuda:0
      n_estimations: 1024
      n_samples: 4096
      improvement_threshold: 0.0
      # Mirrors the algorithm's buffer_time_size through an interpolation
      # (path written from the `framework` key of the composed config).
      time_size: ${framework.params.algorithm.params.buffer_time_size}
      n_rollouts: 8
      n_validation_steps: 1000

  # Policy agent: class to use plus its settings, given inline (there is no
  # nested `params` mapping in this section).
  policy_agent:
    classname: crl.agents.SubspaceActionAgent
    hidden_size: 256
    # NOTE(review): YAML loads `nil` as the string "nil", not as null;
    # presumably a placeholder that is overwritten at runtime - confirm.
    input_dimension: nil
    output_dimension: nil

    n_initial_anchors: 1
    dist_type: peaked
    refresh_rate: 1.0
    resampling_policy: true
    repeat_alpha: 250

  # Critic agent: class to use plus its settings, given inline.
  critic_agent:
    classname: crl.agents.AlphaTwinCritics
    hidden_size: 256
    # NOTE(review): YAML loads `nil` as the string "nil", not as null;
    # presumably a placeholder that is overwritten at runtime - confirm.
    obs_dimension: nil
    action_dimension: nil
    # NOTE(review): no value is visible for this key (a bare `key:` loads as
    # null) - confirm whether it is filled in at runtime or continues below.
    n_anchors: