defaults:
  - experiment_config
  - _self_

# The device for collection (e.g. cuda)
sampling_device: "cpu"
# The device for training (e.g. cuda)
train_device: "cpu"
# The device for the replay buffer of off-policy algorithms (e.g. cuda).
# Use "disk" to store it on disk (in the experiment save_folder)
buffer_device: "cpu"

# Whether to share the parameters of the policy within agent groups
share_policy_params: True
# If an algorithm and an env support both continuous and discrete actions, what should be preferred
prefer_continuous_actions: True
# If False collection is done using a collector (under no grad). If True, collection is done with gradients.
collect_with_grad: False
# In case of non-vectorized environments, whether to run collection of multiple processes
# If this is used, there will be n_envs_per_worker processes, collecting frames_per_batch/n_envs_per_worker frames each
parallel_collection: False

# Discount factor
gamma: 0.99
# Learning rate
lr: 0.00005
# The epsilon parameter of the adam optimizer
adam_eps: 0.000001
# Clips grad norm if true and clips grad value if false
clip_grad_norm: True
# The value for the clipping, if null no clipping
clip_grad_val: 5

# Whether to use soft or hard target updates
soft_target_update: True
# If soft_target_update is True, this is its polyak_tau
polyak_tau: 0.005
# If soft_target_update is False, this is the frequency of the hard target updates in terms of n_optimizer_steps
hard_target_update_frequency: 5

# When an exploration wrapper is used. This is its initial epsilon for annealing
exploration_eps_init: 0.8
# When an exploration wrapper is used. This is its final epsilon after annealing
exploration_eps_end: 0.01
# Number of frames for annealing of exploration strategy in deterministic policy algorithms
# If null it will default to max_n_frames / 3
exploration_anneal_frames: null

# The maximum number of experiment iterations before the experiment terminates, exclusive with max_n_frames
max_n_iters: null
# Number of collected frames before ending, exclusive with max_n_iters
max_n_frames: 3_000_000

# Number of frames collected and each experiment iteration
on_policy_collected_frames_per_batch: 6000
# Number of environments used for collection
# If the environment is vectorized, this will be the number of batched environments.
# Otherwise batching will be simulated and each env will be run sequentially or parallelly depending on parallel_collection.
on_policy_n_envs_per_worker: 10
# This is the number of times collected_frames_per_batch will be split into minibatches and trained
on_policy_n_minibatch_iters: 45
# In on-policy algorithms the train_batch_size will be equal to the on_policy_collected_frames_per_batch
# and it will be split into minibatches with this number of frames for training
on_policy_minibatch_size: 400

# Number of frames collected and each experiment iteration
off_policy_collected_frames_per_batch: 6000
# Number of environments used for collection
# If the environment is vectorized, this will be the number of batched environments.
# Otherwise batching will be simulated and each env will be run sequentially or parallelly depending on parallel_collection.
off_policy_n_envs_per_worker: 10
# This is the number of times off_policy_train_batch_size will be sampled from the buffer and trained over.
off_policy_n_optimizer_steps: 1000
# Number of frames used for each off_policy_n_optimizer_steps when training off-policy algorithms
off_policy_train_batch_size: 128
# Maximum number of frames to keep in replay buffer memory for off-policy algorithms
off_policy_memory_size: 1_000_000
# Number of random action frames to prefill the replay buffer with
off_policy_init_random_frames: 0
# whether to use priorities while sampling from the replay buffer
off_policy_use_prioritized_replay_buffer: False
# exponent that determines how much prioritization is used when off_policy_use_prioritized_replay_buffer = True
# PRB reduces to random sampling when alpha=0
off_policy_prb_alpha: 0.6
# importance sampling negative exponent when off_policy_use_prioritized_replay_buffer = True
off_policy_prb_beta: 0.4


evaluation: True
# Whether to render the evaluation (if rendering is available)
render: True
# Frequency of evaluation in terms of collected frames (this should be a multiple of on/off_policy_collected_frames_per_batch)
evaluation_interval: 120_000
# Number of episodes that evaluation is run on
evaluation_episodes: 10
# If True, when stochastic policies are evaluated, their deterministic value is taken, otherwise, if False, they are sampled
evaluation_deterministic_actions: True
# If True, seed the environment before evaluation leading to always the same evaluation env being used
# If False, evaluation environments will be more random throughout training
evaluation_static: False

# List of loggers to use, options are: wandb, csv, tensorboard, mflow
loggers: [csv]
# Wandb project name
project_name: "benchmarl"
# Create a json folder as part of the output in the format of marl-eval
create_json: True

# Absolute path to the folder where the experiment will log.
# If null, this will default to the hydra output dir (if using hydra) or to the current folder when the script is run (if not).
# If you are reloading an experiment with "restore_file", this will default to the reloaded experiment folder.
save_folder: null
# Absolute path to a checkpoint file where the experiment was saved. If null the experiment is started fresh.
restore_file: null
# Map location given to `torch.load()` when reloading.
# If you are reloading in a cpu-only machine a gpu experiment, you can use `restore_map_location: {"cuda:0":"cpu"}`
# to map gpu tensors to the cpu
restore_map_location: null
# Interval for experiment saving in terms of collected frames (this should be a multiple of on/off_policy_collected_frames_per_batch).
# Set it to 0 to disable checkpointing
checkpoint_interval: 0
# Whether to checkpoint when the experiment is done
checkpoint_at_end: False
# How many checkpoints to keep. As new checkpoints are taken, temporally older checkpoints are deleted to keep this number of
# checkpoints. The checkpoint at the end is included in this number. Set to `null` to keep all checkpoints.
keep_checkpoints_num: 3
