# Config composition list: `experiment_config` is listed before `_self_` (this file).
# NOTE(review): presumably a Hydra defaults list, where later entries override earlier ones — confirm.
defaults:
  - experiment_config
  - _self_

# Device used for collection (e.g. cuda)
sampling_device: "cuda"
# Device used for training (e.g. cuda)
train_device: "cuda"
# Device used for the replay buffer of off-policy algorithms (e.g. cuda)
buffer_device: "cuda"

# Whether to share the parameters of the policy within agent groups
share_policy_params: True
# If an algorithm and an env support both continuous and discrete actions, whether continuous actions are preferred
prefer_continuous_actions: True
# If False, collection is done using a collector (under no grad). If True, collection is done with gradients.
collect_with_grad: False

# Discount factor
gamma: 0.995
# Learning rate (1.25e-5)
lr: 0.0000125
# The epsilon parameter of the adam optimizer (1e-8)
adam_eps: 0.00000001
# If True, the gradient norm is clipped; if False, the gradient value is clipped
clip_grad_norm: True
# The value used for the clipping; if null, no clipping is done
clip_grad_val: 5

# Whether to use soft (True) or hard (False) target updates
soft_target_update: True
# If soft_target_update is True, this is the polyak tau used for the soft update
polyak_tau: 0.005
# If soft_target_update is False, this is the frequency of the hard target updates in terms of n_optimizer_steps
hard_target_update_frequency: 5

# When an exploration wrapper is used, this is its initial epsilon for annealing
exploration_eps_init: 0.8
# When an exploration wrapper is used, this is its final epsilon after annealing
exploration_eps_end: 0.01
# Number of frames over which the exploration strategy is annealed in deterministic policy algorithms.
# If null, it will default to max_n_frames / 3
exploration_anneal_frames: null

# The maximum number of experiment iterations before the experiment terminates, exclusive with max_n_frames.
# Left as null in this file, so only the frame limit below is set.
max_n_iters: null
# Number of collected frames before ending (960 million here), exclusive with max_n_iters
max_n_frames: 960_000_000

# Number of frames collected at each experiment iteration
on_policy_collected_frames_per_batch: 120000 # Use a multiple of 60000
# Number of environments used for collection.
# If the environment is vectorized, this will be the number of batched environments.
# Otherwise batching will be simulated and each env will be run sequentially.
on_policy_n_envs_per_worker: 75 # Should be the number of frames divided by the number of steps
# This is the number of times on_policy_collected_frames_per_batch will be split into minibatches and trained over
on_policy_n_minibatch_iters: 45
# In on-policy algorithms the train_batch_size will be equal to the on_policy_collected_frames_per_batch
# and it will be split into minibatches with this number of frames for training
on_policy_minibatch_size: 4096

# Number of frames collected at each experiment iteration
off_policy_collected_frames_per_batch: 120000
# Number of environments used for collection.
# If the environment is vectorized, this will be the number of batched environments.
# Otherwise batching will be simulated and each env will be run sequentially.
off_policy_n_envs_per_worker: 75
# This is the number of times off_policy_train_batch_size will be sampled from the buffer and trained over.
off_policy_n_optimizer_steps: 1000
# Number of frames used for each of the off_policy_n_optimizer_steps when training off-policy algorithms
off_policy_train_batch_size: 128
# Maximum number of frames to keep in replay buffer memory for off-policy algorithms
off_policy_memory_size: 1_000_000
# Number of random action frames to prefill the replay buffer with
off_policy_init_random_frames: 10000


# Whether evaluation is run during the experiment.
# NOTE(review): meaning inferred from the key name and the evaluation settings below — confirm.
evaluation: True
# Whether to render the evaluation (if rendering is available)
render: False
# Frequency of evaluation in terms of collected frames (this should be a multiple of on/off_policy_collected_frames_per_batch).
# The value here is 2 x the 120000 frames per batch configured in this file.
evaluation_interval: 240_000
# Number of episodes that evaluation is run on
evaluation_episodes: 1
# If True, when stochastic policies are evaluated, their mode is taken; otherwise, if False, they are sampled
evaluation_deterministic_actions: False

# List of loggers to use, options are: wandb, csv, tensorboard, mlflow
loggers: [wandb]
# Wandb project name
project_name: "multi_agent_experiment"
# Whether to create a json folder as part of the output, in the format of marl-eval
create_json: False

# Absolute path to the folder where the experiment will log.
# If null, this will default to the hydra output dir (if using hydra) or to the current folder when the script is run (if not).
# If you are reloading an experiment with "restore_file", this will default to the reloaded experiment folder.
# NOTE(review): the value set here is a relative path ("./logs"), not an absolute one — confirm how it is resolved.
save_folder: "./logs"
# Absolute path to a checkpoint file where the experiment was saved. If null, the experiment is started fresh.
restore_file: null
# Map location given to `torch.load()` when reloading.
# If you are reloading a gpu experiment on a cpu-only machine, you can use `restore_map_location: {"cuda:0":"cpu"}`
# to map gpu tensors to the cpu
restore_map_location: null
# Interval for experiment saving in terms of collected frames (this should be a multiple of on/off_policy_collected_frames_per_batch).
# Set it to 0 to disable checkpointing.
# The value here equals the 120000 frames per batch configured in this file.
checkpoint_interval: 120000
# Whether to checkpoint when the experiment is done
checkpoint_at_end: True
# How many checkpoints to keep. As new checkpoints are taken, temporally older checkpoints are deleted to keep this number of
# checkpoints. The checkpoint at the end is included in this number. Set to `null` to keep all checkpoints.
keep_checkpoints_num: null
