# Hydra defaults list: select the `navigation` option of the `env` config
# group, then apply this file itself (`_self_`).
# NOTE(review): with `_self_` listed after `env`, keys defined in this file
# should win over same-named keys merged earlier — confirm against Hydra's
# defaults-list ordering rules.
defaults:
  - env: navigation
  - _self_

# Environment name, interpolated from the selected env config.
ENV_NAME: ${env.ENV_NAME}

# PPO Params
# Chosen to match the reference experiment config:
# https://github.com/proroklab/ControllingBehavioralDiversity/blob/b6fc469e3ab14f8fd79b1bc2ad3dbf3948587a39/het_control/conf/experiment/het_control_experiment.yaml#L16
ALG: "IPPO-FuPS"
LR: 0.00005
# Parallel environments (reference: on_policy_n_envs_per_worker).
NUM_ENVS: 600
# Rollout length per update. NUM_STEPS * NUM_ENVS = 60000 frames per batch,
# the same as the reference on_policy_collected_frames_per_batch.
# The number of updates is calculated as
#   config["TOTAL_TIMESTEPS"] // config["NUM_STEPS"] // config["NUM_ENVS"]
# which gives 166, matching BenchMARL's
# max_n_frames / on_policy_collected_frames_per_batch.
NUM_STEPS: 100
# Total frame budget (reference: max_n_frames = 10000000).
# Written with an explicit decimal point and signed exponent so that every
# YAML loader reads it as a float; the shorthand `10e6` is only a float under
# OmegaConf's extended resolver and is a plain string under stock YAML 1.1
# rules.
TOTAL_TIMESTEPS: 1.0e+7
UPDATE_EPOCHS: 45  # matches on_policy_n_minibatch_iters
# Intended to match the reference on_policy_minibatch_size of 4096.
# NOTE(review): 60000 frames / 30 = 2000 per minibatch; this is close to 4096
# only if the batch also counts each of the 2 agents separately
# (120000 / 30 = 4000) — confirm against the training script.
NUM_MINIBATCHES: 30
GAMMA: 0.9  # reference: gamma
GAE_LAMBDA: 0.9  # reference: lmbda
CLIP_EPS: 0.2  # reference: clip_epsilon
ENT_COEF: 0.0  # reference: entropy_coef
VF_COEF: 1.0  # reference: critic_coef
MAX_GRAD_NORM: 5  # reference: clip_grad_val
ACTIVATION: "tanh"  # reference: activation_class torch.nn.Tanh
ANNEAL_LR: false

# Seeds
# NOTE(review): the reference config listed at the bottom of this file uses
# `seed: 0`; this run uses 30.
SEED: 30
# Presumably the number of seeds/runs launched starting from SEED — verify in
# the training script.
NUM_SEEDS: 1

# Network Architectures
# Two entries of 256 each, the same as `num_cells` for both `model` and
# `critic_model` in the reference config (presumably hidden-layer widths).
ACTOR_LAYERS: [256, 256]
CRITIC_LAYERS: [256, 256]

# WandB Params
ENTITY: null
PROJECT: "enable-specialisation"
WANDB_MODE: "disabled"
# Experiment name is taken from the name of the Hydra config being run.
# Hydra exposes this as `hydra.job.config_name`; the `hydra.runtime` node has
# no `config_name` field, so the previous `runtime.config_name` path did not
# resolve to the config name and leaked an empty value into RUN_NAME and GROUP.
EXP_NAME: ${hydra:job.config_name}
EXP_TAGS:
  - "${ALG}"
  - "FF"
  - "Shared_Weights"
  - "${ENV_NAME}"
  - "WIP"

# Unique run name: environment, experiment, seed and launch timestamp.
RUN_NAME: "${ENV_NAME}__${EXP_NAME}__${SEED}__${now:%Y-%m-%d_%H-%M-%S}"
# Group key shared by runs with the same environment and experiment name.
GROUP: "${ENV_NAME}_${EXP_NAME}"

# Evaluation
# Values mirror the reference config's evaluation_interval,
# evaluation_episodes and evaluation_deterministic_actions.
# The interval is written without a digit separator: `120_000` is an integer
# only under YAML 1.1 rules and would load as a string under YAML 1.2.
# NOTE(review): interval is presumably counted in environment frames — confirm.
EVAL_INTERVAL: 120000
EVAL_EPISODES: 200
EVAL_DETERMINISTIC: false

# Environment Configuration
# Args from env are passed to both train and eval envs.
# The two mappings below presumably carry extra keyword arguments for the eval
# and train environments respectively — verify in the training script.
TEST_ENV_KWARGS: {}
TRAIN_ENV_KWARGS:
  auto_reset: true
ACTION_SPACE_TYPE: "continuous"

# Logging and Checkpointing
# Both intervals are 5000000, i.e. half of the 10M TOTAL_TIMESTEPS budget.
# NOTE(review): the earlier note here said "TOTAL_TIMESTEPS // 4", which would
# be 2500000 — confirm which interval is intended.
CAPTURE_VIDEO_INTERVAL: 5000000
CHECKPOINT_INTERVAL: 5000000
CHECKPOINT: true
HANDLE_TERMINAL_OBS: true

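# Reference config from DiCo (the het_control experiment linked above) that
# the settings in this file are matched against: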
# experiment:
#   sampling_device: cuda
#   train_device: cuda
#   share_policy_params: true
#   prefer_continuous_actions: true
#   gamma: 0.9
#   lr: 5.0e-05
#   adam_eps: 1.0e-05
#   clip_grad_norm: true
#   clip_grad_val: 5.0
#   soft_target_update: true
#   polyak_tau: 0.005
#   hard_target_update_frequency: 5
#   exploration_eps_init: 0.8
#   exploration_eps_end: 0.01
#   exploration_anneal_frames: 1000000
#   max_n_iters: null
#   max_n_frames: 10000000
#   on_policy_collected_frames_per_batch: 60000
#   on_policy_n_envs_per_worker: 600
#   on_policy_n_minibatch_iters: 45
#   on_policy_minibatch_size: 4096
#   off_policy_collected_frames_per_batch: 6000
#   off_policy_n_envs_per_worker: 60
#   off_policy_n_optimizer_steps: 1000
#   off_policy_train_batch_size: 128
#   off_policy_memory_size: 1000000
#   off_policy_init_random_frames: 0
#   evaluation: true
#   render: true
#   evaluation_interval: 120000
#   evaluation_episodes: 200
#   evaluation_deterministic_actions: false
#   loggers:
#   - wandb
#   create_json: true
#   save_folder: null
#   restore_file: null
#   checkpoint_interval: 0.0
# algorithm:
#   share_param_critic: false
#   clip_epsilon: 0.2
#   entropy_coef: 0.0
#   critic_coef: 1.0
#   loss_critic_type: l2
#   lmbda: 0.9
#   scale_mapping: biased_softplus_1.0
#   use_tanh_normal: true
# task:
#   n_agents: 2
#   agents_with_same_goal: 1
#   max_steps: 100
#   collisions: false
#   split_goals: false
#   observe_all_goals: true
#   shared_rew: false
#   lidar_range: 0.35
#   agent_radius: 0.1
# model:
#   name: hetcontrolmlpempirical
#   num_cells:
#   - 256
#   - 256
#   activation_class: torch.nn.Tanh
#   desired_snd: 0.3
#   tau: 0.01
#   bootstrap_from_desired_snd: false
#   process_shared: true
#   probabilistic: null
#   scale_mapping: null
# critic_model:
#   name: mlp
#   num_cells:
#   - 256
#   - 256
#   layer_class: torch.nn.Linear
#   activation_class: torch.nn.Tanh
#   activation_kwargs: null
#   norm_class: null
#   norm_kwargs: null
# seed: 0
# use_action_loss: true
# action_loss_lr: 3.0e-05
