main_args:
  # Algorithm to use. Choose from: mappo, happo, hatrpo, haa2c, haddpg(off),
  # hatd3(off), hasac(off), had3qn(off), maddpg(off), matd3(off).
  algo: mappo
  # Environment identifier.
  env: hmp
algo_args:
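  # Algorithm hyperparameters. This file started out as the HAPPO configuration;
  # extra keys from hasac and hatrpo are merged in (see the NOTE markers in the
  # model and algo sections), and main_args.algo names the algorithm in use.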
  # Random seed settings.
  seed:
    # When true, use the seed value specified below.
    seed_specify: true
    # Seed value.
    seed: 1
  # Compute device settings.
  device:
    # Whether to run on CUDA.
    cuda: true
    # Whether to set CUDA to deterministic mode.
    cuda_deterministic: true
    # Argument passed to torch.set_num_threads.
    torch_threads: 4
  # Training loop settings.
  train:
    # Number of parallel environments used for training data collection.
    # NOTE: should be compatible with num_threads of the main jsonc!
    n_rollout_threads: 32
    # Total number of training steps.
    num_env_steps: 1000000000
    # Number of warmup steps (off-policy algorithms only).
    warmup_steps: 100000
    # Number of steps per train (off-policy algorithms only).
    train_interval: 50
    # Ratio of training iterations to train_interval (off-policy algorithms
    # only).
    update_per_train: 1
    # Number of steps per environment per training data collection.
    episode_length: 150
    # Logging interval.
    log_interval: 5
    # Evaluation interval.
    eval_interval: 25
    # Whether to use ValueNorm.
    use_valuenorm: true
    # Whether to decay the learning rate linearly.
    use_linear_lr_decay: false
    # Whether to account for truncation when an episode is done.
    use_proper_time_limits: true
    # Directory to load models from; when null, models are randomly
    # initialised instead.
    model_dir: null
  # Evaluation settings.
  eval:
    # Whether to run evaluation.
    use_eval: false
    # Number of parallel environments used for evaluation.
    n_eval_rollout_threads: 10
    # Number of episodes per evaluation.
    eval_episodes: 20
  # Rendering settings.
  render:
    # Whether to render.
    use_render: false
    # Number of episodes to render.
    render_episodes: 10
  # Network, recurrence, and optimizer settings for the models.
  model:
    # --- network parameters ---
    # hidden sizes for the mlp module in the network
    hidden_sizes: [128, 128]
    # activation function, choose from sigmoid, tanh, relu, leaky_relu, selu
    activation_func: relu
    # whether to use feature normalization
    use_feature_normalization: true
    # initialization method for network parameters, choose from
    # xavier_uniform_, orthogonal_, ...
    # This key was also repeated in the hasac extras with the same value; it
    # is defined only once here because a YAML mapping must not repeat a key.
    initialization_method: orthogonal_
    # gain of the output layer of the network
    gain: 0.01

    # --- recurrent parameters ---
    # whether to use rnn policy (data is not chunked for training)
    use_naive_recurrent_policy: false
    # whether to use rnn policy (data is chunked for training)
    use_recurrent_policy: false
    # number of recurrent layers
    recurrent_n: 1
    # length of data chunk; only useful when use_recurrent_policy is true;
    # episode_length has to be a multiple of data_chunk_length
    data_chunk_length: 10

    # --- optimizer parameters ---
    # actor learning rate
    lr: 0.0005
    # critic learning rate
    critic_lr: 0.0005
    # eps in Adam
    opti_eps: 0.00001
    # weight_decay in Adam
    weight_decay: 0
    # parameter of the diagonal Gaussian distribution
    std_x_coef: 1
    # parameter of the diagonal Gaussian distribution
    std_y_coef: 0.5

    # NOTE extra part from hasac
    # final activation function, choose from sigmoid, tanh, relu, leaky_relu,
    # selu
    final_activation_func: tanh



  # Algorithm hyperparameters: PPO settings plus extras merged from hasac and
  # hatrpo.
  algo:
    # --- ppo parameters ---
    # number of epochs for actor update
    ppo_epoch: 5
    # number of epochs for critic update
    critic_epoch: 5
    # whether to use clipped value loss
    use_clipped_value_loss: true
    # clip parameter
    clip_param: 0.2
    # number of mini-batches per epoch for actor update
    actor_num_mini_batch: 1
    # number of mini-batches per epoch for critic update
    critic_num_mini_batch: 1
    # coefficient for entropy term in actor loss
    entropy_coef: 0.01
    # coefficient for value loss
    value_loss_coef: 1
    # whether to clip gradient norm
    use_max_grad_norm: true
    # max gradient norm
    max_grad_norm: 10.0
    # whether to use Generalized Advantage Estimation (GAE)
    use_gae: true
    # discount factor
    # This key was also repeated in the hasac extras with the same value; it
    # is defined only once here because a YAML mapping must not repeat a key.
    gamma: 0.99
    # GAE lambda
    gae_lambda: 0.95
    # whether to use huber loss
    use_huber_loss: true
    # whether to use policy active masks
    use_policy_active_masks: true
    # huber delta
    huber_delta: 10.0
    # method of aggregating the probability of multi-dimensional actions,
    # choose from prod, mean
    action_aggregation: prod
    # whether to share parameter among actors
    share_param: false
    # whether to use a fixed optimisation order
    fixed_order: false

    # NOTE extra part from hasac
    # whether to auto-tune temperature
    auto_alpha: false
    # temperature parameter
    alpha: 0.001
    # alpha learning rate
    alpha_lr: 0.0003
    # off-policy buffer size
    buffer_size: 1000000
    # training batch size
    batch_size: 1000
    # coefficient for target model soft update
    polyak: 0.005
    # the number of steps to look ahead
    n_step: 20

    # NOTE extra part from hatrpo
    # kl threshold in HATRPO update
    kl_threshold: 0.01
    # line search steps
    ls_step: 10
    # accept ratio in HATRPO update
    accept_ratio: 0.5
    # backtracking coefficient in line search
    backtrack_coeff: 0.8


  # Logging settings.
  logger:
    # Directory that logs are written to.
    log_dir: './results'
env_args:
  # NOTE(review): "whatever" looks like a placeholder - confirm whether the
  # environment ignores or overrides map_name.
  map_name: whatever
  # Choose from FP (Feature Pruned) and EP (Environment Provided).
  state_type: FP
  # NOTE(review): looks like a placeholder value - confirm whether it is
  # replaced at runtime.
  env_uuid: uuuuuuuuuu
  use_global_state: false
