# seed: random-seed settings
# whether to apply the seed specified below
seed_specify: true
# seed value
seed: 1

# device: compute-device settings
# whether to use CUDA
cuda: true
# whether to set CUDA to deterministic mode
cuda_deterministic: true
# value passed to torch.set_num_threads
torch_threads: 4

# train: training-loop settings
# number of parallel environments used to collect training data
n_rollout_threads: 20
# total number of steps
num_env_steps: 5000000
# maximum length of an episode
episode_length: 210
# number of warmup steps
warmup_steps: 50000
# number of steps per train
train_interval: 1000
# ratio of training iterations to train_interval
update_per_train: 0.05
# logging interval
log_interval: 20000
# evaluation interval
eval_interval: 100000
# whether to decay the learning rate linearly
use_linear_lr_decay: false
# directory to load models from; if null, the models are randomly initialised
model_dir: null

# eval: evaluation settings
# whether to run evaluation
use_eval: true
# number of parallel environments used for evaluation
n_eval_rollout_threads: 10
# number of episodes per evaluation
eval_episodes: 20

# render: rendering settings
# whether to render
use_render: false
# number of episodes to render
render_episodes: 10

# model: network, optimizer and recurrent settings
# --- network parameters ---
# hidden layer sizes of the MLP module in the network
hidden_sizes: [256, 256]
# activation function; one of sigmoid, tanh, relu, leaky_relu, selu
activation_func: relu
# final activation function; one of sigmoid, tanh, relu, leaky_relu, selu
final_activation_func: tanh
# whether to use feature normalization
use_feature_normalization: true
# initialization method for network parameters, e.g. xavier_uniform_, orthogonal_, ...
initialization_method: orthogonal_
# --- optimizer parameters ---
# actor learning rate
lr: 0.0005
# critic learning rate
critic_lr: 0.0005
# --- recurrent parameters ---
# whether to use an RNN policy (data is chunked for training)
use_recurrent_policy: false
# number of recurrent layers
recurrent_n: 1

# algo: algorithm hyperparameters
# discount factor
gamma: 0.99
# size of the off-policy buffer
buffer_size: 5000
# training batch size
batch_size: 1000
# coefficient for the soft update of the target model
polyak: 0.005
# exploration noise
expl_noise: 0.1
# number of steps to look ahead
n_step: 1
# whether to clip the gradient norm
use_max_grad_norm: false
# maximum gradient norm
max_grad_norm: 10.0
# whether to share parameters among actors
share_param: true
# whether to use policy active masks
use_policy_active_masks: true

# --- adversarial policy parameters ---
# ids of the adversarial agents
adv_agent_ids: [0]
# range of timesteps that can be perturbed, e.g.: "1-10,15,20"
perturb_timesteps: null
# --- perturbation parameters ---
# perturbation budget (in L-inf norm)
perturb_epsilon: 0.2
# number of gradient backward iterations for the perturbation
perturb_iters: 10
# whether to calculate a suitable alpha adaptively
adaptive_alpha: true
# per-iteration perturbation budget, used when adaptive_alpha is false
perturb_alpha: 0.05
# criterion function used to calculate the distance between actions
criterion: default
# whether to load the adversarial policies and perform a targeted attack
targeted_attack: false

# logger: logging settings
# directory that logs are written under
log_dir: './results'
