# This is the default config file for the train script.
# It can be used for train, eval, and sweep.

# Run-level selection: environment name, logging name, vectorization mode and
# policy name. NOTE(review): meanings are inferred from the key names; confirm
# against the train script that reads this file.
[base]
env_name = puffer_phc_fpo
wandb_name = fpo_full_tracking
vec = native
policy_name = FlowMatchingPolicy
# flow_matching is also set (to the same value) in [train]. Keep both in sync
# unless the train script documents which section takes precedence.
flow_matching = True

# Settings for the [policy] section. NOTE(review): presumably forwarded to the
# policy named by policy_name in [base]; key meanings are inferred from names.
[policy]
# Same value as input_size and hidden_size in [rnn].
hidden_size = 512
parameterization = data
solver_step_size = 0.1
perturb_action_std = 0.05
condition_drop_ratio = 0
# num_envs is defined in three sections: here (4096), [env] (4096) and
# [train] (1). This one matches [env].
num_envs = 4096
# p_mean / p_std are presumably the parameters of the distribution selected by
# sample_t_strategy (lognormal) -- TODO confirm.
sample_t_strategy = lognormal
p_mean = -1.2
p_std = 1.2

# Sizes for the [rnn] section. Both equal hidden_size = 512 in [policy];
# presumably they have to stay consistent with it -- confirm before changing
# one without the others.
[rnn]
input_size = 512
hidden_size = 512

# Environment settings.
[env]
# The value is wrapped in double quotes. Whether the quotes are stripped or
# kept as part of the string depends on the loader -- confirm before unquoting.
motion_file = "amass/amass_train_11313_upright.pkl"
has_self_collision = True
# Matches num_envs in [policy] (4096); [train] carries its own num_envs = 1.
num_envs = 4096
; headless = True
use_amp_obs = False
auto_pmcp_soft = True
termination_distance = 0.25
kp_scale = 1.0
kd_scale = 1.0

# Training-loop settings and optimizer / loss hyperparameters.
# Values use Python-style literals (True/False/None, 1_000 digit grouping), so
# the loader must accept those spellings -- confirm before reformatting.
[train]
seed = 1
torch_deterministic = True
device = cuda
# Also set (to the same value) in [base].
flow_matching = True

cpu_offload = False
compile = False
norm_adv = True
# Literal None. NOTE(review): presumably means "no KL target" -- confirm.
target_kl = None

# eval_timesteps is exactly total_timesteps / 1000.
total_timesteps = 1_310_000_000
eval_timesteps = 1_310_000

data_dir = experiments
# NOTE: evaluation on all motions takes place when saving the model
checkpoint_interval = 250
# The resampling weight is updated after each eval.
# Failed motions are more likely to be resampled.
# Currently equal to checkpoint_interval, i.e. the same cadence as the
# save-time evaluation described above.
motion_resample_interval = 250

num_workers = 1
# Differs from num_envs = 4096 in [policy] and [env].
# NOTE(review): presumably a different quantity (e.g. per-worker) -- confirm.
num_envs = 1
# batch_size = 4 * minibatch_size (131072 = 4 * 32768).
batch_size = 131072
minibatch_size = 32768

learning_rate = 0.0001
# Keep this for other envs
anneal_lr = False
# Using exp decay: exp(-decay_rate * steps)
# With decay_rate = 1.5e-4 the factor reaches the 0.2 floor below at
# steps = ln(5) / 1.5e-4, roughly 10,730 (unit of "steps" is defined by the
# train script).
lr_decay_rate = 1.5e-4
# With the decay floor, the LR does not decay below LR * decay_floor
# Here: 0.0001 * 0.2 = 2e-5.
lr_decay_floor = 0.2

update_epochs = 4
bptt_horizon = 1
gae_lambda = 0.2
gamma = 0.98
clip_coef = 0.01
vf_coef = 1.2
clip_vloss = True
vf_clip_coef = 0.2
max_grad_norm = 1.0
ent_coef = 0.0
disc_coef = 5.0
bound_coef = 10.0
l2_reg_coef = 0.0

# Hyperparameter sweep settings. NOTE(review): the keys used in the sweep.*
# sections (method, metric goal/name, parameters with distribution/min/max)
# look like the Weights & Biases sweep schema -- confirm against the sweep code.
[sweep]
method = bayes
name = sweep

# Sweep objective: maximize the metric given by `name`.
# The commented-out names are alternatives; keep exactly one `name` active,
# since duplicate keys are handled differently (or rejected) across INI parsers.
[sweep.metric]
goal = maximize
# name = environment/episode_return
# name = eval/success_rate
name = environment/episode_length

# Sweep range for total_timesteps in [train].
# NOTE(review): the whole range (20M-80M) is far below the [train] default of
# 1_310_000_000 -- presumably sweep runs are meant to be short; confirm.
[sweep.parameters.train.parameters.total_timesteps]
distribution = log_uniform_values
min = 20_000_000
max = 80_000_000

# Sweep range for learning_rate in [train]; the default there (0.0001) lies
# inside this range.
[sweep.parameters.train.parameters.learning_rate]
distribution = log_uniform_values
min = 1e-5
max = 1e-3

; [sweep.parameters.train.parameters.gamma]
; distribution = uniform
; min = 0.0
; max = 1.0

# Sweep range for gae_lambda in [train]; the default there is 0.2.
[sweep.parameters.train.parameters.gae_lambda]
distribution = uniform
min = 0.0
max = 1.0

# Sweep range for clip_coef in [train]. The default there (0.01) sits at the
# very low end of this uniform 0-1 range.
[sweep.parameters.train.parameters.clip_coef]
distribution = uniform
min = 0.0
max = 1.0

# Sweep range for vf_coef in [train]; the default there is 1.2.
[sweep.parameters.train.parameters.vf_coef]
distribution = uniform
min = 0.0
max = 5.0

; [sweep.parameters.train.parameters.vf_clip_coef]
; distribution = uniform
; min = 0.0
; max = 1.0

# Integer sweep range for update_epochs in [train]; the default there is 4.
[sweep.parameters.train.parameters.update_epochs]
distribution = int_uniform
min = 1
max = 6

; [sweep.parameters.train.parameters.minibatch_size]
; distribution = log_uniform_values
; min = 8192
; max = 65536

; [sweep.parameters.env.parameters.rew_power_coef]
; distribution = log_uniform_values
; min = 1e-4
; max = 1e-2