# Positional arguments (each falls back to its default when unset or empty):
#   $1  GPU id                  -> gpu_id, also exported as CUDA_VISIBLE_DEVICES
#   $2  NFE limit               -> default_nfe
#   $3  exploration coefficient -> exploration_coef
#   $4  global seed             -> global_seed

# Fallback values.
DEFAULT_GPU_ID=0
DEFAULT_NFE=30
DEFAULT_EXPLORATION_COEF=3.0
GLOBAL_SEED=42

# Resolve each argument against its fallback.
gpu_id="${1:-${DEFAULT_GPU_ID}}"
default_nfe="${2:-${DEFAULT_NFE}}"
exploration_coef="${3:-${DEFAULT_EXPLORATION_COEF}}"
global_seed="${4:-${GLOBAL_SEED}}"

# Export the GPU selection so that every child process started below inherits it.
CUDA_VISIBLE_DEVICES="${gpu_id}"
export CUDA_VISIBLE_DEVICES










# Run the scheduled-sampling SDE script (MS-COCO 2014 5k test / SD v1.4, going
# by its path) with the selected GPU id as its first argument.
# The trailing 30 is a fixed literal and is independent of ${default_nfe};
# presumably it is the number of sampling steps -- TODO confirm against sde.sh.
# ${gpu_id} is quoted so it always reaches the script as exactly one argument
# (no word splitting or globbing, and the literal 30 can never shift into $1).
script/sample/t2i/run_sample_scheduled/mscoco_2014_5k_test/sd_v1_4/sde.sh "${gpu_id}" 30

# 

# 15-step SD v1.4
# hps_v2
# exploration_coef = 3.0
# latent_reward, immediate_posterior_mean
# max_reward
# average
# beta, value_gradient, 10
# best_trajectory_updated
# 0.05, 0.5
# NFE 150
# best_merged_reward_list: [28.0312, 25.5625, 27.3750, 27.2969, 27.6406, 29.2656, 28.6094, 29.5000, 27.9844, 27.8594]
# best_final_reward_list: [28.0000, 25.4219, 27.3438, 27.2031, 27.5938, 29.1562, 28.5625, 29.4375, 27.9375, 27.8281]
# python main.py \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.prompt_list.num_prompt=5 \
#     task.sample.num_inference_step=15 \
#     task.task.num_sample_per_prompt=2 \
#     task.reward_model.reward_model_type="hps_v2" \
#     task.reward_model.cal_dynamics_batch_size=20 \
#     task.reward_model.cal_intermediate_reward_batch_size=20 \
#     task.reward_model.cal_final_reward_batch_size=20 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.reward_model.cal_intermediate_reward_policy="immediate_posterior_mean" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="average" \
#     task.mcts.mode.pseudo_latent_as_final=True \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=14 \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=${default_nfe} \
#     task.mcts.beta.update_policy="value_gradient" \
#     task.mcts.beta.value_gradient_update_time="best_trajectory_updated" \
#     task.mcts.beta.update_step_size=0.05 \
#     task.mcts.beta.max_update_bias=0.5 \
#     task.mcts.beta.zeta_list=10 \


# 15-step SD v1.4
# hps_v2
# exploration_coef = 3.0
# latent_reward, immediate_posterior_mean
# cumulative_reward
# average
# beta, value_gradient, 10
# best_trajectory_updated
# 0.05, 0.5
# NFE 150
# best_merged_reward_list: [394.2344, 360.6250, 384.4219, 376.7969, 403.4375, 411.0469, 413.6719, 421.8281, 399.3906, 379.5781]
# best_final_reward_list: [28.0000, 25.4219, 27.2969, 27.2031, 27.5938, 29.1562, 28.5625, 29.4375, 27.8750, 27.4688]
# python main.py \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.prompt_list.num_prompt=5 \
#     task.sample.num_inference_step=15 \
#     task.task.num_sample_per_prompt=2 \
#     task.reward_model.reward_model_type="hps_v2" \
#     task.reward_model.cal_dynamics_batch_size=20 \
#     task.reward_model.cal_intermediate_reward_batch_size=20 \
#     task.reward_model.cal_final_reward_batch_size=20 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.reward_model.cal_intermediate_reward_policy="immediate_posterior_mean" \
#     task.mcts.mode.mdp_modeling="cumulative_reward" \
#     task.mcts.mode.value_policy="average" \
#     task.mcts.mode.pseudo_latent_as_final=False \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=14 \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=${default_nfe} \
#     task.mcts.beta.update_policy="value_gradient" \
#     task.mcts.beta.value_gradient_update_time="best_trajectory_updated" \
#     task.mcts.beta.update_step_size=0.05 \
#     task.mcts.beta.max_update_bias=0.5 \
#     task.mcts.beta.zeta_list=10 \


# 15-step SD v1.4
# hps_v2
# exploration_coef = 3.0
# latent_reward, immediate_posterior_mean
# cumulative_reward
# average
# beta, value_gradient, 10
# back_propagation
# 0.05, 0.5
# NFE 150
# best_merged_reward_list: [397.2188, 360.6250, 384.4219, 376.7969, 406.4844, 413.9375, 418.8281, 421.8281, 403.0000, 379.5781]
# best_final_reward_list: [28.2656, 25.4219, 27.2969, 27.2031, 28.0938, 29.2500, 29.2500, 29.4375, 28.1562, 27.4688]
# python main.py \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.prompt_list.num_prompt=5 \
#     task.sample.num_inference_step=15 \
#     task.task.num_sample_per_prompt=2 \
#     task.reward_model.reward_model_type="hps_v2" \
#     task.reward_model.cal_dynamics_batch_size=20 \
#     task.reward_model.cal_intermediate_reward_batch_size=20 \
#     task.reward_model.cal_final_reward_batch_size=20 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.reward_model.cal_intermediate_reward_policy="immediate_posterior_mean" \
#     task.mcts.mode.mdp_modeling="cumulative_reward" \
#     task.mcts.mode.value_policy="average" \
#     task.mcts.mode.pseudo_latent_as_final=False \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=14 \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=${default_nfe} \
#     task.mcts.beta.update_policy="value_gradient" \
#     task.mcts.beta.value_gradient_update_time="back_propagation" \
#     task.mcts.beta.update_step_size=0.05 \
#     task.mcts.beta.max_update_bias=0.5 \
#     task.mcts.beta.zeta_list=10 \


# 20-step SDXL
# DrawBench
# CLIP score
# NFE 400
# 0.3857
# best_merged_reward_list: [0.3725, 0.3506, 0.3651, 0.3927, 0.3466, 0.3722, 0.3370, 0.2960, 0.4129, 0.4372, 0.4693, 0.4391, 0.2915, 0.3155, 0.4518, 0.4649, 0.3541, 0.3478, 0.4310, 0.4549, 0.4174, 0.4318, 0.4335, 0.4191, 0.3338, 0.3533, 0.3589, 0.3658, 0.3680, 0.3853]
# best_final_reward_list: [0.3725, 0.3481, 0.3598, 0.3836, 0.2966, 0.3629, 0.3319, 0.2867, 0.3981, 0.4322, 0.4693, 0.4391, 0.2795, 0.3134, 0.4357, 0.4477, 0.3378, 0.3478, 0.4180, 0.4516, 0.4036, 0.4275, 0.3776, 0.4191, 0.3221, 0.3533, 0.3519, 0.3566, 0.3500, 0.3708]
# python main.py \
#     vae_decode_batch_size=10 \
#     pipeline=sdxl \
#     task=search/run_optimal_control_mcts/sdxl/template \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="DrawBench" \
#     task.prompt_list.num_prompt=15 \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.num_inference_step=20 \
#     task.reward_model.reward_model_type="clip_score" \
#     task.reward_model.cal_dynamics_batch_size=30 \
#     task.reward_model.cal_final_reward_batch_size=30 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.reward_model.cal_intermediate_reward_policy="immediate_posterior_mean" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="average" \
#     task.mcts.mode.pseudo_latent_as_final=True \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=14 \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=${default_nfe} \
#     task.mcts.beta.update_policy="value_gradient" \
#     task.mcts.beta.value_gradient_update_time="back_propagation" \
#     task.mcts.beta.update_step_size=0.15 \
#     task.mcts.beta.max_update_bias=0.5 \
#     task.mcts.beta.zeta_list=10 \


# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/ddpm.sh ${gpu_id} 15
# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/deterministic_ddim.sh ${gpu_id} 15

# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/hpd_v2/ddpm.sh ${gpu_id} 20
# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/hpd_v2/deterministic_ddim.sh ${gpu_id} 20

# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/hpd_v2/ddpm.sh ${gpu_id} 25
# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/hpd_v2/deterministic_ddim.sh ${gpu_id} 25

# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/hpd_v2/ddpm.sh ${gpu_id} 30
# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/hpd_v2/deterministic_ddim.sh ${gpu_id} 30

# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/hpd_v2/ddpm.sh ${gpu_id} 50
# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sd_v1_4/baseline/hpd_v2/deterministic_ddim.sh ${gpu_id} 50

# ./script/sample/t2i/run_sample_scheduled/hpd_v2/sdxl/baseline/run_baseline.sh ${gpu_id}
# ./script/sample/t2i/run_sample_scheduled/draw_bench/sdxl/baseline/run_baseline.sh ${gpu_id}


# (test only) test the MCTS after debugging:
# max, max
# sparse reward
# Beta vg
# back_propagation
# 
# python main.py \
#     exp_name.default_exp_name="exp" \
#     vae_decode_batch_size=30 \
#     seed=${global_seed} \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.init_latent.seed_list=0 \
#     task.eps.seed_list=3072 \
#     task.prompt_list.num_prompt=5 \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
#     task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_20.yaml" \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.height=512 \
#     task.sample.width=512 \
#     task.sample.num_inference_step=15 \
#     task.reward_model.reward_model_type="hps_v2" \
#     task.reward_model.cal_dynamics_batch_size=60 \
#     task.reward_model.cal_final_reward_batch_size=60 \
#     task.reward_model.reward_shaping_policy="disabled" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="max" \
#     task.mcts.mode.pseudo_latent_as_final=False \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=None \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.beta.update_policy="value_gradient" \
#     task.mcts.beta.value_gradient_update_time="back_propagation" \
#     task.mcts.beta.update_step_size=0.1 \
#     task.mcts.beta.max_update_bias=1.0 \
#     task.mcts.beta.zeta_list=10 \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=${default_nfe} \


# (test only) test the MCTS after debugging
# python main.py \
#     exp_name.default_exp_name="exp" \
#     vae_decode_batch_size=30 \
#     seed=${global_seed} \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.init_latent.seed_list=0 \
#     task.eps.seed_list=3072 \
#     task.prompt_list.num_prompt=5 \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
#     task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_20.yaml" \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.height=512 \
#     task.sample.width=512 \
#     task.sample.num_inference_step=15 \
#     task.reward_model.reward_model_type="hps_v2" \
#     task.reward_model.cal_dynamics_batch_size=60 \
#     task.reward_model.cal_final_reward_batch_size=60 \
#     task.reward_model.reward_shaping_policy="disabled" \
#     task.mcts.mode.mdp_modeling="sparse_reward" \
#     task.mcts.mode.value_policy="max" \
#     task.mcts.mode.pseudo_latent_as_final=False \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=None \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.beta.update_policy="value_gradient" \
#     task.mcts.beta.value_gradient_update_time="back_propagation" \
#     task.mcts.beta.update_step_size=0.1 \
#     task.mcts.beta.max_update_bias=1.0 \
#     task.mcts.beta.zeta_list=10 \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=${default_nfe} \


# python main.py \
#     exp_name.default_exp_name="exp" \
#     vae_decode_batch_size=30 \
#     seed=${global_seed} \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.init_latent.seed_list=0 \
#     task.eps.seed_list=3072 \
#     task.prompt_list.num_prompt=5 \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
#     task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_20.yaml" \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.height=512 \
#     task.sample.width=512 \
#     task.sample.num_inference_step=15 \
#     task.reward_model.reward_model_type="hps_v2" \
#     task.reward_model.cal_dynamics_batch_size=60 \
#     task.reward_model.cal_final_reward_batch_size=60 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="max" \
#     task.mcts.mode.pseudo_latent_as_final=True \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=None \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.beta.update_policy="value_gradient" \
#     task.mcts.beta.value_gradient_update_time="back_propagation" \
#     task.mcts.beta.update_step_size=0.1 \
#     task.mcts.beta.max_update_bias=1.0 \
#     task.mcts.beta.zeta_list=10 \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=1e9 \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=1e9 \


# python main.py \
#     exp_name.default_exp_name="exp" \
#     vae_decode_batch_size=30 \
#     seed=${global_seed} \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.init_latent.seed_list=0 \
#     task.eps.seed_list=3072 \
#     task.prompt_list.num_prompt=5 \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
#     task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_20.yaml" \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.height=512 \
#     task.sample.width=512 \
#     task.sample.num_inference_step=15 \
#     task.reward_model.reward_model_type="hps_v2" \
#     task.reward_model.cal_dynamics_batch_size=60 \
#     task.reward_model.cal_final_reward_batch_size=60 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="max" \
#     task.mcts.mode.pseudo_latent_as_final=True \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=None \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.beta.update_policy="value_gradient" \
#     task.mcts.beta.value_gradient_update_time="back_propagation" \
#     task.mcts.beta.update_step_size=0.1 \
#     task.mcts.beta.max_update_bias=1.0 \
#     task.mcts.beta.zeta_list=10 \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=1e9 \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=1e9 \


# python main.py \
#     exp_name.default_exp_name="exp" \
#     vae_decode_batch_size=30 \
#     seed=${global_seed} \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.init_latent.seed_list=0 \
#     task.eps.seed_list=3072 \
#     task.prompt_list.num_prompt=5 \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
#     task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_20.yaml" \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.height=512 \
#     task.sample.width=512 \
#     task.sample.num_inference_step=15 \
#     task.reward_model.reward_model_type="hps_v2" \
#     task.reward_model.cal_dynamics_batch_size=60 \
#     task.reward_model.cal_final_reward_batch_size=60 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="max" \
#     task.mcts.mode.pseudo_latent_as_final=True \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=8 \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.beta.update_policy="value_gradient" \
#     task.mcts.beta.value_gradient_update_time="back_propagation" \
#     task.mcts.beta.update_step_size=0.1 \
#     task.mcts.beta.max_update_bias=1.0 \
#     task.mcts.beta.zeta_list=10 \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=1e9 \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=1e9 \


# (test only) test SD v1.4, compressibility_reward
# python main.py \
#     exp_name.default_exp_name="exp" \
#     vae_decode_batch_size=30 \
#     seed=${global_seed} \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.init_latent.seed_list=0 \
#     task.eps.seed_list=3072 \
#     task.prompt_list.num_prompt=2 \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
#     task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_20.yaml" \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.height=512 \
#     task.sample.width=512 \
#     task.sample.num_inference_step=20 \
#     task.reward_model.reward_model_type="compressibility_reward" \
#     task.reward_model.cal_dynamics_batch_size=60 \
#     task.reward_model.cal_final_reward_batch_size=60 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="max" \
#     task.mcts.mode.pseudo_latent_as_final=False \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=None \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.beta.update_policy="soft" \
#     task.mcts.beta.update_step_size=0.1 \
#     task.mcts.beta.max_update_bias=1.0 \
#     task.mcts.beta.zeta_list=10 \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=1e9 \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=1e9 \


# (test only) test SDXL, compressibility_reward
# python main.py \
#     exp_name.default_exp_name="exp" \
#     vae_decode_batch_size=10 \
#     seed=${global_seed} \
#     pipeline=sdxl \
#     task=search/run_optimal_control_mcts/sdxl/template \
#     task.init_latent.seed_list=0 \
#     task.eps.seed_list=3072 \
#     task.prompt_list.num_prompt=5 \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
#     task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_20.yaml" \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.height=1024 \
#     task.sample.width=1024 \
#     task.sample.num_inference_step=30 \
#     task.reward_model.reward_model_type="compressibility_reward" \
#     task.reward_model.cal_dynamics_batch_size=40 \
#     task.reward_model.cal_final_reward_batch_size=60 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="max" \
#     task.mcts.mode.pseudo_latent_as_final=False \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=None \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.beta.update_policy="soft" \
#     task.mcts.beta.update_step_size=0.1 \
#     task.mcts.beta.max_update_bias=1.0 \
#     task.mcts.beta.zeta_list=10 \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=1e9 \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=1e9 \


# (test only) test SD v1.4, compressibility_reward
# python main.py \
#     exp_name.default_exp_name="exp" \
#     vae_decode_batch_size=20 \
#     seed=${global_seed} \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.init_latent.seed_list=0 \
#     task.eps.seed_list=3072 \
#     task.prompt_list.num_prompt=5 \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
#     task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_20.yaml" \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.height=1024 \
#     task.sample.width=1024 \
#     task.sample.num_inference_step=50 \
#     task.reward_model.reward_model_type="compressibility_reward" \
#     task.reward_model.cal_dynamics_batch_size=40 \
#     task.reward_model.cal_final_reward_batch_size=60 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="max" \
#     task.mcts.mode.pseudo_latent_as_final=False \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=None \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.beta.update_policy="soft" \
#     task.mcts.beta.update_step_size=0.1 \
#     task.mcts.beta.max_update_bias=1.0 \
#     task.mcts.beta.zeta_list=10 \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=1e9 \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=1e9 \


# latent, max, average, 4096
# python main.py \
#     exp_name.default_exp_name="max_average" \
#     vae_decode_batch_size=10 \
#     seed=${global_seed} \
#     pipeline=sd_v1_4 \
#     task=search/run_optimal_control_mcts/sd_v1_4/template \
#     task.init_latent.seed_list=0 \
#     task.eps.seed_list=4096 \
#     task.prompt_list.num_prompt=20 \
#     task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
#     task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_30.yaml" \
#     task.task.num_sample_per_prompt=2 \
#     task.sample.height=512 \
#     task.sample.width=512 \
#     task.sample.num_inference_step=15 \
#     task.reward_model.reward_model_type="hps_v2" \
#     task.reward_model.cal_dynamics_batch_size=40 \
#     task.reward_model.cal_final_reward_batch_size=40 \
#     task.reward_model.reward_shaping_policy="latent_reward" \
#     task.mcts.mode.mdp_modeling="max_reward" \
#     task.mcts.mode.value_policy="average" \
#     task.mcts.mode.pseudo_latent_as_final=True \
#     task.mcts.ucb.exploration_coef=${exploration_coef} \
#     task.mcts.selection.selection_depth_lim=None \
#     task.mcts.expansion.expansion_action_sampling_policy="beta" \
#     task.mcts.beta.update_policy="soft" \
#     task.mcts.beta.update_step_size=0.1 \
#     task.mcts.beta.max_update_bias=1.0 \
#     task.mcts.beta.zeta_list=10 \
#     task.mcts.nfe_limit.nfe_cal_dynamics_lim=${default_nfe} \
#     task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=1e9 \
#     task.mcts.nfe_limit.nfe_cal_final_reward_lim=1e9 \


# 30-step SD v1.4
# color_channel_reward
# disabled, sparse, max
# 5120
# NOTE(review): this header previously read 4096, but the command below passes
# task.eps.seed_list=5120 -- confirm which value is intended.
#
# Runs main.py with the key=value overrides listed below. Three of them come
# from the shell variables set at the top of this script:
#   seed                                     <- ${global_seed}
#   task.mcts.ucb.exploration_coef           <- ${exploration_coef}
#   task.mcts.nfe_limit.nfe_cal_dynamics_lim <- ${default_nfe}
# The other two NFE limits are pinned to 1e9 here rather than ${default_nfe}.
# Variable expansions are quoted so each override stays a single argument, and
# the final argument line carries no trailing backslash so that a line added
# below this command cannot be absorbed into it as an extra argument.
python main.py \
    exp_name.default_exp_name="sparse_max" \
    vae_decode_batch_size=10 \
    seed="${global_seed}" \
    pipeline=sd_v1_4 \
    task=search/run_optimal_control_mcts/sd_v1_4/template \
    task.init_latent.seed_list=0 \
    task.eps.seed_list=5120 \
    task.prompt_list.num_prompt=20 \
    task.prompt_list.prompt_manager_dict.prompt_manager_type="HumanPreferenceDataset_v2" \
    task.prompt_list.prompt_manager_dict.cfg_yaml_path="./config/dataset/hpd_v2_20.yaml" \
    task.task.num_sample_per_prompt=2 \
    task.sample.height=512 \
    task.sample.width=512 \
    task.sample.num_inference_step=30 \
    task.reward_model.reward_model_type="color_channel_reward" \
    task.reward_model.cal_dynamics_batch_size=40 \
    task.reward_model.cal_final_reward_batch_size=40 \
    task.reward_model.reward_shaping_policy="disabled" \
    task.mcts.mode.mdp_modeling="sparse_reward" \
    task.mcts.mode.value_policy="max" \
    task.mcts.mode.pseudo_latent_as_final=False \
    task.mcts.ucb.exploration_coef="${exploration_coef}" \
    task.mcts.selection.selection_depth_lim=None \
    task.mcts.expansion.expansion_action_sampling_policy="beta" \
    task.mcts.beta.update_policy="soft" \
    task.mcts.beta.update_step_size=0.1 \
    task.mcts.beta.max_update_bias=1.0 \
    task.mcts.beta.zeta_list=10 \
    task.mcts.nfe_limit.nfe_cal_dynamics_lim="${default_nfe}" \
    task.mcts.nfe_limit.nfe_cal_intermediate_reward_lim=1e9 \
    task.mcts.nfe_limit.nfe_cal_final_reward_lim=1e9

