# Identifier of this configuration.
# NOTE(review): how the consumer uses `name` is not visible in this file.
name: search-run_optimal_control_mcts

# Seed settings for `init_latent`.
# NOTE(review): presumably seeds the initial latent noise — confirm against the consumer.
init_latent:
  # Presumably selects which of the two key groups below applies
  # ([Random] when true, [Deterministic] when false) — confirm.
  random: false
  # random: true

  # ---------= [Random] =---------
  # NOTE(review): looks like the start/end of a seed range; whether `seed_ed`
  # is inclusive cannot be determined from this file.
  seed_st: 0
  seed_ed: 1023

  # ---------= [Deterministic] =---------
  # NOTE(review): named "list" but holds a single integer — verify that the
  # consumer accepts a scalar here.
  seed_list: 0
  seed_auto_increment: true

# Seed settings for `eps`.
# NOTE(review): presumably seeds the per-step noise — confirm against the consumer.
eps:
  # Presumably selects which of the two key groups below applies
  # ([Random] when true, [Deterministic] when false) — confirm.
  random: false
  # random: true

  # ---------= [Random] =---------
  # NOTE(review): looks like the start/end of a seed range; whether `seed_ed`
  # is inclusive cannot be determined from this file.
  seed_st: 1024
  seed_ed: 2047

  # ---------= [Deterministic] =---------
  # NOTE(review): named "list" but holds a single integer — verify that the
  # consumer accepts a scalar here.
  seed_list: 1024
  seed_auto_increment: true

# Settings for `eta`.
# NOTE(review): the meaning of eta is not defined in this file — confirm against the consumer.
eta:
  # Presumably selects which of the two key groups below applies
  # ([Random] when true, [Deterministic] when false) — confirm.
  random: false

  # ---------= [Random] =---------
  # NOTE(review): looks like the lower/upper bound used when eta is randomized.
  eta_low: 0.0
  eta_high: 1.0

  # ---------= [Deterministic] =---------
  # Written as `null` (as elsewhere in this file): YAML has no `None` literal,
  # so a bare `None` is loaded as the 4-character string "None", not as null.
  default_eta: null

# Prompt source settings.
prompt_list:
  # NOTE(review): presumably the number of prompts taken from the prompt manager — confirm.
  num_prompt: 4

  # Settings of the prompt manager that supplies the prompts.
  prompt_manager_dict:
    prompt_manager_type: HumanPreferenceDataset_v2
    # No external YAML config is referenced here.
    cfg_yaml_path: null

    # ---------= [HumanPreferenceDataset v2] =---------
    # Category names passed to the HumanPreferenceDataset_v2 prompt manager.
    category_name_list:
      - anime
      - concept-art
      - paintings
      - photo

# Task-level settings.
task:
  # NOTE(review): presumably the number of samples generated for each prompt — confirm.
  num_sample_per_prompt: 2

  # batch_size: 3

# Sampling settings.
# NOTE(review): the key names suggest arguments of a two-prompt text-to-image
# pipeline — confirm against the consumer.
sample:
  # prompt: null
  prompt_2: null
  negative_prompt: 'low quality, blurry, ugly, oversaturated'
  negative_prompt_2: null

  # Output image size.
  # NOTE(review): presumably in pixels — confirm.
  height: 1024
  width: 1024

  # NOTE(review): presumably the image-to-latent spatial down-sampling factor — confirm.
  down_sampling_ratio: 8

  # Alternative step counts are kept below as commented-out toggles.
  # num_inference_step: 5
  num_inference_step: 8
  # num_inference_step: 10
  # num_inference_step: 15
  # num_inference_step: 20
  # num_inference_step: 100
  # guidance_scale: 4.5
  guidance_scale: 5.5

# Optional `promptist` component; currently switched off.
promptist:
  enable: false
  # enable: true

  # Path to the component's own YAML configuration.
  cfg_yaml_path: ./config/model/promptist.yaml

# Optional `golden_noise` component; currently switched off.
golden_noise:
  enable: false
  # enable: true

  # Path to the component's own YAML configuration.
  cfg_yaml_path: ./config/model/golden_noise.yaml

# Output location settings.
save:
  # Root directory for saved output (relative path).
  # NOTE(review): what it is resolved against is not visible here.
  save_root_path: ./tmp/run_optimal_control_mcts

# Reward model selection and reward-computation settings.
reward_model:
  # Active reward model; the alternatives are kept as commented-out toggles.
  reward_model_type: hps_v2
  # reward_model_type: color_channel_reward
  # reward_model_type: laplacian_var_reward
  # reward_model_type: clip_score
  # reward_model_type: compressibility_reward
  # reward_model_type: incompressibility_reward

  # NOTE(review): presumably the batch sizes used for the dynamics,
  # intermediate-reward and final-reward computations — confirm.
  cal_dynamics_batch_size: 20
  cal_intermediate_reward_batch_size: 20
  cal_final_reward_batch_size: 20

  # ---------= [reward shaping] =---------
  # reward_shaping_policy: 'disabled'
  reward_shaping_policy: 'latent_reward'
  # reward_shaping_policy: 'potential_based'
  # reward_shaping_policy: 'skipping'

  # potential_exp_growing: True
  # potential_exp_growing: False
  # potential_exp_base: 8.0

  cal_intermediate_reward_policy: 'immediate_posterior_mean'
  # cal_intermediate_reward_policy: 'immediate_score_function'
  # cal_intermediate_reward_policy: 'look_ahead'
  # cal_intermediate_reward_policy: 'sequential'
  # cal_intermediate_reward_policy: 'discount'

  # ---------= [look_ahead] =---------
  # NOTE(review): presumably only read when the 'look_ahead' policy is selected — confirm.
  num_look_ahead_step: 2
  # num_look_ahead_step: 3

  # ---------= [discount] =---------
  # NOTE(review): presumably only read when the 'discount' policy is selected — confirm.
  gamma: 0.99

# LRU cache settings.
lru_cache:
  # NOTE(review): presumably the maximum number of cached entries kept resident on the GPU — confirm.
  num_gpu_resident_lim: 1000

# MCTS search settings.
mcts:
  # ---------= [Mode] =---------
  mode:
    # Active MDP modeling; the alternative is kept as a commented-out toggle.
    # mdp_modeling: "max_reward"
    mdp_modeling: "cumulative_reward"

    # Active value policy; the alternative is kept as a commented-out toggle.
    # value_policy: "max"
    value_policy: "average"

    pseudo_latent_as_final: false
    # pseudo_latent_as_final: true

    # Written as `null` (as elsewhere in this file): YAML has no `None` literal,
    # so a bare `None` is loaded as the 4-character string "None", not as null.
    enable_pseudo_latent_as_final_depth: null

  # ---------= [Upper Confidence Bound (UCB)] =---------
  ucb:
    # NOTE(review): presumably the weight of the exploration term in the UCB score — confirm.
    exploration_coef: 1.2
    # depth_coef: 1.35

    # exclude_last_intermediate_reward: False
    # exclude_last_intermediate_reward: True

  # ---------= [Selection Policy] =---------
  selection:
    # NOTE(review): presumably the maximum tree depth the selection phase may descend to — confirm.
    selection_depth_lim: 5

  # ---------= [Expansion Policy] =---------
  expansion:
    # Active action-sampling policy; alternatives are kept as commented-out toggles.
    expansion_action_sampling_policy: "uniform"
    # expansion_action_sampling_policy: "beta"
    # expansion_action_sampling_policy: "optimal_control"
    # enable_importance_sampling: true
    enable_importance_sampling: false
    importance_sampling_J_star_scaling_factor: 0.5
    # importance_sampling_J_star_scaling_factor: 0.95
    # Written with an explicit decimal point: YAML 1.1 resolvers (e.g. PyYAML)
    # load a dot-less exponent such as `1e-8` as a string rather than a float.
    importance_sampling_eps: 1.0e-8
    importance_sampling_verbose: true
    # NOTE(review): presumably the maximum number of expansions per MCTS iteration — confirm.
    per_iteration_expansion_lim: 1

  # ---------= [Simulation Policy] =---------
  simulation:
    # Active action-sampling policy; the alternative is kept as a commented-out toggle.
    simulation_action_sampling_policy: 'uniform'
    # simulation_action_sampling_policy: 'deterministic'
    # NOTE(review): presumably the actions used by the 'deterministic' policy — confirm.
    default_action_list: [0.0]

  # ---------= [NFE Limit] =---------
  # NOTE(review): "NFE" presumably stands for number of function evaluations;
  # each key looks like a budget for one kind of computation — confirm.
  nfe_limit:
    nfe_cal_dynamics_lim: 40
    nfe_cal_intermediate_reward_lim: 40
    nfe_cal_final_reward_lim: 40

  # ---------= [Optimal Control] =---------
  # optimal_control:
  #   # optimal_control_online_update: True
  #   optimal_control_online_update: False
  #   optimal_control_update_reward_threshold: 0.001
  #   optimal_control_omega_z: 0.5
  #   optimal_control_omega_eta: 0.01
  #   optimal_control_finite_difference_accuracy_order: "SECOND"
  #   optimal_control_finite_difference_eps: 1e-8
  #   optimal_control_force_positive_semi_definite_max_tolerance: 1e-8
  #   optimal_control_force_positive_definite_max_tolerance: 1e-8
  #   optimal_control_clamp_eps: 1e-8

  # ---------= [Beta Distribution Parameterization] =---------
  # Small constants below are written with an explicit decimal point:
  # YAML 1.1 resolvers (e.g. PyYAML) load a dot-less exponent such as `1e-8`
  # as a string rather than a float.
  beta:
    # beta_parameterization: false
    # beta_parameterization: true
    online_update: true
    # Active update policy; alternatives are kept as commented-out toggles.
    update_policy: "hard"
    # update_policy: "soft"
    # update_policy: "value_gradient"
    value_gradient_update_time: "best_trajectory_updated"
    # value_gradient_update_time: "back_propagation"
    action_bias: 1.0e-2  # used for value gradient
    update_step_size: 0.1
    max_update_bias: 1.0  # to initial action
    # NOTE(review): named "list" but holds a single integer — verify that the
    # consumer accepts a scalar here.
    zeta_list: 6
    update_reward_threshold: 1.0e-8
    clamp_eps: 1.0e-8
    direction_length_eps: 1.0e-6

# Flags controlling what is displayed during a run.
# Each flag keeps its opposite value as a commented-out toggle.
display:
  display_trajectory: false
  # display_trajectory: true

  display_selected_node_depth: true
  # display_selected_node_depth: false

  # display_cal_state_value: true
  display_cal_state_value: false

  # display_reward_sum_to_leaf: true
  display_reward_sum_to_leaf: false

  # display_beta_mode_update: true
  display_beta_mode_update: false
