# Identifier of this configuration.
# NOTE(review): presumably used as the run/experiment name — confirm against the loader.
name: 'search-run_optimal_control_mcts_eps'

# Seeding options for the initial latent.
# NOTE(review): `random` presumably selects between the [Random] and
# [Deterministic] key groups below — confirm against the consumer.
init_latent:
  random: false

  # ---------= [Random] =---------
  # Seed range bounds (`st` / `ed` look like start / end; inclusivity is not
  # visible from this file).
  seed_st: 0
  seed_ed: 1023

  # ---------= [Deterministic] =---------
  # NOTE(review): the key is named `seed_list` but holds a single integer —
  # verify the type the consumer expects.
  seed_list: 0
  seed_auto_increment: true

# Seeding options for `eps`; here `random` is switched on.
# NOTE(review): `random` presumably selects between the [Random] and
# [Deterministic] key groups below — confirm against the consumer.
eps:
  random: true

  # ---------= [Random] =---------
  # Seed range bounds (`st` / `ed` look like start / end; inclusivity is not
  # visible from this file).
  seed_st: 1024
  seed_ed: 2047

  # ---------= [Deterministic] =---------
  # NOTE(review): the key is named `seed_list` but holds a single integer —
  # verify the type the consumer expects.
  seed_list: 1024
  seed_auto_increment: true

# Options for `eta`; `random` is switched off here.
# NOTE(review): with `random` off, presumably only `default_eta` is used —
# confirm against the consumer.
eta:
  random: false

  # ---------= [Random] =---------
  # Lower and upper bound for eta.
  eta_low: 0.0
  eta_high: 1.0

  # ---------= [Deterministic] =---------
  # One-element list.
  default_eta:
    - 1.0

# Prompt selection: how many prompts, and which prompt manager supplies them.
prompt_list:
  num_prompt: 4

  prompt_manager_dict:
    prompt_manager_type: 'HumanPreferenceDataset_v2'
    # No separate YAML config file is referenced for the prompt manager.
    cfg_yaml_path: null

    # ---------= [HumanPreferenceDataset v2] =---------
    # NOTE(review): presumably the dataset categories prompts are drawn from —
    # confirm against the prompt manager implementation.
    category_name_list:
      - 'anime'
      - 'concept-art'
      - 'paintings'
      - 'photo'

# Task-level settings.
task:
  # NOTE(review): presumably the number of samples generated for each prompt —
  # confirm against the consumer.
  num_sample_per_prompt: 2

# Sampling parameters.
# NOTE(review): the `prompt_2` / `negative_prompt_2` keys suggest a pipeline
# with a second text encoder — confirm which pipeline consumes these.
sample:
  prompt_2: null
  negative_prompt: 'low quality, blurry, ugly, oversaturated'
  negative_prompt_2: null

  # Output size; presumably in pixels — TODO confirm.
  height: 1024
  width: 1024

  # NOTE(review): presumably the image-to-latent spatial reduction factor — confirm.
  down_sampling_ratio: 8

  num_inference_step: 15
  guidance_scale: 5.5

# Optional `promptist` component; switched off in this configuration.
promptist:
  enable: false

  # Path to the component's own YAML configuration.
  cfg_yaml_path: './config/model/promptist.yaml'

# Optional `golden_noise` component; switched off in this configuration.
golden_noise:
  enable: false

  # Path to the component's own YAML configuration.
  cfg_yaml_path: './config/model/golden_noise.yaml'

# Output location.
save:
  # Root directory for saved results (relative path).
  save_root_path: './tmp/run_optimal_control_mcts_eps'

# Reward model selection and reward computation settings.
reward_model:
  reward_model_type: 'hps_v2'

  # Batch sizes for the three `cal_*` stages named by the keys.
  cal_dynamics_batch_size: 40
  cal_intermediate_reward_batch_size: 40
  cal_final_reward_batch_size: 40

  # ---------= [reward shaping] =---------
  reward_shaping_policy: 'latent_reward'

  cal_intermediate_reward_policy: 'immediate_posterior_mean'

  # ---------= [look_ahead] =---------
  num_look_ahead_step: 2

  # ---------= [discount] =---------
  # NOTE(review): presumably the reward discount factor — confirm.
  gamma: 0.99

# LRU cache settings.
lru_cache:
  # NOTE(review): presumably the maximum number of cache entries kept resident
  # on the GPU — confirm against the cache implementation.
  num_gpu_resident_lim: 1000

# Tree-search settings (UCB, selection, expansion, simulation, NFE limits and
# the Beta-distribution parameterization).
#
# Small floats in this block are written with an explicit mantissa dot
# (`1.0e-8`, not `1e-8`): YAML 1.1 loaders such as PyYAML resolve the dot-less
# form as a string rather than a float.
mcts:
  # ---------= [Mode] =---------
  mode:
    mdp_modeling: "cumulative_reward"  # alternative: "max_reward"

    value_policy: "average"  # alternative: "max"

    pseudo_latent_as_final: false  # alternative: true

    # YAML null. The bare word `None` is not a YAML null and would load as the
    # string "None".
    enable_pseudo_latent_as_final_depth: null

  # ---------= [Upper Confidence Bound (UCB)] =---------
  ucb:
    exploration_coef: 1.2

  # ---------= [Selection Policy] =---------
  selection:
    selection_depth_lim: 5

  # ---------= [Expansion Policy] =---------
  expansion:
    expansion_action_sampling_policy: "uniform"
    enable_importance_sampling: false
    importance_sampling_J_star_scaling_factor: 0.5
    importance_sampling_eps: 1.0e-8
    importance_sampling_verbose: true
    per_iteration_expansion_lim: 1

  # ---------= [Simulation Policy] =---------
  simulation:
    simulation_action_sampling_policy: "uniform"
    default_action_list: [0.0]

  # ---------= [NFE Limit] =---------
  nfe_limit:
    nfe_cal_dynamics_lim: 40
    nfe_cal_intermediate_reward_lim: 40
    nfe_cal_final_reward_lim: 40

  # ---------= [Beta Distribution Parameterization] =---------
  beta:
    online_update: true  # alternative: false
    update_policy: "hard"  # alternatives: "soft", "value_gradient"
    # alternative: "back_propagation"
    value_gradient_update_time: "best_trajectory_updated"
    action_bias: 1.0e-2  # used for value gradient
    update_step_size: 0.1
    max_update_bias: 1.0  # to initial action
    # NOTE(review): the key is named `zeta_list` but holds a single integer —
    # verify the type the consumer expects.
    zeta_list: 6
    update_reward_threshold: 1.0e-8
    clamp_eps: 1.0e-8
    direction_length_eps: 1.0e-6

# Display switches; only `display_selected_node_depth` is turned on.
display:
  display_trajectory: false
  display_selected_node_depth: true
  display_cal_state_value: false
  display_reward_sum_to_leaf: false
  display_beta_mode_update: false
