# Settings under `loop`: oracle name, dataset/output/checkpoint paths,
# batch sizes, seed, replay buffer size, and reporting target.
loop:
  oracle_name: ultrafeedback
  completions_dataset_path: /XXXX-3
  previous_output_path: null
  previous_checkpoint_path: null
  output_path: null
  logs_path: null
  args_path: null
  # Candidate values: 32, 128, 512.
  outer_loop_batch_size: 32
  # RM training batch size. TODO: ask whether it makes a difference (grid
  # search over it). Note: 64 or 128 is better; in that case use gradient
  # accumulation.
  rm_training_batch_size: 32
  # Note: just remove truncated elements.
  max_length: 4096
  seed: 4
  # Set as a factor of the outer-loop batch size (100 * outer_loop_batch_size);
  # note says the 100x factor is better. Candidate factors: 10, 100, 1000.
  replay_buffer_size: 3200
  # Quoted so the value is unambiguously the string "none" (not YAML null).
  # Note: wandb could be a good way to visualize grids.
  report_to: "none"
  wandb_project: null

# Settings under `acquisition`: the acquisition function type and its config.
acquisition:
  acquisition_function_type: dts
  acquisition_config:
    max_iterations: 30
    beta: 1
    # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
    # resolve a bare `1e-4` as a string rather than a float.
    argmax_tol: 1.0e-4
    decision_buffer: 0.0
    use_candidate_set: false
    seed: 42
# Converge to evals.
# Given a ... (note left unfinished)
# Try to evaluate checkpoints: 5000 datapoints, 2500 per step.
# Settings under `enn`: base model, feature extraction, and training
# hyperparameters.
enn:
  # Candidate base models:
  #   "meta-llama/Llama-3.2-1B-Instruct", "unsloth/Qwen2.5-1.5B-Instruct",
  #   "allenai/OLMo-2-1124-7B-SFT", "allenai/Llama-3.1-Tulu-3-8B-SFT",
  #   "unsloth/Qwen2.5-0.5B-Instruct", "Skywork/Skywork-Reward-V2-Qwen3-4B"
  # Notes: consider Qwen 0.5B (ask whether there is a big performance
  # difference between Qwen 1.5B and 0.5B, and which model to choose).
  # Tulu 3 paper: how to train the model; ablations at 8B scale.
  # Big labs (presumably DeepSeek) experiment with smaller models.
  base_model_name_or_path: "Skywork/Skywork-Reward-V2-Qwen3-4B"
  freeze_base_model: true
  feature_extraction_layer: "last_hidden_state"
  num_train_epochs: 1
  save_strategy: "no"
  report_to: "none"
  disable_tqdm: true
  logging_strategy: "no"
  logging_steps: 1
  lr_scheduler_type: "cosine"
  # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-4` as a string rather than a float.
  # TODO: ask what learning rate he used and get the loss curves from them.
  # How quickly the ... (note left unfinished)
  learning_rate: 1.0e-4
  regularization_towards_initial_weights: 100
  # Note: exponential decay is more important.
  regularization_weight_decay_type: "exponential"
  # Candidate values: 0.95, 0.99.
  exponential_decay_base: 0.99
  # Note: 10 steps is undertraining; underfitting is the bigger concern.
  max_training_steps: 20
  initialization_xavier_gain: 1.0
  center_rewards_coefficient: 0.01

##
## 3 * 3 * 3 * 3 = 81 grid configurations. Keep in mind that each must also be evaluated on multiple seeds.
## For now, focus on RM training: how robust is it to a change of seed?
##
##