# Task name for this configuration.
task: GSM8K
# Explicitly null: per the original note, this is determined by the dataset.
train_ratio: null

seed: 42

# MODEL
# proposal_model: meta-llama/Meta-Llama-3-8B-Instruct

critic_model: microsoft/deberta-v3-large
add_special_tokens: true

# Explicitly null: no value is configured for this key.
load_semantic_model: null

# MODEL ckpt
save_critic_model: ckpt/critic
# This path begins with save_critic_model and embeds the task and critic_model
# values; presumably it must be updated when either changes (confirm).
load_critic_model: ckpt/critic/GSM8K/microsoft/deberta-v3-large/StepWise=True/dpo_iteration0/train_epoch_1_idx_0

# Debug
# Presumably bypasses evaluation when true; confirm against the consumer.
skip_eval: true

# Search Method
search_method: beam  # options: beam, next
# NOTE(review): the option list recorded for `mode` (bbox, new) does not
# include the value configured here (linear); confirm the valid set.
mode: linear  # options as originally listed: bbox, new
initial_weight: zero  # options: zero, classifier
remove_root: true
# Explicitly null: no value is configured for this key.
similarity_model: null

# SEARCH PARAMS
normalize: true
alpha: 1
sigma: 1
c: 0
beam_size: 3
num_candidates: 10
# max search depth
max_length: 20
early_stopping: true
only_eval_answers: false

# QUERY_PARAMS
temperature: 0.0
top_p: 1.0
frequency_penalty: 0
presence_penalty: 0
# Stop sequences: one, two, or three consecutive newline characters.
stop:
  - "\n"
  - "\n\n"
  - "\n\n\n"
max_tokens: 400

# TRAINING_PARAMS
## blackbox warmup
use_blackbox_warmup: true
num_epochs_blackbox_warmup: 2
num_candidates_blackbox_warmup: 5

## Proposal online finetuning
# Explicitly null: no value is configured for this key.
lora_proposal: null

## online finetuning
num_online_finetuning_repeat: 1
# num_online_dataloader_size:
# num_online_dataloader_size: 32
# num_online_eval_size: 64

num_epochs: 2
batch_size: 14
gradient_accumulation_steps: 3

# Floats are written with a decimal point and a signed exponent so that both
# YAML 1.1 loaders (e.g. PyYAML) and YAML 1.2 loaders resolve them as floats;
# a spelling such as `5e-6` is read as a string by YAML 1.1 loaders.
l2_reg_coef: 1.0
energy_temp: 5.0
warmup_steps: 50
learning_rate: 5.0e-6
# NOTE(review): min_lr is equal to learning_rate; confirm this is intended.
min_lr: 5.0e-6
T_lr: 1000

use_outcome_supervision: true
use_stepwise_supervision: true
num_negatives_for_training: 1
# Prompt template with <Q> and <A> markers; presumably these are replaced by
# the question and answer text at runtime (confirm against the consumer).
qa_template: "Q: <Q>\nA: Let's think step by step. You MUST write the final answer only as an integer after the phrase 'So the answer is'.\n<A>"

# EVALUATION
eval_blackbox: true
eval_unfinetuned: false
num_eval_rounds: 1

# WANDB
# Logging switch, followed by the project / group / run-name labels.
log_with_wandb: false
wandb_project: SearchLLM
wandb_group: Beam_Search
wandb_run_name: GSM8K_Step

# [Optional] Used for unembedded vLLM
port: 8000
whitebox: meta-llama/Meta-Llama-3-8B-Instruct

# Llama DPO Training
dpo_training: true
