# General run settings
task: openai_math
# Explicit null: per the original note, the ratio is determined by the dataset.
train_ratio: null

seed: 42

add_special_tokens: true

# Model checkpoint for the critic
# (NOTE(review): looks like a Hugging Face hub id — confirm how it is resolved)
load_critic_model: 'microsoft/deberta-v3-large'

# Search method
# search_method: beam  # can choose from beam, next
# NOTE(review): the option list above does not mention greedy, the value in
# use below — confirm the full set of supported methods.
search_method: greedy
mode: bbox  # can choose from bbox, new

# Search params
beam_size: 3
num_candidates: 10
# max search depth
max_length: 15
early_stopping: true
only_eval_answers: false

# Blackbox warmup
use_blackbox_warmup: true
num_epochs_blackbox_warmup: 2
# Alternative kept for reference:
# num_candidates_blackbox_warmup: 5
num_candidates_blackbox_warmup: 10

# Online finetuning
num_online_finetuning_repeat: 1

# Training / optimisation hyper-parameters
num_epochs: 1
batch_size: 8
gradient_accumulation_steps: 3

# Floats are written with an explicit decimal point and a signed exponent:
# YAML 1.1 loaders such as PyYAML load `5e-6` (no dot) as a string.
l2_reg_coef: 1.0
energy_temp: 5.0
warmup_steps: 50
learning_rate: 5.0e-6
# NOTE(review): min_lr equals learning_rate — confirm that is intended.
min_lr: 5.0e-6
T_lr: 1000

# Query params
temperature: 0.7
top_p: 0.9
frequency_penalty: 0
presence_penalty: 0
# Stop sequences: one, two, or three consecutive newlines (double quotes are
# required so that \n is read as a newline character).
stop:
  - "\n"
  - "\n\n"
  - "\n\n\n"
max_tokens: 512
max_tokens_cot: 25600

# Supervision
use_outcome_supervision: true
use_stepwise_supervision: true
num_negatives_for_training: 1
# Double-quoted on purpose: \n is a newline and \\ is one literal backslash,
# so the loaded text contains \boxed{}. <Q> and <A> look like placeholders
# (presumably substituted by the consumer — confirm).
qa_template: "Q: <Q>\nA: Let's think step by step and output the final answer within \\boxed{}. <A>"

# Weights & Biases logging
log_with_wandb: false
wandb_project: Math
wandb_group: Greedy
wandb_run_name: OpenAI_Math

# Evaluation
eval_blackbox: false
eval_unfinetuned: false
num_eval_rounds: 1

# [Optional] Used when vLLM is not embedded
# (NOTE(review): presumably a separately served vLLM instance reached on
# `port` — confirm)
port: 8000
whitebox: 'Qwen2.5-Math-7B-Instruct'
