task: s1
# Intentionally left unset (null): determined by dataset.
train_ratio: null

seed: 42

add_special_tokens: true

# MODEL ckpt
# NOTE(review): presumably a model id to load the critic from and a path to
# save it to — confirm against the code that reads these keys.
load_critic_model: answerdotai/ModernBERT-large
save_critic_model: ckpt/critic/initial

skip_eval: true

# Search Method
# NOTE(review): the option lists in the inline comments may be out of date —
# `linear` is not among the values listed for `mode`. Confirm the valid
# choices against the code that reads these keys.
search_method: beam  # can choose from beam, next
mode: linear  # can choose from bbox, new
# SEARCH PARAMS
beam_size: 2
num_candidates: 5
# max search depth
max_length: 20
early_stopping: true
only_eval_answers: false

# QUERY_PARAMS
temperature: 0.7
top_p: 0.9
frequency_penalty: 0
presence_penalty: 0
# Double quotes are required so that each \n is read as a newline escape;
# single quotes would yield a literal backslash followed by "n".
stop: ["\n", "\n\n", "\n\n\n"]
max_tokens: 200
max_tokens_cot: 25600

# TRAINING_PARAMS
## blackbox warmup
use_blackbox_warmup: true
num_epochs_blackbox_warmup: 2
num_candidates_blackbox_warmup: 5

num_epochs: 1
batch_size: 8
gradient_accumulation_steps: 3

l2_reg_coef: 1.0
energy_temp: 5.0
warmup_steps: 50
# Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
# PyYAML read a bare `5e-6` as a string rather than a float.
learning_rate: 5.0e-6
# NOTE(review): min_lr equals learning_rate — confirm this is intended.
min_lr: 5.0e-6
T_lr: 1000

use_outcome_supervision: true
use_stepwise_supervision: true
num_negatives_for_training: 1
# NOTE(review): <Q> and <A> look like placeholders for the question and the
# answer — confirm against the code that fills this template.
qa_template: "Q: <Q>\nA: You MUST conclude the final answer after the phrase 'The final answer is'. <A>"

# WANDB
# Toggle: use the commented-out line instead of the active one to turn
# wandb logging on.
# log_with_wandb: true
log_with_wandb: false
wandb_project: SearchLLM_ModernBert_Large
wandb_group: Greedy_warmup
wandb_run_name: S1K_epoch0

# EVALUATION
eval_blackbox: false
eval_unfinetuned: false
num_eval_rounds: 1

# [Optional] Settings used for unembedded vLLM.
# NOTE(review): presumably the server port and the name of the proposal
# model — confirm against the code that reads these keys.
port: 8000
proposal: 'Qwen2.5-Math-7B-Instruct'
