# Selects which verifiers are used. The same mapping is also exposed as
# data_cfg.verifier_cfg through the ${verifier_cfg} interpolation.
verifier_cfg:
  # One of: "reward_models", "judges", "all".
  verifier_type: "all"
  # One of: "all", "small", "medium", "large", or an int
  # (verifiers less than or equal to this number).
  verifier_size: "all"
  # List of verifier names to use; left as an explicit null here.
  verifier_subset: null

# Dataset selection, train/test splitting and score-normalization settings.
data_cfg:
  # Options listed by the original author: "MATH-500", "AIMO", "MMLU-Pro",
  # "GPQA", "MMLU-College", "AlpacaEval", "BBH".
  # NOTE(review): "BBH-v2" is not in that list -- confirm it is a valid name.
  dataset_name: "BBH-v2"
  # Fraction of data used for training; 1.0 means no test set.
  train_split: 1.0
  # Queries from train_split used for training:
  #   <= 1: fraction of the total queries (determined by train_split)
  #   >  1: number of train queries
  train_queries: 1
  # Fraction of total samples in train_split used for training.
  train_samples: 1
  # Accepted values per the original comment: "mean" or 0.
  nan_replacement: 0
  random_seed: 0
  model_size: "70B"
  # null means no threshold.
  reward_threshold: null
  # One of: "per_problem", "all_problems".
  normalize_type: "per_problem"
  # normalize_params below resolves to the entry of the top-level
  # normalize_method_params mapping that is keyed by this value.
  normalize_method: "minmax"
  normalize_params: ${normalize_method_params.${data_cfg.normalize_method}}
  closest_train_problem_method: "mean_verifier_distance"
  closest_train_problem_metric_type: "euclidean"
  # Mirrors the top-level verifier_cfg mapping.
  verifier_cfg: ${verifier_cfg}
  mv_as_verifier: false
  # Test set is fixed independently of train_split; explicit null here.
  fixed_test_split: null
  # NOTE(review): intent of a null value is not documented -- confirm.
  same_train_test: null

# Top-level debug switch, false by default. Its effect is defined by the
# consuming code (not visible in this file); it can be overridden on the
# command line, as in the example invocation at the end of this file.
debug: false

# Chooses the model and pulls in its parameter set.
model_cfg:
  # The original comment lists "majority_vote" and "coverage"; the top-level
  # model_params mapping additionally defines "first_sample" and
  # "naive_ensemble".
  model_type: "majority_vote"
  # One of: "per_problem", "per_dataset".
  model_class: "per_dataset"
  # Resolves to the top-level model_params entry keyed by model_type.
  model_params: ${model_params.${model_cfg.model_type}}

fit_cfg:
  # Not used for majority vote, so it is left as an explicit null.
  fit_type: null

# Logging selection; null here. The original comment mentions "wandb" as a
# possible value.
logging: null

# Settings for wandb -- presumably only read when logging is "wandb"; verify
# against the consuming code.
wandb_cfg:
  project: "verification"
  # Taken from the WANDB_ENTITY environment variable via the OmegaConf
  # oc.env resolver; no fallback default is given in the interpolation.
  entity: "${oc.env:WANDB_ENTITY}"

# Per-model parameter sets. model_cfg.model_params selects one of these
# entries through the value of model_cfg.model_type.
model_params:
  majority_vote:
    # Check if answer is positive in the top k majority responses.
    k: 1
    # Alternative value noted by the original author: "majority".
    majority_select: "one_sample"
  coverage:
    # Placeholder key: this model does not take any parameters.
    tmp: null
  first_sample:
    # Placeholder key: this model does not take any parameters.
    tmp: null
  naive_ensemble:
    # Placeholder key: this model does not take any parameters.
    tmp: null


# Per-method normalization parameters. data_cfg.normalize_params selects one
# of these entries through the value of data_cfg.normalize_method.
normalize_method_params:
  minmax:
    # Placeholder key: this method does not take any parameters.
    tmp: null
  quantile:
    output_distribution: "uniform"
    n_quantiles: 100
  # Map each observation to quantile by clipping extreme values.
  winsorize:
    lower_quantile: 0.05
    upper_quantile: 0.95


# Example invocation:
#   python run.py --config-name="majority_vote" data_cfg.model_size="70B" verifier_cfg.verifier_type="judges" debug=true
