# Selects which verifiers are used. Also referenced from data_cfg.verifier_cfg.
verifier_cfg:
  # One of "reward_models", "judges", or "all".
  verifier_type: "all"
  # One of "all", "small", "medium", "large", or an int
  # (verifiers less than or equal to this number).
  verifier_size: "all"
  # List of verifier names to use. Explicit null: no subset is specified.
  verifier_subset: null

# Dataset selection, train/test splitting, and score normalization settings.
data_cfg:
  # Options: "MATH-500", "AIMO", "MMLU-Pro", "GPQA", "MMLU-College",
  # "AlpacaEval", "BBH".
  # NOTE(review): the configured value "MATH-500-v2" is not in the list above;
  # confirm the loader accepts it, or extend the list.
  dataset_name: "MATH-500-v2"
  # Fraction of data used for training; 1.0 means no test set.
  train_split: 1.0
  # <=1: fraction of total queries (determined by 'train_split') used for
  #      training.
  # >1:  number of train queries used for training.
  train_queries: 1
  # Fraction of total samples in train_split used for training.
  train_samples: 1
  same_train_test: false
  # Allowed values: "mean" or 0.
  nan_replacement: 0
  random_seed: 0
  model_size: "70B"
  # null means no threshold.
  reward_threshold: null
  # --- Normalization config ---
  # One of "per_problem", "all_problems".
  normalize_type: "all_problems"
  # One of "minmax", "quantile".
  # NOTE(review): normalize_method_params also defines a "winsorize" entry;
  # confirm whether it is a supported value here.
  normalize_method: "minmax"
  # Resolves to the normalize_method_params entry named by normalize_method.
  normalize_params: "${normalize_method_params.${data_cfg.normalize_method}}"
  # --- How to map train and test problems ---
  # One of "SBERT", "mean_verifier_distance".
  closest_train_problem_method: "mean_verifier_distance"
  # One of "cosine", "euclidean".
  closest_train_problem_metric_type: "euclidean"
  # Interpolates the top-level verifier_cfg mapping.
  verifier_cfg: "${verifier_cfg}"
  mv_as_verifier: false
  # When set, the test set is fixed independently of train_split.
  fixed_test_split: null

debug: false
# Optional limit on the number of samples per problem. Set to the desired
# number, or leave as null / remove the key to use all samples.
# NOTE(review): confirm the consumer treats null as "no limit".
max_samples_per_problem: null

# Model selection.
model_cfg:
  # Must match a key under the top-level model_params mapping, because
  # model_params below is looked up by this name.
  model_type: "weak_supervision"
  # One of "per_problem", "per_dataset".
  model_class: "per_dataset"
  # Resolves to the model_params entry named by model_type.
  model_params: "${model_params.${model_cfg.model_type}}"

# Fitting strategy.
fit_cfg:
  # One of "wclosest_to_train", "search_weights".
  fit_type: "wclosest_to_train"

# Logging target; wandb_cfg holds the Weights & Biases settings.
logging: "wandb"
wandb_cfg:
  project: "verification"
  # Read from the WANDB_ENTITY environment variable. The `null` default keeps
  # config resolution from raising when the variable is unset; the value is
  # unchanged whenever the variable is defined.
  entity: "${oc.env:WANDB_ENTITY,null}"

# Parameter sets keyed by model type; model_cfg.model_params selects one entry
# via model_cfg.model_type.
model_params:
  weak_supervision:
    # Model parameters.
    k: 2
    seed: 0
    binarize_threshold: 0.5
    # Metric that is used to determine the top generation to select.
    metric: "scores"
    # Alternative values noted by the author: 10000, 50000.
    n_epochs: 1000
    # Alternative values noted by the author: 10000, 50000.
    mu_epochs: 1000
    log_train_every: 1000
    lr: 0.00001
    # One of "drop", "model".
    use_deps: "drop"
    # Debugging flag.
    # NOTE(review): marked as a debugging flag but currently enabled; confirm
    # this is intended for non-debug runs.
    use_label_on_test: true
    # One of null, "all", "small", "large".
    drop_imbalanced_verifiers: "all"
    # Number of verifiers to select when using drop_deps.
    # NOTE(review): "drop_deps" presumably refers to use_deps: "drop" — confirm.
    drop_k: 100
    cb_args:
      # "labels" or a float. A float is used as the class balance directly.
      # NOTE(review): an earlier comment said null means "use ground-truth
      # data to calculate the class balance"; presumably "labels" is now the
      # sentinel for that behavior — confirm which one the code expects.
      class_balance: "labels"

  naive_ensemble:
    # Placeholder key; this model does not use any parameter.
    tmp: null

  first_sample:
    # Placeholder key; this model does not use any parameter.
    tmp: null

# Parameter sets keyed by normalization method; data_cfg.normalize_params
# selects one entry via data_cfg.normalize_method.
normalize_method_params:
  minmax:
    # Placeholder key; minmax does not take any parameters.
    tmp: null
  quantile:
    output_distribution: "uniform"
    n_quantiles: 100
  # Clips extreme values so each observation falls within the quantile bounds
  # below.
  winsorize:
    lower_quantile: 0.05
    upper_quantile: 0.95