# Verifier selection. data_cfg.verifier_cfg points back here through the
# ${verifier_cfg} interpolation.
verifier_cfg:
  # Options: "reward_models", "judges", or "all".
  verifier_type: "all"
  # Options: "all", "small", "medium", "large", or an int
  # (verifiers less than or equal to this number).
  verifier_size: 80
  # List of verifier names to use; null means no list is given.
  verifier_subset: null

data_cfg:
  # Documented options: "MATH-500", "AIMO", "MMLU-Pro", "GPQA",
  # "MMLU-College", "AlpacaEval", "BBH".
  # NOTE(review): the configured value "MATH-500-v2" is not in this list;
  # confirm the list is up to date.
  dataset_name: "MATH-500-v2"
  # Fraction of the data used for training; 1.0 means no test set.
  train_split: 1.0
  # Fraction/number of the total queries in train_split used for training:
  #   <= 1: fraction of the total queries (determined by train_split)
  #   >  1: number of train queries
  train_queries: 1
  # Fraction of the total samples in train_split used for training.
  train_samples: 1
  same_train_test: false  # alternative: true
  # Options: "mean" or 0.
  nan_replacement: 0
  random_seed: 0
  model_size: "70B"
  # null means no threshold.
  reward_threshold: 0.5

  # Normalization config.
  # Options: "per_problem", "all_problems".
  normalize_type: "all_problems"
  # Must name a key under the top-level normalize_method_params, because
  # normalize_params below is looked up by this value.
  normalize_method: "minmax"
  normalize_params: ${normalize_method_params.${data_cfg.normalize_method}}

  # How to map train and test problems.
  # Options: "SBERT", "mean_verifier_distance".
  closest_train_problem_method: "mean_verifier_distance"
  # Options: "cosine", "euclidean".
  closest_train_problem_metric_type: "euclidean"

  # Interpolates the top-level verifier_cfg mapping.
  verifier_cfg: ${verifier_cfg}
  # NOTE(review): "mv" presumably stands for majority vote; confirm.
  mv_as_verifier: true
  # Test set is fixed independently of train_split; null leaves it unset.
  fixed_test_split: null
  train_split_bins: 3

# Top-level debug switch; what it toggles is not visible in this file.
debug: false

# Model selection.
model_cfg:
  # Must name a key under the top-level model_params, because model_params
  # below is looked up by this value.
  model_type: "weak_supervision"
  # Options: "cluster", "per_problem", "per_dataset".
  model_class: "cluster"
  model_params: ${model_params.${model_cfg.model_type}}
  cluster_cfg:
    # Number of clusters.
    n_clusters: 2
    # Options: "by_difficulty", "random", "bert_query".
    cluster_type: "by_difficulty"

fit_cfg:
  # Options: "wclosest_to_train", "search_weights".
  fit_type: "wclosest_to_train"

# Logging selector; "wandb" is the only value visible in this file.
logging: "wandb"
wandb_cfg:
  project: "verification"
  # Read from the WANDB_ENTITY environment variable via the oc.env resolver.
  # No default is given, so the variable must be set when this is resolved.
  entity: "${oc.env:WANDB_ENTITY}"

# Per-model parameter sets; model_cfg.model_params selects the entry whose key
# equals model_cfg.model_type.
model_params:
  weak_supervision:
    # Model parameters.
    k: 2
    seed: 0
    binarize_threshold: 0.5
    # Metric used to determine the top generation to select.
    metric: "scores"
    n_epochs: 5000  # alternative noted by the author: 50000
    mu_epochs: 10000  # alternative noted by the author: 50000
    log_train_every: 1000
    lr: 0.0001  # alternative noted by the author: 0.00001
    # Options: "drop" or "model"; null leaves it unset.
    use_deps: null
    # Debugging flag.
    # NOTE(review): currently enabled; confirm that is intended.
    use_label_on_test: true
    deps_data_fraction: 1.0
    # Options: null, "all", "small", "large".
    drop_imbalanced_verifiers: "all"
    cb_args:
      # Options: "labels" or a float; a float is used as the class balance.
      # NOTE(review): an earlier comment here said null means ground-truth
      # data is used to calculate the class balance; confirm whether "labels"
      # replaced null for that behaviour.
      class_balance: "labels"

  naive_ensemble:
    tmp: null  # placeholder: does not use any parameter

  first_sample:
    tmp: null  # placeholder: does not use any parameter

# Per-method normalization parameters; data_cfg.normalize_params selects the
# entry whose key equals data_cfg.normalize_method.
normalize_method_params:
  minmax:
    tmp: null  # placeholder: does not take any parameters
  quantile:
    output_distribution: "uniform"
    n_quantiles: 100
  # Map each observation to a quantile by clipping extreme values.
  winsorize:
    lower_quantile: 0.05
    upper_quantile: 0.95
