# Verifier selection options.
verifier_cfg:
  # Verifier family: "reward_models", "judges", or "all".
  verifier_type: "all"
  # Size filter: "all", "small", "medium", "large", or an int
  # (verifiers less than or equal to this number).
  verifier_size: "all"
  # List of verifier names to use; left null here (no names listed).
  verifier_subset: null

# Dataset selection, train/test splitting, and score normalization settings.
data_cfg:
  # Listed options: "MATH-500", "AIMO", "MMLU-Pro", "GPQA", "MMLU-College",
  # "AlpacaEval", "BBH".
  # NOTE(review): the active value "MATH-500-v2" is not in the list above;
  # confirm the consumer accepts it or extend the list.
  dataset_name: "MATH-500-v2"
  # Fraction of data used for training; 1.0 means no test set.
  train_split: 1.0
  # Fraction/number of total queries in train_split used for training:
  #   <=1: fraction of total queries (determined by 'train_split')
  #   >1:  number of train queries
  train_queries: 1
  # Fraction of total samples in train_split used for training.
  train_samples: 1
  # Fraction of total samples in test_split used for training and testing.
  test_samples: 1
  # Alternative value noted by the author: true.
  same_train_test: false
  # Accepted values per the author's note: "mean" or 0.
  nan_replacement: 0
  random_seed: 0
  model_size: "70B"
  # null means no threshold.
  reward_threshold: 0.5
  # --- normalization config ---
  # Options: per_problem, all_problems.
  normalize_type: "all_problems"
  normalize_method: "minmax"
  # Interpolates the normalize_method_params entry named by normalize_method.
  normalize_params: ${normalize_method_params.${data_cfg.normalize_method}}
  # --- how to map train and test problems ---
  # Options: "SBERT", "mean_verifier_distance".
  closest_train_problem_method: "mean_verifier_distance"
  # Options: "cosine", "euclidean".
  closest_train_problem_metric_type: "euclidean"
  # Interpolates the top-level verifier_cfg mapping.
  verifier_cfg: ${verifier_cfg}
  mv_as_verifier: true
  # Test set is fixed independently of train_split; left null here.
  fixed_test_split: null
  train_split_bins: 3
  shuffle_samples: false

# Top-level debug switch, off here.
# NOTE(review): its effect is defined by the consuming code, not visible here.
debug: false

# Model selection and clustering settings.
model_cfg:
  model_type: "weak_supervision"
  # Options: cluster, per_problem, per_dataset.
  model_class: "cluster"
  # Interpolates the model_params entry named by model_type.
  model_params: ${model_params.${model_cfg.model_type}}
  cluster_cfg:
    # Number of clusters.
    n_clusters: 3
    # Options: "by_difficulty", "unique_extracted_answers",
    # "by_binned_difficulty", "random", "bert_query", "gmm", "hierarchical",
    # "dbscan", "spectral", "json".
    cluster_type: "unique_extracted_answers"
    # Preserve ties between questions with same number of unique answers.
    preserve_ties: true
    # Whether to try to maintain uniform bucket sizes while preserving ties.
    uniform_with_ties: false
    cluster_on_all: true
    # Disabled embedding_model settings kept for reference. Values listed by
    # the author: "nomic-ai/modernbert-embed-base",
    # "answerdotai/ModernBERT-large", "Alibaba-NLP/gte-Qwen2-1.5B-instruct".
    # embedding_model: "nomic-ai/modernbert-embed-base"
    # embedding_model: "cluster_mapping/MATH500_with_Llama_3.1_70B_Instruct/2025-04-17_10-41-04_preserve_ties.json"

# Fitting settings.
fit_cfg:
  # Options: wclosest_to_train, search_weights.
  fit_type: "wclosest_to_train"

# Logging backend; null here. The author's note lists "wandb" as a value.
logging: null
# Weights & Biases settings.
wandb_cfg:
  project: "verification"
  # Taken from the WANDB_ENTITY environment variable. The `null` default keeps
  # this interpolation resolvable when the variable is unset, so resolving the
  # whole config does not fail on machines without it (e.g. when `logging` is
  # null). With the variable set, the value is unchanged.
  entity: "${oc.env:WANDB_ENTITY,null}"

# Per-model parameter sets, keyed by model type.
model_params:
  weak_supervision:
    # --- model parameters ---
    k: 2
    seed: 0
    binarize_threshold: 0.5
    # Metric that is used to determine the top generation to select.
    metric: "scores"
    # Alternative value noted by the author: 50000.
    n_epochs: 1000
    # Alternative value noted by the author: 50000.
    mu_epochs: 1000
    log_train_every: 1000
    lr: 0.00001
    # Values noted by the author: 0.1, 10.
    deps_density: 0.1
    # Alternative value noted by the author: "drop".
    # NOTE(review): an earlier comment on this key said the value had been
    # changed from "none" to "drop" to match dependency modeling in logs, but
    # the active value is "none"; confirm which one is intended.
    use_deps: "none"
    # Fraction of data to use for dependency modeling (between 0 and 1).
    deps_data_fraction: 0.01
    # Whether you use labels on the test set.
    use_label_on_test: false
    # Options: null, 'all', 'small', 'large'.
    drop_imbalanced_verifiers: null
    # Number of verifiers to keep.
    drop_k: 100
    # Set to true if you want to fit model for the test set.
    fit_when_calculating_metrics: false
    cb_args:
      # Accepted values per the author's note: "labels" or a float.
      # If a float, that value is used as the class balance.
      # NOTE(review): the earlier comment described `null` (not "labels") as
      # "use gt data to calculate class balance"; confirm which spelling the
      # consumer expects.
      class_balance: "labels"

  naive_ensemble:
    # Does not use any parameter.
    tmp: null

  first_sample:
    # Does not use any parameter.
    tmp: null

# Parameter sets for each normalization method, keyed by method name.
normalize_method_params:
  minmax:
    # Does not take any parameters.
    tmp: null
  quantile:
    output_distribution: "uniform"
    n_quantiles: 100
  # Map each observation to quantile by clipping extreme values.
  winsorize:
    # Values noted by the author: 0.03, 0.05.
    lower_quantile: 0.05
    # Values noted by the author: 0.97, 0.95.
    upper_quantile: 0.95
