
# Candidate models to evaluate. Every entry uses the same generation settings
# (max_new_tokens / temperature / top_p); only `name` and `use_chat_template`
# differ between entries.
# `provider: "hf"` presumably selects the Hugging Face loader configured in the
# `huggingface` section — confirm against the consuming code.
models:
  - name: "Qwen/Qwen2.5-1.5B-Instruct"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  - name: "google/gemma-2-2b-it"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  # NOTE(review): the name suggests a chat-tuned model, yet the chat template
  # is disabled here — confirm this is intentional.
  - name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "openai-community/gpt2"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "PleIAs/Baguettotron"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "meta-llama/Llama-2-7b-hf"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "HuggingFaceTB/SmolLM3-3B"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  - name: "Qwen/Qwen2.5-3B-Instruct"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  - name: "zai-org/glm-edge-4b-chat"
    provider: "hf"
    max_new_tokens: 64
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true


# Question source and selection. The active value selects the "all" subset.
# The spec string looks like <loader>:<dataset>:<subset>:<split> — TODO confirm
# the exact format against the code that parses it.
questions: "hf-mmlu:cais/mmlu:all:test"
question_shuffle: true
max_questions: 1000

# Single-subject alternatives. To use one, comment out the active `questions:`
# line above and uncomment exactly one line below; a duplicate `questions` key
# is invalid YAML and most parsers silently keep only the last value.
# questions: "hf-mmlu:cais/mmlu:moral_disputes:test"
# questions: "hf-mmlu:cais/mmlu:moral_scenarios:test"
# questions: "hf-mmlu:cais/mmlu:nutrition:test"
# questions: "hf-mmlu:cais/mmlu:philosophy:test"
# questions: "hf-mmlu:cais/mmlu:prehistory:test"
# questions: "hf-mmlu:cais/mmlu:professional_accounting:test"
# questions: "hf-mmlu:cais/mmlu:professional_law:test"
# questions: "hf-mmlu:cais/mmlu:professional_medicine:test"
# questions: "hf-mmlu:cais/mmlu:professional_psychology:test"
# questions: "hf-mmlu:cais/mmlu:public_relations:test"
# questions: "hf-mmlu:cais/mmlu:security_studies:test"
# questions: "hf-mmlu:cais/mmlu:sociology:test"
# questions: "hf-mmlu:cais/mmlu:us_foreign_policy:test"

# Experiment / sampling parameters. Their exact semantics are defined by the
# consuming code, which is not visible from this file — TODO document units
# and valid ranges once confirmed.
alpha: 0.1
max_t: 5000
sampling_method: "random_pair"
# Explicit null: the key is present but carries no value.
top_k: null

# Judge that compares the responses of two models to the same question and
# picks the better one (see the system prompt below).
judge:
  type: "openai"
  model: "gpt-4o-mini"
  temperature: 0.0
  # The system prompt demands a bare 'A' or 'B', so a small token budget
  # is enough for the expected reply.
  max_tokens: 5
  # Literal block scalar (`|`): line breaks are preserved and a single trailing
  # newline is kept at the end of the prompt. Do not put comments inside the
  # block — they would become part of the prompt text.
  system_prompt: |
    You are a strict multiple-choice question evaluator. You will see a multiple-choice question and responses from two models.
    Judge which model's response is better (more accurate, clearer, more logical).
    You MUST output ONLY 'A' or 'B', no other content is allowed.
    - Output 'A' if the first model's response is better
    - Output 'B' if the second model's response is better

# ------------------------------------------------------------------------------
# Loader settings for Hugging Face models — presumably forwarded to
# transformers' `from_pretrained`; confirm against the loading code.
huggingface:
  device_map: "auto"
  dtype: "float16"
  quantization: "4bit"
  # NOTE(review): security — in transformers this flag allows Python code
  # shipped inside a model repository to run locally. It presumably applies to
  # every entry under `models`; confirm that each listed repository is trusted.
  trust_remote_code: true
  # NOTE(review): looks like a Google Colab Drive mount path; adjust it when
  # running in any other environment.
  cache_dir: "/content/drive/MyDrive/hf_models/"
  low_cpu_mem_usage: true
  # The nested key is the unquoted integer 0 — presumably a GPU device index.
  max_memory:
    0: "35GB"

# Output settings: target directory, per-artifact save toggles, and log level.
output:
  dir: "mmlu_results"
  save_interactions: true
  save_csv: true
  # NOTE(review): "serpant" may be a misspelling of "serpent". The key must
  # match whatever the consuming code reads, so verify there before renaming.
  save_serpant_trace: true
  save_config_snapshot: true
  log_level: "INFO"

# Batching / data-loading knobs.
# NOTE(review): the key names mirror torch.utils.data.DataLoader arguments —
# confirm how the consuming code actually uses them.
performance:
  batch_size: 4
  num_workers: 2
  prefetch_factor: 2
  pin_memory: true

