# Candidate models. Every entry uses the "hf" provider with identical sampling
# settings (temperature 0.7, top_p 0.9); entries differ only in name,
# max_new_tokens (128 or 256) and use_chat_template.
models:
  - name: "Qwen/Qwen2.5-1.5B-Instruct"
    provider: "hf"
    max_new_tokens: 128
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  - name: "google/gemma-2-2b-it"
    provider: "hf"
    max_new_tokens: 128
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  # NOTE(review): the name suggests a chat-tuned checkpoint, yet
  # use_chat_template is false — confirm this is intentional.
  - name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    provider: "hf"
    max_new_tokens: 128
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "openai-community/gpt2"
    provider: "hf"
    max_new_tokens: 128
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    provider: "hf"
    max_new_tokens: 128
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "PleIAs/Baguettotron"
    provider: "hf"
    max_new_tokens: 128
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "meta-llama/Llama-2-7b-hf"
    provider: "hf"
    max_new_tokens: 128
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "HuggingFaceTB/SmolLM3-3B"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  - name: "Qwen/Qwen2.5-3B-Instruct"
    provider: "hf"
    max_new_tokens: 128
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  - name: "zai-org/glm-edge-4b-chat"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

# Question source, written as a colon-separated spec. It looks like
# provider:dataset:config:split:field — TODO confirm against the loader.
questions: "hf:trivia_qa:rc:train:question"

# Question selection settings: a shuffle flag and, presumably, an upper bound
# on how many questions are used — verify against the consuming code.
question_shuffle: true
max_questions: 1000

# Numeric and sampling parameters, presumably for the model-comparison
# tournament. Their exact semantics live in the consuming code, which is not
# visible from this file — confirm before tuning.
alpha: 0.1
max_t: 10000
sampling_method: "random_pair"
# Explicit null: no top_k value is set here.
top_k: null
max_tournament_samples: 600

# Judge configuration: the model that compares two responses (A and B) to the
# same question and picks one.
judge:
  type: "openai"
  model: "gpt-4o-mini"
  temperature: 0.0
  # The system prompt demands a single-character reply, hence the tiny budget.
  max_tokens: 5
  # Folded block scalar (`>-`): the line breaks below are joined with single
  # spaces and the trailing newline is stripped, so the parsed prompt is one
  # line. Use `|-` instead if the numbered criteria should reach the judge on
  # separate lines.
  system_prompt: >-
    You are an expert evaluator for open-domain question answering.
    Compare two responses (A and B) to the same question.
    Which response is better overall based on:
    1. Factual Correctness: Is the answer factually correct?
    2. Completeness: Does it sufficiently answer the question?
    3. Clarity: Is the response clear and well-structured?
    You MUST output only a single character: 'A' or 'B'.

# Hugging Face model-loading settings. Presumably forwarded to the model
# loader for every entry with provider "hf" — confirm against the consumer.
huggingface:
  device_map: "auto"
  dtype: "float16"
  quantization: "4bit"
  # SECURITY NOTE(review): if this flag is passed through to Transformers'
  # from_pretrained, it permits running custom Python code shipped inside a
  # model repository. Confirm every listed model is trusted, or enable it only
  # for the models that need it.
  trust_remote_code: true
  # Absolute path under /content/drive — looks like a mounted Google Drive in
  # Colab; adjust when running elsewhere.
  cache_dir: "/content/drive/MyDrive/hf_models/"
  low_cpu_mem_usage: true
  # The key is the unquoted integer 0, presumably a device index — confirm.
  max_memory:
    0: "15GB"

# Output settings: a target directory (relative path), flags selecting which
# artifacts are saved, and the log level.
output:
  dir: "colab_pro_plus_results"
  save_interactions: true
  save_csv: true
  # NOTE(review): the key is spelled "serpant". Check which spelling the
  # consuming code reads before renaming; a mismatched key would likely be
  # ignored silently.
  save_serpant_trace: true
  save_config_snapshot: true
  log_level: "INFO"

# Batching / data-loading knobs. num_workers, prefetch_factor and pin_memory
# match PyTorch DataLoader argument names; presumably they are forwarded
# there — confirm against the consumer.
performance:
  batch_size: 4
  num_workers: 2
  prefetch_factor: 2
  pin_memory: true
