
# Models used in this run. Each entry gives a model identifier, the provider
# that serves it, and its generation settings. All entries currently share
# the same provider and sampling values (max_new_tokens, temperature, top_p);
# they differ only in `name` and `use_chat_template`.
models:
  - name: "Qwen/Qwen2.5-1.5B-Instruct"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  - name: "google/gemma-2-2b-it"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  # NOTE(review): the name suggests a chat-tuned checkpoint, yet the chat
  # template is disabled — confirm this is intentional.
  - name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "openai-community/gpt2"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  # NOTE(review): distilled reasoning checkpoints are usually prompted through
  # a chat template — confirm that disabling it here is intentional.
  - name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "PleIAs/Baguettotron"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "meta-llama/Llama-2-7b-hf"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: false

  - name: "HuggingFaceTB/SmolLM3-3B"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  - name: "Qwen/Qwen2.5-3B-Instruct"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

  - name: "zai-org/glm-edge-4b-chat"
    provider: "hf"
    max_new_tokens: 256
    temperature: 0.7
    top_p: 0.9
    use_chat_template: true

# Where the questions come from.
# NOTE(review): the colon-separated fields look like
# <source>:<dataset>:<config>:<split>:<column> — confirm against the loader.
questions: "hf:databricks/databricks-dolly-15k:default:train:instruction"

# NOTE(review): presumably shuffles the question set and caps how many
# questions are used — confirm the order in which the two are applied.
question_shuffle: true
max_questions: 1000

# Parameters of the comparison procedure.
# NOTE(review): nothing in this file defines these; presumably `alpha` is an
# error/significance level and `max_t` an upper bound on rounds — confirm
# with the consuming code.
alpha: 0.1
max_t: 4000
# NOTE(review): "random_pair" presumably selects how two models are drawn
# for each comparison — confirm the accepted values.
sampling_method: "random_pair"
# Explicit null: no top_k value is set.
top_k: null

# Judge configuration: which model compares two responses and how it is
# prompted.
judge:
  type: "openai"
  model: "gpt-4o-mini"
  temperature: 0.0
  # The system prompt demands a single character ('A' or 'B'), so the token
  # budget is kept very small.
  max_tokens: 5
  # Literal block scalar: line breaks inside the prompt are preserved. Do not
  # put comments inside the block — they would become part of the prompt.
  system_prompt: |
    You are an expert evaluator for instruction-following and text generation. Compare two responses (A and B) to the same user instruction. Which response is better overall based on:
    1. **Instruction Following**: Does it correctly do what the instruction asked?
    2. **Coherence & Helpfulness**: Is the response logical, clear, and useful?
    3. **Language Quality**: Is it well-written with good grammar and style?

    You MUST output only a single character: 'A' or 'B'. No explanations.


# Model-loading options for the "hf" provider.
# NOTE(review): the key names resemble Hugging Face `from_pretrained` /
# Accelerate arguments — confirm how the loader maps them.
huggingface:
  device_map: "auto"
  dtype: "float16"
  quantization: "4bit"
  # NOTE(review): if this is forwarded to Transformers, model repositories may
  # execute their own Python code at load time — only list trusted repos.
  trust_remote_code: true
  # NOTE(review): this path looks like a mounted Google Drive folder in
  # Colab — confirm it exists in the target environment.
  cache_dir: "/content/drive/MyDrive/hf_models/"
  low_cpu_mem_usage: true
  # NOTE(review): presumably a per-device memory cap keyed by device index
  # (the key is the integer 0, not a string) — confirm.
  max_memory:
    0: "35GB"

# Output settings: target directory, which artifacts to save, and log level.
output:
  dir: "colab_pro_plus_results_textgen"
  save_interactions: true
  save_csv: true
  # NOTE(review): "serpant" may be a misspelling; the key must match whatever
  # name the consuming code reads, so verify there before renaming.
  save_serpant_trace: true
  save_config_snapshot: true
  log_level: "INFO"

# Throughput-related settings.
# NOTE(review): num_workers, prefetch_factor and pin_memory share their names
# with PyTorch DataLoader arguments — confirm they are passed through as such.
performance:
  batch_size: 6
  num_workers: 4
  prefetch_factor: 4
  pin_memory: true

# Monitoring and checkpointing switches.
monitoring:
  enable: true
  log_memory_usage: true
  log_gpu_utilization: true
  # NOTE(review): the unit of the interval (questions, comparisons, steps) is
  # not visible in this file — confirm.
  checkpoint_interval: 100
  resume_from_checkpoint: true