# Settings common to every training method.
shared:
  # Base model: a smaller checkpoint, chosen so that synthetic-data
  # experiments train faster.
  model:
    model_name: "Qwen/Qwen2.5-3B-Instruct"
    trust_remote_code: true

  # Trainer settings tuned for synthetic data.
  training:
    output_dir: "outputs/synthetic_color"
    # Directory that receives the final saved models.
    save_dir: "saved_models/synthetic_color"
    # Batch size was increased for synthetic data.
    per_device_train_batch_size: 8
    # Lowered because synthetic datasets are smaller.
    gradient_accumulation_steps: 1
    # Fewer epochs for synthetic data.
    num_train_epochs: 3
    # Log more often because training runs are shorter.
    logging_steps: 50
    # Checkpoint interval adjusted to the synthetic training length.
    save_steps: 500
    eval_steps: 100
    # Shorter warmup for synthetic data.
    warmup_steps: 50
    # Mixed precision is disabled at the user's request.
    fp16: false
    remove_unused_columns: false
    report_to: "wandb"
    dataloader_num_workers: 4
    dataloader_pin_memory: true
    # Synthetic samples are typically short, so both length caps are small.
    max_length: 25
    max_prompt_length: 20

  # LoRA adapter settings for synthetic training.
  lora:
    # Rank and alpha were reduced together for synthetic data.
    r: 32
    lora_alpha: 32
    target_modules:
      - "q_proj"
      - "k_proj"
      - "v_proj"
      - "o_proj"
      - "gate_proj"
      - "up_proj"
      - "down_proj"
    lora_dropout: 0.1
    bias: "none"
    task_type: "CAUSAL_LM"

  # Dataset settings; a command-line argument overrides these.
  dataset:
    # Defaults to the color dataset.
    train_path: "datasets/synthetic/color_dataset.json"
    # Sample cap that keeps synthetic training fast.
    num_samples: 10000

  # Experiment-tracking settings.
  tracking:
    wandb_project: ""
    wandb_token: ""

  # Credentials. Keep real tokens out of version control.
  # NOTE(review): wandb_token appears under both `tracking` and `auth`;
  # confirm which one the consumer actually reads.
  auth:
    hf_token: ""
    wandb_token: ""

# Training methods, listed in the order they run.
# Both are enabled so the two can be compared.
methods:
  - "dpo"
  - "single"

# Per-method settings, tuned for synthetic data.
#
# Learning rates are written with an explicit mantissa dot (1.0e-4, not
# 1e-4): YAML 1.1 loaders such as PyYAML resolve a dot-less exponent
# literal as a string instead of a float.
method_configs:
  dpo:
    # Training parameters specific to DPO.
    training:
      # Higher learning rate for synthetic data.
      learning_rate: 1.0e-4
      run_name: "synthetic-dpo-training"

    # DPO-specific settings.
    specific:
      # Beta chosen for a clearer preference distinction on synthetic data.
      beta: 0.1
      # Length caps for synthetic prompts and responses.
      # NOTE(review): same values as shared.training.max_prompt_length and
      # shared.training.max_length; confirm which pair the consumer uses.
      max_prompt_length: 20
      max_length: 25

  single:
    # Training parameters specific to the single-model method.
    training:
      # Moderate learning rate for the single model.
      learning_rate: 1.0e-5
      run_name: "synthetic-single-training"

    # Single-model settings tuned for synthetic data.
    specific:
      # One epoch each for the mu and pi phases on synthetic data.
      mu_epochs: 1
      pi_epochs: 1
      # Per-phase learning rates.
      mu_learning_rate: 1.0e-5
      pi_learning_rate: 1.0e-5
      # Both values were described as beta parameters of the pi loss.
      # NOTE(review): the earlier comments did not say how beta_pi and
      # beta differ; confirm against the training code.
      beta_pi: 0.1
      beta: 0.01
      # Same checkpoint as the base model in shared.model.model_name.
      reference_model_id: "Qwen/Qwen2.5-3B-Instruct"

      # Optional settings for resuming a run.
      resume:
        enabled: false
        checkpoint_path: ""
        epoch: 0
