# Global settings
# Seed value for the run (how it is applied is up to the training script).
seed: 42
cuda:
  # Quoted so the value stays a string instead of being resolved as the integer 0.
  # Presumably exported as CUDA_VISIBLE_DEVICES -- confirm against the launcher.
  visible_devices: "0"
# Path to a DeepSpeed config file, e.g., "deepspeed_config.json"; null means no path is given.
deepspeed_config_path: null

# Model settings
model:
  name: "Qwen/Qwen2.5-Coder-7B-Instruct"
  torch_dtype: "bfloat16"
  use_peft: false
  # Adapter settings. Presumably only consulted when use_peft is true --
  # confirm against the model loader.
  peft:
    method: "lora"
    r: 8
    lora_alpha: 8
    lora_dropout: 0.1
    # Names of the modules the adapter is attached to.
    target_modules:
      - "q_proj"
      - "k_proj"
      - "v_proj"
      - "o_proj"
      - "gate_proj"
      - "down_proj"
      - "up_proj"
    bias: "none"
    task_type: "CAUSAL_LM"

# Dataset settings
dataset:
  # Where the dataset comes from: "disk" or "hf".
  source: "disk"
  # Local path when source is "disk"; Hugging Face dataset name when source is "hf".
  name: "datasets/codecontests"
  # Split names for training and evaluation.
  train_split: "train"
  eval_split: "val"
  # Column names within the dataset.
  prompt_column: "prompt"
  # NOTE(review): ground truth is read from a column called "description" -- confirm this is intended.
  ground_truth_column: "description"
  test_column: "generated_tests"

# Training settings
training:
  # Generation parameters
  max_prompt_length: 512
  num_generations: 8
  temperature: 1
  max_completion_length: 256

  # vLLM parameters
  # NOTE(review): the vllm_* values are presumably unused while use_vllm is
  # false -- confirm against the trainer.
  use_vllm: false
  vllm_mode: null
  vllm_server_base_url: null
  vllm_gpu_memory_utilization: 0.9
  vllm_dtype: "auto"
  vllm_max_model_len: null

  # Basic training parameters
  num_train_epochs: 1
  per_device_train_batch_size: 8
  gradient_accumulation_steps: 4
  max_grad_norm: 1.0
  # Keep the explicit decimal point: YAML 1.1 loaders such as PyYAML resolve
  # a bare "5e-6" as the string "5e-6" rather than a float.
  learning_rate: 5.0e-6
  gradient_checkpointing: false

  # Mixed precision settings
  bf16: true  # bf16 mixed precision enabled (model.torch_dtype is "bfloat16")
  fp16: false  # fp16 mixed precision disabled

  # GRPO specific parameters
  beta: 0.01  # KL penalty coefficient
  reward_weights: null  # weights for different reward functions
  epsilon: 0.2  # value for clipping
  epsilon_high: null  # upper bound for clipping, if not specified defaults to epsilon
  num_iterations: 3  # number of ppo iterations per batch
  loss_type: "grpo"  # loss type, supported values: "grpo", "bnpo", "dr_grpo"

  # Reference model parameters
  # NOTE(review): the mixup alpha and sync steps presumably only matter when
  # sync_ref_model is true -- confirm.
  sync_ref_model: false
  ref_model_mixup_alpha: 0.9
  ref_model_sync_steps: 64

  # Optimizer parameters
  adam_beta1: 0.9
  adam_beta2: 0.999
  weight_decay: 0.0
  # NOTE(review): both warmup_ratio and warmup_steps are set. If these are
  # forwarded to Hugging Face TrainingArguments, a positive warmup_steps takes
  # precedence over warmup_ratio -- confirm which one is intended.
  warmup_ratio: 0.0
  warmup_steps: 100
  lr_scheduler_type: "cosine"

  # Logging and monitoring
  log_completions: true
  logging_steps: 10
  report_to: ["wandb"]
  run_name: ""
  # NOTE(review): the directory name spells the model "Qwen2.5-Coder-Instruct-7B"
  # while model.name is "Qwen2.5-Coder-7B-Instruct" -- confirm this is intentional.
  output_dir: "checkpoints/Qwen/Qwen2.5-Coder-Instruct-7B-GRPO-bonmax-mean-baseline"
  save_steps: 5000

  # Evaluation parameters
  eval_strategy: "steps"
  eval_steps: 200

  # Advantage function
  advantage_function: "bon_max"
  # Keyword arguments for the advantage function selected above.
  advantage_function_kwargs:
    baseline: "mean"

  # experimental
  entropy_coef: 0.0

# Weights & Biases settings
# training.report_to also lists "wandb".
wandb:
  use_wandb: true
  # Project name.
  project: "bon"
  # Free-text description of this run (presumably stored as the run's notes -- confirm).
  note: "bonmax estimation with mean baseline ppo3"

# Reward function configuration; for now only one reward function is supported.
reward_function:
  name: "docker_io_test_reward"
  # Keyword arguments for the reward function named above
  # (presumably passed through unchanged -- confirm against the caller).
  kwargs:
    code_extractor_fn_name: "default_chat"
    run_tests_separately: true
# Top-level key; not nested under reward_function.
completion_processing_function: "default"
  
  