# Experiment identity. The `clearml` section below reuses both values through
# `${project.name}` / `${project.experiment_name}` interpolation.
project:
  name: 'Heatmaps'
  experiment_name: 'MiddleInjection-upd-transcribation'

# Experiment-tracking settings. Project and task names are not repeated here;
# they are interpolated from the `project` section.
# NOTE(review): `${...}` needs an interpolating loader (e.g. OmegaConf) —
# confirm which loader reads this file.
clearml:
  project_name: '${project.name}'
  task_name: '${project.experiment_name}'
  # NOTE(review): presumably `false` disables artifact upload — confirm.
  output_uri: false
  reuse_last_task_id: true

# Dataset location plus keyword options.
# NOTE(review): `kwargs` is presumably forwarded to the dataset builder —
# verify against the consuming code.
dataset:
  path: './dataset/processed_transcribations'
  kwargs:
    transcribation_feature_name: 'transcribation'
    calibration_feature_name: 'model_description'
    # NOTE(review): looks like the probability of using the calibration
    # feature instead of the transcribation one — confirm.
    calib_prob: 0.2
    image_folder: './dataset/images/'
    
# Model selection and load-time options.
# NOTE(review): the `kwargs` keys match Hugging Face `from_pretrained`
# arguments; presumably they are forwarded there — confirm.
model:
  name: 'baked_middle_heat_qwen'
  kwargs:
    torch_dtype: 'bfloat16'
    device_map: 'cuda'
    attn_implementation: 'flash_attention_2'
    # NOTE(review): if forwarded to `from_pretrained`, this permits running
    # code shipped with the model repository — make sure the source is trusted.
    trust_remote_code: true

# Training-loop settings.
# NOTE(review): key names match Hugging Face `TrainingArguments`; presumably
# this mapping is forwarded to it — confirm against the training script.
trainer:
  output_dir: './middle_tune_heat_qwen2.5-checkpoints'
  num_train_epochs: 50
  per_device_train_batch_size: 18
  per_device_eval_batch_size: 12
  gradient_accumulation_steps: 2
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders such
  # as PyYAML resolve a bare `8e-6` as the string "8e-6", not as a float.
  learning_rate: 8.0e-6
  weight_decay: 0.01
  lr_scheduler_type: 'linear'
  warmup_steps: 20
  logging_steps: 2
  # Checkpoint every 100 steps; evaluate every 50 steps (see `eval_steps`).
  save_steps: 100
  save_strategy: 'steps'
  # bfloat16 mixed precision is on; fp16 is explicitly off.
  bf16: true
  fp16: false
  gradient_checkpointing: true
  optim: 'adamw_8bit'
  eval_strategy: 'steps'
  eval_steps: 50
  remove_unused_columns: false
  dataloader_num_workers: 12

# LoRA adapter hyper-parameters and the modules that receive adapters.
lora:
  r: 64
  lora_alpha: 128
  lora_dropout: 0.1
  bias: 'none'
  # Regular expressions selecting target modules by name. Single-quoted so
  # each backslash is literal (`\.` matches a dot, `\d+` matches digits).
  # NOTE(review): whether these are applied with search or full-match
  # semantics is decided by the consuming code — confirm.
  target_modules_patterns:
    - 'heat_embedding\.linear[12]'
    - 'visual\.merger\.mlp\.[02]'
    - 'visual\.heat_block\.(attn_self\.(qkv|proj)|attn_cross\.(q|kv)|mlp\.(gate_proj|up_proj|down_proj))'
    # Visual blocks 23-29, then 30-31.
    - 'visual\.blocks\.2[3-9]\.(attn\.(qkv|proj)|mlp\.(gate_proj|up_proj|down_proj))'
    - 'visual\.blocks\.3[01]\.(attn\.(qkv|proj)|mlp\.(gate_proj|up_proj|down_proj))'
    # Every `model.layers.<N>` entry: attention projections, then MLP projections.
    - 'model\.layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj|o_proj)'
    - 'model\.layers\.\d+\.mlp\.(gate_proj|up_proj|down_proj)'