# Launch-level settings: output location and batch-size layout.
run:
  output_dir: pic-lm-8b-sft
  num_gpus: 2
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 8
  # 2 GPUs x 2 samples per device x 8 accumulation steps = 32, so the values
  # in this section agree with each other.
  # NOTE(review): whether the consumer derives or validates this total is not
  # visible here; keep the four values consistent when changing any of them.
  total_batch_size: 32

# Base model and tokenizer selection.
model:
  model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
  use_flash_attn: true
  use_slow_tokenizer: true
  # Reuses the model identifier above through `${...}` interpolation.
  # NOTE(review): assumes the config loader resolves interpolations
  # (OmegaConf/Hydra style); a plain YAML parser would keep the literal text.
  tokenizer_name: ${model.model_name_or_path}

# Training data source.
dataset:
  # NOTE(review): looks like a Hugging Face Hub dataset id (owner/name);
  # confirm against the loader.
  name: 'jacquelinehe/pic-lm-sft-mixture'
  # Presumably an upper bound on the number of examples used; verify how the
  # consumer applies it (truncation vs. sampling).
  max_num_samples: 20000

# Optimisation schedule, logging, checkpointing and run-control settings.
training:
  num_train_epochs: 2
  # Written with an explicit decimal point in the mantissa: YAML 1.1 loaders
  # (e.g. PyYAML) read a bare `1e-5` as the string "1e-5" rather than a float,
  # while `1.0e-5` is a float under every loader.
  learning_rate: 1.0e-5
  # Left unset; presumably the epoch count above determines the run length
  # (verify against the training script).
  max_train_steps: null
  max_seq_length: 4096
  preprocessing_num_workers: 32
  # The string `epoch` rather than a step count.
  # NOTE(review): presumably saves a checkpoint at the end of every epoch.
  checkpointing_steps: epoch
  lr_scheduler_type: 'cosine'
  warmup_ratio: 0.01
  weight_decay: 0.1
  logging_steps: 10
  resume_from_checkpoint: null
  seed: 42
  with_tracking: true
  low_cpu_mem_usage: false
  # NOTE(review): a negative value presumably disables gradient clipping;
  # confirm in the training script.
  clip_grad_norm: -1
  # NOTE(review): unit not stated here; 10800 would be 3 hours if seconds.
  timeout: 10800
  overwrite_cache: false
  gradient_checkpointing: false

# Parameter-efficient fine-tuning options. Both `use_lora` and `use_qlora`
# are false in this configuration.
# NOTE(review): the rank/alpha/dropout values are presumably ignored while
# both switches are off; confirm in the training script.
lora:
  use_lora: false
  use_qlora: false
  lora_rank: 16
  lora_alpha: 8
  lora_dropout: 0.1
  use_8bit_optimizer: false