# Training mode configuration
# NOTE(review): the inline hint below lists 'llada' or 'dream', yet the value
# set here is 'llada_base' — confirm which identifiers the loader accepts.
model: 'llada_base'  # 'llada' or 'dream'
finetuning_method: 'lora'
task_name: 'code_feedback'

# LoRA adapter settings (finetuning_method above is 'lora').
finetuning_parameters:
  # Anchored so that lora_alpha can reuse the same value.
  r: &lora_rank 32
  # Alias of r: resolves to 32 as well.
  lora_alpha: *lora_rank
  lora_dropout: 0.05
  # Wider alternative, kept for reference:
  # target_modules: ["q_proj", "k_proj", "v_proj", "up_proj", "ff_out", "attn_out"]
  # Active selection. Original note: "only shape 4096*r r*4096"
  # (presumably the shapes of the LoRA factors for these modules — confirm).
  target_modules:
    - 'q_proj'
    - 'k_proj'
    - 'v_proj'
    - 'attn_out'

# Data loading and validation-split settings.
data:
  val_split_seed: 42
  # NOTE(review): unit is not visible here (presumably a number of examples
  # rather than a fraction) — confirm against the data loader.
  val_split_size: 128
  batch_size: 1
  # max_length is currently disabled:
  # max_length: 1024

# Training-loop settings, including the nested evaluation settings.
train:
  # Resume / progress-counter entries. Original note: "Will use
  # paths.experiment path".
  # Unset entries are spelled as explicit `null`; this parses to the same
  # value as a bare `key:` but cannot be mistaken for an accidental omission.
  decoder_resume_path: null
  head_resume_path: null
  skipped_keys: null
  global_step: null
  global_epoch: null
  global_sample_number: null
  global_update_number: null
  global_token_number: null
  random_length: false
  output_dir: 'ckpts'
  logging_dir: 'logs'
  mixed_precision: 'fp16'
  gradient_accumulation_steps: 32
  report_to: 'wandb'
  epoch_num: 1
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-4` to the string '1e-4' instead of a float.
  lr: 1.0e-4
  warmup_ratio: 0.05
  eval_every: 32
  # NOTE(review): the original line was flagged with a long run of '#'
  # characters, presumably marking a temporary override — confirm the
  # intended value.
  eval_from_start: true
  save_every: 32
  per_example_ratio: true
  # Anchored as `exp_name` so the value can be reused through an alias.
  exp_name: &exp_name ""
  wandb_proj: ""
  eval:
    # Alternative metric, currently disabled:
    # metric: 'accuracy'
    metric: 'loss'
    use_fixed_batch: true
    noise_levels: null
    # Key spelling ("epoches") is kept unchanged; it is presumably looked up
    # by this exact name in the training code.
    eval_epoches_num: 1
    steps: 256
    gen_length: 256
    block_length: 8
    temperature: 0.0
    cfg_scale: 0.0
    remasking: 'low_confidence'
    observe_steps: true
# Filesystem locations. `experiment` is the path that the comment at the top
# of the train section refers to; it is an empty string here.
paths:
  experiment: ''

