# Training mode configuration
# Selects the base model, the fine-tuning method, and the task.
# NOTE(review): the option list noted for `model` is 'llada' or 'dream', but
# the value in use is 'llada_instruct' — confirm the full set of accepted names.
model: 'llada_instruct'  # noted options: 'llada' or 'dream'
finetuning_method: 'ptuning'
task_name: 'commonsense170k'

finetuning_parameters:
  # CAUSAL_LM is used because LLaDA is a decoder-only architecture. PEFT reads
  # task_type to know that the prompts are prepended to the input sequence;
  # it does not force causal masking on the diffusion process itself.
  task_type: 'CAUSAL_LM'

  # Number of virtual prompt tokens ("steering" space for the encoder).
  # NOTE(review): an earlier note described raising this from 20 to 100, but
  # the configured value is 20 — confirm which one is intended.
  num_virtual_tokens: 20

  # Width of the LSTM/MLP prompt encoder; a wider encoder means more
  # trainable parameters (the stated goal was roughly 60M parameters).
  # NOTE(review): an earlier note described raising this from 128 to 1024,
  # but the configured value is 550 — confirm which one is intended.
  encoder_hidden_size: 550

  # 'LSTM' follows the original P-Tuning paper ("GPT Understands, Too"),
  # which uses an LSTM to model the dependency between pseudo-tokens.
  # 'MLP' is a simplification often used in later libraries; LSTM is the
  # "classic" setting.
  encoder_reparameterization_type: 'LSTM'

  # Per the author's note, verified against LLaDAConfig:
  #   d_model  (4096) -> token_dim
  #   n_heads  (32)   -> num_attention_heads
  #   n_layers (32)   -> num_layers
  token_dim: 4096
  num_attention_heads: 32
  num_layers: 32

# Dataset split and batching settings.
# NOTE(review): the per-key notes below are inferred from the key names only —
# confirm against the data-loading code.
data:
  val_split_seed: 42   # presumably the RNG seed for the validation split
  val_split_size: 128  # presumably the number of held-out validation examples
  batch_size: 1
  max_length: 512      # presumably a sequence-length cap — TODO confirm the unit

train:
  # Resume / bookkeeping state. The unset entries are written as explicit
  # `null` (identical to the bare empty values they replace).
  # NOTE(review): the original note here reads "Will use paths.experiment
  # path" — presumably the resume paths fall back to paths.experiment when
  # left null; confirm in the loading code.
  decoder_resume_path: null
  head_resume_path: null
  skipped_keys: null
  global_step: null
  random_length: false
  global_epoch: null
  global_sample_number: null
  global_update_number: null
  global_token_number: null

  output_dir: 'ckpts'
  logging_dir: 'logs'
  mixed_precision: 'fp16'
  gradient_accumulation_steps: 32
  report_to: 'wandb'
  epoch_num: 1

  # P-Tuning requires a higher LR than LoRA (usually 1e-3 vs 1e-4).
  # NOTE(review): the value below is 1e-4, i.e. the LoRA-style setting —
  # confirm this is intended.
  # Written with a decimal point on purpose: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-4` as a string, not a float.
  lr: 1.0e-4

  warmup_ratio: 0.05
  eval_every: 64
  eval_from_start: true
  save_every: 64
  per_example_ratio: true
  # The anchor is kept so any alias to it keeps resolving.
  exp_name: &exp_name ''
  wandb_proj: ''
  eval:
    # metric: 'accuracy'
    metric: 'loss'
    noise_levels: null
    eval_epoches_num: 1
    steps: 32
    gen_length: 512
    block_length: 16
    temperature: 0.0
    cfg_scale: 0.0
    remasking: 'low_confidence'
    observe_steps: true
# Filesystem locations.
paths:
  # Empty string in this file.
  # NOTE(review): presumably supplied per run (CLI override or launcher) —
  # confirm where it is set.
  experiment: ''