    ### model
    # Base checkpoint to fine-tune, plus the remote-code switch used when loading it.
    trust_remote_code: true
    model_name_or_path: Qwen/Qwen2.5-7B-Instruct

    ### method
    # Supervised fine-tuning (stage: sft) over all parameters (finetuning_type: full).
    stage: sft
    finetuning_type: full
    do_train: true
    flash_attn: fa2
    # DeepSpeed config file. Choices: ds_z0_config.json, ds_z2_config.json,
    # ds_z3_config.json. Swap to the commented line below to use the z3 variant.
    deepspeed: examples/deepspeed/ds_z2_config.json
    # deepspeed: examples/deepspeed/ds_z3_config.json

    ### dataset
    # Training data and how it is rendered and tokenized.
    dataset: ablation_sft_20k_vanilla
    # NOTE(review): the qwen3 template is selected although the base model is
    # Qwen2.5 - presumably deliberate, given the "add think tokens" section of
    # this file; confirm.
    template: qwen3
    cutoff_len: 24576
    overwrite_cache: true
    # Worker counts for the dataloader and for dataset preprocessing.
    dataloader_num_workers: 4
    preprocessing_num_workers: 16

    ### output
    # Checkpoint/log destination. This must be a real path: a blank value is
    # loaded by YAML as null, not as a directory. Adjust to your own storage
    # location if needed.
    output_dir: saves/qwen2.5-7b/full/ablation_sft_20k_vanilla
    logging_steps: 1
    save_steps: 200
    plot_loss: true
    overwrite_output_dir: false     # set to false to resume training
    # Kept false alongside overwrite_output_dir: false; presumably so that saved
    # checkpoints remain resumable - confirm against the trainer documentation.
    save_only_model: false
    report_to: wandb  # choices: [none, wandb, tensorboard, swanlab, mlflow]
    # Run label; same string as the dataset name above.
    run_name: ablation_sft_20k_vanilla
    
    ### train
    # Batch shape: 1 sample per device, accumulated over 8 steps.
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 8
    # Schedule: 5 epochs, cosine decay, warmup ratio 0.1.
    num_train_epochs: 5.0
    lr_scheduler_type: cosine
    warmup_ratio: 0.1
    # Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
    # PyYAML only resolve this spelling to a float (a bare "4e-5" loads as a string).
    learning_rate: 4.0e-5
    max_grad_norm: 1.0
    bf16: true
    ddp_timeout: 180000000
    # null = no explicit checkpoint path is given here.
    resume_from_checkpoint: null

    ### add think tokens
    # Register <think> and </think> as extra tokens (comma-separated list) and
    # enable vocabulary resizing for them.
    resize_vocab: true
    add_tokens: '<think>,</think>'
    # NOTE(review): additional_target looks like an adapter-oriented option; with
    # finetuning_type: full in the method section it is presumably redundant -
    # confirm before removing.
    additional_target: embed_tokens,lm_head
