
    ### model
    # Checkpoint to start from, given as an `org/name` identifier.
    model_name_or_path: meta-llama/Meta-Llama-3-8B

    ### method
    # Adapter configuration: LoRA fine-tuning, with `all` as the target selector.
    finetuning_type: lora
    lora_target: all
    # Stage selector and the switch that turns training on.
    # NOTE(review): `pt` presumably selects (continued) pre-training — confirm
    # against the trainer's accepted stage names.
    stage: pt
    do_train: true

    ### ddp
    # Timeout applied to distributed (DDP) runs.
    # NOTE(review): presumably expressed in seconds, which would make this value
    # effectively "never time out" — confirm the unit with the trainer's docs.
    ddp_timeout: 180000000


    ### dataset
    # Which dataset to use and the template name paired with it.
    dataset: mix_a4
    template: llama3
    # Preprocessing: 16 workers, and overwrite any existing cache.
    preprocessing_num_workers: 16
    overwrite_cache: true
    # Disabled overrides, kept for reference; uncomment to set them explicitly.
    # cutoff_len: 1024
    # max_samples: 1000

    ### output
    # Destination directory, with overwriting of that directory enabled.
    output_dir: saves/Meta-Llama-3-8B/pretrain/mix_a4
    overwrite_output_dir: true
    # Step cadences: log every 10 steps, save every 500 steps.
    logging_steps: 10
    save_steps: 500
    # Loss plotting is switched on.
    plot_loss: true

    ### train
    # Run length, in epochs.
    num_train_epochs: 5
    # Batching: 2 samples per device with 2 gradient-accumulation steps.
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 2
    # Learning-rate schedule: 5.0e-5 with a cosine scheduler and 0.1 warmup ratio.
    # The rate keeps its decimal point and signed exponent so that both YAML 1.1
    # and 1.2 parsers read it as a float rather than a string.
    learning_rate: 5.0e-5
    lr_scheduler_type: cosine
    warmup_ratio: 0.1
    # Precision flag.
    # NOTE(review): confirm fp16 (rather than bf16) is what the target hardware
    # and this checkpoint call for.
    fp16: true

    ### eval
    # Schedule: step-based evaluation, every 500 steps.
    eval_strategy: steps
    eval_steps: 500
    # Evaluation data and batching: val_size of 0.1, batch size 1 per device.
    # NOTE(review): val_size is presumably the held-out fraction of the dataset —
    # confirm.
    val_size: 0.1
    per_device_eval_batch_size: 1

    