#!/usr/bin/env bash
#
# Launch a LLaMA-Factory pre-training run (`--stage pt`, from scratch) of
# EleutherAI/pythia-70m on the `minipile` dataset, evaluating on
# `minipile_test`, through `llamafactory-cli train`.
#
# Usage: run with bash from the directory that contains `examples/deepspeed/`,
# because the DeepSpeed config is referenced by a relative path.
#
# Environment:
#   FORCE_TORCHRUN=1 is set for the launched command only (not exported into
#   this shell).
#
# NOTE(review): the line below was already commented out; presumably it records
# the intended number of processes for the launcher -- confirm before enabling.
# WORLD_SIZE=8

# The arguments live in an array so they can be grouped and commented, and so
# the command has no trailing line continuation that could absorb a following
# line as extra arguments.
train_args=(
  # Model and task.
  --model_name_or_path EleutherAI/pythia-70m
  --stage pt
  --do_train
  --finetuning_type full
  --train_from_scratch true
  --template default

  # Training data and preprocessing.
  --dataset minipile
  --cutoff_len 2048
  --preprocessing_num_workers 16

  # Run length and batch sizing.
  --num_train_epochs 1.0
  --per_device_train_batch_size 8
  --gradient_accumulation_steps 2

  # Optimizer and learning-rate schedule.
  --learning_rate 1.0e-3
  --lr_scheduler_type cosine_with_min_lr
  # JSON must stay single-quoted so the shell passes it through as one word.
  --lr_scheduler_kwargs '{"min_lr_rate":0.1}'
  --warmup_ratio 0.01
  --adam_beta1 0.9
  --adam_beta2 0.95
  --weight_decay 0.01

  # Precision, attention kernel, and distributed runtime.
  --bf16
  --flash_attn fa2
  --disable_gradient_checkpointing true
  --deepspeed 'examples/deepspeed/ds_z0_config.json'
  --ddp_timeout 180000000

  # Logging and checkpointing.
  --logging_steps 1
  --plot_loss
  --report_to wandb
  --save_steps 2000

  # Evaluation.
  --do_eval
  --eval_dataset minipile_test
  --per_device_eval_batch_size 16
  --eval_steps 2000

  # NOTE(review): the flags below look specific to this project's model
  # variant; their exact meaning is not visible here -- confirm against the
  # project's argument definitions.
  --scale_embeds true
  --base_type 'mesh'
  --num_enc_in 1
  --num_enc_out 1
  --num_loop_layers 2
  --num_loop_times 2
)

# The script's exit status is the exit status of the training command.
FORCE_TORCHRUN=1 llamafactory-cli train "${train_args[@]}"