# Launch script: runs `llamafactory-cli train` as one backslash-continued
# command, with FORCE_TORCHRUN=1 set only in that command's environment
# (prefix assignment; it is not exported to the rest of the shell).
#
# What the flags below request, as written:
#   - model/stage : EleutherAI/pythia-1.4b, --stage pt, --finetuning_type full,
#                   --train_from_scratch true
#   - data        : train on `pile`, evaluate on `testpile`, --cutoff_len 4096,
#                   16 preprocessing workers
#   - optimiser   : learning rate 2.0e-4, `cosine_with_min_lr` scheduler with
#                   min_lr_rate 0.1, warmup_ratio 0.01, Adam betas 0.9 / 0.95,
#                   weight decay 0.01
#   - batching    : 8 per device for training, 16 per device for evaluation,
#                   gradient_accumulation_steps 1
#   - runtime     : --bf16, --flash_attn fa2, DeepSpeed config
#                   examples/deepspeed/ds_z0_config.json (a relative path;
#                   presumably resolved against the working directory, so run
#                   this from the repository root -- TODO confirm)
#   - bookkeeping : log every step, plot loss, evaluate every 20000 steps,
#                   save every 10000 steps, report to wandb
#
# NOTE(review): --scale_embeds, --base_type, --num_enc_in, --num_enc_out,
# --num_loop_layers and --num_loop_times look like options added by this
# project rather than stock trainer arguments; their meaning cannot be
# determined from this script -- confirm against the argument definitions.
#
# NOTE(review): no --output_dir is passed here; check whether the trainer's
# default output location is acceptable for the checkpoints saved every
# 10000 steps.
#
# NOTE: every line of the command, including the last visible one, ends with a
# continuation backslash. Whatever text follows the final line is therefore
# joined onto this command, and a `#` comment cannot be placed between the
# continued lines without cutting the command short.
#
# The line below is a commented-out assignment, so it has no effect at run
# time. Presumably it records the number of processes this configuration was
# meant for (which would make the global batch 8 x 1 x 64 = 512 sequences per
# step) -- TODO confirm.
# WORLD_SIZE=64
FORCE_TORCHRUN=1  llamafactory-cli train \
    --model_name_or_path EleutherAI/pythia-1.4b \
    --stage pt \
    --do_train \
    --finetuning_type full \
    --dataset pile \
    --template default \
    --cutoff_len 4096 \
    --logging_steps 1 \
    --plot_loss \
    --num_train_epochs 1.0 \
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 1 \
    --learning_rate 2.0e-4 \
    --lr_scheduler_type cosine_with_min_lr \
    --warmup_ratio 0.01 \
    --adam_beta1 0.9 \
    --preprocessing_num_workers 16 \
    --adam_beta2 0.95 \
    --weight_decay 0.01 \
    --ddp_timeout 180000000 \
    --deepspeed 'examples/deepspeed/ds_z0_config.json' \
    --flash_attn fa2 \
    --bf16 \
    --scale_embeds true \
    --train_from_scratch true \
    --eval_dataset testpile \
    --per_device_eval_batch_size 16 \
    --eval_steps 20000 \
    --do_eval \
    --save_steps 10000 \
    --report_to wandb \
    --lr_scheduler_kwargs '{"min_lr_rate":0.1}' \
    --base_type 'mesh' \
    --num_enc_in 4 \
    --num_enc_out 4 \
    --num_loop_layers 8 \
    --num_loop_times 2 \