#!/usr/bin/env bash
#
# Fine-tune lmsys/fastchat-t5-3b-v1.0 by running train_flant5.py through
# `accelerate launch`.
#
# Usage: run from the directory that contains train_flant5.py; the data,
# preprocessed-data and checkpoint paths below are all relative.
#
# Devices:    only GPUs 5 and 6 are exposed (CUDA_VISIBLE_DEVICES) and two
#             processes are started (--num_processes 2).
# Precision:  the launcher's --mixed_precision is set to bf16 so that it agrees
#             with the --bf16 True flag handed to the training script; the two
#             settings must not name different dtypes.
#             NOTE(review): --bf16/--tf32 presumably require GPUs that support
#             them — confirm for the target machine.
# Batching:   1 sample per device x 4 gradient-accumulation steps per process.
# Sharding:   --fsdp "full_shard auto_wrap" with T5Block as the wrapped layer
#             class; gradient checkpointing is enabled.
# Saving:     a checkpoint every 300 steps, keeping only the most recent one
#             (--save_total_limit 1); evaluation is disabled.
CUDA_VISIBLE_DEVICES=5,6 accelerate launch \
  --multi_gpu \
  --num_processes 2 \
  --mixed_precision bf16 \
  train_flant5.py \
  --model_name_or_path lmsys/fastchat-t5-3b-v1.0 \
  --data_path playground/data/dummy.json \
  --bf16 True \
  --output_dir checkpoints_flant5_3b \
  --num_train_epochs 3 \
  --per_device_train_batch_size 1 \
  --per_device_eval_batch_size 1 \
  --gradient_accumulation_steps 4 \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 300 \
  --save_total_limit 1 \
  --learning_rate 2e-5 \
  --weight_decay 0. \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --fsdp "full_shard auto_wrap" \
  --fsdp_transformer_layer_cls_to_wrap T5Block \
  --tf32 True \
  --model_max_length 2048 \
  --preprocessed_path preprocessed_data/processed.json \
  --gradient_checkpointing True