# Launches train.py through `python -m torch.distributed.run` as a single
# process (--nproc_per_node=1) with only CUDA device 0 visible to it.
#
# The argument vector is assembled step by step in the function's own
# positional parameters so related flags can be grouped and commented.
# Positional parameters are function-local, so the caller's "$@" is untouched.
# The function's (and therefore the script's) exit status is that of the
# python launcher.
run_training() {
  # Launcher options, followed by the script to run.
  set -- --nproc_per_node=1 --master_port=29694 train.py

  # Model, dataset and output locations.
  set -- "$@" --model_name_or_path Qwen/Qwen2-1.5B-Instruct
  set -- "$@" --data_path data/training/alpaca_data_cleaned.json
  set -- "$@" --output_dir Qwen/Qwen2-1.5B-Instruct_Qwen2-1.5B-Instruct_NaiveCompletion_2025

  # Epochs, batch sizes and gradient accumulation.
  set -- "$@" --num_train_epochs 3
  set -- "$@" --per_device_train_batch_size 8
  set -- "$@" --per_device_eval_batch_size 8
  set -- "$@" --gradient_accumulation_steps 8

  # Evaluation and checkpoint saving are both set to "no".
  set -- "$@" --evaluation_strategy no
  set -- "$@" --save_strategy no

  # Learning rate, weight decay, warmup and scheduler.
  set -- "$@" --learning_rate 2e-6
  set -- "$@" --weight_decay 0.
  set -- "$@" --warmup_ratio 0.03
  set -- "$@" --lr_scheduler_type cosine
  set -- "$@" --logging_steps 1

  # FSDP settings; the --fsdp value is one argument containing a space.
  set -- "$@" --fsdp 'full_shard auto_wrap'
  set -- "$@" --fsdp_transformer_layer_cls_to_wrap Qwen2DecoderLayer

  # Numeric precision switches: only fp16 is turned on.
  set -- "$@" --bf16 False
  set -- "$@" --fp16 True
  set -- "$@" --tf32 False

  set -- "$@" --save_only_model True
  set -- "$@" --gradient_checkpointing True

  # NOTE(review): --attack looks like a train.py-specific option; its
  # meaning is not visible from this script -- confirm against train.py.
  set -- "$@" --attack Qwen2-1.5B-Instruct_NaiveCompletion
  set -- "$@" --model_max_length 512

  CUDA_VISIBLE_DEVICES=0 python -m torch.distributed.run "$@"
}

run_training