# Launch train.py through torch.distributed.run with 2 processes per node
# (master port 29694), training from meta-llama/Meta-Llama-3-8B-Instruct on
# data/training/alpaca_data_cleaned.json.
#
# The flags request FSDP ("full_shard auto_wrap", wrapping LlamaDecoderLayer),
# gradient checkpointing, a cosine LR schedule, and set both the evaluation and
# save strategies to "no". How each flag is interpreted is up to train.py.
#
# NOTE(review): fp16 is enabled while bf16 and tf32 are disabled -- confirm
# this matches the GPUs the run is intended for.

# Values that appear in more than one argument are spelled once here.
# They are plain (unexported) shell variables, so the child process only
# sees them through its argument list.
base_model='meta-llama/Meta-Llama-3-8B-Instruct'
attack_name='Meta-Llama-3-8B-Instruct_NaiveCompletion'
run_dir="${base_model}_${attack_name}_2025"

python -m torch.distributed.run \
  --nproc_per_node=2 \
  --master_port=29694 \
  train.py \
  --model_name_or_path "${base_model}" \
  --data_path 'data/training/alpaca_data_cleaned.json' \
  --output_dir "${run_dir}" \
  --num_train_epochs 3 \
  --per_device_train_batch_size 4 \
  --per_device_eval_batch_size 4 \
  --gradient_accumulation_steps 8 \
  --evaluation_strategy 'no' \
  --save_strategy 'no' \
  --learning_rate 2e-6 \
  --weight_decay 0. \
  --warmup_ratio 0.03 \
  --lr_scheduler_type 'cosine' \
  --logging_steps 1 \
  --fsdp 'full_shard auto_wrap' \
  --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
  --bf16 False \
  --fp16 True \
  --tf32 False \
  --save_only_model True \
  --gradient_checkpointing True \
  --attack "${attack_name}" \
  --model_max_length 512