# Launch the experiments.train.csft module once per data type, sequentially,
# passing the same model name and --kl_decay 0.0 to every run.
model_name=meta-llama/Llama-3.2-3B-Instruct

# Runs happen in this order: arc, hellaswag, mmlu. The exit status of the
# loop is the exit status of the last run.
for data_type in arc hellaswag mmlu; do
  python -m experiments.train.csft \
    --kl_decay 0.0 \
    --model_name "$model_name" \
    --data_type "$data_type"
done