# Update the working tree before launching. Abort if the pull fails so the
# training job below is never started on stale or partially merged code.
if ! git pull; then
  echo "git pull failed; aborting before launching training" >&2
  exit 1
fi

# accelerate launch --main_process_port 12334 --config_file "configs/accel_ds_8h800_gas1.yaml" hf_trainer.py --log_steps 100 --max_grad_norm 1.0 --learning-rate 1e-4 --gradient_accumulation_steps 1 --max_steps 50000 --dataset_name /home/work/datasets/mix_general_llama3_tokenized_v2.2/train.jsonl --batch-size 6 --data-max-len 1024 --save_steps 25000 --check_data_cls_loss False --target_hidden_size 1536 --kl_temperature 20 --warmup-ratio 0.005 --raw-model-name /home/work/models/Llama-3.2-3B-Instruct/ --extra_tags general_train,8h800,kl_t --use_accelerate True --output_dir /home/work/ckpts --str_ban_losses no 

# accelerate launch --main_process_port 12334 --config_file "configs/accel_ds_8h800_gas1.yaml" hf_trainer.py --log_steps 100 --max_grad_norm 1.0 --learning-rate 1e-4 --gradient_accumulation_steps 1 --max_steps 50000 --dataset_name /home/work/datasets/mix_general_llama3_tokenized_v2.2/train.jsonl --batch-size 6 --data-max-len 1024 --save_steps 25000 --check_data_cls_loss False --target_hidden_size 1536 --kl_temperature 10 --warmup-ratio 0.005 --raw-model-name /home/work/models/Llama-3.2-3B-Instruct/ --extra_tags general_train,8h800,kl_t --use_accelerate True --output_dir /home/work/ckpts --str_ban_losses no 

# accelerate launch --main_process_port 12334 --config_file "configs/accel_ds_8h800_gas1.yaml" hf_trainer.py --log_steps 100 --max_grad_norm 1.0 --learning-rate 1e-4 --gradient_accumulation_steps 1 --max_steps 50000 --dataset_name /home/work/datasets/mix_general_llama3_tokenized_v2.2/train.jsonl --batch-size 6 --data-max-len 1024 --save_steps 25000 --check_data_cls_loss False --target_hidden_size 1536 --kl_temperature 30 --warmup-ratio 0.005 --raw-model-name /home/work/models/Llama-3.2-3B-Instruct/ --extra_tags general_train,8h800,kl_t --use_accelerate True --output_dir /home/work/ckpts --str_ban_losses no 

# accelerate launch --main_process_port 12334 --config_file "configs/accel_ds_8h800_gas1.yaml" hf_trainer.py --log_steps 100 --max_grad_norm 1.0 --learning-rate 1e-4 --gradient_accumulation_steps 1 --max_steps 50000 --dataset_name /home/work/datasets/mix_general_llama3_tokenized_v2.2/train.jsonl --batch-size 6 --data-max-len 1024 --save_steps 25000 --check_data_cls_loss False --target_hidden_size 1536 --kl_temperature 40 --warmup-ratio 0.005 --raw-model-name /home/work/models/Llama-3.2-3B-Instruct/ --extra_tags general_train,8h800,kl_t --use_accelerate True --output_dir /home/work/ckpts --str_ban_losses no 

# accelerate launch --main_process_port 12334 --config_file "configs/accel_ds_8h800_gas1.yaml" hf_trainer.py --log_steps 100 --max_grad_norm 1.0 --learning-rate 1e-4 --gradient_accumulation_steps 1 --max_steps 50000 --dataset_name /home/work/datasets/mix_general_llama3_tokenized_v2.2/train.jsonl --batch-size 6 --data-max-len 1024 --save_steps 25000 --check_data_cls_loss False --target_hidden_size 1536 --kl_temperature 40 --warmup-ratio 0.005 --raw-model-name /home/work/models/Llama-3.2-3B-Instruct/ --extra_tags general_train,8h800,kl_t --use_accelerate True --output_dir /home/work/ckpts --str_ban_losses no --use_logits_loss 0

# Active run: start hf_trainer.py through `accelerate launch` with the launcher
# config configs/accel_ds_8h800_gas1.yaml. This run passes --kl_temperature 60;
# the commented-out commands above are variants with other temperatures
# (presumably earlier runs of the same sweep).
# One argument per line so that a single changed flag shows up clearly in diffs.
accelerate launch \
  --main_process_port 12334 \
  --config_file "configs/accel_ds_8h800_gas1.yaml" \
  hf_trainer.py \
  --log_steps 100 \
  --max_grad_norm 1.0 \
  --learning-rate 1e-4 \
  --gradient_accumulation_steps 1 \
  --max_steps 50000 \
  --dataset_name /home/work/datasets/mix_general_llama3_tokenized_v2.2/train.jsonl \
  --batch-size 6 \
  --data-max-len 1024 \
  --save_steps 25000 \
  --check_data_cls_loss False \
  --target_hidden_size 1536 \
  --kl_temperature 60 \
  --warmup-ratio 0.005 \
  --raw-model-name /home/work/models/Llama-3.2-3B-Instruct/ \
  --extra_tags general_train,8h800,kl_t \
  --use_accelerate True \
  --output_dir /home/work/ckpts \
  --str_ban_losses no
