# Fine-tune meta-llama/Llama-2-7b-hf on wikitext-2 (wikitext-2-raw-v1) with
# train_clm_tq_lora.py, restricted to a single GPU (CUDA device 4).
#
# NOTE(review): this header used to read "4-bit 64-rank", but the run name and
# log file name below say "2bit" and --full_precision True is passed; no rank
# or bit-width flag is set here. Confirm which configuration is intended.
#
# Schedule as configured below: per-device batch 1 x 8 accumulation steps,
# log every 40 steps, evaluate every 80 steps, checkpoint every 12000 steps
# (at most one checkpoint kept).
#
# Only stdout is redirected to the log file; stderr still goes to the terminal.
#
# The log directory is created first: without it the shell cannot open the
# redirection target and the training command is never started.
mkdir -p llama2_outputs &&
CUDA_VISIBLE_DEVICES=4 python train_clm_tq_lora.py \
  --model_name_or_path meta-llama/Llama-2-7b-hf \
  --output_dir exp_results/wikitext-2-tqlora-4/ \
  --learning_rate 4e-4 \
  --learning_rate_2 1e-4 \
  --seed 11 \
  --dataset_name wikitext \
  --dataset_config wikitext-2-raw-v1 \
  --num_train_epochs 15 \
  --per_device_train_batch_size 1 \
  --per_device_eval_batch_size 1 \
  --gradient_accumulation_steps 8 \
  --weight_decay 0.1 \
  --save_strategy "steps" \
  --save_steps 12000 \
  --save_total_limit 1 \
  --load_best_model_at_end True \
  --warmup_steps 120 \
  --logging_steps 40 \
  --evaluation_strategy "steps" \
  --eval_steps 80 \
  --do_train --do_eval \
  --report_to "wandb" \
  --run_name "qlora_wiki_2bit_lr_4e-4" \
  --block_size 1024 \
  --full_precision True > llama2_outputs/output_qlora_wiki_2bit_lr_4e-4_gs_128_adaptive_nf_round_nearest_gridnum_10_gridstart_0.9_gridend_1.0_l3norm_test.txt