# Launch main.py through the `deepspeed` launcher with devices 0-3 exposed via
# CUDA_VISIBLE_DEVICES and the launcher's master port set to 29501.
#
# The argument list is assembled in small groups (using the function-local
# positional parameters) so that each group can carry its own comment; the
# final command line is identical, argument for argument, to a single long
# invocation.
#
# NOTE(review): the flag names look like Hugging Face `TrainingArguments`
# options, but main.py is not visible here -- confirm against its parser.
launch_training() {
  # Entry script followed by cache / output / data locations.
  set -- main.py \
    --cache_dir "./dump" \
    --output_dir "./training/training-1.3b" \
    --train_file "train.1.txt"

  # Run length and batching.
  # NOTE(review): if all 4 devices are used data-parallel, one optimizer step
  # would cover 4 * 2 * 16 = 128 samples -- confirm.
  set -- "$@" \
    --num_train_epochs 20 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 16

  # Checkpoint cadence / retention, and loss-only prediction.
  set -- "$@" \
    --save_steps 5000 \
    --save_total_limit 2 \
    --prediction_loss_only

  # Optimizer hyper-parameters and gradient clipping.
  set -- "$@" \
    --learning_rate 1e-4 \
    --weight_decay 0.1 \
    --adam_beta1 0.9 \
    --adam_beta2 0.95 \
    --max_grad_norm 1.0

  # Learning-rate schedule.
  set -- "$@" \
    --lr_scheduler_type "cosine" \
    --warmup_steps 5000

  # Logging, precision, experiment tracking and the DeepSpeed config file.
  set -- "$@" \
    --logging_steps 100 \
    --fp16 \
    --report_to wandb \
    --deepspeed ds_config.json

  # The assignment prefix scopes CUDA_VISIBLE_DEVICES to this one command.
  CUDA_VISIBLE_DEVICES=0,1,2,3 deepspeed --master_port=29501 "$@"
}

launch_training
